In [None]:
# import packages
import os
import librosa
import numpy as np
import pandas as pd
import soundfile as sf
from pydub import AudioSegment

In [None]:
# start from empty containers: a dict of freshly extracted blocks keyed by
# song name, and a list of song names already present in the features file
new_raw_dict = dict()
existing_raw = list()

In [None]:
# check and save if there are existing raw features
def load_prev_raw(csv_path):
    """Read a previously written raw-features CSV.

    Rows containing any missing value are dropped, and a 'song' column is
    (re)built as "<title> - <artist>".

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read. It must contain 'title' and 'artist'
        columns.

    Returns
    -------
    pandas.DataFrame
        The complete rows of the CSV, with the 'song' column added.
    """
    # dropna returns a new frame; the result has to be kept, otherwise the
    # incomplete rows silently stay in the data
    complete_rows = pd.read_csv(csv_path).dropna(how = 'any')
    complete_rows['song'] = complete_rows['title'] + " - " + complete_rows['artist']
    return complete_rows


if os.path.exists("../data/raw_features.csv"):
    prev_raw = load_prev_raw("../data/raw_features.csv")
    # names of the songs whose features are already stored on disk
    existing_raw = prev_raw['song'].tolist()

In [None]:
# scan the directory for mp3 files and extract one fixed-size block of raw
# samples from every song that is not already in the features file
BLOCK_START = 50000    # frames skipped at the beginning of each track
BLOCK_SIZE = 100000    # frames kept per track
known_songs = set(existing_raw)    # set gives constant-time membership checks
converted_any = False    # becomes True once at least one mp3 was converted

for song in os.scandir("."):
    if not (song.name.endswith(".mp3") and song.is_file()):
        continue
    # song.name is the file name without the leading "./" of song.path
    file_name = song.name
    song_name = os.path.splitext(file_name)[0]
    if song_name in known_songs:
        continue
    # set the channel and framerate
    sound = AudioSegment.from_mp3(file_name)
    sound = sound.set_channels(1)
    sound = sound.set_frame_rate(16000)
    sound.export("temp.wav", format = "wav")
    converted_any = True
    # only the first block is used; a track with fewer than
    # BLOCK_START + BLOCK_SIZE frames yields a truncated block, which would
    # break building an equal-length table from new_raw_dict later on
    # (the original note cites 35 seconds as the minimum length - TODO confirm)
    for block in sf.blocks("temp.wav", blocksize = BLOCK_SIZE, start = BLOCK_START):
        if len(block) == BLOCK_SIZE:
            new_raw_dict[song_name] = block
        else:
            print("Skipping '" + song_name + "': audio too short for raw feature extraction.")
        break

# clean up the temporary wav file and report when nothing was extracted
if os.path.exists("temp.wav"):
    os.remove("temp.wav")
elif converted_any:
    print ("No temporary wav file found, extraction failed.")
else:
    # nothing was converted because every mp3 is already known (or none exist)
    print("No new mp3 files found, nothing to extract.")

In [None]:
# build the raw-feature table: the dict keys (song names) become the rows,
# the name is exposed as a 'song' column, and rows are ordered by that column
song_rows = pd.DataFrame(new_raw_dict).T
song_rows.index.name = 'song'
new_raw = (
    song_rows
    .reset_index()
    .sort_values(by = ['song'])
    .reset_index(drop = True)
)

In [None]:
# write raw features to csv
if not existing_raw:
    # no previously stored songs: the new features make up the whole file
    new_raw.to_csv("../data/raw_features.csv", header = True, index = False)
else:
    # prev_raw was loaded with read_csv, so its column labels are strings,
    # while new_raw still carries the integer labels produced by the transpose.
    # concat aligns columns by label, so the labels are stringified first;
    # otherwise the feature columns do not line up and get padded with NaN.
    # NOTE(review): assumes the stored feature columns use the same labels
    # this notebook writes - confirm against the existing CSV.
    new_raw_aligned = new_raw.rename(columns = str)
    all_raw = pd.concat([prev_raw, new_raw_aligned], ignore_index = True)
    all_raw.to_csv("../data/raw_features.csv", header = True, index = False)