In [1]:
import librosa
import pyAudioAnalysis as pyaudio
import numpy as np
import pandas as pd
import os

In [2]:
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import MidTermFeatures

In [3]:
features = []
existing = []
missing = []
prev_features = None

In [4]:
if os.path.exists("../data/features.csv"):
    prev_features = pd.read_csv("../data/features.csv")
    prev_features.dropna(how = 'any')
    prev_features['song'] = prev_features['title'] + " - " + prev_features['artist']
    existing = prev_features['song'].tolist()

In [5]:
for song in os.scandir("."):
    if song.path.endswith(".mp3") and song.is_file():
        # skip ./ in the path to the mp3 file
        file_name = song.path[2:]
        song_name = file_name[:len(file_name) - 4]
        if song_name not in existing:
            # add it to the missing list so we can use pyAudioAnalysis on it
            missing.append(file_name)
            
            # get the tempo of the song
            waveform, samp_rate = librosa.load(file_name)
            tempo, beat_frames = librosa.beat.beat_track(waveform, samp_rate)

            # get the chroma number of the song
            beat_times = librosa.frames_to_time(beat_frames, samp_rate)
            y_harmonic, y_percussive = librosa.effects.hpss(waveform)
            chromagram = librosa.feature.chroma_cqt(y_harmonic, samp_rate)
            beat_chroma = librosa.util.sync(chromagram, beat_frames, aggregate=np.median)
            # make beat chroma into a DataFrame and calculate the diff
            chroma_df = pd.DataFrame(beat_chroma)
            diff_values = chroma_df.diff()
            diff_mean = diff_values.mean(axis = 0, skipna = True)
            chroma_num = sum(diff_mean) / len(diff_mean)

            # add song, tempo and chroma number to the features list
            print ([song_name, tempo, chroma_num])
            features.append([song_name, tempo, chroma_num])



["baby don't cry - exo", 69.83741554054055, -0.01112832059264044]


In [6]:
features_df = pd.DataFrame(features, columns = ['song', 'tempo', 'chroma_number'])
features_df = features_df.sort_values(by = ['song'])

In [7]:
# move all the mp3 files that need to have features extracted into some folder
with open('missing.txt', 'w+') as f:
    for song in missing:
        f.write("%s\n" % song)
# use command line here
'''
mkdir missing
cat missing.txt | while read line
do
    cp $line missing
done
'''
# change the path in directory_feature_extraction

'\nmkdir missing\ncat missing.txt | while read line\ndo\n    cp $line missing\ndone\n'

In [3]:
mid_term_window = 1
mid_term_step = 1
short_term_window = 0.05
short_term_step = 0.05
# use compute_beat = True if we want the extra beat features
pyaudio_feat, files, feat_names = MidTermFeatures.directory_feature_extraction("trial", 
                                                                               mid_term_window, 
                                                                               mid_term_step, 
                                                                               short_term_window, 
                                                                               short_term_step,
                                                                               False)

Analyzing file 1 of 1: trial/traveler of the magic borders - takanashi yasuharu.mp3
Feature extraction complexity ratio: 43.9 x realtime


In [25]:
# neural network features
features_df.reset_index(drop = True, inplace = True)
if len(features_df) == 1:
    nn_features_df = pd.DataFrame([pyaudio_feat], columns = feat_names)
else:
    nn_features_df = pd.DataFrame(pyaudio_feat, columns = feat_names)
nn_features_df['song'] = features_df['song']

In [26]:
zcr_ind = feat_names.index('zcr_mean')
ee_ind = feat_names.index('energy_entropy_mean')
spc_ind = feat_names.index('spectral_centroid_mean')

features_df['zero_crossing_rate'] = nn_features_df.iloc[:, zcr_ind]
features_df['energy_entropy'] = nn_features_df.iloc[:, ee_ind]
features_df['spectral_centroid'] = nn_features_df.iloc[:, spc_ind]

In [27]:
features_df

Unnamed: 0,song,tempo,chroma_number,zero_crossing_rate,energy_entropy,spectral_centroid
0,baby don't cry - exo,69.837416,-0.011128,0.053817,3.185847,0.136686


In [62]:
moods = pd.read_csv("../data/mood_data.csv")
moods

Unnamed: 0,title,artist,primary,secondary
0,who says,selena gomez,2,3
1,don't stop,5 seconds of summer,3,2
2,do you hear what i hear,carrie underwood,4,2
3,"alliance force, assemble",takanashi yasuharu,1,6
4,ni pa bu pa shi qu wo,liu zeng tong,2,4
...,...,...,...,...
195,nine four two zero,mai xiao er,2,4
196,zui mei hun li,bai xiao bai,2,4
197,jiu shi xi huan ni,li meng yin,2,6
198,xiao xing xing,wang su long,2,4


In [63]:
# merging is incorrect, does not work
moods['song'] = moods['title'] + " - " + moods['artist']
moods = moods.sort_values(by = ['song'])
moods.reset_index(drop = True, inplace = True)

new_features = features_df.merge(moods, on = "song", how = "left")
new_features = new_features.drop(columns = ['song'])
new_features

Unnamed: 0,tempo,chroma_number,zero_crossing_rate,energy_entropy,spectral_centroid,title,artist,primary,secondary
0,69.837416,-0.011128,0.053817,3.185847,0.136686,"baby, don't cry",exo,2,4


In [15]:
if not existing:
    new_features = new_features[['title', 'artist', 'tempo', 'chroma_number', 
                                 'zero_crossing_rate', 'energy_entropy', 
                                 'spectral_centroid', 'primary', 'secondary']]
    new_features.to_csv("../data/features.csv", header = True, index = False)
    
    nn_features_df.to_csv("../data/nn_features.csv", header = True, index = False)
    
    feat_names_df = pd.DataFrame(feat_names)
    feat_names_df.to_csv("../data/nn_feature_names.csv", header = True, index = False)
else:
    prev_features = prev_features.drop(columns = ['song'])
    all_features = pd.concat([prev_features, new_features], ignore_index = True)
    all_features = all_features[['title', 'artist', 'tempo', 'chroma_number', 
                                 'zero_crossing_rate', 'energy_entropy', 
                                 'spectral_centroid', 'primary', 'secondary']]
    all_features.to_csv("../data/features.csv", header = True, index = False)
    
    prev_nn = pd.read_csv("../data/nn_features.csv")
    all_nn = pd.concat([prev_nn, nn_features_df], ignore_index = True)
    all_nn.to_csv("../data/nn_features.csv", header = True, index = False)

In [None]:
# make sure to delete the directory missing and the text file missing.txt
'''
rm -r missing
rm missing.txt
'''