In [1]:
# import general packages
import os
import librosa
import numpy as np
import pandas as pd
import soundfile as sf
from pydub import AudioSegment

In [2]:
# import pyAudioAnalysis package utility
import pyAudioAnalysis as pyaudio
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import MidTermFeatures

In [3]:
# feature extraction data structures
features = []
existing = []
missing = []
prev_features = None

In [4]:
# check and save existing basic features
if os.path.exists("../data/features.csv"):
    prev_features = pd.read_csv("../data/features.csv")
    # drop all rows with any number of NAs
    prev_features.dropna(how = 'any')
    prev_features['song'] = prev_features['title'] + " - " + prev_features['artist']
    existing = prev_features['song'].tolist()

In [5]:
# feature extraction notebook should be in the same folder as the mp3s
for song in os.scandir("."):
    if song.path.endswith(".mp3") and song.is_file():
        # skip ./ in the path to the mp3 file
        file_name = song.path[2:]
        song_name = file_name[:len(file_name) - 4]
        
        if song_name not in existing:
            # add it to the missing list so we can use pyAudioAnalysis on it
            missing.append(file_name)
            
            # get the tempo of the song
            waveform, samp_rate = librosa.load(file_name)
            tempo, beat_frames = librosa.beat.beat_track(waveform, samp_rate)

            # get the chroma number of the song
            beat_times = librosa.frames_to_time(beat_frames, samp_rate)
            y_harmonic, y_percussive = librosa.effects.hpss(waveform)
            chromagram = librosa.feature.chroma_cqt(y_harmonic, samp_rate)
            beat_chroma = librosa.util.sync(chromagram, beat_frames, aggregate = np.median)
            # make beat chroma into a DataFrame and calculate the diff
            chroma_df = pd.DataFrame(beat_chroma)
            diff_values = chroma_df.diff()
            diff_mean = diff_values.mean(axis = 0, skipna = True)
            chroma_num = sum(diff_mean) / len(diff_mean)

            # add song, tempo and chroma number to the features list
            print ([song_name, tempo, chroma_num])
            features.append([song_name, tempo, chroma_num])

In [6]:
# sort the features DataFrame alphabetically by song name
features_df = pd.DataFrame(features, columns = ['song', 'tempo', 'chroma_number'])
features_df = features_df.sort_values(by = ['song'])
features_df.reset_index(drop = True, inplace = True)

In [7]:
# move all the mp3 files that need to have features extracted into some folder
with open('missing.txt', 'w+') as f:
    for song in missing:
        f.write("%s\n" % song)
# use command line here
'''
mkdir missing
cat missing.txt | while read line
do
    cp $line missing
done
'''
# make sure to change the path in directory_feature_extraction

'\nmkdir missing\ncat missing.txt | while read line\ndo\n    cp $line missing\ndone\n'

In [8]:
# extract pyAudioAnalysis features
mid_term_window = 1
mid_term_step = 1
short_term_window = 0.05
short_term_step = 0.05
# use compute_beat = True if we want the extra beat features
pyaudio_feat, files, feat_names = MidTermFeatures.directory_feature_extraction(".", 
                                                                               mid_term_window, 
                                                                               mid_term_step, 
                                                                               short_term_window, 
                                                                               short_term_step,
                                                                               False)

In [20]:
# save pyAudioAnalysis features as neural network features
if len(features_df) == 1:
    # special case when only features from one song are extracted
    nn_features_df = pd.DataFrame([pyaudio_feat], columns = feat_names)
else:
    nn_features_df = pd.DataFrame(pyaudio_feat, columns = feat_names)
nn_features_df['song'] = features_df['song']

ValueError: Empty data passed with indices specified.

In [19]:
# get the basic features by indexing into the neural network feature DataFrame
zcr_ind = feat_names.index('zcr_mean')
ee_ind = feat_names.index('energy_entropy_mean')
spc_ind = feat_names.index('spectral_centroid_mean')

features_df['zero_crossing_rate'] = nn_features_df.iloc[:, zcr_ind]
features_df['energy_entropy'] = nn_features_df.iloc[:, ee_ind]
features_df['spectral_centroid'] = nn_features_df.iloc[:, spc_ind]

ValueError: 'zcr_mean' is not in list

In [11]:
# load the mood label data
moods = pd.read_csv("../data/mood_data.csv")

In [12]:
# merge the features with the mood label data
moods['song'] = moods['title'] + " - " + moods['artist']
# sort the column song to use it for merging
moods = moods.sort_values(by = ['song'])
moods.reset_index(drop = True, inplace = True)

new_features = features_df.merge(moods, on = "song", how = "left")
new_features = new_features.drop(columns = ['song'])
new_features

Unnamed: 0,tempo,chroma_number,title,artist,primary,secondary


In [13]:
# save the basic features, neural network features and the feature names
if not existing:
    new_features = new_features[['title', 'artist', 'tempo', 'chroma_number', 
                                 'zero_crossing_rate', 'energy_entropy', 
                                 'spectral_centroid', 'primary', 'secondary']]
    new_features.to_csv("../data/features.csv", header = True, index = False)
    
    nn_features_df.to_csv("../data/nn_features.csv", header = True, index = False)
    
    feat_names_df = pd.DataFrame(feat_names)
    feat_names_df.to_csv("../data/nn_feature_names.csv", header = True, index = False)
else:
    # concatenate the previous features with new features and overwrite the existing files
    prev_features = prev_features.drop(columns = ['song'])
    all_features = pd.concat([prev_features, new_features], ignore_index = True)
    all_features = all_features[['title', 'artist', 'tempo', 'chroma_number', 
                                 'zero_crossing_rate', 'energy_entropy', 
                                 'spectral_centroid', 'primary', 'secondary']]
    all_features.to_csv("../data/features.csv", header = True, index = False)
    
    prev_nn = pd.read_csv("../data/nn_features.csv")
    all_nn = pd.concat([prev_nn, nn_features_df], ignore_index = True)
    all_nn.to_csv("../data/nn_features.csv", header = True, index = False)

NameError: name 'nn_features_df' is not defined

In [None]:
# make sure to delete the directory missing and the text file missing.txt
'''
rm -r missing
rm missing.txt
'''

In [14]:
# feature engineering
feat_names_df = pd.read_csv("../data/nn_feature_names.csv")
current_features = pd.read_csv("../data/features.csv")
feats_to_extract = pd.read_csv("../data/nn_features.csv")
engineered_features = current_features[['title', 'artist', 'tempo', 'chroma_number']]

# check that the ordering of songs is the same
engineered_features['song'] = feats_to_extract['song']
engineered_features['song_check'] = engineered_features['title'] + ' - ' + engineered_features['artist']
if engineered_features['song'].equals(engineered_features['song_check']):
    print ("Song ordering matches.")
    engineered_features = engineered_features.drop(columns = ['song', 'song_check'])
else:
    print ("Song ordering DOES NOT match.")

# convert the feature names into a list
feat_names_list = feat_names_df['0'].to_list()
# selected 28 total features based on distribution boxplots by primary mood
selected_feats = ['zcr_mean', 'zcr_std', 'energy_mean', 'energy_entropy_mean', 'spectral_centroid_mean', 
                  'spectral_spread_mean', 'spectral_entropy_mean', 'mfcc_2_mean', 'mfcc_5_mean', 'mfcc_6_mean',
                  'spectral_centroid_std', 'spectral_entropy_std', 'spectral_spread_std', 'chroma_7_std',
                  'delta chroma_2_std', 'delta chroma_3_std', 'delta chroma_9_std', 'delta chroma_std_std',
                  'delta energy_std', 'delta mfcc_1_std', 'delta mfcc_3_std', 'delta mfcc_13_std',
                  'delta spectral_centroid_std', 'delta spectral_entropy_std', 'delta spectral_flux_std',
                  'delta spectral_spread_std']
for feat in selected_feats:
    engineered_features[feat] = feats_to_extract.iloc[:, feat_names_list.index(feat)]

# add mood labels to the final features dataframe
engineered_features['primary'] = current_features['primary']
engineered_features['secondary'] = current_features['secondary']

# save the engineered features
engineered_features.to_csv("../data/engineered_features.csv", header = True, index = False)
engineered_features

Song ordering matches.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  engineered_features['song'] = feats_to_extract['song']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  engineered_features['song_check'] = engineered_features['title'] + ' - ' + engineered_features['artist']


Unnamed: 0,title,artist,tempo,chroma_number,zcr_mean,zcr_std,energy_mean,energy_entropy_mean,spectral_centroid_mean,spectral_spread_mean,...,delta energy_std,delta mfcc_1_std,delta mfcc_3_std,delta mfcc_13_std,delta spectral_centroid_std,delta spectral_entropy_std,delta spectral_flux_std,delta spectral_spread_std,primary,secondary
0,a new adventure,takanashi yasuharu,117.453835,-0.010635,0.082014,0.031156,0.061664,3.164051,0.186062,0.201923,...,0.031983,0.619485,0.254529,0.189750,0.039689,0.304516,0.003464,0.026444,1,6
1,a oh,super junior,129.199219,-0.005782,0.078039,0.028662,0.087948,3.213386,0.179459,0.196234,...,0.031600,0.785050,0.345785,0.225282,0.035337,0.333457,0.002662,0.021355,3,2
2,a thousand years,christina perri,92.285156,-0.016321,0.044617,0.018092,0.052477,3.217903,0.122203,0.170703,...,0.016750,0.521221,0.241593,0.196299,0.028887,0.155287,0.002794,0.030025,6,5
3,adore u,seventeen,103.359375,-0.007745,0.058125,0.027802,0.113696,3.142738,0.145660,0.170424,...,0.064444,1.280268,0.469696,0.248865,0.035615,0.324714,0.007247,0.028047,3,2
4,after rain,aimer,117.453835,-0.010324,0.068620,0.026402,0.083934,3.211794,0.147982,0.169066,...,0.064776,1.005204,0.362243,0.220264,0.033298,0.300292,0.005283,0.025979,2,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,you he bu ke,xu song,99.384014,0.015339,0.056268,0.042850,0.073693,3.166332,0.156298,0.198088,...,0.041610,1.132257,0.402372,0.242564,0.071073,0.410712,0.009546,0.050956,2,4
196,youth,troye sivan,123.046875,-0.002242,0.056377,0.035249,0.081590,3.192966,0.140693,0.169824,...,0.036208,0.903736,0.388618,0.235086,0.039178,0.372678,0.005693,0.027476,5,6
197,zhe shi ai,"henry, donghae",151.999081,-0.020014,0.019520,0.004853,0.010187,3.202357,0.075014,0.148539,...,0.005241,1.118669,0.309914,0.191399,0.036687,0.037481,0.012192,0.058416,4,6
198,zui mei hun li,bai xiao bai,112.347147,0.022650,0.051512,0.039506,0.056785,3.168251,0.134493,0.169568,...,0.025834,1.111881,0.477303,0.251263,0.042643,0.410848,0.009373,0.031184,2,4
