In [4]:
#Necessary imports for the feature_extraction
import os
import librosa
import numpy as np
import pandas as pd
import soundfile as sf
from pydub import AudioSegment

#PyAudioAnalysis
import pyAudioAnalysis as pyaudio
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import MidTermFeatures

In [2]:
# feature extraction data structures
features = []
existing_features = []
missing_features = []
previous_features = None

In [3]:
# check and save existing basic features
# check and save existing basic features
if os.path.exists("../data/features.csv"):
    previous_features = pd.read_csv("../data/features.csv")
    # drop all rows with any number of NAs
    previous_features.dropna(how = 'any')
    previous_features['song'] = previous_features['title'] + " - " + previous_features['artist']
    existing = previous_features['song'].tolist()

In [4]:
# feature extraction notebook should be in the same folder as the mp3s
for song in os.scandir("C:\\University\\Applied Machine Learning\\Project AML\\AppliedMachineLearning\\Main Project\\ProjectMusic"):
    if song.path.endswith(".mp3") and song.is_file():
        file_name = song.path[2:]
        song_name = file_name[98:len(file_name) - 4 ]

        if song_name not in existing_features:
            # add it to the missing list so we can use pyAudioAnalysis on it 
            missing_features.append(file_name)

            #tempo of the song
            waveform, sample_rate = librosa.load(file_name)
            tempo, beat_frames = librosa.beat.beat_track(y=waveform, sr=sample_rate)
             # get the chroma number of the song
            beat_times = librosa.frames_to_time(beat_frames, sr=sample_rate)
            y_harmonic, y_percussive = librosa.effects.hpss(waveform)
            chromagram = librosa.feature.chroma_cqt(y=y_harmonic, sr=sample_rate)
            beat_chroma = librosa.util.sync(chromagram, beat_frames, aggregate = np.median)
            # make beat chroma into a DataFrame and calculate the diff
            chroma_df = pd.DataFrame(beat_chroma)
            diff_values = chroma_df.diff()
            diff_mean = diff_values.mean(axis = 0, skipna = True)
            chroma_num = sum(diff_mean) / len(diff_mean)

            # add song, tempo and chroma number to the features list
            print ([song_name, tempo, chroma_num])
            features.append([song_name, tempo, chroma_num])

['boy pablo - i hope she loves me back', 172.265625, -0.014764327699690405]
['Chase Shakur - Pray Slow', 80.74951171875, -0.0004073584414761695]
['Colin Stetson - Reborn', 143.5546875, -0.009726424886046749]
['Ethan Fields - Roll', 112.34714673913044, 0.013494097133865936]
['Home - 12', 89.10290948275862, 0.04064002817880582]
['Home - Byzantium', 135.99917763157896, 0.010142184480126814]
['Home - Drawing The Blinds', 89.10290948275862, -0.01080438335652505]
['Home - Head First', 135.99917763157896, -0.009489427418101144]
['Home - Intro', 99.38401442307692, -0.044663804797321814]
['Home - Resonance', 112.34714673913044, -0.02312311655040872]
['ITG Studios - Gravitational Forces', 135.99917763157896, -0.004196563845860897]
['Jakob - Sunshine Girl (Pt. 2)', 75.99954044117646, -0.015666099043745715]
['Jakob Ogawa - All Your Love', 86.1328125, -0.012112732377585773]
['Jakob Ogawa - You Might Be Sleeping (w. Clairo)', 92.28515625, -0.0034540407026909543]
['Jamie McDermott - In This Shirt', 1

In [5]:
len(features)

49

In [6]:
features_df = pd.DataFrame(features, columns = ['song', 'tempo', 'chroma_number'])

In [7]:
features_df = features_df.sort_values(by = ['song'])

In [8]:
features_df.reset_index(drop = True, inplace = True)

In [9]:
features_df

Unnamed: 0,song,tempo,chroma_number
0,Chase Shakur - Pray Slow,80.749512,-0.000407
1,Colin Stetson - Reborn,143.554688,-0.009726
2,Ethan Fields - Roll,112.347147,0.013494
3,Home - 12,89.102909,0.04064
4,Home - Byzantium,135.999178,0.010142
5,Home - Drawing The Blinds,89.102909,-0.010804
6,Home - Head First,135.999178,-0.009489
7,Home - Intro,99.384014,-0.044664
8,Home - Resonance,112.347147,-0.023123
9,ITG Studios - Gravitational Forces,135.999178,-0.004197


In [10]:
mid_term_window = 1
mid_term_step = 1
short_term_window = 0.05
short_term_step = 0.05
# use compute_beat = True if we want the extra beat features
pyaudio_feat, files, feat_names = MidTermFeatures.directory_feature_extraction("C:\\University\\Applied Machine Learning\\Project AML\\AppliedMachineLearning\\Main Project\\ProjectMusic", 
                                                                               mid_term_window, 
                                                                               mid_term_step, 
                                                                               short_term_window, 
                                                                               short_term_step,
                                                                               False)

Analyzing file 1 of 49: C:\University\Applied Machine Learning\Project AML\AppliedMachineLearning\Main Project\ProjectMusic\Chase Shakur - Pray Slow.mp3
Analyzing file 2 of 49: C:\University\Applied Machine Learning\Project AML\AppliedMachineLearning\Main Project\ProjectMusic\Colin Stetson - Reborn.mp3
Analyzing file 3 of 49: C:\University\Applied Machine Learning\Project AML\AppliedMachineLearning\Main Project\ProjectMusic\Ethan Fields - Roll.mp3
Analyzing file 4 of 49: C:\University\Applied Machine Learning\Project AML\AppliedMachineLearning\Main Project\ProjectMusic\Home - 12.mp3
Analyzing file 5 of 49: C:\University\Applied Machine Learning\Project AML\AppliedMachineLearning\Main Project\ProjectMusic\Home - Byzantium.mp3
Analyzing file 6 of 49: C:\University\Applied Machine Learning\Project AML\AppliedMachineLearning\Main Project\ProjectMusic\Home - Drawing The Blinds.mp3
Analyzing file 7 of 49: C:\University\Applied Machine Learning\Project AML\AppliedMachineLearning\Main Project\

In [11]:
if len(features_df) == 1:
    # special case when only features from one song are extracted
    nn_features_df = pd.DataFrame([pyaudio_feat], columns = feat_names)
else:
    nn_features_df = pd.DataFrame(pyaudio_feat, columns = feat_names)
nn_features_df['song'] = features_df['song']

In [12]:
nn_features_df.head(5)

Unnamed: 0,zcr_mean,energy_mean,energy_entropy_mean,spectral_centroid_mean,spectral_spread_mean,spectral_entropy_mean,spectral_flux_mean,spectral_rolloff_mean,mfcc_1_mean,mfcc_2_mean,...,delta chroma_5_std,delta chroma_6_std,delta chroma_7_std,delta chroma_8_std,delta chroma_9_std,delta chroma_10_std,delta chroma_11_std,delta chroma_12_std,delta chroma_std_std,song
0,0.022947,0.10652,3.149726,0.100529,0.163562,0.141208,0.010873,0.0328,-25.234294,2.963321,...,0.014283,0.009444,0.054343,0.003779,0.005269,0.010572,0.022736,0.004293,0.017441,Chase Shakur - Pray Slow
1,0.040007,0.038373,3.212871,0.111733,0.161664,0.266827,0.003898,0.054467,-24.885468,2.254199,...,0.00885,0.014541,0.018126,0.004267,0.002583,0.007599,0.010553,0.004186,0.007903,Colin Stetson - Reborn
2,0.031987,0.065058,3.123152,0.125037,0.177614,0.224604,0.012203,0.040521,-24.533269,2.750968,...,0.016014,0.00804,0.074586,0.002541,0.014231,0.013226,0.026769,0.01361,0.021303,Ethan Fields - Roll
3,0.010608,0.072424,3.085242,0.072542,0.147221,0.03932,0.009767,0.011183,-25.83394,3.845136,...,0.021134,0.003032,0.043249,0.001762,0.001266,0.011131,0.036833,0.0007,0.008889,Home - 12
4,0.038684,0.049714,3.150813,0.1193,0.156493,0.378989,0.010054,0.063881,-24.04681,1.263334,...,0.010845,0.001922,0.084427,0.003541,0.00804,0.010516,0.050022,0.000929,0.020577,Home - Byzantium


In [13]:
# get the basic features by indexing into the neural network feature DataFrame
zcr_ind = feat_names.index('zcr_mean')
ee_ind = feat_names.index('energy_entropy_mean')
spc_ind = feat_names.index('spectral_centroid_mean')

features_df['zero_crossing_rate'] = nn_features_df.iloc[:, zcr_ind]
features_df['energy_entropy'] = nn_features_df.iloc[:, ee_ind]
features_df['spectral_centroid'] = nn_features_df.iloc[:, spc_ind]

In [14]:
features_df

Unnamed: 0,song,tempo,chroma_number,zero_crossing_rate,energy_entropy,spectral_centroid
0,Chase Shakur - Pray Slow,80.749512,-0.000407,0.022947,3.149726,0.100529
1,Colin Stetson - Reborn,143.554688,-0.009726,0.040007,3.212871,0.111733
2,Ethan Fields - Roll,112.347147,0.013494,0.031987,3.123152,0.125037
3,Home - 12,89.102909,0.04064,0.010608,3.085242,0.072542
4,Home - Byzantium,135.999178,0.010142,0.038684,3.150813,0.1193
5,Home - Drawing The Blinds,89.102909,-0.010804,0.041825,3.138335,0.123426
6,Home - Head First,135.999178,-0.009489,0.035582,3.096984,0.100086
7,Home - Intro,99.384014,-0.044664,0.02314,3.178904,0.075927
8,Home - Resonance,112.347147,-0.023123,0.019254,3.137959,0.078118
9,ITG Studios - Gravitational Forces,135.999178,-0.004197,0.017798,3.040058,0.093822


In [15]:
moods = pd.read_csv("./mood_data_main.csv")

In [16]:
moods['song'] = moods['artist'] + " - " + moods['title']
# sort the column song to use it for merging
moods = moods.sort_values(by = ['song'])
moods.reset_index(drop = True, inplace = True)

In [17]:
moods["song"]

0                            Chase Shakur - Pray Slow
1                              Colin Stetson - Reborn
2                                 Ethan Fields - Roll
3                                           Home - 12
4                                    Home - Byzantium
5                           Home - Drawing The Blinds
6                                   Home - Head First
7                                        Home - Intro
8                                    Home - Resonance
9                  ITG Studios - Gravitational Forces
10                      Jakob - Sunshine Girl (Pt. 2)
11                        Jakob Ogawa - All Your Love
12    Jakob Ogawa - You Might Be Sleeping (w. Clairo)
13                    Jamie McDermott - In This Shirt
14                    Joy Again - Looking Out For You
15                      Kali Uchis - Feel Like A Fool
16                             Kali Uchis - Telepatia
17         Kali Uchis - i want war (BUT I NEED PEACE)
18                          

In [18]:
moods.head(5)

Unnamed: 0,title,artist,primary,secondary,song
0,Pray Slow,Chase Shakur,4,5,Chase Shakur - Pray Slow
1,Reborn,Colin Stetson,1,4,Colin Stetson - Reborn
2,Roll,Ethan Fields,2,3,Ethan Fields - Roll
3,12,Home,4,5,Home - 12
4,Byzantium,Home,1,3,Home - Byzantium


In [19]:
len(features_df)

49

In [20]:
new_features = features_df.merge(moods, on = "song", how = "left")
new_features = new_features.drop(columns = ['song'])
new_features

Unnamed: 0,tempo,chroma_number,zero_crossing_rate,energy_entropy,spectral_centroid,title,artist,primary,secondary
0,80.749512,-0.000407,0.022947,3.149726,0.100529,Pray Slow,Chase Shakur,4,5
1,143.554688,-0.009726,0.040007,3.212871,0.111733,Reborn,Colin Stetson,1,4
2,112.347147,0.013494,0.031987,3.123152,0.125037,Roll,Ethan Fields,2,3
3,89.102909,0.04064,0.010608,3.085242,0.072542,12,Home,4,5
4,135.999178,0.010142,0.038684,3.150813,0.1193,Byzantium,Home,1,3
5,89.102909,-0.010804,0.041825,3.138335,0.123426,Drawing The Blinds,Home,2,4
6,135.999178,-0.009489,0.035582,3.096984,0.100086,Head First,Home,1,2
7,99.384014,-0.044664,0.02314,3.178904,0.075927,Intro,Home,2,4
8,112.347147,-0.023123,0.019254,3.137959,0.078118,Resonance,Home,2,5
9,135.999178,-0.004197,0.017798,3.040058,0.093822,Gravitational Forces,ITG Studios,1,6


In [21]:
# save the basic features, neural network features and the feature names
if not existing_features:
    new_features = new_features[['title', 'artist', 'tempo', 'chroma_number', 
                                 'zero_crossing_rate', 'energy_entropy', 
                                 'spectral_centroid', 'primary', 'secondary']]
    new_features.to_csv("../data/features.csv", header = True, index = False)
    
    nn_features_df.to_csv("../data/nn_features.csv", header = True, index = False)
    
    feat_names_df = pd.DataFrame(feat_names)
    feat_names_df.to_csv("../data/nn_feature_names.csv", header = True, index = False)
else:
    # concatenate the previous features with new features and overwrite the existing files
    previous_features = previous_features.drop(columns = ['song'])
    all_features = pd.concat([previous_features, new_features], ignore_index = True)
    all_features = all_features[['title', 'artist', 'tempo', 'chroma_number', 
                                 'zero_crossing_rate', 'energy_entropy', 
                                 'spectral_centroid', 'primary', 'secondary']]
    all_features.to_csv("../data/features.csv", header = True, index = False)
    
    prev_nn = pd.read_csv("../data/nn_features.csv")
    all_nn = pd.concat([prev_nn, nn_features_df], ignore_index = True)
    all_nn.to_csv("../data/nn_features.csv", header = True, index = False)

In [1]:
engineered_features

NameError: name 'engineered_features' is not defined

In [5]:
# feature engineering
feat_names_df = pd.read_csv("../data/nn_feature_names.csv")
current_features = pd.read_csv("../data/features.csv")
feats_to_extract = pd.read_csv("../data/nn_features.csv")
engineered_features = current_features[['title', 'artist', 'tempo', 'chroma_number']]

# check that the ordering of songs is the same
engineered_features['song'] = feats_to_extract['song']
engineered_features['song_check'] = engineered_features['artist'] + ' - ' + engineered_features['title']
if engineered_features['song'].equals(engineered_features['song_check']):
    print ("Song ordering matches.")
    engineered_features = engineered_features.drop(columns = ['song', 'song_check'])
else:
    print ("Song ordering DOES NOT match.")

# convert the feature names into a list
feat_names_list = feat_names_df['0'].to_list()
# selected 28 total features based on distribution boxplots by primary mood
selected_feats = ['zcr_mean', 'zcr_std', 'energy_mean', 'energy_entropy_mean', 'spectral_centroid_mean', 
                  'spectral_spread_mean', 'spectral_entropy_mean', 'mfcc_2_mean', 'mfcc_5_mean', 'mfcc_6_mean',
                  'spectral_centroid_std', 'spectral_entropy_std', 'spectral_spread_std', 'chroma_7_std',
                  'delta chroma_2_std', 'delta chroma_3_std', 'delta chroma_9_std', 'delta chroma_std_std',
                  'delta energy_std', 'delta mfcc_1_std', 'delta mfcc_3_std', 'delta mfcc_13_std',
                  'delta spectral_centroid_std', 'delta spectral_entropy_std', 'delta spectral_flux_std',
                  'delta spectral_spread_std']
for feat in selected_feats:
    engineered_features[feat] = feats_to_extract.iloc[:, feat_names_list.index(feat)]

# add mood labels to the final features dataframe
engineered_features['primary'] = current_features['primary']
engineered_features['secondary'] = current_features['secondary']

# save the engineered features
engineered_features.to_csv("../data/engineered_features.csv", header = True, index = False)
engineered_features

Song ordering matches.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  engineered_features['song'] = feats_to_extract['song']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  engineered_features['song_check'] = engineered_features['artist'] + ' - ' + engineered_features['title']


Unnamed: 0,title,artist,tempo,chroma_number,zcr_mean,zcr_std,energy_mean,energy_entropy_mean,spectral_centroid_mean,spectral_spread_mean,...,delta energy_std,delta mfcc_1_std,delta mfcc_3_std,delta mfcc_13_std,delta spectral_centroid_std,delta spectral_entropy_std,delta spectral_flux_std,delta spectral_spread_std,primary,secondary
0,Pray Slow,Chase Shakur,80.749512,-0.000407,0.022947,0.017329,0.10652,3.149726,0.100529,0.163562,...,0.059472,1.00596,0.403339,0.224575,0.045946,0.175943,0.013225,0.048587,4,5
1,Reborn,Colin Stetson,143.554688,-0.009726,0.040007,0.007754,0.038373,3.212871,0.111733,0.161664,...,0.007091,0.362496,0.262763,0.193761,0.022964,0.072439,0.002462,0.035453,1,4
2,Roll,Ethan Fields,112.347147,0.013494,0.031987,0.021661,0.065058,3.123152,0.125037,0.177614,...,0.048894,1.808481,0.578886,0.274285,0.0535,0.287151,0.014551,0.041004,2,3
3,12,Home,89.102909,0.04064,0.010608,0.007553,0.072424,3.085242,0.072542,0.147221,...,0.012299,1.372463,0.434742,0.180317,0.035339,0.091205,0.019458,0.045563,4,5
4,Byzantium,Home,135.999178,0.010142,0.038684,0.01811,0.049714,3.150813,0.1193,0.156493,...,0.037722,1.357249,0.484079,0.219621,0.048634,0.391317,0.013648,0.04337,1,3
5,Drawing The Blinds,Home,89.102909,-0.010804,0.041825,0.017222,0.029655,3.138335,0.123426,0.169007,...,0.031561,1.0967,0.408545,0.244143,0.035429,0.240243,0.01003,0.029254,2,4
6,Head First,Home,135.999178,-0.009489,0.035582,0.016668,0.042764,3.096984,0.100086,0.146246,...,0.042732,1.180623,0.503945,0.218634,0.029297,0.231996,0.009717,0.043519,1,2
7,Intro,Home,99.384014,-0.044664,0.02314,0.003755,0.034831,3.178904,0.075927,0.141992,...,0.010922,0.860971,0.299406,0.226869,0.030645,0.027316,0.008715,0.05439,2,4
8,Resonance,Home,112.347147,-0.023123,0.019254,0.007839,0.113506,3.137959,0.078118,0.145261,...,0.061921,0.932585,0.486303,0.201469,0.03282,0.078871,0.012369,0.057298,2,5
9,Gravitational Forces,ITG Studios,135.999178,-0.004197,0.017798,0.007263,0.045727,3.040058,0.093822,0.165921,...,0.02014,1.098646,0.287604,0.184712,0.031785,0.081171,0.007262,0.045157,1,6
