In [1]:
import librosa
from librosa.feature import *
import numpy as np
import pickle
import pandas as pd
import os

In [2]:
#train_dir='/Users/nadimkawwa/Desktop/Udacity/MLEND/Capstone/nsynth-train/examples.json'
#df_train= pd.read_json(train_dir, orient='index')


In [3]:
# Folders holding the .wav clips. The trailing slash matters: later cells build
# each path by plain string concatenation (directory + file stem + '.wav').
train_dir = 'nsynthtest/audio/'
test_dir = 'valteest/audio/'


In [4]:
# Load the clip metadata; orient='index' makes every top-level JSON key a row
# label, and those labels are used further down as the audio file stems.
df_train = pd.read_json(path_or_buf='nsynthtest/examples.json', orient='index')

# Draw the same number of clips (140) from every instrument family so the
# classes are balanced.
# random_state pins the draw: without it each kernel restart picks different
# clips, so the pickled file list, the extracted features and the reported
# accuracy cannot be reproduced.
# Note: sample() raises ValueError if a family has fewer than 140 rows.
df_train_sample = df_train.groupby(
    'instrument_family', as_index=False, group_keys=False
).apply(lambda family_rows: family_rows.sample(n=140, random_state=42))

# drop synth_lead (family code 9) from the training dataset
df_train_sample = df_train_sample[df_train_sample['instrument_family'] != 9]

# save the sampled training file stems (the frame's index) as a list
filenames_train = df_train_sample.index.tolist()


In [5]:
# Persist the sampled training file stems so the selection can be reloaded later.
with open('filenames_train.pickle', 'wb') as pickle_out:
    pickle.dump(filenames_train, pickle_out)

In [6]:
# Load the metadata of the evaluation set; row labels are the audio file stems.
df_test = pd.read_json(path_or_buf='valteest/examples.json', orient='index')

# Draw the same number of clips (100) from every instrument family.
# random_state pins the draw so the evaluation set -- and therefore the
# reported accuracy -- is the same on every run.
# Note: sample() raises ValueError if a family has fewer than 100 rows.
df_test_sample = df_test.groupby(
    'instrument_family', as_index=False, group_keys=False
).apply(lambda family_rows: family_rows.sample(n=100, random_state=42))

# drop synth_lead (family code 9) from the test dataset
df_test_sample = df_test_sample[df_test_sample['instrument_family'] != 9]

# save the sampled test file stems (the frame's index) as a list
filenames_test = df_test_sample.index.tolist()


In [7]:
# Persist the sampled test file stems so the selection can be reloaded later.
with open('filenames_test.pickle', 'wb') as pickle_out:
    pickle.dump(filenames_test, pickle_out)

In [32]:
# Sanity check: show the first sampled test file stem
print(filenames_test[0])

bass_synthetic_009-079-075


In [9]:
# Number of clips per instrument family in the full (unsampled) training
# metadata, rarest family first
family_codes = df_train['instrument_family']
n_class_train = family_codes.value_counts(ascending=True)
n_class_train

10    141
2     180
5     202
7     235
1     269
8     306
6     502
3     652
4     766
0     843
Name: instrument_family, dtype: int64

In [10]:




def feature_extract(file):
    """
    Load one audio file and summarise it as a list of five features.

    Returns ``[harmonic, mfcc, spectrogram, chroma, contrast]``: ``harmonic``
    is a 0/1 flag, and each of the other four entries is the corresponding
    librosa feature matrix averaged along axis 1 (one mean value per row).
    """

    def _average_over_frames(matrix):
        # temporal averaging: collapse axis 1 so every row becomes one number
        return np.mean(matrix, axis=1)

    # get the wave representation and its sampling rate
    signal, rate = librosa.load(file)

    # split the signal into harmonic and percussive parts; the flag is 1 only
    # when the harmonic part has the strictly larger mean sample value
    # NOTE(review): both parts are raw waveforms, whose means are presumably
    # close to zero -- comparing their energies may be what was intended; confirm.
    harmonic_part, percussive_part = librosa.effects.hpss(signal)
    harmonic = 1 if np.mean(harmonic_part) > np.mean(percussive_part) else 0

    # 13 Mel-frequency cepstral coefficients (MFCCs)
    mfcc = _average_over_frames(
        librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=13))

    # mel-scaled spectrogram with 128 mel bands and fmax=8000
    spectrogram = _average_over_frames(
        librosa.feature.melspectrogram(y=signal, sr=rate, n_mels=128, fmax=8000))

    # chroma energy (CENS variant)
    chroma = _average_over_frames(
        librosa.feature.chroma_cens(y=signal, sr=rate))

    # spectral contrast
    contrast = _average_over_frames(
        librosa.feature.spectral_contrast(y=signal, sr=rate))

    return [harmonic, mfcc, spectrogram, chroma, contrast]



In [11]:
def instrument_code(filename):
    """
    Map a file name to the integer code of its instrument family.

    The family is read from the start of the base name, e.g.
    ``'bass_synthetic_009-079-075'`` -> ``0``. Matching only the leading part
    of the last path component means a directory name, or a family word that
    appears later in the name, cannot produce a wrong label.

    Parameters
    ----------
    filename : str
        File stem or path whose base name begins with a family name.

    Returns
    -------
    int or None
        Position of the family in ``class_names`` (0 = bass ... 10 = vocal),
        or ``None`` when the base name starts with none of the known families.
    """
    class_names = ['bass', 'brass', 'flute', 'guitar',
                   'keyboard', 'mallet', 'organ', 'reed',
                   'string', 'synth_lead', 'vocal']

    # only the last path component carries the family label
    base_name = os.path.basename(filename)

    for code, name in enumerate(class_names):
        if base_name.startswith(name):
            return code
    return None

In [12]:
# Extract the features of every sampled test clip, keyed by file stem.
# Each path is assembled as <test_dir><stem>.wav.
dict_test = {
    stem: feature_extract(test_dir + stem + '.wav')
    for stem in filenames_test
}

  return f(*args, **kwargs)


In [39]:
# Dump the raw feature dictionary for inspection
print(dict_test)

{'yy': [0, array([-149.26155 ,   83.02797 ,  -22.945787,   14.094036,   -7.547027,
         -5.87707 ,  -19.748907,  -12.250929,  -13.844024,  -11.492798,
        -16.253012,   -8.829573,  -13.172394], dtype=float32), array([5.56237958e-02, 3.39095592e-01, 5.77865243e-01, 5.14747739e-01,
       4.70993161e-01, 5.56067050e-01, 7.09420621e-01, 1.29193914e+00,
       5.34167814e+00, 2.52406788e+01, 2.15967388e+01, 8.12586594e+01,
       2.56101532e+02, 9.72670059e+01, 1.91081104e+01, 7.08177948e+00,
       1.04981709e+01, 7.79241800e-01, 8.03745747e+00, 3.42180099e+01,
       4.56784773e+00, 5.21278048e+00, 5.56383276e+00, 3.01170039e+00,
       1.88084183e+01, 5.81361923e+01, 9.67823944e+01, 6.62825632e+00,
       3.09934640e+00, 1.43462868e+01, 1.90643513e+00, 2.72759944e-01,
       4.42334080e+00, 4.55755281e+00, 4.88573253e-01, 3.94520313e-01,
       1.81751382e+00, 4.26723909e+00, 5.16951656e+00, 3.09264851e+01,
       5.17186451e+00, 3.48021537e-01, 1.40936404e-01, 4.43352401e-01,
 

In [13]:
# Turn the feature dictionary into a table: one row per clip (dict key -> index)
# and one column per entry of the list returned by feature_extract.
feature_columns = ['harmonic', 'mfcc', 'spectro', 'chroma', 'contrast']
features_test = pd.DataFrame.from_dict(dict_test, orient='index',
                                       columns=feature_columns)

features_test.head()

Unnamed: 0,harmonic,mfcc,spectro,chroma,contrast
bass_synthetic_009-079-075,0,"[-122.34592, 62.285767, -83.07544, 43.76254, -...","[2.80481, 0.70378816, 0.23788619, 0.107425004,...","[0.052054815, 0.10145206, 0.35925588, 0.057336...","[16.728972041515807, 13.248107994769276, 26.97..."
bass_synthetic_098-090-025,0,"[-547.52435, 55.12924, 43.079487, 37.349724, 3...","[223.35283, 10.05366, 4.931809, 2.0684614, 1.1...","[0.34441915, 0.4150822, 0.33813697, 0.32387918...","[28.319288604612524, 16.663368161077823, 17.30..."
bass_synthetic_098-094-075,0,"[-487.50003, 40.112736, 26.44976, 26.131557, 2...","[36.344902, 1.4068937, 0.7942378, 0.4261765, 0...","[0.37610948, 0.353459, 0.34760252, 0.3694944, ...","[27.45194159029113, 16.24250639391276, 16.4396..."
bass_synthetic_134-075-050,1,"[-549.8209, 23.202682, 12.579145, 5.1855855, 1...","[14.901428, 14.584876, 8.145181, 9.374405, 12....","[0.2808584, 0.39315486, 0.3459177, 0.575937, 0...","[13.183841308095062, 12.50340397102937, 31.566..."
bass_synthetic_068-053-100,1,"[-522.1147, 42.621834, 28.985846, 21.986275, 1...","[0.010214687, 0.015203913, 0.020220457, 0.0401...","[0.13153379, 0.11405232, 0.091060214, 0.077634...","[36.84559243497511, 24.131741614369986, 25.557..."


In [14]:
# Expand every array-valued column of features_test into one scalar column per
# element, then join the pieces back together on the shared index.

#extract mfccs: one column per coefficient, named mfcc_0, mfcc_1, ...
mfcc_test = pd.DataFrame(features_test.mfcc.values.tolist(),index=features_test.index)
mfcc_test = mfcc_test.add_prefix('mfcc_')

#extract spectro: one column per mel band, named spectro_0, spectro_1, ...
spectro_test = pd.DataFrame(features_test.spectro.values.tolist(),index=features_test.index)
spectro_test = spectro_test.add_prefix('spectro_')


#extract chroma: one column per chroma bin, named chroma_0, chroma_1, ...
chroma_test = pd.DataFrame(features_test.chroma.values.tolist(),index=features_test.index)
chroma_test = chroma_test.add_prefix('chroma_')


#extract contrast
contrast_test = pd.DataFrame(features_test.contrast.values.tolist(),index=features_test.index)
# NOTE(review): BUG -- the next line prefixes chroma_test, not contrast_test.
# The frame built on the previous line is overwritten and never used, so the
# spectral-contrast values are dropped and the chroma columns appear a second
# time under the names contrast_chroma_0, contrast_chroma_1, ...
# The intended right-hand side is presumably contrast_test.add_prefix('contrast_').
# That change alters the set of feature columns, so it has to be made
# identically wherever training and test features are built, and the
# classifier retrained, or the column sets will no longer match.
contrast_test = chroma_test.add_prefix('contrast_')

#drop the old array-valued columns; only the scalar 'harmonic' column remains
features_test = features_test.drop(labels=['mfcc', 'spectro', 'chroma', 'contrast'], axis=1)

#concatenate column-wise, keeping only index labels present in every piece
df_features_test=pd.concat([features_test, mfcc_test, spectro_test, chroma_test, contrast_test],
                           axis=1, join='inner')
df_features_test.head()

Unnamed: 0,harmonic,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,...,contrast_chroma_2,contrast_chroma_3,contrast_chroma_4,contrast_chroma_5,contrast_chroma_6,contrast_chroma_7,contrast_chroma_8,contrast_chroma_9,contrast_chroma_10,contrast_chroma_11
bass_synthetic_009-079-075,0,-122.345917,62.285767,-83.075439,43.762539,-60.098812,28.661697,-32.752171,27.26436,1.336832,...,0.359256,0.057337,0.05199,0.068528,0.201899,0.853911,0.214838,0.049852,0.044756,0.099251
bass_synthetic_098-090-025,0,-547.524353,55.129238,43.079487,37.349724,34.811089,31.781181,27.990141,24.011755,21.184132,...,0.338137,0.323879,0.313752,0.260379,0.267142,0.208942,0.211173,0.138874,0.152003,0.143321
bass_synthetic_098-094-075,0,-487.500031,40.112736,26.44976,26.131557,24.039185,22.207474,19.748316,18.223574,17.743853,...,0.347603,0.369494,0.286421,0.282833,0.285449,0.211463,0.192597,0.173287,0.175875,0.153185
bass_synthetic_134-075-050,1,-549.820923,23.202682,12.579145,5.185585,1.006346,-1.716701,-3.106157,-2.62185,0.542454,...,0.345918,0.575937,0.231747,0.2115,0.128066,0.119656,0.129938,0.100733,0.029278,0.005583
bass_synthetic_068-053-100,1,-522.114685,42.621834,28.985846,21.986275,14.194398,6.940434,1.723575,-2.210854,-5.064771,...,0.09106,0.077634,0.090206,0.829105,0.073468,0.04957,0.045302,0.053149,0.024841,0.022701


In [15]:
# Label every row with the instrument-family code derived from its index name.
targets_test = [instrument_code(stem) for stem in df_features_test.index.tolist()]

df_features_test['targets'] = targets_test

In [16]:
# Persist the labelled test feature table to a pickle file
with open('df_features_test.pickle', 'wb') as pickle_out:
    pickle.dump(df_features_test, pickle_out)

In [18]:

# Extract the features of every sampled training clip, keyed by file stem.
# Each path is assembled as <train_dir><stem>.wav.
dict_train = {
    stem: feature_extract(train_dir + stem + '.wav')
    for stem in filenames_train
}



  return f(*args, **kwargs)


In [19]:
# Turn the feature dictionary into a table: one row per clip (dict key -> index)
# and one column per entry of the list returned by feature_extract.
feature_columns = ['harmonic', 'mfcc', 'spectro', 'chroma', 'contrast']
features_train = pd.DataFrame.from_dict(dict_train, orient='index',
                                        columns=feature_columns)

features_train.head()

Unnamed: 0,harmonic,mfcc,spectro,chroma,contrast
bass_synthetic_135-092-050,0,"[-485.84177, 0.8434897, -19.254045, -5.3055143...","[0.008918692, 0.0056764935, 0.004181718, 0.003...","[0.0121302605, 0.014421118, 0.020809676, 0.010...","[19.79329947123137, 10.689203420792404, 12.674..."
bass_electronic_027-056-100,0,"[-504.2966, 29.54423, 22.420982, 20.972946, 15...","[0.007094477, 0.0899159, 0.45439562, 0.8673591...","[0.07902835, 0.058857437, 0.06159919, 0.057146...","[25.25703672735785, 34.022033165773315, 24.418..."
bass_synthetic_033-085-075,1,"[-483.3433, 15.648635, -5.2274065, -3.8826995,...","[244.15414, 0.58063084, 0.11752053, 0.05877696...","[0.20172556, 0.847199, 0.19311695, 0.12484613,...","[40.90615706901779, 12.375029892856066, 15.113..."
bass_synthetic_009-089-050,0,"[-156.17845, 41.982857, -90.059296, 48.828915,...","[2.272348, 0.3704648, 0.2776783, 0.27018183, 0...","[0.2437127, 0.27753097, 0.098094255, 0.1558468...","[15.369699020280304, 9.896876465371422, 15.203..."
bass_synthetic_009-081-025,0,"[-132.17172, 62.357918, -85.56836, 37.948715, ...","[2.4958384, 1.0291046, 0.43158412, 0.15645695,...","[0.062473282, 0.044664066, 0.10499538, 0.13286...","[14.908414159748634, 10.80758788785952, 13.085..."


In [20]:
# Expand every array-valued column of features_train into one scalar column per
# element, then join the pieces back together on the shared index.

#extract mfccs: one column per coefficient, named mfcc_0, mfcc_1, ...
mfcc_train = pd.DataFrame(features_train.mfcc.values.tolist(),
                          index=features_train.index)
mfcc_train = mfcc_train.add_prefix('mfcc_')

#extract spectro: one column per mel band, named spectro_0, spectro_1, ...
spectro_train = pd.DataFrame(features_train.spectro.values.tolist(),
                             index=features_train.index)
spectro_train = spectro_train.add_prefix('spectro_')


#extract chroma: one column per chroma bin, named chroma_0, chroma_1, ...
chroma_train = pd.DataFrame(features_train.chroma.values.tolist(),
                            index=features_train.index)
chroma_train = chroma_train.add_prefix('chroma_')


#extract contrast
contrast_train = pd.DataFrame(features_train.contrast.values.tolist(),
                              index=features_train.index)
# NOTE(review): BUG -- the next line prefixes chroma_train, not contrast_train.
# The frame built just above is overwritten and never used, so the
# spectral-contrast values are dropped and the chroma columns appear a second
# time under the names contrast_chroma_0, contrast_chroma_1, ...
# The intended right-hand side is presumably contrast_train.add_prefix('contrast_').
# That change alters the set of feature columns, so it has to be made
# identically wherever training and test features are built, and the
# classifier retrained, or the column sets will no longer match.
contrast_train = chroma_train.add_prefix('contrast_')

#drop the old array-valued columns; only the scalar 'harmonic' column remains
features_train = features_train.drop(labels=['mfcc', 'spectro', 'chroma', 'contrast'], axis=1)

#concatenate column-wise, keeping only index labels present in every piece
df_features_train=pd.concat([features_train, mfcc_train, spectro_train, chroma_train, contrast_train],
                           axis=1, join='inner')
df_features_train.head()

Unnamed: 0,harmonic,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,...,contrast_chroma_2,contrast_chroma_3,contrast_chroma_4,contrast_chroma_5,contrast_chroma_6,contrast_chroma_7,contrast_chroma_8,contrast_chroma_9,contrast_chroma_10,contrast_chroma_11
bass_synthetic_135-092-050,0,-485.841766,0.84349,-19.254045,-5.305514,12.100674,14.165333,-9.482265,-16.57358,2.888299,...,0.02081,0.010405,0.013814,0.013185,0.013185,0.023054,0.986375,0.009001,0.013185,0.013185
bass_electronic_027-056-100,0,-504.2966,29.54423,22.420982,20.972946,15.76256,11.044422,7.210026,2.926386,-0.535751,...,0.061599,0.057147,0.061404,0.058633,0.108065,0.125011,0.840755,0.105918,0.060827,0.054683
bass_synthetic_033-085-075,1,-483.343292,15.648635,-5.227407,-3.882699,3.17029,11.608503,21.487236,19.765547,3.775088,...,0.193117,0.124846,0.0918,0.070244,0.060657,0.049596,0.022345,0.038611,0.032108,0.04596
bass_synthetic_009-089-050,0,-156.178452,41.982857,-90.059296,48.828915,-44.965481,57.052864,-24.577089,16.342007,-26.571112,...,0.098094,0.155847,0.274611,0.786843,0.293934,0.097031,0.138468,0.076386,0.057799,0.068974
bass_synthetic_009-081-025,0,-132.171722,62.357918,-85.568359,37.948715,-58.481983,32.785423,-18.783962,36.299557,14.9614,...,0.104995,0.132863,0.349546,0.075612,0.052912,0.081602,0.269707,0.818974,0.255519,0.056501


In [21]:
# Derive the instrument-family code of every training row from its index name.
targets_train = [instrument_code(stem) for stem in df_features_train.index.tolist()]

In [22]:
# Attach the labels as an extra column of the training feature table ...
df_features_train['targets'] = targets_train

# ... and persist the labelled table to a pickle file
with open('df_features_train.pickle', 'wb') as pickle_out:
    pickle.dump(df_features_train, pickle_out)

In [25]:
from sklearn.ensemble import RandomForestClassifier

# Split both feature tables into model inputs (every column except 'targets')
# and labels (the 'targets' column).
X_train = df_features_train.drop(labels=['targets'], axis=1)
y_train = df_features_train['targets']

X_test = df_features_test.drop(labels=['targets'], axis=1)
y_test = df_features_test['targets']

# A small forest: 10 trees, each at most 20 levels deep.
# random_state fixes the bootstrap samples and per-split feature draws, so the
# fitted model and the accuracy printed below are the same on every run.
clf_Rf = RandomForestClassifier(n_estimators=10, max_depth=20, random_state=42)
clf_Rf.fit(X_train, y_train)
y_pred_RF = clf_Rf.predict(X_test)

# accuracy = share of test rows whose predicted family equals the true one
accuracy_RF = np.mean(y_pred_RF == y_test)
print("The accuracy of Random Forest is {0:.2%}".format(accuracy_RF))

The accuracy of Random Forest is 78.30%


In [29]:
# Persist the fitted random forest so it can be reused without retraining
with open("clf_Rf.pickle", mode='wb') as model_out:
    pickle.dump(clf_Rf, model_out)

In [37]:
# Run the feature pipeline on one stand-alone recording, stored under the key 'yy'.
# Careful: this rebinds dict_test and features_test, replacing the test-set
# versions built by the earlier cells.
features = feature_extract('trim.wav')
dict_test = {'yy': features}

feature_columns = ['harmonic', 'mfcc', 'spectro', 'chroma', 'contrast']
features_test = pd.DataFrame.from_dict(dict_test, orient='index',
                                       columns=feature_columns)

features_test.head()


Unnamed: 0,harmonic,mfcc,spectro,chroma,contrast
yy,0,"[-149.26155, 83.02797, -22.945787, 14.094036, ...","[0.055623796, 0.3390956, 0.57786524, 0.5147477...","[0.1089567, 0.11629038, 0.33677846, 0.5442161,...","[23.86902669066053, 21.88928457711498, 25.0683..."


In [38]:
# Expand the array-valued columns of the single-recording feature table into
# scalar columns, then classify the recording with the fitted random forest.
# Careful: this rebinds df_features_test, targets_test, X_test and y_pred_RF,
# replacing the test-set versions built by the earlier cells.

#extract mfccs: one column per coefficient, named mfcc_0, mfcc_1, ...
mfcc_test = pd.DataFrame(features_test.mfcc.values.tolist(),index=features_test.index)
mfcc_test = mfcc_test.add_prefix('mfcc_')

#extract spectro: one column per mel band, named spectro_0, spectro_1, ...
spectro_test = pd.DataFrame(features_test.spectro.values.tolist(),index=features_test.index)
spectro_test = spectro_test.add_prefix('spectro_')


#extract chroma: one column per chroma bin, named chroma_0, chroma_1, ...
chroma_test = pd.DataFrame(features_test.chroma.values.tolist(),index=features_test.index)
chroma_test = chroma_test.add_prefix('chroma_')


#extract contrast
contrast_test = pd.DataFrame(features_test.contrast.values.tolist(),index=features_test.index)
# NOTE(review): BUG -- the next line prefixes chroma_test, not contrast_test.
# The frame built on the previous line is overwritten and never used, so the
# spectral-contrast values are dropped and the chroma columns appear a second
# time under the names contrast_chroma_0, contrast_chroma_1, ...
# The intended right-hand side is presumably contrast_test.add_prefix('contrast_').
# That change alters the set of feature columns, so it has to be made
# identically wherever training and test features are built, and the
# classifier retrained, or the column sets will no longer match.
contrast_test = chroma_test.add_prefix('contrast_')

#drop the old array-valued columns; only the scalar 'harmonic' column remains
features_test = features_test.drop(labels=['mfcc', 'spectro', 'chroma', 'contrast'], axis=1)

#concatenate column-wise, keeping only index labels present in every piece
df_features_test=pd.concat([features_test, mfcc_test, spectro_test, chroma_test, contrast_test],
                           axis=1, join='inner')
# not the last expression of the cell, so this head() call displays nothing
df_features_test.head()
# The index label here is 'yy', which contains no instrument-family name, so
# instrument_code returns None for it; the 'targets' column is added below and
# dropped again before predicting, so it does not influence the result.
targets_test = []
for name in df_features_test.index.tolist():
    targets_test.append(instrument_code(name))

df_features_test['targets'] = targets_test
X_test = df_features_test.drop(labels=['targets'], axis=1)
# predicted instrument-family code(s) for the recording
y_pred_RF = clf_Rf.predict(X_test)
print(y_pred_RF)

[0]


Jesse Engel, Cinjon Resnick, Adam Roberts, Sander Dieleman, Douglas Eck,
  Karen Simonyan, and Mohammad Norouzi. "Neural Audio Synthesis of Musical Notes
  with WaveNet Autoencoders." 2017.