In [1]:
# Import packages
import numpy as np
import pandas as pd

import librosa
import librosa.display as ld

import glob

import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('instruments.csv')
df.head()

Unnamed: 0,fname,label
0,5388d14d.wav,Saxophone
1,c685f05f.wav,Saxophone
2,36d20ab5.wav,Saxophone
3,d6665734.wav,Saxophone
4,7352e28f.wav,Saxophone


In [3]:
class Config:
    sr = 44100
    random_state = 42

In [17]:
def extract_mfcc(signal, sr):
    mfccs = librosa.feature.mfcc(y=signal, sr=sr)
    aver = np.mean(mfccs, axis = 1)
    feature = aver.reshape(20)
    return feature

def extract_melspectrogram(signal, sr):
    spectrogram = librosa.feature.melspectrogram(signal)
    spectrogram = librosa.power_to_db(spectrogram)
    spectrogram = spectrogram.astype(np.float32)
    spectrogram = np.mean(spectrogram, axis=1)
    return spectrogram

def extract(row):
    signal, sr = librosa.load(f'wavfiles/{row[0]}', Config.sr)
    x1 = extract_mfcc(signal, sr)
    x2 = extract_melspectrogram(signal, sr)
    return x1,x2

df['mfcc'] = zip(*df.apply(extract, result_type='expand'))

In [18]:
df.head(5)

Unnamed: 0,fname,label,mfcc,melspectrogram
0,5388d14d.wav,Saxophone,"(-272.33786, 58.306755, -47.943787, 7.301422, ...","(-2.7999992, -25.17658, -28.684496, -28.723795..."
1,c685f05f.wav,Saxophone,"(-287.33243, 101.56712, -29.987404, 20.038153,...","(2.3896997, -17.04163, -16.49534, -13.215973, ..."
2,36d20ab5.wav,Saxophone,"(-238.68503, 153.41728, -26.39078, 26.09202, 1...","(-9.527214, -15.543957, 10.565876, 24.367052, ..."
3,d6665734.wav,Saxophone,"(-351.35153, 166.21623, -41.64548, 4.0183616, ...","(-33.780457, -23.965872, -6.249142, 3.50676, 5..."
4,7352e28f.wav,Saxophone,"(-315.89813, 146.70885, -78.09529, -38.58948, ...","(-33.47275, -35.72336, -40.216938, -40.49374, ..."


In [62]:
len(df.mfcc[1]), len(df.melspectrogram[1])

(20, 128)

In [59]:
def get_X_y(data, feature_col, label_col=1):
    data = df.apply(lambda x: np.append(x[feature_col], x[label_col]), axis=1, result_type='expand')
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    return X, y

In [60]:
from sklearn.decomposition import PCA
# Apply PCA for MFCC feature
X, y = get_X_y(df, 2)
for n_components in [5, 10, 15, 20]:
    pca = PCA(n_components=n_components)
    principalComponents = pca.fit_transform(X)
    print(f'Explained variance for n_components={n_components} is {pca.explained_variance_ratio_.sum()}')

Explained variance for n_components=5 is 0.9373442087910978
Explained variance for n_components=10 is 0.9849497380888347
Explained variance for n_components=15 is 0.9975582644786162
Explained variance for n_components=20 is 0.9999999999999999


In [63]:
# Apply PCA for melspectrogram
X, y = get_X_y(df, 3)
for n_components in [5, 10, 15, 20, 30, 40, 50]:
    pca = PCA(n_components=n_components)
    principalComponents = pca.fit_transform(X)
    print(f'Explained variance for n_components={n_components} is {pca.explained_variance_ratio_.sum()}')

Explained variance for n_components=5 is 0.8349438099730696
Explained variance for n_components=10 is 0.8887424585811609
Explained variance for n_components=15 is 0.923406941473843
Explained variance for n_components=20 is 0.9469283337049852
Explained variance for n_components=30 is 0.9740396240628562
Explained variance for n_components=40 is 0.9865471004601114
Explained variance for n_components=50 is 0.9928228321331468


From the above analysis the number of pca components for
* MelSpectrogram feature can be choosen as 40
* MFCC feature can be choosen as 15