In [None]:
import numpy as np
import pandas as pd

import librosa
import librosa.display as ld

import glob

import matplotlib.pyplot as plt

Read the CSV file that maps each audio file to its instrument label.

In [None]:
# Load the audio-file -> instrument-label table and preview its first rows.
df = pd.read_csv('instruments.csv')
df.head()

Get the number of recordings available for each type of instrument.

In [None]:
# Tally the recordings per instrument label (most frequent label first).
instrument_types = df.label.value_counts()
print(instrument_types)

In [None]:
# Distinct instrument labels, in order of first appearance in the table.
instruments = df.label.unique()
instruments

System-wide configuration such as sampling rate.

In [None]:
class Config:
    """Notebook-wide settings read by the loading, sampling and splitting code."""

    # Seed handed to the random sampling / train-test split calls.
    random_state = 42
    # Target sampling rate passed to librosa.load for every recording.
    sr = 44_100

Extract the MFCCs of one sample recording per instrument type and plot them.

In [None]:
def extract_mfcc(file):
    """Load an audio file and compute its MFCC matrix.

    Parameters
    ----------
    file : str or path-like
        Path of the recording to load.

    Returns
    -------
    np.ndarray
        The array returned by ``librosa.feature.mfcc`` for the loaded signal.
    """
    # `sr` has to be passed by keyword: librosa >= 0.10 accepts only the path
    # positionally, so `librosa.load(file, Config.sr)` raises TypeError there.
    signal, sr = librosa.load(file, sr=Config.sr)
    return librosa.feature.mfcc(y=signal, sr=sr)

In [None]:
# One MFCC matrix per instrument, taken from the first row carrying that label
# (column 0 of that row is used as the file name inside wavfiles/).
mfccs = [
    extract_mfcc(f'wavfiles/{df.loc[df["label"] == name].iloc[0, 0]}')
    for name in instruments
]

In [None]:
# Lay the panels out on a grid with a fixed number of columns. The row count is
# derived from the number of instruments (never fewer than the original two
# rows), so extra instruments get their own panel instead of overdrawing one.
n_cols = 5
n_rows = max(2, -(-len(instruments) // n_cols))  # -(-a // b) == ceil(a / b)
fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, sharex=False,
                       sharey=True, figsize=(20, 2.5 * n_rows))
fig.suptitle('Mel Frequency Cepstrum Coefficients', size=16)
# ax.flat walks the grid row by row, matching the order of `instruments` and
# `mfccs`; no manual row/column arithmetic and no reuse of the loop index.
for axis, instrument, mfcc in zip(ax.flat, instruments, mfccs):
    axis.set(title=f'{instrument}')
    axis.get_xaxis().set_visible(False)
    librosa.display.specshow(mfcc, x_axis='time', ax=axis)

A classifier class whose number of samples and set of audio features used for classification are configurable.

Internally, it trains a Gaussian Naive Bayes pipeline as the baseline, followed by an SVM pipeline for better performance.

In [None]:
from sklearn.model_selection import cross_validate
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.naive_bayes import GaussianNB

class InstrumentClassifier:
    """Instrument classifier built on time-averaged librosa audio features.

    Rows are sampled from a file -> label DataFrame, every sampled recording
    is reduced to a single feature vector, and two scikit-learn pipelines are
    trained on the result: Gaussian Naive Bayes and an SVM, each preceded by a
    StandardScaler and a 10-component PCA.

    Parameters
    ----------
    n_samples : int
        Number of rows drawn (with replacement) from the DataFrame.
    features : str or sequence of str, optional
        Feature names to extract: 'mfcc' and/or 'melspectrogram'.
        Defaults to ``['mfcc']``. When several names are given, their vectors
        are concatenated in the order listed.

    Raises
    ------
    ValueError
        If `features` is empty or contains an unsupported name.
    """

    # Every supported name has a matching `_extract_<name>` method below.
    _SUPPORTED_FEATURES = ('mfcc', 'melspectrogram')

    def __init__(self, n_samples, features=None):
        # `None` replaces the former mutable default list, which was shared
        # between all instances created without an explicit `features`.
        if features is None:
            features = ['mfcc']
        elif isinstance(features, str):
            features = [features]
        features = list(features)  # private copy; the caller's list is not aliased

        # Validate up front so a typo fails here, not halfway through the
        # (slow) audio extraction.
        unknown = [name for name in features if name not in self._SUPPORTED_FEATURES]
        if not features or unknown:
            raise ValueError(
                f'features must be a non-empty subset of {self._SUPPORTED_FEATURES}, '
                f'got {features}'
            )

        self.n_samples = n_samples
        self.features = features

        GNB = Pipeline([
            ('Standard Scaler', StandardScaler()),
            ('PCA', PCA(n_components = 10)),
            ('Gaussian Naive Bayes', GaussianNB())
        ])
        SVC = Pipeline([
            ('Standard Scaler', StandardScaler()),
            ('PCA', PCA(n_components = 10)),
            ('SVM', svm.SVC(gamma = 'auto'))
        ])
        # pipe_dict maps a position in `pipelines` to the name printed by perf().
        self.pipe_dict = {0: 'GNB', 1: 'SVC'}
        self.pipelines = [GNB, SVC]

    def _extract_mfcc(self, signal, sr):
        """Return the MFCCs of `signal` averaged over time (one value per coefficient)."""
        mfccs = librosa.feature.mfcc(y=signal, sr=sr)
        # Averaging over axis 1 already yields a 1-D vector, so the former
        # hard-coded reshape(20) is not needed.
        return np.mean(mfccs, axis=1)

    def _extract_melspectrogram(self, signal, sr):
        """Return the dB-scaled mel spectrogram of `signal` averaged over time."""
        # `y` is keyword-only in librosa >= 0.10. `sr` must be forwarded too,
        # otherwise librosa falls back to its own default rate instead of the
        # rate the signal was actually loaded at.
        spectrogram = librosa.feature.melspectrogram(y=signal, sr=sr)
        spectrogram = librosa.power_to_db(spectrogram)
        spectrogram = spectrogram.astype(np.float32)
        return np.mean(spectrogram, axis=1)

    def _extract(self, row):
        """Build ``[feature values..., label]`` for one DataFrame row.

        `row` is a pandas Series whose first element is the file name inside
        ``wavfiles/`` and whose second element is the instrument label.
        """
        # .iloc gives explicit positional access; plain row[0] on a
        # label-indexed Series relies on a deprecated fallback.
        signal, sr = librosa.load(f'wavfiles/{row.iloc[0]}', sr=Config.sr)
        # Every requested feature contributes; previously a later feature
        # silently overwrote an earlier one.
        parts = [getattr(self, f'_extract_{name}')(signal, sr) for name in self.features]
        X = np.concatenate(parts)
        # Appending the label to a float array would turn every number into a
        # string; an object array keeps the features numeric.
        return np.append(X.astype(object), row.iloc[1])

    def _extract_X_y(self, df):
        """Sample `n_samples` rows of `df` and return ``(X, y)``.

        X is a float DataFrame with one row per sampled recording; y holds
        the matching labels.
        """
        # NOTE(review): replace=True lets the same row be drawn more than once,
        # so identical recordings can land in both the train and the test split.
        samples = df.sample(n=self.n_samples, replace=True, random_state=Config.random_state)
        data = samples.apply(self._extract, axis=1, result_type='expand')
        X = data.iloc[:, :-1].astype(float)
        y = data.iloc[:, -1]
        return X, y

    def extract_features(self, df):
        """Extract features from `df` and store a 75/25 train/test split on the instance."""
        self.X, self.y = self._extract_X_y(df)
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.25, random_state=Config.random_state
        )

    def fit(self):
        """Fit every pipeline on the training split created by extract_features()."""
        for p in self.pipelines:
            p.fit(self.x_train, self.y_train)

    def perf(self):
        """Print the test-split accuracy and cross-validation scores of every pipeline."""
        print(f'n_samples:{self.n_samples}, features: {self.features}')
        for i, model in enumerate(self.pipelines):
            print('{} Accuracy: {}'.format(self.pipe_dict[i], model.score(self.x_test, self.y_test)))
            print(f'Cross-validation result: {cross_validate(model, self.X, self.y)["test_score"]}')


Testing the classifier with different audio features & sample sizes.

In [None]:
# Run the full sweep: each audio feature is evaluated at each sample size,
# feature by feature, with the sample sizes in ascending order.
experiment_grid = [
    (feature_name, sample_count)
    for feature_name in ['mfcc', 'melspectrogram']
    for sample_count in [128, 256, 512]
]
for feature, samples in experiment_grid:
    classifier = InstrumentClassifier(samples, [feature])
    classifier.extract_features(df)
    classifier.fit()
    classifier.perf()
