First, set up the code needed to load and visualize a sound waveform.

In [None]:
%pylab inline
import IPython.display as ipd
import librosa
from librosa import load, display
import glob
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
tqdm.pandas()
import pickle
from common import save_as_pickle
# import matplotlib.pyplot as plt

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Input/output locations for this run. Change them to match where the files
# live on your system. The directory-style values keep a trailing "/" because
# they are joined to file names by plain string concatenation further down.
PATH_TO_TRAIN_LABELS = "data/train/train.csv"  # CSV read into `train`
PATH_TO_TEST_LABELS = "data/test/test.csv"  # CSV read into `test`
PATH_TO_TRAIN_AUDIO_FILES = "data/train/wav/"  # prefix for <ID>.wav of training rows
PATH_TO_TEST_AUDIO_FILES = "data/test/wav/"  # prefix for <ID>.wav of test rows
PATH_TO_SUBMISSION = "submission/"  # prefix of the predictions CSV
PATH_TO_PICKLE = "pickles/"  # prefix of the file names handed to save_as_pickle
SUBMISSION_TITLE = "nn chroma"  # base name shared by the pickles and the submission CSV

In [None]:
# It is easier to deal with a CSV file if you load it into a structure you can work with.
# pandas is the most convenient way to do that: it ships with built-in
# functionality for reading CSV files.

# pandas assumes that the first row in your file is the header and not actual values.
# This behavior can be overridden by passing header=None as a parameter.
train = pd.read_csv(PATH_TO_TRAIN_LABELS)
test = pd.read_csv(PATH_TO_TEST_LABELS)

In [None]:
# You can reactivate this cell to make sure your model is working correctly in terms of dimensions.
#train = train[:2]
#test = test[:2]

In [None]:
# Module-level bookkeeping for audio files that could not be parsed.
# train_parser / test_parser update these through `global`: the counters are
# incremented and the offending row IDs are appended to the lists.
train_error_count = 0
train_error_labels = []
test_error_count = 0
test_error_labels = []

In [None]:
# To start with classification, we first need to convert the wav sound files into a format we can work 
# with. Here each clip is summarised by its chroma features (computed from the magnitude of its STFT), 
# averaged over time, which gives one fixed-length numeric feature vector per file.
def train_parser(row):
    """Build the (feature vector, label) pair for one row of the training labels.

    The audio file ``PATH_TO_TRAIN_AUDIO_FILES + str(row.ID) + ".wav"`` is
    loaded with librosa, its magnitude STFT is turned into chroma features,
    and those are averaged into a single vector.

    Parameters
    ----------
    row : object exposing ``ID`` and ``Class`` attributes
        One row of the training labels frame, as passed by ``apply(axis=1)``.

    Returns
    -------
    pandas.Series
        Two elements: the feature vector at position 0 and ``row.Class`` at
        position 1. A Series is returned on both the success and the failure
        path so that ``DataFrame.apply(..., axis=1)`` always expands the result
        into a two-column frame, instead of relying on how a particular pandas
        version treats bare lists or tuples.

    Side effects
    ------------
    On any exception the error is printed, ``train_error_count`` is
    incremented, ``row.ID`` is appended to ``train_error_labels`` and an
    all-zero vector of length 12 is used as the feature.
    """
    global train_error_count, train_error_labels
    file_path = PATH_TO_TRAIN_AUDIO_FILES + str(row.ID) + ".wav"
    try:
        data, sampling_rate = librosa.load(file_path)
        stft = np.abs(librosa.stft(data))
        # Transpose + mean over axis 0 averages over the second axis of the
        # chroma matrix (the frame axis according to the librosa docs),
        # leaving one value per chroma bin.
        chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sampling_rate).T, axis=0)
    except Exception as ex:
        # Best effort: keep the row with a zero vector so the frame stays
        # aligned with the labels, but record which file failed and why.
        print("Could not parse %s: %s" % (file_path, ex))
        train_error_count += 1
        train_error_labels.append(row.ID)
        return pd.Series([np.zeros(12), row.Class])
    return pd.Series([chroma, row.Class])

In [None]:
# To create the training feature matrix, we can apply our parser to each training sample.
# The error bookkeeping is reset first so that re-running this cell reports only
# the failures of this pass instead of accumulating across runs.
train_error_count = 0
train_error_labels = []
train_features = train.progress_apply(train_parser,axis=1)
print("%d samples had errors while parsing" % train_error_count)
print("Errorneous samples", train_error_labels)
# Cache the parsed features so the slow audio pass does not have to be repeated.
save_as_pickle(data=train_features,pickle_file=PATH_TO_PICKLE + SUBMISSION_TITLE + " train.pickle" )

In [None]:
# Renaming the columns to signify what they mean helps with documentation,
# and also helps you keep track of them later on.
train_features.columns = ['feature','label']
# train_features.head()

In [None]:
# LabelEncoder converts string labels into integer codes that are easy to handle.
from sklearn.preprocessing import LabelEncoder
# to_categorical is imported from the public keras.utils namespace; the
# keras.utils.np_utils module path is not available in newer Keras releases.
from keras.utils import to_categorical

In [None]:
lb = LabelEncoder()
# Stack the per-sample feature vectors into one 2-D array.
X = np.array(train_features["feature"].tolist())
# Since labels are categories, they don't inherently have an order amongst themselves.
# For example, "apples > oranges" does not make any sense. So, to model such categorical
# variables, the labels are first encoded as integers and then expanded to one-hot vectors.
Y = to_categorical(lb.fit_transform(np.array(train_features["label"].tolist())))

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

In [None]:
# Width of the one-hot label matrix; used as the size of the output layer.
number_of_labels = Y.shape[1]
# NOTE(review): filter_size is not referenced anywhere else in the visible
# notebook - confirm whether it is still needed.
filter_size = 2

In [None]:
# Start from an empty Sequential model; the following cells append its layers in order.
model = Sequential()

In [None]:
# First hidden block: 256 units, ReLU, then dropout with rate 0.5.
# The input width is taken from the training matrix instead of being
# hard-coded, so the model follows the feature vector if its length changes.
model.add(Dense(256, input_shape=(X.shape[1],)))
model.add(Activation('relu'))
model.add(Dropout(0.5))

In [None]:
# Second hidden block: 256 units, ReLU, then dropout with rate 0.5.
# Each call appends to `model`, so run this cell only once per fresh model.
model.add(Dense(256))
model.add(Activation('relu'))
model.add(Dropout(0.5))

In [None]:
# Output block: one unit per class, with softmax turning the outputs into
# a distribution over the classes.
model.add(Dense(number_of_labels))
model.add(Activation('softmax'))

In [None]:
# Print the layer-by-layer overview of the assembled model as a sanity check.
model.summary()

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy on the
# one-hot labels, and accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Train on the full training matrix for 50 epochs in mini-batches of 32.
model.fit(x=X, y=Y, epochs=50, batch_size=32)

In [None]:
def test_parser(row):
    """Build the feature vector for one row of the test labels.

    The audio file ``PATH_TO_TEST_AUDIO_FILES + str(row.ID) + ".wav"`` is
    loaded with librosa, its magnitude STFT is turned into chroma features,
    and those are averaged into a single vector.

    Parameters
    ----------
    row : object exposing an ``ID`` attribute
        One row of the test labels frame, as passed by ``apply(axis=1)``.

    Returns
    -------
    pandas.Series
        The feature vector, one element per feature, so that
        ``DataFrame.apply(..., axis=1)`` expands it into one column per feature.

    Side effects
    ------------
    On any exception the error is printed (matching ``train_parser``, instead
    of being dropped silently), ``test_error_count`` is incremented,
    ``row.ID`` is appended to ``test_error_labels`` and an all-zero vector of
    length 12 is returned.
    """
    global test_error_count, test_error_labels
    file_path = PATH_TO_TEST_AUDIO_FILES + str(row.ID) + ".wav"
    try:
        data, sampling_rate = librosa.load(file_path)
        stft = np.abs(librosa.stft(data))
        # Transpose + mean over axis 0 averages over the second axis of the
        # chroma matrix (the frame axis according to the librosa docs),
        # leaving one value per chroma bin.
        chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sampling_rate).T, axis=0)
    except Exception as ex:
        # Best effort: keep the row with a zero vector so predictions stay
        # aligned with the test frame, but record which file failed and why.
        print("Could not parse %s: %s" % (file_path, ex))
        test_error_count += 1
        test_error_labels.append(row.ID)
        return pd.Series(np.zeros(12))
    return pd.Series(chroma)

In [None]:
# Reset the error bookkeeping so that re-running this cell reports only the
# failures of this pass instead of accumulating across runs.
test_error_count = 0
test_error_labels = []
# test_parser returns a Series per row, so apply() already expands the result
# into one column per feature. The old `reduce` keyword is not passed: it was
# removed from DataFrame.apply in pandas 1.0 and raises TypeError there.
test_features = test.progress_apply(test_parser, axis=1)
print("%d samples had errors while parsing" % test_error_count)
print("Errorneous samples", test_error_labels)
# Cache the parsed features so the slow audio pass does not have to be repeated.
save_as_pickle(data=test_features,pickle_file=PATH_TO_PICKLE + SUBMISSION_TITLE + " test.pickle" )

In [None]:
# The per-file test feature frame is used directly as the prediction input.
X_test = test_features
# X_test

In [None]:
# Run the trained model on the test features; the output layer is a softmax,
# so each row of the result holds one score per class.
test_labels = model.predict(X_test, batch_size=32)

In [None]:
# Pick the highest-scoring class index per row and map it back to the original
# string label with the encoder that was fitted on the training labels.
test_labels_strings = lb.inverse_transform(test_labels.argmax(axis=1))
# test_labels_strings

In [None]:
# Attach the predicted labels to the test frame as the 'Class' column
# (this modifies `test` in place).
test['Class'] = test_labels_strings

In [None]:
# Write the test frame, now including the predicted 'Class' column, as the
# submission file. `index` is a boolean option, so pass False explicitly
# rather than relying on None being falsy; the row index is not written.
test.to_csv(PATH_TO_SUBMISSION + SUBMISSION_TITLE + ".csv", index=False)

This approach gives 56% accuracy with the above setup.