First, set up the code needed to load and visualize a sound waveform.

In [1]:
%pylab inline
import IPython.display as ipd
import librosa
from librosa import load, display
import glob
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
tqdm.pandas()
import pickle
from common import save_as_pickle
# import matplotlib.pyplot as plt

Populating the interactive namespace from numpy and matplotlib


In [2]:
import pandas as pd
import numpy as np

In [3]:
# You should change these paths according to the path of the files on your system.
# CSV files listing the samples (the parsers read the ID column; training also reads Class).
PATH_TO_TRAIN_LABELS = "data/train/train.csv"
PATH_TO_TEST_LABELS = "data/test/test.csv"
# Audio directories. The parsers build file names as <dir> + str(ID) + ".wav"
# by plain string concatenation, so keep the trailing slash.
PATH_TO_TRAIN_AUDIO_FILES = "data/train/wav/"
PATH_TO_TEST_AUDIO_FILES = "data/test/wav/"
# Output locations; also used as plain string prefixes, so keep the trailing slash.
PATH_TO_SUBMISSION = "submission/"
PATH_TO_PICKLE = "pickles/"
# Used in the names of the pickle files and of the submission CSV.
SUBMISSION_TITLE = "nn tonnetz"

In [4]:
# It is easier to deal with a csv file if you load it into a structure you can work with.
# Pandas is the most convenient way to do that and comes with
# built-in functionality to handle csv files.

# Pandas assumes that the first row in your file is the header and not actual values.
# This behavior can be overridden by passing header=None as a parameter.
train = pd.read_csv(PATH_TO_TRAIN_LABELS)
test = pd.read_csv(PATH_TO_TEST_LABELS)

In [5]:
# You can reactivate this cell to make sure your model is working correctly in terms of dimensions.
#train = train[:2]
#test = test[:2]

In [6]:
# Module-level bookkeeping updated by the parsers: how many samples failed
# to parse, and the IDs of those samples, kept separately for train and test.
train_error_count = test_error_count = 0
train_error_labels, test_error_labels = [], []

In [7]:
# To start with classification, we first need to convert the wav sound files into a format we can work
# with. Here each clip is summarised by the tonnetz features of its harmonic component,
# averaged over time, which gives a fixed-length (6-value) numeric feature vector.
def train_parser(row):
    """Extract the tonnetz feature vector and the class label for one training row.

    Loads ``PATH_TO_TRAIN_AUDIO_FILES + str(row.ID) + ".wav"`` with librosa,
    computes the tonnetz features of the harmonic component of the signal and
    averages them over the frame axis.

    Parameters
    ----------
    row
        Object exposing ``ID`` and ``Class`` attributes (a row of ``train``
        when called through ``apply(..., axis=1)``).

    Returns
    -------
    list
        ``[features, label]`` where ``label`` is ``row.Class``. If loading or
        feature extraction raises, the exception is printed, the failure is
        recorded in the module-level ``train_error_count`` /
        ``train_error_labels`` and ``features`` is a zero vector of length 6.
    """
    global train_error_count
    global train_error_labels
    file_path = PATH_TO_TRAIN_AUDIO_FILES + str(row.ID) + ".wav"
    try:
        data, sampling_rate = librosa.load(file_path)
        harmonic = librosa.effects.harmonic(data)
        # tonnetz(...) is transposed so that axis 0 is the axis being averaged away.
        tonnetz = np.mean(librosa.feature.tonnetz(y=harmonic, sr=sampling_rate).T, axis=0)
    except Exception as ex:
        print(ex)
        train_error_count += 1
        train_error_labels.append(row.ID)
        # Return the same [features, label] list shape as the success path
        # (previously a bare tuple holding a plain Python list) so every row
        # handed back to apply() has one consistent type.
        # NOTE(review): failed samples still enter training with all-zero
        # features; consider dropping the IDs in train_error_labels instead.
        return [np.zeros(6), row.Class]
    return [tonnetz, row.Class]

In [8]:
# Build the training feature table by running the parser over every row of `train`.
train_features = train.progress_apply(train_parser, axis=1)
print("%d samples had errors while parsing" % train_error_count)
print("Errorneous samples", train_error_labels)
# Keep a copy of the extracted features on disk.
train_pickle_file = PATH_TO_PICKLE + SUBMISSION_TITLE + " train.pickle"
save_as_pickle(data=train_features, pickle_file=train_pickle_file)

HBox(children=(IntProgress(value=0, max=5435), HTML(value='')))

  if np.any(X < 0) or np.any(X_ref < 0):
  bad_idx = (Z < np.finfo(dtype).tiny)


Audio buffer is not finite everywhere
Audio buffer is not finite everywhere
Audio buffer is not finite everywhere

3 samples had errors while parsing
Errorneous samples [300, 1182, 1488]


In [9]:
# Renaming the columns to signify what they mean helps with documentation,
# and also helps you keep track of them later on.
train_features.columns = ['feature','label']
# train_features.head()

In [10]:
# this library helps us convert string labels into easy to handle encoded labels.
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [11]:
# Collect the per-sample feature vectors into one array.
X = np.array(train_features.feature.tolist())
lb = LabelEncoder()
# Since labels are categories, they don't inherently have an order amongst themselves.
# For example, apples > oranges does not make any sense. So to model such categorical
# variables, we encode them and then convert the codes to one-hot vectors.
encoded_labels = lb.fit_transform(np.array(train_features.label.tolist()))
Y = to_categorical(encoded_labels)

In [12]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

In [13]:
# Number of columns of the one-hot label matrix; used as the size of the output layer.
number_of_labels = Y.shape[1]
# NOTE(review): filter_size is not referenced by any other cell shown here - confirm before removing.
filter_size = 2

In [14]:
# Start from an empty Sequential model; the layers are added in the cells below.
model = Sequential()

In [15]:
# First hidden block: 256-unit dense layer + ReLU + dropout with rate 0.5.
# input_shape=(6,) matches the length-6 feature vector produced by the parsers.
model.add(Dense(256, input_shape=(6,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))

In [16]:
# Second hidden block, same layout as the first: dense(256) + ReLU + dropout with rate 0.5.
model.add(Dense(256))
model.add(Activation('relu'))
model.add(Dropout(0.5))

In [17]:
# Output layer: one unit per label column of Y, with a softmax activation.
model.add(Dense(number_of_labels))
model.add(Activation('softmax'))

In [18]:
# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               1792      
_________________________________________________________________
activation_1 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               65792     
_________________________________________________________________
activation_2 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                2570      
__________

In [19]:
# Categorical cross-entropy pairs with the one-hot labels in Y and the softmax output layer.
model.compile(loss='categorical_crossentropy', metrics = ['accuracy'], optimizer='adam')


In [20]:
# Train on all of X/Y for 50 epochs. No validation data is passed here,
# so the metrics reported during fitting are computed on the training data.
model.fit(X,Y, batch_size=32, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50


Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50


Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50


Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50


Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50


Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50




<keras.callbacks.History at 0x1dad082dd68>

In [21]:
def test_parser(row):
    """Extract the tonnetz feature vector for one test row.

    Loads ``PATH_TO_TEST_AUDIO_FILES + str(row.ID) + ".wav"`` with librosa,
    computes the tonnetz features of the harmonic component of the signal and
    averages them over the frame axis.

    Parameters
    ----------
    row
        Object exposing an ``ID`` attribute (a row of ``test`` when called
        through ``apply(..., axis=1)``).

    Returns
    -------
    pandas.Series
        The feature values, one per entry. If loading or feature extraction
        raises, the exception is printed, the failure is recorded in the
        module-level ``test_error_count`` / ``test_error_labels`` and a
        Series of six zeros is returned.
    """
    global test_error_count
    global test_error_labels
    file_path = PATH_TO_TEST_AUDIO_FILES + str(row.ID) + ".wav"
    try:
        data, sampling_rate = librosa.load(file_path)
        # The unused STFT computation was removed: its result was never read,
        # so it only cost time and added another place for a sample to fail.
        harmonic = librosa.effects.harmonic(data)
        # tonnetz(...) is transposed so that axis 0 is the axis being averaged away.
        tonnetz = np.mean(librosa.feature.tonnetz(y=harmonic, sr=sampling_rate).T, axis=0)
    except Exception as ex:
        # Report the failure instead of swallowing it silently, as train_parser does.
        print(ex)
        test_error_count += 1
        test_error_labels.append(row.ID)
        return pd.Series([0] * 6)
    return pd.Series(tonnetz)

In [22]:
# Build the test feature table by running the parser over every row of `test`.
# test_parser returns a Series per row, so apply() assembles a DataFrame.
# The `reduce` keyword was dropped: it is deprecated/removed in newer pandas
# and is not needed for a non-empty frame.
test_features = test.progress_apply(test_parser, axis=1)
print("%d samples had errors while parsing" % test_error_count)
print("Errorneous samples", test_error_labels)
# Persist the *test* features; this call previously pickled train_features
# under the " test.pickle" name.
save_as_pickle(data=test_features, pickle_file=PATH_TO_PICKLE + SUBMISSION_TITLE + " test.pickle")

HBox(children=(IntProgress(value=0, max=3297), HTML(value='')))




0 samples had errors while parsing
Errorneous samples []


In [23]:
# The test feature table is used directly as the model input (no copy is made).
X_test = test_features
# X_test

In [24]:
# One row of softmax outputs per test sample; reduced to a class index with argmax in the next cell.
test_labels = model.predict(X_test, batch_size=32)

In [25]:
# Take the highest-scoring class index per sample and map it back to the
# original label values with the encoder that was fitted on the training labels.
test_labels_strings = lb.inverse_transform(test_labels.argmax(axis=1))
# test_labels_strings

  if diff:


In [26]:
# Attach the predicted labels to the test table (this modifies `test` in place).
test['Class'] = test_labels_strings

In [27]:
# Write the submission file without the DataFrame index column.
# `index` is documented as a boolean, so pass False explicitly instead of
# relying on None being falsy.
test.to_csv(PATH_TO_SUBMISSION + SUBMISSION_TITLE + ".csv", index=False)

This approach gives 56% accuracy with the above setup.