In [1]:
import os
import csv
from collections import deque
import time

import librosa

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

#Keras
import keras
from keras import models
from keras import layers
from keras.callbacks import Callback

%matplotlib inline

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Directory holding the training audio clips. Every entry in it is later
# handed to librosa.load, so it should contain audio files only.
#trainpath = '../data/kaggle-train-small/audio/'  # presumably a reduced copy for quick experiments
trainpath = '../data/kaggle-train/audio/'

In [8]:
# List every entry of the training directory. The features computed from this
# list are stored positionally, so `files` must keep the same order whenever
# cached feature arrays are reloaded and joined back to the filenames.
# NOTE(review): os.listdir returns entries in arbitrary order -- confirm the
# order is stable between the extraction run and any later cache reload.
files = os.listdir(trainpath)
#files = files[0:10]  # uncomment to try the pipeline on the first 10 files only

In [None]:
# Extract frame-level librosa features for every training clip, stack them
# into one array per feature type and cache each array on disk.
start = time.time()

# One growing container per feature type; entry i belongs to files[i].
stft, rmse, spec_cent, spec_bw = deque(), deque(), deque(), deque()
rolloff, zcr, mfcc = deque(), deque(), deque()

for idx, f in enumerate(files):
    filename = os.path.join(trainpath, str(f))
    # Mono signal, reading at most the first 120 seconds of the clip.
    y, sr = librosa.load(filename, mono=True, duration=120)

    stft.append(librosa.feature.chroma_stft(y=y, sr=sr))
    rmse.append(librosa.feature.rms(y=y))
    spec_cent.append(librosa.feature.spectral_centroid(y=y, sr=sr))
    spec_bw.append(librosa.feature.spectral_bandwidth(y=y, sr=sr))
    rolloff.append(librosa.feature.spectral_rolloff(y=y, sr=sr))
    zcr.append(librosa.feature.zero_crossing_rate(y))
    mfcc.append(librosa.feature.mfcc(y=y, sr=sr))

    # Lightweight progress report every 10000 files.
    if idx % 10000 == 0:
        print(f'processing file: {idx}')

# Stack the per-file matrices into one array per feature type.
stft = np.array(stft)
rmse = np.array(rmse)
spec_cent = np.array(spec_cent)
spec_bw = np.array(spec_bw)
rolloff = np.array(rolloff)
zcr = np.array(zcr)
mfcc = np.array(mfcc)

# np.save appends ".npy", so the caches end up as "./<name>.numpy.npy".
for cache_name, feature_array in (
    ('stft', stft),
    ('rmse', rmse),
    ('spec_cent', spec_cent),
    ('spec_bw', spec_bw),
    ('rolloff', rolloff),
    ('zcr', zcr),
    ('mfcc', mfcc),
):
    np.save(f'./{cache_name}.numpy', feature_array)

elapsed_minutes = (time.time() - start) / 60
print(f"Extracting the features took {elapsed_minutes}mins")

processing file: 0
processing file: 10000


In [6]:
# Reload the cached per-file feature arrays written by the extraction cell
# (np.save appended ".npy" to the "./<name>.numpy" paths used there), so the
# notebook can be resumed without re-running the slow extraction loop.
stft = np.load('./stft.numpy.npy')
rmse = np.load('./rmse.numpy.npy')
spec_cent = np.load('./spec_cent.numpy.npy')
# spec_bw is saved together with the other features and is needed when the
# training frame is built, but it was never reloaded here; resuming from the
# cache alone therefore failed with a NameError.
spec_bw = np.load('./spec_bw.numpy.npy')
rolloff = np.load('./rolloff.numpy.npy')
zcr = np.load('./zcr.numpy.npy')
mfcc = np.load('./mfcc.numpy.npy')

In [10]:
# Build the training table: one row per file, with labels parsed from the
# filename and the mean / standard deviation of every extracted feature.
df_train = pd.DataFrame(files, columns=['filename'])

# Filenames look like "<instrument>_<audiotype>_<...>.wav".
df_train['audiotype'] = df_train['filename'].str.extract(pat = '_([a-z]*)_')
df_train['instrument'] = df_train['filename'].str.extract(pat = '^([a-z]*)_')

# Collapse each per-file feature matrix (axes 1 and 2) to two scalars.
for feature_name, feature_array in (
    ('chroma_stft', stft),
    ('rmse', rmse),
    ('spec_cent', spec_cent),
    ('spec_bw', spec_bw),
    ('rolloff', rolloff),
    ('zcr', zcr),
):
    df_train[f'{feature_name}_mean'] = np.mean(feature_array, axis=(1,2))
    df_train[f'{feature_name}_std'] = np.std(feature_array, axis=(1,2))

# The first 20 MFCC coefficients are summarised one coefficient at a time.
for j in range(20):
    coefficient_frames = mfcc[:,j,:]
    df_train[f' mfcc_mean{j}'] = np.mean(coefficient_frames, axis=1)
    df_train[f' mfcc_std{j}'] = np.std(coefficient_frames, axis=1)

# Discard rows with any missing value, then cache the table on disk.
df_train.dropna(inplace=True)
df_train.to_pickle('./df_train.pkl')

In [11]:
# Reload the feature table saved by the previous cell so the notebook can be
# resumed from this point, then preview the first 20 rows.
# NOTE(review): pd.read_pickle can execute arbitrary code; only load pickle
# files that were produced locally.
df_train = pd.read_pickle('./df_train.pkl')
df_train.head(20)

Unnamed: 0,filename,audiotype,instrument,chroma_stft_mean,chroma_stft_std,rmse_mean,rmse_std,spec_cent_mean,spec_cent_std,rolloff_mean,...,mfcc_mean15,mfcc_std15,mfcc_mean16,mfcc_std16,mfcc_mean17,mfcc_std17,mfcc_mean18,mfcc_std18,mfcc_mean19,mfcc_std19
0,keyboard_acoustic_010-064-050.wav,acoustic,keyboard,0.141385,0.289322,0.061293,0.088629,732.297437,911.230067,1171.256887,...,-7.161865,5.648172,-7.185595,8.262616,-4.402485,8.154543,3.564694,5.718117,15.429117,7.454765
1,organ_electronic_048-084-127.wav,electronic,organ,0.153087,0.302638,0.269546,0.15264,2285.527728,1254.141576,3235.83053,...,-2.765366,3.358678,-7.692285,6.340927,-12.007803,7.286099,-11.306641,6.808135,32.281609,18.238966
2,bass_synthetic_038-093-050.wav,synthetic,bass,0.187979,0.264302,0.008968,0.027033,2363.67603,1054.728094,5664.103708,...,9.904331,5.561808,3.634877,3.355129,-4.532478,4.726028,26.104923,15.218951,-13.752032,9.75382
3,brass_acoustic_014-067-100.wav,acoustic,brass,0.228951,0.305377,0.082316,0.04883,2186.9836,778.104957,3884.627185,...,-6.686733,5.794637,33.675407,15.761654,34.699574,14.006486,42.621082,18.076965,18.158424,7.996955
4,bass_synthetic_123-070-075.wav,synthetic,bass,0.031193,0.148407,0.013941,0.069042,308.126005,883.964268,520.033079,...,2.408881,7.501947,1.651202,4.91642,0.563238,2.538653,0.019563,1.103113,-0.736997,2.378023
5,organ_electronic_065-061-050.wav,electronic,organ,0.116215,0.269425,0.320845,0.201841,387.462269,582.655839,359.156402,...,0.65933,4.413772,3.569622,5.520141,-11.473527,6.826658,-10.947213,7.071216,1.169024,3.583707
6,guitar_acoustic_011-079-025.wav,acoustic,guitar,0.100189,0.251751,0.009332,0.029059,1590.736313,1355.110149,2676.029483,...,2.290287,4.052349,2.318639,3.328717,6.36852,5.995106,12.453623,15.721991,4.995918,8.226757
7,string_acoustic_010-040-127.wav,acoustic,string,0.220066,0.308069,0.015233,0.044543,471.090225,598.985427,799.653546,...,-3.003569,3.476644,-2.097698,2.789591,-1.75809,3.238113,-1.397056,2.802639,-1.792941,3.074017
8,keyboard_electronic_004-074-025.wav,electronic,keyboard,0.050198,0.193557,0.029618,0.090987,277.892332,384.914199,297.792997,...,-2.336124,3.447099,-5.185911,7.62715,-6.43052,9.426462,-5.752944,8.43596,-3.430038,5.121261
9,mallet_acoustic_075-084-025.wav,acoustic,mallet,0.346899,0.325698,0.02363,0.052565,1143.625495,198.873128,2357.512334,...,4.408544,5.452908,5.958602,7.441835,4.998078,4.751276,0.980078,2.564556,2.709939,6.365163


In [12]:
# Keep the label column and the numeric features; filename and audiotype are
# left out of the modelling frame.
data = df_train.drop(columns=['filename', 'audiotype'], errors='ignore')

In [13]:
# Standardise the feature columns with a StandardScaler fitted on the
# training table.
# NOTE(review): `data` starts with [instrument, chroma_stft_mean, ...], so the
# `2:` slice skips chroma_stft_mean as well as the label column. The test-set
# scaling cell slices its frame with `1:` and drops the same feature, so the
# two slices must be changed together if this is corrected.
scaler = StandardScaler()
feature_values = np.asarray(data.iloc[:, 2:], dtype=float)
X = scaler.fit_transform(feature_values)
print(X[0])

[ 0.21044656 -0.60180856 -0.19662271 -0.35474175  0.5491243  -0.38455103
  0.98335249 -0.32061836  0.18596393 -0.26692576 -0.77711603 -0.11110348
  0.35818454  0.68047229 -0.44001309 -0.1844341  -0.6764978   0.31683199
 -0.44001361 -0.0575284  -0.81213866  0.00129278 -0.85820828 -0.48904938
 -0.92589354 -0.55287598 -0.48359531 -1.00704589 -0.23179344 -1.08202351
 -0.12493322 -0.82851176 -0.2288695  -0.95016844 -0.30094444 -0.66864483
 -0.50868495 -0.71498318 -0.59786263 -0.49599124 -0.10645992 -0.73605492
  0.68370968 -0.31786875  0.54757835  0.33145075 -0.180274    1.16661129
  0.1495536 ]


In [14]:
# Target vector: map each instrument name to an integer class id.
encoder = LabelEncoder()
encoder.fit(data['instrument'])
y = encoder.transform(data['instrument'])
print(y[:5])

[4 6 0 1 0]


In [15]:
# Split the dataset into train (80%) and test (20%) subsets.
# A fixed random_state makes the split, and every score derived from it,
# reproducible across kernel restarts; stratify=y keeps the instrument class
# proportions equal in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [16]:
# Sanity check: (number of training rows, number of feature columns).
X_train.shape

(215820, 49)

In [None]:
# Adapted from:
# https://medium.com/@thongonary/how-to-compute-f1-score-for-each-epoch-in-keras-a1acd17715a2

class Metrics(Callback):
    """Report validation F1, precision and recall at the end of every epoch.

    Parameters
    ----------
    validation_data : tuple, optional
        ``(inputs, targets)`` to score on. When omitted, the callback falls
        back to a ``validation_data`` attribute on itself or on the model;
        whether one of those exists depends on the installed Keras version.
    average : str, optional
        Averaging mode forwarded to the scikit-learn scorers. Defaults to
        ``'macro'`` because the scorers' own default (``'binary'``) raises
        for multiclass targets. Pass ``'binary'`` for a two-class problem.

    Attributes
    ----------
    val_f1s, val_recalls, val_precisions : list of float
        One entry per finished epoch; reset when training begins.
    """

    def __init__(self, validation_data=None, average='macro'):
        super().__init__()
        # Stored under a private name so Keras versions that assign
        # `validation_data` on callbacks do not overwrite it.
        self._val_data = validation_data
        self.average = average
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_train_begin(self, logs=None):
        """Reset the per-epoch score history."""
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    @staticmethod
    def _to_labels(values):
        """Turn model outputs or targets into a flat vector of class ids."""
        values = np.asarray(values)
        if values.ndim > 1 and values.shape[-1] > 1:
            # Softmax scores or one-hot targets: pick the strongest class.
            return values.argmax(axis=-1)
        # Single-column output (e.g. sigmoid) or plain label vector.
        return values.round().astype(int).ravel()

    def _resolve_validation_data(self):
        """Return ``(inputs, targets)`` from the first available source."""
        candidates = (
            self._val_data,
            getattr(self, 'validation_data', None),
            getattr(self.model, 'validation_data', None),
        )
        for candidate in candidates:
            if candidate is not None:
                return candidate[0], candidate[1]
        raise RuntimeError(
            'Metrics callback has no validation data; '
            'construct it as Metrics(validation_data=(X_val, y_val)).'
        )

    def on_epoch_end(self, epoch, logs=None):
        """Score the validation set and append the results to the history."""
        val_inputs, val_targets = self._resolve_validation_data()
        val_predict = self._to_labels(self.model.predict(val_inputs))
        val_targ = self._to_labels(val_targets)
        _val_f1 = f1_score(val_targ, val_predict, average=self.average)
        _val_recall = recall_score(val_targ, val_predict, average=self.average)
        _val_precision = precision_score(val_targ, val_predict, average=self.average)
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print('— val_f1: {} — val_precision: {} — val_recall {}'.format(_val_f1, _val_precision, _val_recall))
        return

# Hand the hold-out split to the callback explicitly so it does not depend on
# Keras exposing the validation data on the model.
metrics = Metrics(validation_data=(X_test, y_test))

In [17]:
# Build, train and evaluate a fully connected classifier on the standardised
# features.

# Size the output layer from the label encoder instead of hard-coding 10, so
# the network always has exactly one unit per encoded instrument class.
n_classes = len(encoder.classes_)

model = models.Sequential()
model.add(layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(n_classes, activation='softmax'))
# Sparse loss: the targets are integer class ids, not one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_train,
                    y_train,
                    validation_data=(X_test,y_test),
                    epochs=20,
                    batch_size=128,
                    callbacks=[metrics])

# calculate accuracy
# NOTE(review): the same X_test/y_test split is used for validation during
# training and for this final evaluation, so test_acc does not come from an
# untouched hold-out set.
test_loss, test_acc = model.evaluate(X_test,y_test)
print('test_acc: ',test_acc)

# predictions: one row of class scores per test sample
predictions = model.predict(X_test)
#np.argmax(predictions[0])  # class id predicted for the first test sample

W0813 23:12:43.329102 140106746386240 deprecation_wrapper.py:119] From /home/rsouza/Documents/envs/python_env/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0813 23:12:43.346997 140106746386240 deprecation_wrapper.py:119] From /home/rsouza/Documents/envs/python_env/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0813 23:12:43.351892 140106746386240 deprecation_wrapper.py:119] From /home/rsouza/Documents/envs/python_env/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0813 23:12:43.387904 140106746386240 deprecation_wrapper.py:119] From /home/rsouza/Documents/envs/python_env/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprec

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
test_acc:  0.9545184965527467


3

In [None]:
# Directory holding the test clips that are scored for the submission file.
testpath = '../data/kaggle-test/audio/'

In [None]:
# Extract the same frame-level librosa features for every test clip.
start = time.time()

# NOTE: this rebinds `files` (and the feature names below) from the training
# data to the test data; the training versions are gone after this cell runs.
files = os.listdir(testpath)
stft = []
rmse = []
spec_cent = []
spec_bw = []
rolloff = []
zcr = []
mfcc = []

for f in files:
    filename = os.path.join(testpath, str(f))
    # The signal is bound to `signal` rather than `y`: `y` holds the encoded
    # training labels and must survive this cell so earlier cells can be
    # re-run safely.
    signal, sr = librosa.load(filename, mono=True, duration=120)
    stft.append(librosa.feature.chroma_stft(y=signal, sr=sr))
    rmse.append(librosa.feature.rms(y=signal))
    spec_cent.append(librosa.feature.spectral_centroid(y=signal, sr=sr))
    spec_bw.append(librosa.feature.spectral_bandwidth(y=signal, sr=sr))
    rolloff.append(librosa.feature.spectral_rolloff(y=signal, sr=sr))
    zcr.append(librosa.feature.zero_crossing_rate(signal))
    mfcc.append(librosa.feature.mfcc(y=signal, sr=sr))

# Stack the per-file matrices into one array per feature type.
stft = np.array(stft)
rmse = np.array(rmse)
spec_cent = np.array(spec_cent)
# spec_bw was collected but never converted, leaving it a plain list while
# every other feature (and the training pipeline) uses an ndarray.
spec_bw = np.array(spec_bw)
rolloff = np.array(rolloff)
zcr = np.array(zcr)
mfcc = np.array(mfcc)

print("Extracting the features took " + str((time.time()-start)/60) + "mins")

In [None]:
# Build the test table: one row per test file, keyed by the numeric id parsed
# from the filename, with the mean / standard deviation of every feature.
df_predict = pd.DataFrame(files, columns=['id'])
# Replace the filename by the digit run that precedes "wav", as a number.
df_predict['id'] = df_predict['id'].str.extract(pat = '([0-9]*).wav')
df_predict['id'] = pd.to_numeric(df_predict['id'])
# Test filenames carry no label, hence no instrument column:
#df_predict['instrument'] = df_predict['filename'].str.extract(pat = '^([a-z]*)_')

# Collapse each per-file feature matrix (axes 1 and 2) to two scalars.
for feature_name, feature_array in (
    ('chroma_stft', stft),
    ('rmse', rmse),
    ('spec_cent', spec_cent),
    ('spec_bw', spec_bw),
    ('rolloff', rolloff),
    ('zcr', zcr),
):
    df_predict[f'{feature_name}_mean'] = np.mean(feature_array, axis=(1,2))
    df_predict[f'{feature_name}_std'] = np.std(feature_array, axis=(1,2))

# The first 20 MFCC coefficients are summarised one coefficient at a time.
for j in range(20):
    coefficient_frames = mfcc[:,j,:]
    df_predict[f' mfcc_mean{j}'] = np.mean(coefficient_frames, axis=1)
    df_predict[f' mfcc_std{j}'] = np.std(coefficient_frames, axis=1)

# Order the rows by id so later row positions follow the id order.
df_predict.sort_values(by='id', inplace=True)
df_predict.head()

In [None]:
# Column dtypes and non-null counts of the test table (reveals missing values).
df_predict.info()

In [None]:
# Drop incomplete rows, then scale the test features exactly as the training
# features were scaled.
df_predict.dropna(inplace=True)
testdata = df_predict[[c for c in df_predict.columns if c not in ['id']]]
# transform, not fit_transform: re-fitting here would replace the training
# mean/variance with test-set statistics, so the model would receive inputs
# on a different scale than it was trained on (and the fitted scaler would be
# overwritten as a side effect).
# NOTE(review): iloc[:, 1:] skips chroma_stft_mean, mirroring the column slice
# used when the scaler was fitted; keep the two slices in sync.
# NOTE: this rebinds `X` from the training matrix to the test matrix.
X = scaler.transform(np.array(testdata.iloc[:,1:], dtype = float))
X.shape

In [None]:
# Model output for the scaled test features: one row of class scores per row of X.
familyPredictions = model.predict(X)

In [None]:
# Write the submission file: one row per test clip with its predicted
# instrument family.
#
# The loop used to reference `testDataset` and `familyPredictionStrings`,
# neither of which is defined anywhere in this notebook, so the cell failed
# with a NameError. The label strings are now derived from the model output:
# argmax picks the highest-scoring class per row and the label encoder maps
# the class id back to the instrument name.
# NOTE(review): confirm the competition expects instrument names rather than
# integer class ids in the "Predicted" column.
familyPredictionStrings = encoder.inverse_transform(np.argmax(familyPredictions, axis=1))

with open('SVM-time-submission.csv', 'w', newline='') as writeFile:
    fieldnames = ['Id', 'Predicted']
    writer = csv.DictWriter(writeFile, fieldnames=fieldnames, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    writer.writeheader()
    # As before, Id is the row position (rows are in ascending id order). This
    # equals the file id only if ids are contiguous from 0 and no row was
    # dropped for missing values.
    for index, label in enumerate(familyPredictionStrings):
        writer.writerow({'Id': index, 'Predicted': label})