# Training a NN with Urban Sound Challenge Data

**Background:**

In parallel to the actuall fall detection experiment, we'll train a neural network leveraging the urban challenge dataset in order to develop the necessary libraries to map wav files to features (either MFCC or spectogram) and baseline the performance of the different approaches)

## Libraries and File Locations

In [17]:
import matplotlib.pyplot as plt
from python_speech_features import mfcc, ssc, logfbank
import pandas as pd
import os
import numpy as np
import soundfile
from scipy import signal

In [18]:
URBAN_SOUND_DIR = "/media/romulo/ROMULO'S/urban_sound_challenge/"

## Load sample WAV File

Test that features can be generated in a couple of ways

* MFCC
* Spectrograms

In [19]:
def log_specgram(audio, sample_rate, window_size=10, 
                 step_size=10, eps=1e-10):
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    _, _, spec = signal.spectrogram(audio, fs=sample_rate,
                                    window='hann', nperseg=nperseg, noverlap=noverlap,
                                    detrend=False)
    return np.log(spec.T.astype(np.float32) + eps)


**MFCCs**

In [20]:
samples, sample_rate = soundfile.read(URBAN_SOUND_DIR+"Train/5132.wav")

In [21]:
mfcc_feat = mfcc(samples,sample_rate,nfft=1200)

In [22]:
mfcc_feat.shape

(68, 13)

**Spectrogram**

In [23]:
samples.shape

(16573, 2)

In [24]:
f, t, spectrogram= signal.spectrogram(samples,fs=sample_rate)

In [25]:
spectrogram.shape

(16573, 2, 1)

In [26]:
log_spectrogram = log_specgram(samples,sample_rate,10, 0)
log_spectrogram.shape

(1, 2, 16573)

## Training using MFCC Loading the dataset

In [27]:
temp = pd.read_json(URBAN_SOUND_DIR+"train_mfcc_transformed.json")

Delete the below row at some point

In [29]:
temp = temp.head(300)

In [30]:
classes = list(temp['label'].unique())

In [31]:
for i,val in enumerate(classes):
    print(i,val)

0 siren
1 street_music
2 engine_idling
3 jackhammer
4 car_horn
5 drilling
6 dog_bark
7 air_conditioner
8 children_playing
9 gun_shot


In [32]:
temp['label_encoded'] = temp['label'].apply(classes.index)

# EXTRACT FEATURES USING IMAGE NET!!!

In [4]:
from keras.applications.inception_v3 import InceptionV3

In [7]:
pre_model = InceptionV3(weights='imagenet',
                  include_top=False,
                  input_shape=(23, 224, 3))

ValueError: The input must have 3 channels; got `input_shape=(1, 23, 200)`

# NEXT TEST THE JSON LOADING FORMAT!!!!

In [None]:
#temp = pd.read_csv(URBAN_SOUND_DIR+"train_mfcc_transformed.csv", dtype={'feature': np.ndarray.dtype} )

## Aggregate MFCC series into a single dimension

In [None]:
X_avg = []
for x in temp.feature.tolist():
    X_avg.append(np.mean(x,axis=0))

In [None]:
X_avg[0].shape

In [None]:
X = np.array(X_avg)

In [None]:
y = np.array(temp.label_encoded.tolist())

In [None]:
set([x.shape for x in X]) == set([(13,)])

## Aggregate MFCC to padded array

In [None]:
x_padded = []
for x in temp.feature.tolist():
    x_array = np.array(x)
    pad = np.zeros((801,13))
    pad[:x_array.shape[0],:] = x_array
    x_padded.append(pad)
x_padded_array = np.array(x_padded)

## Raw MFCC, different shapes

In [None]:
x_raw = np.array([np.array([np.array(y) for y in x]) for x in temp.feature.tolist()])

In [None]:
x_raw.shape

In [None]:
x_padded_array.shape

In [None]:
y.shape

# Setting up a simple model

With convolutions on the temporal space

In [7]:
import tensorflow as tf
from tensorflow import keras

In [None]:
conv_layer = keras.layers.Conv1D(200, (10),
                                 input_shape = (801,13),
                                 strides=10, padding='same', 
                                 data_format='channels_last', 
                                 dilation_rate=1, 
                                 activation=tf.nn.relu, 
                                 use_bias=True, 
                                 kernel_initializer='glorot_uniform', 
                                 bias_initializer='zeros', 
                                 kernel_regularizer=None, 
                                 bias_regularizer=None, 
                                 activity_regularizer=None,
                                 kernel_constraint=None, bias_constraint=None)

In [None]:
model = None

In [None]:
model_3 = keras.Sequential([
    conv_layer,
    keras.layers.Flatten(input_shape=(801,13)),
    keras.layers.Dense(200, activation=tf.nn.relu),
    keras.layers.Dropout(0.2, noise_shape=None, seed=None),
    keras.layers.Dense(128, activation=tf.nn.relu),
    keras.layers.Dense(70, activation=tf.nn.relu),
    keras.layers.Dense(10, activation=tf.nn.softmax)
])

In [None]:
model_3.compile(optimizer='sgd', 
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
model_3.fit(x_padded_array, y, epochs=20,batch_size=10,shuffle=False,validation_split=0.1)

In [None]:
## Save the model to JSON

In [None]:
# serialize model to JSON
model_json = model.to_json()
with open("./trained_models/hdf5/urban_sound_model.json", "w") as json_file:
    json_file.write(model_json)

In [None]:
# serialize weights to HDF5
model.save_weights("./trained_models/hdf5/urban_sound_model.h5")
print("Saved model to disk")

In [8]:
with open('./trained_models/hdf5/urban_sound_model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = keras.models.model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("./trained_models/hdf5/urban_sound_model.h5")


# Test the MFCC model

In [None]:
#test_temp = test_df.apply(parse_mfcc,args=('Test',4800), axis=1)
#test_temp.columns = ['feature']

In [None]:
#test_temp.to_json(URBAN_SOUND_DIR+"test_mfcc_transformed.json")

In [4]:
test_temp = pd.read_json(URBAN_SOUND_DIR+"test_mfcc_transformed.json")

In [5]:
test_temp.shape

(3297, 1)

# First Submission/ Prediction

In [9]:
x_test_padded = []
for x in test_temp.feature.tolist():
    x_array = np.array(x)
    pad = np.zeros((801,13))
    pad[:x_array.shape[0],:] = x_array
    x_test_padded.append(pad)
x_test_padded_array = np.array(x_test_padded)

In [10]:
y_prediction = loaded_model.predict(x_test_padded_array)

In [11]:
test_df['prediction'] = y_prediction.argmax(axis=-1)

In [13]:
def class_for_idx(cl):
    return classes[cl]

In [14]:
test_df['Class'] = test_df['prediction'].apply(class_for_idx)

NameError: name 'classes' is not defined

In [19]:
test_df.to_csv(URBAN_SOUND_DIR+'test_prediction_baseline.csv')

# Simple model using spectrograms

In [None]:
def parser_spec(row):
    # fun
    #ction to load files and extract features
    folder = 'Train'
    file_name = os.path.join(URBAN_SOUND_DIR, folder, str(row.ID) + '.wav')
   # handle exception to check if there isn't a file which is corrupted
    try:
        samples, sample_rate = soundfile.read(file_name)
        # here kaiser_fast is a technique used for faster extraction
        #f, t, spectrogram= signal.spectrogram(samples,sample_rate)
        spectrogram = log_specgram(samples,sample_rate)
    except Exception as e:
        print("Error encountered while parsing file: ", file_name)
        return pd.Series([None, None])
    return pd.Series([spectrogram, row.Class])

In [None]:
train_df.head()

## Extracting features in chunks

In [None]:
def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))


In [None]:
chunk = 0
chunk_size = 50
for i in chunker(train_df.head(),chunk_size):
    temp_spec = i.apply(parser_spec, axis=1)
    temp_spec.columns = ['feature', 'label']
    temp_spec.to_json(URBAN_SOUND_DIR+"train_spectrogram_transformed_chunk_{}.json".format(chunk))
    print("Extracted features for chunk # {} of size {}".format(chunk,chunk_size))
    chunk +=1