In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.layers import Conv2D,MaxPooling2D,GlobalAveragePooling2D,Dense,Flatten
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import librosa
from keras.layers import *
import os
import keras
from keras.models import Model
print(os.listdir("../input"))
from keras.utils import to_categorical
from glob import glob
from tqdm import tqdm
from keras.utils.np_utils import to_categorical
# Any results you write to the current directory are saved as output.

Using TensorFlow backend.


['audio_speech_actors_01-24']


In [2]:
# One entry per item directly under the dataset root (Actor_* folders, judging by the sample path printed further down).
audiodir=glob("../input/audio_speech_actors_01-24/*")

In [3]:
# For each directory found above, list its contents: the result is a list of lists of paths.
audiodirfilename = list(map(lambda folder: glob(folder + "/*"), audiodir))

In [4]:
# Flatten the per-directory lists into a single flat list of file paths.
audiodirfilename1 = [wav_path for folder_files in audiodirfilename for wav_path in folder_files]

In [5]:
# Sanity check: total number of file paths collected.
len(audiodirfilename1)

1440

In [6]:
# Audio / spectrogram settings read as globals by the helper functions defined in the next cell.
sr=32000  # sample rate requested when loading audio
duration=4.5  # clip length; with sr in Hz this is seconds (samples = sr * duration)
hop_length=400  # NOTE(review): defined but never passed to melspectrogram in the cells shown
fmin=20  # lower frequency bound handed to melspectrogram
fmax=sr//2  # upper frequency bound handed to melspectrogram: half the sample rate
n_mels=128  # number of mel bands
n_fft=n_mels*20  # FFT size handed to melspectrogram (2560)
samples=int(sr*duration)  # every clip is padded/truncated to this many samples (144000)


In [7]:
def loadaudio(path):
    """Load one audio file as a fixed-length waveform.

    The file is loaded at the notebook-level sample rate ``sr`` and then
    forced to exactly ``samples`` values: shorter clips are zero-padded at
    the end, longer clips are truncated.

    Parameters
    ----------
    path : str
        Path of the audio file to read.

    Returns
    -------
    numpy.ndarray
        Waveform of length ``samples``.
    """
    # ``sr`` is passed by keyword: recent librosa releases make every argument
    # after ``path`` keyword-only, so ``load(path, sr)`` raises a TypeError there.
    y, _ = librosa.load(path, sr=sr)
    missing = samples - len(y)
    if missing > 0:
        # Zero-pad the tail so that every clip has the same length.
        y = np.pad(y, (0, missing), 'constant')
    else:
        y = y[:samples]
    return y

def audio_melspectrogram(x):
    """Convert a waveform into a dB-scaled mel spectrogram.

    Uses the notebook-level ``sr``, ``n_mels``, ``n_fft``, ``fmin`` and
    ``fmax`` settings. The notebook-level ``hop_length`` is NOT passed, so
    librosa's own default hop is used; the frame count (282) that the model
    input is hard-coded to depends on that.

    Parameters
    ----------
    x : numpy.ndarray
        Audio samples, as returned by ``loadaudio``.

    Returns
    -------
    numpy.ndarray
        float32 array; (n_mels, frames) judging by the shape printed below.
    """
    # The waveform must be given as ``y=``: recent librosa releases no longer
    # accept the audio signal as a positional argument.
    spectrogram = librosa.feature.melspectrogram(
        y=x, sr=sr, n_mels=n_mels, n_fft=n_fft, fmin=fmin, fmax=fmax
    )
    return librosa.power_to_db(spectrogram).astype(np.float32)

def convert_wav_image(paths):
    """Turn a sequence of audio file paths into a list of spectrogram "images".

    Each file is loaded with ``loadaudio``, converted with
    ``audio_melspectrogram`` and transposed before being collected.
    A tqdm progress bar tracks the iteration over ``paths``.

    Returns
    -------
    list of numpy.ndarray
        One transposed spectrogram per input path, in input order.
    """
    return [
        audio_melspectrogram(loadaudio(wav_path)).transpose()
        for wav_path in tqdm(paths)
    ]

def normalize(img):
    """Standardize one array to zero mean and unit standard deviation.

    The mean and standard deviation are taken over the whole array. When the
    standard deviation is exactly zero (constant input), a small constant is
    used as the divisor instead so that no division by zero occurs.
    """
    eps = 0.01
    centered = img - np.mean(img)
    spread = np.std(img)
    divisor = spread if spread != 0 else eps
    return centered / divisor

def normalize_dataset(X):
    """Apply ``normalize`` to every array in ``X``.

    Each element is standardized independently (its own mean and standard
    deviation), and the results are returned as a new Python list in the
    same order.
    """
    return [normalize(sample) for sample in X]

In [8]:
# Peek at the first path to see the directory layout and the dash-separated file-name pattern.
audiodirfilename1[0]

'../input/audio_speech_actors_01-24/Actor_18/03-01-02-01-02-02-18.wav'

In [9]:
# Mel spectrogram of the first clip without the power_to_db step, computed only to inspect its shape.
# The waveform is passed as y=...: recent librosa releases accept the audio signal by keyword only.
y=librosa.feature.melspectrogram(y=loadaudio(audiodirfilename1[0]),sr=sr,n_mels=n_mels,n_fft=n_fft,fmin=fmin,fmax=fmax)

In [10]:
# Shape of a single spectrogram; the first axis matches n_mels (128).
y.shape

(128, 282)

In [11]:
# Listening check: render an inline audio player for one clip.
import IPython.display as ipd  # To play sound in the notebook
fname = audiodirfilename1[802]   # arbitrary clip from the file list (index chosen by hand)
ipd.Audio(fname)

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Convert every clip to a spectrogram and standardize each one independently.
# X ends up as a Python list of per-clip normalized arrays (normalize_dataset returns a list).
X = normalize_dataset(np.array(convert_wav_image(audiodirfilename1)))


100%|██████████| 1440/1440 [03:54<00:00,  6.19it/s]


In [14]:
# File names are dash-separated numeric fields (e.g. 03-01-02-01-02-02-18.wav); the third field is the class label.
# The whole field is parsed (not just its second character) so two-digit codes would not be silently mangled,
# and os.path.basename is used instead of splitting on "/" so the parse does not depend on the path separator.
ytrain=[int(os.path.basename(x).split("-")[2]) for x in audiodirfilename1]

In [15]:
# Distinct label values present in the data.
set(ytrain)

{1, 2, 3, 4, 5, 6, 7, 8}

In [16]:
ytrain1=to_categorical(np.array(ytrain)-1,8) # to_categorical expects class indices starting at 0, so labels 1..8 are shifted to 0..7 before one-hot encoding

In [17]:
# Show the first ten one-hot encoded label rows.
ytrain1[:10]

array([[0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 0., 0., 0.]], dtype=float32)

In [18]:
# Stack the list of normalized spectrograms into a single array.
X1=np.array(X)

In [19]:
# Shape of the stacked array: one spectrogram per clip.
X1.shape

(1440, 282, 128)

In [20]:
# Add a trailing axis of size 1 so each sample matches the (282,128,1) input declared for the model.
X1=X1.reshape((-1,282,128,1))

In [21]:

def spatial_squeeze_excite_block(input1):
    """Gate a feature map with a per-position sigmoid mask.

    A 1x1 convolution with a single output channel and sigmoid activation
    produces the mask, which is multiplied with ``input1``.

    Returns the gated tensor.
    """
    mask_layer = Conv2D(1, (1, 1), activation='sigmoid', kernel_initializer='he_normal')
    spatial_mask = mask_layer(input1)
    return Multiply()([input1, spatial_mask])

def squeeze_excite_block(input1,ratio=4):
    """Gate a feature map with per-channel sigmoid weights.

    The input is globally average-pooled, passed through a two-layer dense
    bottleneck (``channels // ratio`` ReLU units, then ``channels`` sigmoid
    units), reshaped to ``(1, 1, channels)`` and multiplied with ``input1``.

    Parameters
    ----------
    input1 : Keras tensor
        Feature map whose last axis is the channel axis.
    ratio : int, default 4
        Reduction factor of the dense bottleneck.

    Returns the gated tensor.
    """
    # Read the channel count through the public backend API rather than the
    # private ``_keras_shape`` attribute, which is not available on every
    # Keras tensor implementation.
    channels = keras.backend.int_shape(input1)[-1]
    se = GlobalAveragePooling2D()(input1)
    se = Dense(channels // ratio, activation='relu')(se)
    se = Dense(channels, activation='sigmoid')(se)
    se = Reshape((1, 1, channels))(se)
    finput = Multiply()([input1, se])
    return finput

def spatial_Csqueeze_excite_block(input1,ratio=4):
    """Combine the channel gate and the spatial gate of ``input1``.

    Builds ``squeeze_excite_block(input1, ratio)`` and
    ``spatial_squeeze_excite_block(input1)`` (in that order) and returns
    their element-wise sum.
    """
    channel_gated = squeeze_excite_block(input1, ratio)
    spatially_gated = spatial_squeeze_excite_block(input1)
    return Add()([channel_gated, spatially_gated])

In [22]:
def _conv_se_unit(x, filters, kernel_size):
    """Conv2D -> SpatialDropout2D -> BatchNorm -> ReLU -> combined squeeze-excite block."""
    x = Conv2D(filters=filters, kernel_size=kernel_size, strides=1, padding='same')(x)
    x = SpatialDropout2D(0.2, data_format='channels_last')(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    return spatial_Csqueeze_excite_block(x)


def _bottleneck_1x1(x, filters=64):
    """1x1 convolution followed by BatchNorm and ReLU."""
    x = Conv2D(filters, (1, 1))(x)
    x = BatchNormalization()(x)
    return Activation('relu')(x)


# Input: one single-channel spectrogram per sample.
inp = keras.Input((282,128,1))

# ---- Stage 1: parallel branches on the raw input, differing only in kernel size.
# Each branch gets an extra spatial dropout after its squeeze-excite block.
conv1 = _conv_se_unit(inp, 32, 3)
conv1 = SpatialDropout2D(0.2, data_format='channels_last')(conv1)

conv2 = _conv_se_unit(inp, 32, 4)
conv2 = SpatialDropout2D(0.2, data_format='channels_last')(conv2)

# NOTE(review): conv3 is built but is not part of the concatenation below, so it
# never reaches the model output. Either add it to the list or delete the branch.
conv3 = _conv_se_unit(inp, 32, 5)
conv3 = SpatialDropout2D(0.2, data_format='channels_last')(conv3)

concat = Concatenate(axis=-1)([conv1, conv2])
concat = spatial_Csqueeze_excite_block(concat)

# ---- Stage 2: 1x1 bottleneck, then conv unit and max-pooling, in two parallel branches.
conv31 = _conv_se_unit(_bottleneck_1x1(concat), 32, 3)
max_pool31 = keras.layers.MaxPool2D(padding="SAME")(conv31)

conv32 = _conv_se_unit(_bottleneck_1x1(concat), 32, 4)
max_pool32 = keras.layers.MaxPool2D(padding="SAME")(conv32)

concat2 = Concatenate()([max_pool31, max_pool32])
concat2 = spatial_Csqueeze_excite_block(concat2)

# ---- Stage 3: two wider parallel branches, both fed from concat2.
conv41 = _conv_se_unit(concat2, 128, 3)
max_pool41 = keras.layers.MaxPool2D(padding="SAME")(conv41)

conv51 = _conv_se_unit(concat2, 256, 4)
max_pool51 = keras.layers.MaxPool2D(padding="SAME")(conv51)
concat4 = Concatenate()([max_pool51, max_pool41])

# ---- Classification head.
avg_pool = GlobalAveragePooling2D()(concat4)
# The targets are one-hot vectors over 8 mutually exclusive classes and the model
# is trained with categorical cross-entropy, so the output must be a probability
# distribution: softmax, not independent per-class sigmoids.
dense10 = keras.layers.Dense(8, activation='softmax')(avg_pool)


model = keras.Model(inputs=inp, outputs=dense10)

In [23]:
# Print the layer-by-layer summary (output shapes, parameter counts, connectivity).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 282, 128, 1)  0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 282, 128, 32) 320         input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_3 (Conv2D)               (None, 282, 128, 32) 544         input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout2d_1 (SpatialDro (None, 282, 128, 32) 0           conv2d_1[0][0]                   
__________________________________________________________________________________________________
spatial_dr

In [24]:
# Configure training: Adam optimizer, categorical cross-entropy on the one-hot labels, accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [25]:
# Train for up to 200 epochs in batches of 32, holding out 10% of the samples for validation.
# NOTE(review): Keras is documented to take the validation_split fraction from the END of the arrays,
# before any shuffling — confirm that the tail of the file list is a representative sample of the classes.
# NOTE(review): no random seed is set anywhere in the visible cells, so runs are not exactly repeatable.
history=model.fit(X1,ytrain1,batch_size=32,epochs=200,verbose=1,shuffle=True,validation_split=0.1)

Train on 1296 samples, validate on 144 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200


In [26]:
# Important intuition gained while watching training:
# If validation accuracy swings widely from epoch to epoch, stronger regularization (dropout etc.) is needed.
# Spatial dropout makes more sense for conv layers because adjacent activations in a feature map can be strongly correlated;
# dropping whole feature maps stops the network from recovering the dropped values from their neighbours, so it regularizes better.
