# **Conv2D-MFCC**

In [None]:
import numpy as np
import librosa
import pandas as pd
import tensorflow as tf

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab runtime at a fixed mount point.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [None]:
# Load the index CSV; the feature-collection loop reads its `Path` and
# `Emotions` columns. Note that `data_path` is a DataFrame, not a path string.
DATA_PATH_CSV = "/content/gdrive/MyDrive/Colab Notebooks/FYP-SER/data_path.csv"
data_path = pd.read_csv(DATA_PATH_CSV)

In [None]:
def extract_features(data, target_len=44100, n_mfcc=50, sr=22050):
    """Compute a fixed-size MFCC matrix for a 1-D audio signal.

    The signal is truncated to its first `target_len` samples and, if shorter,
    left-padded with zeros so every clip yields the same number of frames.
    With the defaults this is 44100 samples (2 s at the 22050 Hz rate used by
    `librosa.load` in this notebook).

    Parameters
    ----------
    data : array-like
        1-D audio samples.
    target_len : int, default 44100
        Number of samples the signal is cropped / zero-padded to.
    n_mfcc : int, default 50
        Number of MFCC coefficients per frame.
    sr : int, default 22050
        Sampling rate forwarded to `librosa.feature.mfcc` (22050 is also
        librosa's own default, so the default behaviour is unchanged).

    Returns
    -------
    np.ndarray
        Array of shape (n_frames, n_mfcc): the librosa output transposed so
        that time is the first axis.
    """
    # float32 matches what the previous TensorFlow-based padding produced
    # (it concatenated onto a float32 zero tensor).
    clip = np.asarray(data, dtype=np.float32)[:target_len]

    missing = target_len - clip.shape[0]
    if missing > 0:
        # Zeros go in FRONT of the signal, as in the original implementation.
        clip = np.concatenate([np.zeros(missing, dtype=np.float32), clip])

    return librosa.feature.mfcc(y=clip, sr=sr, n_mfcc=n_mfcc).T


In [None]:
#Data Augmentation

def noise(data):
    """Return `data` plus additive Gaussian noise (the input is not modified).

    The noise amplitude is 0.035 * u * max(data), where u is a single draw
    from np.random.uniform(); every sample then receives an independent
    standard-normal draw scaled by that amplitude.
    """
    draw = np.random.uniform()
    amplitude = 0.035 * draw * np.amax(data)
    jitter = np.random.normal(size=data.shape[0])
    return data + amplitude * jitter

def stretch(data, rate=0.8):
    """Time-stretch `data` by `rate` without changing its pitch.

    Parameters
    ----------
    data : np.ndarray
        Audio samples.
    rate : float, default 0.8
        Stretch factor passed to librosa; values below 1 slow the signal
        down (longer output), values above 1 speed it up.

    Returns
    -------
    np.ndarray
        The stretched signal.
    """
    # `rate` must be passed by keyword: it is keyword-only in librosa >= 0.10,
    # and the keyword form also works on older releases.
    return librosa.effects.time_stretch(data, rate=rate)

def pitch(data, sampling_rate, pitch_factor=0.7):
    """Pitch-shift `data` by `pitch_factor` steps.

    Parameters
    ----------
    data : np.ndarray
        Audio samples.
    sampling_rate : int
        Sampling rate of `data`.
    pitch_factor : float, default 0.7
        Forwarded as librosa's `n_steps` (fractional steps are allowed;
        with librosa's default of 12 bins per octave a step is a semitone).

    Returns
    -------
    np.ndarray
        The pitch-shifted signal.
    """
    # `sr` and `n_steps` must be passed by keyword: they are keyword-only in
    # librosa >= 0.10, and the keyword form also works on older releases.
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_factor)

In [None]:
def remove_silent(data):
    """Trim leading and trailing silence from `data`.

    `librosa.effects.trim` returns a (trimmed_signal, index_interval) pair;
    only the trimmed signal is returned. `top_db=30` is the threshold passed
    to librosa for what counts as silence.
    """
    trimmed, _interval = librosa.effects.trim(data, top_db=30)
    return trimmed

In [None]:
def get_features(path, i):
    """Load one audio file and return the MFCC features of one variant of it.

    The file is loaded at 22050 Hz and leading/trailing silence is trimmed
    with `remove_silent` before the selected variant is produced.

    Parameters
    ----------
    path : str
        Path of the audio file.
    i : int
        Variant selector:
        0 - original signal (no augmentation),
        1 - signal with additive noise (`noise`),
        2 - signal time-stretched (`stretch`) and then pitch-shifted (`pitch`).

    Returns
    -------
    np.ndarray
        MFCC matrix produced by `extract_features`.

    Raises
    ------
    ValueError
        If `i` is not 0, 1 or 2. (Previously this surfaced as an
        UnboundLocalError, and only after the file had been loaded.)
    """
    if i not in (0, 1, 2):
        raise ValueError(f"unknown augmentation variant {i!r}; expected 0, 1 or 2")

    data, sample_rate = librosa.load(path, sr=22050)
    data = remove_silent(data)

    if i == 0:
        # without augmentation
        return np.array(extract_features(data))

    if i == 1:
        # data with noise
        return extract_features(noise(data))

    # i == 2: data with stretching and pitching
    stretched = stretch(data)
    return extract_features(pitch(stretched, sample_rate))

In [None]:
X, Y = [], []
for path, emotion in zip(data_path.Path, data_path.Emotions):
    # Three variants per file (selectors 0, 1, 2 of get_features), stored
    # consecutively; the file's emotion label is repeated once per variant.
    variants = [get_features(path, variant) for variant in range(3)]
    X.extend(variants)
    Y.extend([emotion] * len(variants))

In [None]:
# Number of feature matrices and number of labels collected; the two should match.
tuple(len(collected) for collected in (X, Y))

(4320, 4320)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Conv1D, MaxPooling1D,MaxPooling2D, Flatten, Dropout, BatchNormalization

In [None]:
# Multiclass problem: turn the emotion labels into one-hot rows.
encoder = OneHotEncoder()
label_column = np.array(Y).reshape(-1, 1)  # the encoder expects a 2-D (n_samples, 1) column
Y = encoder.fit_transform(label_column).toarray()  # densify the encoder's output

In [None]:
# Convert the Python list of per-variant MFCC matrices into a single ndarray
# so it can be split and indexed as one array in the following cells.
X=np.array(X)

In [None]:
# splitting data
# Each source clip contributes N_VARIANTS consecutive rows of X (original,
# noisy, stretched+pitched). A plain row-level random split would put variants
# of the same clip on both sides, leaking test clips into training, so the
# split is done per clip: all variants of a clip stay together.
from sklearn.model_selection import GroupShuffleSplit

N_VARIANTS = 3  # must match the number of variants generated per file above
assert len(X) % N_VARIANTS == 0, "X is expected to hold N_VARIANTS consecutive rows per clip"
clip_ids = np.arange(len(X)) // N_VARIANTS

# Same 75/25 ratio and seed as the previous train_test_split defaults.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=0)
train_idx, test_idx = next(splitter.split(X, Y, groups=clip_ids))

x_train, x_test = X[train_idx], X[test_idx]
y_train, y_test = Y[train_idx], Y[test_idx]
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((3240, 87, 50), (3240, 8), (1080, 87, 50), (1080, 8))

In [None]:
# Conv2D expects a trailing channel axis: insert a new size-1 axis at position 3.
x_train, x_test = (np.expand_dims(split, axis=3) for split in (x_train, x_test))
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((3240, 87, 50, 1), (3240, 8), (1080, 87, 50, 1), (1080, 8))

In [None]:
#Modelling

# Four identical convolutional stages (5x5 tanh conv -> batch norm -> 3x3 max
# pool, all with 'same' padding); only the filter count changes per stage.
CONV_FILTERS = (256, 512, 256, 128)
input_dims = (x_train.shape[1], x_train.shape[2], 1)

model = Sequential()
for stage, n_filters in enumerate(CONV_FILTERS):
    conv_kwargs = {'activation': 'tanh', 'padding': 'same'}
    if stage == 0:
        # Only the first layer declares the input shape.
        conv_kwargs['input_shape'] = input_dims
    model.add(Conv2D(n_filters, (5, 5), **conv_kwargs))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(pool_size=(3, 3), padding='same'))

# Classifier head: flatten, one hidden dense layer, then an 8-way softmax.
model.add(Flatten())

model.add(Dense(64, activation='tanh'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(8, activation='softmax'))

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 87, 50, 256)       6656      
                                                                 
 batch_normalization (BatchN  (None, 87, 50, 256)      1024      
 ormalization)                                                   
                                                                 
 max_pooling2d (MaxPooling2D  (None, 29, 17, 256)      0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 29, 17, 512)       3277312   
                                                                 
 batch_normalization_1 (Batc  (None, 29, 17, 512)      2048      
 hNormalization)                                                 
                                                        

In [None]:
# Train for up to n_epochs. The callback watches the *training* loss and
# scales the learning rate by 0.4 (never below 1e-7) after 2 epochs without
# improvement. The test split doubles as the validation set here.
n_epochs = 100
rlrp = ReduceLROnPlateau(
    monitor='loss',
    factor=0.4,
    patience=2,
    min_lr=0.0000001,
    verbose=0,
)
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=n_epochs,
    batch_size=64,
    callbacks=[rlrp],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100

In [None]:
# Report test accuracy (evaluate returns [loss, accuracy] for this model's
# compiled metrics) and plot the per-epoch training curves.
print("Accuracy of our model on test data : " , model.evaluate(x_test,y_test)[1]*100 , "%")

train_acc = history.history['accuracy']
train_loss = history.history['loss']
test_acc = history.history['val_accuracy']
test_loss = history.history['val_loss']

# Build the x-axis from the epochs actually recorded rather than from
# n_epochs: the two differ if training was interrupted or stopped early, and
# a length mismatch would make the plot calls below fail.
epochs = list(range(len(train_loss)))

fig , ax = plt.subplots(1,2)
fig.set_size_inches(20,6)

ax[0].plot(epochs , train_loss , label = 'Training Loss')
ax[0].plot(epochs , test_loss , label = 'Testing Loss')
ax[0].set_title('Training & Testing Loss')
ax[0].legend()
ax[0].set_xlabel("Epochs")

ax[1].plot(epochs , train_acc , label = 'Training Accuracy')
ax[1].plot(epochs , test_acc , label = 'Testing Accuracy')
ax[1].set_title('Training & Testing Accuracy')
ax[1].legend()
ax[1].set_xlabel("Epochs")
plt.show()

In [None]:
# Predict on the test features, then map both the predicted score rows and the
# one-hot ground truth back to label values with the fitted encoder.
# NOTE: y_test is overwritten with decoded labels here, so re-running this
# cell would feed labels (not one-hot rows) back into the encoder.
pred_test = model.predict(x_test)
y_pred, y_test = (encoder.inverse_transform(encoded) for encoded in (pred_test, y_test))

In [None]:
# Side-by-side table of predicted vs. actual labels for the test split.
df = pd.DataFrame({
    'Predicted Labels': y_pred.flatten(),
    'Actual Labels': y_test.flatten(),
})

df

In [None]:
# Confusion matrix over ALL classes known to the encoder.
# encoder.categories_ is a list holding one array per encoded column; there is
# a single label column, so element 0 is the array of class names.
class_names = encoder.categories_[0]

# Passing labels= pins the row/column order and keeps the matrix square even
# when a class is absent from both y_test and y_pred (otherwise the matrix
# shrinks and no longer matches the index/columns below).
cm_counts = confusion_matrix(y_test, y_pred, labels=class_names)
cm = pd.DataFrame(cm_counts, index=class_names, columns=class_names)

plt.figure(figsize = (12, 10))
sns.heatmap(cm, linecolor='white', cmap='Greys', linewidth=1, annot=True, fmt='')
plt.title('Confusion Matrix', size=20)
plt.xlabel('Predicted Labels', size=14)
plt.ylabel('Actual Labels', size=14)
plt.show()

In [None]:
# Text summary of per-class metrics for the test predictions.
report = classification_report(y_test, y_pred)
print(report)