# <center>Speech emotion recognition - UHH ML Project 2D-CNN</center>

## Introduction 
This notebook contains our 2D CNN model for Speech Emotion Recognition. It follows these steps:
1. Data preparation and processing
2. Build a model
3. Compile model
4. Serialize model
5. Validate model




In [None]:
import pandas as pd
import tensorflow as tf
import librosa
import librosa.display
import matplotlib.pyplot as plt
import IPython.display as ipd  
import glob 
import os
import sys
import numpy as np


from tensorflow import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense,Input, Flatten, Dropout, Activation, BatchNormalization
from tensorflow.keras.layers import Conv2D, MaxPool2D
from tensorflow.keras.utils import to_categorical
from keras.utils import np_utils
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder 
import seaborn as sns
from tqdm import tqdm
import keras_tuner as kt
from keras_tuner import HyperModel
from keras_tuner.tuners import Hyperband

In [None]:
def speedNpitch(data):
    """Augment a 1-D waveform by resampling it with a random speed-up factor.

    A factor ``1.2 / u`` with ``u`` drawn uniformly from ``[0.8, 1)`` is used
    as the step between sample positions, and the signal is linearly
    interpolated at those positions. Because the step is larger than one
    sample, the resampled signal is shorter than the input; the remainder of
    the output is filled with zeros so the length is unchanged.

    Args:
        data: 1-D array-like of audio samples.

    Returns:
        A new array with the same shape and dtype as ``data``. The input is
        left untouched (the previous implementation overwrote it in place).
    """
    data = np.asarray(data)
    if data.shape[0] == 0:
        # np.interp rejects empty sample arrays; nothing to augment anyway.
        return data.copy()

    length_change = np.random.uniform(low=0.8, high=1)
    speed_fac = 1.2 / length_change
    sample_positions = np.arange(0, len(data), speed_fac)
    tmp = np.interp(sample_positions, np.arange(0, len(data)), data)

    minlen = min(data.shape[0], tmp.shape[0])
    # zeros_like keeps the caller's dtype, matching the old in-place result.
    augmented = np.zeros_like(data)
    augmented[0:minlen] = tmp[0:minlen]
    return augmented

def plot_history(history):
    """Plot training/validation accuracy and loss curves from a Keras fit.

    Args:
        history: Object returned by ``model.fit``; its ``history`` attribute
            must be a dict holding per-epoch lists for the accuracy metric,
            ``loss`` and their ``val_`` counterparts.

    Raises:
        KeyError: If neither ``'acc'`` nor ``'accuracy'`` was recorded.
    """
    logs = history.history
    # The history key mirrors the metric string given to compile(); this file
    # uses both 'acc' and 'accuracy', so accept either spelling.
    acc_key = 'acc' if 'acc' in logs else 'accuracy'

    _, axs = plt.subplots(2)

    # Accuracy subplot
    axs[0].plot(logs[acc_key], label='train accuracy')
    axs[0].plot(logs['val_' + acc_key], label='test accuracy')
    axs[0].set_ylabel('Accuracy')
    axs[0].legend(loc='lower right')
    axs[0].set_title('Accuracy eval')

    # Error (loss) subplot
    axs[1].plot(logs['loss'], label='train error')
    axs[1].plot(logs['val_loss'], label='test error')
    axs[1].set_ylabel('Error')
    axs[1].set_xlabel('Epoch')
    axs[1].legend(loc='upper right')
    axs[1].set_title('Error eval')

    plt.show()
    
def print_confusion_matrix(confusion_matrix, class_names, figsize = (10,7), fontsize=14):
    """Draw a confusion matrix as an annotated seaborn heatmap.

    Args:
        confusion_matrix: Square 2-D array of integer counts.
        class_names: Labels used for both the rows and the columns, in the
            same order as the matrix.
        figsize: Size of the matplotlib figure that is created.
        fontsize: Font size of the tick labels on both axes.

    Raises:
        ValueError: If the heatmap cannot be rendered with integer
            annotations (``fmt="d"``); the original error is chained as the
            cause.
    """
    df_cm = pd.DataFrame(confusion_matrix, index=class_names, columns=class_names)
    # Open a new figure of the requested size; the heatmap draws into it.
    plt.figure(figsize=figsize)
    try:
        heatmap = sns.heatmap(df_cm, annot=True, fmt="d")
    except ValueError as err:
        raise ValueError("Confusion matrix values must be integers.") from err

    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

<a id="data"></a>
## 1. Getting Data
Read in the data csv-file with all labels and paths of the four datasets.

In [None]:
# Table with one row per audio clip; the `path` and `labels` columns are used below.
ref = pd.read_csv("/kaggle/input/data-pathcsv/Data_path.csv")
print(ref.shape)
# Feature tensor: one 30 x 216 MFCC matrix per clip. Zero-initialised (not
# np.empty) because the extraction loop only writes the first MFCC.shape[1]
# frame columns, so any unwritten column must not contain uninitialised memory.
df = np.zeros(shape=(ref.shape[0], 30, 216))

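Here we read each audio file, apply the speed/pitch augmentation and extract 30 MFCC features per frame. The full 30 × 216 MFCC matrix of every clip is kept (it is not averaged over time), because the 2D CNN needs the time axis as its second input dimension.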

In [None]:
# Every clip is cut or zero-padded to exactly this many samples (2.5 s at
# 44.1 kHz). It must be an int: it is used in slice bounds and pad widths.
input_length = int(44100 * 2.5)

for j, path in enumerate(tqdm(ref.path)):
    wav, sr = librosa.load(path, sr=44100, offset=0.5, duration=2.5, res_type='kaiser_fast')

    # Random offset / Padding
    if len(wav) > input_length:
        # Too long: keep a randomly positioned window of input_length samples.
        offset = np.random.randint(len(wav) - input_length)
        wav = wav[offset:offset + input_length]
    else:
        # Too short (or exact): place the clip at a random position inside a
        # zero-filled buffer. randint(0) is invalid, hence the guard.
        max_offset = input_length - len(wav)
        offset = np.random.randint(max_offset) if max_offset > 0 else 0
        wav = np.pad(wav, (offset, input_length - len(wav) - offset), "constant")

    # Data augmentation, then 30 MFCCs per frame.
    wav = speedNpitch(wav)
    MFCC = librosa.feature.mfcc(y=wav, sr=sr, n_mfcc=30)
    # Row j of the feature tensor belongs to row j of `ref`.
    df[j, 0:30, 0:MFCC.shape[1]] = MFCC

In [None]:
# Shape of the MFCC matrix of the last clip processed by the loop above: (n_mfcc, frames).
print(MFCC.shape)

In [None]:
# Shape of the full feature tensor: (number of clips, 30, 216).
print(df.shape)

<a id="processing"></a>
### Data processing

Now the data is being put into a practical format for Keras and the CNN.


In [None]:
# Hold out 25 % of the clips for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(df, ref.labels, test_size = 0.25, random_state = 42)
X_shape = X_train

# Fit the label encoder on the training labels ONLY and reuse that mapping for
# the test labels. Re-fitting on y_test could assign different integer codes
# to the same class and would leave `lb` describing the test split, which is
# later used to decode predictions.
lb = LabelEncoder()
train_codes = lb.fit_transform(y_train)
num_classes = len(lb.classes_)
# Pin the one-hot width so both splits have the same number of columns even if
# one of them happens to miss the highest-numbered class.
y_train = to_categorical(train_codes, num_classes=num_classes)
y_test = to_categorical(lb.transform(y_test), num_classes=num_classes)

print(X_train.shape)
print(lb.classes_)
print(y_train[0:10])
print(y_test[0:200])

In [None]:
# Conv2D expects a channel dimension, so append a new axis at position 3 to
# both feature tensors.
X_train, X_test = (np.expand_dims(features, axis=3) for features in (X_train, X_test))
# Cell output: size of axis 1 of the training tensor.
X_train.shape[1]

In [None]:
# Cell output: full shape of the training tensor after the channel axis was added.
X_train.shape

<a id="build"></a>
## 2. Build a model
We are using a 2D CNN. 

In [None]:
#Hyper Tuning
class CNNHyperModel(HyperModel):
    def __init__(self, input_shape, num_classes):
        self.input_shape = input_shape
        self.num_classes = num_classes

    def build(self, hp):
        model = tf.keras.Sequential()
        #input layer
        model.add(Conv2D(filters=hp.Choice('num_filters_1',values=[16, 256],default=16,), kernel_size=(4,10),activation=hp.Choice('Conv2D_activation_1',values=['relu','tanh','sigmoid'],default='relu'), padding='same', input_shape = (30, 216, 1)))
        model.add(BatchNormalization())
        model.add(MaxPool2D())
        model.add(Dropout(rate=hp.Float('dropout_1',min_value=0.0,max_value=0.5,default=0.25,step=0.05,)))
        # 2nd Conv2d layer          
        model.add(Conv2D(filters=hp.Choice('num_filters_2',values=[16, 256],default=16,),activation=hp.Choice('Conv2D_activation_2',values=['relu','tanh','sigmoid'],default='relu'),kernel_size=(4,10),padding='same'))
        model.add(BatchNormalization())
        model.add(MaxPool2D())
        model.add(Dropout(rate=hp.Float('dropout_2',min_value=0.0,max_value=0.5,default=0.25,step=0.05,)))
        # 3rd Conv2d layer          
        model.add(Conv2D(filters=hp.Choice('num_filters_3',values=[16, 256],default=16,), activation=hp.Choice('Conv2D_activation_3',values=['relu','tanh','sigmoid'],default='relu'),kernel_size=(4,10),padding='same'))
        model.add(BatchNormalization())
        model.add(MaxPool2D())
        model.add(Dropout(rate=hp.Float('dropout_3',min_value=0.0,max_value=0.5,default=0.25,step=0.05,)))
        # 4th Conv2d layer          
        model.add(Conv2D(filters=hp.Choice('num_filters_4',values=[16, 256],default=16,),activation=hp.Choice('Conv2D_activation_4',values=['relu','tanh','sigmoid'],default='relu'),kernel_size=(4,10),padding='same'))
        model.add(BatchNormalization())
        model.add(MaxPool2D())
        model.add(Dropout(rate=hp.Float('dropout_4',min_value=0.0,max_value=0.5,default=0.25,step=0.05,)))
        # 1st Dense layer
        model.add(Flatten())
        model.add(Dense(units=hp.Int('units',min_value=32, max_value=512,step=32,default=64),activation=hp.Choice('dense_activation',values=['relu','tanh','sigmoid'],default='relu')))
        model.add(Dropout(rate=hp.Float('dropout_5', min_value=0.0,max_value=0.5,default=0.25,step=0.05)))
        # Output layer 
        model.add(Dense(14))         
        model.add(Activation('softmax'))
        #opt = tf.keras.optimizers.Adam(hp.Float('learning_rate',min_value=1e-4,max_value=1e-2,sampling='LOG',default=1e-3))
        model.compile(optimizer=tf.keras.optimizers.Adam(hp.Float('learning_rate',min_value=1e-4, max_value=1e-2,sampling='LOG',default=1e-3)),loss=tf.keras.losses.categorical_crossentropy,metrics=['accuracy'])
        #model.summary()
        return model

In [None]:
# Size of the softmax output passed to the hypermodel.
NUM_CLASSES = 14
# Input samples are 30 x 216 single-channel matrices.
hypermodel = CNNHyperModel(num_classes=NUM_CLASSES, input_shape=(30, 216, 1))

In [None]:
# Search configuration.
HYPERBAND_MAX_EPOCHS = 30   # passed to Hyperband as max_epochs
MAX_TRIALS = 60             # defined but not used by the Hyperband call below
EXECUTION_PER_TRIAL = 3     # passed to Hyperband as executions_per_trial
SEED = 1
N_EPOCH_SEARCH = 40         # passed as `epochs` to tuner.search()

# Hyperband tuner maximising validation accuracy; results are stored under
# ./hyperband/TPU1_aug.
tuner = Hyperband(
    hypermodel,
    max_epochs=HYPERBAND_MAX_EPOCHS,
    objective='val_accuracy',
    seed=SEED,
    executions_per_trial=EXECUTION_PER_TRIAL,
    directory='hyperband',
    project_name='TPU1_aug',
)

In [None]:
# Print the hyperparameters the tuner will search over.
tuner.search_space_summary()

In [None]:
# Run the hyperparameter search on the training data, holding out 10 % of it
# for validation (the tuner objective is 'val_accuracy').
# NOTE(review): Hyperband presumably assigns the per-trial epoch budget itself
# (bounded by max_epochs), so `epochs` here may be overridden — confirm against
# the keras_tuner documentation.
tuner.search(X_train, y_train, epochs=N_EPOCH_SEARCH, validation_split=0.1)

In [None]:
# Report what the search found.
tuner.results_summary()

# Take the single best model of the search and score it on the test split.
best_model = tuner.get_best_models(num_models=1)[0]
loss, accuracy = best_model.evaluate(X_test, y_test)

# Persist the full model under ./saved_models (created on demand).
model_name = 'best_model_aug.h5'
save_dir = os.path.join(os.getcwd(), 'saved_models')
os.makedirs(save_dir, exist_ok=True)
model_path = os.path.join(save_dir, model_name)
best_model.save(model_path)
print('Save model and weights at %s ' % model_path)

# Also dump the architecture alone as JSON into the working directory.
model_json = best_model.to_json()
with open("model_json_aug.json", "w") as json_file:
    json_file.write(model_json)

<a id="compile"></a>
## 3. Compile Model 


In [None]:
# Re-compile the best model with a fixed Adam learning rate and train it
# further. The metric is named 'acc', which is the key plot_history() reads.
# NOTE(review): the test split doubles as validation data here.
best_model.compile(
    optimizer=tf.keras.optimizers.Adam(0.00034981818142934215),
    loss=tf.keras.losses.categorical_crossentropy,
    metrics=['acc'],
)
history = best_model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=30,
    validation_data=(X_test, y_test),
)

In [None]:
# Plot the accuracy and loss curves recorded by the fit() call above.
plot_history(history)

## 4. Serialize model

In [None]:
# Persist the re-trained model. The trained network lives in `best_model`;
# the previous code referenced an undefined name `model` and raised NameError.
model_name = 'EM_2D_aug.h5'
save_dir = os.path.join(os.getcwd(), 'saved_models')

if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
model_path = os.path.join(save_dir, model_name)
best_model.save(model_path)
print('Save model and weights at %s ' % model_path)

# Architecture only, as JSON, written to the working directory.
model_json = best_model.to_json()
with open("model_json_aug.json", "w") as json_file:
    json_file.write(model_json)

## 5. Validate model

In [None]:
# Class probabilities -> index of the most likely class -> original string label.
class_probabilities = best_model.predict(X_test, batch_size=16, verbose=1)
predicted_codes = class_probabilities.argmax(axis=1).astype(int).flatten()
preds = pd.DataFrame({'predictedvalues': lb.inverse_transform(predicted_codes)})

# Same decoding for the one-hot encoded ground truth.
actual_codes = y_test.argmax(axis=1).astype(int).flatten()
actual = pd.DataFrame({'actualvalues': lb.inverse_transform(actual_codes)})

# Two independent frames with the true and predicted label side by side; the
# later cells relabel them in different ways.
finaldf = actual.join(preds)
finaldf1 = actual.join(preds)

# Cell output: a sample of ten rows.
finaldf[170:180]

**Emotion by gender accuracy**

In [None]:
# Sorted class names as they occur in the ground truth; used as axis labels.
classes = np.sort(finaldf.actualvalues.unique())

# Accuracy and confusion matrix on the full gender_emotion labels.
true_labels, predicted_labels = finaldf.actualvalues, finaldf.predictedvalues
c = confusion_matrix(true_labels, predicted_labels)
print(accuracy_score(true_labels, predicted_labels))
print_confusion_matrix(c, class_names=classes)

In [None]:
# Per-class precision / recall / F1 on the full gender_emotion labels.
classes = np.sort(finaldf.actualvalues.unique())
full_label_report = classification_report(
    finaldf.actualvalues,
    finaldf.predictedvalues,
    target_names=classes,
)
print(full_label_report)

**Emotion accuracy result (genders merged), then gender accuracy result**

In [None]:
# Collapse the gender_emotion labels to the bare emotion and evaluate again.
# `modidf1` is the same object as `finaldf1`, so that frame is relabelled too.
modidf1 = finaldf1
emotion_of = {
    gender + '_' + emotion: emotion
    for gender in ('female', 'male')
    for emotion in ('angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral')
}
for column in ('actualvalues', 'predictedvalues'):
    modidf1[column] = modidf1[column].replace(emotion_of)

classes = np.sort(modidf1.actualvalues.unique())

d = confusion_matrix(modidf1.actualvalues, modidf1.predictedvalues)
print(accuracy_score(modidf1.actualvalues, modidf1.predictedvalues))
print_confusion_matrix(d, class_names=classes)

In [None]:
# Collapse the gender_emotion labels to the bare gender and evaluate again.
# Work on a copy: plain assignment would alias `finaldf`, so the relabelling
# would overwrite the full labels and re-running the earlier evaluation cells
# would silently report gender-level results.
modidf = finaldf.copy()
gender_of = {
    gender + '_' + emotion: gender
    for gender in ('female', 'male')
    for emotion in ('angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral')
}
for column in ('actualvalues', 'predictedvalues'):
    modidf[column] = modidf[column].replace(gender_of)

classes = modidf.actualvalues.unique()
classes.sort()

c = confusion_matrix(modidf.actualvalues, modidf.predictedvalues)
print(accuracy_score(modidf.actualvalues, modidf.predictedvalues))
print_confusion_matrix(c, class_names = classes)

In [None]:
# Per-class precision / recall / F1 on the gender-only labels.
classes = np.sort(modidf.actualvalues.unique())
gender_report = classification_report(
    modidf.actualvalues,
    modidf.predictedvalues,
    target_names=classes,
)
print(gender_report)

**Emotion accuracy**

In [None]:
# Classification report on the emotion-only labels. `modidf1` is the frame
# whose labels were collapsed to the bare emotion; the previous code reported
# on `modidf` (gender-only) and merely repeated the gender report.
classes = modidf1.actualvalues.unique()
classes.sort()
print(classification_report(modidf1.actualvalues, modidf1.predictedvalues, target_names=classes))