Project Title: Speech Emotion Recognition with Audio.

Goal: Predicting Emotion from an audio file with speech.

>Steps:
1. Importing Necessary Packages
2. As the data is stored on the drive a pipeline through drive and notebook is established with google.colab library for further processing.
3. Data Cleaning and Manipulation.
4. EDA on Audio files and content in the Audio files.
5. Extract features from the audio files through python Librosa package.
6. Initialize different classifiers and neural network models from the scikit-learn and TensorFlow libraries.
7. Train the models.
8. Test and Validate the models.
9. Performance Analysis of the models.
10. Conclusion.


In [None]:
# Importing all the required packages
import pandas as pd
import numpy as np
import os
import IPython
import tensorflow.keras.layers as L
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder,StandardScaler
import re
import itertools
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.svm import SVC
import librosa
import librosa.display
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import seaborn as sns
import IPython
import joblib
import math
from google.colab import drive 
drive.mount('/content/gdrive')

In [None]:
# Paths of different datasets
Crema_Path = r'/Dataset/Crema'
Ravdess_Path=r'/Dataset/Ravdess/audio_speech_actors_01-24'
Savee_Path=r'/Dataset/Savee'
Tess_Path=r'/Dataset/Tess'

In [None]:
# Creating a list for crema with emotion and audio file path
# Third '_'-separated field of a Crema file name -> emotion label
crema_codes={'SAD':'sad','ANG':'angry','DIS':'disgust','FEA':'fear','HAP':'happy','NEU':'neutral'}
crema=[]
for wav in os.listdir(Crema_Path):
    code=wav.partition(".wav")[0].split('_')[2]
    # Codes outside the table are kept and labelled 'unknown'
    crema.append((crema_codes.get(code,'unknown'),Crema_Path+'/'+wav))
Crema_df=pd.DataFrame.from_dict(crema).rename(columns={0:'Emotion',1:'File_Path'})
Crema_df.head()

In [None]:
# Unique Emotions in Crema
Crema_df['Emotion'].unique()

In [None]:
# Crema data shape
Crema_df.shape

In [None]:
# Emotion Distribution
plt.figure(figsize=(20,6))
plt.title('Emotions Counts of Crema')
emotions=sns.countplot(x='Emotion',data=Crema_df,palette='Set3')
emotions.set_xticklabels(emotions.get_xticklabels())
plt.show()

In [None]:
# Null values if any,
print('Number of null Values in crema data set: ',Crema_df.isna().sum().sum())

In [None]:
# Dataframe information
Crema_df.info()

In [None]:
# Creating a list for ravdess with emotion and audio file path
# Numeric emotion code (third '-'-separated field of the file name) -> label.
# Codes 1 and 2 are both mapped to 'neutral'.
ravdess_emotions={1:'neutral', 2:'neutral', 3:'happy', 4:'sad', 5:'angry', 6:'fear', 7:'disgust', 8:'surprise'}
ravdess=[]
for directory in os.listdir(Ravdess_Path):
    actor_dir=os.path.join(Ravdess_Path,directory)
    for wav in os.listdir(actor_dir):
        emotion_number=int(wav.partition('.wav')[0].split('-')[2])
        ravdess.append((emotion_number,os.path.join(actor_dir,wav)))
Ravdess_df=pd.DataFrame.from_dict(ravdess)
Ravdess_df.rename(columns={0:'Emotion',1:'File_Path'},inplace=True)
# Assign the result back: an inplace replace on Ravdess_df['Emotion'] acts on a
# temporary Series and is not guaranteed to update the DataFrame itself.
Ravdess_df['Emotion']=Ravdess_df['Emotion'].replace(ravdess_emotions)
Ravdess_df.head()


In [None]:
# Unique Emotions in Ravdess
Ravdess_df['Emotion'].unique()

In [None]:
# Ravdess file path size
Ravdess_df.shape

In [None]:
# Emotion Distribution
plt.figure(figsize=(20,6))
plt.title('Emotions Counts in Ravdess')
emotions=sns.countplot(x='Emotion',data=Ravdess_df,palette='Set3')
emotions.set_xticklabels(emotions.get_xticklabels())
plt.show()

In [None]:
# Null values if any,
print('Number of null Values in Ravdess data set: ',Ravdess_df.isna().sum().sum())

In [None]:
# Dataframe information
Ravdess_df.info()

In [None]:
# Creating a list for Savee with emotion and audio file path
# Letter prefix of the second '_'-separated field -> emotion label
savee_codes={'a':'angry','d':'disgust','f':'fear','h':'happy','n':'neutral','sa':'sad','su':'surprise'}
savee=[]
for wav in os.listdir(Savee_Path):
    emo=wav.partition('.wav')[0].split('_')[1]
    # Keep only the letters before the first digit; re.split does the work
    # (str.replace treats its pattern literally, so it cannot strip digits).
    emotion=re.split(r'[0-9]',emo)[0]
    # Files whose prefix is not in the table are skipped
    if emotion in savee_codes:
        savee.append((savee_codes[emotion],Savee_Path+'/'+wav))
Savee_df=pd.DataFrame.from_dict(savee)
Savee_df.rename(columns={0:'Emotion',1:'File_Path'},inplace=True)
Savee_df.head()

In [None]:
# Unique Emotions in Savee
Savee_df['Emotion'].unique()

In [None]:
#Savee list shape
Savee_df.shape

In [None]:
# Emotion Distribution
plt.figure(figsize=(20,6))
plt.title('Emotions Counts in Savee dataset')
emotions=sns.countplot(x='Emotion',data=Savee_df,palette='Set3')
emotions.set_xticklabels(emotions.get_xticklabels())
plt.show()

In [None]:
# Null values if any,
print('Number of null Values in Savee data set: ',Savee_df.isna().sum().sum())

In [None]:
# Dataframe information
Savee_df.info()

In [None]:
# Creating a list for tess with emotion and audio file path
tess=[]
for directory in os.listdir(Tess_Path):
    folder=os.path.join(Tess_Path,directory)
    for wav in os.listdir(folder):
        # Third '_'-separated field of the file name carries the emotion
        label=wav.partition('.wav')[0].split('_')[2]
        # 'ps' is stored as 'surprise'; every other value is used as-is
        tess.append(('surprise' if label=='ps' else label,os.path.join(folder,wav)))
Tess_df=pd.DataFrame.from_dict(tess).rename(columns={0:'Emotion',1:'File_Path'})
Tess_df.head()

In [None]:
# Data corrections: relabel 'neutral (1)' rows as 'neutral'.
# Select the 'Emotion' column explicitly; a row-only .loc assignment would
# overwrite File_Path with 'neutral' as well.
Tess_df.loc[Tess_df["Emotion"] == 'neutral (1)', 'Emotion'] = 'neutral'

In [None]:
# Unique Emotions in tess
Tess_df['Emotion'].unique()

In [None]:
# Tess data file shape
Tess_df.shape

In [None]:
# Emotion Distribution
plt.figure(figsize=(20,6))
plt.title('Emotions Counts in Tess dataset')
emotions=sns.countplot(x='Emotion',data=Tess_df,palette='Set3')
emotions.set_xticklabels(emotions.get_xticklabels())
plt.show()

In [None]:
# Null values if any,
print('Number of null Values in Tess data set: ',Tess_df.isna().sum().sum())

In [None]:
# Dataframe information
Tess_df.info()

In [None]:
# Combining all the datasets into a single pandas DataFrame
main_df = pd.concat([Crema_df,Ravdess_df,Savee_df,Tess_df],axis=0)
main_df.shape

In [None]:
# Concatenated Dataframe contents
main_df.head()

In [None]:
# Concatenated Datasets Unique Emotions
main_df['Emotion'].unique()

In [None]:
# Emotion Distribution
plt.figure(figsize=(20,6))
plt.title('Emotions Counts in concatenated Dataset')
emotions=sns.countplot(x='Emotion',data=main_df,palette='Set3')
emotions.set_xticklabels(emotions.get_xticklabels())
plt.show()

In [None]:
# Emotion names
emotion_names=main_df['Emotion'].unique()
emotion_names

In [None]:
# Unique color for each Emotion

colors={'disgust':'#808080','happy':'#ffff00','sad':'#ff4000','neutral':'#00bfff','fear':'#ff8000','angry':'#ff0000','surprise':'#ff00ff'}

>Waveplot
Plotting the amplitude envelope of a waveform.

>Spectrogram
A spectrogram is a visual representation of the spectrum of frequencies of a signal as it varies with time. When applied to an audio signal, spectrograms are sometimes called sonographs.


In [None]:
# Function to create waveplots  for the audio files
def wave_plot(data,sr,emotion,color):
    """Draw the waveform of one audio clip in a new 20x6 figure.

    Args:
        data: audio samples passed to librosa as ``y``.
        sr: sampling rate of ``data``.
        emotion: emotion name shown in the figure title.
        color: colour used for the waveform.
    """
    plt.figure(figsize=(20,6))
    plt.title(f'{emotion} emotion for waveplot',size=18)
    # A waveform shows amplitude against time, so the legend says so
    librosa.display.waveshow(y=data,sr=sr,color=color,label="Amplitude over time")
    plt.legend()

In [None]:
# Function to create spectogram for the audio files
def spectogram(data,sr,emotion):
    """Draw a dB-scaled spectrogram of one audio clip in a new 20x6 figure.

    Args:
        data: audio samples fed to the short-time Fourier transform.
        sr: sampling rate of ``data``.
        emotion: emotion name shown in the figure title.
    """
    magnitude=np.abs(librosa.stft(data))
    decibels=librosa.amplitude_to_db(magnitude)
    plt.figure(figsize=(20,6))
    plt.title(f'{emotion} emotion for spectogram',size=18)
    librosa.display.specshow(decibels,sr=sr,x_axis='time',y_axis='hz')

In [None]:
# Creating feature graphs for the audio files
audio_path=[]
for emotion in emotion_names:
    # Second file listed for this emotion is used as the sample
    emotion_files=main_df.loc[main_df['Emotion']==emotion,'File_Path']
    path=emotion_files.to_numpy()[1]
    data,sr=librosa.load(path)
    wave_plot(data,sr,emotion,colors[emotion])
    spectogram(data,sr,emotion)
    audio_path.append(path)

In [None]:
# Audio sample for disgust emotion
print('Disgust Audio Sample\n\n')
IPython.display.Audio(audio_path[0])

In [None]:
# Audio sample for happy emotion
print('Happy Audio Sample\n\n')
IPython.display.Audio(audio_path[1])

In [None]:
# Audio sample for sad emotion
print('Sad Audio Sample\n\n')
IPython.display.Audio(audio_path[2])

In [None]:
# Audio sample for neutral emotion
print('Neutral Audio Sample\n\n')
IPython.display.Audio(audio_path[3])

In [None]:
# Audio sample for fear emotion
print('Fear Audio Sample\n\n')
IPython.display.Audio(audio_path[4])

In [None]:
# Audio sample for angry emotion
print('Angry Audio Sample\n\n')
IPython.display.Audio(audio_path[5])

In [None]:
# Audio sample for surprise emotion
print('Surprise Audio Sample\n\n')
IPython.display.Audio(audio_path[6])

#### Manipulating Audio Data

In [None]:
# Audio file manipulation (augmentation helpers)
def add_noise(data,random=False,rate=0.035,threshold=0.075):
    """Return ``data`` with Gaussian noise added.

    The noise amplitude is ``rate`` times a uniform random factor times the
    maximum sample of ``data``.

    Args:
        data: 1-D array of audio samples.
        random: when true, ``rate`` is replaced by a random value in
            ``[0, threshold)``.
        rate: noise rate used when ``random`` is false.
        threshold: upper bound of the random rate.

    Returns:
        A new array with the same length as ``data``.
    """
    noise_rate=np.random.random()*threshold if random else rate
    amplitude=noise_rate*np.random.uniform()*np.amax(data)
    gaussian=np.random.normal(size=data.shape[0])
    return data+amplitude*gaussian

def shifting(data,rate=1000):
    """Return ``data`` circularly shifted by a random number of samples.

    The shift is a uniform draw from [-5, 5) multiplied by ``rate`` and
    truncated to an integer; samples rolled off one end reappear at the other.
    """
    offset=int(np.random.uniform(low=-5,high=5)*rate)
    return np.roll(data,offset)

def pitching(data,sr,pitch_factor=0.7,random=False):
    """Return ``data`` pitch-shifted by ``pitch_factor`` steps.

    Args:
        data: audio samples.
        sr: sampling rate of ``data``.
        pitch_factor: number of steps passed to librosa as ``n_steps``.
        random: when true, ``pitch_factor`` is scaled by a random value in
            ``[0, 1)`` first.
    """
    if random:
        pitch_factor=np.random.random() * pitch_factor
    # Keyword arguments work on every librosa release; newer ones reject
    # positional sr / n_steps.
    return librosa.effects.pitch_shift(data,sr=sr,n_steps=pitch_factor)

def streching(data,rate=0.8):
    """Return ``data`` time-stretched by ``rate`` using librosa."""
    # rate is passed by keyword; newer librosa releases reject it positionally
    return librosa.effects.time_stretch(data,rate=rate)

In [None]:
# Original Audio Sample
print('\t\t Original Audio\n')
plt.figure(figsize=(20,6))
# sr is passed by keyword: newer librosa releases make it keyword-only
librosa.display.waveshow(data,sr=sr,color='#8000ff',label="Amplitude over time")
plt.legend()
# data/sr hold the last clip loaded by the sample loop, i.e. the last path
# appended to audio_path, so play that entry rather than a fixed index.
IPython.display.Audio(audio_path[-1])

In [None]:
# Orginal Audio + Noise
print('\t\t Noise Audio\n')
noised_audio=add_noise(data)
plt.figure(figsize=(20,6))
# sr is passed by keyword: newer librosa releases make it keyword-only
librosa.display.waveshow(noised_audio,sr=sr,color='#8000ff',label="Amplitude over time")
plt.legend()
IPython.display.Audio(noised_audio,rate=sr)

In [None]:
# Orginal Audio + Streched
print('\t\t Streched Audio\n')
stretched_audio=streching(data)
plt.figure(figsize=(20,6))
# sr is passed by keyword: newer librosa releases make it keyword-only
librosa.display.waveshow(stretched_audio,sr=sr,color='#8000ff',label="Amplitude over time")
plt.legend()
IPython.display.Audio(stretched_audio,rate=sr)

In [None]:
# Orginal Audio + Shifted
print('\t\t Shifted Audio\n')
shifted_audio=shifting(data)
plt.figure(figsize=(20,6))
# sr is passed by keyword: newer librosa releases make it keyword-only
librosa.display.waveshow(shifted_audio,sr=sr,color='#8000ff',label="Amplitude over time")
plt.legend()
IPython.display.Audio(shifted_audio,rate=sr)

In [None]:
# Orginal Audio + Pitch
print('\t\t Pitched Audio\n')
pitched_audio=pitching(data,sr)
plt.figure(figsize=(20,6))
# sr is passed by keyword: newer librosa releases make it keyword-only
librosa.display.waveshow(pitched_audio,sr=sr,color='#8000ff',label="Amplitude over time")
plt.legend()
IPython.display.Audio(pitched_audio,rate=sr)

> Mel-Frequency Cepstral Coefficients (MFCCs)

Mel-Frequency Cepstral Coefficients is a representation of the short-term power spectrum of a sound, based on some transformation in a Mel-scale. It is commonly used in speech recognition as people’s voices are usually on a certain range of frequency and different from one to another. 

> Root-mean-square (RMS)

The root-mean-square here refers to the total magnitude of the signal, which in layman terms can be interpreted as the loudness or energy parameter of the audio file.



> Zero crossing rate (ZCR)

zero-crossing rate is the rate at which a signal changes from positive to zero to negative or from negative to zero to positive. Its value has been widely used in both speech recognition and music information retrieval, being a key feature to classify percussive sounds. Highly percussive sounds like rock, metal, emo, or punk music tend to have higher zero-crossing rate values.


> Tonnetz

Tonnetz (German for 'tone network') is a conceptual lattice diagram representing tonal space. Various visual representations of the Tonnetz can be used to show traditional harmonic relationships 

In [None]:
# Audio files features extraction
def zcr(data):
    """Return the zero-crossing rate of ``data`` with singleton axes removed.

    Frames are 2048 samples long with a hop of 512 samples.
    """
    crossing_rate=librosa.feature.zero_crossing_rate(data,frame_length=2048,hop_length=512)
    squeezed=np.squeeze(crossing_rate)
    return np.array(squeezed)
def rms(data):
    """Return the root-mean-square energy of ``data`` with singleton axes removed.

    Frames are 2048 samples long with a hop of 512 samples.
    """
    # The audio must be passed as y=...; newer librosa releases reject a
    # positional signal here.
    energy=librosa.feature.rms(y=data,frame_length=2048,hop_length=512)
    return np.array(np.squeeze(energy))
def tonnetz(data,sr):
    """Return the Tonnetz features of ``data`` flattened to one dimension."""
    # The audio must be passed as y=...; newer librosa releases reject a
    # positional signal here.
    tonal=librosa.feature.tonnetz(y=data,sr=sr)
    return np.array(np.ravel(tonal))
def mfcc(data,sr):
    """Return the MFCCs of ``data`` flattened to one dimension.

    The coefficient matrix is transposed before flattening, so the values of
    one frame stay adjacent in the output.
    """
    # The audio must be passed as y=...; newer librosa releases reject a
    # positional signal here.
    coefficients=librosa.feature.mfcc(y=data,sr=sr)
    return np.array(np.ravel(coefficients.T))


#function to get features from MFCC, ZCR, RMS and Tonnetz
def extract_features(data,sr):
    """Return one flat feature vector for an audio clip.

    The vector is the concatenation, in this order, of the MFCC, Tonnetz,
    RMS and zero-crossing-rate features of ``data``.
    """
    # The leading empty float array mirrors building the vector by appending
    # to an initially empty array.
    parts=(np.array([]),mfcc(data,sr),tonnetz(data,sr),rms(data),zcr(data))
    return np.hstack(parts)

# Function to extract features from individual audio path
def get_features(path,duration=2.5, offset=0.6):
    """Load part of an audio file and return its feature vector.

    Args:
        path: audio file to load.
        duration: passed to ``librosa.load`` as ``duration``.
        offset: passed to ``librosa.load`` as ``offset``.
    """
    signal,sample_rate=librosa.load(path,offset=offset,duration=duration)
    return extract_features(signal,sample_rate)

In [None]:
# Data preparation for testing and training
X,Y=[],[]
total_files=main_df.File_Path.shape[0]
for index,(path,emotion) in enumerate(zip(main_df.File_Path,main_df.Emotion)):
    X.append(get_features(path))
    Y.append(emotion)
    # Announce completion once the last file has been processed
    if index == total_files-1:
      print('Features from all the audio files are extracted')

In [None]:
# Features extracted file from all the datasets
processed_data_path= 'Processed_Data.csv'

In [None]:
# Converting X and Y arrays to CSV file and to be saved in specified path above
extract=pd.DataFrame(X)
extract['Emotion']=Y
extract.to_csv(processed_data_path,index=False)
extract.head(10)

In [None]:
# Converting features sile to dataframe
df=pd.read_csv(processed_data_path)
df.shape

In [None]:
# Contents of dataframe
df.head(5)

In [None]:
# Removing all the null values and replacing with 0
df=df.fillna(0)
print('Number of null Values in data set: ',df.isna().sum().sum())
df.shape

In [None]:
# Processed dataframe information
df.info()

In [None]:
# Creating X and Y variables for train and test split
X=df.drop(labels='Emotion',axis=1)
Y=df['Emotion']

In [None]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=1, stratify = Y,shuffle=True)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
# Accuracy of different models
models_accuracy_scores = []
model_names = ['SVC', 'SVC Tuned', 'MLPC', 'MLPC Tuned','KNN', 'KNN_Tuned','Decision Tree', 'Decision Tree Tuned', 'LR', 'LR Tuned', 'CNN', 'CNN Tuned']

In [None]:
# Importing all the trained models
# NOTE: joblib.load unpickles the file, so only load model files from a
# trusted source.
SVC_model = joblib.load(r'models/finalized_SVC_model.sav')
SVC_tuned_model = joblib.load(r'models/finalized_SVC_tuned_model.sav')

MLPC_model = joblib.load(r'models/finalized_MLPC_model.sav')
MLPC_tuned_model = joblib.load(r'models/finalized_tuned_MLPC_model.sav')

KNN_model = joblib.load(r'models/finalized_KNN_model.sav')
KNN_tuned_model = joblib.load(r'models/finalized_KNN_tuned_model.sav')

Decision_Tree_model = joblib.load(r'models/finalized_DTC_model.sav')
Decision_Tree_tuned_model = joblib.load(r'models/finalized_DTC_tuned_model.sav')

Logistic_Regression_model = joblib.load(r'models/finalized_LR_model.sav')
# Load the tuned LR model from its own file rather than the tuned Decision
# Tree file. NOTE(review): file name follows the SVC/KNN/DTC naming — confirm
# it exists under models/.
Logistic_Regression_tuned_model = joblib.load(r'models/finalized_LR_tuned_model.sav')

CNN_model = tf.keras.models.load_model(r'models/finalized_CNN_model.h5')
CNN_tuned_model = tf.keras.models.load_model(r'models/finalized_CNN_tuned_model.h5')

In [None]:
# Initializing the scaler: fit on the training split, then apply it to both splits
# NOTE(review): the scaler is fit here at evaluation time rather than loaded
# together with the saved models — confirm it reproduces the scaling the
# models were trained with.
scaler=StandardScaler()

# Scaling statistics are computed from X_train only
X_train=scaler.fit_transform(X_train)

# X_test is transformed with the statistics fitted on X_train
X_test=scaler.transform(X_test)

In [None]:
# Predictions from SVC and MLPC models
y_pred1 = SVC_model.predict(X_test)
y_pred2 = SVC_tuned_model.predict(X_test) 

y_pred3 = MLPC_model.predict(X_test)
y_pred4 = MLPC_tuned_model.predict(X_test)

y_pred5 = KNN_model.predict(X_test)
y_pred6 = KNN_tuned_model.predict(X_test)

y_pred7 = Decision_Tree_model.predict(X_test)
y_pred8 = Decision_Tree_tuned_model.predict(X_test)

y_pred9 = Logistic_Regression_model.predict(X_test)
y_pred10 = Logistic_Regression_tuned_model.predict(X_test)

In [None]:
# Classification report of SVC models
# classification_report lists classes in sorted label order, so the display
# names must be in that order too (emotion_names is in first-appearance order).
class_labels=sorted(emotion_names)
print('\t  SVC Model Classification Report\n\n',classification_report(y_test,y_pred1,labels=class_labels,target_names=class_labels))
print('----------------------------------------------------------------------')
print('\t SVC tuned Model Classification Report\n\n',classification_report(y_test,y_pred2,labels=class_labels,target_names=class_labels))
print('----------------------------------------------------------------------')

In [None]:
# Classification report of  MLPC models
# classification_report lists classes in sorted label order, so the display
# names must be in that order too (emotion_names is in first-appearance order).
class_labels=sorted(emotion_names)
print('\t  MLPC Model Classification Report\n\n',classification_report(y_test,y_pred3,labels=class_labels,target_names=class_labels))
print('----------------------------------------------------------------------')
print('\t MLPC  tuned Model Classification Report\n\n',classification_report(y_test,y_pred4,labels=class_labels,target_names=class_labels))
print('----------------------------------------------------------------------')

In [None]:
# Classification report of  KNN models
# classification_report lists classes in sorted label order, so the display
# names must be in that order too (emotion_names is in first-appearance order).
class_labels=sorted(emotion_names)
print('\t  KNN Model Classification Report\n\n',classification_report(y_test,y_pred5,labels=class_labels,target_names=class_labels))
print('----------------------------------------------------------------------')
print('\t KNN  tuned Model Classification Report\n\n',classification_report(y_test,y_pred6,labels=class_labels,target_names=class_labels))
print('----------------------------------------------------------------------')

In [None]:
# Classification report of  DTC models
# classification_report lists classes in sorted label order, so the display
# names must be in that order too (emotion_names is in first-appearance order).
class_labels=sorted(emotion_names)
print('\t  Decision Tree Model Classification Report\n\n',classification_report(y_test,y_pred7,labels=class_labels,target_names=class_labels))
print('----------------------------------------------------------------------')
print('\t Decision Tree  tuned Model Classification Report\n\n',classification_report(y_test,y_pred8,labels=class_labels,target_names=class_labels))
print('----------------------------------------------------------------------')

In [None]:
# Classification report of  LR models
# classification_report lists classes in sorted label order, so the display
# names must be in that order too (emotion_names is in first-appearance order).
class_labels=sorted(emotion_names)
print('\t  Logistic Regression Model Classification Report\n\n',classification_report(y_test,y_pred9,labels=class_labels,target_names=class_labels))
print('----------------------------------------------------------------------')
print('\t Logistic Regression  tuned Model Classification Report\n\n',classification_report(y_test,y_pred10,labels=class_labels,target_names=class_labels))
print('----------------------------------------------------------------------')

In [None]:
# Accuracy of SVC model
accuracy=accuracy_score(y_pred1,y_test)
models_accuracy_scores.append(accuracy)
print(f'Accuracy Score of SVC Model: {accuracy}')

In [None]:
# Accuracy of SVC tuned model
accuracy=accuracy_score(y_pred2,y_test)
models_accuracy_scores.append(accuracy)
print(f'Accuracy Score of SVC Tuned Model: {accuracy}')

In [None]:
# Accuracy of MLPC model
accuracy=accuracy_score(y_pred3,y_test)
models_accuracy_scores.append(accuracy)
print(f'Accuracy Score of MLPC Model: {accuracy}')

In [None]:
# Accuracy of MLPC tuned model
accuracy=accuracy_score(y_pred4,y_test)
models_accuracy_scores.append(accuracy)
print(f'Accuracy Score of MLPC Tuned Model: {accuracy}')

In [None]:
# Accuracy of KNN model
accuracy=accuracy_score(y_pred5,y_test)
models_accuracy_scores.append(accuracy)
print(f'Accuracy Score of KNN Model: {accuracy}')

In [None]:
# Accuracy of KNN tuned model
accuracy=accuracy_score(y_pred6,y_test)
models_accuracy_scores.append(accuracy)
print(f'Accuracy Score of KNN Tuned Model: {accuracy}')

In [None]:
# Accuracy of DTC model
accuracy=accuracy_score(y_pred7,y_test)
models_accuracy_scores.append(accuracy)
print(f'Accuracy Score of Decision Tree Model: {accuracy}')

In [None]:
# Accuracy of DTC Tuned model
accuracy=accuracy_score(y_pred8,y_test)
models_accuracy_scores.append(accuracy)
print(f'Accuracy Score of Decision Tree Tuned Model: {accuracy}')

In [None]:
# Accuracy of LR model
accuracy=accuracy_score(y_pred9,y_test)
models_accuracy_scores.append(accuracy)
print(f'Accuracy Score of LR Model: {accuracy}')

In [None]:
# Accuracy of LR Tuned model
accuracy=accuracy_score(y_pred10,y_test)
models_accuracy_scores.append(accuracy)
print(f'Accuracy Score of LR Tuned Model: {accuracy}')

In [None]:
# Confusion matrix of SVC
# confusion_matrix orders rows/columns by sorted label; name the axes from the
# same sorted list so each row and column carries the right emotion.
class_labels=sorted(emotion_names)
conf=confusion_matrix(y_test,y_pred1,labels=class_labels)
cm=pd.DataFrame(conf,index=class_labels,columns=class_labels)
plt.figure(figsize=(20,6))
ax=sns.heatmap(cm,annot=True,fmt='d')
ax.set_title(f'confusion matrix for SVC model ')
plt.show()

In [None]:
# Confusion matrix of SVC tuned model
# confusion_matrix orders rows/columns by sorted label; name the axes from the
# same sorted list so each row and column carries the right emotion.
class_labels=sorted(emotion_names)
conf=confusion_matrix(y_test,y_pred2,labels=class_labels)
cm=pd.DataFrame(conf,index=class_labels,columns=class_labels)
plt.figure(figsize=(20,6))
ax=sns.heatmap(cm,annot=True,fmt='d')
ax.set_title(f'confusion matrix for SVC tuned model ')
plt.show()

In [None]:
# Confusion matrix of MLPC model
# confusion_matrix orders rows/columns by sorted label; name the axes from the
# same sorted list so each row and column carries the right emotion.
class_labels=sorted(emotion_names)
conf=confusion_matrix(y_test,y_pred3,labels=class_labels)
cm=pd.DataFrame(conf,index=class_labels,columns=class_labels)
plt.figure(figsize=(20,6))
ax=sns.heatmap(cm,annot=True,fmt='d')
ax.set_title(f'confusion matrix for MLPC model ')
plt.show()

In [None]:
# Confusion matrix of MLPC tuned model
# confusion_matrix orders rows/columns by sorted label; name the axes from the
# same sorted list so each row and column carries the right emotion.
class_labels=sorted(emotion_names)
conf=confusion_matrix(y_test,y_pred4,labels=class_labels)
cm=pd.DataFrame(conf,index=class_labels,columns=class_labels)
plt.figure(figsize=(20,6))
ax=sns.heatmap(cm,annot=True,fmt='d')
# Title names the tuned model so it is distinguishable from the base MLPC plot
ax.set_title(f'confusion matrix for MLPC tuned model ')
plt.show()

In [None]:
# Confusion matrix of KNN model
# confusion_matrix orders rows/columns by sorted label; name the axes from the
# same sorted list so each row and column carries the right emotion.
class_labels=sorted(emotion_names)
conf=confusion_matrix(y_test,y_pred5,labels=class_labels)
cm=pd.DataFrame(conf,index=class_labels,columns=class_labels)
plt.figure(figsize=(20,6))
ax=sns.heatmap(cm,annot=True,fmt='d')
ax.set_title(f'confusion matrix for KNN model ')
plt.show()

In [None]:
# Confusion matrix of KNN Tuned model
# confusion_matrix orders rows/columns by sorted label; name the axes from the
# same sorted list so each row and column carries the right emotion.
class_labels=sorted(emotion_names)
conf=confusion_matrix(y_test,y_pred6,labels=class_labels)
cm=pd.DataFrame(conf,index=class_labels,columns=class_labels)
plt.figure(figsize=(20,6))
ax=sns.heatmap(cm,annot=True,fmt='d')
ax.set_title(f'confusion matrix for KNN Tuned model ')
plt.show()

In [None]:
# Confusion matrix of DTC model
# confusion_matrix orders rows/columns by sorted label; name the axes from the
# same sorted list so each row and column carries the right emotion.
class_labels=sorted(emotion_names)
conf=confusion_matrix(y_test,y_pred7,labels=class_labels)
cm=pd.DataFrame(conf,index=class_labels,columns=class_labels)
plt.figure(figsize=(20,6))
ax=sns.heatmap(cm,annot=True,fmt='d')
ax.set_title(f'confusion matrix for Decision Tree classifier model ')
plt.show()

In [None]:
# Confusion matrix of DTC Tuned model
# confusion_matrix orders rows/columns by sorted label; name the axes from the
# same sorted list so each row and column carries the right emotion.
class_labels=sorted(emotion_names)
conf=confusion_matrix(y_test,y_pred8,labels=class_labels)
cm=pd.DataFrame(conf,index=class_labels,columns=class_labels)
plt.figure(figsize=(20,6))
ax=sns.heatmap(cm,annot=True,fmt='d')
ax.set_title(f'confusion matrix for Decision Tree classifier Tuned  model ')
plt.show()

In [None]:
# Confusion matrix of LR model
# confusion_matrix orders rows/columns by sorted label; name the axes from the
# same sorted list so each row and column carries the right emotion.
class_labels=sorted(emotion_names)
conf=confusion_matrix(y_test,y_pred9,labels=class_labels)
cm=pd.DataFrame(conf,index=class_labels,columns=class_labels)
plt.figure(figsize=(20,6))
ax=sns.heatmap(cm,annot=True,fmt='d')
ax.set_title(f'confusion matrix for Logistic Regression model ')
plt.show()

In [None]:
# Confusion matrix of LR Tuned model
# confusion_matrix orders rows/columns by sorted label; name the axes from the
# same sorted list so each row and column carries the right emotion.
class_labels=sorted(emotion_names)
conf=confusion_matrix(y_test,y_pred10,labels=class_labels)
cm=pd.DataFrame(conf,index=class_labels,columns=class_labels)
plt.figure(figsize=(20,6))
ax=sns.heatmap(cm,annot=True,fmt='d')
ax.set_title(f'confusion matrix for Logistic Regression Tuned model ')
plt.show()

In [None]:
# Label encoder for emotion classes
lb=LabelEncoder()
Y1=np_utils.to_categorical(lb.fit_transform(Y))
print(lb.classes_)
Y1

In [None]:
# Splitting data for training and testing (one-hot targets Y1, stratified on the string labels Y)
# NOTE(review): test_size is 0.7 here while the split used for the classical
# models uses 0.4 — confirm this matches the split the CNN models were trained on.
X_train, X_test, y_train, y_test = train_test_split(X, Y1, test_size=0.7, random_state=1, stratify = Y,shuffle=True)
# Show the shapes of the four resulting arrays
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
# Scalar initalization for train and test data
scaler=StandardScaler()

X_train=scaler.fit_transform(X_train)

X_test=scaler.transform(X_test)

In [None]:
# Predictions from CNN model: index of the highest score in each output row
y_pred11=CNN_model.predict(X_test)
cnn_pred=np.argmax(y_pred11,axis=1)
cnn_pred

In [None]:
# predictions from CNN tuned model
# Index of the highest score in each output row is the predicted class
y_pred12=CNN_tuned_model.predict(X_test)
CNN_tuned_model_pred=np.argmax(y_pred12,axis=1)
CNN_tuned_model_pred

In [None]:
# Preparing y_check from y_test for comparision purpose
y_check=np.argmax(y_test,axis=1)
y_check

In [None]:
# Classification report of CNN model
# Class indices come from the LabelEncoder, so index i means lb.classes_[i];
# emotion_names is in first-appearance order and would mislabel the rows.
print('CNN Model Classification Report: \n\n',classification_report(y_check,cnn_pred,labels=np.arange(len(lb.classes_)),target_names=lb.classes_))

In [None]:
# Classification report of CNN tuned model
# Class indices come from the LabelEncoder, so index i means lb.classes_[i];
# emotion_names is in first-appearance order and would mislabel the rows.
print('CNN Tuned Model Classification Report:\n\n',classification_report(y_check,CNN_tuned_model_pred,labels=np.arange(len(lb.classes_)),target_names=lb.classes_))

In [None]:
# Importing training history of the CNN (df1) and CNN tuned (df2) models
# NOTE(review): history1.csv is read from an absolute path (leading '/') while
# history2.csv is read from a relative path to a same-named folder — confirm
# which location is intended.
df1 = pd.read_csv(r'/CNN Models Training History/history1.csv')
df2 = pd.read_csv(r'CNN Models Training History/history2.csv')

In [None]:
# Training accuracy for CNN model
# Plotly draws its own figure; no matplotlib figure is opened here because it
# would only render as an empty extra plot.
fig=px.line(df1, y=['accuracy','val_accuracy'],
           labels={'index':'epoch','value':'accuracy'},
           title=f'Trarining and Validation Accuracy Chart for CNN model')
fig.show()

In [None]:
# Training loss for CNN model
fig=px.line(df1, y=['loss','val_loss'],
           labels={'index':'epoch','value':'loss'},
           title=f'Training and Validation Loss Chart for CNN model')
fig.show()

In [None]:
# Training accuracy for CNN tuned model
# Plotly draws its own figure; no matplotlib figure is opened here because it
# would only render as an empty extra plot.
fig=px.line(df2, y=['accuracy','val_accuracy'],
           labels={'index':'epoch','value':'accuracy'},
           title=f'Trarining and Validation Accuracy Chart of CNN tuned model')
fig.show()

In [None]:
# Training loss for CNN tuned model
# df2 holds the tuned model's history (df1 is the base CNN model)
fig=px.line(df2, y=['loss','val_loss'],
           labels={'index':'epoch','value':'loss'},
           title=f'Training and Validation Loss Chart of CNN tuned model')
fig.show()

In [None]:
#Accuracy score for CNN model
accuracy=accuracy_score(cnn_pred,y_check)
models_accuracy_scores.append(accuracy)
print(f'Accuracy Score of CNN  Model: {accuracy}')

In [None]:
# Accuracy score for CNN tuned model
accuracy=accuracy_score(CNN_tuned_model_pred,y_check)
models_accuracy_scores.append(accuracy)
print(f'Accuracy Score of CNN Tuned Model: {accuracy}')

In [None]:
# Confusion matrix of CNN model
# y_check / cnn_pred hold LabelEncoder indices, so row/column i is
# lb.classes_[i]; emotion_names is in first-appearance order and would
# mislabel the axes.
class_labels=list(lb.classes_)
conf=confusion_matrix(y_check,cnn_pred,labels=np.arange(len(class_labels)))
cm=pd.DataFrame(conf,index=class_labels,columns=class_labels)
plt.figure(figsize=(20,6))
ax=sns.heatmap(cm,annot=True,fmt='d')
ax.set_title(f'confusion matrix for CNN model ')
plt.show()

In [None]:
# Confusion matrix of CNN tuned model
# y_check / CNN_tuned_model_pred hold LabelEncoder indices, so row/column i is
# lb.classes_[i]; emotion_names is in first-appearance order and would
# mislabel the axes.
class_labels=list(lb.classes_)
conf=confusion_matrix(y_check,CNN_tuned_model_pred,labels=np.arange(len(class_labels)))
cm=pd.DataFrame(conf,index=class_labels,columns=class_labels)
plt.figure(figsize=(20,6))
ax=sns.heatmap(cm,annot=True,fmt='d')
ax.set_title(f'confusion matrix for CNN Tuned model ')
plt.show()

In [None]:
# Models vs accuracy chart
def addlabels(x,y):
    """Write each value of ``y``, rounded to 3 decimals, slightly above its bar.

    Args:
        x: bar categories; only their count and position are used.
        y: bar heights, indexed in step with ``x``.
    """
    for position,_ in enumerate(x):
        height=y[position]
        plt.text(position, height+0.02, round(height,3), ha = 'center')

if __name__ == '__main__':
	plt.figure(figsize = (20,6))
	plt.bar(model_names, models_accuracy_scores)
	addlabels(model_names, models_accuracy_scores)
	plt.title("Models Vs Accuracy")
	plt.xlabel("Models")
	plt.ylabel("Accuracy Scores")
	plt.show()


Conclusion:

From the above analysis we can draw the following statements.
1. Detecting emotion from the audio signal rather than from the spoken words alone is especially useful when the same spoken content is expressed with different emotions.
2. Audio features mainly depend on tone, rhythm, pitch, frequency, amplitude, speed of sound, etc.
3. Advanced audio features such as MFCC, RMS, ZCR and Tonnetz provide information about the audio that is unique and helps the machine learning models take advantage of the audio file to predict an emotion.
4. Different audio manipulation techniques can also be used for model learning; they can make the audio clearer or louder and alter time parameters through shifting, stretching, etc.
5. The machine learning models used for this project were built in two versions: base models and parameter-tuned models. Base models outperformed tuned models in some cases, while tuned models performed strongly among the neural network models.
6. Based on the performance analysis of the models, models with an accuracy score greater than 80% are recommended for predictions in real-world scenarios.