In [1]:
import librosa
import os
import pandas as pd
import glob
import numpy as np
import soundfile
from sklearn.model_selection import train_test_split
import sys
import pickle

In [2]:
import warnings
# Install a global "ignore" filter for every warning category, but only when
# the interpreter was started without explicit -W options (sys.warnoptions is
# empty in that case), so a user-supplied -W setting is not overridden.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
# Regardless of -W options, additionally suppress DeprecationWarning.
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [3]:
def extract_feature(file_name,mfcc,chroma,mel,zcr,rmse):
    """Build a 1-D feature vector for a single audio file.

    Each enabled feature is computed frame-by-frame with librosa and then
    averaged over time, so every file yields a fixed-length vector
    regardless of its duration. Enabled features are concatenated in the
    fixed order: MFCC, chroma, mel spectrogram, zero-crossing rate, RMS.

    Parameters
    ----------
    file_name : str or path-like
        Audio file readable by ``soundfile.SoundFile``.
    mfcc : bool
        Include the time-averaged MFCCs (40 coefficients).
    chroma : bool
        Include the time-averaged chromagram computed from the magnitude STFT.
    mel : bool
        Include the time-averaged mel spectrogram.
    zcr : bool
        Include the time-averaged zero-crossing rate.
    rmse : bool
        Include the time-averaged RMS energy computed from the magnitude STFT.

    Returns
    -------
    numpy.ndarray
        1-D array holding the concatenated features; empty when every flag
        is false.
    """
    with soundfile.SoundFile(file_name) as sound_file:
        signal = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate
    # NOTE(review): the librosa calls below are given the array exactly as
    # soundfile returns it; this presumably assumes mono recordings - confirm
    # before feeding multi-channel files.

    # The magnitude STFT is only consumed by the chroma and RMS features, so
    # skip the transform entirely when neither is requested.
    stft = np.abs(librosa.stft(signal)) if (chroma or rmse) else None

    # Start from an empty float array so the concatenated result has the same
    # dtype and shape as repeatedly hstack-ing onto ``np.array([])``.
    parts = [np.array([])]
    if mfcc:
        mfcc_mean = np.mean(
            librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40).T, axis=0
        )
        parts.append(mfcc_mean)
    if chroma:
        chroma_mean = np.mean(
            librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0
        )
        parts.append(chroma_mean)
    if mel:
        # The signal must be passed as ``y=``: librosa >= 0.10 made the
        # feature-function arguments keyword-only, so the positional form
        # raises TypeError there.
        mel_mean = np.mean(
            librosa.feature.melspectrogram(y=signal, sr=sample_rate).T, axis=0
        )
        parts.append(mel_mean)
    if zcr:
        zcr_mean = np.mean(
            librosa.feature.zero_crossing_rate(
                y=signal, frame_length=2048, hop_length=512
            ).T,
            axis=0,
        )
        parts.append(zcr_mean)
    if rmse:
        rms_mean = np.mean(
            librosa.feature.rms(
                S=stft, frame_length=2048, hop_length=512, pad_mode='reflect'
            ).T,
            axis=0,
        )
        parts.append(rms_mean)
    return np.hstack(parts)

In [4]:
# Load the data and extract features for each sound file

# Maps the third "_"-separated token of a file name to its emotion label.
_EMOTION_BY_SUFFIX = {
    'angry.wav': 'angry',
    'disgust.wav': 'disgust',
    'fear.wav': 'fear',
    'happy.wav': 'happy',
    'neutral.wav': 'neutral',
    'ps.wav': 'pleasant surprised',
    'sad.wav': 'sad',
}


def _emotion_from_filename(file_name):
    """Return the emotion label encoded in ``file_name``, or None if absent."""
    tokens = file_name.split('_')
    if len(tokens) < 3:
        return None
    return _EMOTION_BY_SUFFIX.get(tokens[2])


def load_data(test_size=0.5,
              data_glob="C:\\Users\\ACER\\Documents\\Jupyter Notebook\\TESS Toronto emotional speech set data/*//*",
              random_state=None):
    """Extract features for every labelled audio file and split train/test.

    Parameters
    ----------
    test_size : float or int
        Forwarded to ``train_test_split``.
    data_glob : str
        Glob pattern selecting the audio files. Defaults to the location the
        notebook was originally written against; override it to run on a
        different machine.
    random_state : int or None
        Forwarded to ``train_test_split``; pass an int for a reproducible
        split. ``None`` keeps the unseeded behaviour.

    Returns
    -------
    list
        ``[x_train, x_test, y_train, y_test]`` as returned by
        ``train_test_split``.

    Raises
    ------
    ValueError
        If no file matching ``data_glob`` carries a recognised emotion label.

    Notes
    -----
    Paths whose name does not encode one of the known emotions are skipped.
    Previously such a path either raised NameError/IndexError or was
    silently given the label of the preceding file.
    """
    features, labels = [], []
    for path in glob.glob(data_glob):
        emotion = _emotion_from_filename(os.path.basename(path))
        if emotion is None:
            # Not a recognised sample (unexpected name, stray file, directory).
            continue
        features.append(
            extract_feature(path, mfcc=True, chroma=True, mel=True, zcr=True, rmse=True)
        )
        labels.append(emotion)

    if not features:
        raise ValueError(
            "No labelled audio files found for pattern: {!r}".format(data_glob)
        )

    return train_test_split(
        np.array(features), labels, test_size=test_size, random_state=random_state
    )

In [5]:
# Extract features for the whole dataset and hold out 20% of it for testing.
x_train,x_test,y_train,y_test=load_data(test_size=0.2)

In [6]:
# Size of the second axis of the training matrix, i.e. the length of each
# feature vector produced by extract_feature.
x_train.shape[1]

182

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [8]:
import time

# Train an MLP with one hidden layer of 500 units and report how long the fit
# takes. perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time, unlike time.time(), which follows the wall clock
# and can jump if the system time is adjusted.
start = time.perf_counter()
# random_state fixes weight initialisation and batch shuffling so the model is
# reproducible for a given training set. The train/test split is seeded
# separately (train_test_split's random_state).
model=MLPClassifier(alpha=0.0001, batch_size=64, epsilon=1e-08, hidden_layer_sizes=(500,), learning_rate='constant',activation='relu',solver='adam',max_iter=600,random_state=42)
model.fit(x_train,y_train)
end = time.perf_counter()
train_time_total=end-start
print(train_time_total)

6.566116571426392


# Accuracy of our Model

In [10]:
# Predict labels for the held-out set and report the fraction that match
# y_test, formatted as a percentage.
y_pred=model.predict(x_test)
accuracy=accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 100.00%


# Classification Report

In [11]:
from sklearn.metrics import classification_report
# Per-class precision/recall/F1 for the held-out predictions made above.
print(classification_report(y_test,y_pred))

                    precision    recall  f1-score   support

             angry       1.00      1.00      1.00        90
           disgust       1.00      1.00      1.00        97
              fear       1.00      1.00      1.00        67
             happy       1.00      1.00      1.00        77
           neutral       1.00      1.00      1.00        81
pleasant surprised       1.00      1.00      1.00        67
               sad       1.00      1.00      1.00        81

          accuracy                           1.00       560
         macro avg       1.00      1.00      1.00       560
      weighted avg       1.00      1.00      1.00       560



# Save and reload the trained model with pickle


In [12]:
filename='Speech_Emotion_Recognition_Model'
# Write through a context manager so the file is flushed and closed as soon as
# the dump finishes, instead of leaving the handle open until garbage
# collection.
with open(filename,'wb') as model_file:
    pickle.dump(model,model_file)

In [13]:
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# model files that you created yourself or otherwise trust.
# The context manager closes the file handle once the model has been read.
with open(filename,'rb') as model_file:
    loaded_model=pickle.load(model_file)
loaded_model.predict(x_test)

array(['fear', 'pleasant surprised', 'neutral', 'angry', 'angry', 'sad',
       'fear', 'sad', 'fear', 'disgust', 'sad', 'angry', 'neutral',
       'angry', 'pleasant surprised', 'sad', 'neutral', 'sad', 'fear',
       'fear', 'disgust', 'disgust', 'neutral', 'sad',
       'pleasant surprised', 'pleasant surprised', 'neutral', 'angry',
       'fear', 'disgust', 'happy', 'pleasant surprised', 'disgust',
       'disgust', 'disgust', 'disgust', 'sad', 'angry', 'angry',
       'neutral', 'neutral', 'sad', 'fear', 'disgust',
       'pleasant surprised', 'pleasant surprised', 'fear', 'angry',
       'angry', 'pleasant surprised', 'neutral', 'disgust', 'sad',
       'neutral', 'pleasant surprised', 'angry', 'fear', 'angry', 'angry',
       'happy', 'neutral', 'neutral', 'neutral', 'neutral', 'disgust',
       'angry', 'pleasant surprised', 'happy', 'fear', 'angry', 'happy',
       'disgust', 'fear', 'angry', 'disgust', 'disgust', 'angry',
       'disgust', 'fear', 'neutral', 'sad', 'angry', '

In [14]:
# Score the reloaded model on its own predictions. The earlier version threw
# the prediction away and recomputed accuracy from the original model's
# y_pred, so it never actually verified the pickled model.
y_pred_loaded=loaded_model.predict(x_test)
accuracy=accuracy_score(y_true=y_test, y_pred=y_pred_loaded)
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 100.00%


In [15]:
# Display the held-out feature matrix (one row per test sample).
x_test

array([[-3.19655975e+02,  3.79566574e+01, -3.86134410e+00, ...,
         1.50374454e-02,  1.92977514e-01,  2.85487261e-02],
       [-4.12647400e+02,  6.03137703e+01,  1.36190691e+01, ...,
         4.91076789e-04,  1.44943576e-01,  1.31565809e-02],
       [-5.38866333e+02,  6.60644989e+01,  1.78690186e+01, ...,
         4.58595205e-05,  8.71294807e-02,  5.49675022e-03],
       ...,
       [-4.73261078e+02,  8.79019241e+01, -3.11367059e+00, ...,
         4.23563106e-05,  8.44726562e-02,  9.24000418e-03],
       [-2.74370728e+02,  2.96860600e+01, -1.39437103e+01, ...,
         4.53519486e-02,  2.34846248e-01,  4.27885603e-02],
       [-5.36719727e+02,  8.12751083e+01,  3.78236351e+01, ...,
         1.32717169e-05,  8.12928735e-02,  8.21741321e-03]])