In [24]:
pip install librosa soundfile numpy sklearn pyaudio

Note: you may need to restart the kernel to use updated packages.


In [25]:
pip install soundfile

Note: you may need to restart the kernel to use updated packages.


In [26]:
import librosa
import soundfile
import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [27]:
def extract_feature(file_name, mfcc, chroma, mel):
    """Build one flat feature vector for a single audio file.

    The file is decoded with ``librosa.load`` (``res_type='kaiser_fast'``).
    Each enabled feature family is computed by librosa, transposed, and
    averaged over axis 0 so that it contributes a 1-D vector; the vectors are
    concatenated in the fixed order MFCC, chroma, mel.

    Parameters
    ----------
    file_name : path to the audio file to load.
    mfcc : bool
        Include the mean of 40 MFCC coefficients.
    chroma : bool
        Include the mean chromagram computed from the magnitude STFT.
    mel : bool
        Include the mean mel spectrogram.

    Returns
    -------
    numpy.ndarray
        1-D array with the enabled features concatenated; an empty array when
        all three flags are false.
    """
    signal, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

    # Seed the stack with an empty default-dtype array: this keeps the result
    # dtype (and the shape-(0,) result when nothing is enabled) the same as an
    # accumulator that starts from np.array([]).
    pieces = [np.array([])]

    if mfcc:
        mfcc_frames = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40)
        pieces.append(np.mean(mfcc_frames.T, axis=0))

    if chroma:
        # The STFT magnitude is only needed for the chroma features.
        magnitude = np.abs(librosa.stft(signal))
        chroma_frames = librosa.feature.chroma_stft(S=magnitude, sr=sample_rate)
        pieces.append(np.mean(chroma_frames.T, axis=0))

    if mel:
        # The audio must be passed as `y=`: recent librosa releases make it
        # keyword-only and raise TypeError for a positional argument.
        mel_frames = librosa.feature.melspectrogram(y=signal, sr=sample_rate)
        pieces.append(np.mean(mel_frames.T, axis=0))

    return np.hstack(pieces)

In [28]:
# Emotion labels in the order of their two-digit codes ('01' .. '08').
_EMOTION_LABELS = (
    'neutral',
    'calm',
    'happy',
    'sad',
    'angry',
    'fearful',
    'disgust',
    'surprised',
)

# Two-digit code -> emotion label; load_data looks codes up here using the
# third dash-separated field of each file name.
emotions = {
    f'{code:02d}': label
    for code, label in enumerate(_EMOTION_LABELS, start=1)
}

# Emotions to observe (currently all of them; trim this list to keep a subset)
observed_emotions = list(_EMOTION_LABELS)


In [30]:
def load_data(test_size=0.2, data_glob="C:/Users/gundr/3D Objects/Speech_Emotion_Detection-master/Speech_Emotion_Detection-master/speech-emotion-recognition-ravdess-data/Actor_*/*.wav"):
    """Extract features for every matching recording and split them.

    Each file matched by ``data_glob`` is labelled with
    ``emotions[<third dash-separated field of its base name>]``. Files whose
    emotion is not in ``observed_emotions`` are skipped. The remaining files
    are turned into feature vectors with ``extract_feature`` (MFCC, chroma and
    mel all enabled).

    Parameters
    ----------
    test_size : float
        Passed to ``train_test_split``; the training share is its complement.
    data_glob : str
        Glob pattern selecting the ``.wav`` files. Defaults to the dataset
        location this notebook was written against; override it to run
        elsewhere.

    Returns
    -------
    list
        The ``train_test_split`` result, unpacked by the caller as
        ``x_train, x_test, y_train, y_test``.

    Raises
    ------
    ValueError
        If no usable recording is found for ``data_glob``.
    KeyError
        If a file name carries an emotion code missing from ``emotions``.
    """
    features, labels = [], []
    # glob returns matches in arbitrary order; sorting keeps the seeded split
    # reproducible across machines and filesystems.
    for file in sorted(glob.glob(data_glob)):
        file_name = os.path.basename(file)
        emotion = emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        features.append(extract_feature(file, mfcc=True, chroma=True, mel=True))
        labels.append(emotion)

    if not features:
        # Fail with an actionable message instead of an opaque error from the splitter.
        raise ValueError(f"No usable audio files matched {data_glob!r}")

    # Only test_size is passed: a fixed train_size=0.80 next to it made every
    # test_size above 0.2 fail because the two fractions summed to more than 1.
    return train_test_split(np.array(features), labels, test_size=test_size, random_state=9)

In [31]:
import time

# Extract features for the whole dataset and keep 20% of the samples for testing.
x_train, x_test, y_train, y_test = load_data(test_size=0.20)

In [32]:
# Sample counts as a (train, test) tuple.
print((len(x_train), len(x_test)))

(1151, 288)


In [33]:
# Length of each feature vector (second dimension of the training matrix).
print('Features extracted:', x_train.shape[1])

Features extracted: 180


In [35]:
# Multi-layer perceptron with a single hidden layer of 300 units; every
# hyperparameter not listed here keeps its scikit-learn default.
# NOTE(review): scikit-learn documents `learning_rate` as only used with
# solver='sgd'; no solver is set here, so confirm it has any effect.
# NOTE(review): no random_state is set, so scores will vary between runs.
model = MLPClassifier(
    hidden_layer_sizes=(300,),
    alpha=0.01,
    batch_size=256,
    epsilon=1e-08,
    learning_rate='adaptive',
    max_iter=500,
)

In [36]:
# Train on the training split; the fitted estimator is echoed as the cell output.
model.fit(x_train, y_train)

MLPClassifier(activation='relu', alpha=0.01, batch_size=256, beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(300,), learning_rate='adaptive',
              learning_rate_init=0.001, max_iter=500, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [38]:
# Show the trained classifier and its hyperparameters.
# This cell used to hold a pasted estimator repr, which built and discarded a
# brand-new untrained MLPClassifier; echoing `model` displays the same
# settings without creating a throwaway object.
model

MLPClassifier(activation='relu', alpha=0.01, batch_size=256, beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(300,), learning_rate='adaptive',
              learning_rate_init=0.001, max_iter=500, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [39]:
# Predict an emotion label for every sample of the test split.
y_pred = model.predict(x_test)

In [40]:
# Share of test samples whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)

# Report it as a percentage with two decimals.
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 47.22%


In [41]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support on the test split.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

       angry       0.87      0.49      0.62        41
        calm       0.48      0.45      0.47        31
     disgust       0.33      0.52      0.41        31
     fearful       0.46      0.64      0.53        36
       happy       0.68      0.48      0.57        54
     neutral       0.27      0.40      0.32        25
         sad       0.35      0.51      0.41        35
   surprised       0.82      0.26      0.39        35

    accuracy                           0.47       288
   macro avg       0.53      0.47      0.47       288
weighted avg       0.56      0.47      0.48       288



In [42]:
from sklearn.metrics import confusion_matrix

# Counts of true versus predicted labels on the test split.
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[20  0  7  1  4  3  5  1]
 [ 0 14  4  3  0  7  3  0]
 [ 0  2 16  4  1  3  5  0]
 [ 1  1  4 23  2  1  4  0]
 [ 1  2  7  9 26  5  3  1]
 [ 0  5  1  0  0 10  9  0]
 [ 1  3  0  8  1  4 18  0]
 [ 0  2  9  2  4  4  5  9]]


In [43]:
import pickle

# Persist the trained model in the current working directory so it can later
# be applied to recordings that are not part of the dataset.
Pkl_Filename = "Emotion_Recognition_Through_Speech.pkl"

with open(Pkl_Filename, 'wb') as file:
    pickle.dump(model, file)

In [44]:
# Read the saved model back from disk.
# Only unpickle files you wrote yourself: pickle.load can run arbitrary code
# embedded in an untrusted file.
with open(Pkl_Filename, 'rb') as file:
    Emotion_Recognition_Through_Speech = pickle.load(file)

# Echo the restored estimator as the cell output.
Emotion_Recognition_Through_Speech

MLPClassifier(activation='relu', alpha=0.01, batch_size=256, beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(300,), learning_rate='adaptive',
              learning_rate_init=0.001, max_iter=500, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [45]:
# Run the reloaded model on the test split and display its predicted labels.
A = Emotion_Recognition_Through_Speech.predict(x_test)
A

array(['disgust', 'sad', 'neutral', 'sad', 'neutral', 'angry', 'calm',
       'happy', 'fearful', 'disgust', 'angry', 'happy', 'sad', 'fearful',
       'happy', 'neutral', 'sad', 'disgust', 'fearful', 'neutral',
       'happy', 'fearful', 'happy', 'fearful', 'fearful', 'neutral',
       'sad', 'sad', 'sad', 'neutral', 'sad', 'fearful', 'happy',
       'fearful', 'sad', 'neutral', 'calm', 'disgust', 'sad', 'fearful',
       'happy', 'fearful', 'fearful', 'disgust', 'happy', 'calm', 'happy',
       'disgust', 'happy', 'neutral', 'fearful', 'disgust', 'fearful',
       'fearful', 'surprised', 'neutral', 'angry', 'disgust', 'happy',
       'sad', 'disgust', 'sad', 'neutral', 'fearful', 'fearful', 'happy',
       'disgust', 'sad', 'fearful', 'neutral', 'neutral', 'fearful',
       'disgust', 'surprised', 'sad', 'sad', 'neutral', 'disgust', 'calm',
       'happy', 'disgust', 'sad', 'disgust', 'happy', 'fearful', 'calm',
       'fearful', 'fearful', 'neutral', 'calm', 'sad', 'fearful', 'calm'

In [46]:
# Classify one recording with the reloaded model.
# NOTE(review): this path points into an "Audio_Song" folder, while load_data
# reads from a speech dataset folder - confirm the mismatch is intended.
new_feature = extract_feature(
    "C:/Users/gundr/3D Objects/Audio_Song_Actors_01-24/Actor_01/03-02-01-01-01-01-01.wav",
    mfcc=True,
    chroma=True,
    mel=True,
)
new_feature.shape  # not the cell's last expression, so this value is not displayed
# predict receives a list holding the single feature vector.
Emotion_Recognition_Through_Speech.predict([new_feature])

array(['fearful'], dtype='<U9')

In [47]:
# Same single-recording prediction as the previous cell, with the path held in
# a variable so another file can be substituted.
file = 'C:/Users/gundr/3D Objects/Audio_Song_Actors_01-24/Actor_01/03-02-01-01-01-01-01.wav'

new_feature = extract_feature(
    file,
    mfcc=True,
    chroma=True,
    mel=True,
)

# predict receives a list holding the single feature vector.
Emotion_Recognition_Through_Speech.predict([new_feature])

array(['fearful'], dtype='<U9')