In [19]:
import soundfile
import numpy as np
import librosa
import glob
import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix

In [3]:
def extract_feature(file_name, **kwargs):
    mfcc = kwargs.get("mfcc")
    chroma = kwargs.get("chroma")
    mel = kwargs.get("mel")
    contrast = kwargs.get("contrast")
    tonnetz = kwargs.get("tonnetz")
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate
        if chroma or contrast:
            stft = np.abs(librosa.stft(X))
        result = np.array([])
        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))
        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
            result = np.hstack((result, chroma))
        if mel:
            mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
            result = np.hstack((result, mel))
        if contrast:
            contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
            result = np.hstack((result, contrast))
        if tonnetz:
            tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
            result = np.hstack((result, tonnetz))
    return result

In [17]:
emotion = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}

AVAILABLE_EMOTIONS = {
    "angry",
    "sad",
    "neutral",
    "happy"
}

In [18]:
def load_data(test_size=0.2):
    X, y = [], []
    for file in glob.glob("Dataset\\ravdess data\\Actor_*/*.wav"):
        basename = os.path.basename(file)
        emotion = emotion[basename.split("-")[2]]
        if emotion not in AVAILABLE_EMOTIONS:
            continue
        features = extract_feature(file, mfcc=True, chroma=True, mel=True)
        X.append(features)
        y.append(emotion)
    return train_test_split(np.array(X), y, test_size=test_size, random_state=7)

In [6]:
X_train, X_test, y_train, y_test = load_data(test_size=0.25)

In [16]:
print("Number of training samples:", X_train.shape[0])
print("Number of testing samples:", X_test.shape[0])
print("Number of features:", X_train.shape[1])


Number of training samples: 504
Number of testing samples: 168
Number of features: 180


In [8]:
model=MLPClassifier(alpha=0.01, batch_size=256, 
                    epsilon=1e-08, hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=500)

In [10]:
model.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.01, batch_size=256, beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(300,), learning_rate='adaptive',
              learning_rate_init=0.001, max_iter=500, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [11]:
y_pred = model.predict(X_test)

print("Test Data: ",X_test[:10] , "\n")
print("Actual Emotion:    ",y_test[:10])
print("Predicted Emotion: ",y_pred[:10])

Test Data:  [[-4.84017059e+02  3.35188069e+01 -4.27565667e+00 ...  3.55753631e-03
   1.79984688e-03  1.16297302e-03]
 [-4.35545203e+02  3.20185715e+01  5.38026750e-01 ...  1.10966379e-03
   1.26860222e-03  5.66523828e-04]
 [-5.25702263e+02  4.09430383e+01 -2.43137723e+01 ...  4.92032607e-05
   2.16024488e-05  1.82958245e-05]
 ...
 [-3.86171040e+02 -7.10082870e+00 -2.88772834e+01 ...  3.01358549e-03
   1.72012487e-03  1.07116255e-03]
 [-5.54994849e+02  4.29431429e+01 -4.86395794e+00 ...  1.23255110e-04
   4.18910600e-05  3.71569372e-05]
 [-4.61271776e+02  2.29928488e+01 -3.58219153e+01 ...  3.00771324e-04
   1.76716730e-04  1.15658879e-04]] 

Actual Emotion:     ['happy', 'angry', 'happy', 'sad', 'angry', 'angry', 'happy', 'angry', 'angry', 'happy']
Predicted Emotion:  ['happy' 'happy' 'happy' 'neutral' 'happy' 'angry' 'happy' 'angry' 'sad'
 'happy']


In [12]:
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy of the model is: {:.2f}%".format(accuracy*100))

Accuracy of the model is: 73.21%


In [15]:
confusion_mat = confusion_matrix(y_test, y_pred)
confusion_mat

array([[49,  8,  2,  2],
       [ 3, 29,  3,  6],
       [ 0,  3,  9,  9],
       [ 1,  5,  3, 36]], dtype=int64)

In [14]:
# if not os.path.isdir("result"):
#     os.mkdir("result")

# pickle.dump(model, open("result/mlp_classifier.model", "wb"))

In [20]:
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

       angry       0.92      0.80      0.86        61
       happy       0.64      0.71      0.67        41
     neutral       0.53      0.43      0.47        21
         sad       0.68      0.80      0.73        45

    accuracy                           0.73       168
   macro avg       0.69      0.68      0.69       168
weighted avg       0.74      0.73      0.73       168

