In [45]:
import soundfile
import numpy as np
import librosa
import glob
import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [46]:
def extract_feature(file_name, **kwargs):
    mfcc = kwargs.get("mfcc")
    chroma = kwargs.get("chroma")
    mel = kwargs.get("mel")
    contrast = kwargs.get("contrast")
    tonnetz = kwargs.get("tonnetz")
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate
        if chroma or contrast:
            stft = np.abs(librosa.stft(X))
        result = np.array([])
        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))
        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
            result = np.hstack((result, chroma))
        if mel:
            mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
            result = np.hstack((result, mel))
        if contrast:
            contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
            result = np.hstack((result, contrast))
        if tonnetz:
            tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
            result = np.hstack((result, tonnetz))
    return result

In [47]:
int2emotion = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}

AVAILABLE_EMOTIONS = {
    "angry",
    "sad",
    "neutral",
    "happy"
}

In [48]:
def load_data(test_size=0.2):
    X, y = [], []
    for file in glob.glob("D:\\DataFlair\\ravdess data\\Actor_*/*.wav"):
        basename = os.path.basename(file)
        emotion = int2emotion[basename.split("-")[2]]
        if emotion not in AVAILABLE_EMOTIONS:
            continue
        features = extract_feature(file, mfcc=True, chroma=True, mel=True)
        X.append(features)
        y.append(emotion)
    return train_test_split(np.array(X), y, test_size=test_size, random_state=7)

In [49]:
X_train, X_test, y_train, y_test = load_data(test_size=0.25)

In [50]:
print("[+] Number of training samples:", X_train.shape[0])
print("[+] Number of testing samples:", X_test.shape[0])
print("[+] Number of features:", X_train.shape[1])


[+] Number of training samples: 504
[+] Number of testing samples: 168
[+] Number of features: 180


In [51]:
model_params = {
    'alpha': 0.01,
    'batch_size': 256,
    'epsilon': 1e-08, 
    'hidden_layer_sizes': (300,), 
    'learning_rate': 'adaptive', 
    'max_iter': 500, 
}

In [52]:
model = MLPClassifier(**model_params)

In [53]:
print("[*] Training the model...")
model.fit(X_train, y_train)

[*] Training the model...




MLPClassifier(activation='relu', alpha=0.01, batch_size=256, beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(300,), learning_rate='adaptive',
              learning_rate_init=0.001, max_iter=500, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [54]:
y_pred = model.predict(X_test)
print("Text Data: ",X_test)
print("Output Data: ",y_pred)

Text Data:  [[-4.84017059e+02  3.35188069e+01 -4.27565667e+00 ...  3.55753631e-03
   1.79984688e-03  1.16297302e-03]
 [-4.35545203e+02  3.20185715e+01  5.38026750e-01 ...  1.10966379e-03
   1.26860222e-03  5.66523828e-04]
 [-5.25702263e+02  4.09430383e+01 -2.43137723e+01 ...  4.92032607e-05
   2.16024488e-05  1.82958245e-05]
 ...
 [-2.88720480e+02  1.47543712e+01 -1.99646873e+01 ...  1.33129519e-02
   8.80587188e-03  7.19241115e-03]
 [-6.97362436e+02  6.87248920e+01  8.32315180e+00 ...  2.19207354e-06
   1.73332923e-06  1.36195189e-06]
 [-5.51417047e+02  3.36983079e+01 -1.56172817e+01 ...  3.57153356e-04
   2.09038457e-04  1.31212363e-04]]
Output Data:  ['happy' 'angry' 'happy' 'sad' 'happy' 'angry' 'angry' 'angry' 'angry'
 'happy' 'sad' 'neutral' 'sad' 'neutral' 'sad' 'neutral' 'sad' 'angry'
 'sad' 'sad' 'happy' 'happy' 'sad' 'happy' 'sad' 'happy' 'angry' 'angry'
 'neutral' 'angry' 'sad' 'sad' 'happy' 'angry' 'neutral' 'sad' 'sad'
 'happy' 'sad' 'neutral' 'sad' 'happy' 'happy' 'sad' '

In [55]:
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 77.38%


In [56]:
if not os.path.isdir("result"):
    os.mkdir("result")

pickle.dump(model, open("result/mlp_classifier.model", "wb"))