In [1]:
import librosa
import pydub
import soundfile
import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from pydub import AudioSegment
from pydub.playback import play 




In [2]:
def extract_feature(file_name, **kwargs):
    """
    Extract a feature vector from audio file `file_name`.

    The file is read with soundfile as float32; multi-channel audio is
    averaged down to mono first. Each requested feature is computed per
    frame, averaged over time, and appended to the result in this fixed
    order: mfcc, chroma, mel, contrast, tonnetz.

        Features supported (boolean keyword flags, all off by default):
            - MFCC (mfcc)
            - Chroma (chroma)
            - MEL Spectrogram Frequency (mel)
            - Contrast (contrast)
            - Tonnetz (tonnetz)
        e.g:
        `features = extract_feature(path, mel=True, mfcc=True)`

    Returns:
        1-D numpy array holding the concatenated features; it is empty
        when no feature flag is set.
    """
    use_mfcc = kwargs.get("mfcc")
    use_chroma = kwargs.get("chroma")
    use_mel = kwargs.get("mel")
    use_contrast = kwargs.get("contrast")
    use_tonnetz = kwargs.get("tonnetz")

    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

    # soundfile returns (frames, channels) for multi-channel files, while
    # librosa treats the last axis as time; downmix so every file is 1-D.
    if X.ndim > 1:
        X = X.mean(axis=1)

    # chroma and spectral contrast share the same magnitude spectrogram
    if use_chroma or use_contrast:
        stft = np.abs(librosa.stft(y=X))

    result = np.array([])
    if use_mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
        result = np.hstack((result, mfccs))
    if use_chroma:
        chroma_feat = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, chroma_feat))
    if use_mel:
        # the signal must be passed as `y=`: newer librosa releases make the
        # audio argument keyword-only and reject a positional array
        mel_feat = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
        result = np.hstack((result, mel_feat))
    if use_contrast:
        contrast_feat = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, contrast_feat))
    if use_tonnetz:
        tonnetz_feat = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
        result = np.hstack((result, tonnetz_feat))
    return result

In [12]:
# RAVDESS emotion table: zero-padded two-digit code -> emotion name,
# numbered from '01' in the order listed below
emotions = {
    '%02d' % code: label
    for code, label in enumerate(
        ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised'],
        start=1,
    )
}

# Subset of emotions to observe
observed_emotions = list(('calm', 'happy', 'fearful', 'disgust'))

In [5]:
# every emotion code used in RAVDESS file names, mapped to its label
int2emotion = dict([
    ("01", "neutral"),
    ("02", "calm"),
    ("03", "happy"),
    ("04", "sad"),
    ("05", "angry"),
    ("06", "fearful"),
    ("07", "disgust"),
    ("08", "surprised"),
])

# only samples carrying one of these labels are kept ( tune to your needs )
AVAILABLE_EMOTIONS = set(["angry", "sad", "neutral", "happy"])

def load_data(test_size=0.2,
              data_glob=r"C:\Users\SHUBHAM SAINI\RawData\Actor_*\*.wav",
              verbose=True):
    """
    Build the train/test split from the audio files matched by `data_glob`.

    The emotion label is decoded from the third dash-separated field of each
    file's base name via `int2emotion`; files whose emotion is not in
    `AVAILABLE_EMOTIONS` are skipped. Features are extracted with
    `extract_feature(file, mfcc=True, chroma=True, mel=True)`.

    Args:
        test_size: passed through to `train_test_split`.
        data_glob: glob pattern selecting the .wav files to load.
        verbose: when true, print every matched file with its emotion
            (printed before the emotion filter is applied).

    Returns:
        The `train_test_split` result for (features, labels), split with
        random_state=7: X_train, X_test, y_train, y_test.

    Raises:
        ValueError: if no matched file has an allowed emotion.
    """
    X, y = [], []
    # glob order depends on the filesystem; sorting keeps the sample order,
    # and therefore the seeded split, the same on every machine
    for file in sorted(glob.glob(data_glob)):
        # get the base name of the audio file
        basename = os.path.basename(file)
        # get the emotion label
        emotion = int2emotion[basename.split("-")[2]]
        if verbose:
            print(file+ "  "+emotion)
        # we allow only AVAILABLE_EMOTIONS we set
        if emotion not in AVAILABLE_EMOTIONS:
            continue
        # extract speech features
        features = extract_feature(file, mfcc=True, chroma=True, mel=True)
        # add to data
        X.append(features)
        y.append(emotion)
    if not X:
        raise ValueError(
            "No usable audio files matched {!r}; check the dataset path".format(data_glob)
        )
    # split the data to training and testing and return it
    return train_test_split(np.array(X), y, test_size=test_size, random_state=7)

In [6]:
# Load the RAVDESS dataset through load_data(); test_size=0.25 holds out
# 25% of the samples for testing and leaves the other 75% for training.
# X_* hold the extracted feature vectors, y_* the matching emotion labels.
X_train, X_test, y_train, y_test = load_data(test_size=0.25)

C:\Users\SHUBHAM SAINI\RawData\Actor_01\03-01-01-01-01-01-01.wav  neutral
C:\Users\SHUBHAM SAINI\RawData\Actor_01\03-01-01-01-01-02-01.wav  neutral
C:\Users\SHUBHAM SAINI\RawData\Actor_01\03-01-01-01-02-01-01.wav  neutral
C:\Users\SHUBHAM SAINI\RawData\Actor_01\03-01-01-01-02-02-01.wav  neutral
C:\Users\SHUBHAM SAINI\RawData\Actor_01\03-01-02-01-01-01-01.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_01\03-01-02-01-01-02-01.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_01\03-01-02-01-02-01-01.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_01\03-01-02-01-02-02-01.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_01\03-01-02-02-01-01-01.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_01\03-01-02-02-01-02-01.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_01\03-01-02-02-02-01-01.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_01\03-01-02-02-02-02-01.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_01\03-01-03-01-01-01-01.wav  happy
C:\Users\SHUBHAM SAINI\RawData\Actor_01\03-01-03-01-01-02-01.wav

C:\Users\SHUBHAM SAINI\RawData\Actor_03\03-01-01-01-02-02-03.wav  neutral
C:\Users\SHUBHAM SAINI\RawData\Actor_03\03-01-02-01-01-01-03.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_03\03-01-02-01-01-02-03.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_03\03-01-02-01-02-01-03.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_03\03-01-02-01-02-02-03.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_03\03-01-02-02-01-01-03.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_03\03-01-02-02-01-02-03.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_03\03-01-02-02-02-01-03.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_03\03-01-02-02-02-02-03.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_03\03-01-03-01-01-01-03.wav  happy
C:\Users\SHUBHAM SAINI\RawData\Actor_03\03-01-03-01-01-02-03.wav  happy
C:\Users\SHUBHAM SAINI\RawData\Actor_03\03-01-03-01-02-01-03.wav  happy
C:\Users\SHUBHAM SAINI\RawData\Actor_03\03-01-03-01-02-02-03.wav  happy
C:\Users\SHUBHAM SAINI\RawData\Actor_03\03-01-03-02-01-01-03.wav  happ

C:\Users\SHUBHAM SAINI\RawData\Actor_05\03-01-01-01-01-02-05.wav  neutral
C:\Users\SHUBHAM SAINI\RawData\Actor_05\03-01-01-01-02-01-05.wav  neutral
C:\Users\SHUBHAM SAINI\RawData\Actor_05\03-01-01-01-02-02-05.wav  neutral
C:\Users\SHUBHAM SAINI\RawData\Actor_05\03-01-02-01-01-01-05.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_05\03-01-02-01-01-02-05.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_05\03-01-02-01-02-01-05.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_05\03-01-02-01-02-02-05.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_05\03-01-02-02-01-01-05.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_05\03-01-02-02-01-02-05.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_05\03-01-02-02-02-01-05.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_05\03-01-02-02-02-02-05.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_05\03-01-03-01-01-01-05.wav  happy
C:\Users\SHUBHAM SAINI\RawData\Actor_05\03-01-03-01-01-02-05.wav  happy
C:\Users\SHUBHAM SAINI\RawData\Actor_05\03-01-03-01-02-01-05.wav  

C:\Users\SHUBHAM SAINI\RawData\Actor_07\03-01-01-01-01-02-07.wav  neutral
C:\Users\SHUBHAM SAINI\RawData\Actor_07\03-01-01-01-02-01-07.wav  neutral
C:\Users\SHUBHAM SAINI\RawData\Actor_07\03-01-01-01-02-02-07.wav  neutral
C:\Users\SHUBHAM SAINI\RawData\Actor_07\03-01-02-01-01-01-07.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_07\03-01-02-01-01-02-07.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_07\03-01-02-01-02-01-07.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_07\03-01-02-01-02-02-07.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_07\03-01-02-02-01-01-07.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_07\03-01-02-02-01-02-07.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_07\03-01-02-02-02-01-07.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_07\03-01-02-02-02-02-07.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_07\03-01-03-01-01-01-07.wav  happy
C:\Users\SHUBHAM SAINI\RawData\Actor_07\03-01-03-01-01-02-07.wav  happy
C:\Users\SHUBHAM SAINI\RawData\Actor_07\03-01-03-01-02-01-07.wav  

C:\Users\SHUBHAM SAINI\RawData\Actor_09\03-01-01-01-02-01-09.wav  neutral
C:\Users\SHUBHAM SAINI\RawData\Actor_09\03-01-01-01-02-02-09.wav  neutral
C:\Users\SHUBHAM SAINI\RawData\Actor_09\03-01-02-01-01-01-09.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_09\03-01-02-01-01-02-09.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_09\03-01-02-01-02-01-09.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_09\03-01-02-01-02-02-09.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_09\03-01-02-02-01-01-09.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_09\03-01-02-02-01-02-09.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_09\03-01-02-02-02-01-09.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_09\03-01-02-02-02-02-09.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_09\03-01-03-01-01-01-09.wav  happy
C:\Users\SHUBHAM SAINI\RawData\Actor_09\03-01-03-01-01-02-09.wav  happy
C:\Users\SHUBHAM SAINI\RawData\Actor_09\03-01-03-01-02-01-09.wav  happy
C:\Users\SHUBHAM SAINI\RawData\Actor_09\03-01-03-01-02-02-09.wav  ha

C:\Users\SHUBHAM SAINI\RawData\Actor_11\03-01-01-01-02-02-11.wav  neutral
C:\Users\SHUBHAM SAINI\RawData\Actor_11\03-01-02-01-01-01-11.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_11\03-01-02-01-01-02-11.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_11\03-01-02-01-02-01-11.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_11\03-01-02-01-02-02-11.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_11\03-01-02-02-01-01-11.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_11\03-01-02-02-01-02-11.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_11\03-01-02-02-02-01-11.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_11\03-01-02-02-02-02-11.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_11\03-01-03-01-01-01-11.wav  happy
C:\Users\SHUBHAM SAINI\RawData\Actor_11\03-01-03-01-01-02-11.wav  happy
C:\Users\SHUBHAM SAINI\RawData\Actor_11\03-01-03-01-02-01-11.wav  happy
C:\Users\SHUBHAM SAINI\RawData\Actor_11\03-01-03-01-02-02-11.wav  happy
C:\Users\SHUBHAM SAINI\RawData\Actor_11\03-01-03-02-01-01-11.wav  happ

C:\Users\SHUBHAM SAINI\RawData\Actor_13\03-01-01-01-02-01-13.wav  neutral
C:\Users\SHUBHAM SAINI\RawData\Actor_13\03-01-01-01-02-02-13.wav  neutral
C:\Users\SHUBHAM SAINI\RawData\Actor_13\03-01-02-01-01-01-13.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_13\03-01-02-01-01-02-13.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_13\03-01-02-01-02-01-13.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_13\03-01-02-01-02-02-13.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_13\03-01-02-02-01-01-13.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_13\03-01-02-02-01-02-13.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_13\03-01-02-02-02-01-13.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_13\03-01-02-02-02-02-13.wav  calm
C:\Users\SHUBHAM SAINI\RawData\Actor_13\03-01-03-01-01-01-13.wav  happy
C:\Users\SHUBHAM SAINI\RawData\Actor_13\03-01-03-01-01-02-13.wav  happy
C:\Users\SHUBHAM SAINI\RawData\Actor_13\03-01-03-01-02-01-13.wav  happy
C:\Users\SHUBHAM SAINI\RawData\Actor_13\03-01-03-01-02-02-13.wav  ha

In [7]:
# summarize the split produced by load_data()
# how many samples ended up in the training set
print("[+] Number of training samples:", len(X_train))
# how many samples ended up in the testing set
print("[+] Number of testing samples:", len(X_test))
# width of each feature vector, i.e. the number of values
# returned by the extract_feature() function per audio file
print("[+] Number of features:", X_train.shape[1])

[+] Number of training samples: 294
[+] Number of testing samples: 98
[+] Number of features: 180


In [8]:
# hyperparameters of the best model found by a grid search
model_params = dict(
    alpha=0.01,
    batch_size=256,
    epsilon=1e-08,
    hidden_layer_sizes=(300,),
    learning_rate='adaptive',
    max_iter=500,
)

In [9]:
# Initialize the Multi Layer Perceptron classifier; every hyperparameter
# comes from the `model_params` dict ( best parameters found so far )
model = MLPClassifier(**model_params)

In [10]:
# Train the model on the training split.
# `model.fit(...)` is kept as the cell's last expression so the notebook
# displays the fitted estimator's repr below the progress message.
print("[*] Training the model...")
model.fit(X_train, y_train)

[*] Training the model...


MLPClassifier(alpha=0.01, batch_size=256, hidden_layer_sizes=(300,),
              learning_rate='adaptive', max_iter=500)

In [11]:
# classify the held-out 25% of the data to measure how good we are
y_pred = model.predict(X_test)

# share of test samples whose predicted label equals the true label
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy: {:.2f}%".format(100 * accuracy))

Accuracy: 66.33%


In [None]:
# MLPClassifier is a scikit-learn estimator, not a Keras model, so it has no
# `save_weights` method (calling it raises AttributeError). The learned
# parameters are exposed as the `coefs_` / `intercepts_` lists instead; store
# each layer's arrays under an indexed key in a single .npz archive.
layer_weights = {}
for layer, (coef, intercept) in enumerate(zip(model.coefs_, model.intercepts_)):
    layer_weights["coef_{}".format(layer)] = coef
    layer_weights["intercept_{}".format(layer)] = intercept
np.savez("mlp_classifier_weights.npz", **layer_weights)

In [None]:
# now we save the model
# make result directory if doesn't exist yet
os.makedirs("result", exist_ok=True)

# the context manager flushes and closes the file even if pickling fails
with open("result/mlp_classifier.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [None]:
# reload the persisted classifier; the context manager closes the file handle
# NOTE: pickle.load can execute arbitrary code - only load files you trust
with open("result/mlp_classifier.pkl", "rb") as model_file:
    model = pickle.load(model_file)
filename = r"C:\Users\SHUBHAM SAINI\RawData\Actor_01\03-01-01-01-01-01-01.wav"
# extract the same features used for training and reshape the vector into
# a single-row 2-D array, since predict() takes one row per sample
features = extract_feature(filename, mfcc=True, chroma=True, mel=True).reshape(1, -1)
# predict
result = model.predict(features)[0]
# show the result !
print("result:", result)