In [1]:
import numpy as np
import librosa
import soundfile
import os, glob, pickle
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

### Define a function extract_feature to extract the mfcc, chroma, and mel features from a sound file. This function takes 4 parameters- the file name and three Boolean parameters for the three features:

    mfcc: Mel Frequency Cepstral Coefficient, represents the short-term power spectrum of a sound
    chroma: Pertains to the 12 different pitch classes
    mel: Mel Spectrogram Frequency


In [2]:
def extract_feature(file_name, mfcc, chroma, mel):
    #can return NoneType if soundfile doesn't open
    with soundfile.SoundFile(file_name) as f:
        X = f.read(dtype="float32")
        sample_rate=f.samplerate
        if chroma:
            stft=np.abs(librosa.stft(X))
        result=np.array([])
        if mfcc:
            mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result=np.hstack((result, mfccs))
        if chroma:
            chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
            result=np.hstack((result, chroma))
        if mel:
            mel=np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
            result=np.hstack((result, mel))
        return result

### lets make a dictionary of the emotions in the RAVDESS dataset, and then a list for the emotions we're going to be trying to identify from voices

    RAVDESS: Ryerson Audio-Visual Database of Emotional Speech and Song

In [3]:
#emotions and their key from the RAVDESS set
emotions={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}

#emotions we are looking to identify
observed_emotions=['calm', 'happy', 'sad', 'angry']

### now lets load the voice audio
#### x, y are for features and emotions of analyzed audio. 
#### we will use glob to grab all the audio files we want. 
#### excuse the mess of a path

In [4]:
def load_data(test_size=0.2):
    x,y=[],[]
    for file in glob.glob("/home/ubuntu/Downloads/school/2021-FALL/dataScience/simpleSpeechEmotionalRecognition/audio/Actor_*/*.wav"):
        file_name=os.path.basename(file)
        emotion=emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        feature=extract_feature(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append(emotion)
    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)

### split the dataset into training and testing sets
#### Let’s keep the test set 25% of everything and use load_data to generate x and y train and test sets

In [5]:
x_train,x_test,y_train,y_test=load_data(test_size=0.25)
#lets see the shape of features
print((x_train.shape[0], x_test.shape[0]))
#now lets get number of features extracted
print(f'Features extracted: {x_train.shape[1]}')

(576, 192)
Features extracted: 180


### This is a Multi-layer Perceptron Classifier; it optimizes the log-loss function using LBFGS or stochastic gradient descent. 
#### BFGS calculates gradient descent while preconditioning the gradient with curve information (to avoid possible local minimas). LBFGS is limitted and used when memory requirements aren't up to par with data size.
    Unlike SVM or Naive Bayes, the MLPClassifier has an internal NN for the purpose of classification.

In [6]:
model=MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=500)

### fit and train :)

In [7]:
model.fit(x_train, y_train)

MLPClassifier(alpha=0.01, batch_size=256, hidden_layer_sizes=(300,),
              learning_rate='adaptive', max_iter=500)

### and now, predict

In [8]:
y_pred=model.predict(x_train)

### use sklearn's accuracy_score() method to test the accuracy of our model

In [10]:
accuracy=accuracy_score(y_true=y_train, y_pred=y_pred)

#print accuracy with 2 decimals
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 79.17%
