In [1]:
#All the Required Packages and Libraies are installed.
from tqdm import tqdm
import pandas as pd
import numpy as np
from scipy.io import wavfile
from python_speech_features import mfcc
import os, glob, pickle
import librosa
from scipy import signal
import noisereduce as nr
import soundfile
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [2]:
#Loading the required RAVDESS DataSet with length of 1439 Audio Files 
os.listdir(path='/Users/satyamshandilya/ML Project/speech-emotion-recognition-ravdess-data')
def getListOfFiles(dirName):
    listOfFile=os.listdir(dirName)
    allFiles=list()
    for entry in listOfFile:
        fullPath=os.path.join(dirName, entry)
        if os.path.isdir(fullPath):
            allFiles=allFiles + getListOfFiles(fullPath)
        else:
            allFiles.append(fullPath)
    return allFiles

dirName = './speech-emotion-recognition-ravdess-data'
listOfFiles = getListOfFiles(dirName)
len(listOfFiles)

1440

In [3]:
#Now Cleaning Step is Performed where:
#DOWN SAMPLING OF AUDIO FILES IS DONE  AND PUT MASK OVER IT AND DIRECT INTO CLEAN FOLDER
#MASK IS TO REMOVE UNNECESSARY EMPTY VOIVES AROUND THE MAIN AUDIO VOICE 
def envelope(y , rate, threshold):
    mask=[]
    y=pd.Series(y).apply(np.abs)
    y_mean = y.rolling(window=int(rate/10) ,  min_periods=1 , center = True).mean()
    for mean in y_mean:
        if mean>threshold:
            mask.append(True)
        else:
            mask.append(False)
    return mask

In [4]:
import glob,pickle
for file in tqdm(glob.glob('/Users/satyamshandilya/ML Project/speech-emotion-recognition-ravdess-data//**//*.wav')):
    file_name = os.path.basename(file)
    signal , rate = librosa.load(file, sr=16000)
    mask = envelope(signal,rate, 0.0005)
    wavfile.write(filename= '/Users/satyamshandilya/ML Project/clean_speech//'+str(file_name), rate=rate,data=signal[mask])

100%|██████████| 1439/1439 [00:37<00:00, 38.29it/s]


In [5]:
#Feature Extraction of Audio Files Function 
#Extract features (mfcc, chroma, mel) from a sound file
def extract_feature(file_name, mfcc, chroma, mel):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate=sound_file.samplerate
        if chroma:
            stft=np.abs(librosa.stft(X))
        result=np.array([])
        if mfcc:
            mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
        result=np.hstack((result, mfccs))
        if chroma:
            chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
        result=np.hstack((result, chroma))
        if mel:
            mel=np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
        result=np.hstack((result, mel))
    return result

In [6]:
#Emotions in the RAVDESS dataset to be classified Audio Files based on . 
emotions={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}
#These are the emotions User wants to observe more :
observed_emotions=['calm', 'happy', 'angry', 'disgust']

In [7]:
#Load the data and extract features for each sound file
from glob import glob
import os
import glob
def load_data(test_size=0.33):
    x,y=[],[]
    answer = 0
    for file in glob.glob('/Users/satyamshandilya/ML Project/clean_speech//*.wav'):
        file_name=os.path.basename(file)
        emotion=emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            answer += 1
            continue
        feature=extract_feature(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append([emotion,file_name])
    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)

In [10]:
#Split the dataset
import librosa
import numpy as np
# x_train,x_test,y_trai,y_tes=load_data(test_size=0.25)
print(np.shape(x_train),np.shape(x_test), np.shape(y_trai),np.shape(y_tes))
y_test_map = np.array(y_tes).T
y_test = y_test_map[0]
test_filename = y_test_map[1]
y_train_map = np.array(y_trai).T
y_train = y_train_map[0]
train_filename = y_train_map[1]
print(np.shape(y_train),np.shape(y_test))
print(*test_filename,sep="\n")

(576, 180) (192, 180) (576, 2) (192, 2)
(576,) (192,)
03-01-03-02-02-01-16.wav
03-01-03-01-02-01-17.wav
03-01-03-01-02-02-05.wav
03-01-02-01-01-01-21.wav
03-01-05-02-01-01-21.wav
03-01-02-01-02-02-01.wav
03-01-07-01-02-01-17.wav
03-01-03-02-02-02-04.wav
03-01-03-02-02-01-14.wav
03-01-07-01-01-02-17.wav
03-01-07-02-02-01-12.wav
03-01-05-01-02-02-07.wav
03-01-07-02-01-02-09.wav
03-01-02-02-01-02-01.wav
03-01-05-02-01-02-06.wav
03-01-02-02-01-02-19.wav
03-01-05-01-02-01-17.wav
03-01-03-01-02-02-13.wav
03-01-05-01-01-02-14.wav
03-01-05-02-01-01-14.wav
03-01-03-01-02-02-21.wav
03-01-05-02-02-01-23.wav
03-01-05-02-02-02-13.wav
03-01-07-02-01-01-21.wav
03-01-03-01-02-02-23.wav
03-01-05-02-02-01-24.wav
03-01-05-02-02-02-12.wav
03-01-03-01-02-01-13.wav
03-01-07-02-01-02-24.wav
03-01-02-02-01-02-23.wav
03-01-03-01-02-01-09.wav
03-01-03-01-02-02-14.wav
03-01-05-01-01-01-10.wav
03-01-03-02-01-01-02.wav
03-01-03-01-01-01-15.wav
03-01-02-02-02-01-02.wav
03-01-03-02-02-01-05.wav
03-01-07-01-02-01-11.

In [11]:
#Get the shape of the training and testing datasets
print((x_train[0]))
print()
print((x_test[0]))
print()
#Get the number of features extracted
print(f'Features extracted: {x_train.shape[1]}')

[-3.90711548e+02  4.63122864e+01 -2.51764641e+01 -3.83111382e+00
 -1.75483227e+01 -3.19253902e+01 -2.38653374e+01 -2.12510452e+01
 -9.19464207e+00 -1.29695063e+01 -1.92252884e+01 -3.70474815e+00
 -2.09867096e+01  9.58110571e-01 -2.16735516e+01 -5.31903410e+00
 -1.10358829e+01 -1.03974533e+01 -1.19409599e+01 -8.03185558e+00
 -1.37130070e+01 -7.08648062e+00 -8.36725616e+00  2.27544951e+00
  9.75155413e-01  1.17445765e+01  5.38146257e+00  9.60692787e+00
  6.95992517e+00  9.21212101e+00  1.02973766e+01  1.20698681e+01
  9.84880543e+00  8.19135666e+00  2.30051231e+00  3.31454563e+00
  4.71673822e+00  4.91703415e+00  2.95189667e+00 -3.65189090e-02
  4.31435645e-01  4.05591279e-01  3.96214724e-01  3.96962672e-01
  4.28109407e-01  4.97069478e-01  5.28852880e-01  6.03578627e-01
  6.23844266e-01  5.69440782e-01  5.70043802e-01  4.93273020e-01
  5.22305936e-06  6.58757017e-06  1.73842989e-06  3.08892709e-06
  1.41703087e-04  4.63715615e-03  3.21895033e-02  8.75838026e-02
  3.16878080e-01  4.50642

In [12]:
# Initialize the Multi Layer Perceptron Classifier
model=MLPClassifier(alpha=0.01, 
                    batch_size=256, 
                    epsilon=1e-08, 
                    hidden_layer_sizes=(300,), 
                    learning_rate='adaptive', 
                    max_iter=500)

In [13]:
#Train the model
model.fit(x_train,y_train)

MLPClassifier(alpha=0.01, batch_size=256, hidden_layer_sizes=(300,),
              learning_rate='adaptive', max_iter=500)

In [14]:
#SAVING THE MODEL
import pickle
# Save the Modle to file in the current working directory
#For any new testing data other than the data in dataset

Pkl_Filename = "Emotion_Voice_Detection_Model.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(model, file)

In [15]:
# Load the Model back from file
with open(Pkl_Filename, 'rb') as file:  
    Emotion_Voice_Detection_Model = pickle.load(file)

Emotion_Voice_Detection_Model

MLPClassifier(alpha=0.01, batch_size=256, hidden_layer_sizes=(300,),
              learning_rate='adaptive', max_iter=500)

In [16]:
y_pred=Emotion_Voice_Detection_Model.predict(x_test)
y_pred

array(['happy', 'angry', 'angry', 'calm', 'happy', 'calm', 'angry',
       'happy', 'happy', 'happy', 'disgust', 'disgust', 'disgust', 'calm',
       'angry', 'calm', 'angry', 'happy', 'angry', 'angry', 'happy',
       'angry', 'angry', 'disgust', 'happy', 'angry', 'angry', 'happy',
       'disgust', 'calm', 'happy', 'happy', 'angry', 'happy', 'happy',
       'calm', 'happy', 'disgust', 'calm', 'angry', 'disgust', 'calm',
       'angry', 'disgust', 'calm', 'disgust', 'disgust', 'angry',
       'disgust', 'angry', 'calm', 'angry', 'angry', 'angry', 'calm',
       'happy', 'calm', 'angry', 'happy', 'calm', 'calm', 'angry',
       'disgust', 'angry', 'calm', 'disgust', 'calm', 'happy', 'happy',
       'happy', 'calm', 'calm', 'happy', 'happy', 'calm', 'happy',
       'happy', 'happy', 'disgust', 'calm', 'calm', 'calm', 'angry',
       'disgust', 'disgust', 'happy', 'disgust', 'angry', 'disgust',
       'angry', 'disgust', 'disgust', 'calm', 'disgust', 'calm',
       'disgust', 'angry', 'd

In [17]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
results=confusion_matrix(y_test,y_pred)
print('Confusion Matrix')
print(results)
print()
print('Accuracy Score ',accuracy_score(y_test,y_pred))
print()
print('Report')
print(classification_report(y_test,y_pred))

Confusion Matrix
[[37  3  7  3]
 [ 0 39  5  0]
 [ 4  3 32  2]
 [ 7  3  3 44]]

Accuracy Score  0.7916666666666666

Report
              precision    recall  f1-score   support

       angry       0.77      0.74      0.76        50
        calm       0.81      0.89      0.85        44
     disgust       0.68      0.78      0.73        41
       happy       0.90      0.77      0.83        57

    accuracy                           0.79       192
   macro avg       0.79      0.79      0.79       192
weighted avg       0.80      0.79      0.79       192



In [18]:
#Store the Prediction probabilities into CSV file 
import numpy as np
import pandas as pd
y_pred1 = pd.DataFrame(y_pred, columns=['predictions'])
y_pred1['file_names'] = test_filename
print(y_pred1)
y_pred1.to_csv('predictionfinal.csv')

    predictions                file_names
0         happy  03-01-03-02-02-01-16.wav
1         angry  03-01-03-01-02-01-17.wav
2         angry  03-01-03-01-02-02-05.wav
3          calm  03-01-02-01-01-01-21.wav
4         happy  03-01-05-02-01-01-21.wav
..          ...                       ...
187     disgust  03-01-02-02-02-01-05.wav
188     disgust  03-01-07-01-01-02-20.wav
189       happy  03-01-03-01-01-01-13.wav
190     disgust  03-01-07-01-02-01-02.wav
191     disgust  03-01-03-01-01-02-17.wav

[192 rows x 2 columns]
