
[Dataset](https://drive.google.com/file/d/1wWsrN2Ep7x6lWqOXfr4rpKGYrJhWc8z7/view)

Task 2. Enhance the emotion detection model by integrating voice tone analysis for more comprehensive and accurate emotion recognition.

In [25]:
import os
# Raw string: keeps the Windows backslashes literal instead of relying on
# unrecognised escape sequences such as "\S" and "\P", which newer Python
# versions warn about.
Root = r"D:\Study\Project\Emotion detection using audio"
# Make the dataset folder the working directory so relative paths resolve against it.
os.chdir(Root)

In [26]:
ls

 Volume in drive D is Rohit
 Volume Serial Number is 705C-48A5

 Directory of D:\Study\Project\Emotion detection using audio

19-Feb-24  13:11    <DIR>          .
19-Feb-24  11:56    <DIR>          ..
19-Feb-24  11:58    <DIR>          Actor_01
19-Feb-24  11:58    <DIR>          Actor_02
19-Feb-24  11:58    <DIR>          Actor_03
19-Feb-24  11:58    <DIR>          Actor_04
19-Feb-24  11:58    <DIR>          Actor_05
19-Feb-24  11:58    <DIR>          Actor_06
19-Feb-24  11:58    <DIR>          Actor_07
19-Feb-24  11:58    <DIR>          Actor_08
19-Feb-24  11:58    <DIR>          Actor_09
19-Feb-24  11:58    <DIR>          Actor_10
19-Feb-24  11:58    <DIR>          Actor_11
19-Feb-24  11:58    <DIR>          Actor_12
19-Feb-24  11:58    <DIR>          Actor_13
19-Feb-24  11:58    <DIR>          Actor_14
19-Feb-24  11:58    <DIR>          Actor_15
19-Feb-24  11:58    <DIR>          Actor_16
19-Feb-24  11:58    <DIR>          Actor_17
19-Feb-24  11:58    <DIR>          Actor_18
19-Feb-

In [27]:
import librosa
import soundfile
import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [28]:
#Extract features (mfcc, chroma, mel) from a sound file
def extract_feature(file_name, mfcc, chroma, mel):
    """Build one flat feature vector for a sound file.

    Every enabled feature is computed frame by frame, averaged over the
    frames, and the averages are concatenated in the fixed order
    MFCC, chroma, mel.

    Parameters
    ----------
    file_name : str
        Path of the audio file; opened with ``soundfile.SoundFile``.
    mfcc : bool
        If truthy, append the frame-averaged MFCCs (``n_mfcc=40``).
    chroma : bool
        If truthy, append the frame-averaged chromagram computed from the
        STFT magnitude.
    mel : bool
        If truthy, append the frame-averaged mel spectrogram.

    Returns
    -------
    numpy.ndarray
        1-D array holding the selected features; empty if every flag is falsy.
    """
    # Only reading needs the file open; release the handle before the analysis.
    with soundfile.SoundFile(file_name) as sound_file:
        samples = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

    # soundfile returns a 2-D (frames, channels) array for multi-channel files,
    # which is not the layout the librosa calls below work on; down-mix to mono.
    # Mono files are already 1-D and pass through untouched.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    # Start from an empty default-dtype array, exactly as the incremental
    # hstack did, so the dtype of the result is unchanged.
    pieces = [np.array([])]
    if mfcc:
        mfcc_frames = librosa.feature.mfcc(y=samples, sr=sample_rate, n_mfcc=40)
        pieces.append(np.mean(mfcc_frames.T, axis=0))
    if chroma:
        # The STFT magnitude is only needed for the chroma feature.
        magnitude = np.abs(librosa.stft(samples))
        chroma_frames = librosa.feature.chroma_stft(S=magnitude, sr=sample_rate)
        pieces.append(np.mean(chroma_frames.T, axis=0))
    if mel:
        mel_frames = librosa.feature.melspectrogram(y=samples, sr=sample_rate)
        pieces.append(np.mean(mel_frames.T, axis=0))
    return np.hstack(pieces)

In [29]:
# Lookup table: two-digit emotion code used in the file names -> emotion name
emotions = dict([
    ('01', 'neutral'),
    ('02', 'calm'),
    ('03', 'happy'),
    ('04', 'sad'),
    ('05', 'angry'),
    ('06', 'fearful'),
    ('07', 'disgust'),
    ('08', 'surprised'),
])

# Subset of emotions the classifier is trained and evaluated on
observed_emotions = [
    'calm',
    'happy',
    'fearful',
    'disgust',
]

In [30]:
#Load the data and extract features for each sound file
def load_data(test_size=0.2,
              pattern=r"D:\Study\Project\Emotion detection using audio\Actor_*/*.wav"):
    """Extract features for every observed-emotion file and split train/test.

    Parameters
    ----------
    test_size : float
        Passed straight to ``train_test_split`` as ``test_size``.
    pattern : str
        Glob pattern selecting the audio files. The default is the original
        hard-coded location, written as a raw string so the backslashes are
        literal rather than unrecognised escape sequences.

    Returns
    -------
    list
        The result of ``train_test_split``: train features, test features,
        train labels, test labels.

    Raises
    ------
    ValueError
        If no file matching ``pattern`` carries an emotion listed in
        ``observed_emotions``.
    """
    features, labels = [], []
    # glob returns matches in arbitrary order; sorting makes the fixed
    # random_state below yield the same split on every machine.
    for file in sorted(glob.glob(pattern)):
        fields = os.path.basename(file).split("-")
        # The emotion code is the third dash-separated field; names that do not
        # follow that layout (or carry an unknown code) are skipped rather than
        # aborting the whole load.
        emotion = emotions.get(fields[2]) if len(fields) > 2 else None
        if emotion not in observed_emotions:
            continue
        features.append(extract_feature(file, mfcc=True, chroma=True, mel=True))
        labels.append(emotion)
    if not features:
        raise ValueError(
            "No audio files with an observed emotion matched pattern {!r}".format(pattern)
        )
    return train_test_split(np.array(features), labels, test_size=test_size, random_state=9)

In [31]:
# Load the features and hold out a quarter of the samples for testing
(x_train, x_test, y_train, y_test) = load_data(test_size=0.25)

In [32]:
# Inspect the training feature matrix (one row per audio file, one column per feature)
x_train

array([[-5.22061890e+02,  3.50668907e+01,  3.75342917e+00, ...,
         1.65243138e-04,  1.04321596e-04,  6.55571566e-05],
       [-6.41227722e+02,  4.49487762e+01, -1.85174131e+00, ...,
         3.89261913e-05,  3.05255380e-05,  2.94166657e-05],
       [-6.50705750e+02,  5.30211639e+01, -4.92040396e+00, ...,
         4.75216802e-05,  3.46632551e-05,  1.62844444e-05],
       ...,
       [-5.50096191e+02,  1.70297680e+01, -1.14575644e+01, ...,
         1.51764631e-04,  1.16828531e-04,  8.47479314e-05],
       [-5.55357605e+02,  4.71569710e+01,  1.10750742e+01, ...,
         1.61086471e-04,  1.04962470e-04,  6.52811723e-05],
       [-5.04816345e+02,  3.53618660e+01, -1.43495779e+01, ...,
         6.08151546e-04,  5.55269653e-04,  4.47782222e-04]])

In [33]:
# Number of samples in the training set and in the testing set
print((len(x_train), len(x_test)))

(576, 192)


In [34]:
# Length of each feature vector (number of columns in the feature matrix)
print('Features extracted: {}'.format(x_train.shape[1]))

Features extracted: 180


In [35]:
#Initialize the Multi Layer Perceptron Classifier
# random_state pins the estimator's random number generation so that a fresh
# Restart & Run All reproduces the same fitted model and scores (9 matches the
# seed used for the train/test split).
# NOTE(review): scikit-learn documents learning_rate as only used when
# solver='sgd'; with the default solver it appears to have no effect - confirm intent.
model = MLPClassifier(
    alpha=0.01,
    batch_size=256,
    epsilon=1e-08,
    hidden_layer_sizes=(300,),
    learning_rate='adaptive',
    max_iter=500,
    random_state=9,
)

In [36]:
# Fit the classifier on the training features and their emotion labels
model.fit(X=x_train, y=y_train)

In [37]:
# Predicted emotion label for every sample in the test set
y_pred = model.predict(X=x_test)

In [38]:
# Inspect the predicted emotion labels for the test set
y_pred

array(['happy', 'calm', 'happy', 'happy', 'fearful', 'calm', 'calm',
       'disgust', 'calm', 'happy', 'happy', 'calm', 'fearful', 'happy',
       'disgust', 'happy', 'calm', 'fearful', 'happy', 'calm', 'calm',
       'disgust', 'disgust', 'calm', 'happy', 'happy', 'calm', 'happy',
       'calm', 'fearful', 'happy', 'happy', 'happy', 'calm', 'happy',
       'calm', 'calm', 'fearful', 'calm', 'calm', 'happy', 'calm', 'calm',
       'calm', 'fearful', 'calm', 'disgust', 'happy', 'calm', 'happy',
       'fearful', 'fearful', 'happy', 'happy', 'happy', 'happy', 'calm',
       'happy', 'calm', 'calm', 'disgust', 'calm', 'happy', 'calm',
       'happy', 'calm', 'calm', 'calm', 'fearful', 'happy', 'fearful',
       'fearful', 'fearful', 'fearful', 'fearful', 'disgust', 'fearful',
       'happy', 'calm', 'fearful', 'happy', 'calm', 'fearful', 'calm',
       'disgust', 'happy', 'calm', 'fearful', 'happy', 'happy', 'disgust',
       'happy', 'calm', 'happy', 'disgust', 'disgust', 'calm', 'calm'

In [39]:
# Fraction of test samples whose predicted emotion equals the true label
accuracy = accuracy_score(y_test, y_pred)

# Report it as a percentage with two decimals
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 66.15%


In [40]:
from sklearn.metrics import accuracy_score, f1_score

In [41]:
# Per-class F1 scores: average=None asks for one score per class rather than a single average.
# NOTE(review): presumably ordered by sorted label (calm, disgust, fearful, happy) - confirm against the scikit-learn docs.
f1_score(y_test, y_pred,average=None)

array([0.859375  , 0.42424242, 0.60526316, 0.61403509])

In [42]:
import pandas as pd
# Side-by-side table of true and predicted labels; show the first 20 rows
df = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred))
df.head(n=20)

Unnamed: 0,Actual,Predicted
0,happy,happy
1,calm,calm
2,happy,happy
3,happy,happy
4,disgust,fearful
5,calm,calm
6,happy,calm
7,happy,disgust
8,disgust,calm
9,happy,happy


In [43]:
import pickle
# Persist the trained classifier to disk with pickle
with open('modelForPrediction1.sav', mode='wb') as f:
    pickle.dump(model, f)

In [44]:
filename = 'modelForPrediction1.sav'
# Load the saved model through a context manager so the file handle is closed
# once unpickling finishes. Only unpickle files you created yourself: pickle
# can execute arbitrary code while loading.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Relative path, so it resolves against the current working directory;
# os.path.join keeps the separator portable.
# NOTE: the third field of this file name is '01', which the `emotions` mapping
# calls 'neutral'. That label is not in `observed_emotions`, so the model was
# never trained on it and cannot return the true label for this clip.
feature = extract_feature(os.path.join("Actor_07", "03-01-01-01-01-01-07.wav"), mfcc=True, chroma=True, mel=True)

# Turn the single feature vector into a one-row 2-D array, the same layout
# predict() received for x_test.
feature = feature.reshape(1, -1)

prediction = loaded_model.predict(feature)
prediction

array(['calm'], dtype='<U7')

In [45]:
# Inspect the reshaped single-row feature array that was passed to predict()
feature

array([[-6.35592651e+02,  6.64345856e+01,  3.61689687e+00,
         2.15306664e+01,  4.65024322e-01, -4.00999689e+00,
        -1.21751642e+01,  8.66315901e-01, -4.18574667e+00,
        -4.36179352e+00, -1.42581356e+00,  2.10608411e+00,
        -7.12672758e+00,  3.70053124e+00, -5.58267689e+00,
        -2.45656562e+00, -1.50990117e+00, -2.32082200e+00,
        -5.89503050e+00, -1.79849172e+00, -2.98432493e+00,
        -4.50238180e+00, -7.21068025e-01, -1.94489598e+00,
        -5.69466472e-01, -1.82352602e+00, -2.93693089e+00,
        -3.50408077e+00, -1.86185098e+00, -7.86468565e-01,
        -2.23111463e+00, -2.07092619e+00, -5.22028160e+00,
        -5.01933861e+00, -2.69417477e+00, -3.15398514e-01,
        -4.64844435e-01, -8.92420352e-01,  2.41881514e+00,
         2.95093346e+00,  6.58863008e-01,  6.46732211e-01,
         6.24277949e-01,  7.11605310e-01,  7.52990305e-01,
         6.49803340e-01,  6.41918957e-01,  6.64402068e-01,
         7.19834387e-01,  7.78768122e-01,  7.74789214e-0