In [2]:
import librosa
import soundfile
import os, glob
import wave
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
import pyaudio
from array import array

In [3]:
def extract_feature(file_name, mfcc, chroma, mel):
    with soundfile.SoundFile(file_name) as sound_file:
        X,sample_rate = librosa.load(file_name)
        if chroma:
            stft=np.abs(librosa.stft(X))
        result=np.array([])
        if mfcc:
            mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result=np.hstack((result, mfccs))
        if chroma:
            chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
            result=np.hstack((result, chroma))
        if mel:
            mel=np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
            result=np.hstack((result, mel))
    return result

In [4]:
emotions={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}
observed_emotions=['calm', 'happy', 'sad', 'angry']

In [21]:
def load_data(test_size=0.2):
    x,y=[],[]
    for file in glob.glob("C:\\ravdess_data\\Actor_*\\*.wav"):
        file_name=os.path.basename(file)
        emotion=emotions[file_name.split("-")[2]]
        #print(emotion)
        if emotion not in observed_emotions:
            continue
        feature=extract_feature(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append(emotion)
    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)
x_train,x_test,y_train,y_test=load_data(test_size=0.25)
print((x_train.shape[0], x_test.shape[0]))

(576, 192)


In [30]:
mlp = MLPClassifier(max_iter=1600)

parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}



In [31]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
clf.fit(x_train,y_train)

GridSearchCV(cv=3, estimator=MLPClassifier(max_iter=1600), n_jobs=-1,
             param_grid={'activation': ['tanh', 'relu'],
                         'alpha': [0.0001, 0.05],
                         'hidden_layer_sizes': [(50, 50, 50), (50, 100, 50),
                                                (100,)],
                         'learning_rate': ['constant', 'adaptive'],
                         'solver': ['sgd', 'adam']})

In [32]:
from sklearn.metrics import classification_report

y_pred=clf.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       angry       0.82      0.77      0.80        48
        calm       0.89      0.84      0.86        57
       happy       0.82      0.80      0.81        50
         sad       0.70      0.84      0.77        37

    accuracy                           0.81       192
   macro avg       0.81      0.81      0.81       192
weighted avg       0.82      0.81      0.81       192



In [33]:
accuracy=accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 81.25%


In [15]:
# This is for audio file input

def find_emot(audio_file):
    feats=[]
    feats.append(extract_feature(audio_file,mfcc=True, chroma=True, mel=True))
    emot=clf.predict(feats)
    print(emot[0])
find_emot("Audio_d.wav")

sad


In [6]:
#This is for Microphone input

import pyaudio
import wave
def record_file(f):
    CHUNK = 8192
    FORMAT = pyaudio.paInt16
    CHANNELS = 2
    RATE = 44100
    RECORD_SECONDS = 5
    
    p = pyaudio.PyAudio()

    stream = p.open(format=FORMAT,channels=CHANNELS,rate=RATE,input=True,input_device_index = 0,frames_per_buffer=CHUNK)

    print("* recording")

    frames = []

    for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames.append(data)

    print("* done recording")
    
    stream.stop_stream()
    stream.close()
    p.terminate()
    wf = wave.open(f, 'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))
    wf.close()

In [7]:
print("Please talk")
filename = "micro_3.wav"

record_file(filename)

features = extract_feature(filename, mfcc=True, chroma=True, mel=True).reshape(1, -1)

result = clf.predict(features)[0]

print("result:", result)

Please talk
* recording
* done recording
result: happy
