In [1]:
##Import Libraries, librosa - For Audio Analysis, Pyaudio - for audio recording
import pyaudio
import wave
import librosa 
import librosa.display
import numpy as np
import pandas as pd
import IPython.display as ipd 

import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

  from ._conv import register_converters as _register_converters


In [2]:
#Importing Keras load_model for loading the saved model
from keras.models import load_model

Using TensorFlow backend.


In [3]:
# Show the notebook's working directory: the model file, the dataset folder and
# the recorded WAV used in later cells are all referenced by relative paths.
import os
os.getcwd()

'/Users/rohithmovva/Documents/DSP_Project_Files'

In [4]:
## Loading the saved CNN model
# The .h5 path is relative, so it is resolved against the current working directory.
# `savedmodel` is read as a module-level global inside get_pred().
savedmodel=load_model('EmotionDetectionModel.h5')

In [5]:
# Print the layer-by-layer architecture (output shapes and parameter counts) of the loaded model.
savedmodel.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 126, 128, 256)     2560      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 63, 64, 256)       0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 61, 62, 128)       295040    
_________________________________________________________________
dropout_1 (Dropout)          (None, 61, 62, 128)       0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 59, 60, 128)       147584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 59, 60, 128)       0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 453120)            0         
__________

In [6]:
###This part of code is taken from Source:  https://people.csail.mit.edu/hubert/pyaudio/
## Record RECORD_SECONDS of mono microphone audio and save it to WAVE_OUTPUT_FILENAME.
## Start talking once "* recording" is displayed.
CHUNK = 1024  # frames requested from the input stream per read() call
FORMAT = pyaudio.paInt16  # 16-bit samples (alternative: paInt8)
CHANNELS = 1
RATE = 44100  # sample rate in Hz
RECORD_SECONDS = 3
WAVE_OUTPUT_FILENAME = "Output.wav"

p = pyaudio.PyAudio()
try:
    # Look the sample width up before the PyAudio instance is terminated, so no
    # method is ever called on an already-terminated instance.
    SAMPLE_WIDTH = p.get_sample_size(FORMAT)

    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)  # buffer
    try:
        print("* recording")

        frames = []
        # Whole chunks only: int() truncates, so the capture can be marginally
        # shorter than RECORD_SECONDS.
        for _ in range(int(RATE / CHUNK * RECORD_SECONDS)):
            frames.append(stream.read(CHUNK))  # 2 bytes (16 bits) per sample per channel

        print("* done recording")
    finally:
        # Always release the input stream, even if a read fails part-way through.
        stream.stop_stream()
        stream.close()
finally:
    # Always shut the PyAudio instance down, even if opening the stream failed.
    p.terminate()

# The context manager closes the WAV file even if writing raises.
with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(SAMPLE_WIDTH)
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))

* recording
* done recording


In [7]:
## Function for predicting the Emotion from the audio file
def get_pred(Test_file):
    """Predict the emotion expressed in a speech clip with the saved CNN.

    The first 3 seconds of the file are loaded, converted to a dB-scaled mel
    spectrogram, standardised to zero mean / unit variance, and passed to the
    module-level ``savedmodel``.

    Parameters
    ----------
    Test_file : str
        Path of the audio file to classify.

    Returns
    -------
    str
        One of "Neutral", "Calm", "Happy", "Sad", "Angry", "Fearful",
        "Disgust", "Surprised".

    Notes
    -----
    * Clips too short to produce 130 spectrogram frames are zero-padded at the
      end instead of failing in the reshape. Clips that were already long
      enough are processed exactly as before.
    * The 130-frame width relies on ``librosa.load`` being called without
      ``sr`` (librosa's documented default is 22050 Hz) and a hop of 512.
    """
    n_mels, n_frames = 128, 130   # spectrogram size the reshape below expects
    hop_length = 512              # librosa's documented default, made explicit
    clip_seconds = 3

    voice_clip, sample_rate = librosa.load(Test_file, res_type="kaiser_fast", duration=clip_seconds)

    # With centred frames, n_frames columns need at least (n_frames - 1) * hop samples.
    min_samples = (n_frames - 1) * hop_length
    if len(voice_clip) < min_samples:
        voice_clip = np.pad(voice_clip, (0, min_samples - len(voice_clip)), mode="constant")

    # Keyword arguments: newer librosa releases no longer accept y/sr positionally.
    mel_power = librosa.feature.melspectrogram(
        y=voice_clip, sr=sample_rate, hop_length=hop_length, power=2, fmax=3000
    )
    melspec = librosa.power_to_db(mel_power, ref=np.max)

    # Standardise; a constant spectrogram (e.g. pure silence) has zero spread,
    # so fall back to 1 to avoid feeding NaNs to the network.
    spread = np.std(melspec)
    if spread == 0:
        spread = 1.0
    normimage = ((melspec.flatten() - np.mean(melspec)) / spread).reshape(n_mels, n_frames)
    melimg = np.array(normimage).reshape(n_mels, n_frames, 1)

    # NOTE(review): np.dstack iterates over the first axis of `melimg`, giving a
    # (130, 1, 128) array, so the reshape below reinterprets the transposed
    # buffer rather than simply adding a batch axis. Kept unchanged because the
    # saved model was presumably trained on inputs built the same way - confirm
    # against the training notebook before "fixing" this.
    mel3d = np.dstack(melimg)
    mel3d = mel3d.reshape(1, n_mels, n_frames, 1)

    testpreds = savedmodel.predict(mel3d)
    predicted = int(np.argmax(testpreds))  # plain int for the dict lookup
    emotion = {0:"Neutral",1:"Calm",2:"Happy",3:"Sad",4:"Angry",5:"Fearful",6:"Disgust",7:"Surprised"}
    return emotion[predicted]

In [8]:
## Testing with a Disgust file
# NOTE(review): the path looks like the RAVDESS folder layout, where the third
# filename field ("07") would encode the emotion - confirm against the dataset docs.
Test_file='Audio_Speech_Actors_01-24/Actor_22/03-01-07-02-02-02-22.wav'

In [9]:
# Render an audio player for the clip so it can be listened to before classifying it.
ipd.Audio(Test_file)

In [10]:
# Classify the clip; the returned label is shown as the cell output.
get_pred(Test_file)

'Disgust'

In [11]:
## Testing with Angry file
# NOTE(review): same naming scheme as the previous test file; the third
# filename field ("05") presumably marks the emotion - confirm.
Test_file1='Audio_Speech_Actors_01-24/Actor_21/03-01-05-01-02-02-21.wav'
# Render an audio player for the clip.
ipd.Audio(Test_file1)

In [12]:
# Classify the clip; the returned label is shown as the cell output.
get_pred(Test_file1)

'Angry'

In [13]:
## Testing with recorded audio file
# WAVE_OUTPUT_FILENAME is the WAV written by the microphone-recording cell.
ipd.Audio(WAVE_OUTPUT_FILENAME)

In [14]:
# Classify the microphone recording. Unlike the dataset clips above, this file
# has no label in its name, so the output cannot be checked against one.
get_pred(WAVE_OUTPUT_FILENAME)

'Fearful'