In [2]:
# Imports, ignore warnings
import warnings
warnings.filterwarnings('ignore')
import os
from tensorflow import keras
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from keras.models import model_from_json
import librosa
import librosa.display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

In [3]:
# Read in the JSON file
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()

In [4]:
# Load the model from JSON 
loaded_model = model_from_json(loaded_model_json)

In [5]:
# Load weights into new model
loaded_model.load_weights('saved_models/Emotion_Voice_Detection_Model.h5')
print('Loaded model from disk')
loaded_model.summary()

Loaded model from disk
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_7 (Conv1D)            (None, 216, 128)          768       
_________________________________________________________________
activation_8 (Activation)    (None, 216, 128)          0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 216, 128)          82048     
_________________________________________________________________
activation_9 (Activation)    (None, 216, 128)          0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 216, 128)          0         
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 27, 128)           0         
_________________________________________________________________
conv1d_9 (Conv1D)            (N

In [6]:
loaded_model.layers[17].get_config()

{'name': 'activation_14',
 'trainable': True,
 'dtype': 'float32',
 'activation': 'softmax'}

In [7]:
o = keras.optimizers.RMSprop(lr = 0.00001, decay = 1e-6)
loaded_model.compile(loss ='categorical_crossentropy', optimizer = o, metrics = ['accuracy'])

In [11]:
# Method for reading in the audio files and extracting features
"""
- d is the directory the audio files, default is the current working directory.
- dur is the duration in seconds that will be read in.
- For this CNN to work, dur must be 2.5
"""

def readAudioFiles(d, dur, sample_rate):
    if d is None:
        d = 'dir'
        
    df = pd.DataFrame(columns=['feature'])
    file_names = []
    i = 0
    for audiofile in os.listdir(d):
        if audiofile.endswith('.wav'):
            # Load file using librosa
            print(audiofile, 'loaded')
            file_names.append(audiofile)
            X, sr = librosa.load(os.path.join(d, audiofile), res_type = 'kaiser_fast', duration = dur , sr = sample_rate, offset = 0.5)
            sr = np.array(sr)
            # Extract the MFCCS
            mfccs = np.mean(librosa.feature.mfcc(y = X, 
                                                sr = sr, 
                                                n_mfcc = 13),
                                                axis=0)
            feature = mfccs
            # Add to data frame
            df.loc[i] = [feature]
            i += 1
    df = pd.DataFrame(df['feature'].values.tolist())
    df = shuffle(df)
    df = df.fillna(0)
    return df, file_names 

In [12]:
audio_features, file_names = readAudioFiles(d = 'the-office-audio-clips', dur = 2.5, sample_rate = 44100)    

She_Said_Michael.wav loaded
Snake_Mongoose_Dwight_1.wav loaded
Bankruptcy_Michael_1.wav loaded
Don't_Touch_It_Michael_1.wav loaded
ID_badges_Dwight_1.wav loaded
Crackling_Bacon_Michael_1.wav loaded
Deposed_King_Michael_1.wav loaded
Stranger_On_Purpose_Dwight_1.wav loaded
Commited_Crime_Dwight_1.wav loaded
Ow_Oh_Not_Again_Dwight_1.wav loaded


In [13]:
audio_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
9,-18.07468,-19.676079,-21.052315,-21.596598,-21.580614,-23.643553,-24.15435,-22.210325,-20.224453,-20.864336,...,-18.627945,-20.423378,-19.527126,-19.974916,-19.125187,-20.035051,-19.512945,-19.762545,-13.051805,-5.390477
4,-7.514073,-6.662798,-6.897815,-7.844941,-7.306717,-8.897575,-9.387383,-10.066462,-11.570973,-12.838842,...,-19.882822,-19.376831,-21.699799,-22.639927,-23.941227,-25.620226,-24.350258,-24.918831,-24.986565,-24.331949
3,-21.345259,-20.299679,-22.164175,-23.63065,-22.702942,-24.027012,-25.012608,-24.492353,-22.514584,-20.75897,...,-48.629925,-48.629925,-48.629925,-48.629925,-48.629925,-48.629925,-48.629925,0.0,0.0,0.0
2,-9.027798,-13.212311,-21.835543,-21.280186,-22.559011,-22.05781,-24.048397,-25.234243,-25.34429,-25.535872,...,-47.674599,-47.674599,-47.674599,-47.674599,-47.674599,-47.674599,-47.674599,0.0,0.0,0.0
7,-8.611604,-9.100842,-10.840145,-10.781713,-11.992739,-16.734877,-21.976036,-22.458759,-21.910828,-23.604235,...,-24.854952,-23.881542,-22.850485,-25.252556,-23.397551,-24.862186,-27.039858,-24.97267,-24.112698,-24.691084


In [14]:
audio_features_cnn = np.expand_dims(audio_features, axis = 2)
audio_features_cnn

array([[[-18.07468033],
        [-19.6760788 ],
        [-21.05231476],
        ...,
        [-19.76254463],
        [-13.05180454],
        [ -5.3904767 ]],

       [[ -7.5140729 ],
        [ -6.6627984 ],
        [ -6.89781475],
        ...,
        [-24.91883087],
        [-24.98656464],
        [-24.33194923]],

       [[-21.34525871],
        [-20.2996788 ],
        [-22.16417503],
        ...,
        [  0.        ],
        [  0.        ],
        [  0.        ]],

       ...,

       [[ -9.31606483],
        [-13.10586452],
        [-18.90342712],
        ...,
        [-13.18312931],
        [-14.63031483],
        [-12.95053196]],

       [[ -4.44563627],
        [ -5.80927563],
        [ -8.32289028],
        ...,
        [-12.39449215],
        [-12.50139427],
        [-13.84210014]],

       [[ -1.72818065],
        [ -4.50054741],
        [-12.2768259 ],
        ...,
        [-15.89528751],
        [-16.40517426],
        [-16.7436142 ]]])

In [15]:
preds = loaded_model.predict(audio_features_cnn, 
                             batch_size = 32, 
                             verbose = 1)



In [16]:
def sumProbs(preds):
    file = []
    for i in range(preds.shape[1]):
        temp = []
        p_angry = preds[i][0] + preds[i][5]
        p_calm = preds[i][1] + preds[i][6]
        p_fearful = preds[i][2] + preds[i][7]
        p_happy = preds[i][3] + preds[i][8]
        p_sad = preds[i][4] + preds[i][9]
        temp.append(p_angry)
        temp.append(p_calm)
        temp.append(p_fearful)
        temp.append(p_happy)
        temp.append(p_sad)
        file.append(temp)
    return np.array(file)

In [17]:
new_preds = sumProbs(preds)
new_preds

array([[1.80200896e-05, 7.09981975e-08, 8.93785805e-02, 9.09717739e-01,
        8.85576243e-04],
       [1.02936802e-02, 7.61072649e-10, 9.56698835e-01, 1.93735231e-02,
        1.36339050e-02],
       [1.17096462e-07, 1.31424639e-16, 9.99628186e-01, 3.71638627e-04,
        1.44117305e-08],
       [2.59109016e-04, 1.32688547e-05, 6.12308204e-01, 3.10572147e-01,
        7.68472254e-02],
       [3.88132618e-03, 7.88469225e-14, 9.16556776e-01, 1.50372816e-08,
        7.95619488e-02],
       [1.86996260e-07, 6.40687454e-14, 3.64633068e-03, 2.91192322e-04,
        9.96062219e-01],
       [8.83149579e-02, 3.08007939e-20, 8.39964925e-13, 3.67914070e-03,
        9.08005953e-01],
       [3.56421369e-05, 9.37485367e-11, 9.62090135e-01, 1.01701244e-05,
        3.78639922e-02],
       [9.32094395e-01, 8.48155422e-13, 2.49118366e-05, 6.78804293e-02,
        3.09038313e-07],
       [4.60913358e-03, 5.71336242e-11, 9.93434608e-01, 1.92849291e-03,
        2.78348452e-05]], dtype=float32)

In [21]:
arg_max = new_preds.argmax(axis = 1)
print(arg_max)

[3 2 2 2 2 4 4 2 0 2]


In [25]:
emotions = {
    0: 'angry', 
    1: 'calm',
    2 : 'fearful',
    3 : 'happy',
    4 : 'sad',
}

def inverseTransform(preds, emotion_dict):
    decoded = []
    preds = preds.tolist()
    for i in range(len(preds)):
        key = preds[i]
        filename = file_names[i]
        val = emotion_dict[key]
        print('file name:', filename, '/', 'CNN prediction:', key, '/', 'predicted emotion:', val)
        decoded.append(val) 
    return filename, key, val
        

In [26]:
pred_emo = inverseTransform(arg_max, emotions)

file name: She_Said_Michael.wav / CNN prediction: 3 / predicted emotion: happy
file name: Snake_Mongoose_Dwight_1.wav / CNN prediction: 2 / predicted emotion: fearful
file name: Bankruptcy_Michael_1.wav / CNN prediction: 2 / predicted emotion: fearful
file name: Don't_Touch_It_Michael_1.wav / CNN prediction: 2 / predicted emotion: fearful
file name: ID_badges_Dwight_1.wav / CNN prediction: 2 / predicted emotion: fearful
file name: Crackling_Bacon_Michael_1.wav / CNN prediction: 4 / predicted emotion: sad
file name: Deposed_King_Michael_1.wav / CNN prediction: 4 / predicted emotion: sad
file name: Stranger_On_Purpose_Dwight_1.wav / CNN prediction: 2 / predicted emotion: fearful
file name: Commited_Crime_Dwight_1.wav / CNN prediction: 0 / predicted emotion: angry
file name: Ow_Oh_Not_Again_Dwight_1.wav / CNN prediction: 2 / predicted emotion: fearful
