In [1]:
# Import libraries 
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
import pandas as pd
import glob 
from sklearn.metrics import confusion_matrix
import IPython.display as ipd  # To play sound in the notebook
import os
import sys
import warnings
# Silence all warnings unless the interpreter was started with explicit -W options
if len(sys.warnoptions) == 0:
    warnings.simplefilter(action="ignore")
# DeprecationWarning is suppressed unconditionally
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

In [2]:
# Root folders of the four speech corpora, relative to this notebook.
# Each ends with '/' because later cells build file paths by string concatenation.
CREMA = "../DataSet/CREMA-D/AudioWAV/"
RAV = "../DataSet/RAVDESS/ravdess-emotional-speech-audio/"
SAVEE = "../DataSet/SAVEE/ALL/"
TESS = "../DataSet/TESS/TESS Toronto emotional speech set data/"

# Peek at the first few SAVEE file names to see the naming scheme
dir_list = os.listdir(SAVEE)
dir_list[:5]

['JK_sa01.wav', 'JK_sa15.wav', 'DC_n13.wav', 'DC_su09.wav', 'DC_n07.wav']

In [3]:
# Echo the RAVDESS root path as this cell's output
RAV

'../DataSet/RAVDESS/ravdess-emotional-speech-audio/'

### 1. SAVEE dataset
The audio files are named so that the letters following the speaker initials (for example, `sa` in `JK_sa01.wav`) identify the emotion class:

'a' = 'anger'
'd' = 'disgust'
'f' = 'fear'
'h' = 'happiness'
'n' = 'neutral'
'sa' = 'sadness'
'su' = 'surprise'

In [4]:
# The two characters in front of the 2-digit take number select the label:
# one-letter codes are matched together with the preceding underscore
# (e.g. '_a'), two-letter codes on their own ('sa', 'su').
# Every SAVEE file is given the 'male_' prefix in this notebook.
savee_codes = {
    '_a': 'male_angry',
    '_d': 'male_disgust',
    '_f': 'male_fear',
    '_h': 'male_happy',
    '_n': 'male_neutral',
    'sa': 'male_sad',
    'su': 'male_surprise',
}

# Anything that does not match a known code is tagged 'male_error'
emotion = [savee_codes.get(fname[-8:-6], 'male_error') for fname in dir_list]
path = [SAVEE + fname for fname in dir_list]

# Assemble labels / source / path, then check the label count distribution
SAVEE_df = pd.DataFrame({'labels': emotion})
SAVEE_df['source'] = 'SAVEE'
SAVEE_df['path'] = path
SAVEE_df.labels.value_counts()

male_neutral     120
male_sad          60
male_surprise     60
male_happy        60
male_angry        60
male_disgust      60
male_fear         60
Name: labels, dtype: int64

In [5]:
# RAVDESS file names are dash-separated numeric fields; this cell reads the
# 3rd field as the emotion code and the 7th as the actor id.
# Codes 1 and 2 are both mapped to 'neutral'.
ravdess_emotions = {
    1: 'neutral', 2: 'neutral', 3: 'happy', 4: 'sad',
    5: 'angry', 6: 'fear', 7: 'disgust', 8: 'surprise',
}

dir_list = sorted(os.listdir(RAV))

emotion = []
gender = []
path = []
for actor_dir in dir_list:
    actor_path = RAV + actor_dir
    # Skip stray files (e.g. OS metadata) that would make os.listdir() raise
    if not os.path.isdir(actor_path):
        continue
    for f in os.listdir(actor_path):
        # Only audio files follow the dash-separated naming scheme
        if not f.lower().endswith('.wav'):
            continue
        part = f.split('.')[0].split('-')
        # An unexpected emotion code raises KeyError rather than being mislabelled
        emotion.append(ravdess_emotions[int(part[2])])
        # Even actor ids are labelled female, odd ids male
        gender.append('female' if int(part[6]) % 2 == 0 else 'male')
        path.append(actor_path + '/' + f)

# Build the final labels / source / path frame in one step
RAV_df = pd.DataFrame({
    'labels': [g + '_' + e for g, e in zip(gender, emotion)],
    'source': 'RAVDESS',
    'path': path,
})
RAV_df.labels.value_counts()

male_neutral       144
female_neutral     144
female_angry        96
female_happy        96
male_angry          96
female_disgust      96
male_happy          96
male_fear           96
female_fear         96
female_sad          96
male_sad            96
male_surprise       96
female_surprise     96
male_disgust        96
Name: labels, dtype: int64

In [6]:
# List the TESS sub-folders in sorted order and show them
dir_list = sorted(os.listdir(TESS))
dir_list

['OAF_Fear',
 'OAF_Pleasant_surprise',
 'OAF_Sad',
 'OAF_angry',
 'OAF_disgust',
 'OAF_happy',
 'OAF_neutral',
 'YAF_angry',
 'YAF_disgust',
 'YAF_fear',
 'YAF_happy',
 'YAF_neutral',
 'YAF_pleasant_surprised',
 'YAF_sad']

In [7]:
# Folder name -> label. Folder names are inconsistently capitalised and
# spelled, so every known folder is listed explicitly.
tess_labels = {
    'OAF_angry': 'female_angry',
    'YAF_angry': 'female_angry',
    'OAF_disgust': 'female_disgust',
    'YAF_disgust': 'female_disgust',
    'OAF_Fear': 'female_fear',
    'YAF_fear': 'female_fear',
    'OAF_happy': 'female_happy',
    'YAF_happy': 'female_happy',
    'OAF_neutral': 'female_neutral',
    'YAF_neutral': 'female_neutral',
    'OAF_Pleasant_surprise': 'female_surprise',
    'YAF_pleasant_surprised': 'female_surprise',
    'OAF_Sad': 'female_sad',
    'YAF_sad': 'female_sad',
}

path = []
emotion = []
for folder in dir_list:
    # Every file in a folder shares the folder's label; unknown folders -> 'Unknown'
    label = tess_labels.get(folder, 'Unknown')
    fname = os.listdir(TESS + folder)
    for f in fname:
        emotion.append(label)
        path.append(TESS + folder + "/" + f)

# Assemble labels / source / path, then check the label count distribution
TESS_df = pd.DataFrame({'labels': emotion})
TESS_df['source'] = 'TESS'
TESS_df['path'] = path
TESS_df.labels.value_counts()

female_disgust     400
female_surprise    400
female_angry       400
female_happy       400
female_fear        400
female_sad         400
female_neutral     400
Name: labels, dtype: int64

In [8]:
# List the CREMA-D files in sorted order and print the first ten names
dir_list = sorted(os.listdir(CREMA))
print(dir_list[:10])

['1001_DFA_ANG_XX.wav', '1001_DFA_DIS_XX.wav', '1001_DFA_FEA_XX.wav', '1001_DFA_HAP_XX.wav', '1001_DFA_NEU_XX.wav', '1001_DFA_SAD_XX.wav', '1001_IEO_ANG_HI.wav', '1001_IEO_ANG_LO.wav', '1001_IEO_ANG_MD.wav', '1001_IEO_DIS_HI.wav']


In [9]:
gender = []
emotion = []
path = []
# Speaker ids treated as female; every other id is labelled male
female = [1002,1003,1004,1006,1007,1008,1009,1010,1012,1013,1018,1020,1021,1024,1025,1028,1029,1030,1037,1043,1046,1047,1049,
          1052,1053,1054,1055,1056,1058,1060,1061,1063,1072,1073,1074,1075,1076,1078,1079,1082,1084,1089,1091]

# 3-letter emotion code (3rd underscore-separated field) -> emotion name
crema_codes = {
    'SAD': 'sad',
    'ANG': 'angry',
    'DIS': 'disgust',
    'FEA': 'fear',
    'HAP': 'happy',
    'NEU': 'neutral',
}

for fname in dir_list:
    part = fname.split('_')
    # The first field is the numeric speaker id
    speaker_gender = 'female' if int(part[0]) in female else 'male'
    gender.append(speaker_gender)
    if part[2] in crema_codes:
        emotion.append(speaker_gender + '_' + crema_codes[part[2]])
    else:
        emotion.append('Unknown')
    path.append(CREMA + fname)

# Assemble labels / source / path, then check the label count distribution
CREMA_df = pd.DataFrame({'labels': emotion})
CREMA_df['source'] = 'CREMA'
CREMA_df['path'] = path
CREMA_df.labels.value_counts()

male_angry        671
male_sad          671
male_disgust      671
male_happy        671
male_fear         671
female_angry      600
female_disgust    600
female_happy      600
female_fear       600
female_sad        600
male_neutral      575
female_neutral    512
Name: labels, dtype: int64

In [10]:
# Stack the four per-corpus frames into one table. ignore_index=True gives the
# combined frame a fresh 0..n-1 index instead of repeating each corpus' own
# 0-based index.
df = pd.concat([SAVEE_df, RAV_df, TESS_df, CREMA_df], axis=0, ignore_index=True)
print(df.labels.value_counts())

# Persist labels / source / path so they can be reloaded from disk later
df.to_csv("Data_path.csv", index=False)

female_angry       1096
female_disgust     1096
female_happy       1096
female_fear        1096
female_sad         1096
female_neutral     1056
male_neutral        839
male_angry          827
male_sad            827
male_disgust        827
male_happy          827
male_fear           827
female_surprise     496
male_surprise       156
Name: labels, dtype: int64


In [11]:
import keras
from keras import regularizers
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, Embedding, LSTM
from keras.layers import Input, Flatten, Dropout, Activation, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint

# sklearn
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Other  
import librosa
import librosa.display
import json
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
import pandas as pd
import seaborn as sns
import glob 
import os
import pickle
import IPython.display as ipd  # To play sound in the notebook


Using TensorFlow backend.


In [12]:
# Reload the labels / source / path table written to Data_path.csv
ref = pd.read_csv("Data_path.csv")
ref.head(5)

Unnamed: 0,labels,source,path
0,male_sad,SAVEE,../DataSet/SAVEE/ALL/JK_sa01.wav
1,male_sad,SAVEE,../DataSet/SAVEE/ALL/JK_sa15.wav
2,male_neutral,SAVEE,../DataSet/SAVEE/ALL/DC_n13.wav
3,male_surprise,SAVEE,../DataSet/SAVEE/ALL/DC_su09.wav
4,male_neutral,SAVEE,../DataSet/SAVEE/ALL/DC_n07.wav


In [13]:
# Extract one MFCC-based feature vector per audio file listed in `ref`.
# Vectors are collected in a list and the DataFrame is built once at the end;
# appending with df.loc[...] inside the loop re-allocates the frame every row.
mfcc_rows = []
for wav_path in ref.path:
    # Load 2.5 s of audio starting 0.5 s into the file, resampled to 44.1 kHz
    X, sample_rate = librosa.load(
        wav_path,
        res_type='kaiser_fast',
        duration=2.5,
        sr=44100,
        offset=0.5,
    )

    # Compute 13 MFCCs and average over axis 0, leaving one 1-D vector per file.
    # Other statistics (min, max, ...) could be used instead of the mean.
    mfccs = np.mean(
        librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13),
        axis=0,
    )
    mfcc_rows.append(mfccs)

# One object column holding the (possibly different-length) vectors
df = pd.DataFrame({'feature': mfcc_rows})

# Check a few records to make sure they were processed successfully
print(len(df))
df.head()

12162


Unnamed: 0,feature
0,"[-23.589674, -24.579994, -22.594236, -21.48121..."
1,"[-23.371296, -24.25871, -24.391262, -23.972637..."
2,"[-3.8807042, -5.170106, -8.079335, -7.4936047,..."
3,"[-7.7042007, -6.693756, -7.5397215, -8.864786,..."
4,"[-4.9374447, -5.9628124, -9.859098, -8.692251,..."


In [15]:
# Spread each feature vector over its own columns (0, 1, 2, ...) next to the
# metadata from `ref`; shorter vectors leave NaN gaps, which are filled with 0.
feature_table = pd.DataFrame(df['feature'].tolist())
df = pd.concat([ref, feature_table], axis=1).fillna(0)
print(df.shape)
df[:5]

(12162, 219)


Unnamed: 0,labels,source,path,0,1,2,3,4,5,6,...,206,207,208,209,210,211,212,213,214,215
0,male_sad,SAVEE,../DataSet/SAVEE/ALL/JK_sa01.wav,-23.589674,-24.579994,-22.594236,-21.481213,-20.949923,-20.414589,-20.267546,...,-8.499668,-10.080903,-12.700766,-17.040066,-20.24037,-23.302591,-24.621037,-23.829395,-12.847005,-5.907684
1,male_sad,SAVEE,../DataSet/SAVEE/ALL/JK_sa15.wav,-23.371296,-24.258711,-24.391262,-23.972637,-23.494141,-24.208841,-25.631187,...,-8.738687,-8.822194,-8.977812,-9.998901,-15.77799,-22.670012,-23.585886,-24.138811,-26.113234,-27.176575
2,male_neutral,SAVEE,../DataSet/SAVEE/ALL/DC_n13.wav,-3.880704,-5.170106,-8.079335,-7.493605,-7.611509,-5.591492,-4.388686,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,male_surprise,SAVEE,../DataSet/SAVEE/ALL/DC_su09.wav,-7.704201,-6.693756,-7.539721,-8.864786,-8.661815,-8.826547,-9.143904,...,-25.902903,-25.006645,-24.709747,-25.51671,-26.94138,-25.354641,-25.213074,-27.60746,-25.052874,-22.871309
4,male_neutral,SAVEE,../DataSet/SAVEE/ALL/DC_n07.wav,-4.937445,-5.962812,-9.859098,-8.692251,-8.685309,-8.844451,-8.032232,...,-11.438392,-11.691319,-11.480921,-10.730117,-9.891499,-9.329518,-8.907435,-8.881425,-8.315574,-7.196213


In [16]:
# Keep only the numeric feature columns as model input; 'labels' is the target
feature_cols = df.drop(columns=['path', 'labels', 'source'])

# Shuffled 75/25 split with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    feature_cols,
    df.labels,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

# Let's see how the data looks before normalisation
X_train[150:160]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
4950,-18.611179,-17.616539,-18.411484,-18.987419,-17.404621,-16.747272,-17.733747,-18.055025,-17.93121,-15.913172,...,-22.899403,-21.647816,-19.758656,-18.879402,-19.397377,-20.171659,-22.689243,-24.612814,-24.153776,-22.703135
3860,-17.820063,-20.567936,-22.585173,-22.175566,-20.658344,-17.975264,-15.879942,-18.664234,-21.569914,-25.008776,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9761,-1.533947,-4.030602,-9.614023,-12.045173,-9.992992,-11.92625,-14.008465,-13.561555,-14.024568,-15.151947,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7620,-4.531077,-3.933792,-4.567834,-5.871509,-5.282475,-6.490459,-8.156466,-9.188803,-8.681725,-8.212409,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11586,-20.621702,-21.587507,-20.563646,-20.703459,-21.205715,-18.608534,-18.446669,-16.211845,-14.257651,-15.160404,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7914,-17.514988,-18.551867,-17.043016,-16.977903,-19.369633,-19.562126,-22.008749,-20.178385,-17.989597,-19.336285,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9513,-18.740368,-18.82493,-16.149488,-16.963457,-18.229979,-18.183954,-19.274342,-18.395123,-16.951286,-16.672031,...,-17.88213,-19.390713,-17.779472,-19.165974,0.0,0.0,0.0,0.0,0.0,0.0
5835,-19.066849,-18.328381,-17.710285,-18.043192,-18.25248,-18.710625,-16.626352,-17.831005,-18.028343,-17.859106,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5389,-20.76059,-20.047138,-18.961346,-19.468687,-19.316292,-18.162563,-18.102333,-19.914133,-20.931385,-19.215496,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11222,-18.252924,-17.727373,-19.222475,-18.469971,-17.572325,-17.850542,-17.932026,-20.5889,-18.612183,-15.990726,...,-18.065437,-18.13509,-19.665306,-20.741905,-20.273037,-18.371035,-15.576723,-17.512489,-17.008547,-18.195284


In [17]:
# Per-column mean and population standard deviation (ddof=0), taken from the
# training split only and then applied to both splits
mean = X_train.mean(axis=0)
std = X_train.std(axis=0, ddof=0)

X_train = (X_train - mean) / std
X_test = (X_test - mean) / std

# Check the dataset now
X_train[150:160]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
4950,0.186263,0.303048,0.437488,0.388585,0.498587,0.542016,0.460983,0.432405,0.435681,0.581124,...,-0.875892,-0.806953,-0.687441,-0.627255,-0.672991,-0.715326,-0.85805,-0.981787,-0.964725,-0.873013
3860,0.241323,0.089489,0.128663,0.153172,0.258718,0.451531,0.597264,0.387662,0.168481,-0.086141,...,0.540018,0.526282,0.525535,0.526482,0.511615,0.511544,0.512207,0.500545,0.495891,0.491863
9761,1.374803,1.286108,1.088442,0.901201,1.044982,0.897253,0.734844,0.762426,0.722556,0.636969,...,0.540018,0.526282,0.525535,0.526482,0.511615,0.511544,0.512207,0.500545,0.495891,0.491863
7620,1.166209,1.293113,1.461826,1.357065,1.392247,1.29779,1.16505,1.08358,1.114895,1.146062,...,0.540018,0.526282,0.525535,0.526482,0.511615,0.511544,0.512207,0.500545,0.495891,0.491863
11586,0.046335,0.015714,0.278242,0.261873,0.218366,0.404869,0.408573,0.567777,0.70544,0.636348,...,0.540018,0.526282,0.525535,0.526482,0.511615,0.511544,0.512207,0.500545,0.495891,0.491863
7914,0.262556,0.235369,0.538746,0.536968,0.353724,0.334603,0.146709,0.276456,0.431393,0.33,...,0.540018,0.526282,0.525535,0.526482,0.511615,0.511544,0.512207,0.500545,0.495891,0.491863
9513,0.177272,0.21561,0.604861,0.538035,0.437741,0.436154,0.347727,0.407427,0.507639,0.525454,...,-0.565665,-0.667944,-0.56594,-0.644768,0.511615,0.511544,0.512207,0.500545,0.495891,0.491863
5835,0.154549,0.25154,0.489372,0.458307,0.436082,0.397346,0.542392,0.448858,0.428548,0.438368,...,0.540018,0.526282,0.525535,0.526482,0.511615,0.511544,0.512207,0.500545,0.495891,0.491863
5389,0.036668,0.127173,0.396802,0.353048,0.357656,0.43773,0.433887,0.295864,0.21537,0.338862,...,0.540018,0.526282,0.525535,0.526482,0.511615,0.511544,0.512207,0.500545,0.495891,0.491863
11222,0.211197,0.295028,0.37748,0.426793,0.486224,0.460721,0.446407,0.246306,0.385675,0.575435,...,-0.576999,-0.590613,-0.681711,-0.741075,-0.726468,-0.605809,-0.428508,-0.554162,-0.532642,-0.602008


In [18]:
# Min-max scale every feature column and shift it to be centred on zero.
#
# The column-wise min/max are taken from the TRAINING split only and reused for
# the test split, so both splits go through the same transformation. Test values
# outside the training range can therefore fall slightly outside [-0.5, 0.5].
#
# axis=0 is passed explicitly: np.max(DataFrame) without an axis returns a
# single scalar on pandas >= 2.0, which would silently replace the per-column
# scaling with a global one.
max_data = X_train.max(axis=0)
min_data = X_train.min(axis=0)
# The small epsilon avoids division by zero for constant columns
scale = max_data - min_data + 1e-6

X_train = (X_train - min_data) / scale - 0.5
X_test = (X_test - min_data) / scale - 0.5

X_train[150:160]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
4950,0.225976,0.260507,0.229571,0.223734,0.248347,0.271944,0.273993,0.271421,0.271275,0.317243,...,0.194599,0.211055,0.238353,0.252237,0.235715,0.238385,0.214614,0.171467,0.164676,0.14013
3860,0.23437,0.22817,0.185194,0.189823,0.213375,0.258457,0.2947,0.264607,0.23066,0.212718,...,0.492531,0.492602,0.486273,0.492898,0.491642,0.495417,0.5,0.494098,0.472364,0.410232
9761,0.407182,0.40936,0.32311,0.297577,0.328011,0.324897,0.315603,0.321677,0.314882,0.325991,...,0.492531,0.492602,0.486273,0.492898,0.491642,0.495417,0.5,0.494098,0.472364,0.410232
7620,0.37538,0.410421,0.376764,0.363244,0.378641,0.384602,0.380968,0.370583,0.37452,0.405739,...,0.492531,0.492602,0.486273,0.492898,0.491642,0.495417,0.5,0.494098,0.472364,0.410232
11586,0.204642,0.217,0.206688,0.205481,0.207492,0.251501,0.26603,0.292036,0.31228,0.325894,...,0.492531,0.492602,0.486273,0.492898,0.491642,0.495417,0.5,0.494098,0.472364,0.410232
7914,0.237608,0.250259,0.244121,0.245109,0.227227,0.241027,0.226243,0.247673,0.270624,0.277905,...,0.492531,0.492602,0.486273,0.492898,0.491642,0.495417,0.5,0.494098,0.472364,0.410232
9513,0.224605,0.247267,0.253622,0.245262,0.239476,0.256164,0.256786,0.267617,0.282214,0.308522,...,0.259876,0.240411,0.263187,0.248584,0.491642,0.495417,0.5,0.494098,0.472364,0.410232
5835,0.221141,0.252708,0.237027,0.233777,0.239234,0.25038,0.286362,0.273926,0.270191,0.294881,...,0.492531,0.492602,0.486273,0.492898,0.491642,0.495417,0.5,0.494098,0.472364,0.410232
5389,0.203168,0.233876,0.223725,0.218615,0.2278,0.256399,0.269876,0.250628,0.237787,0.279293,...,0.492531,0.492602,0.486273,0.492898,0.491642,0.495417,0.5,0.494098,0.472364,0.410232
11222,0.229777,0.259293,0.220948,0.229238,0.246545,0.259827,0.271779,0.243081,0.263674,0.316352,...,0.257491,0.256741,0.239524,0.228495,0.224162,0.261329,0.304076,0.26454,0.255697,0.19376


In [19]:
# Convert the pandas objects to plain NumPy arrays for Keras
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

# One-hot encode the target.
# The encoder is fitted on the training labels only and then *applied* to the
# test labels; re-fitting on y_test would assign different integer codes
# whenever a class is missing from the test split.
lb = LabelEncoder()
y_train = np_utils.to_categorical(lb.fit_transform(y_train))
# num_classes pins the one-hot width to the encoder's class count, so y_test has
# the same number of columns even if the highest-coded class is absent from it.
y_test = np_utils.to_categorical(lb.transform(y_test), num_classes=len(lb.classes_))

print(X_train.shape)
print(lb.classes_)

# Pickle the lb object for future use; the context manager closes the file
# even if pickling fails.
filename = 'labels'
with open(filename, 'wb') as outfile:
    pickle.dump(lb, outfile)

(9121, 216)
['female_angry' 'female_disgust' 'female_fear' 'female_happy'
 'female_neutral' 'female_sad' 'female_surprise' 'male_angry'
 'male_disgust' 'male_fear' 'male_happy' 'male_neutral' 'male_sad'
 'male_surprise']


In [20]:
# Append a trailing axis of length 1: (samples, features) -> (samples, features, 1)
X_train = X_train[..., np.newaxis]
X_test = X_test[..., np.newaxis]
X_train.shape

(9121, 216, 1)