In [1]:
cd F:\MTP

F:\MTP


In [2]:
import os
import torch
from collections import defaultdict
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf

In [3]:
path = os.listdir('EmoDB/wav')

We extract speaker embeddings(x-vectors) with a pretrained TDNN model using SpeechBrain from each utternace with a size of 512.

In [4]:
import torchaudio
from speechbrain.pretrained import EncoderClassifier
classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-xvect-voxceleb", savedir="pretrained_models/spkrec-xvect-voxceleb")

Each utterance from the database is name according to the scheme :
    Positions 1-2: number of speaker
    Positions 3-5: code for text
    Position 6: emotion (sorry, letter stands for german emotion word)
    Position 7: if there are more than two versions these are numbered a, b, c....

Example: 03a01Fa.wav is the audio file from Speaker 03 speaking text a01 with
the emotion "Freude" (Happiness in German).

So now,we extract the emotion from the filenames and consider it as y in our model.

In [5]:
emotion_code = {
    'W':0, #anger
    'L':1, #boredom
    'E':2, #disgust
    'A':3, #fear
    'F':4, #happy
    'T':5, #sad
    'N':6  #neutral
}

In [6]:
x_vec_size = 512
X = torch.zeros([len(path),x_vec_size])
y = torch.zeros([len(path),7],dtype = int)
for i in range(len(path)):
    signal, fs =torchaudio.load('EmoDB/wav/'+path[i])
    embeddings = classifier.encode_batch(signal)
    X[i] = embeddings[0][0]
    y[i][emotion_code[path[i][5]]]=1

In [7]:
print(X.shape,y.shape)

torch.Size([535, 512]) torch.Size([535, 7])


In [8]:
print(y[0])

tensor([0, 0, 0, 0, 1, 0, 0])


Convert them into numpy arrays so that we can train them easily

In [9]:
X=X.numpy()
y=y.numpy()

In [10]:
dict_X=defaultdict(list)
dict_y=defaultdict(list)
for i in range(len(path)):
    dict_X[path[i][:2]].append(X[i])
    dict_y[path[i][:2]].append(y[i])

To make this evaluation speaker independent, we separate the data speaker-wise and use leave-one-out validation.

In [11]:
dict = {0:"03",1:"08",2:"09",3:"10",4:"11",5:"12",6:"13",7:"14",8:"15",9:"16"}

In [12]:
#requirements


from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn import metrics
from sklearn.metrics import accuracy_score

Since the x-vectors are extracted before the last layer from a pretrained model, we just add softmax layer to get the output layer.

In [13]:
def train_and_test(X,y,X_test,y_test):
    model = Sequential()
    model.add(Dense(7, input_shape=(len(X[0]),)))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')
    model.fit(X,y,batch_size=20, epochs=200)
    predict_y = model.predict(X_test)
    predicted_test = predict_y
    np.where(np.argmax(predict_y[:]),1,0)
    for i in range(len(predict_y)):
        temp = np.argmax(predict_y[i])
        predict_y[i] = np.zeros((1,7))
        predict_y[i][temp] = 1
    acc=accuracy_score(y_test,predict_y)
    print(acc)
    return acc

In [None]:
temp = 9
acc = 0
accuracies = [0]*10
for i in range(10):
    a = list((dict_X[dict[i]] for i in range(10) if i != temp))
    b = list((dict_y[dict[i]] for i in range(10) if i != temp))
    X = np.concatenate(a)
    X_test = np.array(dict_X[dict[temp]])
    y_test = np.array(dict_y[dict[temp]])
    y = np.concatenate(b)
    temp -= 1
    print(X.shape,y.shape,X_test.shape,y_test.shape)
    temp_acc = train_and_test(X,y,X_test,y_test)
    accuracies[temp]=temp_acc
    print(f'Accuracy for iteration: {temp_acc}')
    acc += temp_acc 

leave-one-speaker-out validation accuracies

In [15]:
accuracies

[0.5862068965517241,
 0.7209302325581395,
 0.7368421052631579,
 0.6909090909090909,
 0.6,
 0.6885245901639344,
 0.8840579710144928,
 0.6964285714285714,
 0.8450704225352113,
 0.7959183673469388]

In [16]:
print("Overall accuracy :"+ str(acc*10)+"%" )

Overall accuracy :72.4488824777126%


In [17]:
M31_X,M31_y=np.array(dict_X["03"]),np.array(dict_y["03"])
F34_X,F34_y=np.array(dict_X["08"]),np.array(dict_y["08"])
F21_X,F21_y=np.array(dict_X["09"]),np.array(dict_y["09"])
M32_X,M32_y=np.array(dict_X["10"]),np.array(dict_y["10"])
M26_X,M26_y=np.array(dict_X["11"]),np.array(dict_y["11"])
M30_X,M30_y=np.array(dict_X["12"]),np.array(dict_y["12"])
F32_X,F32_y=np.array(dict_X["13"]),np.array(dict_y["13"])
F35_X,F35_y=np.array(dict_X["14"]),np.array(dict_y["14"])
M25_X,M25_y=np.array(dict_X["15"]),np.array(dict_y["15"])
F31_X,F31_y=np.array(dict_X["16"]),np.array(dict_y["16"])