In [1]:
cd F:/MTP

F:\MTP


In [2]:
import os
path = os.listdir('EmoDB/wav')

In [3]:
emotion_code = {
    'W':0, #anger
    'L':1, #boredom
    'E':2, #disgust
    'A':3, #fear
    'F':4, #happy
    'T':5, #sad
    'N':6  #neutral
}

speaker_code = {
    '03':0,
    '08':1,
    '09':2,
    '10':3,
    '11':4,
    '12':5,
    '13':6,
    '14':7,
    '15':8,
    '16':9
}

gender_code = {
    '03':0,
    '08':1,
    '09':1,
    '10':0,
    '11':0,
    '12':0,
    '13':1,
    '14':1,
    '15':0,
    '16':1
}

In [4]:
spk_id = {0:"03",1:"08",2:"09",3:"10",4:"11",5:"12",6:"13",7:"14",8:"15",9:"16"}

In [5]:
import librosa
import numpy as np

In [6]:
import torchaudio
import torch

In [7]:
from torch.utils.data import Dataset, DataLoader

In [132]:
class TrainDataset(Dataset):

    def __init__(self, dir_path, test_key,valid_key, transform=None):
        self.dir_path = dir_path
        self.files = os.listdir(self.dir_path)
        self.test_key = test_key
        self.valid_key = valid_key
        self.melspecs,self.Y = self.loadData(dir_path)
        print(len(self.melspecs),len(self.Y))
        
    def loadData(self,dir_path):
        files = os.listdir(dir_path)
        train_keys = list((spk_id[i] for i in range(10) if i not in [self.test_key,self.valid_key]))
        melspecs = []
        Y = []
        for key in train_keys:
            for file in files:
                if file[:2]==key:
                    r, sr = librosa.load(dir_path + file, res_type='kaiser_fast')
                    melspec = librosa.feature.melspectrogram(y=r, sr=sr,n_fft = 512, hop_length=160, win_length=320,n_mels=24)
                    temp = self.chunk(torch.Tensor(melspec))
                    melspecs.extend(temp)
                    for _ in range(len(temp)):
                        y = torch.zeros(7,dtype = int)
                        y[emotion_code[file[5]]]=1
                        Y.append(y)
        return melspecs,Y
    
    def chunk(self,melspec):
        melspec = melspec.transpose(0,1)
        res = []
        for i in range(0,melspec.size(0),50):
            temp = melspec[i:i+100,:]
            if temp.size(0)==100:
                res.append(temp)
        return res        
        
    def __len__(self):
        return len(self.melspecs)

    def __getitem__(self, idx):
        return self.melspecs[idx],self.Y[idx]

In [133]:
ds = TrainDataset("EmoDB/wav/",0,1)

2629 2629


In [32]:
train_dataloader = DataLoader(ds, batch_size=4,shuffle=True)

In [34]:
len(train_dataloader)

658

In [28]:
class ValidDataset(Dataset):

    def __init__(self, dir_path,valid_key, transform=None):
        self.dir_path = dir_path
        self.files = os.listdir(self.dir_path)
        self.valid_key = valid_key
        self.melspecs,self.Y = self.loadData(dir_path)
        print(len(self.melspecs),len(self.Y))
        
    def loadData(self,dir_path):
        files = os.listdir(dir_path)
        melspecs = []
        Y = []
        for file in files:
            if file[:2]==spk_id[self.valid_key]:
                r, sr = librosa.load(dir_path + file, res_type='kaiser_fast')
                melspec = librosa.feature.melspectrogram(y=r, sr=sr,n_fft = 512, hop_length=160, win_length=320,n_mels=24)
                temp = self.chunk(torch.Tensor(melspec))
                melspecs.extend(temp)
                for _ in range(len(temp)):
                    y = torch.zeros(7,dtype = int)
                    y[emotion_code[file[5]]]=1
                    Y.append(y)
        return melspecs,Y
    
    def chunk(self,melspec):
        melspec = melspec.transpose(0,1)
        res = []
        for i in range(0,melspec.size(0),50):
            temp = melspec[i:i+100,:]
            if temp.size(0)==100:
                res.append(temp)
        return res        
        
    def __len__(self):
        return len(self.melspecs)

    def __getitem__(self, idx):
        return self.melspecs[idx],self.Y[idx]

In [29]:
validset = ValidDataset("EmoDB/wav/",1)

399 399


In [30]:
val_dataloader = DataLoader(ds, batch_size=4,shuffle=True)

In [31]:
len(val_dataloader)

658

In [12]:
class TestDataset(Dataset):

    def __init__(self, dir_path, test_key, transform=None):
        self.dir_path = dir_path
        self.files = os.listdir(self.dir_path)
        self.test_key = test_key
        self.melspecs,self.Y = self.loadData(dir_path)
        print(len(self.melspecs),len(self.Y))
        
    def loadData(self,dir_path):
        files = os.listdir(dir_path)
        melspecs = []
        Y = []
        key = spk_id[self.test_key]
        for file in files:
            if file[:2]==key:
                r, sr = librosa.load(dir_path + file, res_type='kaiser_fast')
                melspec = librosa.feature.melspectrogram(y=r, sr=sr,n_fft = 512, hop_length=160, win_length=320,n_mels=24)
                melspec = melspec.transpose()
                melspecs.append(melspec)
                y = torch.zeros(7,dtype = int)
                y[emotion_code[file[5]]]=1
                Y.append(y)
        return melspecs,Y
   
    def __len__(self):
        return len(self.melspecs)

    def __getitem__(self, idx):
        return self.melspecs[idx],self.Y[idx]

In [13]:
testset = TestDataset("EmoDB/wav/",0)

49 49


In [15]:
testloader = DataLoader(testset, batch_size=1,shuffle=True)

In [16]:
import torch.nn as nn
import torch.nn.functional as F

class TDNN(nn.Module):
    
    def __init__(
                    self, 
                    input_dim=23, 
                    output_dim=512,
                    context_size=5,
                    stride=1,
                    dilation=1,
                    batch_norm=False,
                    dropout_p=0.2
                ):
        super(TDNN, self).__init__()
        self.context_size = context_size
        self.stride = stride
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.dilation = dilation
        self.dropout_p = dropout_p
        self.batch_norm = batch_norm
      
        self.kernel = nn.Linear(input_dim*context_size, output_dim)
        self.nonlinearity = nn.ReLU()
        if self.batch_norm:
            self.bn = nn.BatchNorm1d(output_dim)
        if self.dropout_p:
            self.drop = nn.Dropout(p=self.dropout_p)
        
    def forward(self, x):
        
        
        _, _, d = x.shape
        assert (d == self.input_dim), 'Input dimension was wrong. Expected ({}), got ({})'.format(self.input_dim, d)
        x = x.unsqueeze(1)

        # Unfold input into smaller temporal contexts
        x = F.unfold(
                        x, 
                        (self.context_size, self.input_dim), 
                        stride=(1,self.input_dim), 
                        dilation=(self.dilation,1)
                    )

        # N, output_dim*context_size, new_t = x.shape
        x = x.transpose(1,2)
        x = self.kernel(x.float())
        x = self.nonlinearity(x)
        
        if self.dropout_p:
            x = self.drop(x)

        if self.batch_norm:
            x = x.transpose(1,2)
            x = self.bn(x)
            x = x.transpose(1,2)

        return x
import torch.nn as nn
# from models.tdnn import TDNN
import torch
import torch.nn.functional as F

class X_vector(nn.Module):
    def __init__(self, input_dim = 24, num_classes=7):
        super(X_vector, self).__init__()
        self.tdnn1 = TDNN(input_dim=input_dim, output_dim=512, context_size=5, dilation=1,dropout_p=0.5)
        self.tdnn2 = TDNN(input_dim=512, output_dim=512, context_size=3, dilation=1,dropout_p=0.5)
        self.tdnn3 = TDNN(input_dim=512, output_dim=512, context_size=2, dilation=2,dropout_p=0.5)
        self.tdnn4 = TDNN(input_dim=512, output_dim=512, context_size=1, dilation=1,dropout_p=0.5)
        self.tdnn5 = TDNN(input_dim=512, output_dim=512, context_size=1, dilation=3,dropout_p=0.5)
        #### Frame levelPooling
        self.segment6 = nn.Linear(1024, 512)
        self.segment7 = nn.Linear(512, 512)
        self.output = nn.Linear(512, num_classes)
#         self.softmax = nn.Softmax(dim=1)
    def forward(self, inputs):
        tdnn1_out = self.tdnn1(inputs)
#         return tdnn1_out
        tdnn2_out = self.tdnn2(tdnn1_out)
        tdnn3_out = self.tdnn3(tdnn2_out)
        tdnn4_out = self.tdnn4(tdnn3_out)
        tdnn5_out = self.tdnn5(tdnn4_out)
        ### Stat Pool
        mean = torch.mean(tdnn5_out,1)
        std = torch.std(tdnn5_out,1)
        stat_pooling = torch.cat((mean,std),1)
        segment6_out = self.segment6(stat_pooling)
        x_vec = self.segment7(segment6_out)
        predictions = self.output(x_vec)
        return predictions

In [18]:
input_feats = [4,100,24]
input = torch.rand(input_feats)
model = X_vector()
out = model(input)

In [19]:
out

tensor([[ 0.0430,  0.0629, -0.0228,  0.0610,  0.0439, -0.0419, -0.0279],
        [ 0.0423,  0.0624, -0.0229,  0.0619,  0.0451, -0.0408, -0.0308],
        [ 0.0442,  0.0637, -0.0208,  0.0601,  0.0420, -0.0420, -0.0305],
        [ 0.0437,  0.0630, -0.0218,  0.0619,  0.0421, -0.0409, -0.0313]],
       grad_fn=<AddmmBackward>)

In [48]:
for layer in model.children():
   if hasattr(layer, 'reset_parameters'):
       layer.reset_parameters()

In [21]:
import torch.optim as optim
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0, betas=(0.9, 0.98), eps=1e-9)
criterion = nn.CrossEntropyLoss()

In [46]:
def train(train_dataloader,epoch):
    running_loss = 0.0
    train_loss_list=[]
    model.train()
    for i, data in enumerate(train_dataloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        inputs.requires_grad = True
        # zero the parameter gradients
        optimizer.zero_grad()
        labels = torch.argmax(labels,dim =1)
        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss_list.append(loss.item())
        mean_loss = np.mean(np.asarray(train_loss_list))
        if i%100==0:
            print('Iteration - {} Epoch - {} Total training loss - {} '.format(i,epoch,mean_loss))
            
def validation(valid_dataloader,epoch):
    model.eval()
    with torch.no_grad():
        val_loss_list=[]
        for i, data in enumerate(valid_dataloader, 0):
            inputs, labels = data
            labels = torch.argmax(labels,dim =1)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss_list.append(loss.item())
            if i%100==0:
                print('Iteration - {} Epoch - {} Loss - {}'.format(i,epoch,np.mean(np.asarray(val_loss_list))))
                
        mean_loss = np.mean(np.asarray(val_loss_list))
        print('Total validation loss {} after {} epochs'.format(mean_loss,epoch))
        model_save_path = os.path.join( 'best_check_point_'+str(epoch)+'_'+str(mean_loss))
        state_dict = {'model': model.state_dict(),'optimizer': optimizer.state_dict(),'epoch': epoch}
        torch.save(state_dict, model_save_path)


In [49]:
for epoch in range(100):
    train(train_dataloader,epoch)
    validation(val_dataloader,epoch)

Iteration - 0 Epoch - 0 Total training loss - 1.932255744934082 
Iteration - 100 Epoch - 0 Total training loss - 1.3132809970048394 
Iteration - 200 Epoch - 0 Total training loss - 1.2890096272513345 
Iteration - 300 Epoch - 0 Total training loss - 1.2668476909572302 
Iteration - 400 Epoch - 0 Total training loss - 1.28971478802865 
Iteration - 500 Epoch - 0 Total training loss - 1.26285388321428 
Iteration - 600 Epoch - 0 Total training loss - 1.256680853001637 
Iteration - 0 Epoch - 0 Loss - 1.2015089988708496
Iteration - 100 Epoch - 0 Loss - 1.0766516071144898
Iteration - 200 Epoch - 0 Loss - 1.0732556615599351
Iteration - 300 Epoch - 0 Loss - 1.0746253565498365
Iteration - 400 Epoch - 0 Loss - 1.0840016880683472
Iteration - 500 Epoch - 0 Loss - 1.086331317346253
Iteration - 600 Epoch - 0 Loss - 1.0843947761368236
Total validation loss 1.0867940213723748 after 0 epochs
Iteration - 0 Epoch - 1 Total training loss - 2.2937402725219727 
Iteration - 100 Epoch - 1 Total training loss - 1

Iteration - 300 Epoch - 9 Total training loss - 0.9659724588137727 
Iteration - 400 Epoch - 9 Total training loss - 0.9537861545725802 
Iteration - 500 Epoch - 9 Total training loss - 0.9362265076004935 
Iteration - 600 Epoch - 9 Total training loss - 0.9701674697415503 
Iteration - 0 Epoch - 9 Loss - 1.0101689100265503
Iteration - 100 Epoch - 9 Loss - 0.9896002311517696
Iteration - 200 Epoch - 9 Loss - 0.976959256983515
Iteration - 300 Epoch - 9 Loss - 0.9644247058321075
Iteration - 400 Epoch - 9 Loss - 0.9574731065522406
Iteration - 500 Epoch - 9 Loss - 0.9622777105864888
Iteration - 600 Epoch - 9 Loss - 0.9564418919669609
Total validation loss 0.9551710378163492 after 9 epochs
Iteration - 0 Epoch - 10 Total training loss - 0.7006010413169861 
Iteration - 100 Epoch - 10 Total training loss - 0.9483041629047677 
Iteration - 200 Epoch - 10 Total training loss - 0.9101804855895873 
Iteration - 300 Epoch - 10 Total training loss - 0.901070951425356 
Iteration - 400 Epoch - 10 Total train

Iteration - 400 Epoch - 18 Total training loss - 1.3977029264075427 
Iteration - 500 Epoch - 18 Total training loss - 1.2848768669353452 
Iteration - 600 Epoch - 18 Total training loss - 1.1970177485627709 
Iteration - 0 Epoch - 18 Loss - 1.3501555919647217
Iteration - 100 Epoch - 18 Loss - 0.8764643058328345
Iteration - 200 Epoch - 18 Loss - 0.8728582899072277
Iteration - 300 Epoch - 18 Loss - 0.8648361109716948
Iteration - 400 Epoch - 18 Loss - 0.8733248997879147
Iteration - 500 Epoch - 18 Loss - 0.8844458882263797
Iteration - 600 Epoch - 18 Loss - 0.8818372419590561
Total validation loss 0.881375299896875 after 18 epochs
Iteration - 0 Epoch - 19 Total training loss - 0.9488004446029663 
Iteration - 100 Epoch - 19 Total training loss - 0.8663161632669444 
Iteration - 200 Epoch - 19 Total training loss - 0.9176895846913925 
Iteration - 300 Epoch - 19 Total training loss - 0.9128416891478968 
Iteration - 400 Epoch - 19 Total training loss - 0.9046306170707805 
Iteration - 500 Epoch - 1

Iteration - 500 Epoch - 27 Total training loss - 1.2958721914899094 
Iteration - 600 Epoch - 27 Total training loss - 1.2373272276063334 
Iteration - 0 Epoch - 27 Loss - 1.1655679941177368
Iteration - 100 Epoch - 27 Loss - 0.911109330335466
Iteration - 200 Epoch - 27 Loss - 0.9623365947485563
Iteration - 300 Epoch - 27 Loss - 0.9516985027546503
Iteration - 400 Epoch - 27 Loss - 0.9426370448713886
Iteration - 500 Epoch - 27 Loss - 0.9455263527359316
Iteration - 600 Epoch - 27 Loss - 0.9471932181626707
Total validation loss 0.9471208674317979 after 27 epochs
Iteration - 0 Epoch - 28 Total training loss - 0.1323520541191101 
Iteration - 100 Epoch - 28 Total training loss - 0.7890179075190041 
Iteration - 200 Epoch - 28 Total training loss - 0.8120794391481957 
Iteration - 300 Epoch - 28 Total training loss - 0.8860519419204168 
Iteration - 400 Epoch - 28 Total training loss - 0.90216447646887 
Iteration - 500 Epoch - 28 Total training loss - 0.8727242993947235 
Iteration - 600 Epoch - 28 

Iteration - 600 Epoch - 36 Total training loss - 1.481706055567201 
Iteration - 0 Epoch - 36 Loss - 0.8200722932815552
Iteration - 100 Epoch - 36 Loss - 1.0724506667344877
Iteration - 200 Epoch - 36 Loss - 1.0760306033922071
Iteration - 300 Epoch - 36 Loss - 1.0625822692614457
Iteration - 400 Epoch - 36 Loss - 1.0837521129415517
Iteration - 500 Epoch - 36 Loss - 1.0710165798366664
Iteration - 600 Epoch - 36 Loss - 1.0681513630461177
Total validation loss 1.0701742943795256 after 36 epochs
Iteration - 0 Epoch - 37 Total training loss - 0.8585156202316284 
Iteration - 100 Epoch - 37 Total training loss - 1.0162152769320671 
Iteration - 200 Epoch - 37 Total training loss - 0.9221201275049046 
Iteration - 300 Epoch - 37 Total training loss - 0.9186782637546297 
Iteration - 400 Epoch - 37 Total training loss - 0.955815564517642 
Iteration - 500 Epoch - 37 Total training loss - 1.0038997413798199 
Iteration - 600 Epoch - 37 Total training loss - 1.0016114688835913 
Iteration - 0 Epoch - 37 L

Iteration - 0 Epoch - 45 Loss - 1.102959156036377
Iteration - 100 Epoch - 45 Loss - 0.9986438314513405
Iteration - 200 Epoch - 45 Loss - 0.9948219723665892
Iteration - 300 Epoch - 45 Loss - 0.9838036688855321
Iteration - 400 Epoch - 45 Loss - 0.987344478505508
Iteration - 500 Epoch - 45 Loss - 0.9712142240025564
Iteration - 600 Epoch - 45 Loss - 0.9783307352597622
Total validation loss 0.9733988468543737 after 45 epochs
Iteration - 0 Epoch - 46 Total training loss - 0.9021599292755127 
Iteration - 100 Epoch - 46 Total training loss - 1.025822326772625 
Iteration - 200 Epoch - 46 Total training loss - 1.1115956251209471 
Iteration - 300 Epoch - 46 Total training loss - 1.1090407632186892 
Iteration - 400 Epoch - 46 Total training loss - 1.0766253329399713 
Iteration - 500 Epoch - 46 Total training loss - 1.0698649572141312 
Iteration - 600 Epoch - 46 Total training loss - 1.1083973827314462 
Iteration - 0 Epoch - 46 Loss - 1.373151183128357
Iteration - 100 Epoch - 46 Loss - 1.0185001365

Iteration - 100 Epoch - 54 Loss - 1.2687351786264098
Iteration - 200 Epoch - 54 Loss - 1.2731380219483257
Iteration - 300 Epoch - 54 Loss - 1.2625276575056819
Iteration - 400 Epoch - 54 Loss - 1.24095743552705
Iteration - 500 Epoch - 54 Loss - 1.2343924097910137
Iteration - 600 Epoch - 54 Loss - 1.2330597020226985
Total validation loss 1.2363155271762627 after 54 epochs
Iteration - 0 Epoch - 55 Total training loss - 0.8679770827293396 
Iteration - 100 Epoch - 55 Total training loss - 1.1182001849986833 
Iteration - 200 Epoch - 55 Total training loss - 1.009910278902644 
Iteration - 300 Epoch - 55 Total training loss - 1.1575710604932667 
Iteration - 400 Epoch - 55 Total training loss - 1.2841478331023803 
Iteration - 500 Epoch - 55 Total training loss - 1.2176656740738039 
Iteration - 600 Epoch - 55 Total training loss - 1.1915305958094309 
Iteration - 0 Epoch - 55 Loss - 1.3243989944458008
Iteration - 100 Epoch - 55 Loss - 1.0976192092541421
Iteration - 200 Epoch - 55 Loss - 1.1129728

Iteration - 200 Epoch - 63 Loss - 1.1615372497347456
Iteration - 300 Epoch - 63 Loss - 1.1692084579966788
Iteration - 400 Epoch - 63 Loss - 1.1762158716111408
Iteration - 500 Epoch - 63 Loss - 1.1786666380431123
Iteration - 600 Epoch - 63 Loss - 1.1848434591650367
Total validation loss 1.18760709106741 after 63 epochs
Iteration - 0 Epoch - 64 Total training loss - 0.7347611784934998 
Iteration - 100 Epoch - 64 Total training loss - 1.0694724303635252 
Iteration - 200 Epoch - 64 Total training loss - 1.180374557605174 
Iteration - 300 Epoch - 64 Total training loss - 1.1378667607718476 
Iteration - 400 Epoch - 64 Total training loss - 1.1704178866412869 
Iteration - 500 Epoch - 64 Total training loss - 1.1997809953017624 
Iteration - 600 Epoch - 64 Total training loss - 1.1624319578376967 
Iteration - 0 Epoch - 64 Loss - 1.2781809568405151
Iteration - 100 Epoch - 64 Loss - 1.179548332891842
Iteration - 200 Epoch - 64 Loss - 1.140136235685491
Iteration - 300 Epoch - 64 Loss - 1.126475994

Iteration - 400 Epoch - 72 Loss - 1.1280845689989087
Iteration - 500 Epoch - 72 Loss - 1.1332144922987668
Iteration - 600 Epoch - 72 Loss - 1.142810333799106
Total validation loss 1.1416813939651276 after 72 epochs
Iteration - 0 Epoch - 73 Total training loss - 1.1203184127807617 
Iteration - 100 Epoch - 73 Total training loss - 1.2158554157583357 
Iteration - 200 Epoch - 73 Total training loss - 1.1690174861004883 
Iteration - 300 Epoch - 73 Total training loss - 1.6228275056201285 
Iteration - 400 Epoch - 73 Total training loss - 1.567003428198611 
Iteration - 500 Epoch - 73 Total training loss - 1.6306711291520672 
Iteration - 600 Epoch - 73 Total training loss - 1.5307033812238242 
Iteration - 0 Epoch - 73 Loss - 1.0150021314620972
Iteration - 100 Epoch - 73 Loss - 0.9827074235913777
Iteration - 200 Epoch - 73 Loss - 0.9620659724619258
Iteration - 300 Epoch - 73 Loss - 0.9623624437621662
Iteration - 400 Epoch - 73 Loss - 0.9765696507922729
Iteration - 500 Epoch - 73 Loss - 0.976134

Iteration - 600 Epoch - 81 Loss - 1.0758046071245002
Total validation loss 1.0819579825558243 after 81 epochs
Iteration - 0 Epoch - 82 Total training loss - 0.18910154700279236 
Iteration - 100 Epoch - 82 Total training loss - 1.433296432294468 
Iteration - 200 Epoch - 82 Total training loss - 1.2734695108282637 
Iteration - 300 Epoch - 82 Total training loss - 1.2240588463347524 
Iteration - 400 Epoch - 82 Total training loss - 1.2314150948519078 
Iteration - 500 Epoch - 82 Total training loss - 1.1991198837874513 
Iteration - 600 Epoch - 82 Total training loss - 1.172225862531327 
Iteration - 0 Epoch - 82 Loss - 0.8850122690200806
Iteration - 100 Epoch - 82 Loss - 1.1585187280532157
Iteration - 200 Epoch - 82 Loss - 1.1781747550217074
Iteration - 300 Epoch - 82 Loss - 1.644247798230561
Iteration - 400 Epoch - 82 Loss - 1.8483283313878456
Iteration - 500 Epoch - 82 Loss - 1.7389731604657963
Iteration - 600 Epoch - 82 Loss - 1.6487613710607347
Total validation loss 1.6131555093367411 a

Iteration - 0 Epoch - 91 Total training loss - 0.16107018291950226 
Iteration - 100 Epoch - 91 Total training loss - 1.0073504957029282 
Iteration - 200 Epoch - 91 Total training loss - 1.2350982655423568 
Iteration - 300 Epoch - 91 Total training loss - 1.2432640005503333 
Iteration - 400 Epoch - 91 Total training loss - 1.5233868131520891 
Iteration - 500 Epoch - 91 Total training loss - 1.4430982225282702 
Iteration - 600 Epoch - 91 Total training loss - 1.3881938295468328 
Iteration - 0 Epoch - 91 Loss - 1.2522568702697754
Iteration - 100 Epoch - 91 Loss - 1.3560168353637847
Iteration - 200 Epoch - 91 Loss - 1.465175547113466
Iteration - 300 Epoch - 91 Loss - 1.4357150542577637
Iteration - 400 Epoch - 91 Loss - 1.4387223223349697
Iteration - 500 Epoch - 91 Loss - 1.427008310596862
Iteration - 600 Epoch - 91 Loss - 1.4342407199388336
Total validation loss 1.4357589527647545 after 91 epochs
Iteration - 0 Epoch - 92 Total training loss - 0.8973705172538757 
Iteration - 100 Epoch - 92 

In [54]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for i,data in enumerate(testloader):
        inputs, labels = data
        outputs = model(inputs)
        labels = torch.argmax(labels,dim =1)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(correct,total)
print(100 * correct / total)

9 49
18.367346938775512


In [55]:
class MTL_Dataset(Dataset):

    def __init__(self, dir_path, test_key,valid_key, transform=None):
        self.dir_path = dir_path
        self.files = os.listdir(self.dir_path)
        self.test_key = test_key
        self.valid_key = valid_key
        self.melspecs,self.Y = self.loadData(dir_path)
        print(len(self.melspecs),len(self.Y))
        
    def loadData(self,dir_path):
        files = os.listdir(dir_path)
        train_keys = list((spk_id[i] for i in range(10) if i not in [self.test_key,self.valid_key]))
        melspecs = []
        Y = []
        for key in train_keys:
            for file in files:
                if file[:2]==key:
                    r, sr = librosa.load(dir_path + file, res_type='kaiser_fast')
                    melspec = librosa.feature.melspectrogram(y=r, sr=sr,n_fft = 512, hop_length=160, win_length=320,n_mels=24)
                    temp = self.chunk(torch.Tensor(melspec))
                    melspecs.extend(temp)
                    for _ in range(len(temp)):
                        ye = torch.zeros(7,dtype = int)
                        ye[emotion_code[file[5]]]=1
                        yg = torch.zeros(2,dtype = int)
                        yg[gender_code[file[:2]]]=1
                        Y.append((ye,yg))
        return melspecs,Y
    
    def chunk(self,melspec):
        melspec = melspec.transpose(0,1)
        res = []
        for i in range(0,melspec.size(0),50):
            temp = melspec[i:i+100,:]
            if temp.size(0)==100:
                res.append(temp)
        return res        
        
    def __len__(self):
        return len(self.melspecs)

    def __getitem__(self, idx):
        return self.melspecs[idx],self.Y[idx]

In [56]:
trmt = MTL_Dataset("EmoDB/wav/",0,1)

2629 2629


In [62]:
trainmt_dataloader = DataLoader(trmt, batch_size=4,shuffle=True)

In [60]:
class MTLVal_Dataset(Dataset):

    def __init__(self, dir_path,valid_key, transform=None):
        self.dir_path = dir_path
        self.files = os.listdir(self.dir_path)
        self.valid_key = valid_key
        self.melspecs,self.Y = self.loadData(dir_path)
        print(len(self.melspecs),len(self.Y))
        
    def loadData(self,dir_path):
        files = os.listdir(dir_path)
        melspecs = []
        Y = []
        for file in files:
            if file[:2]==spk_id[self.valid_key]:
                r, sr = librosa.load(dir_path + file, res_type='kaiser_fast')
                melspec = librosa.feature.melspectrogram(y=r, sr=sr,n_fft = 512, hop_length=160, win_length=320,n_mels=24)
                temp = self.chunk(torch.Tensor(melspec))
                melspecs.extend(temp)
                for _ in range(len(temp)):
                    ye = torch.zeros(7,dtype = int)
                    ye[emotion_code[file[5]]]=1
                    yg = torch.zeros(2,dtype = int)
                    yg[gender_code[file[:2]]]=1
                    Y.append((ye,yg))
        return melspecs,Y
    
    def chunk(self,melspec):
        melspec = melspec.transpose(0,1)
        res = []
        for i in range(0,melspec.size(0),50):
            temp = melspec[i:i+100,:]
            if temp.size(0)==100:
                res.append(temp)
        return res        
        
    def __len__(self):
        return len(self.melspecs)

    def __getitem__(self, idx):
        return self.melspecs[idx],self.Y[idx]

In [61]:
vmt = MTLVal_Dataset("EmoDB/wav/",1)

399 399


In [63]:
validmt_dataloader = DataLoader(vmt, batch_size=4,shuffle=True)

In [87]:
class MTLTest_Dataset(Dataset):

    def __init__(self, dir_path,test_key, transform=None):
        self.dir_path = dir_path
        self.files = os.listdir(self.dir_path)
        self.test_key = test_key
        self.melspecs,self.Y = self.loadData(dir_path)
        print(len(self.melspecs),len(self.Y))
        
    def loadData(self,dir_path):
        files = os.listdir(dir_path)
        melspecs = []
        Y = []
        for file in files:
            if file[:2]==spk_id[self.test_key]:
                r, sr = librosa.load(dir_path + file, res_type='kaiser_fast')
                melspec = librosa.feature.melspectrogram(y=r, sr=sr,n_fft = 512, hop_length=160, win_length=320,n_mels=24)
                melspec = melspec.transpose()
                melspecs.append(melspec)
                ye = torch.zeros(7,dtype = int)
                ye[emotion_code[file[5]]]=1
                yg = torch.zeros(2,dtype = int)
                yg[gender_code[file[:2]]]=1
                Y.append((ye,yg))
        return melspecs,Y
        
    def __len__(self):
        return len(self.melspecs)

    def __getitem__(self, idx):
        return self.melspecs[idx],self.Y[idx]

In [88]:
testmt = MTLTest_Dataset("EmoDB/wav/",0)

49 49


In [89]:
testmt_dataloader = DataLoader(testmt, batch_size=1,shuffle=True)

In [71]:
class multitask(nn.Module):
    def __init__(self, input_dim = 24, em_classes=7,gen_classes = 2):
        super(multitask, self).__init__()
        self.tdnn1 = TDNN(input_dim=input_dim, output_dim=512, context_size=5, dilation=1,dropout_p=0.5)
        self.tdnn2 = TDNN(input_dim=512, output_dim=512, context_size=3, dilation=1,dropout_p=0.5)
        self.tdnn3 = TDNN(input_dim=512, output_dim=512, context_size=2, dilation=2,dropout_p=0.5)
        self.tdnn4 = TDNN(input_dim=512, output_dim=512, context_size=1, dilation=1,dropout_p=0.5)
        self.tdnn5 = TDNN(input_dim=512, output_dim=512, context_size=1, dilation=3,dropout_p=0.5)
        #### Frame levelPooling
        self.segment6 = nn.Linear(1024, 512)
        self.segment7 = nn.Linear(512, 512)
        self.emotion = nn.Linear(512, em_classes)
        self.gender = nn.Linear(512, gen_classes)
#         self.softmax = nn.Softmax(dim=1)
    def forward(self, inputs):
        tdnn1_out = self.tdnn1(inputs)
#         return tdnn1_out
        tdnn2_out = self.tdnn2(tdnn1_out)
        tdnn3_out = self.tdnn3(tdnn2_out)
        tdnn4_out = self.tdnn4(tdnn3_out)
        tdnn5_out = self.tdnn5(tdnn4_out)
        ### Stat Pool
        mean = torch.mean(tdnn5_out,1)
        std = torch.std(tdnn5_out,1)
        stat_pooling = torch.cat((mean,std),1)
        segment6_out = self.segment6(stat_pooling)
        x_vec = self.segment7(segment6_out)
        em_predictions = self.emotion(x_vec)
        gen_predictions = self.gender(x_vec)
        return em_predictions,gen_predictions

In [131]:
input_feats = [4,100,24]
input = torch.rand(input_feats)
mt_model = multitask()
out = mt_model(input)
print(out)

(tensor([[-0.0098, -0.0171, -0.0305, -0.0194, -0.0344,  0.0426,  0.0026],
        [-0.0093, -0.0194, -0.0295, -0.0188, -0.0358,  0.0425,  0.0020],
        [-0.0084, -0.0177, -0.0309, -0.0175, -0.0381,  0.0416,  0.0036],
        [-0.0101, -0.0168, -0.0309, -0.0189, -0.0360,  0.0429,  0.0039]],
       grad_fn=<AddmmBackward>), tensor([[0.0078, 0.0154],
        [0.0082, 0.0159],
        [0.0083, 0.0154],
        [0.0078, 0.0159]], grad_fn=<AddmmBackward>))


In [83]:
def trainmt(train_dataloader,epoch):
    running_loss = 0.0
    train_loss_list=[]
    model.train()
    for i, data in enumerate(train_dataloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        inputs.requires_grad = True
        # zero the parameter gradients
        optimizer.zero_grad()
        labels[0] = torch.argmax(labels[0],dim =1)
        labels[1] = torch.argmax(labels[1],dim =1)
        # forward + backward + optimize
        outputs = mt_model(inputs)
        loss1 = criterion(outputs[0], labels[0])
        loss2 = criterion(outputs[1], labels[1])
        loss = loss1+loss2
        loss.backward()
        optimizer.step()
        train_loss_list.append(loss.item())
        mean_loss = np.mean(np.asarray(train_loss_list))
        if i%100==0:
            print('Iteration - {} Epoch - {} Total training loss - {} '.format(i,epoch,mean_loss))
            
def validationmt(valid_dataloader,epoch):
    model.eval()
    with torch.no_grad():
        val_loss_list=[]
        for i, data in enumerate(valid_dataloader, 0):
            inputs, labels = data
            labels[0] = torch.argmax(labels[0],dim =1)
            labels[1] = torch.argmax(labels[1],dim =1)
            outputs = mt_model(inputs)
            loss1 = criterion(outputs[0], labels[0])
            loss2 = criterion(outputs[1], labels[1])
            loss = loss1+loss2
            val_loss_list.append(loss.item())
            if i%100==0:
                print('Iteration - {} Epoch - {} Loss - {}'.format(i,epoch,np.mean(np.asarray(val_loss_list))))
                
        mean_loss = np.mean(np.asarray(val_loss_list))
        print('Total validation loss {} after {} epochs'.format(mean_loss,epoch))
        model_save_path = os.path.join( 'best_check_point_'+str(epoch)+'_'+str(mean_loss))
        state_dict = {'model': model.state_dict(),'optimizer': optimizer.state_dict(),'epoch': epoch}
        torch.save(state_dict, model_save_path)


In [None]:
for epoch in range(100):
    trainmt(trainmt_dataloader,epoch)
    validationmt(validmt_dataloader,epoch)

In [None]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for i,data in enumerate(testmt_dataloader):
        inputs, labels = data
        labels[0] = torch.argmax(labels[0],dim =1)
        labels[1] = torch.argmax(labels[1],dim =1)
        outputs = mt_model(inputs)
        print(outputs,labels)
#         total += labels.size(0)
#         correct += (predicted == labels).sum().item()
# print(correct,total)
# print(100 * correct / total)