In [1]:
import os
import glob

import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader
from preprocessing import preprocessing, convert_spectrograms, convert_tensor
from model_ae import Encoder
from utils.optimization import WarmupLinearSchedule

In [2]:
class SELayer(nn.Module):
    def __init__(self, channel, reduction=4):
        super(SELayer, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel, bias=False),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        b, c, _, _ = x.size()
        y = self.avg_pool(x).view(b, c)
        y = self.fc(y).view(b, c, 1, 1)
        return x * y.expand_as(x)

In [3]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, num_attn_heads, attn_hidden_size, dropout_prob, with_focus_attn, with_se):
        super(MultiHeadedAttention, self).__init__()
        self.num_attn_heads = num_attn_heads
        self.hidden_size = attn_hidden_size
        self.dropout_prob = dropout_prob
        self.with_focus_attn = with_focus_attn
        self.with_se = with_se
        
        self.attn_head_size = int(self.hidden_size / self.num_attn_heads)
        self.all_head_size = self.num_attn_heads * self.attn_head_size

        self.query = nn.Linear(self.hidden_size, self.all_head_size)
        self.key = nn.Linear(self.hidden_size, self.all_head_size)
        self.value = nn.Linear(self.hidden_size, self.all_head_size)

        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_prob)

        self.softmax = nn.Softmax(dim=-1)
        
        if(with_focus_attn == True):
            self.tanh = nn.Tanh()
            self.sigmoid = nn.Sigmoid()
            
            self.linear_focus_query = nn.Linear(num_attn_heads * self.attn_head_size, 
                                                num_attn_heads * self.attn_head_size)
            self.linear_focus_global = nn.Linear(num_attn_heads * self.attn_head_size, 
                                                 num_attn_heads * self.attn_head_size)
            
            up = torch.randn(num_attn_heads, 1, self.attn_head_size)
            self.up = Variable(up, requires_grad=True).cuda()
            torch.nn.init.xavier_uniform_(self.up)
            
            uz = torch.randn(num_attn_heads, 1, self.attn_head_size)
            self.uz = Variable(uz, requires_grad=True).cuda()
            torch.nn.init.xavier_uniform_(self.uz)
            
        if(with_se == True):
            self.se = SELayer(8)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attn_heads, self.attn_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states):
        key_len = hidden_states.size(1)
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)
        
        if(self.with_focus_attn == True):
            glo = torch.mean(mixed_query_layer, dim=1, keepdim=True)
            
            c = self.tanh(self.linear_focus_query(mixed_query_layer) + self.linear_focus_global(glo))
            c = self.transpose_for_scores(c)
            
            p = c * self.up
            p = p.sum(3).squeeze()
            z = c * self.uz
            z = z.sum(3).squeeze()
            
            P = self.sigmoid(p) * key_len
            Z = self.sigmoid(z) * key_len
            
            j = torch.arange(start=0, end=key_len, dtype=P.dtype).unsqueeze(0).unsqueeze(0).unsqueeze(0).to('cuda')
            P = P.unsqueeze(-1)
            Z = Z.unsqueeze(-1)
            
            G = -(j - P)**2 * 2 / (Z**2)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attn_head_size)
        
        if(self.with_focus_attn == True):
            attention_scores = attention_scores + G
            
        attention_probs = self.softmax(attention_scores)
        
        if(self.with_se == True):
            attention_probs = self.se(attention_probs)
            
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)
        attention_output = self.o_proj(context_layer)

        return attention_output

class CLDNN(nn.Module):
    def __init__(self, conv_dim, checkpoint=None, hidden_size=60, num_layers=2,
                 bidirectional=True, with_focus_attn=False):
        super(CLDNN, self).__init__()
        self.conv_dim = conv_dim
        if(conv_dim == '2d'):
            self.conv1 = nn.Sequential(
                nn.Conv2d(1, 64, (3, 3)), # (1, 128, 50) -> (64, 126, 48)
                nn.BatchNorm2d(64),
                nn.ReLU(),
                nn.MaxPool2d(3, 3)  # (64, 126, 48) -> (64, 42, 16)
            )
            self.conv2 = nn.Sequential(
                nn.Conv2d(64, 64, (3, 3)), # (64, 42, 16) -> (64, 40, 14)
                nn.BatchNorm2d(64),
                nn.ReLU(),
                nn.MaxPool2d(3, 3)  # (64, 40, 14) -> (64, 13, 4)
            )
            self.lstm = nn.LSTM(832, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)
            self.tanh = nn.Tanh()
            self.dropout = nn.Dropout(0.5)
            self.attn = MultiHeadedAttention(num_attn_heads=8, attn_hidden_size=120, dropout_prob=0.1,
                                             with_focus_attn=with_focus_attn)
            self.fc = nn.Sequential(
                nn.Linear(4*2*hidden_size if bidirectional else hidden_size, 4)
            )
        else:
            raise ValueError("Convolution dimension not found: %s" % (conv_dim))
            
    def forward(self, x):
        if(self.conv_dim == '2d'):
            out = self.conv1(x)  # (batch, 1, 128, 50) -> (batch, 64, 42, 16)
            out = self.conv2(out)  # (batch, 64, 42, 16) -> (batch, 64, 13, 4)
            new_out_shape = out.size()[:1] + (out.size()[1] * out.size()[2], out.size()[3])
            out = out.reshape(*new_out_shape)  # (batch, 64, 13, 4) -> (batch, 832, 4)
            out = out.permute(2, 0, 1)  # (batch, 832, 4) -> (4, batch, 832)
            self.lstm.flatten_parameters()
            out, _ = self.lstm(out)  # (4, batch, 832) -> (4, batch, 2*60)
            out = self.tanh(out)
            out = self.dropout(out)
            out = out.permute(1, 0, 2)  # (4, batch, 2*60) -> (batch, 4, 2*60)
            out = self.attn(out)  # (batch, 4, 2*60) -> (batch, 4, 2*60)
            new_out_shape = out.size()[:1] + (out.size()[1] * out.size()[2],)
            out = out.view(*new_out_shape)  # (batch, 4, 2*60) -> (batch, 4*2*60)
            out = self.fc(out)  # (batch, 4*2*60) -> (batch, 4)
        return out

In [4]:
# CNN -> LSTM -> self-attention -> DNN
class CLDNN(nn.Module):
    def __init__(self, conv_dim, checkpoint=None, hidden_size=64, num_layers=2,
                 bidirectional=True, with_focus_attn=False, with_se=False):
        super(CLDNN, self).__init__()
        self.conv_dim = conv_dim
        if(conv_dim == '1d'):
            self.conv1 = nn.Sequential(
                nn.Conv1d(1, 64, (3, 1)), # (1, 40, 100) -> (64, 38, 100)
                nn.BatchNorm2d(64),
                nn.ReLU(),
                nn.MaxPool2d((2, 1), (2, 1))  # (64, 38, 100) -> (64, 19, 100)
            )
            self.conv2 = nn.Sequential(
                nn.Conv1d(64, 64, (3, 1)), # (64, 19, 100) -> (64, 17, 100)
                nn.BatchNorm2d(64),
                nn.ReLU(),
                nn.MaxPool2d((2, 1), (2, 1))  # (64, 17, 100) -> (64, 8, 100)
            )
            self.lstm = nn.LSTM(512, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)
            self.tanh = nn.Tanh()
            self.dropout = nn.Dropout(0.5)
            self.attn = MultiHeadedAttention(num_attn_heads=8, attn_hidden_size=128, dropout_prob=0.1,
                                             with_focus_attn=with_focus_attn, with_se=with_se)
            self.fc1 = nn.Linear(100*2*hidden_size if bidirectional else hidden_size, 128)
            self.relu = nn.ReLU()
            self.fc2 = nn.Linear(128, 4)
            '''self.fc = nn.Sequential(
                nn.Linear(50*2*hidden_size if bidirectional else hidden_size, 128),
                nn.ReLU(),
                nn.Linear(128, 4)
            )'''
        else:
            raise ValueError("Convolution dimension not found: %s" % (conv_dim))
            
    def forward(self, x):
        if(self.conv_dim == '1d'):
            out = self.conv1(x)  # (batch, 1, 40, 100) -> (batch, 64, 19, 100)
            out = self.conv2(out)  # (batch, 64, 19, 100) -> (batch, 64, 8, 100)
            out = out.contiguous()
            new_out_shape = out.size()[:1] + (out.size()[1] * out.size()[2], out.size()[3])
            out = out.view(*new_out_shape)  # (batch, 64, 8, 100) -> (batch, 512, 100)
            out = out.permute(2, 0, 1)  # (batch, 512, 100) -> (100, batch, 512)
            self.lstm.flatten_parameters()
            out, _ = self.lstm(out)  # (100, batch, 512) -> (100, batch, 2*64)
            out = self.tanh(out)
            out = self.dropout(out)
            out = out.permute(1, 0, 2)  # (100, batch, 2*64) -> (batch, 100, 2*64)
            out = self.attn(out)  # (batch, 100, 2*64) -> (batch, 100, 2*64)
            new_out_shape = out.size()[:1] + (out.size()[1] * out.size()[2],)
            out = out.view(*new_out_shape)  # (batch, 100, 2*64) -> (batch, 100*2*64)
            out_fc1 = self.fc1(out)  # (batch, 100*2*64) -> (batch, 128)
            out_relu = self.relu(out_fc1)
            out_fc2 = self.fc2(out_relu)  # (batch, 128) -> (batch, 4)
        return out_fc2

# CNN -> self-attention -> LSTM -> DNN
class CLDNN(nn.Module):
    def __init__(self, conv_dim, checkpoint=None, hidden_size=64, num_layers=2,
                 bidirectional=True, with_focus_attn=False):
        super(CLDNN, self).__init__()
        self.conv_dim = conv_dim
        if(conv_dim == '1d'):
            self.conv1 = nn.Sequential(
                nn.Conv1d(1, 64, (3, 1)), # (1, 40, 50) -> (64, 38, 50)
                nn.BatchNorm2d(64),
                nn.ReLU(),
                nn.MaxPool2d((2, 1), (2, 1))  # (64, 38, 50) -> (64, 19, 50)
            )
            self.conv2 = nn.Sequential(
                nn.Conv1d(64, 64, (3, 1)), # (64, 19, 50) -> (64, 17, 50)
                nn.BatchNorm2d(64),
                nn.ReLU(),
                nn.MaxPool2d((2, 1), (2, 1))  # (64, 17, 50) -> (64, 8, 50)
            )
            self.attn = MultiHeadedAttention(num_attn_heads=8, attn_hidden_size=512, dropout_prob=0.1,
                                             with_focus_attn=with_focus_attn)
            self.lstm = nn.LSTM(512, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)
            self.tanh = nn.Tanh()
            self.dropout = nn.Dropout(0.5)
            self.fc = nn.Sequential(
                nn.Linear(50*2*hidden_size if bidirectional else hidden_size, 128),
                nn.ReLU(),
                nn.Linear(128, 4)
            )
        else:
            raise ValueError("Convolution dimension not found: %s" % (conv_dim))
            
    def forward(self, x):
        if(self.conv_dim == '1d'):
            out = self.conv1(x)  # (batch, 1, 40, 50) -> (batch, 64, 19, 50)
            out = self.conv2(out)  # (batch, 64, 19, 50) -> (batch, 64, 8, 50)
            new_out_shape = out.size()[:1] + (out.size()[1] * out.size()[2], out.size()[3])
            out = out.reshape(*new_out_shape)  # (batch, 64, 8, 50) -> (batch, 512, 50)
            out = out.permute(0, 2, 1)  # (batch, 512, 50) -> (batch, 50, 512)
            h = out
            out = self.attn(out)  # (batch, 50, 512) -> (batch, 50, 512)
            out = h + out
            out = out.permute(1, 0, 2)  # (batch, 50, 512) -> (50, batch, 512)
            self.lstm.flatten_parameters()
            out, _ = self.lstm(out)  # (50, batch, 512) -> (50, batch, 2*64)
            #out = out[-1]  # (50, batch, 2*64) -> (batch, num_directions*hidden_size)
            out = self.tanh(out)
            out = self.dropout(out)
            out = out.permute(1, 0, 2)  # (50, batch, 2*64) -> (batch, 50, 2*64)
            new_out_shape = out.size()[:1] + (out.size()[1] * out.size()[2],)
            out = out.view(*new_out_shape)  # (batch, 50, 2*64) -> (batch, 50*2*64)       
            out = self.fc(out)  # (batch, 50*2*64) -> (batch, 4)
        return out

In [5]:
class CLDNN_G(nn.Module):
    def __init__(self, conv_dim, checkpoint=None, hidden_size=64, num_layers=2,
                 bidirectional=True, with_focus_attn=False):
        super(CLDNN_G, self).__init__()
        self.conv_dim = conv_dim
        if(conv_dim == '1d'):
            self.encoder = Encoder(conv_dim)
            if checkpoint:
                self.encoder.load_state_dict(torch.load(checkpoint))
            self.attn = MultiHeadedAttention(num_attn_heads=4, attn_hidden_size=8, dropout_prob=0.1,
                                             with_focus_attn=with_focus_attn)
            self.lstm = nn.LSTM(8, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)
            self.fc = nn.Sequential(
                nn.Linear(hidden_size*2 if bidirectional else hidden_size, 1),
                nn.Sigmoid()
            )
        elif(conv_dim == '2d'):
            self.encoder = Encoder(conv_dim)
            if checkpoint:
                self.encoder.load_state_dict(torch.load(checkpoint))
            self.attn = MultiHeadedAttention(num_attn_heads=4, attn_hidden_size=176, dropout_prob=0.1, 
                                             with_focus_attn=with_focus_attn)
            self.gap = nn.AdaptiveAvgPool2d((1, 11))
            self.lstm = nn.LSTM(11, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)
            self.fc = nn.Sequential(
                nn.Linear(hidden_size*2 if bidirectional else hidden_size, 1),
                nn.Sigmoid()
            )
        else:
            raise ValueError("Convolution dimension not found: %s" % (conv_dim))
            
    def forward(self, x):
        if(self.conv_dim == '1d'):
            out = self.encoder(x)  # (batch, 1, 40, 100) -> (batch, 8, 1, 100)
            out = torch.squeeze(out, 2)  # (batch, 8, 1, 100) -> (batch, 8, 100)
            out = out.permute(0, 2, 1)  # (batch, 8, 100) -> (batch, 100, 8)
            h = out
            out = self.attn(out)  # (batch, 100, 8) -> (batch, 100, 8)
            out = h + out
            out = out.permute(1, 0, 2)  # (batch, 100, 8) -> (100, batch, 8)
            self.lstm.flatten_parameters()
            out, _ = self.lstm(out)  # (100, batch, 8) -> (100, batch, num_directions*hidden_size)
            out = out[-1]  # (100, batch, num_directions*hidden_size) -> (batch, num_directions*hidden_size)
            out = self.fc(out)  # (batch, num_directions*hidden_size) -> (batch, 1)
        elif(self.conv_dim == '2d'):
            out = self.encoder(x)  # (batch, 1, 128, 100) -> (batch, 16, 11, 8)
            out = out.permute(0, 3, 1, 2)  # (batch, 16, 11, 8) -> (batch, 8, 16, 11)
            h = out
            new_out_shape = out.size()[:2] + (out.size()[2] * out.size()[3],)
            out = out.view(*new_out_shape)  # (batch, 8, 16, 11) -> (batch, 8, 176)
            out = self.attn(out)  # (batch, 8, 176) -> (batch, 8, 176)
            out = out.view(h.size())  # (batch, 8, 176) -> (batch, 8, 16, 11)
            out = h + out
            out = self.gap(out)  # (batch, 8, 16, 11) -> (batch, 8, 1, 11)
            out = torch.squeeze(out, 2)  # (batch, 8, 1, 11) -> (batch, 8, 11)
            out = out.permute(1, 0, 2)  # (batch, 8, 11) -> (8, batch, 11)
            self.lstm.flatten_parameters()
            out, _ = self.lstm(out)  # (8, batch, 11) -> (8, batch, num_directions*hidden_size)
            out = out[-1]  # (8, batch, num_directions*hidden_size) -> (batch, num_directions*hidden_size)
            out = self.fc(out)  # (batch, num_directions*hidden_size) -> (batch, 1)
        return out

In [6]:
def train(train_dataloader, eval_dataloader, epochs):
    print('Start training')
    max_acc = 0
    acc_list = []
    softmax = nn.Softmax(dim=1)
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        nb_train_steps = 0
        correct = 0
        num_samples = 0

        if(multi_task == 'true'):
            for X_batch, y_batch, y_g_batch in train_dataloader:
                X_batch = X_batch.to(device)
                y_batch = y_batch.to(device)
                y_g_batch = y_g_batch.to(device)

                optimizer.zero_grad()

                outputs = model(X_batch)
                outputs_g = model_g(X_batch)

                loss_1 = loss_func(outputs, y_batch)
                loss_2 = loss_func_g(outputs_g, y_g_batch)
                loss = loss_1 + 0.8 * loss_2
                loss.backward(retain_graph=True)

                optimizer.step()
                opt_scheduler.step()

                train_loss += loss.mean().item()
                nb_train_steps += 1

                outputs = softmax(outputs)
                outputs = torch.argmax(outputs, dim=1)
                correct += (outputs == y_batch).float().sum()
                num_samples += len(X_batch)

            train_loss = train_loss / nb_train_steps
            train_accuracy = correct / num_samples

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            correct = 0
            num_samples = 0

            for X_batch, y_batch, y_g_batch in eval_dataloader:
                X_batch = X_batch.to(device)
                y_batch = y_batch.to(device)
                y_g_batch = y_g_batch.to(device)
                with torch.no_grad():
                    outputs = model(X_batch)
                    outputs_g = model_g(X_batch)

                tmp_eval_loss_1 = loss_func(outputs, y_batch)
                tmp_eval_loss_2 = loss_func_g(outputs_g, y_g_batch)
                tmp_eval_loss = tmp_eval_loss_1 + 0.8 * tmp_eval_loss_2
                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1

                outputs = softmax(outputs)
                outputs = torch.argmax(outputs, dim=1)
                correct += (outputs == y_batch).float().sum()
                num_samples += len(X_batch)

            eval_loss = eval_loss / nb_eval_steps
            eval_accuracy = correct / num_samples
        else:
            for X_batch, y_batch in train_dataloader:
                X_batch = X_batch.to(device)
                y_batch = y_batch.to(device)

                optimizer.zero_grad()

                outputs = model(X_batch)

                loss = loss_func(outputs, y_batch)
                loss.backward()

                optimizer.step()
                if(use_warmup == 'true'):
                    opt_scheduler.step()

                train_loss += loss.mean().item()
                nb_train_steps += 1

                outputs = softmax(outputs)
                outputs = torch.argmax(outputs, dim=1)
                correct += (outputs == y_batch).float().sum()
                num_samples += len(X_batch)

            train_loss = train_loss / nb_train_steps
            train_accuracy = correct / num_samples

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            correct = 0
            num_samples = 0

            for X_batch, y_batch in eval_dataloader:
                X_batch = X_batch.to(device)
                y_batch = y_batch.to(device)
                with torch.no_grad():
                    outputs = model(X_batch)

                tmp_eval_loss = loss_func(outputs, y_batch)
                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1

                outputs = softmax(outputs)
                outputs = torch.argmax(outputs, dim=1)
                correct += (outputs == y_batch).float().sum()
                num_samples += len(X_batch)

            eval_loss = eval_loss / nb_eval_steps
            eval_accuracy = correct / num_samples

        for param_group in optimizer.param_groups:
            lr = param_group['lr']
        print('epoch: {:3d},    lr={:6f},    loss={:5f},    train_acc={:5f},    eval_loss={:5f},    eval_acc={:5f}'
              .format(epoch+1, lr, train_loss, train_accuracy, eval_loss, eval_accuracy))

        if((epoch+1) % save_checkpoint_steps == 0):
            '''
            correct = 0
            n = 0
            for i in range(len(eval_samples)):
                try:
                    X_new = preprocessing(eval_samples[i], method='mfcc', sr=16000, n_mfcc=n_mfcc)
                    X_new = convert_tensor(X_new).to(device)
                    y_new = model(X_new)
                    y_new = torch.argmax(torch.mean(nn.Softmax(dim=-1)(y_new), dim=0))
                    #y_new = torch.argmax(nn.Softmax(dim=-1)(torch.mean(y_new, dim=0)))
                    #y_new = sorted(dict(collections.Counter(torch.argmax(nn.Softmax(dim=-1)(y_new), dim=1).cpu().numpy()))
                    #               .items(), key=(lambda x: x[1]), reverse=True)[0][0]
                    y_new = 1 if (y_new.item() == np.array(eval_label)[i]) else 0
                    correct += y_new
                    n += 1
                except:
                    pass
            acc = correct / n
            acc_list.append(acc)
            print('Test accuray:', round(acc, 5))
            '''
            
            model_checkpoint = "CLDNN_cv%d_step%d_epoch%d.pt" % (cv_iter, en+1, epoch+1)
            output_model_file = os.path.join(output_dir, model_checkpoint)
            torch.save(model.state_dict(), output_model_file)
            print("Saving checkpoint %s" % output_model_file)
            #if(acc > max_acc):
            #    max_acc =acc
            #    torch.save(model.state_dict(), output_model_file)
            #    print("Saving checkpoint %s" % output_model_file)
    #return max(acc_list)

In [7]:
focus_attn_list = [True]

In [8]:
df = pd.read_csv('IEMOCAP_sub_label.csv')
di = {'neu': 0, 'hap': 1, 'ang': 2, 'sad': 3}
df = df.replace({'sample_label': di})

In [9]:
from sklearn.model_selection import StratifiedKFold, train_test_split

skf = StratifiedKFold(n_splits=10)

In [None]:
for with_focus_attn in focus_attn_list:
    cv_eval = []
    cv_iter = 0
    #with_focus_attn = True if(with_focus_attn == 'true') else False
    for train_index, test_index in skf.split(df['sample_name'], df['sample_label']):
        '''
    for ses_num in range(1, 6):
        eval_ses = 'Ses0' + str(ses_num)
        train_index = []
        eval_index = []
        for i, sample_name in enumerate(df['sample_name']):
            if eval_ses in sample_name:
                eval_index.append(i)
            else:
                train_index.append(i)
        '''
        cv_iter += 1
        conv_dim = '1d'
        checkpoint = ''
        hidden_size = 64
        num_layers = 2
        bidirectional = 'true'
        with_se = False

        batch_size = 128
        num_epochs = 30
        learning_rate = 0.0003

        use_warmup = 'false'
        data_dir = './wav_data/pretrain/IEMOCAP_sub/'
        multi_task = 'false'
        augmentation = 'false'
        
        save_checkpoint_steps = 5
        output_dir = './model'
        #label_path = './wav_data/pretrain/IEMOCAP_vad/'

        bidirectional = True if(bidirectional == 'true') else False  
        n_mfcc = 40 if(conv_dim == '1d') else 128

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        
        df_train = df.loc[train_index, :]
        
        train_samples, eval_samples, train_label, eval_label = train_test_split(df_train['sample_name'], df_train['sample_label'],
                                                                                test_size=0.1, random_state=42, 
                                                                                stratify=df_train['sample_label'])
        
        test_samples = df['sample_name'][test_index]
        test_label = df['sample_label'][test_index]
        
        train_samples = [data_dir + train_sample + '.wav' for train_sample in train_samples]
        eval_samples = [data_dir + eval_sample + '.wav' for eval_sample in eval_samples]
        test_samples = [data_dir + test_sample + '.wav' for test_sample in test_samples]

        y_train = np.array(train_label)
        y_eval = np.array(eval_label)
        y_test = np.array(test_label)
        
        #train_vad_list = []
        #for i in range(len(train_vad)):
        #    vad = np.load(train_vad[i])
        #    train_vad_list.append(vad)
        #vad_train = np.concatenate(train_vad_list)

        #eval_vad_list = []
        #for i in range(len(eval_vad)):
        #    vad = np.load(eval_vad[i])
        #    eval_vad_list.append(vad)
        #vad_eval = np.concatenate(eval_vad_list)

        if(multi_task == 'true'):
            speaker = np.array(list(map(lambda x: int(x.split('/')[-1].split('-')[-1].split('.')[0]), sample_datas)))
            y_gender = np.array(list(map(lambda x: 1 if x % 2 ==0 else 0, speaker)))

            y_g_train = y_gender[train_idx]
            y_g_eval = y_gender[eval_idx]

        X_train, y_train = convert_spectrograms(train_samples, conv_dim=conv_dim, method='mfcc', sr=16000, labels=y_train)
        X_eval, y_eval = convert_spectrograms(eval_samples, conv_dim=conv_dim, method='mfcc', sr=16000, labels=y_eval)
        X_test, y_test = convert_spectrograms(test_samples, conv_dim=conv_dim, method='mfcc', sr=16000, labels=y_test)
        print(X_train.shape, y_train.shape, X_eval.shape, y_eval.shape, X_test.shape, y_test.shape)
        
        #X_train = X_train[vad_train]
        #y_train = y_train[vad_train]
        #X_eval = X_eval[vad_eval]
        #y_eval = y_eval[vad_eval]
        #print(X_train.shape, y_train.shape, X_eval.shape, y_eval.shape)

        if(augmentation == 'true'):
            X_train_flip = X_train[:, :, :, ::-1]
            y_train_flip = y_train.copy()

            X_train = np.concatenate((X_train, X_train_flip), axis=0)
            y_train = np.concatenate((y_train, y_train_flip), axis=0)

        X_train, y_train = convert_tensor(X_train, y_train)
        X_eval, y_eval = convert_tensor(X_eval, y_eval)

        y_train = y_train.long()
        y_eval = y_eval.long()

        if(multi_task == 'true'):
            _, y_g_train = convert_spectrograms(train_samples, conv_dim=conv_dim, method='mfcc', sr=16000, labels=y_g_train)
            _, y_g_eval = convert_spectrograms(eval_samples, conv_dim=conv_dim, method='mfcc', sr=16000, labels=y_g_eval)

            if(augmentation == 'true'):
                y_g_train_flip = y_g_train.copy()
                y_g_train = np.concatenate((y_g_train, y_g_train_flip))

            y_g_train = torch.tensor(y_g_train).float()
            y_g_eval = torch.tensor(y_g_eval).float()

            y_g_train = y_g_train.unsqueeze(-1)
            y_g_eval = y_g_eval.unsqueeze(-1)

        if(multi_task == 'true'):
            train_ds = TensorDataset(X_train, y_train, y_g_train)
            eval_ds = TensorDataset(X_eval, y_eval, y_g_eval)
        else:
            train_ds = TensorDataset(X_train, y_train)
            eval_ds = TensorDataset(X_eval, y_eval)

        train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True)
        eval_dataloader = DataLoader(eval_ds, batch_size=batch_size, num_workers=0)

        for en in range(5):
            model = CLDNN(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
                          num_layers=num_layers, bidirectional=bidirectional,
                          with_focus_attn=with_focus_attn, with_se=with_se).to(device)

            if(multi_task == 'true'):
                model_g = CLDNN_G(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
                                    num_layers=num_layers, bidirectional=bidirectional,
                                    with_focus_attn=with_focus_attn).to(device)

            if(multi_task == 'true'):
                loss_func = nn.CrossEntropyLoss()
                loss_func_g = nn.BCELoss()
                optimizer = optim.Adam(list(model.parameters()) + list(model_g.parameters()), lr=learning_rate)
            else:
                loss_func = nn.CrossEntropyLoss()
                optimizer = optim.Adam(model.parameters(), lr=learning_rate)

            if(use_warmup == 'true'):
                t_total = len(train_dataloader) // 1 * num_epochs
                opt_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=t_total * 0.1, t_total=t_total)

            eval_acc = train(train_dataloader, eval_dataloader, num_epochs)
            cv_eval.append(eval_acc)

    print('conv_dim:', conv_dim, '\twith_focus_attn:', with_focus_attn)
    print('Test accuray:', cv_eval)
    print()

4479it [00:44, 100.34it/s]
498it [00:04, 109.27it/s]
554it [00:05, 103.24it/s]


(160630, 40, 100, 1) (160630,) (17768, 40, 100, 1) (17768,) (20737, 40, 100, 1) (20737,)
Start training
epoch:   1,    lr=0.000300,    loss=0.968906,    train_acc=0.586062,    eval_loss=0.977303,    eval_acc=0.595284
epoch:   2,    lr=0.000300,    loss=0.806898,    train_acc=0.670448,    eval_loss=1.017592,    eval_acc=0.591119
epoch:   3,    lr=0.000300,    loss=0.669759,    train_acc=0.733983,    eval_loss=1.108711,    eval_acc=0.583408
epoch:   4,    lr=0.000300,    loss=0.531951,    train_acc=0.792333,    eval_loss=1.260493,    eval_acc=0.573784
epoch:   5,    lr=0.000300,    loss=0.421197,    train_acc=0.837869,    eval_loss=1.333626,    eval_acc=0.593483
Saving checkpoint ./model/CLDNN_cv1_step1_epoch5.pt
epoch:   6,    lr=0.000300,    loss=0.328759,    train_acc=0.875006,    eval_loss=1.493588,    eval_acc=0.586110
epoch:   7,    lr=0.000300,    loss=0.265370,    train_acc=0.900468,    eval_loss=1.804505,    eval_acc=0.595621
epoch:   8,    lr=0.000300,    loss=0.221045,    trai

epoch:   7,    lr=0.000300,    loss=0.262078,    train_acc=0.900562,    eval_loss=1.715315,    eval_acc=0.596128
epoch:   8,    lr=0.000300,    loss=0.223845,    train_acc=0.915794,    eval_loss=1.963054,    eval_acc=0.589824
epoch:   9,    lr=0.000300,    loss=0.188988,    train_acc=0.928952,    eval_loss=1.898271,    eval_acc=0.601024
epoch:  10,    lr=0.000300,    loss=0.163958,    train_acc=0.939494,    eval_loss=2.235737,    eval_acc=0.581664
Saving checkpoint ./model/CLDNN_cv1_step3_epoch10.pt
epoch:  11,    lr=0.000300,    loss=0.145780,    train_acc=0.945786,    eval_loss=2.317862,    eval_acc=0.577330
epoch:  12,    lr=0.000300,    loss=0.133252,    train_acc=0.951138,    eval_loss=2.246335,    eval_acc=0.587911
epoch:  13,    lr=0.000300,    loss=0.117980,    train_acc=0.956601,    eval_loss=2.528128,    eval_acc=0.593314
epoch:  14,    lr=0.000300,    loss=0.107323,    train_acc=0.960819,    eval_loss=2.660305,    eval_acc=0.587911
epoch:  15,    lr=0.000300,    loss=0.09975

epoch:  14,    lr=0.000300,    loss=0.121710,    train_acc=0.955156,    eval_loss=2.509588,    eval_acc=0.570464
epoch:  15,    lr=0.000300,    loss=0.114566,    train_acc=0.958308,    eval_loss=2.345054,    eval_acc=0.593145
Saving checkpoint ./model/CLDNN_cv1_step5_epoch15.pt
epoch:  16,    lr=0.000300,    loss=0.106815,    train_acc=0.960638,    eval_loss=2.588459,    eval_acc=0.567931
epoch:  17,    lr=0.000300,    loss=0.098365,    train_acc=0.963361,    eval_loss=2.467973,    eval_acc=0.570689
epoch:  18,    lr=0.000300,    loss=0.090904,    train_acc=0.966551,    eval_loss=2.561226,    eval_acc=0.582620
epoch:  19,    lr=0.000300,    loss=0.086401,    train_acc=0.968015,    eval_loss=2.740087,    eval_acc=0.590556
epoch:  20,    lr=0.000300,    loss=0.080336,    train_acc=0.970345,    eval_loss=2.875748,    eval_acc=0.576542
Saving checkpoint ./model/CLDNN_cv1_step5_epoch20.pt
epoch:  21,    lr=0.000300,    loss=0.074371,    train_acc=0.972712,    eval_loss=2.833510,    eval_acc

11it [00:00, 107.47it/s]

epoch:  30,    lr=0.000300,    loss=0.049010,    train_acc=0.982917,    eval_loss=2.527106,    eval_acc=0.579300
Saving checkpoint ./model/CLDNN_cv1_step5_epoch30.pt


4480it [00:42, 106.37it/s]
498it [00:04, 107.69it/s]
553it [00:05, 104.45it/s]


(160266, 40, 100, 1) (160266,) (16965, 40, 100, 1) (16965,) (21904, 40, 100, 1) (21904,)
Start training
epoch:   1,    lr=0.000300,    loss=0.962321,    train_acc=0.583616,    eval_loss=0.982573,    eval_acc=0.601474
epoch:   2,    lr=0.000300,    loss=0.796606,    train_acc=0.672206,    eval_loss=1.036447,    eval_acc=0.594813
epoch:   3,    lr=0.000300,    loss=0.650585,    train_acc=0.738837,    eval_loss=1.120555,    eval_acc=0.595933
epoch:   4,    lr=0.000300,    loss=0.517913,    train_acc=0.796288,    eval_loss=1.202750,    eval_acc=0.607899
epoch:   5,    lr=0.000300,    loss=0.406772,    train_acc=0.842128,    eval_loss=1.499162,    eval_acc=0.593752
Saving checkpoint ./model/CLDNN_cv2_step1_epoch5.pt
epoch:   6,    lr=0.000300,    loss=0.318030,    train_acc=0.878544,    eval_loss=1.678538,    eval_acc=0.594400
epoch:   7,    lr=0.000300,    loss=0.255488,    train_acc=0.903018,    eval_loss=1.895443,    eval_acc=0.585087
epoch:   8,    lr=0.000300,    loss=0.212603,    trai

epoch:   7,    lr=0.000300,    loss=0.279483,    train_acc=0.893158,    eval_loss=1.731427,    eval_acc=0.598408
epoch:   8,    lr=0.000300,    loss=0.228523,    train_acc=0.913526,    eval_loss=1.868662,    eval_acc=0.597760
epoch:   9,    lr=0.000300,    loss=0.194556,    train_acc=0.926549,    eval_loss=1.964434,    eval_acc=0.594695
epoch:  10,    lr=0.000300,    loss=0.169577,    train_acc=0.936857,    eval_loss=2.133591,    eval_acc=0.607132
Saving checkpoint ./model/CLDNN_cv2_step3_epoch10.pt
epoch:  11,    lr=0.000300,    loss=0.150659,    train_acc=0.944252,    eval_loss=2.169638,    eval_acc=0.588800
epoch:  12,    lr=0.000300,    loss=0.128213,    train_acc=0.952757,    eval_loss=2.392945,    eval_acc=0.583672
epoch:  13,    lr=0.000300,    loss=0.121096,    train_acc=0.955914,    eval_loss=2.318557,    eval_acc=0.594636
epoch:  14,    lr=0.000300,    loss=0.107647,    train_acc=0.960869,    eval_loss=2.410387,    eval_acc=0.592750
epoch:  15,    lr=0.000300,    loss=0.10186

epoch:  14,    lr=0.000300,    loss=0.112065,    train_acc=0.958728,    eval_loss=2.652373,    eval_acc=0.589272
epoch:  15,    lr=0.000300,    loss=0.100908,    train_acc=0.963783,    eval_loss=2.252653,    eval_acc=0.598939
Saving checkpoint ./model/CLDNN_cv2_step5_epoch15.pt
epoch:  16,    lr=0.000300,    loss=0.092846,    train_acc=0.966142,    eval_loss=2.636539,    eval_acc=0.607427
epoch:  17,    lr=0.000300,    loss=0.089057,    train_acc=0.967677,    eval_loss=2.765015,    eval_acc=0.578367
epoch:  18,    lr=0.000300,    loss=0.080456,    train_acc=0.970547,    eval_loss=2.735434,    eval_acc=0.591630
epoch:  19,    lr=0.000300,    loss=0.075012,    train_acc=0.972918,    eval_loss=2.821199,    eval_acc=0.583672
epoch:  20,    lr=0.000300,    loss=0.070799,    train_acc=0.974584,    eval_loss=2.649642,    eval_acc=0.606661
Saving checkpoint ./model/CLDNN_cv2_step5_epoch20.pt
epoch:  21,    lr=0.000300,    loss=0.066737,    train_acc=0.976244,    eval_loss=2.846184,    eval_acc

12it [00:00, 102.25it/s]

epoch:  30,    lr=0.000300,    loss=0.042882,    train_acc=0.984962,    eval_loss=3.103189,    eval_acc=0.601356
Saving checkpoint ./model/CLDNN_cv2_step5_epoch30.pt


4480it [00:30, 145.03it/s]
498it [00:03, 150.24it/s]
553it [00:03, 152.95it/s]


(161435, 40, 100, 1) (161435,) (17358, 40, 100, 1) (17358,) (20342, 40, 100, 1) (20342,)
Start training
epoch:   1,    lr=0.000300,    loss=0.958633,    train_acc=0.588837,    eval_loss=1.045775,    eval_acc=0.564984
epoch:   2,    lr=0.000300,    loss=0.807971,    train_acc=0.668833,    eval_loss=0.937292,    eval_acc=0.618216
epoch:   3,    lr=0.000300,    loss=0.676733,    train_acc=0.727913,    eval_loss=0.994456,    eval_acc=0.627319
epoch:   4,    lr=0.000300,    loss=0.551192,    train_acc=0.782923,    eval_loss=1.031934,    eval_acc=0.632158
epoch:   5,    lr=0.000300,    loss=0.440615,    train_acc=0.829098,    eval_loss=1.296534,    eval_acc=0.614068
Saving checkpoint ./model/CLDNN_cv3_step1_epoch5.pt
epoch:   6,    lr=0.000300,    loss=0.354628,    train_acc=0.863774,    eval_loss=1.448912,    eval_acc=0.617237
epoch:   7,    lr=0.000300,    loss=0.291904,    train_acc=0.888184,    eval_loss=1.443621,    eval_acc=0.628586
epoch:   8,    lr=0.000300,    loss=0.242955,    trai

epoch:   7,    lr=0.000300,    loss=0.291693,    train_acc=0.888977,    eval_loss=1.573533,    eval_acc=0.619253
epoch:   8,    lr=0.000300,    loss=0.241088,    train_acc=0.909013,    eval_loss=1.649671,    eval_acc=0.617525
epoch:   9,    lr=0.000300,    loss=0.206471,    train_acc=0.922674,    eval_loss=1.952673,    eval_acc=0.619657
epoch:  10,    lr=0.000300,    loss=0.178618,    train_acc=0.932773,    eval_loss=2.049126,    eval_acc=0.612801
Saving checkpoint ./model/CLDNN_cv3_step3_epoch10.pt
epoch:  11,    lr=0.000300,    loss=0.160473,    train_acc=0.940313,    eval_loss=1.903753,    eval_acc=0.604851
epoch:  12,    lr=0.000300,    loss=0.140044,    train_acc=0.948231,    eval_loss=2.123794,    eval_acc=0.608307
epoch:  13,    lr=0.000300,    loss=0.126931,    train_acc=0.952970,    eval_loss=2.266547,    eval_acc=0.615970
epoch:  14,    lr=0.000300,    loss=0.118139,    train_acc=0.956669,    eval_loss=2.108441,    eval_acc=0.601049
epoch:  15,    lr=0.000300,    loss=0.10883

epoch:  14,    lr=0.000300,    loss=0.120924,    train_acc=0.955603,    eval_loss=2.204106,    eval_acc=0.595345
epoch:  15,    lr=0.000300,    loss=0.113366,    train_acc=0.958781,    eval_loss=2.156382,    eval_acc=0.613608
Saving checkpoint ./model/CLDNN_cv3_step5_epoch15.pt
epoch:  16,    lr=0.000300,    loss=0.105232,    train_acc=0.961545,    eval_loss=2.250481,    eval_acc=0.605254
epoch:  17,    lr=0.000300,    loss=0.093378,    train_acc=0.965869,    eval_loss=2.664492,    eval_acc=0.582268
epoch:  18,    lr=0.000300,    loss=0.090421,    train_acc=0.966693,    eval_loss=2.266664,    eval_acc=0.610209
epoch:  19,    lr=0.000300,    loss=0.084642,    train_acc=0.969456,    eval_loss=2.279340,    eval_acc=0.620924
epoch:  20,    lr=0.000300,    loss=0.080031,    train_acc=0.971364,    eval_loss=2.492576,    eval_acc=0.606637
Saving checkpoint ./model/CLDNN_cv3_step5_epoch20.pt
epoch:  21,    lr=0.000300,    loss=0.074307,    train_acc=0.973223,    eval_loss=2.304383,    eval_acc

14it [00:00, 133.03it/s]

epoch:  30,    lr=0.000300,    loss=0.046094,    train_acc=0.983724,    eval_loss=2.680431,    eval_acc=0.612340
Saving checkpoint ./model/CLDNN_cv3_step5_epoch30.pt


4480it [00:30, 148.49it/s]
498it [00:03, 159.37it/s]
553it [00:03, 159.70it/s]


(162094, 40, 100, 1) (162094,) (17091, 40, 100, 1) (17091,) (19950, 40, 100, 1) (19950,)
Start training
epoch:   1,    lr=0.000300,    loss=0.978721,    train_acc=0.574318,    eval_loss=0.945522,    eval_acc=0.615061
epoch:   2,    lr=0.000300,    loss=0.825190,    train_acc=0.655565,    eval_loss=1.013548,    eval_acc=0.606108
epoch:   3,    lr=0.000300,    loss=0.693882,    train_acc=0.717016,    eval_loss=0.989829,    eval_acc=0.630741
epoch:   4,    lr=0.000300,    loss=0.560266,    train_acc=0.777683,    eval_loss=1.120637,    eval_acc=0.612545
epoch:   5,    lr=0.000300,    loss=0.445862,    train_acc=0.825823,    eval_loss=1.333178,    eval_acc=0.627465
Saving checkpoint ./model/CLDNN_cv4_step1_epoch5.pt
epoch:   6,    lr=0.000300,    loss=0.359188,    train_acc=0.861887,    eval_loss=1.432594,    eval_acc=0.618571
epoch:   7,    lr=0.000300,    loss=0.291919,    train_acc=0.888046,    eval_loss=1.439730,    eval_acc=0.620385
epoch:   8,    lr=0.000300,    loss=0.245504,    trai

In [10]:
focus_attn_list = [False]

In [11]:
for with_focus_attn in focus_attn_list:
    cv_eval = []
    cv_iter = 0
    #with_focus_attn = True if(with_focus_attn == 'true') else False
    for train_index, eval_index in skf.split(df['sample_name'], df['sample_label']):
        '''
    for ses_num in range(1, 6):
        eval_ses = 'Ses0' + str(ses_num)
        train_index = []
        eval_index = []
        for i, sample_name in enumerate(df['sample_name']):
            if eval_ses in sample_name:
                eval_index.append(i)
            else:
                train_index.append(i)
        '''
        cv_iter += 1
        conv_dim = '1d'
        checkpoint = ''
        hidden_size = 64
        num_layers = 2
        bidirectional = 'true'

        batch_size = 128
        num_epochs = 30
        learning_rate = 0.0003

        use_warmup = 'false'
        data_dir = './wav_data/pretrain/IEMOCAP_sub/'
        multi_task = 'false'
        augmentation = 'false'
        
        save_checkpoint_steps = 5
        output_dir = './without_fa'
        #label_path = './wav_data/pretrain/IEMOCAP_vad/'

        bidirectional = True if(bidirectional == 'true') else False  
        n_mfcc = 40 if(conv_dim == '1d') else 128

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        
        train_samples, eval_samples = df['sample_name'][train_index], df['sample_name'][eval_index]
        train_label, eval_label = df['sample_label'][train_index], df['sample_label'][eval_index]
        #train_vad = [label_path + train_sample + '.npy' for train_sample in train_samples]
        #eval_vad = [label_path + eval_sample + '.npy' for eval_sample in eval_samples]
        
        train_samples = [data_dir + train_sample + '.wav' for train_sample in train_samples]
        eval_samples = [data_dir + eval_sample + '.wav' for eval_sample in eval_samples]

        y_train = np.array(train_label)
        y_eval = np.array(eval_label)
        
        #train_vad_list = []
        #for i in range(len(train_vad)):
        #    vad = np.load(train_vad[i])
        #    train_vad_list.append(vad)
        #vad_train = np.concatenate(train_vad_list)

        #eval_vad_list = []
        #for i in range(len(eval_vad)):
        #    vad = np.load(eval_vad[i])
        #    eval_vad_list.append(vad)
        #vad_eval = np.concatenate(eval_vad_list)

        if(multi_task == 'true'):
            speaker = np.array(list(map(lambda x: int(x.split('/')[-1].split('-')[-1].split('.')[0]), sample_datas)))
            y_gender = np.array(list(map(lambda x: 1 if x % 2 ==0 else 0, speaker)))

            y_g_train = y_gender[train_idx]
            y_g_eval = y_gender[eval_idx]

        X_train, y_train = convert_spectrograms(train_samples, conv_dim=conv_dim, sr=16000, labels=y_train)
        X_eval, y_eval = convert_spectrograms(eval_samples, conv_dim=conv_dim, sr=16000, labels=y_eval)
        print(X_train.shape, y_train.shape, X_eval.shape, y_eval.shape)
        
        #X_train = X_train[vad_train]
        #y_train = y_train[vad_train]
        #X_eval = X_eval[vad_eval]
        #y_eval = y_eval[vad_eval]
        #print(X_train.shape, y_train.shape, X_eval.shape, y_eval.shape)

        if(augmentation == 'true'):
            X_train_flip = X_train[:, :, :, ::-1]
            y_train_flip = y_train.copy()

            X_train = np.concatenate((X_train, X_train_flip), axis=0)
            y_train = np.concatenate((y_train, y_train_flip), axis=0)

        X_train, y_train = convert_tensor(X_train, y_train)
        X_eval, y_eval = convert_tensor(X_eval, y_eval)

        y_train = y_train.long()
        y_eval = y_eval.long()

        if(multi_task == 'true'):
            _, y_g_train = convert_spectrograms(train_samples, conv_dim=conv_dim, sr=16000, labels=y_g_train)
            _, y_g_eval = convert_spectrograms(eval_samples, conv_dim=conv_dim, sr=16000, labels=y_g_eval)

            if(augmentation == 'true'):
                y_g_train_flip = y_g_train.copy()
                y_g_train = np.concatenate((y_g_train, y_g_train_flip))

            y_g_train = torch.tensor(y_g_train).float()
            y_g_eval = torch.tensor(y_g_eval).float()

            y_g_train = y_g_train.unsqueeze(-1)
            y_g_eval = y_g_eval.unsqueeze(-1)

        if(multi_task == 'true'):
            train_ds = TensorDataset(X_train, y_train, y_g_train)
            eval_ds = TensorDataset(X_eval, y_eval, y_g_eval)
        else:
            train_ds = TensorDataset(X_train, y_train)
            eval_ds = TensorDataset(X_eval, y_eval)

        train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True)
        eval_dataloader = DataLoader(eval_ds, batch_size=batch_size, num_workers=0)

        for en in range(2):
            model = CLDNN(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
                          num_layers=num_layers, bidirectional=bidirectional,
                          with_focus_attn=with_focus_attn).to(device)

            if(multi_task == 'true'):
                model_g = CLDNN_G(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
                                    num_layers=num_layers, bidirectional=bidirectional,
                                    with_focus_attn=with_focus_attn).to(device)

            if(multi_task == 'true'):
                loss_func = nn.CrossEntropyLoss()
                loss_func_g = nn.BCELoss()
                optimizer = optim.Adam(list(model.parameters()) + list(model_g.parameters()), lr=learning_rate)
            else:
                loss_func = nn.CrossEntropyLoss()
                optimizer = optim.Adam(model.parameters(), lr=learning_rate)

            if(use_warmup == 'true'):
                t_total = len(train_dataloader) // 1 * num_epochs
                opt_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=t_total * 0.1, t_total=t_total)

            eval_acc = train(train_dataloader, eval_dataloader, num_epochs)
            cv_eval.append(eval_acc)

    print('conv_dim:', conv_dim, '\twith_focus_attn:', with_focus_attn)
    print('Test accuray:', cv_eval)
    print()

4041it [00:29, 134.73it/s]
449it [00:03, 115.23it/s]


(163254, 40, 50, 1) (163254,) (18406, 40, 50, 1) (18406,)
Start training
epoch:   1,    lr=0.000300,    loss=0.958399,    train_acc=0.608248,    eval_loss=0.885174,    eval_acc=0.621156
epoch:   2,    lr=0.000300,    loss=0.866098,    train_acc=0.652151,    eval_loss=0.863214,    eval_acc=0.633706
epoch:   3,    lr=0.000300,    loss=0.805395,    train_acc=0.681691,    eval_loss=0.919529,    eval_acc=0.614202
epoch:   4,    lr=0.000300,    loss=0.750803,    train_acc=0.704289,    eval_loss=0.906092,    eval_acc=0.638542
epoch:   5,    lr=0.000300,    loss=0.692407,    train_acc=0.727739,    eval_loss=0.979317,    eval_acc=0.619092
Saving checkpoint ./without_fa/CLDNN_cv1_step1_epoch5.pt
epoch:   6,    lr=0.000300,    loss=0.633242,    train_acc=0.751605,    eval_loss=0.987517,    eval_acc=0.628925
epoch:   7,    lr=0.000300,    loss=0.580896,    train_acc=0.772935,    eval_loss=1.104126,    eval_acc=0.606270
epoch:   8,    lr=0.000300,    loss=0.530665,    train_acc=0.793946,    eval_lo

13it [00:00, 127.98it/s]

epoch:  30,    lr=0.000300,    loss=0.126366,    train_acc=0.953217,    eval_loss=2.438709,    eval_acc=0.586982
Saving checkpoint ./without_fa/CLDNN_cv1_step2_epoch30.pt


4041it [00:27, 149.25it/s]
449it [00:04, 109.34it/s]


(162195, 40, 50, 1) (162195,) (19465, 40, 50, 1) (19465,)
Start training
epoch:   1,    lr=0.000300,    loss=0.963145,    train_acc=0.603585,    eval_loss=1.011702,    eval_acc=0.596866
epoch:   2,    lr=0.000300,    loss=0.867618,    train_acc=0.649998,    eval_loss=1.030117,    eval_acc=0.587824
epoch:   3,    lr=0.000300,    loss=0.813730,    train_acc=0.676623,    eval_loss=0.992731,    eval_acc=0.616748
epoch:   4,    lr=0.000300,    loss=0.761384,    train_acc=0.698667,    eval_loss=1.088680,    eval_acc=0.603545
epoch:   5,    lr=0.000300,    loss=0.710600,    train_acc=0.721124,    eval_loss=1.135838,    eval_acc=0.611611
Saving checkpoint ./without_fa/CLDNN_cv2_step1_epoch5.pt
epoch:   6,    lr=0.000300,    loss=0.655848,    train_acc=0.743008,    eval_loss=1.143851,    eval_acc=0.606165
epoch:   7,    lr=0.000300,    loss=0.603269,    train_acc=0.764373,    eval_loss=1.166253,    eval_acc=0.615412
epoch:   8,    lr=0.000300,    loss=0.555519,    train_acc=0.782970,    eval_lo

11it [00:00, 101.82it/s]

epoch:  30,    lr=0.000300,    loss=0.127112,    train_acc=0.952545,    eval_loss=3.079246,    eval_acc=0.552068
Saving checkpoint ./without_fa/CLDNN_cv2_step2_epoch30.pt


4041it [00:27, 147.67it/s]
449it [00:03, 145.05it/s]


(162222, 40, 50, 1) (162222,) (19438, 40, 50, 1) (19438,)
Start training
epoch:   1,    lr=0.000300,    loss=0.955991,    train_acc=0.608536,    eval_loss=1.070088,    eval_acc=0.569040
epoch:   2,    lr=0.000300,    loss=0.862909,    train_acc=0.653099,    eval_loss=1.018291,    eval_acc=0.586789
epoch:   3,    lr=0.000300,    loss=0.801538,    train_acc=0.680360,    eval_loss=1.099531,    eval_acc=0.568063
epoch:   4,    lr=0.000300,    loss=0.747105,    train_acc=0.704149,    eval_loss=1.159921,    eval_acc=0.559831
epoch:   5,    lr=0.000300,    loss=0.689680,    train_acc=0.727531,    eval_loss=1.344985,    eval_acc=0.548873
Saving checkpoint ./without_fa/CLDNN_cv3_step1_epoch5.pt
epoch:   6,    lr=0.000300,    loss=0.635084,    train_acc=0.749427,    eval_loss=1.257525,    eval_acc=0.553812
epoch:   7,    lr=0.000300,    loss=0.583071,    train_acc=0.770724,    eval_loss=1.439722,    eval_acc=0.562661
epoch:   8,    lr=0.000300,    loss=0.532597,    train_acc=0.790302,    eval_lo

5it [00:00, 45.24it/s]

epoch:  30,    lr=0.000300,    loss=0.118115,    train_acc=0.956381,    eval_loss=3.483854,    eval_acc=0.552732
Saving checkpoint ./without_fa/CLDNN_cv3_step2_epoch30.pt


4041it [00:48, 83.77it/s] 
449it [00:03, 140.86it/s]


(163779, 40, 50, 1) (163779,) (17881, 40, 50, 1) (17881,)
Start training
epoch:   1,    lr=0.000300,    loss=0.968213,    train_acc=0.598044,    eval_loss=0.841609,    eval_acc=0.691684
epoch:   2,    lr=0.000300,    loss=0.876354,    train_acc=0.643685,    eval_loss=0.896241,    eval_acc=0.651530
epoch:   3,    lr=0.000300,    loss=0.814743,    train_acc=0.672920,    eval_loss=0.883939,    eval_acc=0.669649
epoch:   4,    lr=0.000300,    loss=0.757723,    train_acc=0.697640,    eval_loss=0.937736,    eval_acc=0.664113
epoch:   5,    lr=0.000300,    loss=0.697745,    train_acc=0.723105,    eval_loss=0.990314,    eval_acc=0.650858
Saving checkpoint ./without_fa/CLDNN_cv4_step1_epoch5.pt
epoch:   6,    lr=0.000300,    loss=0.639346,    train_acc=0.746928,    eval_loss=1.045377,    eval_acc=0.625468
epoch:   7,    lr=0.000300,    loss=0.580890,    train_acc=0.772002,    eval_loss=1.125401,    eval_acc=0.618422
epoch:   8,    lr=0.000300,    loss=0.528184,    train_acc=0.792019,    eval_lo

11it [00:00, 106.76it/s]

epoch:  30,    lr=0.000300,    loss=0.122739,    train_acc=0.953895,    eval_loss=2.655960,    eval_acc=0.589620
Saving checkpoint ./without_fa/CLDNN_cv4_step2_epoch30.pt


4041it [00:28, 141.77it/s]
449it [00:03, 139.29it/s]


(164646, 40, 50, 1) (164646,) (17014, 40, 50, 1) (17014,)
Start training
epoch:   1,    lr=0.000300,    loss=0.950255,    train_acc=0.611696,    eval_loss=1.076237,    eval_acc=0.530034
epoch:   2,    lr=0.000300,    loss=0.856248,    train_acc=0.657022,    eval_loss=1.015628,    eval_acc=0.580522
epoch:   3,    lr=0.000300,    loss=0.798171,    train_acc=0.682956,    eval_loss=1.093912,    eval_acc=0.549841
epoch:   4,    lr=0.000300,    loss=0.744014,    train_acc=0.706454,    eval_loss=1.127306,    eval_acc=0.576114
epoch:   5,    lr=0.000300,    loss=0.697456,    train_acc=0.725937,    eval_loss=1.257701,    eval_acc=0.518984
Saving checkpoint ./without_fa/CLDNN_cv5_step1_epoch5.pt
epoch:   6,    lr=0.000300,    loss=0.642484,    train_acc=0.747163,    eval_loss=1.211593,    eval_acc=0.556718
epoch:   7,    lr=0.000300,    loss=0.594002,    train_acc=0.766895,    eval_loss=1.203899,    eval_acc=0.573057
epoch:   8,    lr=0.000300,    loss=0.550636,    train_acc=0.784300,    eval_lo

13it [00:00, 122.97it/s]

epoch:  30,    lr=0.000300,    loss=0.126328,    train_acc=0.953161,    eval_loss=3.000361,    eval_acc=0.518808
Saving checkpoint ./without_fa/CLDNN_cv5_step2_epoch30.pt


4041it [00:26, 152.31it/s]
449it [00:02, 171.47it/s]


(164203, 40, 50, 1) (164203,) (17457, 40, 50, 1) (17457,)
Start training
epoch:   1,    lr=0.000300,    loss=0.963012,    train_acc=0.603817,    eval_loss=0.982421,    eval_acc=0.584866
epoch:   2,    lr=0.000300,    loss=0.873073,    train_acc=0.648419,    eval_loss=1.044571,    eval_acc=0.558973
epoch:   3,    lr=0.000300,    loss=0.816475,    train_acc=0.674459,    eval_loss=1.057210,    eval_acc=0.580913
epoch:   4,    lr=0.000300,    loss=0.760338,    train_acc=0.699682,    eval_loss=1.093211,    eval_acc=0.576216
epoch:   5,    lr=0.000300,    loss=0.707037,    train_acc=0.721498,    eval_loss=1.200287,    eval_acc=0.554906
Saving checkpoint ./without_fa/CLDNN_cv6_step1_epoch5.pt
epoch:   6,    lr=0.000300,    loss=0.654998,    train_acc=0.743022,    eval_loss=1.225720,    eval_acc=0.585324
epoch:   7,    lr=0.000300,    loss=0.600655,    train_acc=0.764443,    eval_loss=1.354677,    eval_acc=0.558229
epoch:   8,    lr=0.000300,    loss=0.553672,    train_acc=0.783730,    eval_lo

10it [00:00, 98.95it/s]

epoch:  30,    lr=0.000300,    loss=0.124966,    train_acc=0.953418,    eval_loss=3.168618,    eval_acc=0.519906
Saving checkpoint ./without_fa/CLDNN_cv6_step2_epoch30.pt


4041it [00:27, 146.96it/s]
449it [00:02, 162.15it/s]


(162851, 40, 50, 1) (162851,) (18809, 40, 50, 1) (18809,)
Start training
epoch:   1,    lr=0.000300,    loss=0.949125,    train_acc=0.608779,    eval_loss=1.075618,    eval_acc=0.569515
epoch:   2,    lr=0.000300,    loss=0.853187,    train_acc=0.656668,    eval_loss=1.159178,    eval_acc=0.579191
epoch:   3,    lr=0.000300,    loss=0.798412,    train_acc=0.681198,    eval_loss=1.104127,    eval_acc=0.565952
epoch:   4,    lr=0.000300,    loss=0.740460,    train_acc=0.705367,    eval_loss=1.240762,    eval_acc=0.551598
epoch:   5,    lr=0.000300,    loss=0.682399,    train_acc=0.731163,    eval_loss=1.258379,    eval_acc=0.541230
Saving checkpoint ./without_fa/CLDNN_cv7_step1_epoch5.pt
epoch:   6,    lr=0.000300,    loss=0.630139,    train_acc=0.751990,    eval_loss=1.285888,    eval_acc=0.553139
epoch:   7,    lr=0.000300,    loss=0.577977,    train_acc=0.773536,    eval_loss=1.431580,    eval_acc=0.568239
epoch:   8,    lr=0.000300,    loss=0.529204,    train_acc=0.792815,    eval_lo

13it [00:00, 128.11it/s]

epoch:  30,    lr=0.000300,    loss=0.111277,    train_acc=0.958929,    eval_loss=3.369441,    eval_acc=0.518103
Saving checkpoint ./without_fa/CLDNN_cv7_step2_epoch30.pt


4041it [00:26, 149.84it/s]
449it [00:02, 163.47it/s]


(163836, 40, 50, 1) (163836,) (17824, 40, 50, 1) (17824,)
Start training
epoch:   1,    lr=0.000300,    loss=0.953402,    train_acc=0.607518,    eval_loss=1.057804,    eval_acc=0.568447
epoch:   2,    lr=0.000300,    loss=0.857802,    train_acc=0.654088,    eval_loss=1.053680,    eval_acc=0.585391
epoch:   3,    lr=0.000300,    loss=0.793410,    train_acc=0.682143,    eval_loss=1.086258,    eval_acc=0.581688
epoch:   4,    lr=0.000300,    loss=0.735845,    train_acc=0.707920,    eval_loss=1.163308,    eval_acc=0.569345
epoch:   5,    lr=0.000300,    loss=0.678747,    train_acc=0.733440,    eval_loss=1.160378,    eval_acc=0.582641
Saving checkpoint ./without_fa/CLDNN_cv8_step1_epoch5.pt
epoch:   6,    lr=0.000300,    loss=0.625116,    train_acc=0.754270,    eval_loss=1.357103,    eval_acc=0.534953
epoch:   7,    lr=0.000300,    loss=0.572517,    train_acc=0.776394,    eval_loss=1.607847,    eval_acc=0.543986
epoch:   8,    lr=0.000300,    loss=0.527338,    train_acc=0.793613,    eval_lo

13it [00:00, 127.33it/s]

epoch:  30,    lr=0.000300,    loss=0.137870,    train_acc=0.948373,    eval_loss=3.024223,    eval_acc=0.512960
Saving checkpoint ./without_fa/CLDNN_cv8_step2_epoch30.pt


4041it [00:32, 122.78it/s]
449it [00:02, 159.24it/s]


(164704, 40, 50, 1) (164704,) (16956, 40, 50, 1) (16956,)
Start training
epoch:   1,    lr=0.000300,    loss=0.960736,    train_acc=0.604351,    eval_loss=1.030538,    eval_acc=0.562869
epoch:   2,    lr=0.000300,    loss=0.864636,    train_acc=0.651147,    eval_loss=0.997738,    eval_acc=0.588818
epoch:   3,    lr=0.000300,    loss=0.800115,    train_acc=0.680198,    eval_loss=1.019605,    eval_acc=0.589408
epoch:   4,    lr=0.000300,    loss=0.741715,    train_acc=0.705828,    eval_loss=1.035297,    eval_acc=0.611701
epoch:   5,    lr=0.000300,    loss=0.683605,    train_acc=0.731204,    eval_loss=1.076464,    eval_acc=0.599670
Saving checkpoint ./without_fa/CLDNN_cv9_step1_epoch5.pt
epoch:   6,    lr=0.000300,    loss=0.627608,    train_acc=0.753657,    eval_loss=1.063668,    eval_acc=0.612880
epoch:   7,    lr=0.000300,    loss=0.572264,    train_acc=0.776694,    eval_loss=1.251726,    eval_acc=0.589054
epoch:   8,    lr=0.000300,    loss=0.523704,    train_acc=0.796954,    eval_lo

13it [00:00, 126.67it/s]

epoch:  30,    lr=0.000300,    loss=0.131913,    train_acc=0.950877,    eval_loss=2.679483,    eval_acc=0.571656
Saving checkpoint ./without_fa/CLDNN_cv9_step2_epoch30.pt


4041it [00:25, 157.13it/s]
449it [00:02, 160.64it/s]


(163250, 40, 50, 1) (163250,) (18410, 40, 50, 1) (18410,)
Start training
epoch:   1,    lr=0.000300,    loss=0.952879,    train_acc=0.605717,    eval_loss=1.021549,    eval_acc=0.584356
epoch:   2,    lr=0.000300,    loss=0.855710,    train_acc=0.656618,    eval_loss=1.052291,    eval_acc=0.579576
epoch:   3,    lr=0.000300,    loss=0.796527,    train_acc=0.682843,    eval_loss=1.079504,    eval_acc=0.573927
epoch:   4,    lr=0.000300,    loss=0.741664,    train_acc=0.706991,    eval_loss=1.121610,    eval_acc=0.572461
epoch:   5,    lr=0.000300,    loss=0.684110,    train_acc=0.730919,    eval_loss=1.323383,    eval_acc=0.550896
Saving checkpoint ./without_fa/CLDNN_cv10_step1_epoch5.pt
epoch:   6,    lr=0.000300,    loss=0.630616,    train_acc=0.753180,    eval_loss=1.217440,    eval_acc=0.580554
epoch:   7,    lr=0.000300,    loss=0.577637,    train_acc=0.775221,    eval_loss=1.321793,    eval_acc=0.549538
epoch:   8,    lr=0.000300,    loss=0.530293,    train_acc=0.794534,    eval_l

In [10]:
with_focus_attn = [0.6570155902004454, 0.6124721603563474, 0.6347438752783965, 
                   0.6926503340757239, 0.6280623608017817, 0.6169265033407573, 
                   0.5812917594654788, 0.6013363028953229, 0.6948775055679287, 0.6792873051224945]
without_focus_attn = [0.6347438752783965, 0.6258351893095768, 0.6057906458797327, 
                      0.7037861915367484, 0.6035634743875279, 0.5946547884187082, 
                      0.6146993318485523, 0.6325167037861915, 0.688195991091314, 0.6681514476614699]

In [11]:
print('with_focus_attn:', sum(with_focus_attn) / 10)
print('without_focus_attn:', sum(without_focus_attn) / 10)

with_focus_attn: 0.6398663697104678
without_focus_attn: 0.6371937639198219


In [12]:
sum([p.numel() for p in model.parameters()])

1327044

In [13]:
model

CLDNN(
  (conv1): Sequential(
    (0): Conv1d(1, 64, kernel_size=(3, 1), stride=(1,))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=(2, 1), stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv1d(64, 64, kernel_size=(3, 1), stride=(1,))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=(2, 1), stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
  )
  (lstm): LSTM(512, 64, num_layers=2, bidirectional=True)
  (tanh): Tanh()
  (dropout): Dropout(p=0.5, inplace=False)
  (attn): MultiHeadedAttention(
    (query): Linear(in_features=128, out_features=128, bias=True)
    (key): Linear(in_features=128, out_features=128, bias=True)
    (value): Linear(in_features=128, out_features=128, bias=True)
    (o_proj): Linear(in_features=128, out_features=128, bias=True)
    (dropout): 

In [12]:
with_focus_attn = [0.623608017817372, 0.6503340757238307, 0.6280623608017817, 
                   0.7015590200445434, 0.6302895322939867, 0.6057906458797327, 
                   0.6057906458797327, 0.6102449888641426, 0.6547884187082406, 0.6570155902004454]
without_focus_attn = [0.6302895322939867, 0.6391982182628062, 0.623608017817372, 
                      0.7082405345211581, 0.6124721603563474, 0.6035634743875279, 
                      0.6102449888641426, 0.6124721603563474, 0.6547884187082406, 0.6636971046770601]

In [13]:
print('with_focus_attn:', sum(with_focus_attn) / 10)
print('without_focus_attn:', sum(without_focus_attn) / 10)

with_focus_attn: 0.6367483296213808
without_focus_attn: 0.6358574610244988


In [10]:
class CLDNN(nn.Module):
    def __init__(self, conv_dim, checkpoint=None, hidden_size=128, num_layers=2,
                 bidirectional=True, with_focus_attn=False):
        super(CLDNN, self).__init__()
        self.conv_dim = conv_dim
        if(conv_dim == '1d'):
            self.encoder = Encoder(conv_dim)
            if checkpoint:
                self.encoder.load_state_dict(torch.load(checkpoint))
            self.lstm = nn.LSTM(8, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)
            self.tanh = nn.Tanh()
            self.dropout = nn.Dropout(0.1)
            self.attn = MultiHeadedAttention(num_attn_heads=4, attn_hidden_size=256, dropout_prob=0.1,
                                             with_focus_attn=with_focus_attn)
            self.fc = nn.Sequential(
                nn.Linear(50*2*hidden_size if bidirectional else 50*hidden_size, 64),
                nn.ReLU(),
                nn.Linear(64, 4)
            )
        elif(conv_dim == '2d'):
            self.encoder = Encoder(conv_dim)
            if checkpoint:
                self.encoder.load_state_dict(torch.load(checkpoint))
            self.attn = MultiHeadedAttention(num_attn_heads=4, attn_hidden_size=176, dropout_prob=0.1, 
                                             with_focus_attn=with_focus_attn)
            self.gap = nn.AdaptiveAvgPool2d((1, 11))
            self.lstm = nn.LSTM(11, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional) 
            self.fc = nn.Sequential(
                nn.Linear(hidden_size*2 if bidirectional else hidden_size, 4)
            )
        else:
            raise ValueError("Convolution dimension not found: %s" % (conv_dim))
            
    def forward(self, x):
        if(self.conv_dim == '1d'):
            out = self.encoder(x)  # (batch, 1, 40, 50) -> (batch, 8, 1, 50)
            out = torch.squeeze(out, 2)  # (batch, 8, 1, 50) -> (batch, 8, 50)
            out = out.permute(2, 0, 1)  # (batch, 8, 50) -> (50, batch, 8)
            out, _ = self.lstm(out)  # (50, batch, 8) -> (50, batch, num_directions*hidden_size)
            out = self.tanh(out)
            out = self.dropout(out)
            out = out.permute(1, 0, 2)  # (50, batch, num_directions*hidden_size) -> (batch, 50, num_directions*hidden_size)
            h = out
            out = self.attn(out) # (batch, 50, num_directions*hidden_size) -> (batch, 50, num_directions*hidden_size)
            out = h + out
            new_out_shape = out.size()[:1] + (out.size()[1] * out.size()[2],)
            out = out.reshape(*new_out_shape)  # (batch, 50, num_directions*hidden_size) -> (batch, 50*num_directions*hidden_size)
            out = self.fc(out)  # (batch, 50*num_directions*hidden_size) -> (batch, 4)
        elif(self.conv_dim == '2d'):
            out = self.encoder(x)  # (batch, 1, 128, 100) -> (batch, 16, 11, 8)
            out = out.permute(0, 3, 1, 2)  # (batch, 16, 11, 8) -> (batch, 8, 16, 11)
            h = out
            new_out_shape = out.size()[:2] + (out.size()[2] * out.size()[3],)
            out = out.view(*new_out_shape)  # (batch, 8, 16, 11) -> (batch, 8, 176)
            out = self.attn(out)  # (batch, 8, 176) -> (batch, 8, 176)
            out = out.view(h.size())  # (batch, 8, 176) -> (batch, 8, 16, 11)
            out = h + out
            out = self.gap(out)  # (batch, 8, 16, 11) -> (batch, 8, 1, 11)
            out = torch.squeeze(out, 2)  # (batch, 8, 1, 11) -> (batch, 8, 11)
            out = out.permute(1, 0, 2)  # (batch, 8, 11) -> (8, batch, 11)
            out, _ = self.lstm(out)  # (8, batch, 11) -> (8, batch, num_directions*hidden_size)
            out = out[-1]  # (8, batch, num_directions*hidden_size) -> (batch, num_directions*hidden_size)
            out = self.fc(out)  # (batch, num_directions*hidden_size) -> (batch, 1)
        return out

In [11]:
for with_focus_attn in focus_attn_list:
    cv_eval = []
    #with_focus_attn = True if(with_focus_attn == 'true') else False
    for train_index, eval_index in skf.split(df['sample_name'], df['sample_label']):
        conv_dim = '1d'
        checkpoint = ''
        hidden_size = 128
        num_layers = 2
        bidirectional = 'true'

        batch_size = 256
        num_epochs = 50
        learning_rate = 0.001

        use_warmup = 'true'
        data_dir = './wav_data/pretrain/IEMOCAP_sub/'
        multi_task = 'false'
        augmentation = 'false'
        
        save_checkpoint_steps = 5
        output_dir = './model'

        bidirectional = True if(bidirectional == 'true') else False   
        n_mfcc = 40 if(conv_dim == '1d') else 128

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        
        train_samples, eval_samples = df['sample_name'][train_index], df['sample_name'][eval_index]
        train_label, eval_label = df['sample_label'][train_index], df['sample_label'][eval_index]
        
        train_samples = [data_dir + train_sample + '.wav' for train_sample in train_samples]
        eval_samples = [data_dir + eval_sample + '.wav' for eval_sample in eval_samples]

        y_train = np.array(train_label)
        y_eval = np.array(eval_label)

        if(multi_task == 'true'):
            speaker = np.array(list(map(lambda x: int(x.split('/')[-1].split('-')[-1].split('.')[0]), sample_datas)))
            y_gender = np.array(list(map(lambda x: 1 if x % 2 ==0 else 0, speaker)))

            y_g_train = y_gender[train_idx]
            y_g_eval = y_gender[eval_idx]

        X_train, y_train = convert_spectrograms(train_samples, conv_dim=conv_dim, sr=16000, labels=y_train)
        X_eval, y_eval = convert_spectrograms(eval_samples, conv_dim=conv_dim, sr=16000, labels=y_eval)

        if(augmentation == 'true'):
            X_train_flip = X_train[:, :, :, ::-1]
            y_train_flip = y_train.copy()

            X_train = np.concatenate((X_train, X_train_flip), axis=0)
            y_train = np.concatenate((y_train, y_train_flip), axis=0)

        X_train, y_train = convert_tensor(X_train, y_train)
        X_eval, y_eval = convert_tensor(X_eval, y_eval)

        y_train = y_train.long()
        y_eval = y_eval.long()

        if(multi_task == 'true'):
            _, y_g_train = convert_spectrograms(train_samples, conv_dim=conv_dim, sr=16000, labels=y_g_train)
            _, y_g_eval = convert_spectrograms(eval_samples, conv_dim=conv_dim, sr=16000, labels=y_g_eval)

            if(augmentation == 'true'):
                y_g_train_flip = y_g_train.copy()
                y_g_train = np.concatenate((y_g_train, y_g_train_flip))

            y_g_train = torch.tensor(y_g_train).float()
            y_g_eval = torch.tensor(y_g_eval).float()

            y_g_train = y_g_train.unsqueeze(-1)
            y_g_eval = y_g_eval.unsqueeze(-1)

        if(multi_task == 'true'):
            train_ds = TensorDataset(X_train, y_train, y_g_train)
            eval_ds = TensorDataset(X_eval, y_eval, y_g_eval)
        else:
            train_ds = TensorDataset(X_train, y_train)
            eval_ds = TensorDataset(X_eval, y_eval)

        train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True)
        eval_dataloader = DataLoader(eval_ds, batch_size=batch_size, num_workers=0)

        model = CLDNN(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
                      num_layers=num_layers, bidirectional=bidirectional,
                      with_focus_attn=with_focus_attn).to(device)

        if(multi_task == 'true'):
            model_g = CLDNN_G(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
                                num_layers=num_layers, bidirectional=bidirectional,
                                with_focus_attn=with_focus_attn).to(device)

        if(multi_task == 'true'):
            loss_func = nn.CrossEntropyLoss()
            loss_func_g = nn.BCELoss()
            optimizer = optim.Adam(list(model.parameters()) + list(model_g.parameters()), lr=learning_rate)
        else:
            loss_func = nn.CrossEntropyLoss()
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        if(use_warmup == 'true'):
            t_total = len(train_dataloader) // 1 * num_epochs
            opt_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=t_total * 0.1, t_total=t_total)

        eval_acc = train(train_dataloader, eval_dataloader, num_epochs)
        cv_eval.append(eval_acc)

    print('conv_dim:', conv_dim, '\twith_focus_attn:', with_focus_attn)
    print('Test accuray:', cv_eval)
    print()

4041it [00:28, 141.44it/s]
449it [00:03, 139.56it/s]


Start training
epoch:   1,    lr=0.000200,    loss=1.145764,    train_acc=0.494389,    eval_loss=0.975452,    eval_acc=0.564436
epoch:   2,    lr=0.000400,    loss=0.999604,    train_acc=0.592781,    eval_loss=0.927232,    eval_acc=0.603390
epoch:   3,    lr=0.000600,    loss=0.949947,    train_acc=0.618561,    eval_loss=0.928751,    eval_acc=0.599315
epoch:   4,    lr=0.000800,    loss=0.923217,    train_acc=0.631525,    eval_loss=0.895027,    eval_acc=0.618005
epoch:   5,    lr=0.001000,    loss=0.904673,    train_acc=0.640466,    eval_loss=0.982887,    eval_acc=0.582636
Test accuray: 0.57238
epoch:   6,    lr=0.000978,    loss=0.889310,    train_acc=0.646794,    eval_loss=0.904333,    eval_acc=0.623492
epoch:   7,    lr=0.000956,    loss=0.874368,    train_acc=0.653000,    eval_loss=0.917226,    eval_acc=0.616212
epoch:   8,    lr=0.000933,    loss=0.860490,    train_acc=0.658562,    eval_loss=0.890120,    eval_acc=0.616321
epoch:   9,    lr=0.000911,    loss=0.846037,    train_acc=

10it [00:00, 97.14it/s]

Test accuray: 0.61024


4041it [00:26, 150.54it/s]
449it [00:02, 152.42it/s]


Start training
epoch:   1,    lr=0.000200,    loss=1.134331,    train_acc=0.504795,    eval_loss=1.063269,    eval_acc=0.543951
epoch:   2,    lr=0.000400,    loss=0.996155,    train_acc=0.591232,    eval_loss=1.031240,    eval_acc=0.554123
epoch:   3,    lr=0.000600,    loss=0.946596,    train_acc=0.617700,    eval_loss=1.007301,    eval_acc=0.597329
epoch:   4,    lr=0.000800,    loss=0.926464,    train_acc=0.628752,    eval_loss=1.048615,    eval_acc=0.584536
epoch:   5,    lr=0.001000,    loss=0.909606,    train_acc=0.635380,    eval_loss=1.017928,    eval_acc=0.591266
Test accuray: 0.5657
epoch:   6,    lr=0.000978,    loss=0.895379,    train_acc=0.642174,    eval_loss=1.007189,    eval_acc=0.592859
epoch:   7,    lr=0.000956,    loss=0.882240,    train_acc=0.648259,    eval_loss=0.974077,    eval_acc=0.604572
epoch:   8,    lr=0.000933,    loss=0.867170,    train_acc=0.651986,    eval_loss=1.011275,    eval_acc=0.587413
epoch:   9,    lr=0.000911,    loss=0.852197,    train_acc=0

13it [00:00, 121.48it/s]

Test accuray: 0.58797


4041it [00:26, 152.89it/s]
449it [00:03, 133.03it/s]


Start training
epoch:   1,    lr=0.000200,    loss=1.122892,    train_acc=0.503592,    eval_loss=1.054889,    eval_acc=0.545941
epoch:   2,    lr=0.000400,    loss=1.000687,    train_acc=0.591584,    eval_loss=1.028591,    eval_acc=0.559883
epoch:   3,    lr=0.000600,    loss=0.967778,    train_acc=0.610264,    eval_loss=1.025362,    eval_acc=0.568731
epoch:   4,    lr=0.000800,    loss=0.942446,    train_acc=0.622525,    eval_loss=1.068094,    eval_acc=0.563124
epoch:   5,    lr=0.001000,    loss=0.919155,    train_acc=0.634121,    eval_loss=1.103954,    eval_acc=0.570789
Test accuray: 0.52784
epoch:   6,    lr=0.000978,    loss=0.897082,    train_acc=0.643957,    eval_loss=1.049303,    eval_acc=0.559317
epoch:   7,    lr=0.000956,    loss=0.878778,    train_acc=0.651159,    eval_loss=1.242520,    eval_acc=0.538070
epoch:   8,    lr=0.000933,    loss=0.863506,    train_acc=0.656324,    eval_loss=1.059775,    eval_acc=0.585040
epoch:   9,    lr=0.000911,    loss=0.848030,    train_acc=

18it [00:00, 175.54it/s]

Test accuray: 0.59688


4041it [00:26, 150.68it/s]
449it [00:03, 139.44it/s]


Start training
epoch:   1,    lr=0.000200,    loss=1.144920,    train_acc=0.491032,    eval_loss=1.038871,    eval_acc=0.594206
epoch:   2,    lr=0.000400,    loss=1.014668,    train_acc=0.578412,    eval_loss=0.944145,    eval_acc=0.663498
epoch:   3,    lr=0.000600,    loss=0.959869,    train_acc=0.608073,    eval_loss=0.888004,    eval_acc=0.688440
epoch:   4,    lr=0.000800,    loss=0.932449,    train_acc=0.621681,    eval_loss=0.846709,    eval_acc=0.702869
epoch:   5,    lr=0.001000,    loss=0.912382,    train_acc=0.631254,    eval_loss=0.886670,    eval_acc=0.673676
Test accuray: 0.6882
epoch:   6,    lr=0.000978,    loss=0.892746,    train_acc=0.639714,    eval_loss=0.871824,    eval_acc=0.682121
epoch:   7,    lr=0.000956,    loss=0.875067,    train_acc=0.647741,    eval_loss=0.919950,    eval_acc=0.651753
epoch:   8,    lr=0.000933,    loss=0.862153,    train_acc=0.651842,    eval_loss=0.932563,    eval_acc=0.636821
epoch:   9,    lr=0.000911,    loss=0.849390,    train_acc=0

17it [00:00, 167.14it/s]

Test accuray: 0.68374


4041it [00:27, 146.48it/s]
449it [00:02, 150.06it/s]


Start training
epoch:   1,    lr=0.000200,    loss=1.137610,    train_acc=0.497965,    eval_loss=1.108246,    eval_acc=0.529388
epoch:   2,    lr=0.000400,    loss=0.998736,    train_acc=0.587153,    eval_loss=1.098290,    eval_acc=0.537263
epoch:   3,    lr=0.000600,    loss=0.956651,    train_acc=0.611471,    eval_loss=1.045356,    eval_acc=0.564653
epoch:   4,    lr=0.000800,    loss=0.931836,    train_acc=0.623530,    eval_loss=1.067405,    eval_acc=0.563947
epoch:   5,    lr=0.001000,    loss=0.914995,    train_acc=0.632011,    eval_loss=1.103991,    eval_acc=0.551252
Test accuray: 0.61693
epoch:   6,    lr=0.000978,    loss=0.895906,    train_acc=0.641348,    eval_loss=1.090756,    eval_acc=0.569296
epoch:   7,    lr=0.000956,    loss=0.874147,    train_acc=0.652070,    eval_loss=1.072761,    eval_acc=0.562537
epoch:   8,    lr=0.000933,    loss=0.860043,    train_acc=0.657908,    eval_loss=1.059622,    eval_acc=0.582344
epoch:   9,    lr=0.000911,    loss=0.846032,    train_acc=

18it [00:00, 172.26it/s]

Test accuray: 0.60802


4041it [00:26, 154.00it/s]
449it [00:02, 171.00it/s]


Start training
epoch:   1,    lr=0.000200,    loss=1.128229,    train_acc=0.502992,    eval_loss=1.040221,    eval_acc=0.560176
epoch:   2,    lr=0.000400,    loss=0.980787,    train_acc=0.599369,    eval_loss=1.002814,    eval_acc=0.590365
epoch:   3,    lr=0.000600,    loss=0.942731,    train_acc=0.619205,    eval_loss=1.111848,    eval_acc=0.541674
epoch:   4,    lr=0.000800,    loss=0.921155,    train_acc=0.628352,    eval_loss=1.105911,    eval_acc=0.553016
epoch:   5,    lr=0.001000,    loss=0.905803,    train_acc=0.636810,    eval_loss=1.117677,    eval_acc=0.558744
Test accuray: 0.53229
epoch:   6,    lr=0.000978,    loss=0.892758,    train_acc=0.643501,    eval_loss=1.007829,    eval_acc=0.575242
epoch:   7,    lr=0.000956,    loss=0.879278,    train_acc=0.648986,    eval_loss=1.007086,    eval_acc=0.590422
epoch:   8,    lr=0.000933,    loss=0.868656,    train_acc=0.653837,    eval_loss=1.037538,    eval_acc=0.582975
epoch:   9,    lr=0.000911,    loss=0.857542,    train_acc=

17it [00:00, 161.29it/s]

Test accuray: 0.59911


4041it [00:25, 156.22it/s]
449it [00:02, 159.15it/s]


Start training
epoch:   1,    lr=0.000200,    loss=1.162923,    train_acc=0.484762,    eval_loss=1.146793,    eval_acc=0.521399
epoch:   2,    lr=0.000400,    loss=1.002536,    train_acc=0.583683,    eval_loss=1.127047,    eval_acc=0.558190
epoch:   3,    lr=0.000600,    loss=0.925923,    train_acc=0.628611,    eval_loss=1.158791,    eval_acc=0.509118
epoch:   4,    lr=0.000800,    loss=0.896138,    train_acc=0.641012,    eval_loss=1.096435,    eval_acc=0.559466
epoch:   5,    lr=0.001000,    loss=0.879179,    train_acc=0.648548,    eval_loss=1.071605,    eval_acc=0.565368
Test accuray: 0.59465
epoch:   6,    lr=0.000978,    loss=0.862817,    train_acc=0.653314,    eval_loss=1.294675,    eval_acc=0.465788
epoch:   7,    lr=0.000956,    loss=0.842159,    train_acc=0.662699,    eval_loss=1.231891,    eval_acc=0.541709
epoch:   8,    lr=0.000933,    loss=0.828345,    train_acc=0.667662,    eval_loss=1.169254,    eval_acc=0.543729
epoch:   9,    lr=0.000911,    loss=0.812619,    train_acc=

18it [00:00, 175.17it/s]

Test accuray: 0.57906


4041it [00:25, 159.59it/s]
449it [00:02, 164.22it/s]


Start training
epoch:   1,    lr=0.000200,    loss=1.139794,    train_acc=0.492432,    eval_loss=1.157606,    eval_acc=0.474192
epoch:   2,    lr=0.000400,    loss=0.997298,    train_acc=0.588071,    eval_loss=1.117960,    eval_acc=0.526088
epoch:   3,    lr=0.000600,    loss=0.946873,    train_acc=0.614082,    eval_loss=1.132647,    eval_acc=0.518627
epoch:   4,    lr=0.000800,    loss=0.919900,    train_acc=0.626565,    eval_loss=1.148605,    eval_acc=0.538095
epoch:   5,    lr=0.001000,    loss=0.902107,    train_acc=0.635209,    eval_loss=1.183619,    eval_acc=0.519917
Test accuray: 0.55679
epoch:   6,    lr=0.000978,    loss=0.885648,    train_acc=0.641817,    eval_loss=1.236310,    eval_acc=0.528333
epoch:   7,    lr=0.000956,    loss=0.870018,    train_acc=0.649098,    eval_loss=1.110236,    eval_acc=0.548193
epoch:   8,    lr=0.000933,    loss=0.855397,    train_acc=0.656427,    eval_loss=1.119264,    eval_acc=0.562724
epoch:   9,    lr=0.000911,    loss=0.841798,    train_acc=

17it [00:00, 166.92it/s]

Test accuray: 0.60802


4041it [00:26, 153.26it/s]
449it [00:02, 163.46it/s]


Start training
epoch:   1,    lr=0.000200,    loss=1.159949,    train_acc=0.483385,    eval_loss=1.060162,    eval_acc=0.543996
epoch:   2,    lr=0.000400,    loss=1.010360,    train_acc=0.583969,    eval_loss=1.011778,    eval_acc=0.582567
epoch:   3,    lr=0.000600,    loss=0.951393,    train_acc=0.617880,    eval_loss=0.995473,    eval_acc=0.598549
epoch:   4,    lr=0.000800,    loss=0.917101,    train_acc=0.633560,    eval_loss=0.977945,    eval_acc=0.603385
epoch:   5,    lr=0.001000,    loss=0.899108,    train_acc=0.640546,    eval_loss=0.957891,    eval_acc=0.615004
Test accuray: 0.6392
epoch:   6,    lr=0.000978,    loss=0.881572,    train_acc=0.646621,    eval_loss=1.005725,    eval_acc=0.597547
epoch:   7,    lr=0.000956,    loss=0.868383,    train_acc=0.653073,    eval_loss=1.017196,    eval_acc=0.578379
epoch:   8,    lr=0.000933,    loss=0.855703,    train_acc=0.656669,    eval_loss=1.028497,    eval_acc=0.577554
epoch:   9,    lr=0.000911,    loss=0.841422,    train_acc=0

18it [00:00, 174.52it/s]

Test accuray: 0.61915


4041it [00:27, 147.67it/s]
449it [00:02, 161.76it/s]


Start training
epoch:   1,    lr=0.000200,    loss=1.142504,    train_acc=0.496652,    eval_loss=1.110026,    eval_acc=0.549864
epoch:   2,    lr=0.000400,    loss=0.985986,    train_acc=0.598447,    eval_loss=1.099890,    eval_acc=0.564802
epoch:   3,    lr=0.000600,    loss=0.943775,    train_acc=0.618898,    eval_loss=1.116955,    eval_acc=0.542205
epoch:   4,    lr=0.000800,    loss=0.918358,    train_acc=0.630887,    eval_loss=1.076565,    eval_acc=0.574525
epoch:   5,    lr=0.001000,    loss=0.898630,    train_acc=0.640981,    eval_loss=1.078903,    eval_acc=0.561217
Test accuray: 0.62361
epoch:   6,    lr=0.000978,    loss=0.882110,    train_acc=0.647205,    eval_loss=1.077450,    eval_acc=0.564259
epoch:   7,    lr=0.000956,    loss=0.866749,    train_acc=0.654907,    eval_loss=1.149569,    eval_acc=0.551928
epoch:   8,    lr=0.000933,    loss=0.851252,    train_acc=0.662793,    eval_loss=1.166857,    eval_acc=0.553449
epoch:   9,    lr=0.000911,    loss=0.837563,    train_acc=

14it [00:00, 133.22it/s]

Test accuray: 0.61915
conv_dim: 1d 	with_focus_attn: True
Test accuray: [0.6146993318485523, 0.6035634743875279, 0.6169265033407573, 0.6948775055679287, 0.6169265033407573, 0.5991091314031181, 0.5946547884187082, 0.6080178173719376, 0.6391982182628062, 0.6592427616926503]



4041it [00:25, 157.29it/s]
449it [00:03, 139.11it/s]


Start training
epoch:   1,    lr=0.000200,    loss=1.168723,    train_acc=0.481156,    eval_loss=0.992027,    eval_acc=0.552211
epoch:   2,    lr=0.000400,    loss=1.038159,    train_acc=0.566903,    eval_loss=0.961758,    eval_acc=0.583397
epoch:   3,    lr=0.000600,    loss=0.980404,    train_acc=0.601992,    eval_loss=0.950368,    eval_acc=0.600402
epoch:   4,    lr=0.000800,    loss=0.945630,    train_acc=0.618800,    eval_loss=1.016733,    eval_acc=0.571390
epoch:   5,    lr=0.001000,    loss=0.924133,    train_acc=0.628600,    eval_loss=0.968829,    eval_acc=0.596436
Test accuray: 0.59465
epoch:   6,    lr=0.000978,    loss=0.905988,    train_acc=0.637829,    eval_loss=0.945625,    eval_acc=0.596762
epoch:   7,    lr=0.000956,    loss=0.888964,    train_acc=0.645568,    eval_loss=0.957549,    eval_acc=0.594317
epoch:   8,    lr=0.000933,    loss=0.871451,    train_acc=0.653319,    eval_loss=0.904963,    eval_acc=0.610127
epoch:   9,    lr=0.000911,    loss=0.853155,    train_acc=

17it [00:00, 164.16it/s]

Test accuray: 0.5902


4041it [00:26, 154.11it/s]
449it [00:03, 141.96it/s]


Start training
epoch:   1,    lr=0.000200,    loss=1.142324,    train_acc=0.494101,    eval_loss=1.089391,    eval_acc=0.554996
epoch:   2,    lr=0.000400,    loss=1.011195,    train_acc=0.583525,    eval_loss=1.074711,    eval_acc=0.556229
epoch:   3,    lr=0.000600,    loss=0.969473,    train_acc=0.608301,    eval_loss=1.026158,    eval_acc=0.579091
epoch:   4,    lr=0.000800,    loss=0.941673,    train_acc=0.620810,    eval_loss=1.119833,    eval_acc=0.544310
epoch:   5,    lr=0.001000,    loss=0.920474,    train_acc=0.630727,    eval_loss=1.039349,    eval_acc=0.550064
Test accuray: 0.58129
epoch:   6,    lr=0.000978,    loss=0.899064,    train_acc=0.639199,    eval_loss=1.206055,    eval_acc=0.506602
epoch:   7,    lr=0.000956,    loss=0.879692,    train_acc=0.648425,    eval_loss=1.015419,    eval_acc=0.597020
epoch:   8,    lr=0.000933,    loss=0.863778,    train_acc=0.655553,    eval_loss=1.037151,    eval_acc=0.589314
epoch:   9,    lr=0.000911,    loss=0.847167,    train_acc=

17it [00:00, 167.53it/s]

Test accuray: 0.62584


4041it [00:27, 147.17it/s]
449it [00:02, 149.83it/s]


Start training
epoch:   1,    lr=0.000200,    loss=1.149656,    train_acc=0.483653,    eval_loss=1.124083,    eval_acc=0.506688
epoch:   2,    lr=0.000400,    loss=1.011859,    train_acc=0.580032,    eval_loss=1.060868,    eval_acc=0.549491
epoch:   3,    lr=0.000600,    loss=0.951655,    train_acc=0.616811,    eval_loss=1.045034,    eval_acc=0.567188
epoch:   4,    lr=0.000800,    loss=0.919517,    train_acc=0.632899,    eval_loss=1.025058,    eval_acc=0.568731
epoch:   5,    lr=0.001000,    loss=0.901172,    train_acc=0.640057,    eval_loss=1.039472,    eval_acc=0.580358
Test accuray: 0.59688
epoch:   6,    lr=0.000978,    loss=0.884608,    train_acc=0.648610,    eval_loss=1.262112,    eval_acc=0.532154
epoch:   7,    lr=0.000956,    loss=0.868288,    train_acc=0.654553,    eval_loss=1.028189,    eval_acc=0.580873
epoch:   8,    lr=0.000933,    loss=0.853653,    train_acc=0.659891,    eval_loss=1.048880,    eval_acc=0.573516
epoch:   9,    lr=0.000911,    loss=0.839742,    train_acc=

16it [00:00, 158.60it/s]

Test accuray: 0.56793


4041it [00:27, 145.83it/s]
449it [00:02, 161.02it/s]


Start training
epoch:   1,    lr=0.000200,    loss=1.157935,    train_acc=0.480059,    eval_loss=1.076812,    eval_acc=0.567194
epoch:   2,    lr=0.000400,    loss=1.042795,    train_acc=0.558190,    eval_loss=0.978691,    eval_acc=0.626699
epoch:   3,    lr=0.000600,    loss=0.990339,    train_acc=0.588927,    eval_loss=0.957751,    eval_acc=0.635143
epoch:   4,    lr=0.000800,    loss=0.963022,    train_acc=0.603384,    eval_loss=0.951694,    eval_acc=0.642693
epoch:   5,    lr=0.001000,    loss=0.939227,    train_acc=0.614553,    eval_loss=0.948093,    eval_acc=0.640289
Test accuray: 0.66592
epoch:   6,    lr=0.000978,    loss=0.916642,    train_acc=0.627580,    eval_loss=1.073088,    eval_acc=0.578491
epoch:   7,    lr=0.000956,    loss=0.899697,    train_acc=0.635582,    eval_loss=0.921415,    eval_acc=0.659024
epoch:   8,    lr=0.000933,    loss=0.883640,    train_acc=0.642758,    eval_loss=0.917294,    eval_acc=0.657905
epoch:   9,    lr=0.000911,    loss=0.868350,    train_acc=

18it [00:00, 175.47it/s]

Test accuray: 0.65256


4041it [00:29, 138.04it/s]
449it [00:02, 173.66it/s]


Start training
epoch:   1,    lr=0.000200,    loss=1.137316,    train_acc=0.489472,    eval_loss=1.098030,    eval_acc=0.508581
epoch:   2,    lr=0.000400,    loss=1.004833,    train_acc=0.583599,    eval_loss=1.091668,    eval_acc=0.532503
epoch:   3,    lr=0.000600,    loss=0.951532,    train_acc=0.613263,    eval_loss=1.055630,    eval_acc=0.561302
epoch:   4,    lr=0.000800,    loss=0.915568,    train_acc=0.632053,    eval_loss=1.022553,    eval_acc=0.574880
epoch:   5,    lr=0.001000,    loss=0.897633,    train_acc=0.640947,    eval_loss=1.070955,    eval_acc=0.561302
Test accuray: 0.55234
epoch:   6,    lr=0.000978,    loss=0.881006,    train_acc=0.649549,    eval_loss=1.061693,    eval_acc=0.568473
epoch:   7,    lr=0.000956,    loss=0.866004,    train_acc=0.654737,    eval_loss=1.019504,    eval_acc=0.583813
epoch:   8,    lr=0.000933,    loss=0.854926,    train_acc=0.659409,    eval_loss=1.040939,    eval_acc=0.587105
epoch:   9,    lr=0.000911,    loss=0.838756,    train_acc=

14it [00:00, 139.23it/s]

Test accuray: 0.58352


4041it [00:26, 152.15it/s]
449it [00:03, 122.57it/s]


Start training
epoch:   1,    lr=0.000200,    loss=1.143331,    train_acc=0.488178,    eval_loss=1.091644,    eval_acc=0.535544
epoch:   2,    lr=0.000400,    loss=0.985010,    train_acc=0.601148,    eval_loss=1.073958,    eval_acc=0.549923
epoch:   3,    lr=0.000600,    loss=0.942775,    train_acc=0.622142,    eval_loss=1.041383,    eval_acc=0.576617
epoch:   4,    lr=0.000800,    loss=0.919375,    train_acc=0.633026,    eval_loss=1.011310,    eval_acc=0.595463
epoch:   5,    lr=0.001000,    loss=0.901536,    train_acc=0.641454,    eval_loss=1.053706,    eval_acc=0.565561
Test accuray: 0.55679
epoch:   6,    lr=0.000978,    loss=0.888453,    train_acc=0.646451,    eval_loss=1.040904,    eval_acc=0.571289
epoch:   7,    lr=0.000956,    loss=0.873965,    train_acc=0.652898,    eval_loss=1.144777,    eval_acc=0.545340
epoch:   8,    lr=0.000933,    loss=0.860449,    train_acc=0.659949,    eval_loss=1.080592,    eval_acc=0.553703
epoch:   9,    lr=0.000911,    loss=0.846901,    train_acc=

18it [00:00, 172.47it/s]

Test accuray: 0.55902


4041it [00:25, 157.93it/s]
449it [00:03, 148.78it/s]


Start training
epoch:   1,    lr=0.000200,    loss=1.156815,    train_acc=0.475052,    eval_loss=1.178234,    eval_acc=0.471211
epoch:   2,    lr=0.000400,    loss=1.012280,    train_acc=0.575693,    eval_loss=1.120200,    eval_acc=0.537774
epoch:   3,    lr=0.000600,    loss=0.960375,    train_acc=0.606107,    eval_loss=1.054844,    eval_acc=0.588335
epoch:   4,    lr=0.000800,    loss=0.928839,    train_acc=0.623280,    eval_loss=1.117287,    eval_acc=0.551385
epoch:   5,    lr=0.001000,    loss=0.908326,    train_acc=0.631682,    eval_loss=1.076550,    eval_acc=0.577755
Test accuray: 0.57238
epoch:   6,    lr=0.000978,    loss=0.889369,    train_acc=0.641835,    eval_loss=1.106993,    eval_acc=0.541018
epoch:   7,    lr=0.000956,    loss=0.869671,    train_acc=0.650876,    eval_loss=1.160951,    eval_acc=0.514275
epoch:   8,    lr=0.000933,    loss=0.857213,    train_acc=0.656311,    eval_loss=1.080570,    eval_acc=0.560902
epoch:   9,    lr=0.000911,    loss=0.842322,    train_acc=

16it [00:00, 156.15it/s]

Test accuray: 0.58129


4041it [00:26, 153.51it/s]
449it [00:02, 165.93it/s]


Start training
epoch:   1,    lr=0.000200,    loss=1.126528,    train_acc=0.502030,    eval_loss=1.159303,    eval_acc=0.508864
epoch:   2,    lr=0.000400,    loss=0.991743,    train_acc=0.591005,    eval_loss=1.124793,    eval_acc=0.529230
epoch:   3,    lr=0.000600,    loss=0.945133,    train_acc=0.618343,    eval_loss=1.087548,    eval_acc=0.550662
epoch:   4,    lr=0.000800,    loss=0.922064,    train_acc=0.630055,    eval_loss=1.167849,    eval_acc=0.526088
epoch:   5,    lr=0.001000,    loss=0.902826,    train_acc=0.638253,    eval_loss=1.166666,    eval_acc=0.526369
Test accuray: 0.55234
epoch:   6,    lr=0.000978,    loss=0.887495,    train_acc=0.644776,    eval_loss=1.108971,    eval_acc=0.543817
epoch:   7,    lr=0.000956,    loss=0.873057,    train_acc=0.650278,    eval_loss=1.078665,    eval_acc=0.544715
epoch:   8,    lr=0.000933,    loss=0.855330,    train_acc=0.657375,    eval_loss=1.098318,    eval_acc=0.554533
epoch:   9,    lr=0.000911,    loss=0.843446,    train_acc=

13it [00:00, 127.38it/s]

Test accuray: 0.57461


4041it [00:29, 137.44it/s]
449it [00:02, 160.76it/s]


Start training
epoch:   1,    lr=0.000200,    loss=1.129147,    train_acc=0.500917,    eval_loss=1.048761,    eval_acc=0.542522
epoch:   2,    lr=0.000400,    loss=0.990143,    train_acc=0.591836,    eval_loss=0.998643,    eval_acc=0.586577
epoch:   3,    lr=0.000600,    loss=0.948823,    train_acc=0.612856,    eval_loss=1.007501,    eval_acc=0.586046
epoch:   4,    lr=0.000800,    loss=0.919298,    train_acc=0.627770,    eval_loss=0.960853,    eval_acc=0.617363
epoch:   5,    lr=0.001000,    loss=0.899950,    train_acc=0.636294,    eval_loss=0.987752,    eval_acc=0.598726
Test accuray: 0.64588
epoch:   6,    lr=0.000978,    loss=0.883561,    train_acc=0.645655,    eval_loss=0.972180,    eval_acc=0.604270
epoch:   7,    lr=0.000956,    loss=0.869533,    train_acc=0.651250,    eval_loss=1.003222,    eval_acc=0.589290
epoch:   8,    lr=0.000933,    loss=0.854317,    train_acc=0.657398,    eval_loss=1.054475,    eval_acc=0.581623
epoch:   9,    lr=0.000911,    loss=0.842359,    train_acc=

18it [00:00, 175.68it/s]

Test accuray: 0.61024


4041it [00:26, 152.21it/s]
449it [00:02, 161.13it/s]


Start training
epoch:   1,    lr=0.000200,    loss=1.125520,    train_acc=0.500184,    eval_loss=1.126347,    eval_acc=0.528246
epoch:   2,    lr=0.000400,    loss=0.989149,    train_acc=0.594161,    eval_loss=1.145453,    eval_acc=0.553721
epoch:   3,    lr=0.000600,    loss=0.942636,    train_acc=0.618543,    eval_loss=1.076926,    eval_acc=0.560239
epoch:   4,    lr=0.000800,    loss=0.914368,    train_acc=0.632996,    eval_loss=1.045484,    eval_acc=0.565345
epoch:   5,    lr=0.001000,    loss=0.891412,    train_acc=0.645114,    eval_loss=1.101681,    eval_acc=0.555894
Test accuray: 0.63697
epoch:   6,    lr=0.000978,    loss=0.875303,    train_acc=0.652442,    eval_loss=1.284294,    eval_acc=0.504834
epoch:   7,    lr=0.000956,    loss=0.859439,    train_acc=0.658648,    eval_loss=1.131919,    eval_acc=0.558501
epoch:   8,    lr=0.000933,    loss=0.844609,    train_acc=0.665835,    eval_loss=1.104000,    eval_acc=0.566105
epoch:   9,    lr=0.000911,    loss=0.831736,    train_acc=

In [14]:
with_focus_attn = [0.6146993318485523, 0.6035634743875279, 0.6169265033407573, 
                   0.6948775055679287, 0.6169265033407573, 0.5991091314031181, 
                   0.5946547884187082, 0.6080178173719376, 0.6391982182628062, 0.6592427616926503]
without_focus_attn = [0.6080178173719376, 0.6347438752783965, 0.6191536748329621, 
                      0.6815144766146993, 0.6035634743875279, 0.5835189309576837, 
                      0.5902004454342984, 0.5746102449888641, 0.6458797327394209, 0.6503340757238307]

In [15]:
print('with_focus_attn:', sum(with_focus_attn) / 10)
print('without_focus_attn:', sum(without_focus_attn) / 10)

with_focus_attn: 0.6247216035634744
without_focus_attn: 0.6191536748329621


In [None]:
# 0.70824

In [10]:
with_focus_attn = [0.6414253897550112, 0.6280623608017817, 0.6169265033407573, 
                   0.6904231625835189, 0.6325167037861915, 0.6458797327394209, 
                   0.6035634743875279, 0.5991091314031181, 0.6614699331848553, 0.6503340757238307]

without_focus_attn = [0.6191536748329621, 0.6080178173719376, 0.6347438752783965, 
                      0.6859688195991092, 0.6302895322939867, 0.6057906458797327, 
                      0.6035634743875279, 0.6102449888641426, 0.6592427616926503, 0.6525612472160356]

In [11]:
print('with_focus_attn:', sum(with_focus_attn) / 10)
print('without_focus_attn:', sum(without_focus_attn) / 10)

with_focus_attn: 0.6369710467706013
without_focus_attn: 0.6309576837416481


In [9]:
for with_focus_attn in focus_attn_list:
    cv_eval = []
    #with_focus_attn = True if(with_focus_attn == 'true') else False
    for train_index, eval_index in skf.split(df['sample_name'], df['sample_label']):
        conv_dim = '1d'
        checkpoint = ''
        hidden_size = 128
        num_layers = 2
        bidirectional = 'true'

        batch_size = 256
        num_epochs = 50
        learning_rate = 0.001

        use_warmup = 'true'
        data_dir = './wav_data/pretrain/IEMOCAP_sub/'
        multi_task = 'false'
        augmentation = 'false'
        
        save_checkpoint_steps = 5
        output_dir = './model'
        
        bidirectional = True if(bidirectional == 'true') else False
        n_mfcc = 40 if(conv_dim == '1d') else 128

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        
        train_samples, eval_samples = df['sample_name'][train_index], df['sample_name'][eval_index]
        train_label, eval_label = df['sample_label'][train_index], df['sample_label'][eval_index]
        
        train_samples = [data_dir + train_sample + '.wav' for train_sample in train_samples]
        eval_samples = [data_dir + eval_sample + '.wav' for eval_sample in eval_samples]

        y_train = np.array(train_label)
        y_eval = np.array(eval_label)

        if(multi_task == 'true'):
            speaker = np.array(list(map(lambda x: int(x.split('/')[-1].split('-')[-1].split('.')[0]), sample_datas)))
            y_gender = np.array(list(map(lambda x: 1 if x % 2 ==0 else 0, speaker)))

            y_g_train = y_gender[train_idx]
            y_g_eval = y_gender[eval_idx]

        X_train, y_train = convert_spectrograms(train_samples, conv_dim=conv_dim, sr=16000, labels=y_train)
        X_eval, y_eval = convert_spectrograms(eval_samples, conv_dim=conv_dim, sr=16000, labels=y_eval)

        if(augmentation == 'true'):
            X_train_flip = X_train[:, :, :, ::-1]
            y_train_flip = y_train.copy()

            X_train = np.concatenate((X_train, X_train_flip), axis=0)
            y_train = np.concatenate((y_train, y_train_flip), axis=0)

        X_train, y_train = convert_tensor(X_train, y_train)
        X_eval, y_eval = convert_tensor(X_eval, y_eval)

        y_train = y_train.long()
        y_eval = y_eval.long()

        if(multi_task == 'true'):
            _, y_g_train = convert_spectrograms(train_samples, conv_dim=conv_dim, sr=16000, labels=y_g_train)
            _, y_g_eval = convert_spectrograms(eval_samples, conv_dim=conv_dim, sr=16000, labels=y_g_eval)

            if(augmentation == 'true'):
                y_g_train_flip = y_g_train.copy()
                y_g_train = np.concatenate((y_g_train, y_g_train_flip))

            y_g_train = torch.tensor(y_g_train).float()
            y_g_eval = torch.tensor(y_g_eval).float()

            y_g_train = y_g_train.unsqueeze(-1)
            y_g_eval = y_g_eval.unsqueeze(-1)

        if(multi_task == 'true'):
            train_ds = TensorDataset(X_train, y_train, y_g_train)
            eval_ds = TensorDataset(X_eval, y_eval, y_g_eval)
        else:
            train_ds = TensorDataset(X_train, y_train)
            eval_ds = TensorDataset(X_eval, y_eval)

        train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True)
        eval_dataloader = DataLoader(eval_ds, batch_size=batch_size, num_workers=0)

        model = CLDNN(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
                      num_layers=num_layers, bidirectional=bidirectional,
                      with_focus_attn=with_focus_attn).to(device)

        if(multi_task == 'true'):
            model_g = CLDNN_G(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
                                num_layers=num_layers, bidirectional=bidirectional,
                                with_focus_attn=with_focus_attn).to(device)

        if(multi_task == 'true'):
            loss_func = nn.CrossEntropyLoss()
            loss_func_g = nn.BCELoss()
            optimizer = optim.Adam(list(model.parameters()) + list(model_g.parameters()), lr=learning_rate)
        else:
            loss_func = nn.CrossEntropyLoss()
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        if(use_warmup == 'true'):
            t_total = len(train_dataloader) // 1 * num_epochs
            opt_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=t_total * 0.1, t_total=t_total)

        eval_acc = train(train_dataloader, eval_dataloader, num_epochs)
        cv_eval.append(eval_acc)

    print('conv_dim:', conv_dim, '\twith_focus_attn:', with_focus_attn)
    print('Test accuray:', cv_eval)
    print()

4041it [00:25, 158.41it/s]
449it [00:03, 146.35it/s]


Start training
glo torch.Size([256, 1, 8])
c_1 torch.Size([256, 50, 8])
c_2 torch.Size([256, 4, 50, 2])
up torch.Size([4, 1, 2])
p torch.Size([256, 4, 50, 2])
uz torch.Size([4, 1, 2])
z torch.Size([256, 4, 50, 2])
P torch.Size([256, 4, 50])
Z torch.Size([256, 4, 50])
j torch.Size([1, 1, 1, 50])
G torch.Size([256, 4, 50, 50])
glo torch.Size([256, 1, 8])
c_1 torch.Size([256, 50, 8])
c_2 torch.Size([256, 4, 50, 2])
up torch.Size([4, 1, 2])
p torch.Size([256, 4, 50, 2])
uz torch.Size([4, 1, 2])
z torch.Size([256, 4, 50, 2])
P torch.Size([256, 4, 50])
Z torch.Size([256, 4, 50])
j torch.Size([1, 1, 1, 50])
G torch.Size([256, 4, 50, 50])
glo torch.Size([256, 1, 8])
c_1 torch.Size([256, 50, 8])
c_2 torch.Size([256, 4, 50, 2])
up torch.Size([4, 1, 2])
p torch.Size([256, 4, 50, 2])
uz torch.Size([4, 1, 2])
z torch.Size([256, 4, 50, 2])
P torch.Size([256, 4, 50])
Z torch.Size([256, 4, 50])
j torch.Size([1, 1, 1, 50])
G torch.Size([256, 4, 50, 50])
glo torch.Size([256, 1, 8])
c_1 torch.Size([256, 

c_2 torch.Size([256, 4, 50, 2])
up torch.Size([4, 1, 2])
p torch.Size([256, 4, 50, 2])
uz torch.Size([4, 1, 2])
z torch.Size([256, 4, 50, 2])
P torch.Size([256, 4, 50])
Z torch.Size([256, 4, 50])
j torch.Size([1, 1, 1, 50])
G torch.Size([256, 4, 50, 50])
glo torch.Size([256, 1, 8])
c_1 torch.Size([256, 50, 8])
c_2 torch.Size([256, 4, 50, 2])
up torch.Size([4, 1, 2])
p torch.Size([256, 4, 50, 2])
uz torch.Size([4, 1, 2])
z torch.Size([256, 4, 50, 2])
P torch.Size([256, 4, 50])
Z torch.Size([256, 4, 50])
j torch.Size([1, 1, 1, 50])
G torch.Size([256, 4, 50, 50])
glo torch.Size([256, 1, 8])
c_1 torch.Size([256, 50, 8])
c_2 torch.Size([256, 4, 50, 2])
up torch.Size([4, 1, 2])
p torch.Size([256, 4, 50, 2])
uz torch.Size([4, 1, 2])
z torch.Size([256, 4, 50, 2])
P torch.Size([256, 4, 50])
Z torch.Size([256, 4, 50])
j torch.Size([1, 1, 1, 50])
G torch.Size([256, 4, 50, 50])
glo torch.Size([256, 1, 8])
c_1 torch.Size([256, 50, 8])
c_2 torch.Size([256, 4, 50, 2])
up torch.Size([4, 1, 2])
p torch

glo torch.Size([256, 1, 8])
c_1 torch.Size([256, 50, 8])
c_2 torch.Size([256, 4, 50, 2])
up torch.Size([4, 1, 2])
p torch.Size([256, 4, 50, 2])
uz torch.Size([4, 1, 2])
z torch.Size([256, 4, 50, 2])
P torch.Size([256, 4, 50])
Z torch.Size([256, 4, 50])
j torch.Size([1, 1, 1, 50])
G torch.Size([256, 4, 50, 50])
glo torch.Size([256, 1, 8])
c_1 torch.Size([256, 50, 8])
c_2 torch.Size([256, 4, 50, 2])
up torch.Size([4, 1, 2])
p torch.Size([256, 4, 50, 2])
uz torch.Size([4, 1, 2])
z torch.Size([256, 4, 50, 2])
P torch.Size([256, 4, 50])
Z torch.Size([256, 4, 50])
j torch.Size([1, 1, 1, 50])
G torch.Size([256, 4, 50, 50])
glo torch.Size([256, 1, 8])
c_1 torch.Size([256, 50, 8])
c_2 torch.Size([256, 4, 50, 2])
up torch.Size([4, 1, 2])
p torch.Size([256, 4, 50, 2])
uz torch.Size([4, 1, 2])
z torch.Size([256, 4, 50, 2])
P torch.Size([256, 4, 50])
Z torch.Size([256, 4, 50])
j torch.Size([1, 1, 1, 50])
G torch.Size([256, 4, 50, 50])
glo torch.Size([256, 1, 8])
c_1 torch.Size([256, 50, 8])
c_2 tor

glo torch.Size([256, 1, 8])
c_1 torch.Size([256, 50, 8])
c_2 torch.Size([256, 4, 50, 2])
up torch.Size([4, 1, 2])
p torch.Size([256, 4, 50, 2])
uz torch.Size([4, 1, 2])
z torch.Size([256, 4, 50, 2])
P torch.Size([256, 4, 50])
Z torch.Size([256, 4, 50])
j torch.Size([1, 1, 1, 50])
G torch.Size([256, 4, 50, 50])
glo torch.Size([256, 1, 8])
c_1 torch.Size([256, 50, 8])
c_2 torch.Size([256, 4, 50, 2])
up torch.Size([4, 1, 2])
p torch.Size([256, 4, 50, 2])
uz torch.Size([4, 1, 2])
z torch.Size([256, 4, 50, 2])
P torch.Size([256, 4, 50])
Z torch.Size([256, 4, 50])
j torch.Size([1, 1, 1, 50])
G torch.Size([256, 4, 50, 50])
glo torch.Size([256, 1, 8])
c_1 torch.Size([256, 50, 8])
c_2 torch.Size([256, 4, 50, 2])
up torch.Size([4, 1, 2])
p torch.Size([256, 4, 50, 2])
uz torch.Size([4, 1, 2])
z torch.Size([256, 4, 50, 2])
P torch.Size([256, 4, 50])
Z torch.Size([256, 4, 50])
j torch.Size([1, 1, 1, 50])
G torch.Size([256, 4, 50, 50])
glo torch.Size([256, 1, 8])
c_1 torch.Size([256, 50, 8])
c_2 tor

c_2 torch.Size([256, 4, 50, 2])
up torch.Size([4, 1, 2])
p torch.Size([256, 4, 50, 2])
uz torch.Size([4, 1, 2])
z torch.Size([256, 4, 50, 2])
P torch.Size([256, 4, 50])
Z torch.Size([256, 4, 50])
j torch.Size([1, 1, 1, 50])
G torch.Size([256, 4, 50, 50])
glo torch.Size([256, 1, 8])
c_1 torch.Size([256, 50, 8])
c_2 torch.Size([256, 4, 50, 2])
up torch.Size([4, 1, 2])
p torch.Size([256, 4, 50, 2])
uz torch.Size([4, 1, 2])
z torch.Size([256, 4, 50, 2])
P torch.Size([256, 4, 50])
Z torch.Size([256, 4, 50])
j torch.Size([1, 1, 1, 50])
G torch.Size([256, 4, 50, 50])
glo torch.Size([256, 1, 8])
c_1 torch.Size([256, 50, 8])
c_2 torch.Size([256, 4, 50, 2])
up torch.Size([4, 1, 2])
p torch.Size([256, 4, 50, 2])
uz torch.Size([4, 1, 2])
z torch.Size([256, 4, 50, 2])
P torch.Size([256, 4, 50])
Z torch.Size([256, 4, 50])
j torch.Size([1, 1, 1, 50])
G torch.Size([256, 4, 50, 50])
glo torch.Size([256, 1, 8])
c_1 torch.Size([256, 50, 8])
c_2 torch.Size([256, 4, 50, 2])
up torch.Size([4, 1, 2])
p torch

P torch.Size([256, 4, 50])
Z torch.Size([256, 4, 50])
j torch.Size([1, 1, 1, 50])
G torch.Size([256, 4, 50, 50])
glo torch.Size([256, 1, 8])
c_1 torch.Size([256, 50, 8])
c_2 torch.Size([256, 4, 50, 2])
up torch.Size([4, 1, 2])
p torch.Size([256, 4, 50, 2])
uz torch.Size([4, 1, 2])
z torch.Size([256, 4, 50, 2])
P torch.Size([256, 4, 50])
Z torch.Size([256, 4, 50])
j torch.Size([1, 1, 1, 50])
G torch.Size([256, 4, 50, 50])
glo torch.Size([256, 1, 8])
c_1 torch.Size([256, 50, 8])
c_2 torch.Size([256, 4, 50, 2])
up torch.Size([4, 1, 2])
p torch.Size([256, 4, 50, 2])
uz torch.Size([4, 1, 2])
z torch.Size([256, 4, 50, 2])
P torch.Size([256, 4, 50])
Z torch.Size([256, 4, 50])
j torch.Size([1, 1, 1, 50])
G torch.Size([256, 4, 50, 50])
glo torch.Size([256, 1, 8])
c_1 torch.Size([256, 50, 8])
c_2 torch.Size([256, 4, 50, 2])
up torch.Size([4, 1, 2])
p torch.Size([256, 4, 50, 2])
uz torch.Size([4, 1, 2])
z torch.Size([256, 4, 50, 2])
P torch.Size([256, 4, 50])
Z torch.Size([256, 4, 50])
j torch.Si

KeyboardInterrupt: 

In [10]:
with_focus_attn

False

In [11]:
for with_focus_attn in focus_attn_list:
    print(with_focus_attn)

True
False


In [None]:
# window_size 25ms, hop_size 10ms
for conv_dim in conv_dim_list:
    for augmentation in augmentation_list:
        for with_focus_attn in focus_attn_list:
            for multi_task in multi_task_list:
                #conv_dim = '1d'
                checkpoint = './output/aae_' + conv_dim + '_step_500.pt'
                hidden_size = 128
                num_layers = 2
                bidirectional = 'true'
                #with_focus_attn = 'false'

                batch_size = 128
                num_epochs = 300
                learning_rate = 0.0001

                use_warmup = 'true'
                data_dir = './wav_data/pretrain/RAVDESS_resample/'
                #multi_task = 'false'
                #augmentation = 'true'

                bidirectional = True if(bidirectional == 'true') else False
                #with_focus_attn = True if(with_focus_attn == 'true') else False
                n_mfcc = 40 if(conv_dim == '1d') else 128

                device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

                sample_datas = glob.glob(os.path.join(data_dir, '**', '*wav'), recursive=True)
                sample_datas = sorted(sample_datas)

                acc_list = []
                for i in range(5):
                    np.random.seed(10 * i + 3)
                    idx = np.random.permutation(len(sample_datas))
                    train_idx = idx[:int(len(sample_datas)*0.75)]
                    eval_idx = idx[int(len(sample_datas)*0.75):]

                    train_samples = list(np.array(sample_datas)[train_idx])
                    eval_samples = list(np.array(sample_datas)[eval_idx])

                    y = np.array(list(map(lambda x: int(x.split('/')[-1].split('-')[2]) - 1, sample_datas)))
                    y_train = y[train_idx]
                    y_eval = y[eval_idx]

                    if(multi_task == 'true'):
                        speaker = np.array(list(map(lambda x: int(x.split('/')[-1].split('-')[-1].split('.')[0]), sample_datas)))
                        y_gender = np.array(list(map(lambda x: 1 if x % 2 ==0 else 0, speaker)))

                        y_g_train = y_gender[train_idx]
                        y_g_eval = y_gender[eval_idx]

                    X_train, y_train = convert_spectrograms(train_samples, conv_dim=conv_dim, sr=16000, labels=y_train)
                    X_eval, y_eval = convert_spectrograms(eval_samples, conv_dim=conv_dim, sr=16000, labels=y_eval)

                    if(augmentation == 'true'):
                        X_train_flip = X_train[:, :, :, ::-1]
                        y_train_flip = y_train.copy()

                        X_train = np.concatenate((X_train, X_train_flip), axis=0)
                        y_train = np.concatenate((y_train, y_train_flip), axis=0)
                    
                    X_train, y_train = convert_tensor(X_train, y_train)
                    X_eval, y_eval = convert_tensor(X_eval, y_eval)

                    y_train = y_train.long()
                    y_eval = y_eval.long()

                    if(multi_task == 'true'):
                        _, y_g_train = convert_spectrograms(train_samples, conv_dim=conv_dim, sr=16000, labels=y_g_train)
                        _, y_g_eval = convert_spectrograms(eval_samples, conv_dim=conv_dim, sr=16000, labels=y_g_eval)

                        if(augmentation == 'true'):
                            y_g_train_flip = y_g_train.copy()
                            y_g_train = np.concatenate((y_g_train, y_g_train_flip))
                        
                        y_g_train = torch.tensor(y_g_train).float()
                        y_g_eval = torch.tensor(y_g_eval).float()

                        y_g_train = y_g_train.unsqueeze(-1)
                        y_g_eval = y_g_eval.unsqueeze(-1)

                    if(multi_task == 'true'):
                        train_ds = TensorDataset(X_train, y_train, y_g_train)
                        eval_ds = TensorDataset(X_eval, y_eval, y_g_eval)
                    else:
                        train_ds = TensorDataset(X_train, y_train)
                        eval_ds = TensorDataset(X_eval, y_eval)

                    train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True)
                    eval_dataloader = DataLoader(eval_ds, batch_size=batch_size, num_workers=0, drop_last=True)

                    model = CLDNN(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
                                  num_layers=num_layers, bidirectional=bidirectional,
                                  with_focus_attn=with_focus_attn).to(device)

                    if(multi_task == 'true'):
                        model_g = CLDNN_G(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
                                            num_layers=num_layers, bidirectional=bidirectional,
                                            with_focus_attn=with_focus_attn).to(device)

                    if(multi_task == 'true'):
                        loss_func = nn.CrossEntropyLoss()
                        loss_func_g = nn.BCELoss()
                        optimizer = optim.Adam(list(model.parameters()) + list(model_g.parameters()), lr=learning_rate)
                    else:
                        loss_func = nn.CrossEntropyLoss()
                        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

                    if(use_warmup == 'true'):
                        t_total = len(train_dataloader) // 1 * num_epochs
                        opt_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=t_total * 0.1, t_total=t_total)

                    train(train_dataloader, eval_dataloader, num_epochs)

                    model.eval()
                    if(multi_task == 'true'):
                        model_g.eval()

                    correct = 0
                    n = 0
                    for i in range(len(eval_samples)):
                        try:
                            X_new = preprocessing(eval_samples[i], method='mfcc', sr=16000, n_mfcc=n_mfcc)
                            X_new = convert_tensor(X_new).to(device)
                            y_new = model(X_new)
                            y_new = torch.argmax(nn.Softmax(dim=-1)(torch.mean(y_new, dim=0)))
                            #y_new = sorted(dict(collections.Counter(torch.argmax(nn.Softmax(dim=-1)(y_new), dim=1).cpu().numpy()))
                            #               .items(), key=(lambda x: x[1]), reverse=True)[0][0]
                            y_new = 1 if (y_new.item() == y[eval_idx][i].item()) else 0
                            correct += y_new
                            n += 1
                        except:
                            pass

                    acc = correct / n
                    acc_list.append(acc)

                acc_mean = sum(acc_list) / 5
                print('conv_dim:', conv_dim, '\taugmentation', augmentation, 
                      '\twith_focus_attn:', with_focus_attn, '\tmulti_task:', multi_task)
                print('Test accuray:', round(acc_mean, 5))
                print()

In [None]:
for conv_dim in conv_dim_list:
    for augmentation in augmentation_list:
        for with_focus_attn in focus_attn_list:
            for multi_task in multi_task_list:
                #conv_dim = '1d'
                checkpoint = './output/aae_' + conv_dim + '_step_300.pt'
                hidden_size = 128
                num_layers = 2
                bidirectional = 'true'
                #with_focus_attn = 'false'

                batch_size = 128
                num_epochs = 300
                learning_rate = 0.0001

                use_warmup = 'true'
                data_dir = './wav_data/pretrain/RAVDESS_resample/'
                #multi_task = 'false'
                #augmentation = 'true'

                bidirectional = True if(bidirectional == 'true') else False
                #with_focus_attn = True if(with_focus_attn == 'true') else False
                n_mfcc = 40 if(conv_dim == '1d') else 128

                device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

                sample_datas = glob.glob(os.path.join(data_dir, '**', '*wav'), recursive=True)
                sample_datas = sorted(sample_datas)

                acc_list = []
                for i in range(5):
                    np.random.seed(10 * i + 3)
                    idx = np.random.permutation(len(sample_datas))
                    train_idx = idx[:int(len(sample_datas)*0.75)]
                    eval_idx = idx[int(len(sample_datas)*0.75):]

                    train_samples = list(np.array(sample_datas)[train_idx])
                    eval_samples = list(np.array(sample_datas)[eval_idx])

                    y = np.array(list(map(lambda x: int(x.split('/')[-1].split('-')[2]) - 1, sample_datas)))
                    y_train = y[train_idx]
                    y_eval = y[eval_idx]

                    if(multi_task == 'true'):
                        speaker = np.array(list(map(lambda x: int(x.split('/')[-1].split('-')[-1].split('.')[0]), sample_datas)))
                        y_gender = np.array(list(map(lambda x: 1 if x % 2 ==0 else 0, speaker)))

                        y_g_train = y_gender[train_idx]
                        y_g_eval = y_gender[eval_idx]

                    X_train, y_train = convert_spectrograms(train_samples, conv_dim=conv_dim, sr=16000, labels=y_train)
                    X_eval, y_eval = convert_spectrograms(eval_samples, conv_dim=conv_dim, sr=16000, labels=y_eval)

                    if(augmentation == 'true'):
                        X_train_flip = X_train[:, :, :, ::-1]
                        y_train_flip = y_train.copy()

                        X_train = np.concatenate((X_train, X_train_flip), axis=0)
                        y_train = np.concatenate((y_train, y_train_flip), axis=0)
                    
                    X_train, y_train = convert_tensor(X_train, y_train)
                    X_eval, y_eval = convert_tensor(X_eval, y_eval)

                    y_train = y_train.long()
                    y_eval = y_eval.long()

                    if(multi_task == 'true'):
                        _, y_g_train = convert_spectrograms(train_samples, conv_dim=conv_dim, sr=16000, labels=y_g_train)
                        _, y_g_eval = convert_spectrograms(eval_samples, conv_dim=conv_dim, sr=16000, labels=y_g_eval)

                        if(augmentation == 'true'):
                            y_g_train_flip = y_g_train.copy()
                            y_g_train = np.concatenate((y_g_train, y_g_train_flip))
                        
                        y_g_train = torch.tensor(y_g_train).float()
                        y_g_eval = torch.tensor(y_g_eval).float()

                        y_g_train = y_g_train.unsqueeze(-1)
                        y_g_eval = y_g_eval.unsqueeze(-1)

                    if(multi_task == 'true'):
                        train_ds = TensorDataset(X_train, y_train, y_g_train)
                        eval_ds = TensorDataset(X_eval, y_eval, y_g_eval)
                    else:
                        train_ds = TensorDataset(X_train, y_train)
                        eval_ds = TensorDataset(X_eval, y_eval)

                    train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True)
                    eval_dataloader = DataLoader(eval_ds, batch_size=batch_size, num_workers=0, drop_last=True)

                    model = CLDNN(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
                                  num_layers=num_layers, bidirectional=bidirectional,
                                  with_focus_attn=with_focus_attn).to(device)

                    if(multi_task == 'true'):
                        model_g = CLDNN_G(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
                                            num_layers=num_layers, bidirectional=bidirectional,
                                            with_focus_attn=with_focus_attn).to(device)

                    if(multi_task == 'true'):
                        loss_func = nn.CrossEntropyLoss()
                        loss_func_g = nn.BCELoss()
                        optimizer = optim.Adam(list(model.parameters()) + list(model_g.parameters()), lr=learning_rate)
                    else:
                        loss_func = nn.CrossEntropyLoss()
                        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

                    if(use_warmup == 'true'):
                        t_total = len(train_dataloader) // 1 * num_epochs
                        opt_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=t_total * 0.1, t_total=t_total)

                    train(train_dataloader, eval_dataloader, num_epochs)

                    model.eval()
                    if(multi_task == 'true'):
                        model_g.eval()

                    correct = 0
                    n = 0
                    for i in range(len(eval_samples)):
                        try:
                            X_new = preprocessing(eval_samples[i], method='mfcc', sr=16000, n_mfcc=n_mfcc)
                            X_new = convert_tensor(X_new).to(device)
                            y_new = model(X_new)
                            y_new = torch.argmax(nn.Softmax(dim=-1)(torch.mean(y_new, dim=0)))
                            #y_new = sorted(dict(collections.Counter(torch.argmax(nn.Softmax(dim=-1)(y_new), dim=1).cpu().numpy()))
                            #               .items(), key=(lambda x: x[1]), reverse=True)[0][0]
                            y_new = 1 if (y_new.item() == y[eval_idx][i].item()) else 0
                            correct += y_new
                            n += 1
                        except:
                            pass

                    acc = correct / n
                    acc_list.append(acc)

                acc_mean = sum(acc_list) / 5
                print('conv_dim:', conv_dim, '\taugmentation', augmentation, 
                      '\twith_focus_attn:', with_focus_attn, '\tmulti_task:', multi_task)
                print('Test accuray:', round(acc_mean, 5))
                print()