In [1]:
import os
import glob

import math
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader
from preprocessing import preprocessing, convert_spectrograms, convert_tensor
from model_ae import Encoder
from utils.optimization import WarmupLinearSchedule

In [2]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, num_attn_heads, attn_hidden_size, dropout_prob, with_focus_attn):
        super(MultiHeadedAttention, self).__init__()
        self.num_attn_heads = num_attn_heads
        self.hidden_size = attn_hidden_size
        self.dropout_prob = dropout_prob
        self.with_focus_attn = with_focus_attn
        
        self.attn_head_size = int(self.hidden_size / self.num_attn_heads)
        self.all_head_size = self.num_attn_heads * self.attn_head_size

        self.query = nn.Linear(self.hidden_size, self.all_head_size)
        self.key = nn.Linear(self.hidden_size, self.all_head_size)
        self.value = nn.Linear(self.hidden_size, self.all_head_size)

        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_prob)

        self.softmax = nn.Softmax(dim=-1)
        
        if(with_focus_attn == True):
            self.tanh = nn.Tanh()
            self.sigmoid = nn.Sigmoid()
            
            self.linear_focus_query = nn.Linear(num_attn_heads * self.attn_head_size, 
                                                num_attn_heads * self.attn_head_size)
            self.linear_focus_global = nn.Linear(num_attn_heads * self.attn_head_size, 
                                                 num_attn_heads * self.attn_head_size)
            
            up = torch.randn(num_attn_heads, 1, self.attn_head_size)
            self.up = Variable(up, requires_grad=True).cuda()
            torch.nn.init.xavier_uniform_(self.up)
            
            uz = torch.randn(num_attn_heads, 1, self.attn_head_size)
            self.uz = Variable(uz, requires_grad=True).cuda()
            torch.nn.init.xavier_uniform_(self.uz)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attn_heads, self.attn_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states):
        key_len = hidden_states.size(1)
        
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)
        
        if(self.with_focus_attn == True):
            glo = torch.mean(mixed_query_layer, dim=1, keepdim=True)
            
            c = self.tanh(self.linear_focus_query(mixed_query_layer) + self.linear_focus_global(glo))
            c = self.transpose_for_scores(c)
            
            p = c * self.up
            p = p.sum(3).squeeze()
            z = c * self.uz
            z = z.sum(3).squeeze()
            
            P = self.sigmoid(p) * key_len
            Z = self.sigmoid(z) * key_len
            
            j = torch.arange(start=0, end=key_len, dtype=P.dtype).unsqueeze(0).unsqueeze(0).unsqueeze(0).to('cuda')
            P = P.unsqueeze(-1)
            Z = Z.unsqueeze(-1)
            
            G = -(j - P)**2 * 2 / (Z**2)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attn_head_size)
        
        if(self.with_focus_attn == True):
            attention_scores = attention_scores + G
            
        attention_probs = self.softmax(attention_scores)
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)
        attention_output = self.o_proj(context_layer)

        return attention_output

In [3]:
class CLDNN(nn.Module):
    def __init__(self, conv_dim, checkpoint=None, hidden_size=64, num_layers=2,
                 bidirectional=True, with_focus_attn=False):
        super(CLDNN, self).__init__()
        self.conv_dim = conv_dim
        if(conv_dim == '1d'):
            self.encoder = Encoder(conv_dim)
            if checkpoint:
                self.encoder.load_state_dict(torch.load(checkpoint))
            self.attn = MultiHeadedAttention(num_attn_heads=4, attn_hidden_size=8, dropout_prob=0.1,
                                             with_focus_attn=with_focus_attn)
            self.lstm = nn.LSTM(8, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)
            self.fc = nn.Sequential(
                nn.Linear(hidden_size*2 if bidirectional else hidden_size, 8),
            )
        elif(conv_dim == '2d'):
            self.encoder = Encoder(conv_dim)
            if checkpoint:
                self.encoder.load_state_dict(torch.load(checkpoint))
            self.attn = MultiHeadedAttention(num_attn_heads=4, attn_hidden_size=176, dropout_prob=0.1, 
                                             with_focus_attn=with_focus_attn)
            self.gap = nn.AdaptiveAvgPool2d((1, 11))
            self.lstm = nn.LSTM(11, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)
            self.fc = nn.Sequential(
                nn.Linear(hidden_size*2 if bidirectional else hidden_size, 8),
            )
        else:
            raise ValueError("Convolution dimension not found: %s" % (conv_dim))
            
    def forward(self, x):
        if(self.conv_dim == '1d'):
            out = self.encoder(x)  # (batch, 1, 40, 100) -> (batch, 8, 1, 100)
            out = torch.squeeze(out, 2)  # (batch, 8, 1, 100) -> (batch, 8, 100)
            out = out.permute(0, 2, 1)  # (batch, 8, 100) -> (batch, 100, 8)
            h = out
            out = self.attn(out)  # (batch, 100, 8) -> (batch, 100, 8)
            out = h + out
            out = out.permute(1, 0, 2)  # (batch, 100, 8) -> (100, batch, 8)
            self.lstm.flatten_parameters()
            out, _ = self.lstm(out)  # (100, batch, 8) -> (100, batch, num_directions*hidden_size)
            out = out[-1]  # (100, batch, num_directions*hidden_size) -> (batch, num_directions*hidden_size)
            out = self.fc(out)  # (batch, num_directions*hidden_size) -> (batch, 1)
        elif(self.conv_dim == '2d'):
            out = self.encoder(x)  # (batch, 1, 128, 100) -> (batch, 16, 11, 8)
            out = out.permute(0, 3, 1, 2)  # (batch, 16, 11, 8) -> (batch, 8, 16, 11)
            h = out
            new_out_shape = out.size()[:2] + (out.size()[2] * out.size()[3],)
            out = out.view(*new_out_shape)  # (batch, 8, 16, 11) -> (batch, 8, 176)
            out = self.attn(out)  # (batch, 8, 176) -> (batch, 8, 176)
            out = out.view(h.size())  # (batch, 8, 176) -> (batch, 8, 16, 11)
            out = h + out
            out = self.gap(out)  # (batch, 8, 16, 11) -> (batch, 8, 1, 11)
            out = torch.squeeze(out, 2)  # (batch, 8, 1, 11) -> (batch, 8, 11)
            out = out.permute(1, 0, 2)  # (batch, 8, 11) -> (8, batch, 11)
            self.lstm.flatten_parameters()
            out, _ = self.lstm(out)  # (8, batch, 11) -> (8, batch, num_directions*hidden_size)
            out = out[-1]  # (8, batch, num_directions*hidden_size) -> (batch, num_directions*hidden_size)
            out = self.fc(out)  # (batch, num_directions*hidden_size) -> (batch, 1)
        return out

In [4]:
class CLDNN_G(nn.Module):
    def __init__(self, conv_dim, checkpoint=None, hidden_size=64, num_layers=2,
                 bidirectional=True, with_focus_attn=False):
        super(CLDNN_G, self).__init__()
        self.conv_dim = conv_dim
        if(conv_dim == '1d'):
            self.encoder = Encoder(conv_dim)
            if checkpoint:
                self.encoder.load_state_dict(torch.load(checkpoint))
            self.attn = MultiHeadedAttention(num_attn_heads=4, attn_hidden_size=8, dropout_prob=0.1,
                                             with_focus_attn=with_focus_attn)
            self.lstm = nn.LSTM(8, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)
            self.fc = nn.Sequential(
                nn.Linear(hidden_size*2 if bidirectional else hidden_size, 1),
                nn.Sigmoid()
            )
        elif(conv_dim == '2d'):
            self.encoder = Encoder(conv_dim)
            if checkpoint:
                self.encoder.load_state_dict(torch.load(checkpoint))
            self.attn = MultiHeadedAttention(num_attn_heads=4, attn_hidden_size=176, dropout_prob=0.1, 
                                             with_focus_attn=with_focus_attn)
            self.gap = nn.AdaptiveAvgPool2d((1, 11))
            self.lstm = nn.LSTM(11, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)
            self.fc = nn.Sequential(
                nn.Linear(hidden_size*2 if bidirectional else hidden_size, 1),
                nn.Sigmoid()
            )
        else:
            raise ValueError("Convolution dimension not found: %s" % (conv_dim))
            
    def forward(self, x):
        if(self.conv_dim == '1d'):
            out = self.encoder(x)  # (batch, 1, 40, 100) -> (batch, 8, 1, 100)
            out = torch.squeeze(out, 2)  # (batch, 8, 1, 100) -> (batch, 8, 100)
            out = out.permute(0, 2, 1)  # (batch, 8, 100) -> (batch, 100, 8)
            h = out
            out = self.attn(out)  # (batch, 100, 8) -> (batch, 100, 8)
            out = h + out
            out = out.permute(1, 0, 2)  # (batch, 100, 8) -> (100, batch, 8)
            self.lstm.flatten_parameters()
            out, _ = self.lstm(out)  # (100, batch, 8) -> (100, batch, num_directions*hidden_size)
            out = out[-1]  # (100, batch, num_directions*hidden_size) -> (batch, num_directions*hidden_size)
            out = self.fc(out)  # (batch, num_directions*hidden_size) -> (batch, 1)
        elif(self.conv_dim == '2d'):
            out = self.encoder(x)  # (batch, 1, 128, 100) -> (batch, 16, 11, 8)
            out = out.permute(0, 3, 1, 2)  # (batch, 16, 11, 8) -> (batch, 8, 16, 11)
            h = out
            new_out_shape = out.size()[:2] + (out.size()[2] * out.size()[3],)
            out = out.view(*new_out_shape)  # (batch, 8, 16, 11) -> (batch, 8, 176)
            out = self.attn(out)  # (batch, 8, 176) -> (batch, 8, 176)
            out = out.view(h.size())  # (batch, 8, 176) -> (batch, 8, 16, 11)
            out = h + out
            out = self.gap(out)  # (batch, 8, 16, 11) -> (batch, 8, 1, 11)
            out = torch.squeeze(out, 2)  # (batch, 8, 1, 11) -> (batch, 8, 11)
            out = out.permute(1, 0, 2)  # (batch, 8, 11) -> (8, batch, 11)
            self.lstm.flatten_parameters()
            out, _ = self.lstm(out)  # (8, batch, 11) -> (8, batch, num_directions*hidden_size)
            out = out[-1]  # (8, batch, num_directions*hidden_size) -> (batch, num_directions*hidden_size)
            out = self.fc(out)  # (batch, num_directions*hidden_size) -> (batch, 1)
        return out

In [5]:
def train(train_dataloader, eval_dataloader, epochs):
        #print('Start training')
        softmax = nn.Softmax(dim=1)
        for epoch in range(epochs):
            model.train()
            train_loss = 0
            nb_train_steps = 0
            correct = 0
            num_samples = 0
            
            if(multi_task == 'true'):
                for X_batch, y_batch, y_g_batch in train_dataloader:
                    X_batch = X_batch.to(device)
                    y_batch = y_batch.to(device)
                    y_g_batch = y_g_batch.to(device)

                    optimizer.zero_grad()

                    outputs = model(X_batch)
                    outputs_g = model_g(X_batch)

                    loss_1 = loss_func(outputs, y_batch)
                    loss_2 = loss_func_g(outputs_g, y_g_batch)
                    loss = loss_1 + 0.8 * loss_2
                    loss.backward(retain_graph=True)

                    optimizer.step()
                    opt_scheduler.step()

                    train_loss += loss.mean().item()
                    nb_train_steps += 1

                    outputs = softmax(outputs)
                    outputs = torch.argmax(outputs, dim=1)
                    correct += (outputs == y_batch).float().sum()
                    num_samples += len(X_batch)

                train_loss = train_loss / nb_train_steps
                train_accuracy = correct / num_samples

                model.eval()
                eval_loss = 0
                nb_eval_steps = 0
                correct = 0
                num_samples = 0

                for X_batch, y_batch, y_g_batch in eval_dataloader:
                    X_batch = X_batch.to(device)
                    y_batch = y_batch.to(device)
                    y_g_batch = y_g_batch.to(device)
                    with torch.no_grad():
                        outputs = model(X_batch)
                        outputs_g = model_g(X_batch)

                    tmp_eval_loss_1 = loss_func(outputs, y_batch)
                    tmp_eval_loss_2 = loss_func_g(outputs_g, y_g_batch)
                    tmp_eval_loss = tmp_eval_loss_1 + 0.8 * tmp_eval_loss_2
                    eval_loss += tmp_eval_loss.mean().item()
                    nb_eval_steps += 1

                    outputs = softmax(outputs)
                    outputs = torch.argmax(outputs, dim=1)
                    correct += (outputs == y_batch).float().sum()
                    num_samples += len(X_batch)

                eval_loss = eval_loss / nb_eval_steps
                eval_accuracy = correct / num_samples
            else:
                for X_batch, y_batch in train_dataloader:
                    X_batch = X_batch.to(device)
                    y_batch = y_batch.to(device)

                    optimizer.zero_grad()

                    outputs = model(X_batch)

                    loss = loss_func(outputs, y_batch)
                    loss.backward()

                    optimizer.step()
                    opt_scheduler.step()

                    train_loss += loss.mean().item()
                    nb_train_steps += 1

                    outputs = softmax(outputs)
                    outputs = torch.argmax(outputs, dim=1)
                    correct += (outputs == y_batch).float().sum()
                    num_samples += len(X_batch)

                train_loss = train_loss / nb_train_steps
                train_accuracy = correct / num_samples

                model.eval()
                eval_loss = 0
                nb_eval_steps = 0
                correct = 0
                num_samples = 0

                for X_batch, y_batch in eval_dataloader:
                    X_batch = X_batch.to(device)
                    y_batch = y_batch.to(device)
                    with torch.no_grad():
                        outputs = model(X_batch)

                    tmp_eval_loss = loss_func(outputs, y_batch)
                    eval_loss += tmp_eval_loss.mean().item()
                    nb_eval_steps += 1

                    outputs = softmax(outputs)
                    outputs = torch.argmax(outputs, dim=1)
                    correct += (outputs == y_batch).float().sum()
                    num_samples += len(X_batch)

                eval_loss = eval_loss / nb_eval_steps
                eval_accuracy = correct / num_samples
            '''
            for param_group in optimizer.param_groups:
                lr = param_group['lr']
            print('epoch: {:3d},    lr={:6f},    loss={:5f},    train_acc={:5f},    eval_loss={:5f},    eval_acc={:5f}'
                  .format(epoch+1, lr, train_loss, train_accuracy, eval_loss, eval_accuracy))

            
            if((epoch+1) % args.save_checkpoint_steps == 0):
                model_checkpoint = "%s_%s_step_%d.pt" % ('CLDNN', args.conv_dim, epoch+1)
                output_model_file = os.path.join(args.output_dir, model_checkpoint)
                if(args.multi_gpu == 'true'):
                    torch.save(model.module.state_dict(), output_model_file)
                else:
                    torch.save(model.state_dict(), output_model_file)
                print("Saving checkpoint %s" % output_model_file)
            '''

In [6]:
conv_dim_list = ['1d']
augmentation_list = ['true', 'false']
focus_attn_list = [True, False]
multi_task_list = ['true', 'false']

In [None]:
# AI Hub
for conv_dim in conv_dim_list:
    for augmentation in augmentation_list:
        for with_focus_attn in focus_attn_list:
            for multi_task in multi_task_list:
                #conv_dim = '1d'
                checkpoint = './output/aae_' + conv_dim + '_step_300.pt'
                hidden_size = 128
                num_layers = 2
                bidirectional = 'true'
                #with_focus_attn = 'false'

                batch_size = 128
                num_epochs = 300
                learning_rate = 0.0001

                use_warmup = 'true'
                data_dir = './wav_data/pretrain/RAVDESS_resample/'
                #multi_task = 'false'
                #augmentation = 'true'

                bidirectional = True if(bidirectional == 'true') else False
                #with_focus_attn = True if(with_focus_attn == 'true') else False
                n_mfcc = 40 if(conv_dim == '1d') else 128

                device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

                sample_datas = glob.glob(os.path.join(data_dir, '**', '*wav'), recursive=True)
                sample_datas = sorted(sample_datas)

                acc_list = []
                for i in range(5):
                    np.random.seed(10 * i + 3)
                    idx = np.random.permutation(len(sample_datas))
                    train_idx = idx[:int(len(sample_datas)*0.75)]
                    eval_idx = idx[int(len(sample_datas)*0.75):]

                    train_samples = list(np.array(sample_datas)[train_idx])
                    eval_samples = list(np.array(sample_datas)[eval_idx])

                    y = np.array(list(map(lambda x: int(x.split('/')[-1].split('-')[2]) - 1, sample_datas)))
                    y_train = y[train_idx]
                    y_eval = y[eval_idx]

                    if(multi_task == 'true'):
                        speaker = np.array(list(map(lambda x: int(x.split('/')[-1].split('-')[-1].split('.')[0]), sample_datas)))
                        y_gender = np.array(list(map(lambda x: 1 if x % 2 ==0 else 0, speaker)))

                        y_g_train = y_gender[train_idx]
                        y_g_eval = y_gender[eval_idx]

                    X_train, y_train = convert_spectrograms(train_samples, conv_dim=conv_dim, sr=16000, labels=y_train)
                    X_eval, y_eval = convert_spectrograms(eval_samples, conv_dim=conv_dim, sr=16000, labels=y_eval)

                    if(augmentation == 'true'):
                        X_train_flip = X_train[:, :, :, ::-1]
                        y_train_flip = y_train.copy()

                        X_train = np.concatenate((X_train, X_train_flip), axis=0)
                        y_train = np.concatenate((y_train, y_train_flip), axis=0)
                    
                    X_train, y_train = convert_tensor(X_train, y_train)
                    X_eval, y_eval = convert_tensor(X_eval, y_eval)

                    y_train = y_train.long()
                    y_eval = y_eval.long()

                    if(multi_task == 'true'):
                        _, y_g_train = convert_spectrograms(train_samples, conv_dim=conv_dim, sr=16000, labels=y_g_train)
                        _, y_g_eval = convert_spectrograms(eval_samples, conv_dim=conv_dim, sr=16000, labels=y_g_eval)

                        if(augmentation == 'true'):
                            y_g_train_flip = y_g_train.copy()
                            y_g_train = np.concatenate((y_g_train, y_g_train_flip))
                        
                        y_g_train = torch.tensor(y_g_train).float()
                        y_g_eval = torch.tensor(y_g_eval).float()

                        y_g_train = y_g_train.unsqueeze(-1)
                        y_g_eval = y_g_eval.unsqueeze(-1)

                    if(multi_task == 'true'):
                        train_ds = TensorDataset(X_train, y_train, y_g_train)
                        eval_ds = TensorDataset(X_eval, y_eval, y_g_eval)
                    else:
                        train_ds = TensorDataset(X_train, y_train)
                        eval_ds = TensorDataset(X_eval, y_eval)

                    train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True)
                    eval_dataloader = DataLoader(eval_ds, batch_size=batch_size, num_workers=0, drop_last=True)

                    model = CLDNN(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
                                  num_layers=num_layers, bidirectional=bidirectional,
                                  with_focus_attn=with_focus_attn).to(device)

                    if(multi_task == 'true'):
                        model_g = CLDNN_G(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
                                            num_layers=num_layers, bidirectional=bidirectional,
                                            with_focus_attn=with_focus_attn).to(device)

                    if(multi_task == 'true'):
                        loss_func = nn.CrossEntropyLoss()
                        loss_func_g = nn.BCELoss()
                        optimizer = optim.Adam(list(model.parameters()) + list(model_g.parameters()), lr=learning_rate)
                    else:
                        loss_func = nn.CrossEntropyLoss()
                        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

                    if(use_warmup == 'true'):
                        t_total = len(train_dataloader) // 1 * num_epochs
                        opt_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=t_total * 0.1, t_total=t_total)

                    train(train_dataloader, eval_dataloader, num_epochs)

                    model.eval()
                    if(multi_task == 'true'):
                        model_g.eval()

                    correct = 0
                    n = 0
                    for i in range(len(eval_samples)):
                        try:
                            X_new = preprocessing(eval_samples[i], method='mfcc', sr=16000, n_mfcc=n_mfcc)
                            X_new = convert_tensor(X_new).to(device)
                            y_new = model(X_new)
                            y_new = torch.argmax(nn.Softmax(dim=-1)(torch.mean(y_new, dim=0)))
                            #y_new = sorted(dict(collections.Counter(torch.argmax(nn.Softmax(dim=-1)(y_new), dim=1).cpu().numpy()))
                            #               .items(), key=(lambda x: x[1]), reverse=True)[0][0]
                            y_new = 1 if (y_new.item() == y[eval_idx][i].item()) else 0
                            correct += y_new
                            n += 1
                        except:
                            pass

                    acc = correct / n
                    acc_list.append(acc)

                acc_mean = sum(acc_list) / 5
                print('conv_dim:', conv_dim, '\taugmentation', augmentation, 
                      '\twith_focus_attn:', with_focus_attn, '\tmulti_task:', multi_task)
                print('Test accuray:', round(acc_mean, 5))
                print()

1080it [00:07, 152.71it/s]
360it [00:01, 195.04it/s]
1080it [00:06, 179.13it/s]
360it [00:01, 188.43it/s]
1080it [00:05, 192.41it/s]
360it [00:01, 199.83it/s]
1080it [00:05, 194.11it/s]
360it [00:01, 198.00it/s]
1080it [00:05, 188.87it/s]
360it [00:02, 134.73it/s]
1080it [00:06, 157.84it/s]
360it [00:01, 198.79it/s]
1080it [00:05, 195.13it/s]
360it [00:01, 194.85it/s]
1080it [00:07, 136.18it/s]
360it [00:02, 138.39it/s]
1080it [00:06, 174.14it/s]
360it [00:01, 198.55it/s]
1080it [00:05, 186.11it/s]
360it [00:01, 186.41it/s]
20it [00:00, 195.79it/s]

conv_dim: 1d 	augmentation true 	with_focus_attn: True 	multi_task: true
Test accuray: 0.66444



1080it [00:05, 184.54it/s]
360it [00:02, 138.25it/s]
1080it [00:06, 170.93it/s]
360it [00:01, 194.46it/s]
1080it [00:05, 184.99it/s]
360it [00:02, 140.23it/s]
1080it [00:05, 194.14it/s]
360it [00:01, 196.27it/s]
1080it [00:05, 195.47it/s]
360it [00:01, 199.36it/s]


In [7]:
# window_size 25ms, hop_size 10ms
for conv_dim in conv_dim_list:
    for augmentation in augmentation_list:
        for with_focus_attn in focus_attn_list:
            for multi_task in multi_task_list:
                #conv_dim = '1d'
                checkpoint = './output/aae_' + conv_dim + '_step_500.pt'
                hidden_size = 128
                num_layers = 2
                bidirectional = 'true'
                #with_focus_attn = 'false'

                batch_size = 128
                num_epochs = 300
                learning_rate = 0.0001

                use_warmup = 'true'
                data_dir = './wav_data/pretrain/RAVDESS_resample/'
                #multi_task = 'false'
                #augmentation = 'true'

                bidirectional = True if(bidirectional == 'true') else False
                #with_focus_attn = True if(with_focus_attn == 'true') else False
                n_mfcc = 40 if(conv_dim == '1d') else 128

                device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

                sample_datas = glob.glob(os.path.join(data_dir, '**', '*wav'), recursive=True)
                sample_datas = sorted(sample_datas)

                acc_list = []
                for i in range(5):
                    np.random.seed(10 * i + 3)
                    idx = np.random.permutation(len(sample_datas))
                    train_idx = idx[:int(len(sample_datas)*0.75)]
                    eval_idx = idx[int(len(sample_datas)*0.75):]

                    train_samples = list(np.array(sample_datas)[train_idx])
                    eval_samples = list(np.array(sample_datas)[eval_idx])

                    y = np.array(list(map(lambda x: int(x.split('/')[-1].split('-')[2]) - 1, sample_datas)))
                    y_train = y[train_idx]
                    y_eval = y[eval_idx]

                    if(multi_task == 'true'):
                        speaker = np.array(list(map(lambda x: int(x.split('/')[-1].split('-')[-1].split('.')[0]), sample_datas)))
                        y_gender = np.array(list(map(lambda x: 1 if x % 2 ==0 else 0, speaker)))

                        y_g_train = y_gender[train_idx]
                        y_g_eval = y_gender[eval_idx]

                    X_train, y_train = convert_spectrograms(train_samples, conv_dim=conv_dim, sr=16000, labels=y_train)
                    X_eval, y_eval = convert_spectrograms(eval_samples, conv_dim=conv_dim, sr=16000, labels=y_eval)

                    if(augmentation == 'true'):
                        X_train_flip = X_train[:, :, :, ::-1]
                        y_train_flip = y_train.copy()

                        X_train = np.concatenate((X_train, X_train_flip), axis=0)
                        y_train = np.concatenate((y_train, y_train_flip), axis=0)
                    
                    X_train, y_train = convert_tensor(X_train, y_train)
                    X_eval, y_eval = convert_tensor(X_eval, y_eval)

                    y_train = y_train.long()
                    y_eval = y_eval.long()

                    if(multi_task == 'true'):
                        _, y_g_train = convert_spectrograms(train_samples, conv_dim=conv_dim, sr=16000, labels=y_g_train)
                        _, y_g_eval = convert_spectrograms(eval_samples, conv_dim=conv_dim, sr=16000, labels=y_g_eval)

                        if(augmentation == 'true'):
                            y_g_train_flip = y_g_train.copy()
                            y_g_train = np.concatenate((y_g_train, y_g_train_flip))
                        
                        y_g_train = torch.tensor(y_g_train).float()
                        y_g_eval = torch.tensor(y_g_eval).float()

                        y_g_train = y_g_train.unsqueeze(-1)
                        y_g_eval = y_g_eval.unsqueeze(-1)

                    if(multi_task == 'true'):
                        train_ds = TensorDataset(X_train, y_train, y_g_train)
                        eval_ds = TensorDataset(X_eval, y_eval, y_g_eval)
                    else:
                        train_ds = TensorDataset(X_train, y_train)
                        eval_ds = TensorDataset(X_eval, y_eval)

                    train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True)
                    eval_dataloader = DataLoader(eval_ds, batch_size=batch_size, num_workers=0, drop_last=True)

                    model = CLDNN(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
                                  num_layers=num_layers, bidirectional=bidirectional,
                                  with_focus_attn=with_focus_attn).to(device)

                    if(multi_task == 'true'):
                        model_g = CLDNN_G(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
                                            num_layers=num_layers, bidirectional=bidirectional,
                                            with_focus_attn=with_focus_attn).to(device)

                    if(multi_task == 'true'):
                        loss_func = nn.CrossEntropyLoss()
                        loss_func_g = nn.BCELoss()
                        optimizer = optim.Adam(list(model.parameters()) + list(model_g.parameters()), lr=learning_rate)
                    else:
                        loss_func = nn.CrossEntropyLoss()
                        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

                    if(use_warmup == 'true'):
                        t_total = len(train_dataloader) // 1 * num_epochs
                        opt_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=t_total * 0.1, t_total=t_total)

                    train(train_dataloader, eval_dataloader, num_epochs)

                    model.eval()
                    if(multi_task == 'true'):
                        model_g.eval()

                    correct = 0
                    n = 0
                    for i in range(len(eval_samples)):
                        try:
                            X_new = preprocessing(eval_samples[i], method='mfcc', sr=16000, n_mfcc=n_mfcc)
                            X_new = convert_tensor(X_new).to(device)
                            y_new = model(X_new)
                            y_new = torch.argmax(nn.Softmax(dim=-1)(torch.mean(y_new, dim=0)))
                            #y_new = sorted(dict(collections.Counter(torch.argmax(nn.Softmax(dim=-1)(y_new), dim=1).cpu().numpy()))
                            #               .items(), key=(lambda x: x[1]), reverse=True)[0][0]
                            y_new = 1 if (y_new.item() == y[eval_idx][i].item()) else 0
                            correct += y_new
                            n += 1
                        except:
                            pass

                    acc = correct / n
                    acc_list.append(acc)

                acc_mean = sum(acc_list) / 5
                print('conv_dim:', conv_dim, '\taugmentation', augmentation, 
                      '\twith_focus_attn:', with_focus_attn, '\tmulti_task:', multi_task)
                print('Test accuray:', round(acc_mean, 5))
                print()

1080it [00:07, 152.07it/s]
360it [00:02, 135.77it/s]
1080it [00:07, 145.02it/s]
360it [00:02, 179.74it/s]
1080it [00:07, 138.82it/s]
360it [00:02, 148.03it/s]
1080it [00:08, 132.02it/s]
360it [00:02, 132.90it/s]
1080it [00:05, 191.33it/s]
360it [00:02, 170.37it/s]
1080it [00:06, 168.55it/s]
360it [00:01, 192.66it/s]
1080it [00:06, 173.92it/s]
360it [00:01, 187.85it/s]
1080it [00:05, 187.68it/s]
360it [00:01, 187.75it/s]
1080it [00:09, 119.01it/s]
360it [00:02, 161.21it/s]
1080it [00:05, 187.06it/s]
360it [00:01, 184.49it/s]
14it [00:00, 135.56it/s]

conv_dim: 1d 	augmentation true 	with_focus_attn: True 	multi_task: true
Test accuray: 0.65833



1080it [00:07, 143.31it/s]
360it [00:02, 136.37it/s]
1080it [00:07, 140.99it/s]
360it [00:02, 136.69it/s]
1080it [00:06, 160.39it/s]
360it [00:02, 134.02it/s]
1080it [00:05, 195.86it/s]
360it [00:01, 196.55it/s]
1080it [00:05, 192.06it/s]
360it [00:01, 191.59it/s]
20it [00:00, 195.21it/s]

conv_dim: 1d 	augmentation true 	with_focus_attn: True 	multi_task: false
Test accuray: 0.635



1080it [00:06, 177.41it/s]
360it [00:01, 189.22it/s]
1080it [00:05, 189.18it/s]
360it [00:01, 195.85it/s]
1080it [00:08, 127.91it/s]
360it [00:02, 160.11it/s]
1080it [00:07, 140.40it/s]
360it [00:02, 121.80it/s]
1080it [00:06, 170.81it/s]
360it [00:02, 174.91it/s]
1080it [00:05, 186.47it/s]
360it [00:01, 188.32it/s]
1080it [00:05, 180.82it/s]
360it [00:01, 195.41it/s]
1080it [00:05, 190.79it/s]
360it [00:01, 195.58it/s]
1080it [00:05, 180.70it/s]
360it [00:01, 198.38it/s]
1080it [00:06, 175.51it/s]
360it [00:02, 134.51it/s]
19it [00:00, 186.66it/s]

conv_dim: 1d 	augmentation true 	with_focus_attn: False 	multi_task: true
Test accuray: 0.66556



1080it [00:06, 162.71it/s]
360it [00:02, 135.80it/s]
1080it [00:06, 168.54it/s]
360it [00:02, 136.04it/s]
1080it [00:07, 136.51it/s]
360it [00:02, 143.10it/s]
1080it [00:07, 151.91it/s]
360it [00:02, 155.34it/s]
1080it [00:05, 196.49it/s]
360it [00:01, 197.73it/s]
20it [00:00, 194.57it/s]

conv_dim: 1d 	augmentation true 	with_focus_attn: False 	multi_task: false
Test accuray: 0.63722



1080it [00:07, 138.60it/s]
360it [00:02, 137.13it/s]
1080it [00:07, 135.88it/s]
360it [00:02, 137.08it/s]
1080it [00:05, 195.76it/s]
360it [00:01, 195.37it/s]
1080it [00:05, 191.76it/s]
360it [00:01, 192.06it/s]
1080it [00:08, 134.77it/s]
360it [00:02, 134.27it/s]
1080it [00:08, 133.57it/s]
360it [00:02, 146.71it/s]
1080it [00:05, 184.18it/s]
360it [00:02, 173.07it/s]
1080it [00:06, 159.87it/s]
360it [00:02, 136.24it/s]
1080it [00:05, 196.70it/s]
360it [00:01, 194.51it/s]
1080it [00:06, 168.51it/s]
360it [00:02, 137.40it/s]
20it [00:00, 190.14it/s]

conv_dim: 1d 	augmentation false 	with_focus_attn: True 	multi_task: true
Test accuray: 0.55667



1080it [00:05, 189.00it/s]
360it [00:01, 192.38it/s]
1080it [00:07, 146.52it/s]
360it [00:02, 179.53it/s]
1080it [00:07, 143.40it/s]
360it [00:02, 136.78it/s]
1080it [00:06, 173.32it/s]
360it [00:02, 137.28it/s]
1080it [00:05, 188.31it/s]
360it [00:02, 177.87it/s]
20it [00:00, 192.76it/s]

conv_dim: 1d 	augmentation false 	with_focus_attn: True 	multi_task: false
Test accuray: 0.57722



1080it [00:05, 195.39it/s]
360it [00:01, 196.20it/s]
1080it [00:05, 192.40it/s]
360it [00:01, 196.32it/s]
1080it [00:07, 149.02it/s]
360it [00:02, 136.62it/s]
1080it [00:08, 134.97it/s]
360it [00:02, 136.88it/s]
1080it [00:07, 143.21it/s]
360it [00:02, 136.77it/s]
1080it [00:06, 169.02it/s]
360it [00:02, 136.66it/s]
1080it [00:05, 185.00it/s]
360it [00:01, 192.64it/s]
1080it [00:05, 193.63it/s]
360it [00:01, 195.60it/s]
1080it [00:06, 154.57it/s]
360it [00:02, 137.14it/s]
1080it [00:07, 136.54it/s]
360it [00:02, 144.13it/s]
20it [00:00, 189.91it/s]

conv_dim: 1d 	augmentation false 	with_focus_attn: False 	multi_task: true
Test accuray: 0.56056



1080it [00:05, 191.88it/s]
360it [00:01, 195.77it/s]
1080it [00:05, 188.07it/s]
360it [00:01, 186.29it/s]
1080it [00:05, 193.84it/s]
360it [00:01, 194.13it/s]
1080it [00:05, 190.89it/s]
360it [00:01, 185.61it/s]
1080it [00:07, 144.38it/s]
360it [00:02, 137.01it/s]


conv_dim: 1d 	augmentation false 	with_focus_attn: False 	multi_task: false
Test accuray: 0.56222



In [7]:
for conv_dim in conv_dim_list:
    for augmentation in augmentation_list:
        for with_focus_attn in focus_attn_list:
            for multi_task in multi_task_list:
                #conv_dim = '1d'
                checkpoint = './output/aae_' + conv_dim + '_step_300.pt'
                hidden_size = 128
                num_layers = 2
                bidirectional = 'true'
                #with_focus_attn = 'false'

                batch_size = 128
                num_epochs = 300
                learning_rate = 0.0001

                use_warmup = 'true'
                data_dir = './wav_data/pretrain/RAVDESS_resample/'
                #multi_task = 'false'
                #augmentation = 'true'

                bidirectional = True if(bidirectional == 'true') else False
                #with_focus_attn = True if(with_focus_attn == 'true') else False
                n_mfcc = 40 if(conv_dim == '1d') else 128

                device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

                sample_datas = glob.glob(os.path.join(data_dir, '**', '*wav'), recursive=True)
                sample_datas = sorted(sample_datas)

                acc_list = []
                for i in range(5):
                    np.random.seed(10 * i + 3)
                    idx = np.random.permutation(len(sample_datas))
                    train_idx = idx[:int(len(sample_datas)*0.75)]
                    eval_idx = idx[int(len(sample_datas)*0.75):]

                    train_samples = list(np.array(sample_datas)[train_idx])
                    eval_samples = list(np.array(sample_datas)[eval_idx])

                    y = np.array(list(map(lambda x: int(x.split('/')[-1].split('-')[2]) - 1, sample_datas)))
                    y_train = y[train_idx]
                    y_eval = y[eval_idx]

                    if(multi_task == 'true'):
                        speaker = np.array(list(map(lambda x: int(x.split('/')[-1].split('-')[-1].split('.')[0]), sample_datas)))
                        y_gender = np.array(list(map(lambda x: 1 if x % 2 ==0 else 0, speaker)))

                        y_g_train = y_gender[train_idx]
                        y_g_eval = y_gender[eval_idx]

                    X_train, y_train = convert_spectrograms(train_samples, conv_dim=conv_dim, sr=16000, labels=y_train)
                    X_eval, y_eval = convert_spectrograms(eval_samples, conv_dim=conv_dim, sr=16000, labels=y_eval)

                    if(augmentation == 'true'):
                        X_train_flip = X_train[:, :, :, ::-1]
                        y_train_flip = y_train.copy()

                        X_train = np.concatenate((X_train, X_train_flip), axis=0)
                        y_train = np.concatenate((y_train, y_train_flip), axis=0)
                    
                    X_train, y_train = convert_tensor(X_train, y_train)
                    X_eval, y_eval = convert_tensor(X_eval, y_eval)

                    y_train = y_train.long()
                    y_eval = y_eval.long()

                    if(multi_task == 'true'):
                        _, y_g_train = convert_spectrograms(train_samples, conv_dim=conv_dim, sr=16000, labels=y_g_train)
                        _, y_g_eval = convert_spectrograms(eval_samples, conv_dim=conv_dim, sr=16000, labels=y_g_eval)

                        if(augmentation == 'true'):
                            y_g_train_flip = y_g_train.copy()
                            y_g_train = np.concatenate((y_g_train, y_g_train_flip))
                        
                        y_g_train = torch.tensor(y_g_train).float()
                        y_g_eval = torch.tensor(y_g_eval).float()

                        y_g_train = y_g_train.unsqueeze(-1)
                        y_g_eval = y_g_eval.unsqueeze(-1)

                    if(multi_task == 'true'):
                        train_ds = TensorDataset(X_train, y_train, y_g_train)
                        eval_ds = TensorDataset(X_eval, y_eval, y_g_eval)
                    else:
                        train_ds = TensorDataset(X_train, y_train)
                        eval_ds = TensorDataset(X_eval, y_eval)

                    train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True)
                    eval_dataloader = DataLoader(eval_ds, batch_size=batch_size, num_workers=0, drop_last=True)

                    model = CLDNN(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
                                  num_layers=num_layers, bidirectional=bidirectional,
                                  with_focus_attn=with_focus_attn).to(device)

                    if(multi_task == 'true'):
                        model_g = CLDNN_G(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
                                            num_layers=num_layers, bidirectional=bidirectional,
                                            with_focus_attn=with_focus_attn).to(device)

                    if(multi_task == 'true'):
                        loss_func = nn.CrossEntropyLoss()
                        loss_func_g = nn.BCELoss()
                        optimizer = optim.Adam(list(model.parameters()) + list(model_g.parameters()), lr=learning_rate)
                    else:
                        loss_func = nn.CrossEntropyLoss()
                        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

                    if(use_warmup == 'true'):
                        t_total = len(train_dataloader) // 1 * num_epochs
                        opt_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=t_total * 0.1, t_total=t_total)

                    train(train_dataloader, eval_dataloader, num_epochs)

                    model.eval()
                    if(multi_task == 'true'):
                        model_g.eval()

                    correct = 0
                    n = 0
                    for i in range(len(eval_samples)):
                        try:
                            X_new = preprocessing(eval_samples[i], method='mfcc', sr=16000, n_mfcc=n_mfcc)
                            X_new = convert_tensor(X_new).to(device)
                            y_new = model(X_new)
                            y_new = torch.argmax(nn.Softmax(dim=-1)(torch.mean(y_new, dim=0)))
                            #y_new = sorted(dict(collections.Counter(torch.argmax(nn.Softmax(dim=-1)(y_new), dim=1).cpu().numpy()))
                            #               .items(), key=(lambda x: x[1]), reverse=True)[0][0]
                            y_new = 1 if (y_new.item() == y[eval_idx][i].item()) else 0
                            correct += y_new
                            n += 1
                        except:
                            pass

                    acc = correct / n
                    acc_list.append(acc)

                acc_mean = sum(acc_list) / 5
                print('conv_dim:', conv_dim, '\taugmentation', augmentation, 
                      '\twith_focus_attn:', with_focus_attn, '\tmulti_task:', multi_task)
                print('Test accuray:', round(acc_mean, 5))
                print()

1080it [00:08, 121.94it/s]
360it [00:02, 156.14it/s]
1080it [00:08, 125.02it/s]
360it [00:03, 111.12it/s]
1080it [00:09, 118.74it/s]
360it [00:02, 156.54it/s]
1080it [00:06, 156.71it/s]
360it [00:02, 152.83it/s]
1080it [00:07, 140.59it/s]
360it [00:02, 156.72it/s]
1080it [00:06, 157.17it/s]
360it [00:02, 154.44it/s]
1080it [00:09, 112.73it/s]
360it [00:03, 113.30it/s]
1080it [00:09, 112.31it/s]
360it [00:03, 112.25it/s]
1080it [00:07, 141.24it/s]
360it [00:02, 157.43it/s]
1080it [00:06, 156.25it/s]
360it [00:02, 156.27it/s]
16it [00:00, 152.81it/s]

conv_dim: 1d 	augmentation true 	with_focus_attn: True 	multi_task: true
Test accuray: 0.66611



1080it [00:08, 125.27it/s]
360it [00:03, 100.73it/s]
1080it [00:08, 129.36it/s]
360it [00:03, 110.84it/s]
1080it [00:08, 122.18it/s]
360it [00:02, 147.25it/s]
1080it [00:09, 113.51it/s]
360it [00:03, 112.76it/s]
1080it [00:06, 156.06it/s]
360it [00:02, 158.27it/s]
15it [00:00, 148.20it/s]

conv_dim: 1d 	augmentation true 	with_focus_attn: True 	multi_task: false
Test accuray: 0.65167



1080it [00:07, 153.84it/s]
360it [00:02, 157.00it/s]
1080it [00:07, 153.46it/s]
360it [00:02, 149.52it/s]
1080it [00:09, 115.36it/s]
360it [00:02, 134.57it/s]
1080it [00:07, 142.55it/s]
360it [00:02, 152.47it/s]
1080it [00:06, 156.84it/s]
360it [00:02, 155.79it/s]
1080it [00:06, 154.90it/s]
360it [00:02, 156.61it/s]
1080it [00:08, 134.12it/s]
360it [00:02, 140.49it/s]
1080it [00:07, 153.78it/s]
360it [00:02, 120.59it/s]
1080it [00:09, 118.57it/s]
360it [00:03, 112.30it/s]
1080it [00:09, 111.86it/s]
360it [00:03, 112.50it/s]
12it [00:00, 111.48it/s]

conv_dim: 1d 	augmentation true 	with_focus_attn: False 	multi_task: true
Test accuray: 0.63722



1080it [00:09, 111.29it/s]
360it [00:02, 158.29it/s]
1080it [00:08, 124.71it/s]
360it [00:02, 156.93it/s]
1080it [00:06, 155.90it/s]
360it [00:02, 154.90it/s]
1080it [00:09, 112.15it/s]
360it [00:03, 112.40it/s]
1080it [00:08, 130.50it/s]
360it [00:02, 125.18it/s]
15it [00:00, 147.16it/s]

conv_dim: 1d 	augmentation true 	with_focus_attn: False 	multi_task: false
Test accuray: 0.67889



1080it [00:09, 112.75it/s]
360it [00:03, 106.24it/s]
1080it [00:07, 138.67it/s]
360it [00:02, 158.10it/s]
1080it [00:08, 123.67it/s]
360it [00:02, 157.94it/s]
1080it [00:09, 116.57it/s]
360it [00:03, 112.75it/s]
1080it [00:08, 132.42it/s]
360it [00:03, 111.67it/s]
1080it [00:09, 118.12it/s]
360it [00:02, 158.01it/s]
1080it [00:07, 148.03it/s]
360it [00:02, 144.80it/s]
1080it [00:06, 155.73it/s]
360it [00:02, 157.92it/s]
1080it [00:07, 145.93it/s]
360it [00:02, 151.40it/s]
1080it [00:06, 155.70it/s]
360it [00:02, 135.73it/s]
16it [00:00, 152.83it/s]

conv_dim: 1d 	augmentation false 	with_focus_attn: True 	multi_task: true
Test accuray: 0.58056



1080it [00:10, 100.20it/s]
360it [00:03, 99.02it/s] 
1080it [00:07, 152.37it/s]
360it [00:02, 151.50it/s]
1080it [00:08, 132.41it/s]
360it [00:02, 140.44it/s]
1080it [00:06, 156.10it/s]
360it [00:02, 152.53it/s]
1080it [00:07, 136.20it/s]
360it [00:02, 121.28it/s]
16it [00:00, 154.44it/s]

conv_dim: 1d 	augmentation false 	with_focus_attn: True 	multi_task: false
Test accuray: 0.555



1080it [00:10, 107.17it/s]
360it [00:03, 98.99it/s]
1080it [00:09, 117.82it/s]
360it [00:02, 153.38it/s]
1080it [00:06, 157.77it/s]
360it [00:02, 157.11it/s]
1080it [00:10, 102.07it/s]
360it [00:03, 99.10it/s] 
1080it [00:06, 156.38it/s]
360it [00:02, 156.58it/s]
1080it [00:06, 155.63it/s]
360it [00:02, 149.67it/s]
1080it [00:06, 155.18it/s]
360it [00:02, 155.31it/s]
1080it [00:06, 155.18it/s]
360it [00:02, 150.41it/s]
1080it [00:08, 132.19it/s]
360it [00:03, 112.17it/s]
1080it [00:09, 111.78it/s]
360it [00:03, 112.21it/s]
16it [00:00, 155.33it/s]

conv_dim: 1d 	augmentation false 	with_focus_attn: False 	multi_task: true
Test accuray: 0.57389



1080it [00:08, 132.40it/s]
360it [00:02, 157.34it/s]
1080it [00:07, 153.75it/s]
360it [00:02, 153.63it/s]
1080it [00:08, 131.31it/s]
360it [00:03, 110.72it/s]
1080it [00:07, 150.66it/s]
360it [00:02, 139.47it/s]
1080it [00:06, 154.29it/s]
360it [00:02, 158.65it/s]
11it [00:00, 109.84it/s]

conv_dim: 1d 	augmentation false 	with_focus_attn: False 	multi_task: false
Test accuray: 0.56833



1080it [00:08, 129.52it/s]
360it [00:02, 151.79it/s]
1080it [00:06, 154.57it/s]
360it [00:02, 153.58it/s]
1080it [00:07, 142.93it/s]
360it [00:02, 121.67it/s]
1080it [00:07, 141.18it/s]
360it [00:02, 153.88it/s]
1080it [00:09, 110.56it/s]
360it [00:03, 110.50it/s]
1080it [00:07, 146.73it/s]
360it [00:03, 117.78it/s]
1080it [00:08, 120.55it/s]
360it [00:02, 152.17it/s]
1080it [00:07, 140.66it/s]
360it [00:02, 142.08it/s]
1080it [00:08, 120.92it/s]
360it [00:02, 155.59it/s]
1080it [00:07, 153.97it/s]
360it [00:02, 155.30it/s]
16it [00:00, 154.79it/s]

conv_dim: 2d 	augmentation true 	with_focus_attn: True 	multi_task: true
Test accuray: 0.63944



1080it [00:08, 123.73it/s]
360it [00:02, 140.28it/s]
1080it [00:08, 121.29it/s]
360it [00:03, 110.63it/s]
1080it [00:08, 124.50it/s]
360it [00:02, 154.17it/s]
1080it [00:07, 143.76it/s]
360it [00:02, 155.68it/s]
1080it [00:07, 151.43it/s]
360it [00:02, 145.39it/s]
11it [00:00, 109.83it/s]

conv_dim: 2d 	augmentation true 	with_focus_attn: True 	multi_task: false
Test accuray: 0.61778



1080it [00:07, 143.10it/s]
360it [00:02, 130.42it/s]
1080it [00:08, 124.34it/s]
360it [00:02, 155.48it/s]
1080it [00:08, 121.30it/s]
360it [00:03, 98.74it/s]
1080it [00:07, 142.10it/s]
360it [00:02, 130.59it/s]
1080it [00:07, 147.42it/s]
360it [00:02, 135.23it/s]
1080it [00:07, 136.17it/s]
360it [00:02, 154.60it/s]
1080it [00:09, 111.49it/s]
360it [00:03, 109.01it/s]
1080it [00:09, 108.81it/s]
360it [00:03, 111.52it/s]
1080it [00:07, 141.06it/s]
360it [00:03, 111.98it/s]
1080it [00:07, 151.65it/s]
360it [00:03, 111.79it/s]
14it [00:00, 137.55it/s]

conv_dim: 2d 	augmentation true 	with_focus_attn: False 	multi_task: true
Test accuray: 0.63056



1080it [00:07, 148.98it/s]
360it [00:02, 151.43it/s]
1080it [00:07, 151.54it/s]
360it [00:02, 155.02it/s]
1080it [00:07, 138.66it/s]
360it [00:02, 136.73it/s]
1080it [00:09, 111.15it/s]
360it [00:03, 108.49it/s]
1080it [00:09, 113.02it/s]
360it [00:03, 110.60it/s]
12it [00:00, 110.54it/s]

conv_dim: 2d 	augmentation true 	with_focus_attn: False 	multi_task: false
Test accuray: 0.63944



1080it [00:09, 109.56it/s]
360it [00:03, 98.30it/s]
1080it [00:10, 103.30it/s]
360it [00:03, 111.06it/s]
1080it [00:07, 140.18it/s]
360it [00:03, 98.69it/s] 
1080it [00:10, 98.29it/s]
360it [00:03, 103.38it/s]
1080it [00:09, 119.51it/s]
360it [00:02, 147.77it/s]
1080it [00:09, 112.68it/s]
360it [00:03, 110.90it/s]
1080it [00:06, 154.88it/s]
360it [00:02, 155.13it/s]
1080it [00:07, 144.10it/s]
360it [00:02, 155.91it/s]
1080it [00:09, 110.89it/s]
360it [00:02, 131.46it/s]
1080it [00:08, 124.91it/s]
360it [00:03, 114.70it/s]
11it [00:00, 109.91it/s]

conv_dim: 2d 	augmentation false 	with_focus_attn: True 	multi_task: true
Test accuray: 0.61222



1080it [00:09, 110.76it/s]
360it [00:03, 111.39it/s]
1080it [00:09, 111.56it/s]
360it [00:03, 112.44it/s]
1080it [00:08, 120.74it/s]
360it [00:03, 111.89it/s]
1080it [00:08, 123.92it/s]
360it [00:03, 111.05it/s]
1080it [00:09, 108.39it/s]
360it [00:03, 118.91it/s]
12it [00:00, 111.63it/s]

conv_dim: 2d 	augmentation false 	with_focus_attn: True 	multi_task: false
Test accuray: 0.62444



1080it [00:09, 111.65it/s]
360it [00:02, 129.49it/s]
1080it [00:09, 117.31it/s]
360it [00:03, 105.92it/s]
1080it [00:07, 141.94it/s]
360it [00:03, 111.80it/s]
1080it [00:08, 125.33it/s]
360it [00:03, 111.48it/s]
1080it [00:09, 112.83it/s]
360it [00:02, 131.55it/s]
1080it [00:07, 141.00it/s]
360it [00:02, 155.75it/s]
1080it [00:07, 152.82it/s]
360it [00:02, 129.43it/s]
1080it [00:07, 148.54it/s]
360it [00:02, 154.90it/s]
1080it [00:09, 114.44it/s]
360it [00:03, 111.55it/s]
1080it [00:09, 109.60it/s]
360it [00:03, 111.91it/s]
11it [00:00, 108.73it/s]

conv_dim: 2d 	augmentation false 	with_focus_attn: False 	multi_task: true
Test accuray: 0.62167



1080it [00:08, 123.88it/s]
360it [00:02, 152.88it/s]
1080it [00:06, 155.50it/s]
360it [00:02, 157.96it/s]
1080it [00:06, 155.31it/s]
360it [00:02, 137.84it/s]
1080it [00:07, 143.49it/s]
360it [00:02, 132.59it/s]
1080it [00:07, 138.26it/s]
360it [00:02, 155.36it/s]


conv_dim: 2d 	augmentation false 	with_focus_attn: False 	multi_task: false
Test accuray: 0.59389

