# server3

In [1]:
import os
import glob

import math
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader
from preprocessing import preprocessing, convert_spectrograms, convert_tensor
from model_ae import Encoder
from utils.optimization import WarmupLinearSchedule

In [2]:
conv_dim = '1d'
checkpoint = './model/aae_1d_step_300.pt'
hidden_size = 128
num_layers = 2
bidirectional = 'true'
with_focus_attn = 'true'

batch_size = 256
num_epochs = 300
learning_rate = 0.0001

use_warmup = 'true'
train_dir = './wav_data/pretrain/wav_train/'
eval_dir = './wav_data/pretrain/wav_eval/'
multi_task = 'false'
augmentation = 'true'

save_checkpoint_steps = 10
output_dir = 'model_check'

bidirectional = True if(bidirectional == 'true') else False
with_focus_attn = True if(with_focus_attn == 'true') else False
n_mfcc = 40 if(conv_dim == '1d') else 128

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

train_datas = glob.glob(os.path.join(train_dir, '**', '*wav'), recursive=True)
eval_datas = glob.glob(os.path.join(eval_dir, '**', '*wav'), recursive=True)
train_datas = sorted(train_datas, key=len)
eval_datas = sorted(eval_datas, key=len)

In [3]:
len(train_datas), len(eval_datas)

(1228, 165)

In [4]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, num_attn_heads, attn_hidden_size, dropout_prob, with_focus_attn):
        super(MultiHeadedAttention, self).__init__()
        self.num_attn_heads = num_attn_heads
        self.hidden_size = attn_hidden_size
        self.dropout_prob = dropout_prob
        self.with_focus_attn = with_focus_attn
        
        self.attn_head_size = int(self.hidden_size / self.num_attn_heads)
        self.all_head_size = self.num_attn_heads * self.attn_head_size

        self.query = nn.Linear(self.hidden_size, self.all_head_size)
        self.key = nn.Linear(self.hidden_size, self.all_head_size)
        self.value = nn.Linear(self.hidden_size, self.all_head_size)

        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_prob)

        self.softmax = nn.Softmax(dim=-1)
        
        if(with_focus_attn == True):
            self.tanh = nn.Tanh()
            self.sigmoid = nn.Sigmoid()
            
            self.linear_focus_query = nn.Linear(num_attn_heads * self.attn_head_size, 
                                                num_attn_heads * self.attn_head_size)
            self.linear_focus_global = nn.Linear(num_attn_heads * self.attn_head_size, 
                                                 num_attn_heads * self.attn_head_size)
            
            up = torch.randn(num_attn_heads, 1, self.attn_head_size)
            self.up = Variable(up, requires_grad=True).cuda()
            torch.nn.init.xavier_uniform_(self.up)
            
            uz = torch.randn(num_attn_heads, 1, self.attn_head_size)
            self.uz = Variable(uz, requires_grad=True).cuda()
            torch.nn.init.xavier_uniform_(self.uz)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attn_heads, self.attn_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states):
        key_len = hidden_states.size(1)
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)
        
        if(self.with_focus_attn == True):
            glo = torch.mean(mixed_query_layer, dim=1, keepdim=True)
            
            c = self.tanh(self.linear_focus_query(mixed_query_layer) + self.linear_focus_global(glo))
            c = self.transpose_for_scores(c)
            
            p = c * self.up
            p = p.sum(3).squeeze()
            z = c * self.uz
            z = z.sum(3).squeeze()
            
            P = self.sigmoid(p) * key_len
            Z = self.sigmoid(z) * key_len
            
            j = torch.arange(start=0, end=key_len, dtype=P.dtype).unsqueeze(0).unsqueeze(0).unsqueeze(0).to('cuda')
            P = P.unsqueeze(-1)
            Z = Z.unsqueeze(-1)
            
            G = -(j - P)**2 * 2 / (Z**2)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attn_head_size)
        
        if(self.with_focus_attn == True):
            attention_scores = attention_scores + G
            
        attention_probs = self.softmax(attention_scores)
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)
        attention_output = self.o_proj(context_layer)

        return attention_output

In [5]:
class CLDNN(nn.Module):
    def __init__(self, conv_dim, checkpoint=None, hidden_size=64, num_layers=2,
                 bidirectional=True, with_focus_attn=False):
        super(CLDNN, self).__init__()
        self.conv_dim = conv_dim
        if(conv_dim == '1d'):
            self.encoder = Encoder(conv_dim)
            if checkpoint:
                self.encoder.load_state_dict(torch.load(checkpoint))
            self.attn = MultiHeadedAttention(num_attn_heads=4, attn_hidden_size=8, dropout_prob=0.1,
                                             with_focus_attn=with_focus_attn)
            self.lstm = nn.LSTM(8, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)
            self.fc = nn.Sequential(
                nn.Linear(hidden_size*2 if bidirectional else hidden_size, 1),
                nn.Sigmoid()
            )
        elif(conv_dim == '2d'):
            self.encoder = Encoder(conv_dim)
            if checkpoint:
                self.encoder.load_state_dict(torch.load(checkpoint))
            self.attn = MultiHeadedAttention(num_attn_heads=4, attn_hidden_size=176, dropout_prob=0.1, 
                                             with_focus_attn=with_focus_attn)
            self.gap = nn.AdaptiveAvgPool2d((1, 11))
            self.lstm = nn.LSTM(11, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)
            self.fc = nn.Sequential(
                nn.Linear(hidden_size*2 if bidirectional else hidden_size, 1),
                nn.Sigmoid()
            )
        else:
            raise ValueError("Convolution dimension not found: %s" % (conv_dim))
            
    def forward(self, x):
        if(self.conv_dim == '1d'):
            out = self.encoder(x)  # (batch, 1, 40, 100) -> (batch, 8, 1, 100)
            out = torch.squeeze(out, 2)  # (batch, 8, 1, 100) -> (batch, 8, 100)
            out = out.permute(0, 2, 1)  # (batch, 8, 100) -> (batch, 100, 8)
            h = out
            out = self.attn(out)  # (batch, 100, 8) -> (batch, 100, 8)
            out = h + out
            out = out.permute(1, 0, 2)  # (batch, 100, 8) -> (100, batch, 8)
            self.lstm.flatten_parameters()
            out, _ = self.lstm(out)  # (100, batch, 8) -> (100, batch, num_directions*hidden_size)
            out = out[-1]  # (100, batch, num_directions*hidden_size) -> (batch, num_directions*hidden_size)
            out = self.fc(out)  # (batch, num_directions*hidden_size) -> (batch, 1)
        elif(self.conv_dim == '2d'):
            out = self.encoder(x)  # (batch, 1, 128, 100) -> (batch, 16, 11, 8)
            out = out.permute(0, 3, 1, 2)  # (batch, 16, 11, 8) -> (batch, 8, 16, 11)
            h = out
            new_out_shape = out.size()[:2] + (out.size()[2] * out.size()[3],)
            out = out.view(*new_out_shape)  # (batch, 8, 16, 11) -> (batch, 8, 176)
            out = self.attn(out)  # (batch, 8, 176) -> (batch, 8, 176)
            out = out.view(h.size())  # (batch, 8, 176) -> (batch, 8, 16, 11)
            out = h + out
            out = self.gap(out)  # (batch, 8, 16, 11) -> (batch, 8, 1, 11)
            out = torch.squeeze(out, 2)  # (batch, 8, 1, 11) -> (batch, 8, 11)
            out = out.permute(1, 0, 2)  # (batch, 8, 11) -> (8, batch, 11)
            self.lstm.flatten_parameters()
            out, _ = self.lstm(out)  # (8, batch, 11) -> (8, batch, num_directions*hidden_size)
            out = out[-1]  # (8, batch, num_directions*hidden_size) -> (batch, num_directions*hidden_size)
            out = self.fc(out)  # (batch, num_directions*hidden_size) -> (batch, 1)
        return out

np.random.seed(42)
idx = np.random.permutation(int(len(sample_datas)))  # add

train_idx = idx[:int((len(sample_datas))*0.75)]  # add
eval_idx = idx[int((len(sample_datas))*0.75):]  # add
#noise_idx = np.arange((int(len(sample_datas)/2)), len(sample_datas))  # add
#train_idx = np.r_[train_idx, noise_idx]  # add

train_samples = list(np.array(sample_datas)[train_idx])
eval_samples = list(np.array(sample_datas)[eval_idx])

In [6]:
X_train = convert_spectrograms(train_datas, conv_dim=conv_dim, sr=16000)
X_eval = convert_spectrograms(eval_datas, conv_dim=conv_dim, sr=16000)

1228it [00:54, 22.63it/s]
165it [00:06, 23.67it/s]


In [7]:
train_label_list = []
for i in range(len(train_datas)):
    if 'noise' in train_datas[i]:
        label_filename = train_datas[i].split('/')[-1].split('_')[0] + '.wav' + '.npy'
    else:
        label_filename = train_datas[i].split('/')[-1] + '.npy'
    
    label_filepath = os.path.join('./wav_data/pretrain/label/', label_filename)
    label = np.load(label_filepath)
    train_label_list.append(label)

In [8]:
y_train = np.concatenate(train_label_list)

In [9]:
eval_label_list = []
for i in range(len(eval_datas)):
    if 'noise' in eval_datas[i]:
        label_filename = eval_datas[i].split('/')[-1].split('_')[0] + '.wav' + '.npy'
    else:
        label_filename = eval_datas[i].split('/')[-1] + '.npy'
    
    label_filepath = os.path.join('./wav_data/pretrain/label/', label_filename)
    label = np.load(label_filepath)
    eval_label_list.append(label)

In [10]:
y_eval = np.concatenate(eval_label_list)

In [11]:
X_train.shape, y_train.shape, X_eval.shape, y_eval.shape

((486288, 40, 50, 1), (486288,), (65340, 40, 50, 1), (65340,))

In [12]:
if(multi_task == 'true'):
    speaker = np.array(list(map(lambda x: int(x.split('/')[-1].split('-')[-1].split('.')[0].split('_')[0]), 
                                sample_datas)))  # add
    y_gender = np.array(list(map(lambda x: 1 if x % 2 ==0 else 0, speaker)))

    y_g_train = y_gender[train_idx]
    y_g_eval = y_gender[eval_idx]

In [13]:
if(augmentation == 'true'):
    X_train_flip = X_train[:, :, :, ::-1]
    y_train_flip = y_train.copy()

    X_train = np.concatenate((X_train, X_train_flip), axis=0)
    y_train = np.concatenate((y_train, y_train_flip), axis=0)

In [14]:
X_train, y_train = convert_tensor(X_train, y_train)
X_eval, y_eval = convert_tensor(X_eval, y_eval)

In [15]:
y_train = y_train.unsqueeze(-1)
y_eval = y_eval.unsqueeze(-1)

In [16]:
if(multi_task == 'true'):
    _, y_g_train = convert_spectrograms(train_samples, conv_dim=conv_dim, sr=16000, labels=y_g_train)
    _, y_g_eval = convert_spectrograms(eval_samples, conv_dim=conv_dim, sr=16000, labels=y_g_eval)
    
    if(augmentation == 'true'):
        y_g_train_flip = y_g_train.copy()
        y_g_train = np.concatenate((y_g_train, y_g_train_flip))
    
    y_g_train = torch.tensor(y_g_train).float()
    y_g_eval = torch.tensor(y_g_eval).float()

    y_g_train = y_g_train.unsqueeze(-1)
    y_g_eval = y_g_eval.unsqueeze(-1)

In [17]:
if(multi_task == 'true'):
    print(X_train.shape, y_train.shape, y_g_train.shape, X_eval.shape, y_eval.shape, y_g_eval.shape)
else:
    print(X_train.shape, y_train.shape, X_eval.shape, y_eval.shape)

torch.Size([972576, 1, 40, 50]) torch.Size([972576, 1]) torch.Size([65340, 1, 40, 50]) torch.Size([65340, 1])


In [18]:
if(multi_task == 'true'):
    train_ds = TensorDataset(X_train, y_train, y_g_train)
    eval_ds = TensorDataset(X_eval, y_eval, y_g_eval)
else:
    train_ds = TensorDataset(X_train, y_train)
    eval_ds = TensorDataset(X_eval, y_eval)

train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True)
eval_dataloader = DataLoader(eval_ds, batch_size=batch_size, num_workers=0, drop_last=True)

In [19]:
model = CLDNN(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
              num_layers=num_layers, bidirectional=bidirectional,
              with_focus_attn=with_focus_attn).to(device)

In [20]:
if(multi_task == 'true'):
    model_g = CLDNN_G(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
                        num_layers=num_layers, bidirectional=bidirectional,
                        with_focus_attn=with_focus_attn).to(device)

In [21]:
if(multi_task == 'true'):
    loss_func = nn.CrossEntropyLoss()
    loss_func_g = nn.BCELoss()
    optimizer = optim.Adam(list(model.parameters()) + list(model_g.parameters()), lr=learning_rate)
else:
    loss_func = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [22]:
if(use_warmup == 'true'):
    t_total = len(train_dataloader) // 1 * num_epochs
    opt_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=t_total * 0.1, t_total=t_total)

In [23]:
def train(train_dataloader, eval_dataloader, epochs):
        print('Start training')
        softmax = nn.Softmax(dim=1)
        for epoch in range(epochs):
            model.train()
            train_loss = 0
            nb_train_steps = 0
            correct = 0
            num_samples = 0
            
            if(multi_task == 'true'):
                for X_batch, y_batch, y_g_batch in train_dataloader:
                    X_batch = X_batch.to(device)
                    y_batch = y_batch.to(device)
                    y_g_batch = y_g_batch.to(device)

                    optimizer.zero_grad()

                    outputs = model(X_batch)
                    outputs_g = model_g(X_batch)

                    loss_1 = loss_func(outputs, y_batch)
                    loss_2 = loss_func_g(outputs_g, y_g_batch)
                    loss = loss_1 + 0.8 * loss_2
                    loss.backward(retain_graph=True)

                    optimizer.step()
                    opt_scheduler.step()

                    train_loss += loss.mean().item()
                    nb_train_steps += 1

                    outputs = softmax(outputs)
                    outputs = torch.argmax(outputs, dim=1)
                    correct += (outputs == y_batch).float().sum()
                    num_samples += len(X_batch)

                train_loss = train_loss / nb_train_steps
                train_accuracy = correct / num_samples

                model.eval()
                eval_loss = 0
                nb_eval_steps = 0
                correct = 0
                num_samples = 0

                for X_batch, y_batch, y_g_batch in eval_dataloader:
                    X_batch = X_batch.to(device)
                    y_batch = y_batch.to(device)
                    y_g_batch = y_g_batch.to(device)
                    with torch.no_grad():
                        outputs = model(X_batch)
                        outputs_g = model_g(X_batch)

                    tmp_eval_loss_1 = loss_func(outputs, y_batch)
                    tmp_eval_loss_2 = loss_func_g(outputs_g, y_g_batch)
                    tmp_eval_loss = tmp_eval_loss_1 + 0.8 * tmp_eval_loss_2
                    eval_loss += tmp_eval_loss.mean().item()
                    nb_eval_steps += 1

                    outputs = softmax(outputs)
                    outputs = torch.argmax(outputs, dim=1)
                    correct += (outputs == y_batch).float().sum()
                    num_samples += len(X_batch)

                eval_loss = eval_loss / nb_eval_steps
                eval_accuracy = correct / num_samples
            else:
                for X_batch, y_batch in train_dataloader:
                    X_batch = X_batch.to(device)
                    y_batch = y_batch.to(device)

                    optimizer.zero_grad()

                    outputs = model(X_batch)

                    loss = loss_func(outputs, y_batch)
                    loss.backward()

                    optimizer.step()
                    opt_scheduler.step()

                    train_loss += loss.mean().item()
                    nb_train_steps += 1

                    outputs = (outputs >= 0.5).float()
                    correct += (outputs == y_batch).float().sum()
                    num_samples += len(X_batch)

                train_loss = train_loss / nb_train_steps
                train_accuracy = correct / num_samples

                model.eval()
                eval_loss = 0
                nb_eval_steps = 0
                correct = 0
                num_samples = 0

                for X_batch, y_batch in eval_dataloader:
                    X_batch = X_batch.to(device)
                    y_batch = y_batch.to(device)
                    with torch.no_grad():
                        outputs = model(X_batch)

                    tmp_eval_loss = loss_func(outputs, y_batch)
                    eval_loss += tmp_eval_loss.mean().item()
                    nb_eval_steps += 1

                    outputs = (outputs >= 0.5).float()
                    correct += (outputs == y_batch).float().sum()
                    num_samples += len(X_batch)

                eval_loss = eval_loss / nb_eval_steps
                eval_accuracy = correct / num_samples

            for param_group in optimizer.param_groups:
                lr = param_group['lr']
            print('epoch: {:3d},    lr={:6f},    loss={:5f},    train_acc={:5f},    eval_loss={:5f},    eval_acc={:5f}'
                  .format(epoch+1, lr, train_loss, train_accuracy, eval_loss, eval_accuracy))

            
            if((epoch+1) % save_checkpoint_steps == 0):
                model_checkpoint = "%s_%s_step_%d.pt" % ('CLDNN', conv_dim, epoch+1)
                output_model_file = os.path.join(output_dir, model_checkpoint)
                torch.save(model.state_dict(), output_model_file)
                print("Saving checkpoint %s" % output_model_file)

In [24]:
train(train_dataloader, eval_dataloader, num_epochs)

Start training
epoch:   1,    lr=0.000003,    loss=0.686816,    train_acc=0.531667,    eval_loss=0.676886,    eval_acc=0.589997
epoch:   2,    lr=0.000007,    loss=0.674499,    train_acc=0.595937,    eval_loss=0.676570,    eval_acc=0.589997
epoch:   3,    lr=0.000010,    loss=0.670660,    train_acc=0.596986,    eval_loss=0.662371,    eval_acc=0.602512
epoch:   4,    lr=0.000013,    loss=0.660583,    train_acc=0.611931,    eval_loss=0.659504,    eval_acc=0.611995
epoch:   5,    lr=0.000017,    loss=0.657842,    train_acc=0.617348,    eval_loss=0.659252,    eval_acc=0.612607
epoch:   6,    lr=0.000020,    loss=0.656114,    train_acc=0.619829,    eval_loss=0.665010,    eval_acc=0.611290
epoch:   7,    lr=0.000023,    loss=0.654776,    train_acc=0.621233,    eval_loss=0.660871,    eval_acc=0.613205
epoch:   8,    lr=0.000027,    loss=0.653519,    train_acc=0.622847,    eval_loss=0.664339,    eval_acc=0.613281
epoch:   9,    lr=0.000030,    loss=0.651852,    train_acc=0.624621,    eval_loss

epoch:  71,    lr=0.000085,    loss=0.586844,    train_acc=0.682008,    eval_loss=0.707217,    eval_acc=0.598407
epoch:  72,    lr=0.000084,    loss=0.584913,    train_acc=0.683394,    eval_loss=0.714081,    eval_acc=0.595803
epoch:  73,    lr=0.000084,    loss=0.582976,    train_acc=0.684781,    eval_loss=0.710791,    eval_acc=0.593658
epoch:  74,    lr=0.000084,    loss=0.580805,    train_acc=0.686373,    eval_loss=0.717950,    eval_acc=0.596094
epoch:  75,    lr=0.000083,    loss=0.578778,    train_acc=0.688302,    eval_loss=0.725075,    eval_acc=0.592862
epoch:  76,    lr=0.000083,    loss=0.576678,    train_acc=0.689262,    eval_loss=0.726011,    eval_acc=0.599571
epoch:  77,    lr=0.000083,    loss=0.574377,    train_acc=0.691413,    eval_loss=0.721984,    eval_acc=0.590028
epoch:  78,    lr=0.000082,    loss=0.571998,    train_acc=0.693084,    eval_loss=0.721935,    eval_acc=0.591360
epoch:  79,    lr=0.000082,    loss=0.570115,    train_acc=0.694541,    eval_loss=0.730858,    e

epoch: 141,    lr=0.000059,    loss=0.432113,    train_acc=0.785074,    eval_loss=0.996871,    eval_acc=0.580668
epoch: 142,    lr=0.000059,    loss=0.430496,    train_acc=0.786387,    eval_loss=1.013242,    eval_acc=0.576900
epoch: 143,    lr=0.000058,    loss=0.428674,    train_acc=0.787482,    eval_loss=1.021653,    eval_acc=0.578998
epoch: 144,    lr=0.000058,    loss=0.426369,    train_acc=0.788909,    eval_loss=1.020313,    eval_acc=0.576961
epoch: 145,    lr=0.000057,    loss=0.425496,    train_acc=0.789375,    eval_loss=1.026765,    eval_acc=0.578983
epoch: 146,    lr=0.000057,    loss=0.422416,    train_acc=0.790417,    eval_loss=1.017006,    eval_acc=0.573223
epoch: 147,    lr=0.000057,    loss=0.421018,    train_acc=0.791982,    eval_loss=1.027991,    eval_acc=0.576900
epoch: 148,    lr=0.000056,    loss=0.419246,    train_acc=0.792941,    eval_loss=1.041621,    eval_acc=0.578431
epoch: 149,    lr=0.000056,    loss=0.417550,    train_acc=0.793838,    eval_loss=1.044054,    e

epoch: 211,    lr=0.000033,    loss=0.337742,    train_acc=0.838517,    eval_loss=1.391519,    eval_acc=0.570757
epoch: 212,    lr=0.000033,    loss=0.336680,    train_acc=0.839244,    eval_loss=1.381087,    eval_acc=0.571232
epoch: 213,    lr=0.000032,    loss=0.335117,    train_acc=0.840225,    eval_loss=1.376592,    eval_acc=0.574219
epoch: 214,    lr=0.000032,    loss=0.334610,    train_acc=0.840329,    eval_loss=1.371699,    eval_acc=0.569792
epoch: 215,    lr=0.000031,    loss=0.334008,    train_acc=0.840564,    eval_loss=1.382538,    eval_acc=0.571324
epoch: 216,    lr=0.000031,    loss=0.332918,    train_acc=0.841209,    eval_loss=1.388898,    eval_acc=0.570267
epoch: 217,    lr=0.000031,    loss=0.331756,    train_acc=0.842185,    eval_loss=1.392808,    eval_acc=0.569455
epoch: 218,    lr=0.000030,    loss=0.331379,    train_acc=0.842206,    eval_loss=1.378594,    eval_acc=0.569930
epoch: 219,    lr=0.000030,    loss=0.330711,    train_acc=0.842372,    eval_loss=1.396689,    e

epoch: 281,    lr=0.000007,    loss=0.293154,    train_acc=0.861501,    eval_loss=1.651639,    eval_acc=0.570343
epoch: 282,    lr=0.000007,    loss=0.294121,    train_acc=0.861610,    eval_loss=1.624542,    eval_acc=0.568842
epoch: 283,    lr=0.000006,    loss=0.293674,    train_acc=0.861502,    eval_loss=1.622840,    eval_acc=0.568045
epoch: 284,    lr=0.000006,    loss=0.292491,    train_acc=0.861913,    eval_loss=1.627914,    eval_acc=0.570665
epoch: 285,    lr=0.000006,    loss=0.292904,    train_acc=0.861772,    eval_loss=1.619719,    eval_acc=0.570956
epoch: 286,    lr=0.000005,    loss=0.291677,    train_acc=0.862335,    eval_loss=1.652431,    eval_acc=0.569455
epoch: 287,    lr=0.000005,    loss=0.291339,    train_acc=0.862586,    eval_loss=1.651491,    eval_acc=0.568183
epoch: 288,    lr=0.000004,    loss=0.292256,    train_acc=0.862147,    eval_loss=1.635071,    eval_acc=0.570818
epoch: 289,    lr=0.000004,    loss=0.291699,    train_acc=0.862684,    eval_loss=1.658300,    e

In [None]:
# CLDNN_6: 0.62

# predict

In [27]:
import time
import pandas as pd
from data_loader import AudioInferenceDataset, two_hot_encode
from preprocessing import preprocessing, convert_tensor

In [28]:
model = CLDNN(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
              num_layers=num_layers, bidirectional=bidirectional,
              with_focus_attn=with_focus_attn).to(device)
model.load_state_dict(torch.load('./model/CLDNN_1d_step_300.pt'))
model.eval()

CLDNN(
  (encoder): Encoder(
    (conv1): Sequential(
      (0): Conv1d(1, 4, kernel_size=(11, 1), stride=(1,))
      (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (conv2): Sequential(
      (0): Conv1d(4, 4, kernel_size=(11, 1), stride=(1,))
      (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (conv3): Sequential(
      (0): Conv1d(4, 8, kernel_size=(11, 1), stride=(1,))
      (1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (conv4): Sequential(
      (0): Conv1d(8, 8, kernel_size=(10, 1), stride=(1,))
      (1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
  )
  (attn): MultiHeadedAttention(
    (query): Linear(in_features=8, out_features=8, bias=True)
    (key): Linear(in_features=8, out_features=8, bias=True)
    (value): Linear(in_fe

In [29]:
testset = AudioInferenceDataset(root_dir='./wav_data/pretrain/sample')

In [30]:
test_loader = DataLoader(dataset=testset, batch_size=64, shuffle=False)

In [31]:
def calculate(pred):
    num_zero = 0
    num_one = 0
    start = 0
    end = 1
    starts = []
    ends = []
    num_ones = []

    for k, value in enumerate(pred):
        if((value == 1) & (num_zero > 16)):
            starts.append(start)
            ends.append(end)
            num_ones.append(num_one)
            start = k
            num_one = 0
            num_zero = 0
            num_one += 1
        elif(value == 1):
            if(start == 0):
                start = k
            num_zero = 0
            num_one += 1
            end = k if(k != start) else start+1
        elif(value == 0):
            num_zero += 1
    starts.append(start)
    ends.append(end)
    num_ones.append(num_one)
    
    if(len(starts) == 1):
        start = 0
        end = 0
    else:
        idx = np.argmax(np.array([num_ones[i]**2/(ends[i]-starts[i]) for i in range(len(starts))]))
        start = round(starts[idx] * 0.1)
        end = np.ceil(ends[idx] * 0.1).astype('int')
    return start, end

In [32]:
def inference(model) -> pd.DataFrame:
    # testset = AudioInferenceDataset(root_dir=f'{test_path}/test_data')
    testset = AudioInferenceDataset(root_dir='./wav_data/pretrain/sample')

    test_loader = DataLoader(dataset=testset, batch_size=64, shuffle=False)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    y_start = []
    y_end = []
    filename_list = []

    with torch.no_grad():
        for i, data in enumerate(test_loader):
            x = data['audio']
            x = x.to(device)
            starts = []
            ends = []
            for j in range(len(x)):
                pred = model(x[j])
                pred = pred.detach().cpu().squeeze().numpy()
                pred = (pred >= 0.5).astype('float32')
                start, end = calculate(pred)
                starts.append(start)
                ends.append(end)
            
            filename_list += data['file_name']
            y_start += starts
            y_end += ends
            
        '''
        for i, data in enumerate(test_loader):
            x = data['audio']
            x = x.to(device)
            _, pred = model(x)

            filename_list += data['file_name']
            y_start += pred.detach().cpu()[:, 0].squeeze().tolist()
            y_end += pred.detach().cpu()[:, 1].squeeze().tolist()
        '''

    ret = pd.DataFrame({'file_name': filename_list, 'start': y_start, 'end': y_end})
    return ret

In [35]:
df = inference(model)

In [36]:
df

Unnamed: 0,file_name,start,end
0,122.wav,0,20
1,1.wav,16,39
2,2.wav,22,30
3,3.wav,21,40
4,4.wav,27,33
...,...,...,...
898,924.wav,20,40
899,925.wav,0,0
900,926.wav,13,40
901,927.wav,0,17


In [28]:
model_1 = CLDNN(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
              num_layers=num_layers, bidirectional=bidirectional,
              with_focus_attn=with_focus_attn).to(device)
model_1.load_state_dict(torch.load('./ensemble/CLDNN_1.pt'))
model_1.eval()

CLDNN(
  (encoder): Encoder(
    (conv1): Sequential(
      (0): Conv1d(1, 4, kernel_size=(11, 1), stride=(1,))
      (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (conv2): Sequential(
      (0): Conv1d(4, 4, kernel_size=(11, 1), stride=(1,))
      (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (conv3): Sequential(
      (0): Conv1d(4, 8, kernel_size=(11, 1), stride=(1,))
      (1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (conv4): Sequential(
      (0): Conv1d(8, 8, kernel_size=(10, 1), stride=(1,))
      (1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
  )
  (attn): MultiHeadedAttention(
    (query): Linear(in_features=8, out_features=8, bias=True)
    (key): Linear(in_features=8, out_features=8, bias=True)
    (value): Linear(in_fe

In [29]:
model_2 = CLDNN(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
              num_layers=num_layers, bidirectional=bidirectional,
              with_focus_attn=with_focus_attn).to(device)
model_2.load_state_dict(torch.load('./ensemble/CLDNN_2.pt'))
model_2.eval()

CLDNN(
  (encoder): Encoder(
    (conv1): Sequential(
      (0): Conv1d(1, 4, kernel_size=(11, 1), stride=(1,))
      (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (conv2): Sequential(
      (0): Conv1d(4, 4, kernel_size=(11, 1), stride=(1,))
      (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (conv3): Sequential(
      (0): Conv1d(4, 8, kernel_size=(11, 1), stride=(1,))
      (1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (conv4): Sequential(
      (0): Conv1d(8, 8, kernel_size=(10, 1), stride=(1,))
      (1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
  )
  (attn): MultiHeadedAttention(
    (query): Linear(in_features=8, out_features=8, bias=True)
    (key): Linear(in_features=8, out_features=8, bias=True)
    (value): Linear(in_fe

In [30]:
model_3 = CLDNN(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
              num_layers=num_layers, bidirectional=bidirectional,
              with_focus_attn=with_focus_attn).to(device)
model_3.load_state_dict(torch.load('./ensemble/CLDNN_3.pt'))
model_3.eval()

CLDNN(
  (encoder): Encoder(
    (conv1): Sequential(
      (0): Conv1d(1, 4, kernel_size=(11, 1), stride=(1,))
      (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (conv2): Sequential(
      (0): Conv1d(4, 4, kernel_size=(11, 1), stride=(1,))
      (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (conv3): Sequential(
      (0): Conv1d(4, 8, kernel_size=(11, 1), stride=(1,))
      (1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (conv4): Sequential(
      (0): Conv1d(8, 8, kernel_size=(10, 1), stride=(1,))
      (1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
  )
  (attn): MultiHeadedAttention(
    (query): Linear(in_features=8, out_features=8, bias=True)
    (key): Linear(in_features=8, out_features=8, bias=True)
    (value): Linear(in_fe

In [25]:
model_1 = CLDNN(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
              num_layers=num_layers, bidirectional=bidirectional,
              with_focus_attn=with_focus_attn).to(device)
model_1.load_state_dict(torch.load('./model/CLDNN_1d_step_200_1.pt'))
model_1.eval()

CLDNN(
  (encoder): Encoder(
    (conv1): Sequential(
      (0): Conv1d(1, 4, kernel_size=(11, 1), stride=(1,))
      (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (conv2): Sequential(
      (0): Conv1d(4, 4, kernel_size=(11, 1), stride=(1,))
      (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (conv3): Sequential(
      (0): Conv1d(4, 8, kernel_size=(11, 1), stride=(1,))
      (1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (conv4): Sequential(
      (0): Conv1d(8, 8, kernel_size=(10, 1), stride=(1,))
      (1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
  )
  (attn): MultiHeadedAttention(
    (query): Linear(in_features=8, out_features=8, bias=True)
    (key): Linear(in_features=8, out_features=8, bias=True)
    (value): Linear(in_fe

In [27]:
model_2 = CLDNN(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=64,
              num_layers=num_layers, bidirectional=bidirectional,
              with_focus_attn=False).to(device)
model_2.load_state_dict(torch.load('./model/VAD_model.pt'))
model_2.eval()

CLDNN(
  (encoder): Encoder(
    (conv1): Sequential(
      (0): Conv1d(1, 4, kernel_size=(11, 1), stride=(1,))
      (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (conv2): Sequential(
      (0): Conv1d(4, 4, kernel_size=(11, 1), stride=(1,))
      (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (conv3): Sequential(
      (0): Conv1d(4, 8, kernel_size=(11, 1), stride=(1,))
      (1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (conv4): Sequential(
      (0): Conv1d(8, 8, kernel_size=(10, 1), stride=(1,))
      (1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
  )
  (attn): MultiHeadedAttention(
    (query): Linear(in_features=8, out_features=8, bias=True)
    (key): Linear(in_features=8, out_features=8, bias=True)
    (value): Linear(in_fe

In [28]:
torch.save({
    'model_1': model_1.state_dict(),
    'model_2': model_2.state_dict(),
    #'model_3': model_3.state_dict(),
    #'model_4': model_4.state_dict(),
    #'model_5': model_5.state_dict(),
    #'model_6': model.state_dict(),
}, './model/model_ensemble.pt')

In [66]:
model_1 = CLDNN(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
              num_layers=num_layers, bidirectional=bidirectional,
              with_focus_attn=with_focus_attn).to(device)
model_2 = CLDNN(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
              num_layers=num_layers, bidirectional=bidirectional,
              with_focus_attn=with_focus_attn).to(device)
model_3 = CLDNN(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
              num_layers=num_layers, bidirectional=bidirectional,
              with_focus_attn=with_focus_attn).to(device)
model_4 = CLDNN(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
              num_layers=num_layers, bidirectional=bidirectional,
              with_focus_attn=with_focus_attn).to(device)
model_5 = CLDNN(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
              num_layers=num_layers, bidirectional=bidirectional,
              with_focus_attn=with_focus_attn).to(device)
model_6 = CLDNN(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
              num_layers=num_layers, bidirectional=bidirectional,
              with_focus_attn=with_focus_attn).to(device)

In [62]:
model_1.load_state_dict(torch.load('./model_all_data/model_ensemble.pt')['model_1'])
model_2.load_state_dict(torch.load('./model_all_data/model_ensemble.pt')['model_2'])
model_3.load_state_dict(torch.load('./model_all_data/model_ensemble.pt')['model_3'])
model_4.load_state_dict(torch.load('./model_all_data/model_ensemble.pt')['model_4'])
model_5.load_state_dict(torch.load('./model_all_data/model_ensemble.pt')['model_5'])
model_6.load_state_dict(torch.load('./model_all_data/model_ensemble.pt')['model_6'])

model_1.eval()
model_2.eval()
model_3.eval()
model_4.eval()
model_5.eval()
model_6.eval()

CLDNN(
  (encoder): Encoder(
    (conv1): Sequential(
      (0): Conv1d(1, 4, kernel_size=(11, 1), stride=(1,))
      (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (conv2): Sequential(
      (0): Conv1d(4, 4, kernel_size=(11, 1), stride=(1,))
      (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (conv3): Sequential(
      (0): Conv1d(4, 8, kernel_size=(11, 1), stride=(1,))
      (1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (conv4): Sequential(
      (0): Conv1d(8, 8, kernel_size=(10, 1), stride=(1,))
      (1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
  )
  (attn): MultiHeadedAttention(
    (query): Linear(in_features=8, out_features=8, bias=True)
    (key): Linear(in_features=8, out_features=8, bias=True)
    (value): Linear(in_fe

In [71]:
for par in model_1.encoder.conv1.parameters():
    print(par)

Parameter containing:
tensor([[[[-0.0905],
          [-0.0448],
          [-0.0492],
          [-0.1189],
          [-0.1578],
          [-0.1203],
          [ 0.0539],
          [ 0.6332],
          [ 1.1040],
          [ 1.2801],
          [ 0.8445]]],


        [[[-0.8201],
          [ 0.2812],
          [-0.0367],
          [ 0.3458],
          [-0.1257],
          [ 0.1367],
          [-0.2569],
          [-0.0129],
          [-0.0209],
          [ 0.2502],
          [-0.0149]]],


        [[[ 0.6740],
          [-0.3954],
          [ 0.0535],
          [-0.2752],
          [-0.0772],
          [-0.0634],
          [ 0.1458],
          [-0.0594],
          [-0.0542],
          [-0.4355],
          [-0.0871]]],


        [[[ 0.7231],
          [-0.3695],
          [ 0.1311],
          [-0.3867],
          [-0.0832],
          [-0.1032],
          [-0.0713],
          [ 0.0707],
          [ 0.1685],
          [ 0.1891],
          [ 0.1545]]]], device='cuda:0', requires_grad=True)
Pa

In [74]:
for par in models[3].encoder.conv1.parameters():
    print(par)

Parameter containing:
tensor([[[[-0.0905],
          [-0.0448],
          [-0.0492],
          [-0.1189],
          [-0.1578],
          [-0.1203],
          [ 0.0539],
          [ 0.6332],
          [ 1.1040],
          [ 1.2801],
          [ 0.8445]]],


        [[[-0.8201],
          [ 0.2812],
          [-0.0367],
          [ 0.3458],
          [-0.1257],
          [ 0.1367],
          [-0.2569],
          [-0.0129],
          [-0.0209],
          [ 0.2502],
          [-0.0149]]],


        [[[ 0.6740],
          [-0.3954],
          [ 0.0535],
          [-0.2752],
          [-0.0772],
          [-0.0634],
          [ 0.1458],
          [-0.0594],
          [-0.0542],
          [-0.4355],
          [-0.0871]]],


        [[[ 0.7231],
          [-0.3695],
          [ 0.1311],
          [-0.3867],
          [-0.0832],
          [-0.1032],
          [-0.0713],
          [ 0.0707],
          [ 0.1685],
          [ 0.1891],
          [ 0.1545]]]], device='cuda:0', requires_grad=True)
Pa

In [None]:
models

In [70]:
models = [model_1, model_2, model_3, model_4, model_5, model_6]

for i in range(len(models)):
    models[i].load_state_dict(torch.load('./model_all_data/model_ensemble.pt')['model_%d'%(i+1)])
    print(i)

0
1
2
3
4
5


In [83]:
pred_1 = model_1(X_train[0:10].to(device))
pred_2 = model_2(X_train[0:10].to(device))
pred_3 = model_3(X_train[0:10].to(device))
pred_4 = model_4(X_train[0:10].to(device))
pred_5 = model_5(X_train[0:10].to(device))
pred_6 = model_6(X_train[0:10].to(device))

In [87]:
(pred_1 + pred_2 + pred_3 + pred_4 + pred_5 + pred_6) / 6

tensor([[0.3260],
        [0.3904],
        [0.9989],
        [0.9996],
        [0.6323],
        [0.5515],
        [0.4103],
        [0.0939],
        [0.1353],
        [0.0649]], device='cuda:0', grad_fn=<DivBackward0>)

In [128]:
for i, data in enumerate(test_loader):
    x = data['audio']
    x = x.to(device)
    starts = []
    ends = []
    for j in range(len(x)):
        pred = model(x[j])
        pred = pred.detach().cpu().squeeze().numpy()
        pred = (pred >= 0.5).astype('float32')
        start, end = calculate(pred)
        starts.append(start)
        ends.append(end)

[6, 150, 188, 217, 315, 367, 390]
[110, 158, 199, 297, 349, 349, 395]
[0, 38, 181]
[1, 161, 395]
[0, 196, 268]
[1, 213, 328]
[1, 266]
[245, 395]
[9, 28, 56, 126, 180]
[10, 35, 75, 75, 395]
[1, 314, 382]
[296, 355, 395]
[1, 212]
[185, 395]
[1, 308, 333, 363]
[288, 309, 336, 390]
[1, 62, 153, 183, 299, 337, 360]
[42, 135, 164, 234, 318, 318, 373]
[0, 34, 95, 173, 344, 391]
[1, 60, 146, 316, 358, 395]
[6, 52, 113, 312, 331]
[27, 94, 284, 313, 352]
[1, 87, 149, 168]
[63, 127, 127, 386]
[1, 62, 199, 310]
[42, 181, 291, 394]
[6, 106]
[66, 395]
[4, 206, 264, 388]
[176, 237, 347, 389]
[6, 106, 175, 209]
[85, 132, 177, 387]
[0, 28, 160, 195]
[1, 44, 44, 395]
[0, 26, 61, 84, 111, 155, 195, 330, 387]
[1, 29, 63, 85, 134, 158, 287, 341, 391]
[0, 50, 104]
[1, 84, 328]
[5, 68, 96, 124, 155, 223, 270]
[38, 38, 38, 132, 196, 228, 395]
[4, 46]
[26, 395]
[2, 241, 342, 371]
[223, 320, 344, 395]
[10, 77, 119, 239]
[16, 93, 218, 395]
[0, 36, 79]
[1, 44, 386]
[5, 51, 92, 216, 254, 354, 394]
[26, 53, 195, 23

In [131]:
len(starts)

31

In [132]:
len(ends)

31

In [133]:
len(x)

31

In [42]:
pred = model(x[0])
pred = pred.detach().cpu().squeeze().numpy()
pred = (pred >= 0.5).astype('float32')
np.argwhere(pred == 1)

array([[  6],
       [  7],
       [  8],
       [  9],
       [ 10],
       [ 11],
       [ 12],
       [ 13],
       [ 14],
       [ 15],
       [ 17],
       [ 21],
       [ 22],
       [ 23],
       [ 24],
       [ 25],
       [ 26],
       [ 27],
       [ 29],
       [ 36],
       [ 37],
       [ 38],
       [ 39],
       [ 45],
       [ 46],
       [ 48],
       [ 50],
       [ 51],
       [ 52],
       [ 55],
       [ 60],
       [ 76],
       [ 77],
       [ 85],
       [ 86],
       [ 87],
       [ 96],
       [109],
       [110],
       [150],
       [151],
       [152],
       [158],
       [188],
       [190],
       [199],
       [217],
       [218],
       [219],
       [220],
       [221],
       [222],
       [223],
       [224],
       [225],
       [226],
       [227],
       [228],
       [229],
       [230],
       [231],
       [232],
       [233],
       [234],
       [235],
       [236],
       [237],
       [238],
       [239],
       [240],
       [241],
      

In [100]:
calculate(pred)

(22, 30)

In [51]:
root_dir='./wav_data/pretrain/sample'

In [63]:
data_list = [img for img in os.listdir(root_dir) if not img.startswith('.')]

In [67]:
def load_audio(path):
    x = preprocessing(path, method='mfcc', sr=16000, n_mels=40, n_mfcc=40)
    x = convert_tensor(x)
    return x

In [68]:
audio_path

'./wav_data/pretrain/sample/2.wav'

In [69]:
audio_name = data_list[0]
audio_path = os.path.join(root_dir, audio_name)
audio = load_audio(audio_path)

In [72]:
audio.shape

torch.Size([396, 1, 40, 50])

In [25]:
model.eval()
if(multi_task == 'true'):
    model_g.eval()

In [23]:
correct = 0
n = 0

for i in range(len(eval_samples)):
    try:
        X_new = preprocessing(eval_samples[i], method='mfcc', sr=16000, n_mfcc=n_mfcc)
        X_new = convert_tensor(X_new).to(device)
        y_new = model(X_new)
        y_new = torch.argmax(nn.Softmax(dim=-1)(torch.mean(y_new, dim=0)))
        #y_new = sorted(dict(collections.Counter(torch.argmax(nn.Softmax(dim=-1)(y_new), dim=1).cpu().numpy()))
        #               .items(), key=(lambda x: x[1]), reverse=True)[0][0]
        y_new = 1 if (y_new.item() == y[eval_idx][i].item()) else 0
        correct += y_new
        n += 1
    except:
        pass
    
acc = correct / n

In [24]:
print('Test accuray:', round(acc, 5))

Test accuray: 0.88611


In [24]:
print('Test accuray:', round(acc, 5))  # 0.7111

Test accuray: 0.85278


- 1DCNN  
Test accuray: 0.64722

In [30]:
len(eval_samples)

360

In [29]:
len(set(eval_samples) - set(train_samples))

360