In [1]:
import os
import glob

import math
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader
from preprocessing import preprocessing, convert_spectrograms, convert_tensor
from model_ae import Encoder
from utils.optimization import WarmupLinearSchedule

In [2]:
conv_dim = '1d'
checkpoint = './output/aae_1d_step_300.pt'
hidden_size = 128
num_layers = 2
bidirectional = 'true'
with_focus_attn = 'true'

batch_size = 128
num_epochs = 300
learning_rate = 0.0001

use_warmup = 'true'
data_dir = './wav_data/pretrain/RAVDESS_resample/'
multi_task = 'true'
augmentation = 'true'

bidirectional = True if(bidirectional == 'true') else False
with_focus_attn = True if(with_focus_attn == 'true') else False
n_mfcc = 40 if(conv_dim == '1d') else 128

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

sample_datas = glob.glob(os.path.join(data_dir, '**', '*wav'), recursive=True)
sample_datas = sorted(sample_datas)

In [3]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, num_attn_heads, attn_hidden_size, dropout_prob, with_focus_attn):
        super(MultiHeadedAttention, self).__init__()
        self.num_attn_heads = num_attn_heads
        self.hidden_size = attn_hidden_size
        self.dropout_prob = dropout_prob
        self.with_focus_attn = with_focus_attn
        
        self.attn_head_size = int(self.hidden_size / self.num_attn_heads)
        self.all_head_size = self.num_attn_heads * self.attn_head_size

        self.query = nn.Linear(self.hidden_size, self.all_head_size)
        self.key = nn.Linear(self.hidden_size, self.all_head_size)
        self.value = nn.Linear(self.hidden_size, self.all_head_size)

        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_prob)

        self.softmax = nn.Softmax(dim=-1)
        
        if(with_focus_attn == True):
            self.tanh = nn.Tanh()
            self.sigmoid = nn.Sigmoid()
            
            self.linear_focus_query = nn.Linear(num_attn_heads * self.attn_head_size, 
                                                num_attn_heads * self.attn_head_size)
            self.linear_focus_global = nn.Linear(num_attn_heads * self.attn_head_size, 
                                                 num_attn_heads * self.attn_head_size)
            
            up = torch.randn(num_attn_heads, 1, self.attn_head_size)
            self.up = Variable(up, requires_grad=True).cuda()
            torch.nn.init.xavier_uniform_(self.up)
            
            uz = torch.randn(num_attn_heads, 1, self.attn_head_size)
            self.uz = Variable(uz, requires_grad=True).cuda()
            torch.nn.init.xavier_uniform_(self.uz)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attn_heads, self.attn_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states):
        key_len = hidden_states.size(1)
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)
        
        if(self.with_focus_attn == True):
            glo = torch.mean(mixed_query_layer, dim=1, keepdim=True)
            
            c = self.tanh(self.linear_focus_query(mixed_query_layer) + self.linear_focus_global(glo))
            c = self.transpose_for_scores(c)
            
            p = c * self.up
            p = p.sum(3).squeeze()
            z = c * self.uz
            z = z.sum(3).squeeze()
            
            P = self.sigmoid(p) * key_len
            Z = self.sigmoid(z) * key_len
            
            j = torch.arange(start=0, end=key_len, dtype=P.dtype).unsqueeze(0).unsqueeze(0).unsqueeze(0).to('cuda')
            P = P.unsqueeze(-1)
            Z = Z.unsqueeze(-1)
            
            G = -(j - P)**2 * 2 / (Z**2)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attn_head_size)
        
        if(self.with_focus_attn == True):
            attention_scores = attention_scores + G
            
        attention_probs = self.softmax(attention_scores)
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)
        attention_output = self.o_proj(context_layer)

        return attention_output

In [4]:
# hidden_size = 8
ff_dim = 32
dropout_prob = 0.1

class PositionWiseFeedForward(nn.Module):
    def __init__(self):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(8, ff_dim)
        self.fc2 = nn.Linear(ff_dim, 8)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x):
        intermediate = self.fc1(x)
        ff_out = self.dropout(self.fc2(self.relu(intermediate)))
        return ff_out

In [5]:
# hidden_size = 8

class Block(nn.Module):
    def __init__(self):
        super(Block, self).__init__()
        self.attention_norm = nn.LayerNorm(8, eps=1e-12)
        self.ffn_norm = nn.LayerNorm(8, eps=1e-12)
        self.ffn = PositionWiseFeedForward()
        self.attn = MultiHeadedAttention(num_attn_heads=4, attn_hidden_size=8, dropout_prob=0.1, 
                                             with_focus_attn=with_focus_attn)

    def forward(self, x):
        # Attention
        h = x
        x = self.attn(x)
        x = h + x
        x = self.attention_norm(x)

        # FFN
        h = x
        x = self.ffn(x)
        x = x + h
        x = self.ffn_norm(x)
        return x

In [6]:
class CTransformer(nn.Module):
    def __init__(self, conv_dim, checkpoint=None, hidden_size=64, num_layers=2,
                 bidirectional=True, with_focus_attn=False):
        super(CTransformer, self).__init__()
        self.conv_dim = conv_dim
        if(conv_dim == '1d'):
            self.encoder = Encoder(conv_dim)
            if checkpoint:
                self.encoder.load_state_dict(torch.load(checkpoint))
            self.attn1 = Block()
            self.attn2 = Block()
            self.attn3 = Block()
            self.attn4 = Block()
            self.fc = nn.Sequential(
                nn.Linear(400, 8),
            )
        elif(conv_dim == '2d'):
            self.encoder = Encoder(conv_dim)
            if checkpoint:
                self.encoder.load_state_dict(torch.load(checkpoint))
            self.attn = MultiHeadedAttention(num_attn_heads=4, attn_hidden_size=176, dropout_prob=0.1, 
                                             with_focus_attn=with_focus_attn)
            self.gap = nn.AdaptiveAvgPool2d((1, 11))
            self.lstm = nn.LSTM(11, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)
            self.fc = nn.Sequential(
                nn.Linear(hidden_size*2 if bidirectional else hidden_size, 8),
            )
        else:
            raise ValueError("Convolution dimension not found: %s" % (conv_dim))
            
    def forward(self, x):
        if(self.conv_dim == '1d'):
            out = self.encoder(x)  # (batch, 1, 40, 100) -> (batch, 8, 1, 100)
            out = torch.squeeze(out, 2)  # (batch, 8, 1, 100) -> (batch, 8, 100)
            out = out.permute(0, 2, 1)  # (batch, 8, 100) -> (batch, 100, 8)
            out = self.attn1(out)  # (batch, 100, 8) -> (batch, 100, 8)
            out = self.attn2(out)  # (batch, 100, 8) -> (batch, 100, 8)
            out = self.attn3(out)  # (batch, 100, 8) -> (batch, 100, 8)
            out = self.attn4(out)  # (batch, 100, 8) -> (batch, 100, 8)
            new_out_shape = out.size()[:1] + (out.size()[1] * out.size()[2],)
            out = out.view(*new_out_shape)
            out = self.fc(out)  # (batch, 800) -> (batch, 8)
        elif(self.conv_dim == '2d'):
            out = self.encoder(x)  # (batch, 1, 128, 100) -> (batch, 16, 11, 8)
            out = out.permute(0, 3, 1, 2)  # (batch, 16, 11, 8) -> (batch, 8, 16, 11)
            h = out
            new_out_shape = out.size()[:2] + (out.size()[2] * out.size()[3],)
            out = out.view(*new_out_shape)  # (batch, 8, 16, 11) -> (batch, 8, 176)
            out = self.attn(out)  # (batch, 8, 176) -> (batch, 8, 176)
            out = out.view(h.size())  # (batch, 8, 176) -> (batch, 8, 16, 11)
            out = h + out
            out = self.gap(out)  # (batch, 8, 16, 11) -> (batch, 8, 1, 11)
            out = torch.squeeze(out, 2)  # (batch, 8, 1, 11) -> (batch, 8, 11)
            out = out.permute(1, 0, 2)  # (batch, 8, 11) -> (8, batch, 11)
            self.lstm.flatten_parameters()
            out, _ = self.lstm(out)  # (8, batch, 11) -> (8, batch, num_directions*hidden_size)
            out = out[-1]  # (8, batch, num_directions*hidden_size) -> (batch, num_directions*hidden_size)
            out = self.fc(out)  # (batch, num_directions*hidden_size) -> (batch, 1)
        return out

In [7]:
class CTransformer_G(nn.Module):
    def __init__(self, conv_dim, checkpoint=None, hidden_size=64, num_layers=2,
                 bidirectional=True, with_focus_attn=False):
        super(CTransformer_G, self).__init__()
        self.conv_dim = conv_dim
        if(conv_dim == '1d'):
            self.encoder = Encoder(conv_dim)
            if checkpoint:
                self.encoder.load_state_dict(torch.load(checkpoint))
            self.attn1 = Block()
            self.attn2 = Block()
            self.attn3 = Block()
            self.attn4 = Block()
            self.fc = nn.Sequential(
                nn.Linear(400, 1),
                nn.Sigmoid()
            )
        elif(conv_dim == '2d'):
            self.encoder = Encoder(conv_dim)
            if checkpoint:
                self.encoder.load_state_dict(torch.load(checkpoint))
            self.attn = MultiHeadedAttention(num_attn_heads=4, attn_hidden_size=8, dropout_prob=0.1, 
                                             with_focus_attn=with_focus_attn)
            self.gap = nn.AdaptiveAvgPool2d((1, 11))
            self.lstm = nn.LSTM(11, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)
            self.fc = nn.Sequential(
                nn.Linear(hidden_size*2 if bidirectional else hidden_size, 8),
            )
        else:
            raise ValueError("Convolution dimension not found: %s" % (conv_dim))
            
    def forward(self, x):
        if(self.conv_dim == '1d'):
            out = self.encoder(x)  # (batch, 1, 40, 100) -> (batch, 8, 1, 100)
            out = torch.squeeze(out, 2)  # (batch, 8, 1, 100) -> (batch, 8, 100)
            out = out.permute(0, 2, 1)  # (batch, 8, 100) -> (batch, 100, 8)
            out = self.attn1(out)  # (batch, 100, 8) -> (batch, 100, 8)
            out = self.attn2(out)  # (batch, 100, 8) -> (batch, 100, 8)
            out = self.attn3(out)  # (batch, 100, 8) -> (batch, 100, 8)
            out = self.attn4(out)  # (batch, 100, 8) -> (batch, 100, 8)
            new_out_shape = out.size()[:1] + (out.size()[1] * out.size()[2],)
            out = out.view(*new_out_shape)
            out = self.fc(out)  # (batch, 800) -> (batch, 8)
        elif(self.conv_dim == '2d'):
            out = self.encoder(x)  # (batch, 1, 128, 100) -> (batch, 16, 11, 8)
            out = out.permute(0, 3, 1, 2)  # (batch, 16, 11, 8) -> (batch, 8, 16, 11)
            h = out
            new_out_shape = out.size()[:2] + (out.size()[2] * out.size()[3],)
            out = out.view(*new_out_shape)  # (batch, 8, 16, 11) -> (batch, 8, 176)
            out = self.attn(out)  # (batch, 8, 176) -> (batch, 8, 176)
            out = out.view(h.size())  # (batch, 8, 176) -> (batch, 8, 16, 11)
            out = h + out
            out = self.gap(out)  # (batch, 8, 16, 11) -> (batch, 8, 1, 11)
            out = torch.squeeze(out, 2)  # (batch, 8, 1, 11) -> (batch, 8, 11)
            out = out.permute(1, 0, 2)  # (batch, 8, 11) -> (8, batch, 11)
            self.lstm.flatten_parameters()
            out, _ = self.lstm(out)  # (8, batch, 11) -> (8, batch, num_directions*hidden_size)
            out = out[-1]  # (8, batch, num_directions*hidden_size) -> (batch, num_directions*hidden_size)
            out = self.fc(out)  # (batch, num_directions*hidden_size) -> (batch, 1)
        return out

In [4]:
class CLDNN(nn.Module):
    def __init__(self, conv_dim, checkpoint=None, hidden_size=64, num_layers=2,
                 bidirectional=True, with_focus_attn=False):
        super(CLDNN, self).__init__()
        self.conv_dim = conv_dim
        if(conv_dim == '1d'):
            self.encoder = Encoder(conv_dim)
            if checkpoint:
                self.encoder.load_state_dict(torch.load(checkpoint))
            self.attn = MultiHeadedAttention(num_attn_heads=4, attn_hidden_size=8, dropout_prob=0.1,
                                             with_focus_attn=with_focus_attn)
            self.lstm = nn.LSTM(8, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)
            self.fc = nn.Sequential(
                nn.Linear(hidden_size*2 if bidirectional else hidden_size, 8),
            )
        elif(conv_dim == '2d'):
            self.encoder = Encoder(conv_dim)
            if checkpoint:
                self.encoder.load_state_dict(torch.load(checkpoint))
            self.attn = MultiHeadedAttention(num_attn_heads=4, attn_hidden_size=176, dropout_prob=0.1, 
                                             with_focus_attn=with_focus_attn)
            self.gap = nn.AdaptiveAvgPool2d((1, 11))
            self.lstm = nn.LSTM(11, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)
            self.fc = nn.Sequential(
                nn.Linear(hidden_size*2 if bidirectional else hidden_size, 8),
            )
        else:
            raise ValueError("Convolution dimension not found: %s" % (conv_dim))
            
    def forward(self, x):
        if(self.conv_dim == '1d'):
            out = self.encoder(x)  # (batch, 1, 40, 100) -> (batch, 8, 1, 100)
            out = torch.squeeze(out, 2)  # (batch, 8, 1, 100) -> (batch, 8, 100)
            out = out.permute(0, 2, 1)  # (batch, 8, 100) -> (batch, 100, 8)
            h = out
            out = self.attn(out)  # (batch, 100, 8) -> (batch, 100, 8)
            out = h + out
            out = out.permute(1, 0, 2)  # (batch, 100, 8) -> (100, batch, 8)
            self.lstm.flatten_parameters()
            out, _ = self.lstm(out)  # (100, batch, 8) -> (100, batch, num_directions*hidden_size)
            out = out[-1]  # (100, batch, num_directions*hidden_size) -> (batch, num_directions*hidden_size)
            out = self.fc(out)  # (batch, num_directions*hidden_size) -> (batch, 1)
        elif(self.conv_dim == '2d'):
            out = self.encoder(x)  # (batch, 1, 128, 100) -> (batch, 16, 11, 8)
            out = out.permute(0, 3, 1, 2)  # (batch, 16, 11, 8) -> (batch, 8, 16, 11)
            h = out
            new_out_shape = out.size()[:2] + (out.size()[2] * out.size()[3],)
            out = out.view(*new_out_shape)  # (batch, 8, 16, 11) -> (batch, 8, 176)
            out = self.attn(out)  # (batch, 8, 176) -> (batch, 8, 176)
            out = out.view(h.size())  # (batch, 8, 176) -> (batch, 8, 16, 11)
            out = h + out
            out = self.gap(out)  # (batch, 8, 16, 11) -> (batch, 8, 1, 11)
            out = torch.squeeze(out, 2)  # (batch, 8, 1, 11) -> (batch, 8, 11)
            out = out.permute(1, 0, 2)  # (batch, 8, 11) -> (8, batch, 11)
            self.lstm.flatten_parameters()
            out, _ = self.lstm(out)  # (8, batch, 11) -> (8, batch, num_directions*hidden_size)
            out = out[-1]  # (8, batch, num_directions*hidden_size) -> (batch, num_directions*hidden_size)
            out = self.fc(out)  # (batch, num_directions*hidden_size) -> (batch, 1)
        return out

In [5]:
class CLDNN_G(nn.Module):
    def __init__(self, conv_dim, checkpoint=None, hidden_size=64, num_layers=2,
                 bidirectional=True, with_focus_attn=False):
        super(CLDNN_G, self).__init__()
        self.conv_dim = conv_dim
        if(conv_dim == '1d'):
            self.encoder = Encoder(conv_dim)
            if checkpoint:
                self.encoder.load_state_dict(torch.load(checkpoint))
            self.attn = MultiHeadedAttention(num_attn_heads=4, attn_hidden_size=8, dropout_prob=0.1,
                                             with_focus_attn=with_focus_attn)
            self.lstm = nn.LSTM(8, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)
            self.fc = nn.Sequential(
                nn.Linear(hidden_size*2 if bidirectional else hidden_size, 1),
                nn.Sigmoid()
            )
        elif(conv_dim == '2d'):
            self.encoder = Encoder(conv_dim)
            if checkpoint:
                self.encoder.load_state_dict(torch.load(checkpoint))
            self.attn = MultiHeadedAttention(num_attn_heads=4, attn_hidden_size=176, dropout_prob=0.1, 
                                             with_focus_attn=with_focus_attn)
            self.gap = nn.AdaptiveAvgPool2d((1, 11))
            self.lstm = nn.LSTM(11, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)
            self.fc = nn.Sequential(
                nn.Linear(hidden_size*2 if bidirectional else hidden_size, 1),
                nn.Sigmoid()
            )
        else:
            raise ValueError("Convolution dimension not found: %s" % (conv_dim))
            
    def forward(self, x):
        if(self.conv_dim == '1d'):
            out = self.encoder(x)  # (batch, 1, 40, 100) -> (batch, 8, 1, 100)
            out = torch.squeeze(out, 2)  # (batch, 8, 1, 100) -> (batch, 8, 100)
            out = out.permute(0, 2, 1)  # (batch, 8, 100) -> (batch, 100, 8)
            h = out
            out = self.attn(out)  # (batch, 100, 8) -> (batch, 100, 8)
            out = h + out
            out = out.permute(1, 0, 2)  # (batch, 100, 8) -> (100, batch, 8)
            self.lstm.flatten_parameters()
            out, _ = self.lstm(out)  # (100, batch, 8) -> (100, batch, num_directions*hidden_size)
            out = out[-1]  # (100, batch, num_directions*hidden_size) -> (batch, num_directions*hidden_size)
            out = self.fc(out)  # (batch, num_directions*hidden_size) -> (batch, 1)
        elif(self.conv_dim == '2d'):
            out = self.encoder(x)  # (batch, 1, 128, 100) -> (batch, 16, 11, 8)
            out = out.permute(0, 3, 1, 2)  # (batch, 16, 11, 8) -> (batch, 8, 16, 11)
            h = out
            new_out_shape = out.size()[:2] + (out.size()[2] * out.size()[3],)
            out = out.view(*new_out_shape)  # (batch, 8, 16, 11) -> (batch, 8, 176)
            out = self.attn(out)  # (batch, 8, 176) -> (batch, 8, 176)
            out = out.view(h.size())  # (batch, 8, 176) -> (batch, 8, 16, 11)
            out = h + out
            out = self.gap(out)  # (batch, 8, 16, 11) -> (batch, 8, 1, 11)
            out = torch.squeeze(out, 2)  # (batch, 8, 1, 11) -> (batch, 8, 11)
            out = out.permute(1, 0, 2)  # (batch, 8, 11) -> (8, batch, 11)
            self.lstm.flatten_parameters()
            out, _ = self.lstm(out)  # (8, batch, 11) -> (8, batch, num_directions*hidden_size)
            out = out[-1]  # (8, batch, num_directions*hidden_size) -> (batch, num_directions*hidden_size)
            out = self.fc(out)  # (batch, num_directions*hidden_size) -> (batch, 1)
        return out

In [6]:
np.random.seed(42)
idx = np.random.permutation(int(len(sample_datas)/2))  # add

train_idx = idx[:int((len(sample_datas)/2)*0.75)]  # add
eval_idx = idx[int((len(sample_datas)/2)*0.75):]  # add
noise_idx = np.arange((int(len(sample_datas)/2)), len(sample_datas))  # add
train_idx = np.r_[train_idx, noise_idx]  # add

train_samples = list(np.array(sample_datas)[train_idx])
eval_samples = list(np.array(sample_datas)[eval_idx])

In [7]:
y = np.array(list(map(lambda x: int(x.split('/')[-1].split('-')[2]) - 1, sample_datas)))
y_train = y[train_idx]
y_eval = y[eval_idx]

In [8]:
if(multi_task == 'true'):
    speaker = np.array(list(map(lambda x: int(x.split('/')[-1].split('-')[-1].split('.')[0].split('_')[0]), 
                                sample_datas)))  # add
    y_gender = np.array(list(map(lambda x: 1 if x % 2 ==0 else 0, speaker)))

    y_g_train = y_gender[train_idx]
    y_g_eval = y_gender[eval_idx]

In [9]:
X_train, y_train = convert_spectrograms(train_samples, conv_dim=conv_dim, sr=16000, labels=y_train)
X_eval, y_eval = convert_spectrograms(eval_samples, conv_dim=conv_dim, sr=16000, labels=y_eval)

2520it [00:13, 182.85it/s]
360it [00:01, 189.69it/s]


In [10]:
if(augmentation == 'true'):
    X_train_flip = X_train[:, :, :, ::-1]
    y_train_flip = y_train.copy()

    X_train = np.concatenate((X_train, X_train_flip), axis=0)
    y_train = np.concatenate((y_train, y_train_flip), axis=0)

In [11]:
X_train, y_train = convert_tensor(X_train, y_train)
X_eval, y_eval = convert_tensor(X_eval, y_eval)

In [12]:
y_train = y_train.long()
y_eval = y_eval.long()

In [13]:
if(multi_task == 'true'):
    _, y_g_train = convert_spectrograms(train_samples, conv_dim=conv_dim, sr=16000, labels=y_g_train)
    _, y_g_eval = convert_spectrograms(eval_samples, conv_dim=conv_dim, sr=16000, labels=y_g_eval)
    
    if(augmentation == 'true'):
        y_g_train_flip = y_g_train.copy()
        y_g_train = np.concatenate((y_g_train, y_g_train_flip))
    
    y_g_train = torch.tensor(y_g_train).float()
    y_g_eval = torch.tensor(y_g_eval).float()

    y_g_train = y_g_train.unsqueeze(-1)
    y_g_eval = y_g_eval.unsqueeze(-1)

2520it [00:13, 187.38it/s]
360it [00:01, 197.91it/s]


In [14]:
if(multi_task == 'true'):
    print(X_train.shape, y_train.shape, y_g_train.shape, X_eval.shape, y_eval.shape, y_g_eval.shape)
else:
    print(X_train.shape, y_train.shape, X_eval.shape, y_eval.shape)

torch.Size([164482, 1, 40, 50]) torch.Size([164482]) torch.Size([164482, 1]) torch.Size([11753, 1, 40, 50]) torch.Size([11753]) torch.Size([11753, 1])


In [15]:
if(multi_task == 'true'):
    train_ds = TensorDataset(X_train, y_train, y_g_train)
    eval_ds = TensorDataset(X_eval, y_eval, y_g_eval)
else:
    train_ds = TensorDataset(X_train, y_train)
    eval_ds = TensorDataset(X_eval, y_eval)

train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True)
eval_dataloader = DataLoader(eval_ds, batch_size=batch_size, num_workers=0, drop_last=True)

In [16]:
model = CLDNN(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
              num_layers=num_layers, bidirectional=bidirectional,
              with_focus_attn=with_focus_attn).to(device)

In [17]:
if(multi_task == 'true'):
    model_g = CLDNN_G(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
                        num_layers=num_layers, bidirectional=bidirectional,
                        with_focus_attn=with_focus_attn).to(device)

In [18]:
if(multi_task == 'true'):
    loss_func = nn.CrossEntropyLoss()
    loss_func_g = nn.BCELoss()
    optimizer = optim.Adam(list(model.parameters()) + list(model_g.parameters()), lr=learning_rate)
else:
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [19]:
if(use_warmup == 'true'):
    t_total = len(train_dataloader) // 1 * num_epochs
    opt_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=t_total * 0.1, t_total=t_total)

In [20]:
def train(train_dataloader, eval_dataloader, epochs):
        print('Start training')
        softmax = nn.Softmax(dim=1)
        for epoch in range(epochs):
            model.train()
            train_loss = 0
            nb_train_steps = 0
            correct = 0
            num_samples = 0
            
            if(multi_task == 'true'):
                for X_batch, y_batch, y_g_batch in train_dataloader:
                    X_batch = X_batch.to(device)
                    y_batch = y_batch.to(device)
                    y_g_batch = y_g_batch.to(device)

                    optimizer.zero_grad()

                    outputs = model(X_batch)
                    outputs_g = model_g(X_batch)

                    loss_1 = loss_func(outputs, y_batch)
                    loss_2 = loss_func_g(outputs_g, y_g_batch)
                    loss = loss_1 + 0.8 * loss_2
                    loss.backward(retain_graph=True)

                    optimizer.step()
                    opt_scheduler.step()

                    train_loss += loss.mean().item()
                    nb_train_steps += 1

                    outputs = softmax(outputs)
                    outputs = torch.argmax(outputs, dim=1)
                    correct += (outputs == y_batch).float().sum()
                    num_samples += len(X_batch)

                train_loss = train_loss / nb_train_steps
                train_accuracy = correct / num_samples

                model.eval()
                eval_loss = 0
                nb_eval_steps = 0
                correct = 0
                num_samples = 0

                for X_batch, y_batch, y_g_batch in eval_dataloader:
                    X_batch = X_batch.to(device)
                    y_batch = y_batch.to(device)
                    y_g_batch = y_g_batch.to(device)
                    with torch.no_grad():
                        outputs = model(X_batch)
                        outputs_g = model_g(X_batch)

                    tmp_eval_loss_1 = loss_func(outputs, y_batch)
                    tmp_eval_loss_2 = loss_func_g(outputs_g, y_g_batch)
                    tmp_eval_loss = tmp_eval_loss_1 + 0.8 * tmp_eval_loss_2
                    eval_loss += tmp_eval_loss.mean().item()
                    nb_eval_steps += 1

                    outputs = softmax(outputs)
                    outputs = torch.argmax(outputs, dim=1)
                    correct += (outputs == y_batch).float().sum()
                    num_samples += len(X_batch)

                eval_loss = eval_loss / nb_eval_steps
                eval_accuracy = correct / num_samples
            else:
                for X_batch, y_batch in train_dataloader:
                    X_batch = X_batch.to(device)
                    y_batch = y_batch.to(device)

                    optimizer.zero_grad()

                    outputs = model(X_batch)

                    loss = loss_func(outputs, y_batch)
                    loss.backward()

                    optimizer.step()
                    opt_scheduler.step()

                    train_loss += loss.mean().item()
                    nb_train_steps += 1

                    outputs = softmax(outputs)
                    outputs = torch.argmax(outputs, dim=1)
                    correct += (outputs == y_batch).float().sum()
                    num_samples += len(X_batch)

                train_loss = train_loss / nb_train_steps
                train_accuracy = correct / num_samples

                model.eval()
                eval_loss = 0
                nb_eval_steps = 0
                correct = 0
                num_samples = 0

                for X_batch, y_batch in eval_dataloader:
                    X_batch = X_batch.to(device)
                    y_batch = y_batch.to(device)
                    with torch.no_grad():
                        outputs = model(X_batch)

                    tmp_eval_loss = loss_func(outputs, y_batch)
                    eval_loss += tmp_eval_loss.mean().item()
                    nb_eval_steps += 1

                    outputs = softmax(outputs)
                    outputs = torch.argmax(outputs, dim=1)
                    correct += (outputs == y_batch).float().sum()
                    num_samples += len(X_batch)

                eval_loss = eval_loss / nb_eval_steps
                eval_accuracy = correct / num_samples

            for param_group in optimizer.param_groups:
                lr = param_group['lr']
            print('epoch: {:3d},    lr={:6f},    loss={:5f},    train_acc={:5f},    eval_loss={:5f},    eval_acc={:5f}'
                  .format(epoch+1, lr, train_loss, train_accuracy, eval_loss, eval_accuracy))

            '''
            if((epoch+1) % args.save_checkpoint_steps == 0):
                model_checkpoint = "%s_%s_step_%d.pt" % ('CLDNN', args.conv_dim, epoch+1)
                output_model_file = os.path.join(args.output_dir, model_checkpoint)
                if(args.multi_gpu == 'true'):
                    torch.save(model.module.state_dict(), output_model_file)
                else:
                    torch.save(model.state_dict(), output_model_file)
                print("Saving checkpoint %s" % output_model_file)
            '''

In [21]:
train(train_dataloader, eval_dataloader, num_epochs)

Start training
epoch:   1,    lr=0.000003,    loss=2.632734,    train_acc=0.138053,    eval_loss=2.629200,    eval_acc=0.133156
epoch:   2,    lr=0.000007,    loss=2.620503,    train_acc=0.144018,    eval_loss=2.607507,    eval_acc=0.152473
epoch:   3,    lr=0.000010,    loss=2.610122,    train_acc=0.146541,    eval_loss=2.607075,    eval_acc=0.181576
epoch:   4,    lr=0.000013,    loss=2.585874,    train_acc=0.161479,    eval_loss=2.532885,    eval_acc=0.191878
epoch:   5,    lr=0.000017,    loss=2.425351,    train_acc=0.209953,    eval_loss=2.422811,    eval_acc=0.208620
epoch:   6,    lr=0.000020,    loss=2.347336,    train_acc=0.222477,    eval_loss=2.428290,    eval_acc=0.218922
epoch:   7,    lr=0.000023,    loss=2.295809,    train_acc=0.230569,    eval_loss=2.525782,    eval_acc=0.222012
epoch:   8,    lr=0.000027,    loss=2.243804,    train_acc=0.240406,    eval_loss=2.546046,    eval_acc=0.225790
epoch:   9,    lr=0.000030,    loss=2.209056,    train_acc=0.246237,    eval_loss

epoch:  74,    lr=0.000084,    loss=1.487203,    train_acc=0.498711,    eval_loss=1.897858,    eval_acc=0.420416
epoch:  75,    lr=0.000083,    loss=1.476993,    train_acc=0.502797,    eval_loss=1.950447,    eval_acc=0.419042
epoch:  76,    lr=0.000083,    loss=1.468575,    train_acc=0.507594,    eval_loss=1.904272,    eval_acc=0.430031
epoch:  77,    lr=0.000083,    loss=1.459561,    train_acc=0.509174,    eval_loss=1.906553,    eval_acc=0.432349
epoch:  78,    lr=0.000082,    loss=1.446509,    train_acc=0.513856,    eval_loss=1.925823,    eval_acc=0.429516
epoch:  79,    lr=0.000082,    loss=1.439263,    train_acc=0.517449,    eval_loss=1.923069,    eval_acc=0.425996
epoch:  80,    lr=0.000081,    loss=1.430059,    train_acc=0.520264,    eval_loss=1.906843,    eval_acc=0.438187
epoch:  81,    lr=0.000081,    loss=1.419465,    train_acc=0.524994,    eval_loss=1.919400,    eval_acc=0.433207
epoch:  82,    lr=0.000081,    loss=1.410512,    train_acc=0.527365,    eval_loss=1.910300,    e

epoch: 147,    lr=0.000057,    loss=0.998842,    train_acc=0.675152,    eval_loss=2.132060,    eval_acc=0.481027
epoch: 148,    lr=0.000056,    loss=0.993906,    train_acc=0.675292,    eval_loss=2.120951,    eval_acc=0.478280
epoch: 149,    lr=0.000056,    loss=0.991375,    train_acc=0.676958,    eval_loss=2.130699,    eval_acc=0.481113
epoch: 150,    lr=0.000056,    loss=0.985840,    train_acc=0.677468,    eval_loss=2.117728,    eval_acc=0.486350
epoch: 151,    lr=0.000055,    loss=0.980751,    train_acc=0.679663,    eval_loss=2.136845,    eval_acc=0.477764
epoch: 152,    lr=0.000055,    loss=0.976481,    train_acc=0.680697,    eval_loss=2.144647,    eval_acc=0.481628
epoch: 153,    lr=0.000054,    loss=0.973107,    train_acc=0.682515,    eval_loss=2.195138,    eval_acc=0.483602
epoch: 154,    lr=0.000054,    loss=0.970835,    train_acc=0.683962,    eval_loss=2.195131,    eval_acc=0.483602
epoch: 155,    lr=0.000054,    loss=0.965202,    train_acc=0.685664,    eval_loss=2.162697,    e

epoch: 220,    lr=0.000030,    loss=0.806187,    train_acc=0.741233,    eval_loss=2.571776,    eval_acc=0.482229
epoch: 221,    lr=0.000029,    loss=0.802775,    train_acc=0.741762,    eval_loss=2.546746,    eval_acc=0.484804
epoch: 222,    lr=0.000029,    loss=0.799335,    train_acc=0.742947,    eval_loss=2.544183,    eval_acc=0.486264
epoch: 223,    lr=0.000029,    loss=0.799088,    train_acc=0.743501,    eval_loss=2.550378,    eval_acc=0.492445
epoch: 224,    lr=0.000028,    loss=0.800311,    train_acc=0.742625,    eval_loss=2.558586,    eval_acc=0.487380
epoch: 225,    lr=0.000028,    loss=0.795194,    train_acc=0.745282,    eval_loss=2.527855,    eval_acc=0.489354
epoch: 226,    lr=0.000027,    loss=0.795928,    train_acc=0.743282,    eval_loss=2.560362,    eval_acc=0.490642
epoch: 227,    lr=0.000027,    loss=0.797524,    train_acc=0.742850,    eval_loss=2.560166,    eval_acc=0.488067
epoch: 228,    lr=0.000027,    loss=0.789786,    train_acc=0.745313,    eval_loss=2.552467,    e

epoch: 293,    lr=0.000003,    loss=0.730578,    train_acc=0.766762,    eval_loss=2.760309,    eval_acc=0.487294
epoch: 294,    lr=0.000002,    loss=0.729227,    train_acc=0.767224,    eval_loss=2.779840,    eval_acc=0.484375
epoch: 295,    lr=0.000002,    loss=0.729555,    train_acc=0.765777,    eval_loss=2.788429,    eval_acc=0.484289
epoch: 296,    lr=0.000001,    loss=0.729703,    train_acc=0.766914,    eval_loss=2.774443,    eval_acc=0.487981
epoch: 297,    lr=0.000001,    loss=0.729008,    train_acc=0.766111,    eval_loss=2.779016,    eval_acc=0.485663
epoch: 298,    lr=0.000001,    loss=0.730030,    train_acc=0.766513,    eval_loss=2.774753,    eval_acc=0.485491
epoch: 299,    lr=0.000000,    loss=0.727641,    train_acc=0.767777,    eval_loss=2.777169,    eval_acc=0.486693
epoch: 300,    lr=0.000000,    loss=0.726772,    train_acc=0.767437,    eval_loss=2.790295,    eval_acc=0.485577


In [25]:
model.eval()
if(multi_task == 'true'):
    model_g.eval()

In [23]:
correct = 0
n = 0

for i in range(len(eval_samples)):
    try:
        X_new = preprocessing(eval_samples[i], method='mfcc', sr=16000, n_mfcc=n_mfcc)
        X_new = convert_tensor(X_new).to(device)
        y_new = model(X_new)
        y_new = torch.argmax(nn.Softmax(dim=-1)(torch.mean(y_new, dim=0)))
        #y_new = sorted(dict(collections.Counter(torch.argmax(nn.Softmax(dim=-1)(y_new), dim=1).cpu().numpy()))
        #               .items(), key=(lambda x: x[1]), reverse=True)[0][0]
        y_new = 1 if (y_new.item() == y[eval_idx][i].item()) else 0
        correct += y_new
        n += 1
    except:
        pass
    
acc = correct / n

In [24]:
print('Test accuray:', round(acc, 5))

Test accuray: 0.88611


In [24]:
print('Test accuray:', round(acc, 5))  # 0.7111

Test accuray: 0.85278


- 1DCNN  
Test accuray: 0.64722

In [30]:
len(eval_samples)

360

In [29]:
len(set(eval_samples) - set(train_samples))

360