In [1]:
import os
import glob

import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader
from preprocessing import preprocessing, convert_spectrograms, convert_tensor
from model_ae import Encoder
from utils.optimization import WarmupLinearSchedule

In [2]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, num_attn_heads, attn_hidden_size, dropout_prob, with_focus_attn):
        super(MultiHeadedAttention, self).__init__()
        self.num_attn_heads = num_attn_heads
        self.hidden_size = attn_hidden_size
        self.dropout_prob = dropout_prob
        self.with_focus_attn = with_focus_attn
        
        self.attn_head_size = int(self.hidden_size / self.num_attn_heads)
        self.all_head_size = self.num_attn_heads * self.attn_head_size

        self.query = nn.Linear(self.hidden_size, self.all_head_size)
        self.key = nn.Linear(self.hidden_size, self.all_head_size)
        self.value = nn.Linear(self.hidden_size, self.all_head_size)

        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_prob)

        self.softmax = nn.Softmax(dim=-1)
        
        if(with_focus_attn == True):
            self.tanh = nn.Tanh()
            self.sigmoid = nn.Sigmoid()
            
            self.linear_focus_query = nn.Linear(num_attn_heads * self.attn_head_size, 
                                                num_attn_heads * self.attn_head_size)
            self.linear_focus_global = nn.Linear(num_attn_heads * self.attn_head_size, 
                                                 num_attn_heads * self.attn_head_size)
            
            up = torch.randn(num_attn_heads, 1, self.attn_head_size)
            self.up = Variable(up, requires_grad=True).cuda()
            torch.nn.init.xavier_uniform_(self.up)
            
            uz = torch.randn(num_attn_heads, 1, self.attn_head_size)
            self.uz = Variable(uz, requires_grad=True).cuda()
            torch.nn.init.xavier_uniform_(self.uz)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attn_heads, self.attn_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states):
        key_len = hidden_states.size(1)
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)
        
        if(self.with_focus_attn == True):
            glo = torch.mean(mixed_query_layer, dim=1, keepdim=True)
            
            c = self.tanh(self.linear_focus_query(mixed_query_layer) + self.linear_focus_global(glo))
            c = self.transpose_for_scores(c)
            
            p = c * self.up
            p = p.sum(3).squeeze()
            z = c * self.uz
            z = z.sum(3).squeeze()
            
            P = self.sigmoid(p) * key_len
            Z = self.sigmoid(z) * key_len
            
            j = torch.arange(start=0, end=key_len, dtype=P.dtype).unsqueeze(0).unsqueeze(0).unsqueeze(0).to('cuda')
            P = P.unsqueeze(-1)
            Z = Z.unsqueeze(-1)
            
            G = -(j - P)**2 * 2 / (Z**2)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attn_head_size)
        
        if(self.with_focus_attn == True):
            attention_scores = attention_scores + G
            
        attention_probs = self.softmax(attention_scores)
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)
        attention_output = self.o_proj(context_layer)

        return attention_output

In [3]:
# CNN -> LSTM -> self-attention -> DNN
class CLDNN(nn.Module):
    def __init__(self, conv_dim, checkpoint=None, hidden_size=64, num_layers=2,
                 bidirectional=True, with_focus_attn=False):
        super(CLDNN, self).__init__()
        self.conv_dim = conv_dim
        if(conv_dim == '1d'):
            self.conv1 = nn.Sequential(
                nn.Conv1d(1, 64, (3, 1)), # (1, 40, 50) -> (64, 38, 50)
                nn.BatchNorm2d(64),
                nn.ReLU(),
                nn.MaxPool2d((2, 1), (2, 1))  # (64, 38, 50) -> (64, 19, 50)
            )
            self.conv2 = nn.Sequential(
                nn.Conv1d(64, 64, (3, 1)), # (64, 19, 50) -> (64, 17, 50)
                nn.BatchNorm2d(64),
                nn.ReLU(),
                nn.MaxPool2d((2, 1), (2, 1))  # (64, 17, 50) -> (64, 8, 50)
            )
            self.lstm = nn.LSTM(512, hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidirectional)
            self.tanh = nn.Tanh()
            self.dropout = nn.Dropout(0.5)
            self.attn = MultiHeadedAttention(num_attn_heads=8, attn_hidden_size=128, dropout_prob=0.1,
                                             with_focus_attn=with_focus_attn)
            self.fc = nn.Sequential(
                nn.Linear(50*2*hidden_size if bidirectional else hidden_size, 128),
                nn.ReLU(),
                nn.Linear(128, 4)
            )
        else:
            raise ValueError("Convolution dimension not found: %s" % (conv_dim))
            
    def forward(self, x):
        if(self.conv_dim == '1d'):
            out = self.conv1(x)  # (batch, 1, 40, 50) -> (batch, 64, 19, 50)
            out = self.conv2(out)  # (batch, 64, 19, 50) -> (batch, 64, 8, 50)
            out = out.contiguous()
            new_out_shape = out.size()[:1] + (out.size()[1] * out.size()[2], out.size()[3])
            out = out.view(*new_out_shape)  # (batch, 64, 8, 50) -> (batch, 512, 50)
            out = out.permute(2, 0, 1)  # (batch, 512, 50) -> (50, batch, 512)
            self.lstm.flatten_parameters()
            out, _ = self.lstm(out)  # (50, batch, 512) -> (50, batch, 2*64)
            out = self.tanh(out)
            out = self.dropout(out)
            out = out.permute(1, 0, 2)  # (50, batch, 2*64) -> (batch, 50, 2*64)
            out = self.attn(out)  # (batch, 50, 2*64) -> (batch, 50, 2*64)
            new_out_shape = out.size()[:1] + (out.size()[1] * out.size()[2],)
            out = out.view(*new_out_shape)  # (batch, 50, 2*64) -> (batch, 50*2*64)
            out = self.fc(out)  # (batch, 50*2*64) -> (batch, 4)
        return out

In [4]:
conv_dim = '1d'
checkpoint = ''
hidden_size = 64
num_layers = 2
bidirectional = 'true'
with_focus_attn = True

bidirectional = True if(bidirectional == 'true') else False  
n_mfcc = 40 if(conv_dim == '1d') else 128

#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = "cpu"

model_dir = './output'
model_path_list = glob.glob(os.path.join(model_dir, '*pt'), recursive=True)

In [5]:
model_path_list

['./output/CLDNN_cv1_step1_epoch5.pt',
 './output/CLDNN_cv1_step1_epoch10.pt',
 './output/CLDNN_cv1_step1_epoch15.pt',
 './output/CLDNN_cv1_step1_epoch20.pt',
 './output/CLDNN_cv1_step1_epoch25.pt',
 './output/CLDNN_cv1_step1_epoch30.pt',
 './output/CLDNN_cv1_step2_epoch5.pt',
 './output/CLDNN_cv1_step2_epoch10.pt',
 './output/CLDNN_cv1_step2_epoch15.pt',
 './output/CLDNN_cv1_step2_epoch20.pt',
 './output/CLDNN_cv1_step2_epoch25.pt',
 './output/CLDNN_cv1_step2_epoch30.pt',
 './output/CLDNN_cv2_step1_epoch5.pt',
 './output/CLDNN_cv2_step1_epoch10.pt',
 './output/CLDNN_cv2_step1_epoch15.pt',
 './output/CLDNN_cv2_step1_epoch20.pt',
 './output/CLDNN_cv2_step1_epoch25.pt',
 './output/CLDNN_cv2_step1_epoch30.pt',
 './output/CLDNN_cv2_step2_epoch5.pt',
 './output/CLDNN_cv2_step2_epoch10.pt',
 './output/CLDNN_cv2_step2_epoch15.pt',
 './output/CLDNN_cv2_step2_epoch20.pt',
 './output/CLDNN_cv2_step2_epoch25.pt',
 './output/CLDNN_cv2_step2_epoch30.pt',
 './output/CLDNN_cv3_step1_epoch5.pt',
 './o

In [6]:
df = pd.read_csv('IEMOCAP_sub_label.csv')
di = {'neu': 0, 'hap': 1, 'ang': 2, 'sad': 3}
df = df.replace({'sample_label': di})

In [7]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10)

In [8]:
cv_iter = 0
for train_index, eval_index in skf.split(df['sample_name'], df['sample_label']):
    cv_iter += 1
    data_dir = './wav_data/pretrain/IEMOCAP_sub/'
    
    train_samples, eval_samples = df['sample_name'][train_index], df['sample_name'][eval_index]
    train_label, eval_label = df['sample_label'][train_index], df['sample_label'][eval_index]

    train_samples = [data_dir + train_sample + '.wav' for train_sample in train_samples]
    eval_samples = [data_dir + eval_sample + '.wav' for eval_sample in eval_samples]
    
    model_paths = [ckpt for ckpt in model_path_list if 'cv%d'%1 in ckpt]
    
    for ckpt in model_paths:
        model = CLDNN(conv_dim=conv_dim, checkpoint=checkpoint, hidden_size=hidden_size,
                              num_layers=num_layers, bidirectional=bidirectional,
                              with_focus_attn=with_focus_attn).to(device)
        model.load_state_dict(torch.load(ckpt))
        model.eval()

        correct = 0
        n = 0
        for i in tqdm(range(len(eval_samples))):
            try:
                X_new = preprocessing(eval_samples[i], method='mfcc', sr=16000, n_mfcc=n_mfcc)
                X_new = convert_tensor(X_new).to(device)
                y_new = model(X_new)
                y_new = torch.argmax(torch.mean(nn.Softmax(dim=-1)(y_new), dim=0))
                #y_new = torch.argmax(nn.Softmax(dim=-1)(torch.mean(y_new, dim=0)))
                #y_new = sorted(dict(collections.Counter(torch.argmax(nn.Softmax(dim=-1)(y_new), dim=1).cpu().numpy()))
                #               .items(), key=(lambda x: x[1]), reverse=True)[0][0]
                y_new = 1 if (y_new.item() == np.array(eval_label)[i]) else 0
                correct += y_new
                n += 1
            except:
                pass
        acc = correct / n
        
        print(ckpt.split('/')[-1], ':', acc)

100%|██████████| 449/449 [00:41<00:00, 10.93it/s]


ZeroDivisionError: division by zero

In [10]:
X_new = preprocessing(eval_samples[i], method='mfcc', sr=16000, n_mfcc=n_mfcc)
X_new = convert_tensor(X_new).to(device)
y_new = model(X_new)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!