In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from warpctc_pytorch import CTCLoss
from ctcdecode import CTCBeamDecoder

from phoneme_list import *
import time

In [2]:
# Run the model on the GPU; pinned host memory is only requested when CUDA is used.
use_cuda = True
pin_memory = use_cuda

# allow_pickle=True: NumPy >= 1.16.3 refuses to unpickle object arrays by default.
# These files presumably store variable-length utterances as object arrays
# (collate_fn pads them per batch) -- TODO confirm. The flag is a no-op for plain
# numeric arrays. NOTE(review): unpickling executes code, so only load trusted files.
dev_feats = np.load('../data/dev.npy', allow_pickle=True)
dev_labels = np.load('../data/dev_phonemes.npy', allow_pickle=True)

train_feats = np.load('../data/train.npy', allow_pickle=True)
train_labels = np.load('../data/train_phonemes.npy', allow_pickle=True)

test_feats = np.load('../data/test.npy', allow_pickle=True)
# The test split has no transcripts: give every utterance a one-element dummy
# label so it can flow through the same Dataset / collate_fn pipeline.
test_labels = [np.zeros(1) for _ in range(test_feats.shape[0])]

In [3]:
class SpeechDataset(Dataset):
    """Pairs each utterance's feature array with its label sequence.

    Every label is shifted up by one at construction time, which leaves
    index 0 unused by real labels (the decoder in this notebook is
    configured with ``blank_id=0``).

    Args:
        feats: indexable collection of per-utterance feature arrays
            (NumPy array or plain list).
        labels: indexable collection of per-utterance label sequences,
            aligned with ``feats``. The sequences may differ in length.
    """

    def __init__(self, feats, labels):
        self.feats = feats
        # Keep the shifted labels as a list of arrays. Wrapping ragged sequences
        # in np.asarray raises ValueError on recent NumPy, and silently builds a
        # 2-D array when all sequences happen to share one length.
        self.labels = [np.asarray(label) + 1 for label in labels]
        # len() works for both NumPy arrays and plain lists.
        self.length = len(feats)

    def __getitem__(self, index):
        """Return ``(features, shifted_labels)`` for the utterance at ``index``."""
        return (self.feats[index], self.labels[index])

    def __len__(self):
        """Number of utterances in the dataset."""
        return self.length

In [4]:
def collate_fn(data):
    """Merge ``(features, labels)`` samples into padded, time-major batch tensors.

    Samples are ordered longest-first by frame count (presumably because the
    model packs the batch with ``pack_padded_sequence`` -- confirm against the
    installed PyTorch version). Shorter utterances are zero-padded along the
    time axis to the longest one in the batch.

    Args:
        data: list of ``(features, labels)`` pairs, where ``features`` is a
            2-D array with 40 columns and ``labels`` is a 1-D array.

    Returns:
        Tuple ``(feats, feats_lens, labels, labels_lens)``:
            feats: float tensor of shape (max_frames, batch, 40).
            feats_lens: int tensor with the frame count of each utterance.
            labels: int tensor with all label sequences concatenated in
                batch order.
            labels_lens: int tensor with the label count of each utterance.
    """
    ordered = sorted(data, key=lambda sample: sample[0].shape[0], reverse=True)
    utterances, transcripts = zip(*ordered)
    n_samples = len(utterances)

    # Per-sample sizes, in the (sorted) batch order.
    frame_counts = [utt.shape[0] for utt in utterances]
    label_counts = [seq.shape[0] for seq in transcripts]
    longest = max(frame_counts)

    # The loss takes the targets as one flat sequence plus per-sample lengths.
    flat_labels = [token for seq in transcripts for token in seq]
    labels_tensor = torch.from_numpy(np.asarray(flat_labels)).int()
    label_counts_np = np.asarray(label_counts, dtype=np.int32)
    assert labels_tensor.shape[0] == np.sum(label_counts_np, axis=0)
    label_lens_tensor = torch.from_numpy(label_counts_np).int()

    # Zero-pad only the time axis, stack to (batch, time, dim), then switch to
    # time-major (time, batch, dim).
    padded = [
        np.pad(utt, [(0, longest - utt.shape[0]), (0, 0)], 'constant')
        for utt in utterances
    ]
    feats_tensor = torch.from_numpy(np.asarray(padded)).float().transpose(1, 0)
    assert feats_tensor.shape[0] == longest
    assert feats_tensor.shape[1] == n_samples
    assert feats_tensor.shape[2] == 40

    feat_lens_tensor = torch.from_numpy(np.asarray(frame_counts, dtype=np.int32)).int()

    return (feats_tensor, feat_lens_tensor, labels_tensor, label_lens_tensor)

In [5]:
class RLSTMModel(nn.Module):
    """Bidirectional LSTM stack followed by two linear layers.

    Args:
        embed_size: width of each input frame.
        hidden_size: LSTM hidden size per direction, and the width of the
            intermediate linear layer.
        out_size: number of output scores produced per frame.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, embed_size, hidden_size, out_size, num_layers):
        super(RLSTMModel, self).__init__()
        encoder = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=True
        )
        self.rnns = nn.ModuleList([encoder])
        # Both directions are concatenated, hence the 2 * hidden_size input width.
        self.linear1 = nn.Linear(2 * hidden_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, out_size)

    def forward(self, inputs, lengths):
        """Score every frame of a padded, time-major batch.

        Args:
            inputs: padded batch, time-major (the sequence is packed with the
                default ``batch_first=False``).
            lengths: valid length of each sequence in the batch.

        Returns:
            Per-frame output of the second linear layer, padded back to the
            longest sequence in the batch. No softmax is applied.
        """
        sequence = pack_padded_sequence(inputs, lengths)
        for layer in self.rnns:
            sequence, _ = layer(sequence)
        padded, _ = pad_packed_sequence(sequence)
        return self.linear2(self.linear1(padded))

In [6]:
batch_size = 16
num_workers = 4

# Options shared by all three loaders.
_loader_opts = dict(collate_fn=collate_fn, num_workers=num_workers, pin_memory=pin_memory)

# Dev and train batches are shuffled and use the common batch size.
dset_dev = SpeechDataset(dev_feats, dev_labels)
dev_loader = DataLoader(dset_dev, batch_size=batch_size, shuffle=True, **_loader_opts)

dset_train = SpeechDataset(train_feats, train_labels)
train_loader = DataLoader(dset_train, batch_size=batch_size, shuffle=True, **_loader_opts)

# Test utterances are served one at a time and in file order.
dset_test = SpeechDataset(test_feats, test_labels)
test_loader = DataLoader(dset_test, batch_size=1, shuffle=False, **_loader_opts)

In [7]:
# Training hyperparameters.
num_epochs = 15
lr = 0.001
# Input width handed to the LSTM; collate_fn asserts that features have 40 columns.
embed_size = 40
hidden_size = 512
# Number of scores the model emits per frame; presumably len(PHONEME_MAP) + 1
# for the blank symbol -- TODO confirm against phoneme_list.
out_size = 47
num_layers = 3

model = RLSTMModel(embed_size, hidden_size, out_size, num_layers)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

ctc_loss = CTCLoss()
# Index 0 is the blank (rendered as a space). SpeechDataset shifts every label
# up by one, so label k from the data files maps to label_map[k + 1].
label_map = [' '] + PHONEME_MAP
decoder = CTCBeamDecoder(
    labels=label_map,
    blank_id=0
    )

# Only the model is moved to the GPU here; batches are moved inside the loops.
if use_cuda:
    model = model.cuda()

In [8]:
def train(epoch):
    """Run one optimisation pass over ``train_loader`` and checkpoint the model.

    Prints the running loss (accumulated loss divided by the number of
    utterances seen so far) every 100 batches and once more at the end, then
    saves the model's ``state_dict`` to ``'<epoch>-model.pkl'``.

    Args:
        epoch: epoch number, used in the log line and the checkpoint name.
    """
    model.train()
    running_loss = 0
    seen = 0

    for step, batch in enumerate(train_loader):
        frames, frame_lens, targets, target_lens = batch
        # Only the features go to the GPU; the loss is handed the CPU length
        # and target tensors as they come out of collate_fn.
        frames = frames.cuda() if use_cuda else frames

        optimizer.zero_grad()
        outputs = model(Variable(frames), frame_lens.numpy())
        batch_loss = ctc_loss(
            outputs, Variable(targets), Variable(frame_lens), Variable(target_lens)
        )
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.data[0]
        seen += frame_lens.shape[0]

        if step % 100 == 0:
            print("Batch %d Loss %f" % (step, running_loss / seen))

    print("Epoch %d Train Loss %f" % (epoch, running_loss / seen))
    checkpoint_path = str(epoch) + '-model.pkl'
    torch.save(model.state_dict(), checkpoint_path)

In [9]:
def eval(epoch):
    """Compute and print the loss over ``dev_loader``.

    The accumulated loss is divided by ``dev_feats.shape[0]`` (the number of
    dev utterances on disk), not by a count taken from the loader.

    NOTE(review): this name shadows the builtin ``eval``, and the forward pass
    runs with autograd enabled.

    Args:
        epoch: epoch number, used only in the log line.
    """
    model.eval()
    total_loss = 0

    for frames, frame_lens, targets, target_lens in dev_loader:
        if use_cuda:
            frames = frames.cuda()

        outputs = model(Variable(frames), frame_lens.numpy())
        batch_loss = ctc_loss(
            outputs, Variable(targets), Variable(frame_lens), Variable(target_lens)
        )
        total_loss += batch_loss.data[0]

    n_utterances = dev_feats.shape[0]
    print("Epoch %d Dev Loss %f" % (epoch, total_loss / n_utterances))

In [None]:
def test(epoch):
    """Decode the test set with a saved checkpoint and write ``output.txt``.

    Loads ``'<epoch>-model.pkl'`` into the global ``model``, beam-decodes every
    batch from ``test_loader`` and writes one ``Id,Predicted`` CSV row per
    decoded utterance, keeping only the first beam of each.

    The ``Id`` column is the batch index, so the rows line up with the
    utterances only while ``test_loader`` uses ``batch_size=1`` (collate_fn
    re-orders the samples inside larger batches).

    Args:
        epoch: epoch number of the checkpoint to load.
    """
    model.load_state_dict(torch.load(str(epoch)+'-model.pkl'))
    model.eval()

    # The context manager closes the file even if decoding raises part-way.
    with open('output.txt', 'w') as f:
        f.write('Id,Predicted\n')

        # labels / labels_lens are the dummy targets attached to the test split.
        for batch_idx, (feats, feats_lens, labels, labels_lens) in enumerate(test_loader):
            if use_cuda:
                feats = feats.cuda()

            logits = model(Variable(feats), feats_lens.numpy())
            # The model output is time-major (time, batch, classes); the beam
            # decoder expects batch-major (batch, time, classes).
            probs = F.softmax(logits, dim=2).data.cpu().transpose(0, 1).contiguous()
            output, scores, timesteps, out_seq_len = decoder.decode(probs=probs, seq_lens=feats_lens)
            for i in range(output.size(0)):
                # int() accepts both plain ints and one-element tensors.
                best_len = int(out_seq_len[i, 0])
                chrs = "".join(label_map[int(o)] for o in output[i, 0, :best_len])
                f.write(str(batch_idx) + ',' + chrs + '\n')

In [None]:
# Resume from the checkpoint that train() saved for epoch 7; this cell requires
# '7-model.pkl' to already exist in the working directory.
epoch = 7
model.load_state_dict(torch.load(str(epoch)+'-model.pkl'))

# Continue with epochs 8 .. num_epochs - 1.
for epoch in range(8, num_epochs):
    epoch_start_time = time.time()
    train(epoch)
    # Minutes elapsed since the start of the epoch (training only).
    print((time.time() - epoch_start_time)/60)
    eval(epoch)
    # Minutes elapsed since the start of the epoch, including dev evaluation.
    print((time.time() - epoch_start_time)/60)

Batch 0 Loss 22.945805
Batch 100 Loss 19.935591
Batch 200 Loss 20.308666
Batch 300 Loss 20.817042
Batch 400 Loss 21.093894
Batch 500 Loss 21.300864
Batch 600 Loss 21.294295
Batch 700 Loss 21.435289
Batch 800 Loss 21.525262
Batch 900 Loss 21.478372
Batch 1000 Loss 21.397946
Batch 1100 Loss 21.518361
Batch 1200 Loss 21.856545
Batch 1300 Loss 21.957307
Batch 1400 Loss 22.053671
Batch 1500 Loss 22.114248
Epoch 8 Train Loss 22.077339
46.97245885928472
Epoch 8 Dev Loss 27.705164
47.99044941663742
Batch 0 Loss 20.660088
Batch 100 Loss 17.130318
Batch 200 Loss 18.015152
Batch 300 Loss 18.608673
Batch 400 Loss 18.913439
Batch 500 Loss 18.884309
Batch 600 Loss 19.152796
Batch 700 Loss 19.309788
Batch 800 Loss 19.362247
Batch 900 Loss 19.631692
Batch 1000 Loss 19.879479
Batch 1100 Loss 20.015452
Batch 1200 Loss 20.107218
Batch 1300 Loss 20.214188
Batch 1400 Loss 20.228809
Batch 1500 Loss 20.408028
Epoch 9 Train Loss 20.428026
46.92634385029475
Epoch 9 Dev Loss 27.759174
47.94999454418818
Batch 0 