In [71]:
import os
import torch
import numpy as np
import torch.nn as nn
import torch.nn.utils as utils
# from ctcdecode import CTCBeamDecoder
# from warpctc_pytorch import CTCLoss
from torch.utils.data import Dataset, DataLoader

In [72]:
class PhonemeDataset(Dataset):
    def __init__(self, path, name):
        self.data = np.load(os.path.join(path, '{}.npy'.format(name)), encoding='bytes')
        self.labels = np.load(os.path.join(path, '{}_labels.npy'.format(name)), encoding='bytes')
        # save dimensions of data
        self.length = self.data.shape[0]
        self.seq_lens = [seq.shape[0] for seq in self.data]
        self.lab_lens = [lab.shape[0] for lab in self.labels]
        self.seq_len = max(self.seq_lens)
        self.lab_len = max(self.lab_lens)
        # tensorize
        self.data = [torch.Tensor(d) for d in self.data]
        self.labels = [torch.LongTensor(l) for l in self.labels]
    
    def __len__(self):
        return self.length
    
    def __getitem__(self, i):
        return self.data[i], self.labels[i]
    
    @staticmethod
    def collate(seq_list):
        inputs, targets = zip(*seq_list)
        lens = [len(seq) for seq in inputs]
        seq_order = sorted(range(len(lens)), key=lens.__getitem__, reverse=True)
        inputs = [inputs[i] for i in seq_order]
        targets = [targets[i] for i in seq_order]
        return inputs, targets

In [68]:
ts = PhonemeDataset('data', 'train')
vs = PhonemeDataset('data', 'dev')

In [69]:
tl = DataLoader(ts, batch_size=64, shuffle=True, collate_fn=TrainDataset.collate)
vl = DataLoader(vs, batch_size=64, shuffle=True, collate_fn=TrainDataset.collate)

In [152]:
class BaselineModel(nn.Module):
    def __init__(self):
        """Baseline Model
        - 3 stacked BiLSTM layers, each of 256 units.
        - 1 Dense layer of 47 (num_classes) units.
        - Adam optimizer with default learning rate of 1e-3
        - Decoding with beam search and a beam width of 100.
        """
        super(BaselineModel, self).__init__()

        # save parameters
        self.n_layers = 3
        self.input_size = 40
        self.hidden_size = 256
        self.output_size = 46

        # layers
        self.recurrent = nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size, num_layers=self.n_layers, bias=False, bidirectional=True)
        self.scoring = nn.Linear(2*self.hidden_size, self.output_size)

    def forward(self, seq_list):
        """Forward pass in the model.
        Takes:
        - seq_batch: list of length B, with tensors of dim L x 40 (L differs)
        Returns:
        - output: list of length B, with tensors of dim L x V (L differs)
        Dimensions:
        - L: sequence length
        - B: batch size
        - H: hidden size
        - V: vocab size
        """
        # packed padded sequence
        batch_size = len(seq_list)
        lengths = [len(s) for s in seq_list]
        print(lengths)
        print(sum(lengths))
        packed_input = utils.rnn.pack_sequence(seq_list)
        print('packed input', packed_input.data.size())
        # LSTM - embedded data as L x B
        hidden = None
        output_packed, hidden = self.recurrent(packed_input, hidden)
        print('output packed', output_packed.data.size())
        output_padded, _ = utils.rnn.pad_packed_sequence(output_packed)
        print('output',output_padded.shape)
        output_flatten = torch.cat([output_padded[:lengths[i], i] for i in range(batch_size)])
        # full linear layer for scoring
        print(output_flatten.shape)
        print(batch_size)
        output_scoring = self.scoring(output_flatten)
        print(output_scoring.size())
        output_unflatten = [output_scoring[sum(lengths[:i]) : sum(lengths[:i+1]), :] for i in range(batch_size)]
        return output_unflatten

In [153]:
lengths = [2,2,3,4,5]

In [154]:
sum(lengths[:1]), lengths[0:2]

(2, [2, 2])

In [155]:
model = BaselineModel()

In [166]:
class Trainer():
  def __init__(self, model, train_loader, val_loader, epochs, run_id='exp'):
    self.model = model
    self.run_id = run_id
    self.epochs = 0
    self.n_epochs = epochs
    self.val_loader = val_loader
    self.train_loader = train_loader
    self.gpu = torch.cuda.is_available()
    self.val_losses = []
    self.train_losses = []

    if self.gpu: 
      self.model = self.model.cuda()
    
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3)
    self.criterion = nn.CrossEntropyLoss()
#     self.criterion = CTCLoss()
  
  def train(self):
    self.model.train()

    epoch_loss = 0

    for batch_idx, (inputs, targets) in enumerate(self.train_loader):
      print('\rBatch {:03}/{:03}'.format(batch_idx+1, len(self.train_loader)), end='')
      epoch_loss += self.train_batch(inputs, targets)
    
    epoch_loss = epoch_loss / len(self.train_loader)

    self.train_losses.append(epoch_loss)

    self.epochs += 1

    print('\r[TRAIN] Epoch {:02}/{:02} Loss {:7.4f}'.format(
      self.epochs, self.n_epochs, epoch_loss
    ), end='\t')
  
  def train_batch(self, inputs, targets):
    if self.gpu:
      inputs = inputs.cuda()
      targets = targets.cuda()
    
    outputs = self.model(inputs)
#     print('raw output', len(outputs), outputs[0].size(), outputs[1].size())
    output_sizes = [out.size(0) for out in outputs]
    outputs = utils.rnn.pad_sequence(outputs)
#     print('padded output',outputs.size())
#     targets = targets
#     .view(-1).type(torch.LongTensor)
    target_sizes = [tar.size(0) for tar in targets]
    target_sizes = torch.LongTensor(target_sizes)
    targets = torch.cat(targets)
#     print('padded targets', targets.size())
#     print('target sizes:', len(target_sizes))
#     print('target sizes sum:', sum(target_sizes))

    loss = self.criterion(outputs, targets, output_sizes, target_sizes)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    return loss.detach().cpu().item()
  
  def save(self):
    torch.save(
      {'state_dict': self.model.state_dict()},
      os.path.join('experiments', self.run_id, 'model_{}.pkl'.format(self.epochs))
    )
    with open(os.path.join('experiments', self.run_id, 'train_losses.txt'), 'w') as fw:
      for i in range(len(self.train_losses)):
        fw.write('{:02} {:10.6}\n'.format(i, self.train_losses[i]))

In [167]:
trainer = Trainer(model, tl, vl, 10)

In [168]:
trainer.train()

Batch 001/387[1248, 1247, 1146, 1055, 1036, 1020, 998, 984, 981, 907, 905, 896, 883, 856, 850, 832, 827, 826, 825, 824, 768, 758, 747, 739, 732, 699, 697, 687, 675, 657, 613, 611, 607, 597, 587, 572, 555, 554, 543, 517, 501, 485, 479, 478, 475, 468, 461, 451, 449, 442, 434, 421, 411, 407, 404, 381, 362, 337, 330, 301, 270, 250, 206, 149]
41413
packed input torch.Size([41413, 40])
output packed torch.Size([41413, 512])
output torch.Size([1248, 64, 512])
torch.Size([41413, 512])
64
torch.Size([41413, 46])
raw output 64 torch.Size([1248, 46]) torch.Size([1247, 46])
padded output torch.Size([1248, 64, 46])
padded targets torch.Size([5034])
target sizes: 64
target sizes sum: 5034


ValueError: Expected input batch_size (1248) to match target batch_size (5034).

In [32]:
labels = np.load(os.path.join('data', 'train_labels.npy'), encoding='bytes')

In [42]:
max([l.shape[0] for l in labels])

210

In [34]:
labels = [torch.LongTensor(lab) for lab in labels]

In [38]:
l = utils.rnn.pad_sequence(labels, batch_first=True)

In [41]:
l.shape

torch.Size([24724, 210])