<center> <h1> Sequence-to-Sequence Models </h1> </center>


In [0]:
!wget https://raw.githubusercontent.com/jhu-intro-hlt/jhu-intro-hlt.github.io/master/data-seq2seq-hw/cmudict.{dev,train,test}.src -q -nc
!wget https://raw.githubusercontent.com/jhu-intro-hlt/jhu-intro-hlt.github.io/master/data-seq2seq-hw/cmudict.{dev,train,test}.tgt -q -nc
!wget https://raw.githubusercontent.com/jhu-intro-hlt/jhu-intro-hlt.github.io/master/data-seq2seq-hw/cmudict.small.train.src -q -nc
!wget https://raw.githubusercontent.com/jhu-intro-hlt/jhu-intro-hlt.github.io/master/data-seq2seq-hw/cmudict.small.train.tgt -q -nc

In [0]:
!pr -m -t cmudict.small.train.src cmudict.small.train.tgt | head

In [0]:
import math
import random
import torch
import time
import editdistance as ed
random.seed(1234)
torch.manual_seed(1234)
torch.cuda.set_device(0)

# Data Reader

In [0]:
SPL_SYMS = ['<BOS>', '<EOS>', '<UNK>']


class ParallelCorpus(object):
    def __init__(self,
                 src_file, tgt_file,
                 src_vocab=None, tgt_vocab=None):
        self.src_vocab = self.make_vocab(src_file, src_vocab)
        self.tgt_vocab = self.make_vocab(tgt_file, tgt_vocab)
        self.src_idx2vocab = self.make_idx2vocab(self.src_vocab)
        self.tgt_idx2vocab = self.make_idx2vocab(self.tgt_vocab)
        self.src_data = self.numberize(src_file, self.src_vocab)
        self.tgt_data = self.numberize(tgt_file, self.tgt_vocab) if tgt_file is not None else None
        assert len(self.src_data) == len(self.tgt_data), 'Source and Target have unequal lengths!'
        self.data_size = len(self.src_data)

    def numberize(self, txt, vocab):
        data = []
        with open(txt, 'r', encoding='utf8') as corpus:
            for l in corpus:
                d = [vocab['<BOS>']] + [vocab.get(tok, vocab['<UNK>']) for tok in l.strip().split()] + [vocab['<EOS>']]
                d = torch.Tensor(d).long()
                d = d.unsqueeze(0) # shape = (1, N)
                data.append((d, l.strip()))
        return data

    def make_idx2vocab(self, vocab):
        if vocab is not None:
            idx2vocab = {v: k for k, v in vocab.items()}
            return idx2vocab
        else:
            return None

    def make_vocab(self, txt, vocab):
        if vocab is None and txt is not None:
            v = {i: idx for idx, i in enumerate(SPL_SYMS)}
            with open(txt, 'r', encoding='utf8') as corpus:
                for line in corpus:
                    for token in line.strip().split():
                        v[token] = v.get(token, len(v))
            return v
        else:
            return vocab

    def get(self, idx):
        if self.tgt_data is not None:
            return self.src_data[idx], self.tgt_data[idx]
        else:
            return self.src_data[idx], (None, None)

In [0]:
train_corpus = ParallelCorpus('cmudict.small.train.src',
                              'cmudict.small.train.tgt')
dev_corpus = ParallelCorpus('cmudict.dev.src', 
                            'cmudict.dev.tgt',
                            train_corpus.src_vocab, 
                            train_corpus.tgt_vocab)
test_corpus = ParallelCorpus('cmudict.test.src',
                             'cmudict.test.tgt',
                             train_corpus.src_vocab, 
                             train_corpus.tgt_vocab)
print(train_corpus.data_size, dev_corpus.data_size, test_corpus.data_size)

## Seq2Seq with LSTM Language Models



In [0]:
class LSTMLM(torch.nn.Module):
  def __init__(self,
              vocab_size,
              embedding_size,
              hidden_size,
              num_layers=1,
              dropout=0.1):
    super().__init__()
    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.dropout = torch.nn.Dropout(dropout)
    
    self.embedding = torch.nn.Embedding(vocab_size, embedding_size)

    self.rnn = torch.nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)

    self.output = torch.nn.Linear(hidden_size, vocab_size)
    
    pass

  def forward(self, x, init_hidden_state=None):
    assert x.shape[0] == 1 
  
    emb = self.embedding(x)
    
    emb = self.dropout(emb) 

    if init_hidden_state is None:
      h0 = torch.zeros(self.num_layers, 1, self.hidden_size)
      h0 = h0.cuda()
      c0 = torch.zeros(self.num_layers, 1, self.hidden_size)
      c0 = c0.cuda()
    else:
      h0, c0 = init_hidden_state
    hidden_states, (hn, hc) = self.rnn(emb, (h0, c0))
    hidden_states = hidden_states.cuda()

    final_hidden_state = hn
    final_cell_state = hc
    final_state = [final_hidden_state, final_cell_state]

  
    hidden_states = self.dropout(hidden_states) 

    output_dist = self.output(hidden_states)

    return output_dist, hidden_states, final_state 
    
  def generate(self, start_idx, end_idx, init_hidden_state=None, idx2vocab=None, max_len=50):

    if init_hidden_state is None:
      h = torch.zeros((self.num_layers, 1, self.hidden_size))
      c = torch.zeros((self.num_layers, 1, self.hidden_size))
      h = h.cuda() if self.embedding.weight.is_cuda else h
      c = c.cuda() if self.embedding.weight.is_cuda else c
    else:
      h, c = init_hidden_state
    
    inp = torch.tensor([start_idx]).long().unsqueeze(0)
    inp = inp.cuda() if self.embedding.weight.is_cuda else inp

    out = []
    for _ in range(max_len):
      with torch.no_grad():
        emb = self.embedding(inp)
        o, (h, c) = self.rnn(emb, (h, c))
        o_dist = self.output(o)
        _, pred = o_dist.max(dim=2)
        if pred.item() == end_idx:
          break
        out.append(pred.item() if idx2vocab is None else idx2vocab[pred.item()])
        inp = pred
    return out

In [0]:
class EncoderDecoder(torch.nn.Module):
  def __init__(self,
              src_vocab_size,
              tgt_vocab_size,
              embedding_size,
              hidden_size,
              num_layers=1,
              dropout=0.1,
              max_grad_norm=5.0):
    super().__init__()
    self.hidden_size = hidden_size
    self.embedding_size = embedding_size
    self.num_layers = num_layers
    self.max_grad_norm = max_grad_norm
    self.encoder = LSTMLM(src_vocab_size, embedding_size, hidden_size, num_layers, dropout)
    self.decoder = LSTMLM(tgt_vocab_size, embedding_size, hidden_size, num_layers, dropout)
    self.log_smax = torch.nn.LogSoftmax(dim=-1)
    self.loss = torch.nn.NLLLoss(reduction='mean', ignore_index=-1)
    #We are going to package the optimizer inside this class
    self.optimizer = torch.optim.Adam(self.parameters())

  def train_step(self, x, y):

    self.optimizer.zero_grad()
    _loss, acc = self(x, y)
    _loss.backward()
    grad_norm = torch.nn.utils.clip_grad_norm_(self.parameters(),
                                                self.max_grad_norm)

    if math.isnan(grad_norm):
      print('skipping update grad_norm is nan!')
    else:
      self.optimizer.step()
    loss = _loss.item()
    return loss, acc

  def forward(self, x, y):
    out_src, hidden_src, final_src = self.encoder(x)
    
    y_input = y[:, :-1]
    y_output = y[:, 1:]

    out_tgt, hidden_tgt, final_tgt = self.decoder(y_input, final_src)
    out_tgt_lsm = self.log_smax(out_tgt)
    
    
    loss = self.loss(out_tgt_lsm.squeeze(0), y_output.squeeze(0))
    _, pred = out_tgt_lsm.max(dim=2)
    accuracy = float((pred == y_output).sum()) / y_output.numel()
    return loss, accuracy

  def generate(self, x, start_idx, end_idx, idx2vocab=None, max_len=50):
    _, _, final_src = self.encoder(x)
    out = self.decoder.generate(start_idx, end_idx, final_src, idx2vocab)
    return out

We create an instance of our EncoderDecoder below.

In [0]:
model = EncoderDecoder(src_vocab_size=len(train_corpus.src_vocab),
                       tgt_vocab_size=len(train_corpus.tgt_vocab),
                       embedding_size=73,
                       hidden_size=73,
                       num_layers=1)
model = model.cuda()
print(model)
print('num parameters:', sum([p.numel() for p in model.parameters()]))

## Training routine

The `train` method defines our training routine. For the first seq2seq model `max_epochs` should be at least 15. For the second model, max_epochs can be reduced to 10.

In [0]:
def train(model, train_corpus, dev_corpus, max_epochs=15):
  sum_loss, sum_acc = 0., 0.
  train_instances_idxs = list(range(train_corpus.data_size))
  st = time.time()
  for epoch_i in range(max_epochs):
    sum_loss, sum_acc = 0., 0.
    random.shuffle(train_instances_idxs)
    model.train()
    for i in train_instances_idxs:
      (x, _), (y, _) = train_corpus.get(i)
      x, y = x.cuda(), y.cuda()
      l, a = model.train_step(x, y)
      sum_loss += l
      sum_acc += a
    print(f"epoch: {epoch_i} time elapsed: {time.time() - st:.2f}")
    print(f"train loss: {sum_loss/train_corpus.data_size:.4f} train acc: {sum_acc/train_corpus.data_size:.4f}")
    sum_loss, sum_acc = 0., 0.
    model.eval()
    for dev_i in range(dev_corpus.data_size):
      (x, x_str), (y, y_str) = dev_corpus.get(dev_i)
      x, y = x.cuda(), y.cuda()
      with torch.no_grad():
        l, a = model(x, y)
        sum_loss += l.item()
        sum_acc += a
    print(f"  dev loss: {sum_loss/dev_corpus.data_size:.4f}   dev acc: {sum_acc/dev_corpus.data_size:.4f}")
  return model

## Evaluation Routine
We are going to evaluate our model's predictions using Character Error Rate (CER). This measures the number of edits (insertions, deletions and substitutions) needed to convert our model's prediction to the correct output sequence. The method before computes and prints the CER for each word in the test set and reports the average CER for the entire test set in the end.

In [0]:
def evaluate(model, test_corpus):
  print('Evaluation:')
  sum_cer = 0.0
  model.eval()
  for test_i in range(test_corpus.data_size):
    (x, x_str), (y, y_str) = test_corpus.get(test_i)
    x, y = x.cuda(), y.cuda()
    pred_seq = model.generate(x,
                              test_corpus.tgt_vocab[SPL_SYMS[0]],
                              test_corpus.tgt_vocab[SPL_SYMS[1]],
                              test_corpus.tgt_idx2vocab)
    cer = float(ed.eval(y_str.split(), pred_seq)) / len(y_str.split())
    y_hat = ' '.join(pred_seq)
    x_str = ''.join(x_str.split())
    print(f"{test_i} {x_str} pred: {y_hat} ref: {y_str} cer: {cer:.4f}")
    sum_cer += cer
  print(f"Avg CER: {sum_cer/test_corpus.data_size:.4f}")

## Training the EncoderDecoder

In [0]:
model = train(model, train_corpus, dev_corpus) 

In [0]:
evaluate(model, test_corpus)

## Seq2Seq with Attention

In [0]:
class Attention(torch.nn.Module):
  def __init__(self,
                hidden_size):
    super().__init__()
    self.hidden_size = hidden_size
    m = torch.Tensor(hidden_size, hidden_size)
    torch.nn.init.xavier_uniform_(m)
    self.attn_weight_matrix = torch.nn.Parameter(m)
      
  def forward(self, encoder_states, prev_decoder_state):

    prev_decoder_state = prev_decoder_state.view(self.hidden_size, 1)

    attn_wts = torch.mm(self.attn_weight_matrix, prev_decoder_state)
    
    attn_wts = attn_wts/math.sqrt(64)

    self.softm = torch.nn.LogSoftmax(dim=0)
    attn_probs = self.softm(attn_wts)

    context_vector = attn_probs * torch.sum(encoder_states)
    
    context_vector = context_vector.unsqueeze(1) 
    return context_vector

In [0]:
class AttentionDecoder(torch.nn.Module):
  def __init__(self,
                vocab_size,
                embedding_size,
                hidden_size,
                num_layers=1,
                dropout=0.1):
    super().__init__()
    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.dropout_prop = dropout
    self.dropout = torch.nn.Dropout(dropout)
    self.num_layers = num_layers
    self.embedding = torch.nn.Embedding(self.vocab_size, self.embedding_size)
    self.output_proj = torch.nn.Linear(self.hidden_size, self.vocab_size)
    self.rnn = torch.nn.LSTM(embedding_size + hidden_size, hidden_size, num_layers,
                              batch_first=True, dropout=dropout, bidirectional=False)
    self.attention = Attention(self.hidden_size)

  def forward(self, encoder_states, y):

    h, c = (torch.zeros(self.num_layers, 1, self.hidden_size),
            torch.zeros(self.num_layers, 1, self.hidden_size))

    tgt_embedding = self.embedding(y[0, 0]).view(1, self.embedding_size)

    output_buffer = []
    for tgt_idx in range(y.shape[1] - 1):
      context_vector = self.attention(encoder_states, h).view(1, -1)

      decoder_input = torch.cat((tgt_embedding, context_vector), -1).unsqueeze(0)

      o, (h, c) = self.rnn(decoder_input, (h, c))

      output = self.output_proj(o).view(1, 1, self.vocab_size)
      
      output_buffer.append(output)
      tgt = y[:, tgt_idx + 1]
      tgt_embedding = self.embedding(tgt)
    output_dist = torch.cat(output_buffer, dim=1)
    return output_dist

  def generate(self, encoder_states, start_idx, end_idx, idx2vocab=None, max_len=50):

    h, c = (torch.zeros(self.num_layers, 1, self.hidden_size).type_as(encoder_states),
            torch.zeros(self.num_layers, 1, self.hidden_size).type_as(encoder_states))
    
    inp = torch.tensor(start_idx).view(1, 1)
    inp = inp.cuda()


    out = [] 
    for _ in range(max_len):

      tgt_embedding = self.embedding(inp).view(1, self.embedding_size)

      context_vector = self.attention(encoder_states, h).view(1, -1)

      decoder_input = torch.cat((context_vector, tgt_embedding), -1 ).unsqueeze(0)
      o, (h, c) = self.rnn(decoder_input, (h, c))

      o_dist = self.output_proj(o).view(1, 1, self.vocab_size)

      max_idxa = torch.argmax(o_dist)
      max_idx = max_idxa.item()
 
      if max_idx == end_idx:
        break
      else:
        idx2vocab = str(max_idx)
        out.append(idx2vocab)
      inp = torch.tensor(max_idx)
      inp = inp.cuda()
    return out

In [0]:
class EncoderDecoderAttention(torch.nn.Module):
  def __init__(self,
                src_vocab_size,
                tgt_vocab_size,
                embedding_size,
                hidden_size,
                num_layers=1,
                dropout=0.0,
                max_grad_norm=5.0):
    super().__init__()
    self.hidden_size = hidden_size
    self.embedding_size = embedding_size
    self.num_layers = num_layers
    self.max_grad_norm = max_grad_norm
    self.encoder = LSTMLM(src_vocab_size, embedding_size, hidden_size, num_layers, dropout)
    self.decoder = AttentionDecoder(tgt_vocab_size, embedding_size, hidden_size, num_layers, dropout)
    self.log_smax = torch.nn.LogSoftmax(dim=-1)
    self.loss = torch.nn.NLLLoss(reduction='mean', ignore_index=-1)
    self.optimizer = torch.optim.Adam(self.parameters())

  def train_step(self, x, y):
    self.optimizer.zero_grad()
    _loss, acc = self(x, y)
    _loss.backward()
    grad_norm = torch.nn.utils.clip_grad_norm_(self.parameters(),
                                                self.max_grad_norm)

    if math.isnan(grad_norm):
      print('skipping update grad_norm is nan!')
    else:
      self.optimizer.step()
    loss = _loss.item()
    return loss, acc

  def forward(self, x, y):
    _, encoder_states, _ = self.encoder(x)
    out_tgt = self.decoder(encoder_states, y)
    out_tgt_lsm = self.log_smax(out_tgt)
    y_output = y[:, 1:]
    loss = self.loss(out_tgt_lsm.squeeze(0), y_output.squeeze(0))
    _, pred = out_tgt.max(dim=2)
    accuracy = float((pred == y_output).sum()) / y_output.numel()
    return loss, accuracy

  def generate(self, x, start_idx, end_idx, idx2vocab=None, max_len=50):
    _, encoder_states, _ = self.encoder(x)
    out = self.decoder.generate(encoder_states, start_idx, end_idx, idx2vocab)
    return out

In [0]:
print(train_corpus.data_size, dev_corpus.data_size, test_corpus.data_size)
model_attn = EncoderDecoderAttention(src_vocab_size=len(train_corpus.src_vocab),
                                tgt_vocab_size=len(train_corpus.tgt_vocab),
                                embedding_size=64,
                                hidden_size=64,
                                num_layers=1)
model_attn = model_attn.cuda()
print(model_attn)
print('num parameters:', sum([p.numel() for p in model_attn.parameters()]))

In [0]:
model_attn = train(model_attn, train_corpus, dev_corpus, max_epochs=10) 

RuntimeError: ignored

In [0]:
evaluate(model_attn, test_corpus)

Evaluation:
0 vanlaningham pred: 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 ref: V AE2 N L AE1 N IH0 NG HH AE2 M cer: 4.5455
1 utility's pred: 55 35 35 35 15 16 35 15 16 35 15 16 35 15 16 15 16 35 15 16 15 16 15 16 35 15 16 15 16 35 15 16 15 16 15 16 35 15 16 15 16 15 16 35 15 16 15 16 15 16 ref: Y UW0 T IH1 L AH0 T IY0 Z cer: 5.5556
2 rothenberg pred: 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 ref: R AO1 TH AH0 N B ER0 G cer: 6.2500
3 kinesiology pred: 35 73 21 35 73 41 35 73 41 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 35 ref: K IH2 N IH0 S IY2 AA1 L AH0 JH IY0 cer: 4.5455
4 reclassified pred: 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 

KeyboardInterrupt: ignored