<center> <h1> Transfer Learning: Sentence Similarity Task </h1> </center>

# Dataset

In [0]:
!wget https://raw.githubusercontent.com/jhu-intro-hlt/jhu-intro-hlt.github.io/master/data-transfer-learning-hw/{dev,test,train}.tsv -q -nc
!head train.tsv 

Can you suggest a best budget phone below 15k?	What is the best phone I can buy under the price of 15000?	1
How can I make a disabled or accident-prone spouse feel useful and respected?	How does it feel to suddenly realize that your children are the product of a broken home because of you and/or your spouse?	0
I had an overdraft in Wells Fargo US! What will happen if I don't pay it?	How do you stop payment on a Wells Fargo check?	0
What are the best places to visit this December in India?	What will be the best place to visit in December in India?	1
What should I do if I want to renew an expired driver's license, but had an accident while the license was expired? (India)	In what US state is it easy to get a driver's license?	0
How/why did Stanford develop such a strong entrepreneurial culture? Why doesn't UC Berkeley have such a strong entrepreneurial culture in comparison?	How strong is the startup culture in Berkeley?	0
How much weight can a honey bee lift?	How much force in Newton re

# Pretrained Models
(https://github.com/huggingface/transformers) 

In [0]:
!pip install pytorch-transformers -q 

In [0]:
import torch
import random
import time
import math
from pytorch_transformers import DistilBertModel as BertModel
from pytorch_transformers import DistilBertTokenizer as BertTokenizer
random.seed(1234)
torch.manual_seed(1234)
torch.cuda.set_device(0)

# Data Reader

In [0]:
SPL_SYMS = ['<PAD>','<BOS>', '<EOS>', '<UNK>']


class STSCorpus(object):
  def __init__(self,
              file,
              vocab=None,
              cuda=False,
              batch_size=1, bert_format=0):
    self.bert_format = bert_format
    if self.bert_format == 0:
      self.bert_tokenizer = None
      self.max_vocab = 64000
    else:
      self.bert_tokenizer = BertTokenizer.from_pretrained('distilbert-base-uncased')
      self.max_vocab = self.bert_tokenizer.vocab_size
    self.max_size = 0
    self.batch_size = batch_size
    self.vocab = self.make_vocab(file, vocab)
    self.idx2vocab = self.make_idx2vocab(self.vocab)
    self.data = self.numberize(file, self.vocab, cuda)
    self.batch_data = self.batchify()
    self.data_size = len(self.batch_data)

  def batchify(self,):
    self.batch_data = []
    curr_batch = []
    max_x1, max_x2 = 0, 0
    for x1, x2, y in self.data:
      if len(curr_batch) < self.batch_size:
        curr_batch.append((x1, x2, y))
        max_x1 = max(max_x1, x1.shape[1])
        if self.bert_format == 0:
          max_x2 = max(max_x2, x2.shape[1]) 
      else:
        
        _x1, _x2, _y = zip(*curr_batch)
        
        
        if self.bert_format == 0:
          _x1 = [torch.cat((torch.zeros(1, max_x1 - i.shape[1]).type_as(i), i), dim=1) for i in _x1]
          batch_x1 = torch.cat(_x1, dim=0)
          _x2 = [torch.cat((torch.zeros(1, max_x2 - i.shape[1]).type_as(i), i), dim=1) for i in _x2]
          batch_x2 = torch.cat(_x2, dim=0) if _x2[0] is not None else None
        else:
          _x1 = [torch.cat((i, torch.zeros(1, max_x1 - i.shape[1]).type_as(i)), dim=1) for i in _x1]
          batch_x1 = torch.cat(_x1, dim=0)
          batch_x2 = None
        batch_y = torch.cat(_y, dim=0)
        self.batch_data.append((batch_x1, batch_x2, batch_y))
        curr_batch = []
        max_x1, max_x2 = 0, 0

    if len(curr_batch) > 0:
      print(len(self.batch_data),  max_x1, max_x2)
      _x1, _x2, _y = zip(*curr_batch)
      
      
      if self.bert_format == 0:
        _x1 = [torch.cat((torch.zeros(1, max_x1 - i.shape[1]).type_as(i), i), dim=1) for i in _x1]
        batch_x1 = torch.cat(_x1, dim=0)
        _x2 = [torch.cat((torch.zeros(1, max_x2 - i.shape[1]).type_as(i), i), dim=1) for i in _x2]
        batch_x2 = torch.cat(_x2, dim=0) if _x2[0] is not None else None
      else:
        _x1 = [torch.cat((i, torch.zeros(1, max_x1 - i.shape[1]).type_as(i)), dim=1) for i in _x1]
        batch_x1 = torch.cat(_x1, dim=0)
        batch_x2 = None
      batch_y = torch.cat(_y, dim=0)
      self.batch_data.append((batch_x1, batch_x2, batch_y))
    return self.batch_data

  def numberize(self, txt, vocab, cuda=False):
    data = []
    max_size = 0
    with open(txt, 'r', encoding='utf8') as corpus:
      for l in corpus:
        l1, l2, y = l.split('\t')
        y = torch.Tensor([[float(y)]]).float()
        if self.bert_format == 0:
          d1 = [vocab['<BOS>']] + [vocab.get(t, vocab['<UNK>']) for t in l1.strip().split()] + [vocab['<EOS>']]
          d1 = torch.Tensor(d1).long()
          d1 = d1.unsqueeze(0) 
          d2 = [vocab['<BOS>']] + [vocab.get(t, vocab['<UNK>']) for t in l2.strip().split()] + [vocab['<EOS>']]
          d2 = torch.Tensor(d2).long()
          d2 = d2.unsqueeze(0) 
          max_size = max(d1.shape[1], d2.shape[1], max_size)
          if cuda:
            d1 = d1.cuda()
            d2 = d2.cuda()
            y = y.cuda()
        elif self.bert_format == 1:
          _d1 = torch.Tensor(self.bert_tokenizer.encode("[CLS] " + l1 + " [SEP]")).long()
          _d2 = torch.Tensor(self.bert_tokenizer.encode(" " + l2 + " [SEP]")).long()
          d = torch.cat([_d1, _d2], dim=0).unsqueeze(0)
          max_size = max(d.shape[1], max_size)
          if cuda:
            d1 = d.cuda()
            d2 = None
            y = y.cuda()
        else:
          pass
        data.append((d1, d2, y))
    self.max_size = max_size
    return data

  def make_idx2vocab(self, vocab):
    if vocab is not None:
      idx2vocab = {v: k for k, v in vocab.items()}
      return idx2vocab
    else:
      return None

  def make_vocab(self, txt, vocab):
    if vocab is None and txt is not None:
      vc = {}
      for line in open(txt, 'r', encoding='utf-8').readlines():
        x1, x2, y = line.strip().split('\t')
        for w in x1.split() + x2.split():
          vc[w] = vc.get(w, 0) + 1
      cv = sorted([(c, w) for w, c in vc.items()], reverse=True)
      cv = cv[:self.max_vocab]
      _, v = zip(*cv)
      v = SPL_SYMS + list(v)
      vocab = {w: idx for idx, w in enumerate(v)}
      return vocab
    else:
      return vocab

  def get(self, idx):
    return self.batch_data[idx]

Creating train, dev and test data objects. (with `bert_format=0`) and places the data on the GPU.

In [0]:
train_corpus = STSCorpus(file='train.tsv',
                         cuda=True,
                         batch_size=32, 
                         bert_format=0)
dev_corpus = STSCorpus(file='dev.tsv', vocab=train_corpus.vocab,
                       cuda=True,
                       batch_size=32, 
                       bert_format=0)
test_corpus = STSCorpus(file='test.tsv', vocab=train_corpus.vocab,
                        cuda=True,
                        batch_size=1,
                        bert_format=0)
print(train_corpus.data_size, dev_corpus.data_size, test_corpus.data_size)

1212 19 15
151 30 23
1213 152 2500


In [0]:
print(train_corpus.batch_data[0][:2])

(tensor([[    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     1,    48,    17,  1022,     7,    23,
          1199,   166,  1932,  6382,     2],
        [    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             1,    13,    18,     9,    76,     7,  4012,    26, 45538,  2312,
           144,  1052,    12, 33066,     2],
        [    0,     0,     0,     0,     0,     0,     0,     1,     9,   171,
            32, 13296,     8, 14337, 18610, 46694,     5,    39,   149,    35,
             9,    96,   468,   111,     2],
        [    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     1,     5,    14,     4,    23,   241,     6,   249,
            91,  2087,     8,    64,     2],
        [    1,     5,    38,     9,    15,    35,     9,   105,     6, 11283,
            32,  5460,  6182, 21845,   118,   171,    32, 12094,   178,     4,
          1515,    75, 40327,

In [0]:
print( train_corpus.batch_data[0][2])

tensor([[1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.]], device='cuda:0')


# Training Routine

In [0]:
def train(model, train_cropus, dev_corpus, max_epochs):
  sum_loss, sum_acc = 0., 0.
  train_instances_idxs = list(range(train_corpus.data_size))
  st = time.time()
  for epoch_i in range(max_epochs):
    sum_loss, sum_acc = 0., 0.
    random.shuffle(train_instances_idxs)
    model.train()
    for i in train_instances_idxs:
      x1, x2, y = train_corpus.get(i)
      l, a = model.train_step(x1, x2, y)
      sum_loss += l
      sum_acc += a
    print(f"epoch: {epoch_i} time elapsed: {time.time() - st:.2f}")
    print(f"train loss: {sum_loss/train_corpus.data_size:.4f} train acc: {sum_acc/train_corpus.data_size:.4f}")
    sum_loss, sum_acc = 0., 0.
    model.eval()
    for dev_i in range(dev_corpus.data_size):
      x1, x2, y = dev_corpus.get(dev_i)
      with torch.no_grad():
        l, a = model(x1, x2, y)
        sum_loss += l
        sum_acc += a
    print(f"  dev loss: {sum_loss/dev_corpus.data_size:.4f}   dev acc: {sum_acc/dev_corpus.data_size:.4f}")
  return model


# Evaluation Routine

In [0]:
def evaluate(model, test_corpus):
  print('Predictions:')
  sum_acc = 0.0
  model.eval()
  for test_i in range(test_corpus.data_size):
    x1, x2, y = test_corpus.get(test_i)
    _, pred = model.predict(x1, x2)
    sum_acc += (1 if pred.item() == y.item() else 0)
  print(f"Avg acc: {sum_acc/test_corpus.data_size:.4f}")

# 1. Baseline Classifier

In [0]:
class Classifier(torch.nn.Module):
    def __init__(self,
                 vocab_size,
                 embedding_size,
                 hidden_size,
                 num_layers=1,
                 dropout=0.1,
                 max_grad_norm=5.0):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding_size = embedding_size
        self.max_grad_norm = max_grad_norm
        self.dropout = torch.nn.Dropout(dropout)
        
        if max(vocab_size,embedding_size ,hidden_size,num_layers) > 0:
          self.embedding = torch.nn.Embedding(vocab_size, embedding_size)

          self.rnn = torch.nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)

          self.output = torch.nn.Linear(2*hidden_size, 1)
  
          self.optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, self.parameters()))
        else:
          pass
        
        self.loss = torch.nn.BCELoss(reduction='mean')

    def predict(self, x1, x2):

        batch_size, seq_len = x1.shape
        batch_size2, seq_len2 = x2.shape
        assert batch_size == batch_size2
        
        emb_x1 = self.embedding(x1)
        emb_x1 = self.dropout(emb_x1)

        emb_x2 = self.embedding(x2)
        emb_x2 = self.dropout(emb_x2)
        
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size)
        h0 = h0.cuda()
        c0 = torch.zeros(self.num_layers, batch_size2, self.hidden_size)
        c0 = c0.cuda()

        out1, _  = self.rnn(emb_x1, (h0, c0))
        
        out2, _ = self.rnn(emb_x2, (h0, c0))

        final_hidden = torch.cat((out1[:,-1], out2[:,-1]), 1).view(batch_size, 2 * self.hidden_size)
        
        final_hidden = self.dropout(final_hidden)

        soo = self.output(final_hidden)
        out = torch.nn.functional.sigmoid(soo).view(batch_size, 1)
        
        pred = out.clone().detach()
        pred[pred >= 0.5] = 1
        pred[pred < 0.5] = 0
        return out, pred

    def forward(self, x1, x2, y):
        out, pred = self.predict(x1, x2)
        loss = self.loss(out, y)
        
        assert pred.shape == y.shape
        acc = (pred == y).sum().item() / y.numel()
        return loss, acc

    def train_step(self, x1, x2, y):

        self.optimizer.zero_grad()
        _loss, acc = self(x1, x2, y) 
        _loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(filter(lambda p: p.requires_grad, self.parameters()),
                                                   self.max_grad_norm)

        if math.isnan(grad_norm):
            print('skipping update grad_norm is nan!')
        else:
            self.optimizer.step()
        loss = _loss.item()
        return loss, acc

In [0]:
base_model = Classifier(vocab_size=len(train_corpus.vocab),
                        embedding_size=1024,
                        hidden_size=1024,
                        num_layers=2)
print(base_model, '\ncontains', sum([p.numel() for p in base_model.parameters() if p.requires_grad]), 'parameters')
base_model = base_model.cuda()

Classifier(
  (dropout): Dropout(p=0.1, inplace=False)
  (embedding): Embedding(61585, 1024)
  (rnn): LSTM(1024, 1024, num_layers=2, batch_first=True)
  (output): Linear(in_features=2048, out_features=1, bias=True)
  (loss): BCELoss()
) 
contains 79858689 parameters


In [0]:
base_model = train(base_model, train_corpus, dev_corpus, 5)



epoch: 0 time elapsed: 235.50
train loss: 0.5644 train acc: 0.6982
  dev loss: 0.5141   dev acc: 0.7448
epoch: 1 time elapsed: 479.92
train loss: 0.4040 train acc: 0.8171
  dev loss: 0.5291   dev acc: 0.7542
epoch: 2 time elapsed: 724.33
train loss: 0.2190 train acc: 0.9117
  dev loss: 0.6638   dev acc: 0.7574
epoch: 3 time elapsed: 968.77
train loss: 0.0927 train acc: 0.9649
  dev loss: 0.9561   dev acc: 0.7357
epoch: 4 time elapsed: 1213.21
train loss: 0.0505 train acc: 0.9813
  dev loss: 1.2827   dev acc: 0.7509


In [0]:
evaluate(base_model, test_corpus)

Predictions:




Avg acc: 0.7756


Creating train, dev and test data objects. (with `bert_format=1`).

In [0]:
train_corpus = STSCorpus(file='train.tsv',
                          cuda=True,
                          batch_size=32, bert_format=1)
dev_corpus = STSCorpus(file='dev.tsv', vocab=train_corpus.vocab,
                        cuda=True,
                        batch_size=32,bert_format=1)
test_corpus = STSCorpus(file='test.tsv', vocab=train_corpus.vocab,
                        cuda=True,
                        batch_size=1,bert_format=1)
print(train_corpus.data_size, dev_corpus.data_size, test_corpus.data_size)

1212 33 0
151 51 0
1213 152 2500


# 2. BERT based Classifier

In [0]:
class BERTClassifier(Classifier):
    def __init__(self,
                 dropout=0.1,
                 max_grad_norm=5.0):
        super().__init__(0, 0, 0, 0, dropout, max_grad_norm)
        self.output = torch.nn.Linear(768, 1)

        torch.nn.init.normal_(self.output.weight, mean=0, std=0.05)

        self.bert_model = BertModel.from_pretrained('distilbert-base-uncased')
        self.optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, self.parameters()), lr=1e-5)

    def predict(self, x1, x2=None):
        assert x2 is None

        this_bert = self.bert_model(x1)

        first_step = this_bert[0]
        so = self.output(first_step)[:, 0]
        out = torch.nn.functional.sigmoid(so)  
        
        pred = out.clone().detach()
        pred[pred >= 0.5] = 1
        pred[pred < 0.5] = 0
        return out, pred

In [0]:
bert_model = BERTClassifier()
bert_model = bert_model.cuda()
print(bert_model, '\ncontains', sum([p.numel() for p in bert_model.parameters() if p.requires_grad]), 'parameters')

BERTClassifier(
  (dropout): Dropout(p=0.1, inplace=False)
  (loss): BCELoss()
  (output): Linear(in_features=768, out_features=1, bias=True)
  (bert_model): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (dropout): Dropout(p=0.1, inplace=False)
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
       

In [0]:
bert_model = train(bert_model, train_corpus, dev_corpus, 3) 



epoch: 0 time elapsed: 144.34
train loss: 0.4085 train acc: 0.8082
  dev loss: 0.3188   dev acc: 0.8612
epoch: 1 time elapsed: 293.70
train loss: 0.2862 train acc: 0.8793
  dev loss: 0.3204   dev acc: 0.8681
epoch: 2 time elapsed: 443.34
train loss: 0.2207 train acc: 0.9113
  dev loss: 0.3201   dev acc: 0.8701


In [0]:
evaluate(bert_model, test_corpus)

Predictions:




Avg acc: 0.8692
