### RNN/CNN-based Natural Language Inference

In [1]:
import glob
import os
import time
from argparse import ArgumentParser, Namespace

import torch
import torch.nn as nn
import torch.optim as optim

from tensorboard import TensorBoard
from torchtext import data, datasets

In [54]:
params.train_file = 'dev.tsv'
params.val_file = 'dev.tsv'
params.word_vectors = 'glove.6B.300d'

In [40]:
params = Namespace()

params.epochs = 10
params.batch_size = 128
params.encoder = 'rnn'
params.d_embed = 300
params.word_vectors = 'fasttext.en.300d'
params.d_hidden = 300
params.d_fc = 100
params.n_layers = 1
params.lr = .001
params.dp_ratio = 0.2
params.gpu = 0
params.train_file = 'snli_train.tsv'
params.val_file = 'snli_val.tsv'
params.log_every = 50
params.dev_every = 1000
params.experiment = 'test'

In [34]:
class CNNEncoder(nn.Module):
    def __init__(self, params):
        super(CNNEncoder, self).__init__()
        self.params = params
        self.conv1 = nn.Conv1d(
            params.d_embed, params.d_hidden, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(
            params.d_hidden, params.d_hidden, kernel_size=3, padding=1)
        self.relu = nn.ReLU()

    def forward(self, x):
        # input is: (length, batch_size, num_channels)
        # conv1d module expects: (batch_size, num_channels, length)
        h0 = x.transpose(0, 1).transpose(1, 2).contiguous()

        h0 = self.relu(self.conv1(h0))
        h0 = self.relu(self.conv2(h0))

        # return (batch_size, num_channels)
        h0 = h0.transpose(1, 2)
        return torch.sum(h0, dim=1)

In [4]:
class RNNEncoder(nn.Module):
    def __init__(self, params):
        super(RNNEncoder, self).__init__()
        self.params = params
        input_size = params.d_embed
        dropout = 0 if params.n_layers == 1 else params.dp_ratio
        self.rnn = nn.GRU(input_size=input_size, hidden_size=params.d_hidden,
                        num_layers=params.n_layers, dropout=dropout,
                        bidirectional=True)

    def forward(self, inputs):
        batch_size = inputs.size()[1]
        state_shape = self.params.n_cells, batch_size, self.params.d_hidden
        h0 =  inputs.new_zeros(state_shape)
        _, ht = self.rnn(inputs, h0)

        # bring batch_size to the 0th dim
        ht = ht[-2:].transpose(0, 1).contiguous()
        # concat forward and backward rnn hidden
        return ht.view(batch_size, -1)

In [5]:
class NLI(nn.Module):

    def __init__(self, params):
        super(NLI, self).__init__()
        
        self.params = params
        self.embed = nn.Embedding(params.n_embed, params.d_embed)
        if params.encoder == 'rnn':
            self.encoder = RNNEncoder(params)
        elif params.encoder == 'cnn':
            self.encoder = CNNEncoder(params)
        else:
            raise ValueError(f'Encoder {params.encoder} is not supported. Try using cnn or rnn.')

        
        self.dropout = nn.Dropout(p=params.dp_ratio)
        self.relu = nn.ReLU()
        
        fc_in_size = params.d_hidden
        # concat s1 and s2
        fc_in_size *= 2
        if params.encoder == 'rnn':
            # concat forward and backward bi-rnn
            fc_in_size *= 2

        fc_ot_size = params.d_fc
        
        # 2-layers fc
        self.out = nn.Sequential(
            nn.Linear(fc_in_size, fc_ot_size),
            self.relu,
            self.dropout,
            nn.Linear(fc_ot_size, params.d_out)
        )

    def forward(self, s1, s2):
        # fix embeddings, do not backprop
        s1_embed = self.embed(s1).detach()
        s2_embed = self.embed(s2).detach()
        
        s1_encode = self.encoder(s1_embed)
        s2_encode = self.encoder(s2_embed)
        
        return self.out(torch.cat([s1_encode, s2_encode], 1))
    

In [6]:
# gpu business
if torch.cuda.is_available():
    torch.cuda.set_device(params.gpu)
    device = torch.device('cuda:{}'.format(params.gpu))
else:
    device = torch.device('cpu')

#### Tensoboard logging

In [7]:
model_dir = f"runs/{params.experiment}/{time.asctime(time.localtime())}/"
tb = TensorBoard(model_dir)

#### Define text felids

In [19]:
# TODO: Other tokenizers?
inputs = data.Field(lower=True, tokenize='spacy')
answers = data.Field(sequential=False, unk_token=None)

train, valid = data.TabularDataset.splits(
    path="data",
    train=params.train_file, validation=params.val_file,
    format='tsv',
    skip_header=True,
    fields=[("sentence1", inputs), ("sentence2", inputs), ("label", answers)])

#### Build vocabulary and load per-trained embeddings

In [20]:
# TODO: Too slow for big dataset. Use n-workers.
inputs.build_vocab(train, valid, vectors=params.word_vectors)
answers.build_vocab(train)

In [11]:
# equivalent of dataloader in torchtext
train_iter, valid_iter = data.BucketIterator.splits(
            (train, valid), 
            batch_size=params.batch_size, 
            sort_key=lambda x: len(x.sentence1),
            device=device)

In [25]:
params.n_embed = len(inputs.vocab)
params.d_out = len(answers.vocab)
# double the number of cells for bidirectional networks
params.n_cells = params.n_layers * 2

#### Load Model

In [35]:
model = NLI(params)
# hack to tie pre-trained vectors to Embedding class
# TODO: find a cleaner way
if params.word_vectors:
    model.embed.weight.data.copy_(inputs.vocab.vectors)
model.to(device)

NLI(
  (embed): Embedding(2640, 100)
  (encoder): RNNEncoder(
    (rnn): GRU(100, 300, bidirectional=True)
  )
  (dropout): Dropout(p=0.2)
  (relu): ReLU()
  (out): Sequential(
    (0): Linear(in_features=1200, out_features=100, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2)
    (3): Linear(in_features=100, out_features=3, bias=True)
  )
)

In [36]:
print("Total number of parameters: {}.".format(sum(p.numel()
                                                   for p in model.parameters() if p.requires_grad)))

Total number of parameters: 1108003.


In [61]:
criterion = nn.CrossEntropyLoss()
opt = optim.Adam(model.parameters(), lr=params.lr)

In [62]:
# global step for tensorboard logging
step = 0
start = time.time()

In [15]:
def training():
    # logging business
    header = '  Time Epoch Iteration Progress    (%Epoch)   Loss       Accuracy'
    print(header)

    global step
    for epoch in range(params.epochs):
        train_iter.init_epoch()
        n_correct, n_total = 0, 0
        for batch_idx, batch in enumerate(train_iter):
            step += 1

            model.train()
            opt.zero_grad()
            answer = model(batch.sentence1, batch.sentence2)
            loss = criterion(answer, batch.label)
            loss.backward()
            opt.step()

            # evaluate performance on validation set periodically
            if step % params.dev_every == 0:
                validation()

            if step % params.log_every == 0:
                # calculate accuracy
                n_correct += (torch.max(answer, 1)[1].view(batch.label.size()) == batch.label).sum().item()
                n_total += batch.batch_size
                accuracy = 100. * n_correct/n_total

                # print progress message
                log_template = ' '.join('{:>6.0f},{:>5.0f},{:>9.0f},{:>5.0f}/{:<5.0f} {:>7.0f}%,{:>8.6f},{:12.4f}'.split(','))
                print(log_template.format(time.time()-start, epoch, step, 1+batch_idx, len(train_iter),
                        100. * (1+batch_idx) / len(train_iter), loss.item(), accuracy))
                
                if tb is not None:
                    tb.scalar_summary("train/loss", loss.item(), step)
                    tb.scalar_summary("train/accuracy", accuracy, step)


In [48]:
def validation():
    global step
    # switch model to evaluation mode
    model.eval()
    valid_iter.init_epoch()

    # calculate accuracy on validation set
    n_valid_correct, valid_loss = 0, 0
    with torch.no_grad():
        for _, valid_batch in enumerate(valid_iter):
            answer = model(valid_batch.sentence1, valid_batch.sentence2)
            n_valid_correct += (torch.max(answer, 1)[1].view(valid_batch.label.size()) == valid_batch.label).sum().item()
            valid_loss = criterion(answer, valid_batch.label)
    valid_acc = 100. * n_valid_correct / len(valid)

    valid_log_template = 'Validation Loss: {:>8.6f}, Accuracy: {:12.4f}'
    print(valid_log_template.format(valid_loss.item(), valid_acc))

    if tb is not None:
        tb.scalar_summary("validation/loss", valid_loss.item(), step)
        tb.scalar_summary("validation/accuracy", valid_acc, step)


In [32]:
p.numel?

In [33]:
sum(p.numel() for p in model.parameters() if p.requires_grad)

1108003

In [41]:
print(params)

Namespace(batch_size=128, d_embed=300, d_fc=100, d_hidden=300, dev_every=1000, dp_ratio=0.2, encoder='rnn', epochs=10, experiment='test', gpu=0, log_every=50, lr=0.001, n_layers=1, train_file='snli_train.tsv', val_file='snli_val.tsv', word_vectors='fasttext.en.300d')


In [None]:
training()

  Time Epoch Iteration Progress    (%Epoch)   Loss       Accuracy
    51     6        50     2/8          25% 1.031085      48.4375
    85    12       100     4/8          50% 0.781582      67.1875
   118    18       150     6/8          75% 0.536626      81.2500
   150    24       200     8/8         100% 0.208219      93.7500
   183    31       250     2/8          25% 0.022324     100.0000
   215    37       300     4/8          50% 0.003283     100.0000


### The Multi-Genre NLI

#### Split MNLI

Split MNLI into 5 different files. Then use these files as validation files.

In [None]:
df = pd.read_csv("mnli_val.tsv", sep='\t')

for k, v in df.groupby("genre"):
    v[["sentence1", "sentence2", "label"]].to_csv(f"mnli_{k}.tsv", sep='\t', index=None)

In [46]:
params.val_file = 'mnli_telephone.tsv'

In [51]:
# TODO: Other tokenizers?
inputs = data.Field(lower=True, tokenize='spacy')
answers = data.Field(sequential=False, unk_token=None)

train, valid = data.TabularDataset.splits(
    path="data",
    train=params.train_file, validation=params.val_file,
    format='tsv',
    skip_header=True,
    fields=[("sentence1", inputs), ("sentence2", inputs), ("label", answers)])

In [55]:
# TODO: Too slow for big dataset. Use n-workers.
inputs.build_vocab(train, valid, vectors=params.word_vectors)
answers.build_vocab(train)

100%|█████████▉| 398811/400000 [00:40<00:00, 13099.01it/s]

In [58]:
# equivalent of dataloader in torchtext
train_iter, valid_iter = data.BucketIterator.splits(
            (train, valid), 
            batch_size=params.batch_size, 
            sort_key=lambda x: len(x.sentence1),
            device=device)

In [59]:
params.n_embed = len(inputs.vocab)
params.d_out = len(answers.vocab)
# double the number of cells for bidirectional networks
params.n_cells = params.n_layers * 2

In [64]:
validation()

Validation Loss: 1.098440, Accuracy: 44.6326
