In [45]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import pickle as pkl
import random
import io
import os
# One seed for every random number generator this notebook uses (Python's
# `random`, NumPy and PyTorch), so a fresh "Restart & Run All" is repeatable.
SEED = 1337
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Reserved vocabulary ids: PAD_IDX fills padded positions, UNK_IDX replaces
# tokens that are missing from the vocabulary.
PAD_IDX = 0
UNK_IDX = 1
BATCH_SIZE = 32      # samples per DataLoader batch
VOCAB_SIZE = 10000   # number of most frequent training tokens kept

In [2]:
# Load the raw train/validation splits from tab-separated files under data/.
train_data = pd.read_csv('data/snli_train.tsv', sep='\t')
val_data = pd.read_csv('data/snli_val.tsv', sep='\t')

In [3]:
# Preview the first rows of the raw training frame.
train_data.head()

Unnamed: 0,sentence1,sentence2,label
0,A young girl in a pink shirt sitting on a dock...,A young girl watching the sunset over the water .,neutral
1,A woman is smiling while the man next to her i...,Two people are next to each other .,entailment
2,"Across the river , you can see a large building .",The large building is full of apartments and t...,neutral
3,a man in white shorts and a black shirt is par...,A man is riding a jetski on the ocean .,contradiction
4,Four black dogs run together on bright green g...,Four dogs are preparing to be launched into sp...,contradiction


In [4]:
# (number of rows, number of columns) of the training frame.
train_data.shape

(100000, 3)

In [5]:
# NOTE(review): same preview as the earlier `train_data.head()` cell; the frame
# is not modified in between, so this cell looks redundant.
train_data.head()

Unnamed: 0,sentence1,sentence2,label
0,A young girl in a pink shirt sitting on a dock...,A young girl watching the sunset over the water .,neutral
1,A woman is smiling while the man next to her i...,Two people are next to each other .,entailment
2,"Across the river , you can see a large building .",The large building is full of apartments and t...,neutral
3,a man in white shorts and a black shirt is par...,A man is riding a jetski on the ocean .,contradiction
4,Four black dogs run together on bright green g...,Four dogs are preparing to be launched into sp...,contradiction


In [6]:
def prepare_data(data):
    """
    Encode the labels as integers and split both sentences into token lists.

    The input frame is left untouched: the work is done on a copy, so calling
    this function twice on the same raw frame gives the same result.

    @param data: DataFrame with string columns 'sentence1', 'sentence2' and 'label'
    @return: new DataFrame where 'label' is 0 (neutral), 1 (contradiction) or
             2 (entailment) and both sentence columns hold lists of
             whitespace-separated tokens. Labels outside the mapping become NaN.
    """
    label_to_id = {'neutral': 0, 'contradiction': 1, 'entailment': 2}
    prepared = data.copy()
    prepared['label'] = prepared['label'].map(label_to_id)
    for column in ('sentence1', 'sentence2'):
        prepared[column] = prepared[column].str.split()
    return prepared

In [7]:
# Encode labels and tokenize sentences for both splits.
# NOTE(review): the names are rebound to the processed frames, so re-running this
# cell passes already-processed data to prepare_data again — confirm that is safe.
train_data = prepare_data(train_data)
val_data = prepare_data(val_data)

In [8]:
# Preview the processed training frame (token lists and integer labels).
train_data.head()

Unnamed: 0,sentence1,sentence2,label
0,"[A, young, girl, in, a, pink, shirt, sitting, ...","[A, young, girl, watching, the, sunset, over, ...",0
1,"[A, woman, is, smiling, while, the, man, next,...","[Two, people, are, next, to, each, other, .]",2
2,"[Across, the, river, ,, you, can, see, a, larg...","[The, large, building, is, full, of, apartment...",0
3,"[a, man, in, white, shorts, and, a, black, shi...","[A, man, is, riding, a, jetski, on, the, ocean...",1
4,"[Four, black, dogs, run, together, on, bright,...","[Four, dogs, are, preparing, to, be, launched,...",1


In [9]:
# Preview the processed validation frame (token lists and integer labels).
val_data.head()

Unnamed: 0,sentence1,sentence2,label
0,"[Three, women, on, a, stage, ,, one, wearing, ...","[There, are, two, women, standing, on, the, st...",1
1,"[Four, people, sit, on, a, subway, two, read, ...","[Multiple, people, are, on, a, subway, togethe...",2
2,"[bicycles, stationed, while, a, group, of, peo...","[People, get, together, near, a, stand, of, bi...",2
3,"[Man, in, overalls, with, two, horses, .]","[a, man, in, overalls, with, two, horses]",2
4,"[Man, observes, a, wavelength, given, off, by,...","[The, man, is, examining, what, wavelength, is...",2


In [10]:
# Summary statistics of the number of tokens per entry in 'sentence1'.
train_data['sentence1'].str.len().describe()

count    100000.000000
mean         14.038630
std           6.009452
min           2.000000
25%          10.000000
50%          13.000000
75%          17.000000
max          82.000000
Name: sentence1, dtype: float64

In [11]:
# Summary statistics of the number of tokens per entry in 'sentence2'.
train_data['sentence2'].str.len().describe()

count    100000.000000
mean          8.242020
std           3.202772
min           1.000000
25%           6.000000
50%           8.000000
75%          10.000000
max          41.000000
Name: sentence2, dtype: float64

In [12]:
def prepare_stopwords(stop_words=None):
    """
    Build a stop-word list that keeps negation words out of it.

    Negations ("not", "never", "isn't", ...) are never treated as stop words,
    so they survive any later stop-word filtering.

    @param stop_words: optional iterable of candidate stop words. When omitted,
                       the module-level STOP_WORDS collection is used; it has to
                       be defined or imported elsewhere in the notebook.
    @return: list of the candidates that are not negations, in the iteration
             order of the input
    """
    # A frozenset makes each membership test O(1) instead of scanning a list.
    negations = frozenset([
        "aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
        "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
        "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
        "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
        "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere", "no",
        "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
        "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
        "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite",
    ])

    candidates = STOP_WORDS if stop_words is None else stop_words
    return [word for word in candidates if word not in negations]

In [13]:
def clean(tokens, stopwords, punctuations):
    """
    Normalise tokens and drop stop words and punctuation.

    Each token is lower-cased, stripped and passed through `unidecode`. A token
    is dropped when either its raw form or its normalised form is a stop word
    or a punctuation mark; checking the normalised form is what removes
    capitalised stop words such as "The".

    @param tokens: iterable of string tokens
    @param stopwords: container of stop words to remove
    @param punctuations: container of punctuation tokens to remove
    @return: list of normalised tokens that passed both filters
    """
    cleaned = []
    for raw in tokens:
        if raw in stopwords or raw in punctuations:
            continue
        normalised = unidecode(raw.lower().strip())
        # Filter again after normalisation, otherwise "The" slips past a
        # stop-word list that only contains "the".
        if normalised in stopwords or normalised in punctuations:
            continue
        cleaned.append(normalised)
    return cleaned

In [14]:
def load_vectors(f_name, vocabulary=None):
    """
    Read word vectors from a text file into a dictionary.

    The first line must contain two integers (presumably the number of words
    and the vector dimension — they are parsed but not used). Every following
    line is "<token> <float> <float> ...", separated by single spaces.

    @param f_name: path of the vector file
    @param vocabulary: optional container of tokens; when given and non-empty,
                       only vectors for tokens it contains are kept
    @return: dict mapping token -> list of floats
    """
    vectors_by_token = {}
    # `with` closes the file even when parsing raises part-way through.
    with io.open(f_name, 'r', encoding='utf-8', newline='\n', errors='ignore') as f_in:
        # Consume the header; a malformed header still raises ValueError.
        _n_words, _dim = map(int, f_in.readline().split())
        for line in f_in:
            tokens = line.rstrip().split(' ')
            if not vocabulary or tokens[0] in vocabulary:
                vectors_by_token[tokens[0]] = [float(value) for value in tokens[1:]]
    return vectors_by_token

In [24]:
def build_vocabulary(train_data, vocab_size=10000):
    '''
    Build token <-> id lookup tables from the most frequent training tokens.

    @param train_data: DataFrame whose 'sentence1' and 'sentence2' columns hold
                       lists of tokens
    @param vocab_size: number of most frequent tokens to keep; None keeps all

    Returns (in this order):
    token2id: dictionary where keys represent tokens and corresponding values represent indices
    id2token: list of tokens, where id2token[i] returns token that corresponds to token i

    Ids PAD_IDX and UNK_IDX are reserved for '<pad>' and '<unk>'; real tokens
    start at id 2. Empty input yields tables holding only the two special tokens.
    '''
    token_counts = Counter()
    # Count row by row (sentence1 first, then sentence2) without building one
    # giant token list; this also avoids Series.iteritems(), which newer pandas
    # releases no longer provide.
    for sent1, sent2 in zip(train_data['sentence1'], train_data['sentence2']):
        token_counts.update(sent1)
        token_counts.update(sent2)

    vocabulary = [token for token, _ in token_counts.most_common(vocab_size)]
    token2id = dict(zip(vocabulary, range(2, 2 + len(vocabulary))))
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    id2token = ['<pad>', '<unk>'] + vocabulary
    return token2id, id2token

def get_max_sentence_length(train_data):
    """
    Return the largest number of tokens found in either sentence column.

    @param train_data: DataFrame whose 'sentence1' and 'sentence2' columns hold
                       token lists
    @return: the larger of the two per-column maximum lengths
    """
    longest_per_column = [
        train_data[column].str.len().max()
        for column in ('sentence1', 'sentence2')
    ]
    return max(longest_per_column)

def convert_to_chars(data):
    """
    Turn the first element of every sample into a list of its items.

    @param data: iterable of samples; sample[0] is an iterable (e.g. a string)
                 and sample[1] is carried over unchanged
    @return: list of (list(sample[0]), sample[1]) tuples
    """
    converted = []
    for sample in data:
        converted.append((list(sample[0]), sample[1]))
    return converted

def read_and_preprocess_data(dataset='train', vocab_size=None):
    """
    Read one split from data/snli_<dataset>.tsv and run prepare_data on it.

    @param dataset: split name inserted into the file name
    @param vocab_size: forwarded to build_vocabulary (only used for 'train')
    @return: for 'train' a tuple (data, token2id, id2token); for any other
             split just the prepared DataFrame
    """
    path = 'data/snli_{}.tsv'.format(dataset)
    data = prepare_data(pd.read_csv(path, sep='\t'))
    if dataset != 'train':
        return data
    # Only the training split defines the vocabulary.
    token2id, id2token = build_vocabulary(data, vocab_size)
    return data, token2id, id2token

In [25]:
# Longest sentence (in tokens) over both sentence columns of the training frame;
# later cells use it as the truncation and padding length.
MAX_SENT_LENGTH = get_max_sentence_length(train_data)

In [26]:
# Read both splits from disk again and build the vocabulary from the training split.
# NOTE(review): this rebinds train_data / val_data, replacing the frames that
# earlier cells loaded and prepared.
train_data, token2id, id2token = read_and_preprocess_data('train', VOCAB_SIZE)
val_data = read_and_preprocess_data('val')

In [35]:
# Check the dictionary by loading a random token from it.
# np.random.randint excludes its upper bound, so passing len(id2token) lets
# every id (including the last one) be drawn.
random_token_id = np.random.randint(len(id2token))
random_token = id2token[random_token_id]
print("Token id: {}; Token: {}".format(random_token_id, random_token))
print("Token: {}; Token id: {}".format(random_token, token2id[random_token]))

Token id: 3533; Token: carved
Token: carved; Token id: 3533


In [36]:
# Load word vectors from the .vec file into a {token: list of floats} dict.
# No vocabulary filter is passed, so every vector in the file is kept in memory.
vectors = load_vectors('data/wiki-news-300d-1M.vec')

In [44]:
class VocabDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, data_tuple, token2id, vectors=None):
        """
        @param data_tuple: iterable of (token_list, target) pairs. A DataFrame
                           cannot be passed directly, because iterating a
                           DataFrame yields its column names.
        @param token2id: dict mapping token -> integer id
        @param vectors: optional dict mapping token -> word vector; when omitted,
                        the module-level `vectors` object is used if it exists
        """
        samples = list(data_tuple)
        if samples:
            self.data_list, self.target_list = zip(*samples)
        else:
            # zip(*[]) yields nothing to unpack, so handle empty data explicitly.
            self.data_list, self.target_list = (), ()
        self.token2id = token2id
        if vectors is None:
            vectors = globals().get('vectors')
        self.vectors = vectors

        # A private generator gives the same vector as seeding the global NumPy
        # RNG with 1337 would, without resetting the global random stream
        # every time a dataset is created.
        self.UNK_VEC = np.random.RandomState(1337).randn(300)

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]

        @return: [token ids (truncated to MAX_SENT_LENGTH), number of ids, target]
        """
        tokens = self.data_list[key][:MAX_SENT_LENGTH]
        # Tokens that are not in the vocabulary map to UNK_IDX.
        word_idx = [self.token2id.get(token, UNK_IDX) for token in tokens]
        label = self.target_list[key]
        return [word_idx, len(word_idx), label]

def vocab_collate_func(batch):
    """
    Customized function for DataLoader that pads every sequence in the batch
    to MAX_SENT_LENGTH with PAD_IDX and orders the batch by decreasing length.

    @param batch: list of [token ids, length, label] items
    @return: [data, lengths, labels] as int64 tensors; data has shape
             (batch size, MAX_SENT_LENGTH)
    """
    lengths = np.array([datum[1] for datum in batch], dtype=np.int64)
    labels = np.array([datum[2] for datum in batch], dtype=np.int64)

    # Pre-filling an int64 matrix keeps the dtype integral even when a sequence
    # is empty (np.array([]) alone would be float64 and upcast the whole batch).
    padded = np.full((len(batch), MAX_SENT_LENGTH), PAD_IDX, dtype=np.int64)
    for row, datum in enumerate(batch):
        padded[row, :datum[1]] = datum[0]

    # Longest sequence first.
    ind_dec_order = np.argsort(lengths)[::-1]
    return [torch.from_numpy(padded[ind_dec_order]),
            torch.from_numpy(lengths[ind_dec_order]),
            torch.from_numpy(labels[ind_dec_order])]

In [None]:
# Build train, valid and test dataloaders
# NOTE(review): VocabDataset.__init__ unpacks its first argument with
# zip(*data_tuple), i.e. it expects an iterable of (tokens, label) pairs, while
# train_data / val_data are DataFrames here — confirm a conversion step is not missing.
# NOTE(review): test_data is not assigned in any cell visible in this notebook —
# confirm where it is loaded.

train_dataset = VocabDataset(train_data, token2id)
train_loader = DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)

val_dataset = VocabDataset(val_data, token2id)
val_loader = DataLoader(dataset=val_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)

test_dataset = VocabDataset(test_data, token2id)
test_loader = DataLoader(dataset=test_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=False)

### Now let's implement a basic Recurrent Neural Net model

In [None]:
class RNN(nn.Module):
    def __init__(self, emb_size, hidden_size, num_layers, num_classes, vocab_size):
        # RNN Accepts the following hyperparams:
        # emb_size: Embedding Size
        # hidden_size: Hidden Size of layer in RNN
        # num_layers: number of layers in RNN
        # num_classes: number of output classes
        # vocab_size: vocabulary size
        super(RNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_IDX)
        self.rnn = nn.RNN(emb_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, num_classes)

    def init_hidden(self, batch_size, device=None):
        # Function initializes the activation of recurrent neural net at timestep 0
        # Needs to be in format (num_layers, batch_size, hidden_size)
        # device: where to create the state (None = PyTorch's default device)
        # NOTE(review): the state is drawn from a normal distribution on every
        # call, so two forward passes over the same batch differ, also in eval
        # mode — confirm this is intended.
        return torch.randn(self.num_layers, batch_size, self.hidden_size, device=device)

    def forward(self, x, lengths):
        # x: (batch_size, seq_len) token ids, ordered by decreasing length
        # lengths: true length of every sequence in x
        # returns: (batch_size, num_classes) logits
        batch_size, seq_len = x.size()

        # reset hidden state, created on the same device as the input
        self.hidden = self.init_hidden(batch_size, device=x.device)

        # get embedding of tokens
        embed = self.embedding(x)
        # pack padded sequence; the lengths have to live on the CPU
        embed = torch.nn.utils.rnn.pack_padded_sequence(embed, lengths.cpu(), batch_first=True)
        # fprop though RNN
        rnn_out, self.hidden = self.rnn(embed, self.hidden)
        # undo packing
        rnn_out, _ = torch.nn.utils.rnn.pad_packed_sequence(rnn_out, batch_first=True)
        # sum hidden activations of RNN across time
        rnn_out = torch.sum(rnn_out, dim=1)

        logits = self.linear(rnn_out)
        return logits

In [None]:
class CNN(nn.Module):
    # NOTE(review): despite its name, this class builds the same recurrent
    # architecture (embedding -> nn.RNN -> linear) as the RNN class. A class
    # named CNN is defined again later in this notebook, which rebinds the name.
    def __init__(self, emb_size, hidden_size, num_layers, num_classes, vocab_size):
        # Accepts the following hyperparams:
        # emb_size: Embedding Size
        # hidden_size: Hidden Size of layer in RNN
        # num_layers: number of layers in RNN
        # num_classes: number of output classes
        # vocab_size: vocabulary size
        super(CNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_IDX)
        self.rnn = nn.RNN(emb_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, num_classes)

    def init_hidden(self, batch_size, device=None):
        # Function initializes the activation of recurrent neural net at timestep 0
        # Needs to be in format (num_layers, batch_size, hidden_size)
        # device: where to create the state (None = PyTorch's default device)
        return torch.randn(self.num_layers, batch_size, self.hidden_size, device=device)

    def forward(self, x, lengths):
        # x: (batch_size, seq_len) token ids, ordered by decreasing length
        # lengths: true length of every sequence in x
        # returns: (batch_size, num_classes) logits
        batch_size, seq_len = x.size()

        # reset hidden state, created on the same device as the input
        self.hidden = self.init_hidden(batch_size, device=x.device)

        # get embedding of tokens
        embed = self.embedding(x)
        # pack padded sequence; the lengths have to live on the CPU
        embed = torch.nn.utils.rnn.pack_padded_sequence(embed, lengths.cpu(), batch_first=True)
        # fprop though RNN
        rnn_out, self.hidden = self.rnn(embed, self.hidden)
        # undo packing
        rnn_out, _ = torch.nn.utils.rnn.pad_packed_sequence(rnn_out, batch_first=True)
        # sum hidden activations of RNN across time
        rnn_out = torch.sum(rnn_out, dim=1)

        logits = self.linear(rnn_out)
        return logits

## Important things to keep in mind when using variable sized sequences in RNN in Pytorch

### RNN modules accept packed sequences as inputs
* pack_padded_sequence function packs a sequence (in Tensor format) containing padded sequences of variable length. **IMPORTANT: the sequences should be sorted by length in a decreasing order before passing to this function**

* pad_packed_sequence function is the inverse operation of pack_padded_sequence. It transforms a packed sequence back into a padded tensor and also returns the length of each sequence in the batch

In [None]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, lengths_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, lengths_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)


# The vocabulary list built earlier is `id2token` (`id2char` is not defined in
# any visible cell), and prepare_data maps the labels to three ids (0, 1, 2).
model = RNN(emb_size=100, hidden_size=200, num_layers=2, num_classes=3, vocab_size=len(id2token))

learning_rate = 3e-4
num_epochs = 10 # number epoch to train

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
total_step = len(train_loader)

for epoch in range(num_epochs):
    for i, (data, lengths, labels) in enumerate(train_loader):
        # test_model switches the model to eval mode, so re-enable train mode
        # on every step
        model.train()
        optimizer.zero_grad()
        # Forward pass
        outputs = model(data, lengths)
        loss = criterion(outputs, labels)

        # Backward and optimize
        loss.backward()
        optimizer.step()
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc = test_model(val_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                       epoch+1, num_epochs, i+1, total_step, val_acc))

## Exercise 1:
### Implement LSTM cell instead of RNN cell. Train the model and compare the results.
### Hint (modify init_hidden function and cell in __init__) 

## Exercise 2:
### Implement Bidirectional LSTM. You can do it very easily by adding one argument to cell when you create it.
### For better understanding we recommend that you implement it yourself by reversing a sequence and passing it to another cell.

## Exercise 3:

### Add max-pooling (over time) after passing through RNN instead of summing over hidden layers through time

In [None]:
class CNN(nn.Module):
    def __init__(self, emb_size, hidden_size, num_layers, num_classes, vocab_size):
        # CNN accepts the following hyperparams:
        # emb_size: Embedding Size
        # hidden_size: number of channels produced by each convolution
        # num_layers: stored on the module; the network always has two conv layers
        # num_classes: number of output classes
        # vocab_size: vocabulary size
        super(CNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_IDX)

        self.conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1)

        self.linear = nn.Linear(hidden_size, num_classes)

    def forward(self, x, lengths):
        # x: (batch_size, seq_len) token ids
        # lengths: true length of every sequence in x, or None to treat every
        #          position as a real token
        # returns: (batch_size, num_classes) logits
        batch_size, seq_len = x.size()

        embed = self.embedding(x)

        # (batch_size, seq_len, 1) mask: 1 for real tokens, 0 for padding.
        # Without it the conv bias makes padded positions non-zero after ReLU,
        # so the summed representation would depend on how much padding a
        # sequence received.
        if lengths is None:
            mask = None
        else:
            steps = torch.arange(seq_len, device=x.device).unsqueeze(0)
            mask = (steps < lengths.to(x.device).unsqueeze(1)).unsqueeze(-1).to(embed.dtype)

        hidden = embed
        for conv in (self.conv1, self.conv2):
            # Conv1d wants (batch, channels, length); ReLU is element-wise, so
            # no reshape to 2D is needed around it.
            hidden = F.relu(conv(hidden.transpose(1, 2))).transpose(1, 2)
            if mask is not None:
                hidden = hidden * mask

        # sum hidden activations across time
        hidden = torch.sum(hidden, dim=1)
        logits = self.linear(hidden)
        return logits

## Important things to keep in mind when using Convolutional Nets for Language Tasks in Pytorch

### Conv1d module expects input of size (batch_size, num_channels, length), whereas in our case the input has size (batch_size, length, num_channels). Hence it is important to call transpose(1,2) before passing it to the convolutional layer and then to reshape it back to (batch_size, length, num_channels) by calling transpose(1,2) again

### Additionally, the hidden activations can be reshaped into a 2D tensor before the ReLU layer by calling view(-1, hidden.size(-1)); note that ReLU is applied element-wise, so this reshape is optional rather than required

In [None]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, lengths_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, lengths_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)


# The vocabulary list built earlier is `id2token` (`id2char` is not defined in
# any visible cell), and prepare_data maps the labels to three ids (0, 1, 2).
model = CNN(emb_size=100, hidden_size=200, num_layers=2, num_classes=3, vocab_size=len(id2token))

learning_rate = 3e-4
num_epochs = 10 # number epoch to train

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
total_step = len(train_loader)

for epoch in range(num_epochs):
    for i, (data, lengths, labels) in enumerate(train_loader):
        # test_model switches the model to eval mode, so re-enable train mode
        # on every step
        model.train()
        optimizer.zero_grad()
        # Forward pass
        outputs = model(data, lengths)
        loss = criterion(outputs, labels)

        # Backward and optimize
        loss.backward()
        optimizer.step()
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc = test_model(val_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                       epoch+1, num_epochs, i+1, total_step, val_acc))


## Exercise 4:
### Implement Gated Relu activations as well as Gated Linear activations and compare them with Relu (reference: https://arxiv.org/pdf/1612.08083.pdf )
### Hint: Gated Relu activations are sigmoid(conv1_1(x)) * relu(conv1_2(x))
### Hint: Gated Linear activations are sigmoid(conv1_1(x)) * conv1_2(x)

### Feel free to play with other variants of gating


## Exercise 5:

### Add max-pooling (over time) after passing through conv as well as add non-linear fully connected layer