In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import pickle
import random
import io
import os
from unidecode import unidecode
from sklearn.model_selection import train_test_split
random.seed(1337)

DEVICE = 'cuda:1' if torch.cuda.is_available() else 'cpu'

PAD_IDX = 0
UNK_IDX = 1

BATCH_SIZE = 32
VOCAB_SIZE = 50000
NUM_CLASSES = 3
NUM_LAYERS = 1
BIDIRECTIONAL = True
NUM_DIRECTIONS = 2 if BIDIRECTIONAL else 1
EMB_HIDDEN_SIZE, CLASS_HODDEN_SIZE = 256, 512

LR = 3e-4
N_EPOCHS = 10

In [2]:
train_data = pd.read_csv('data/snli_train.tsv', sep='\t')
val_data = pd.read_csv('data/snli_val.tsv', sep='\t')

In [3]:
data_dict = {
    'train': train_data,
    'val': val_data
}

In [4]:
train_data.head()

Unnamed: 0,sentence1,sentence2,label
0,A young girl in a pink shirt sitting on a dock...,A young girl watching the sunset over the water .,neutral
1,A woman is smiling while the man next to her i...,Two people are next to each other .,entailment
2,"Across the river , you can see a large building .",The large building is full of apartments and t...,neutral
3,a man in white shorts and a black shirt is par...,A man is riding a jetski on the ocean .,contradiction
4,Four black dogs run together on bright green g...,Four dogs are preparing to be launched into sp...,contradiction


In [5]:
train_data.shape

(100000, 3)

In [6]:
train_data.head()

Unnamed: 0,sentence1,sentence2,label
0,A young girl in a pink shirt sitting on a dock...,A young girl watching the sunset over the water .,neutral
1,A woman is smiling while the man next to her i...,Two people are next to each other .,entailment
2,"Across the river , you can see a large building .",The large building is full of apartments and t...,neutral
3,a man in white shorts and a black shirt is par...,A man is riding a jetski on the ocean .,contradiction
4,Four black dogs run together on bright green g...,Four dogs are preparing to be launched into sp...,contradiction


In [7]:
def prepare_data(data):
    data['label'] = data['label'].map({'contradiction': 0, 'neutral': 1, 'entailment': 2})
    data['sentence1'] = data['sentence1'].str.split()
    data['sentence2'] = data['sentence2'].str.split()
    return data

In [8]:
def load_vectors(f_name, vocabulary):
    f_in = io.open(f_name, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, f_in.readline().split())
    vectors = {}
    for line in f_in:
        tokens = line.strip().split(' ')
        if tokens[0] in vocabulary:
            vectors[tokens[0]] = list(map(float, tokens[1:]))
    return vectors

In [9]:
def build_vocabulary(train_data, vocab_size):
    '''
    Returns:
    id2token: list of tokens, where id2token[i] returns token that corresponds to token i
    token2id: dictionary where keys represent tokens and corresponding values represent indices
    '''
    print('Building vocabulary... ', end='', flush=True)
    all_tokens = []
    for row in (train_data['sentence1']+train_data['sentence2']).iteritems():
        all_tokens += row[1]
    vocabulary, count = zip(*Counter(all_tokens).most_common(vocab_size))
    print('Done.')
    print('Loading vectors... ', end='', flush=True)
    vectors = load_vectors('data/wiki-news-300d-1M.vec', vocabulary)
    vocabulary = [word for word in vocabulary if word in vectors]
    print('Done.')
    id2token = list(vocabulary)
    token2id = dict(zip(vocabulary, range(2, 2+len(vocabulary))))
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token, vectors

def read_and_preprocess_data(data_dict, dataset, vocab_size=50000):
    data = prepare_data(data_dict[dataset])
    if dataset == 'train':
        token2id, id2token, vectors = build_vocabulary(data, vocab_size)
        return data, token2id, id2token, vectors
    return data

In [10]:
train_data, token2id, id2token, vectors = read_and_preprocess_data(data_dict, 'train', VOCAB_SIZE)
val_data = read_and_preprocess_data(data_dict, 'val')

pickle.dump(vectors, open('data/vectors.pkl', 'wb'))
pickle.dump(token2id, open('data/token2id.pkl', 'wb'))
pickle.dump(id2token, open('data/id2token.pkl', 'wb'))

Building vocabulary... Done.
Loading vectors... Done.


In [None]:
train_data = prepare_data(train_data)
val_data = prepare_data(val_data)
vectors = pickle.load(open('data/vectors.pkl', 'rb'))
id2token = pickle.load(open('data/id2token.pkl', 'rb'))
token2id = pickle.load(open('data/token2id.pkl', 'rb'))

In [11]:
len(token2id)

22059

In [12]:
# Check the dictionary by loading random token from it
random_token_id = np.random.randint(0, len(id2token)-1)
random_token = id2token[random_token_id]
print("Token id: {}; Token: {}".format(random_token_id, id2token[random_token_id]))
print("Token: {}; Token id: {}".format(random_token, token2id[random_token]))

Token id: 6038; Token: applied
Token: applied; Token id: 6038


In [15]:
class SNLIDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, data, token2id):
        """
        @param data_list: list of character
        @param target_list: list of targets

        """
        self.x1, self.x2, self.y = data['sentence1'].values, data['sentence2'].values, data['label'].values
        assert (len(self.x1) == len(self.x2) == len(self.y))
        self.token2id = token2id

    def __len__(self):
        return len(self.y)

    def __getitem__(self, row):
        """
        Triggered when you call dataset[i]
        """
        label = self.y[row]
        x1_word_idx, x2_word_idx = [], []
        x1_mask, x2_mask = [], []
        
        for word in self.x1[row][:MAX_SENT_LENGTH]:
            if word in self.token2id.keys():
                x1_word_idx.append(self.token2id[word])
                x1_mask.append(0)
            else:
                x1_word_idx.append(UNK_IDX)
                x1_mask.append(1)
                
        for word in self.x2[row][:MAX_SENT_LENGTH]:
            if word in self.token2id.keys():
                x2_word_idx.append(self.token2id[word])
                x2_mask.append(0)
            else:
                x2_word_idx.append(UNK_IDX)
                x2_mask.append(1)
        
        x1_list = [x1_word_idx, x1_mask, len(x1_word_idx)]
        x2_list = [x2_word_idx, x2_mask, len(x2_word_idx)]
        return x1_list + x2_list + [label]

def snli_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    x1_data, x2_data = [], []
    x1_mask, x2_mask = [], []
    x1_lengths, x2_lengths = [], []
    labels = []

    for datum in batch:
        x1_lengths.append(datum[2])
        x2_lengths.append(datum[5])
        labels.append(datum[6])
        
        # Padding
        x1_data_padded = np.pad(np.array(datum[0]),
                                pad_width=((0, MAX_SENT_LENGTH-datum[2])),
                                mode="constant", constant_values=0)
        x1_data.append(x1_data_padded)
        
        x1_mask_padded = np.pad(np.array(datum[1]),
                                pad_width=((0, MAX_SENT_LENGTH-datum[2])),
                                mode="constant", constant_values=0)
        x1_mask.append(x1_mask_padded)
        
        x2_data_padded = np.pad(np.array(datum[3]),
                                pad_width=((0, MAX_SENT_LENGTH-datum[5])),
                                mode="constant", constant_values=0)
        x2_data.append(x2_data_padded)
        
        x2_mask_padded = np.pad(np.array(datum[4]),
                               pad_width=((0, MAX_SENT_LENGTH-datum[5])),
                               mode="constant", constant_values=0)
        x2_mask.append(x2_mask_padded)
        
    ind_dec_order = np.argsort(x1_lengths)[::-1]
    x1_data = np.array(x1_data)[ind_dec_order]
    x2_data = np.array(x2_data)[ind_dec_order]
    
    x1_mask = np.array(x1_mask)[ind_dec_order].reshape(len(batch), -1, 1)
    x2_mask = np.array(x2_mask)[ind_dec_order].reshape(len(batch), -1, 1)
    
    x1_lengths = np.array(x1_lengths)[ind_dec_order]
    x2_lengths = np.array(x2_lengths)[ind_dec_order]
    
    labels = np.array(labels)[ind_dec_order]
    
    x1_list = [torch.from_numpy(x1_data), torch.from_numpy(x1_mask).float(), x1_lengths]
    x2_list = [torch.from_numpy(x2_data), torch.from_numpy(x2_mask).float(), x2_lengths]
        
    return x1_list + x2_list + [torch.from_numpy(labels)]

In [13]:
def get_max_sentence_length(train_data, q=0.95):
    max_sent1_len = train_data['sentence1'].str.len().quantile(q)
    max_sent2_len = train_data['sentence2'].str.len().quantile(q)
    return int(max(max_sent1_len, max_sent2_len))

In [14]:
MAX_SENT_LENGTH = get_max_sentence_length(train_data)

In [16]:
# Build train, valid and test dataloaders

train_dataset = SNLIDataset(train_data, token2id)
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=BATCH_SIZE,
                          collate_fn=snli_collate_func,
                          shuffle=True)

val_dataset = SNLIDataset(val_data, token2id)
val_loader = DataLoader(dataset=val_dataset,
                        batch_size=BATCH_SIZE,
                        collate_fn=snli_collate_func,
                        shuffle=False)

In [17]:
# Initialise weights matrix
def init_embedding_weights(vectors, token2id, id2token):
    weights = np.zeros((len(token2id), 300))
    for idx in range(2, len(vectors)):
        weights[idx] = np.array(vectors[id2token[idx]])
    np.random.seed(1337)
    weights[1] = np.random.randn(300)
    return weights

In [18]:
class GRUEncoder(nn.Module):
    def __init__(self, hidden_size, num_layers, weights, vocab_size, bidirectional=False):
        '''
        params:
            hidden_size: hidden Size of layer in GRU
            num_layers: number of layers in GRU
            output_size: dimension of output
            vocab_size: vocabulary size
            bidirectional: use bidirectional GRU
        '''
        super(GRUEncoder, self).__init__()
        
        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = nn.Embedding(vocab_size, 300, padding_idx=PAD_IDX)
        self.gru = nn.GRU(300, hidden_size, num_layers, batch_first=True, bidirectional=bidirectional)
        self.num_directions = 2 if bidirectional else 1
        self.embedding.weight.data.copy_(torch.from_numpy(weights))

    def forward(self, x, mask, lengths):
        true2sorted = sorted(range(len(lengths)), key=lambda x: -lengths[x])
        sorted2true = sorted(range(len(lengths)), key=lambda x: true2sorted[x])
        x = x[true2sorted]
        mask = mask[true2sorted]
        lengths = lengths[true2sorted]
        
        batch_size, seq_len = x.size()
        
        # reset hidden state
        self.hidden = self._init_hidden(batch_size)
        
        # get embedding of words
        embed = self.embedding(x)
        
        # mask out all embeddings other than <unk> token to freeze their weights
        embed = mask*embed + (1-mask)*embed.clone().detach()
        
        # pack padded sequence
        embed = torch.nn.utils.rnn.pack_padded_sequence(embed, lengths, batch_first=True)
        
        # forward prop though GRU
        gru_out, self.hidden = self.gru(embed, self.hidden)
        
        # undo packing
        gru_out, _ = torch.nn.utils.rnn.pad_packed_sequence(gru_out, batch_first=True)
        
        # (batch_size, seq_len, num_directions*hidden_size) -> (batch_size, seq_len, num_directions, hidden_size)
        gru_out = gru_out.view(batch_size, -1, self.num_directions, self.hidden_size)
        
        # sum hidden activations of GRU across time
        gru_out = torch.sum(gru_out, dim=1)
        
        # concat all directions along the hidden dimension
        gru_out = torch.cat([gru_out[:,i,:] for i in range(self.num_directions)], dim=1)
        
        # get data back in original order of batches
        gru_out = gru_out[sorted2true]
        
        return gru_out
    
    def _init_hidden(self, batch_size):
        hidden = torch.randn(self.num_directions*self.num_layers, batch_size, self.hidden_size).to(DEVICE)
        return hidden

In [19]:
class ClassificationNetwork(nn.Module):
    def __init__(self, input_size, num_directions, hidden_size, num_outputs, mode='cat'):
        super(ClassificationNetwork, self).__init__()
        
        self.mode = mode
        
        # Fully connected and ReLU layers
        if mode == 'cat':
            self.fc1 = nn.Linear(2*input_size*num_directions, hidden_size)
        elif mode in ['elementwise_mult', 'sum']:
            self.fc1 = nn.Linear(input_size*num_directions, hidden_size)
        else:
            raise ValueError('Invalid arugment "{}" for mode!'.format(mode))
        
        self.fc2 = nn.Linear(hidden_size, num_outputs)
        self.relu = nn.ReLU(inplace=True)
        self.dropout = nn.Dropout(p=0.5)

        # Initialize weights
        self._init_weights()

    def forward(self, embedding_output1, embedding_output2):
        if self.mode == 'cat':
            input = torch.cat([embedding_output1, embedding_output2], dim=1)
        elif self.mode == 'elementwise_mult':
            input = embedding_output1 * embedding_output2
        elif self.mode == 'sum':
            input = embedding_output1 + embedding_output2
        input = input.view(input.size(0), -1) # Reshape input to batch_size x num_inputs
        output = self.fc1(input)
        output = self.relu(output)
        output = self.dropout(output)
        output = self.fc2(output)
        return output

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                nn.init.uniform_(m.bias)

In [20]:
def test_model(data_loader, embedding_network, classification_network):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    
    embedding_network.eval()
    classification_network.eval()
    
    with torch.no_grad():
        for batch_idx, (x1, x1_mask, x1_lengths, x2, x2_mask, x2_lengths, y) in enumerate(data_loader):
            x1, x1_mask, x2, x2_mask, y = x1.to(DEVICE), x1_mask.to(DEVICE), x2.to(DEVICE), x2_mask.to(DEVICE), y.to(DEVICE)
            
            embedding_output1 = embedding_network(x1, x1_mask, x1_lengths)
            embedding_output2 = embedding_network(x2, x2_mask, x2_lengths)
            classification_output = classification_network(embedding_output1, embedding_output2)
            
            output_normalized = F.softmax(classification_output, dim=1)
            predicted = output_normalized.max(1, keepdim=True)[1]

            total += y.size(0)
            correct += predicted.eq(y.view_as(predicted)).sum().item()
    return (100 * correct / total)

In [21]:
WEIGHTS = init_embedding_weights(vectors, token2id, id2token)
embedding_network = GRUEncoder(hidden_size=EMB_HIDDEN_SIZE, num_layers=NUM_LAYERS, weights=WEIGHTS, vocab_size=len(token2id), bidirectional=BIDIRECTIONAL).to(DEVICE)
classification_network = ClassificationNetwork(EMB_HIDDEN_SIZE, NUM_DIRECTIONS, CLASS_HODDEN_SIZE, NUM_CLASSES).to(DEVICE)

# Criterion and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(list(embedding_network.parameters())+list(classification_network.parameters()), lr=LR)

# Train the model
total_step = len(train_loader)

In [22]:
for epoch in range(N_EPOCHS):
    for batch_idx, (x1, x1_mask, x1_lengths, x2, x2_mask, x2_lengths, y) in enumerate(train_loader):
        x1, x1_mask, x2, x2_mask, y = x1.to(DEVICE), x1_mask.to(DEVICE), x2.to(DEVICE), x2_mask.to(DEVICE), y.to(DEVICE)
        
        embedding_network.train()
        classification_network.train()
        
        optimizer.zero_grad()
        
        # Forward pass
        embedding_output1 = embedding_network(x1, x1_mask, x1_lengths)
        embedding_output2 = embedding_network(x2, x2_mask, x2_lengths)
        classification_output = classification_network(embedding_output1, embedding_output2)
        loss = criterion(classification_output, y)

        # Backward and optimize
        loss.backward()
        optimizer.step()
        
        # Validate every 100 iterations
        if batch_idx > 0 and batch_idx % 100 == 0:
            # validate
            val_acc = test_model(val_loader, embedding_network, classification_network)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                epoch+1, N_EPOCHS, batch_idx+1, len(train_loader), val_acc))

Epoch: [1/10], Step: [101/3125], Validation Acc: 36.3
Epoch: [1/10], Step: [201/3125], Validation Acc: 35.6
Epoch: [1/10], Step: [301/3125], Validation Acc: 39.0
Epoch: [1/10], Step: [401/3125], Validation Acc: 38.6
Epoch: [1/10], Step: [501/3125], Validation Acc: 38.5
Epoch: [1/10], Step: [601/3125], Validation Acc: 37.1
Epoch: [1/10], Step: [701/3125], Validation Acc: 39.4
Epoch: [1/10], Step: [801/3125], Validation Acc: 40.2
Epoch: [1/10], Step: [901/3125], Validation Acc: 39.4
Epoch: [1/10], Step: [1001/3125], Validation Acc: 38.3
Epoch: [1/10], Step: [1101/3125], Validation Acc: 39.2
Epoch: [1/10], Step: [1201/3125], Validation Acc: 39.2
Epoch: [1/10], Step: [1301/3125], Validation Acc: 40.4
Epoch: [1/10], Step: [1401/3125], Validation Acc: 41.0
Epoch: [1/10], Step: [1501/3125], Validation Acc: 38.8
Epoch: [1/10], Step: [1601/3125], Validation Acc: 43.2
Epoch: [1/10], Step: [1701/3125], Validation Acc: 42.4
Epoch: [1/10], Step: [1801/3125], Validation Acc: 41.6
Epoch: [1/10], Step

Epoch: [5/10], Step: [2701/3125], Validation Acc: 64.6
Epoch: [5/10], Step: [2801/3125], Validation Acc: 65.0
Epoch: [5/10], Step: [2901/3125], Validation Acc: 64.0
Epoch: [5/10], Step: [3001/3125], Validation Acc: 62.8
Epoch: [5/10], Step: [3101/3125], Validation Acc: 62.6
Epoch: [6/10], Step: [101/3125], Validation Acc: 64.6
Epoch: [6/10], Step: [201/3125], Validation Acc: 64.3
Epoch: [6/10], Step: [301/3125], Validation Acc: 63.2
Epoch: [6/10], Step: [401/3125], Validation Acc: 64.3
Epoch: [6/10], Step: [501/3125], Validation Acc: 64.0
Epoch: [6/10], Step: [601/3125], Validation Acc: 65.6
Epoch: [6/10], Step: [701/3125], Validation Acc: 64.3
Epoch: [6/10], Step: [801/3125], Validation Acc: 63.5
Epoch: [6/10], Step: [901/3125], Validation Acc: 66.2
Epoch: [6/10], Step: [1001/3125], Validation Acc: 64.0
Epoch: [6/10], Step: [1101/3125], Validation Acc: 65.1
Epoch: [6/10], Step: [1201/3125], Validation Acc: 64.9
Epoch: [6/10], Step: [1301/3125], Validation Acc: 65.0
Epoch: [6/10], Step

Epoch: [10/10], Step: [2201/3125], Validation Acc: 71.5
Epoch: [10/10], Step: [2301/3125], Validation Acc: 69.6
Epoch: [10/10], Step: [2401/3125], Validation Acc: 70.9
Epoch: [10/10], Step: [2501/3125], Validation Acc: 69.3
Epoch: [10/10], Step: [2601/3125], Validation Acc: 70.6
Epoch: [10/10], Step: [2701/3125], Validation Acc: 69.7
Epoch: [10/10], Step: [2801/3125], Validation Acc: 71.0
Epoch: [10/10], Step: [2901/3125], Validation Acc: 69.2
Epoch: [10/10], Step: [3001/3125], Validation Acc: 69.9
Epoch: [10/10], Step: [3101/3125], Validation Acc: 71.2


In [None]:
class CNNEncoder(nn.Module):
    def __init__(self, emb_size, hidden_size, num_layers, num_classes, vocab_size):

        super(CNNEncoder, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_IDX)
    
        self.conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=3, padding=1)

        self.linear = nn.Linear(hidden_size, num_classes)

    def forward(self, x, lengths):
        batch_size, seq_len = x.size()

        embed = self.embedding(x)
        hidden = self.conv1(embed.transpose(1,2)).transpose(1,2)
        hidden = F.relu(hidden.contiguous().view(-1, hidden.size(-1))).view(batch_size, seq_len, hidden.size(-1))

        hidden = self.conv2(hidden.transpose(1,2)).transpose(1,2)
        hidden = F.relu(hidden.contiguous().view(-1, hidden.size(-1))).view(batch_size, seq_len, hidden.size(-1))

        hidden = torch.sum(hidden, dim=1)
        logits = self.linear(hidden)
        return logits

## Important things to keep in mind when using Convolutional Nets for Language Tasks in Pytorch

### Conv1d module expect input of size (batch_size, num_channels, length), where in our case input has size (batch_size, length, num_channels). Hence it is important call transpose(1,2) before passing it to convolutional layer and then reshape it back to (batch_size, length, num_channels) by calling transpose(1,2) again

### Additionally we need to reshape hidden activations into 2D tensor before passing it to Relu layer by calling view(-1, hidden.size(-1)

In [None]:
model = CNNEncoder(emb_size=100, hidden_size=200, num_layers=2, num_classes=5, vocab_size=len(id2char))

learning_rate = 3e-4
num_epochs = 10 # number epoch to train

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
total_step = len(train_loader)

for epoch in range(num_epochs):
    for i, (data, lengths, labels) in enumerate(train_loader):
        model.train()
        optimizer.zero_grad()
        # Forward pass
        outputs = model(data, lengths)
        loss = criterion(outputs, labels)

        # Backward and optimize
        loss.backward()
        optimizer.step()
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc = test_model(val_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                       epoch+1, num_epochs, i+1, len(train_loader), val_acc))