In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter
import pickle as pkl
import random
import pdb
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import string

random.seed(134)

PAD_IDX = 0
UNK_IDX = 1
BATCH_SIZE = 32

train = 'data/snli_train.tsv'
val = 'data/snli_val.tsv'


In [2]:
# convert token to id in the dataset
def token2index_dataset(tokens_data):
    indices_data = []
    for tokens in tokens_data:
        index_list = [words2id[word] if word in words2id else UNK_IDX for word in tokens]
        indices_data.append(index_list)
    return indices_data

In [3]:
words2id = pkl.load(open("data/words2id.pkl", "rb"))
idx2words = pkl.load(open("data/idx2words.pkl", "rb"))
loaded_embeddings = pkl.load(open("data/embedding_matrix.pkl", "rb"))

train_data = pd.read_csv(train,delimiter='\t',encoding='utf-8')
val_data = pd.read_csv(val,delimiter='\t',encoding='utf-8')

train_data['label_num'] = train_data['label'].apply(lambda x: 0 if str(x) == 'contradiction' else 1 if str(x) == 'neutral' else 2)
val_data['label_num'] = val_data['label'].apply(lambda x: 0 if str(x) == 'contradiction' else 1 if str(x) == 'neutral' else 2)

train_label = list(train_data['label_num'])
val_label = list(val_data['label_num'])

In [4]:
train_tokens_1 = pkl.load(open("data/train_data_tokens_1.p", "rb"))
train_tokens_2 = pkl.load(open("data/train_data_tokens_2.p", "rb"))

val_tokens_1 = pkl.load(open("data/val_data_tokens_1.p", "rb"))
val_tokens_2 = pkl.load(open("data/val_data_tokens_2.p", "rb"))

In [5]:
train_data_indices_1 = token2index_dataset(train_tokens_1)
train_data_indices_2 = token2index_dataset(train_tokens_2)
val_data_indices_1 = token2index_dataset(val_tokens_1)
val_data_indices_2 = token2index_dataset(val_tokens_2)

In [244]:
MAX_SENTENCE_LENGTH = 45
BATCH_SIZE = 32

class VocabDataset(Dataset):
    """
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, data_list_1, data_list_2, target_list, words2id):
        """
        @param data_list: list of character
        @param target_list: list of targets

        """
        self.data_1 = data_list_1
        self.data_2 = data_list_2
        self.target_list = target_list
        assert (len(self.data_1) == len(self.target_list))
        assert (len(self.data_2) == len(self.target_list))
        self.words2id = words2id

    def __len__(self):
        return len(self.target_list)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        words_idx_1 = self.data_1[key][:MAX_SENTENCE_LENGTH]
        
        words_idx_2 = self.data_2[key][:MAX_SENTENCE_LENGTH]
        
        label = self.target_list[key]
        
        return [words_idx_1, len(words_idx_1), words_idx_2,len(words_idx_2),label]

def vocab_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    data_list_1 = []
    data_list_2 = []
    label_list = []
    length_list_1 = []
    length_list_2 = []

    for datum in batch:
        label_list.append(datum[4])
        length_list_1.append(datum[1])
        length_list_2.append(datum[3])
        
    # padding
    for datum in batch:
        padded_vec_1 = np.pad(np.array(datum[0]),
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[1])),
                                mode="constant", constant_values=0)
        data_list_1.append(padded_vec_1)
        
        padded_vec_2 = np.pad(np.array(datum[2]),
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[3])),
                                mode="constant", constant_values=0)
        data_list_2.append(padded_vec_2)
        


    #handle torch type problem by adding the following line
    data_list_2 = np.asarray(data_list_2, dtype=int)
    label_list = np.array(label_list)
    
    return [torch.from_numpy(np.array(data_list_1)), torch.LongTensor(length_list_1), 
            torch.from_numpy(np.array(data_list_2)), torch.LongTensor(length_list_2),
            torch.LongTensor(label_list)]


In [245]:
# Build train and valid dataloaders

train_dataset = VocabDataset(train_data_indices_1, train_data_indices_2, train_label,words2id)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)


val_dataset = VocabDataset(val_data_indices_1,val_data_indices_2,val_label, words2id)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)



In [246]:
def generate_weights_matrix(idx2words,loaded_embeddings):
   
    matrix_len = len(idx2words)
    weights_matrix = np.zeros((matrix_len, 300))
    
    for key in idx2words.keys():
        try: 
            weights_matrix[key] = loaded_embeddings[key]
        except KeyError:
            weights_matrix[key] = np.random.normal(scale=0.6, size=(emb_dim, ))
    return weights_matrix

In [247]:
weights_matrix = generate_weights_matrix(idx2words,loaded_embeddings)

In [248]:
class CNN(nn.Module):
    def __init__(self, weights_matrix, hidden_size, num_layers, num_classes,kernel_size,stride):

        super(CNN, self).__init__()

        num_embeddings, embedding_dim = weights_matrix.shape
        self.embedding = nn.Embedding(num_embeddings, embedding_dim,padding_idx=PAD_IDX)
        self.embedding.weight.data.copy_(torch.from_numpy(weights_matrix))
        self.embedding.weight.requires_grad = False
        
        self.layer1 = nn.Sequential(
            nn.Conv1d(embedding_dim, hidden_size, kernel_size=5, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        
        self.layer2 = nn.Sequential(
            nn.Conv1d(embedding_dim, hidden_size, kernel_size=5, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        
        self.fc1 = nn.Linear(4200, 180)
        self.fc2 = nn.Linear(180, num_classes)
        
 
    
    def forward(self, x,  y):
        embed_x = self.embedding(x)
        hidden_x = self.layer1(embed_x.transpose(1,2))
        
        embed_y = self.embedding(y)
        hidden_y = self.layer2(embed_y.transpose(1,2))
        
        hidden = torch.cat([hidden_x, hidden_y], 1)

        out = hidden.reshape(hidden.size(0), -1)
        out = self.fc1(out)
        out = F.relu(out.contiguous().view(-1, out.size(-1)))
        
        logits = self.fc2(out)
        
        return logits

In [249]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """

    correct = 0
    total = 0
    loss_val = 0
    model.eval()
    for data_1,lengths_1, data_2, lengths_2,labels in loader:
        data_batch_1, data_batch_2, label_batch = data_1, data_2, labels
        outputs = F.softmax(model(data_batch_1, data_batch_2), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        loss = criterion(outputs, label_batch)
        loss_val += loss.item() * len(data) / len(loader.dataset)
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).double().sum().item()
    return (100 * correct / total), loss_val


In [250]:
num_classes = len(train_data['label_num'].unique())
model = CNN(weights_matrix, hidden_size=200, num_layers=2, num_classes= num_classes, kernel_size= 3 ,stride = 5)

learning_rate = 3e-4
num_epochs = 10 # number epoch to train

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


# Train the model
total_step = len(train_loader)

loss_list = []
accuracy_list = []
for epoch in range(num_epochs):
    for i, (data_1, lengths_1, data_2, lengths_2, labels) in enumerate(train_loader):
        model.train()
        optimizer.zero_grad()
        # Forward pass
        outputs = model(data_1, data_2)

        loss = criterion(outputs, labels)
        # Backward and optimize
        loss.backward()
        optimizer.step()
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc = test_model(val_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                       epoch+1, num_epochs, i+1, len(train_loader), val_acc))

    loss_list.append(loss.data)
    accuracy_list.append(val_acc)

Epoch: [1/10], Step: [101/3125], Validation Acc: 38.9
Epoch: [1/10], Step: [201/3125], Validation Acc: 39.9
Epoch: [1/10], Step: [301/3125], Validation Acc: 42.3
Epoch: [1/10], Step: [401/3125], Validation Acc: 44.9
Epoch: [1/10], Step: [501/3125], Validation Acc: 47.9
Epoch: [1/10], Step: [601/3125], Validation Acc: 48.5
Epoch: [1/10], Step: [701/3125], Validation Acc: 49.4
Epoch: [1/10], Step: [801/3125], Validation Acc: 49.8
Epoch: [1/10], Step: [901/3125], Validation Acc: 50.6
Epoch: [1/10], Step: [1001/3125], Validation Acc: 52.6
Epoch: [1/10], Step: [1101/3125], Validation Acc: 53.6
Epoch: [1/10], Step: [1201/3125], Validation Acc: 54.2
Epoch: [1/10], Step: [1301/3125], Validation Acc: 54.2
Epoch: [1/10], Step: [1401/3125], Validation Acc: 53.7
Epoch: [1/10], Step: [1501/3125], Validation Acc: 54.9
Epoch: [1/10], Step: [1601/3125], Validation Acc: 53.1
Epoch: [1/10], Step: [1701/3125], Validation Acc: 52.5
Epoch: [1/10], Step: [1801/3125], Validation Acc: 54.0
Epoch: [1/10], Step

Epoch: [5/10], Step: [2701/3125], Validation Acc: 60.7
Epoch: [5/10], Step: [2801/3125], Validation Acc: 61.8
Epoch: [5/10], Step: [2901/3125], Validation Acc: 61.2
Epoch: [5/10], Step: [3001/3125], Validation Acc: 59.7
Epoch: [5/10], Step: [3101/3125], Validation Acc: 61.4
Epoch: [6/10], Step: [101/3125], Validation Acc: 62.4
Epoch: [6/10], Step: [201/3125], Validation Acc: 60.4
Epoch: [6/10], Step: [301/3125], Validation Acc: 61.5
Epoch: [6/10], Step: [401/3125], Validation Acc: 61.7
Epoch: [6/10], Step: [501/3125], Validation Acc: 61.7
Epoch: [6/10], Step: [601/3125], Validation Acc: 62.6
Epoch: [6/10], Step: [701/3125], Validation Acc: 59.8
Epoch: [6/10], Step: [801/3125], Validation Acc: 61.0
Epoch: [6/10], Step: [901/3125], Validation Acc: 60.0
Epoch: [6/10], Step: [1001/3125], Validation Acc: 60.7
Epoch: [6/10], Step: [1101/3125], Validation Acc: 60.0
Epoch: [6/10], Step: [1201/3125], Validation Acc: 60.4
Epoch: [6/10], Step: [1301/3125], Validation Acc: 60.8
Epoch: [6/10], Step

KeyboardInterrupt: 

In [252]:
loss_list

[tensor(1.0854),
 tensor(1.1486),
 tensor(1.1277),
 tensor(1.0347),
 tensor(1.0586),
 tensor(1.0631),
 tensor(0.9696),
 tensor(1.0109),
 tensor(0.9247),
 tensor(1.0744),
 tensor(0.9471),
 tensor(1.0133),
 tensor(0.9145),
 tensor(1.0260),
 tensor(1.0667),
 tensor(1.2077),
 tensor(0.9675),
 tensor(1.1202),
 tensor(0.9092),
 tensor(0.8133),
 tensor(0.9675),
 tensor(0.8215),
 tensor(0.9508),
 tensor(0.8404),
 tensor(0.7954),
 tensor(0.9242),
 tensor(0.8645),
 tensor(0.7255),
 tensor(0.7863),
 tensor(0.7590),
 tensor(0.7228),
 tensor(0.8705),
 tensor(0.7469),
 tensor(0.8872),
 tensor(0.8345),
 tensor(1.0622),
 tensor(0.8043),
 tensor(0.9650),
 tensor(0.8950),
 tensor(0.9589),
 tensor(0.9285),
 tensor(0.8080),
 tensor(0.9105),
 tensor(0.8406),
 tensor(1.1042),
 tensor(0.8177),
 tensor(0.8388),
 tensor(0.7125),
 tensor(0.9619),
 tensor(0.7619),
 tensor(0.8477),
 tensor(0.9057),
 tensor(0.7922),
 tensor(0.7365),
 tensor(0.9448),
 tensor(0.9679),
 tensor(0.9810),
 tensor(0.8054),
 tensor(0.9078