In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter
import pickle as pkl
import random
import pdb
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import string

random.seed(134)

PAD_IDX = 0
UNK_IDX = 1
BATCH_SIZE = 32

train = 'data/snli_train.tsv'
val = 'data/snli_val.tsv'


In [2]:
def load_emb_matrix():
    #load fasttext word vectors
    words_to_load = 50000

    with open('wiki-news-300d-1M-subword.vec') as f:
        #remove the first line
        firstLine = f.readline()
        loaded_embeddings = np.zeros((words_to_load + 2, 300))
        words2id = {}
        idx2words = {}
        #ordered_words = []
        for i, line in enumerate(f):
            if i >= words_to_load: 
                break
            s = line.split()
            loaded_embeddings[i + 2 , :] = np.asarray(s[1:])
            words2id['<pad>'] = PAD_IDX
            words2id['<unk>'] = UNK_IDX
            words2id[s[0]] = i + 2
            idx2words[i + 2] = s[0]
            idx2words[0] = '<pad>'
            idx2words[1] = '<unk>'

    return words2id,idx2words,loaded_embeddings
        

In [3]:
words2id,idx2words,loaded_embeddings = load_emb_matrix()

# pkl.dump(words2id, open(f'data/words2id.pkl', 'wb'))
# pkl.dump(idx2words, open(f'data/idx2words.pkl', 'wb'))
# pkl.dump(loaded_embeddings, open(f'data/embedding_matrix.pkl', 'wb'))


In [53]:
# Load English tokenizer, tagger, parser, NER and word vectors
tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation

# lowercase and remove punctuation
def tokenize(sent):
    tokens = tokenizer(sent)
    return [token.text.lower() for token in tokens 
            if (token.text not in punctuations) & (token.text not in STOP_WORDS)]

In [184]:
def tokenize_dataset(dataset):
    token_dataset = []
    #all_tokens = []
    for sample in dataset:
        tokens = tokenize(sample)
        token_dataset.append(tokens)
        #all_tokens += tokens

    return token_dataset

In [4]:
# convert token to id in the dataset
def token2index_dataset(tokens_data):
    indices_data = []
    for tokens in tokens_data:
        index_list = [words2id[word] if word in words2id else UNK_IDX for word in tokens]
        indices_data.append(index_list)
    return indices_data

In [5]:
train_data = pd.read_csv(train,delimiter='\t',encoding='utf-8')
val_data = pd.read_csv(val,delimiter='\t',encoding='utf-8')

train_data['label_num'] = train_data['label'].apply(lambda x: 0 if str(x) == 'contradiction' else 1 if str(x) == 'neutral' else 2)
val_data['label_num'] = val_data['label'].apply(lambda x: 0 if str(x) == 'contradiction' else 1 if str(x) == 'neutral' else 2)


In [6]:
# train_tokens_1 = tokenize_dataset(train_data['sentence1'])
# train_tokens_2 = tokenize_dataset(train_data['sentence2'])
# pkl.dump(train_tokens_1, open("data/train_data_tokens_1.p", "wb"))
# pkl.dump(train_tokens_2, open("data/train_data_tokens_2.p", "wb"))

# val_tokens_1 = tokenize_dataset(val_data['sentence1'])
# val_tokens_2 = tokenize_dataset(val_data['sentence2'])
# pkl.dump(val_tokens_1, open("data/val_data_tokens_1.p", "wb"))
# pkl.dump(val_tokens_2, open("data/val_data_tokens_2.p", "wb"))

train_tokens_1 = pkl.load(open("data/train_data_tokens_1.p", "rb"))
train_tokens_2 = pkl.load(open("data/train_data_tokens_2.p", "rb"))

val_tokens_1 = pkl.load(open("data/val_data_tokens_1.p", "rb"))
val_tokens_2 = pkl.load(open("data/val_data_tokens_2.p", "rb"))

In [7]:
train_data_indices_1 = token2index_dataset(train_tokens_1)
train_data_indices_2 = token2index_dataset(train_tokens_2)
val_data_indices_1 = token2index_dataset(val_tokens_1)
val_data_indices_2 = token2index_dataset(val_tokens_2)

In [8]:
train_label = list(train_data['label_num'])
val_label = list(val_data['label_num'])

In [86]:
MAX_SENTENCE_LENGTH = 45
BATCH_SIZE = 32

class VocabDataset(Dataset):
    """
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, data_list_1, data_list_2, target_list, words2id):
        """
        @param data_list: list of character
        @param target_list: list of targets

        """
        self.data_1 = data_list_1
        self.data_2 = data_list_2
        self.target_list = target_list
        assert (len(self.data_1) == len(self.target_list))
        assert (len(self.data_2) == len(self.target_list))
        self.words2id = words2id

    def __len__(self):
        return len(self.target_list)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        words_idx_1 = self.data_1[key][:MAX_SENTENCE_LENGTH]
        
        words_idx_2 = self.data_2[key][:MAX_SENTENCE_LENGTH]
        
        label = self.target_list[key]
        return [words_idx_1, len(words_idx_1), words_idx_2,len(words_idx_2),label]

def vocab_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    data_list_1 = []
    data_list_2 = []
    label_list = []
    length_list_1 = []
    length_list_2 = []

    for datum in batch:
        label_list.append(datum[4])
        length_list_1.append(datum[1])
        length_list_2.append(datum[3])
        
    # padding
    for datum in batch:
        padded_vec_1 = np.pad(np.array(datum[0]),
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[1])),
                                mode="constant", constant_values=0)
        data_list_1.append(padded_vec_1)
        
        padded_vec_2 = np.pad(np.array(datum[2]),
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[3])),
                                mode="constant", constant_values=0)
        data_list_2.append(padded_vec_2)
        
    ind_dec_order_1 = np.argsort(length_list_1)[::-1]
    ind_dec_order_2 = np.argsort(length_list_2)[::-1]
    
    data_list_1 = np.array(data_list_1)[ind_dec_order_1]
    length_list_1 = np.array(length_list_1)[ind_dec_order_1]
    
    data_list_2 = np.array(data_list_2)[ind_dec_order_2]
    length_list_2 = np.array(length_list_2)[ind_dec_order_2]
    #handle torch type problem by adding the following line
    data_list_2 = np.asarray(data_list_2, dtype=int)
    label_list = np.array(label_list)[ind_dec_order_1]
    
    return [torch.from_numpy(np.array(data_list_1)), torch.LongTensor(length_list_1), 
            torch.from_numpy(np.array(data_list_2)), torch.LongTensor(length_list_2),
            torch.LongTensor(label_list)]


In [87]:
# Build train, valid and test dataloaders

train_dataset = VocabDataset(train_data_indices_1, train_data_indices_2, train_label,words2id)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)


val_dataset = VocabDataset(val_data_indices_1,val_data_indices_2,val_label, words2id)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)



In [11]:
def generate_weights_matrix(train_data_indices_1,train_data_indices_2,loaded_embeddings):
    all_tokens = []
    for line in train_data_indices_1:
        for token in line:
            all_tokens.append(token)

    for line in train_data_indices_2:
        for token in line:
            all_tokens.append(token)
            
    matrix_len = len(all_tokens)
    weights_matrix = np.zeros((matrix_len, 300))
    words_found = 0
    
    for i, idx in enumerate(all_tokens):
        try: 
            weights_matrix[i] = loaded_embeddings[idx]
            words_found += 1
        except KeyError:
            weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim, ))
    return weights_matrix

In [12]:
weights_matrix = generate_weights_matrix(train_data_indices_1,train_data_indices_2,loaded_embeddings)

In [1]:
class RNN(nn.Module):
    def __init__(self, weights_matrix, hidden_size, num_layers, num_classes, vocab_size):
        # RNN Accepts the following hyperparams:
        # hidden_size: Hidden Size of layer in RNN
        # num_layers: number of layers in RNN
        # vocab_size: vocabulary size
        super(RNN, self).__init__()

        self.num_layers, self.hidden_size = num_layers, hidden_size
        num_embeddings, embedding_dim = weights_matrix.shape
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        self.embedding.weight.data.copy_(torch.from_numpy(weights_matrix))
        self.embedding.weight.requires_grad = False
        
        self.bi_gru = nn.GRU(embedding_dim, hidden_size, num_layers=1, batch_first=True, bidirectional=True)
        
        self.linear = nn.Linear(2 * hidden_size, num_classes)

    def init_hidden(self, batch_size):
        # Function initializes the activation of recurrent neural net at timestep 0
        # Needs to be in format (num_layers, batch_size, hidden_size)
        #biased term?
        hidden = torch.randn(self.num_layers * 2, batch_size, self.hidden_size)

        return hidden
    
    
    def forward(self, x, lengths_x, y,lengths_y):
        # reset hidden state

        batch_size_x, seq_len_x = x.size()
        self.hidden_x = self.init_hidden(batch_size_x)
        
        # get embedding of characters
        embed_x = self.embedding(x)        
        rnn_out_x, _ = self.bi_gru(embed_x,self.hidden_x)
        rnn_out_x = rnn_out_x[:, -1, :self.hidden_size] + rnn_out_x[:, 0, self.hidden_size:] 

        
        batch_size_y, seq_len_y = y.size()
        self.hidden_y = self.init_hidden(batch_size_y)
        
        embed_y = self.embedding(y)           
        rnn_out_y, self.hidden_y = self.bi_gru(embed_y, self.hidden_y)
        #rnn_out_y = rnn_out_y[:, -1, :self.hidden_size] + rnn_out_y[0, :, self.hidden_size:]
        
        #run_out = torch.cat([rnn_out_x, rnn_out_y], 1)
        
#         logits = self.linear(rnn_out)
        
        return rnn_out_x


NameError: name 'nn' is not defined

In [29]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data_1, lengths_1, data_2, lengths_2, labels in loader:
        data_batch_1, lengths_batch_1, data_batch_2, lengths_batch_2, label_batch = data_1, lengths_1, data_2, lengths_2,labels
        outputs = F.softmax(model(data_batch_1, lengths_batch_1,data_batch_2, lengths_batch_2), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]

        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)




In [94]:
model = RNN(weights_matrix, hidden_size=200, num_layers=1, num_classes = 3, vocab_size=len(idx2words))
learning_rate = 3e-4
num_epochs = 10 # number epoch to train

# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
for epoch in range(num_epochs):
    for i, (data_1, lengths_1, data_2, lengths_2, labels) in enumerate(train_loader):
        model.train()
        optimizer.zero_grad()
        # Forward pass
        outputs = model(data_1, lengths_1,data_2,lengths_2)
        loss = criterion(outputs, labels)

        # Backward and optimize
        loss.backward()
        optimizer.step()
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc = test_model(val_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format(
                       epoch+1, num_epochs, i+1, len(train_loader), val_acc))

In [95]:
for i, (data_1, lengths_1, data_2, lengths_2, labels) in enumerate(train_loader):
    model.train()
    optimizer.zero_grad()
    # Forward pass
    outputs = model(data_1, lengths_1,data_2,lengths_2)
    #loss = criterion(outputs, labels)
    
    if i >= 5:
        break

In [96]:
outputs.size()

torch.Size([32, 45, 400])