<a href="https://colab.research.google.com/github/ravichoudharyds/Natural-Language-Processing/blob/master/HW1/Bag_of_Words_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Let's write the tokenization function 

import spacy
import string
import pickle as pkl
import torch

# Load English tokenizer, tagger, parser, NER and word vectors
tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation

# lowercase and remove punctuation
def tokenize(sent):
    tokens = tokenizer(sent)
    return [token.text.lower() for token in tokens if (token.text not in punctuations)]

# Example
tokens = tokenize(u'Apple is looking at buying U.K. startup for $1 billion.')
print (tokens)

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

['apple', 'is', 'looking', 'at', 'buying', 'u.k.', 'startup', 'for', '1', 'billion']


In [0]:
# This is the code cell that tokenizes train/val/test datasets
# However it takes about 15-20 minutes to run it
# For convinience we have provided the preprocessed datasets
# Please see the next code cell

from tqdm import tqdm

def tokenize_dataset(dataset):
    sentence1_tokens_dataset = []
    sentence2_tokens_dataset = []
    # we are keeping track of all tokens in dataset 
    # in order to create vocabulary later
    all_tokens = []
    label_list = []
    for sample in tqdm(dataset):
        sentence1, sentence2, label = sample.split("\t")
        if label[-1] == '\n':
            label = label[:-1]
        sentence1_tokens = tokenize(sentence1)
        sentence2_tokens = tokenize(sentence2)
        sentence1_tokens_dataset.append(sentence1_tokens)
        sentence2_tokens_dataset.append(sentence2_tokens)
        label_list.append(label)
        all_tokens += sentence1_tokens
        all_tokens += sentence2_tokens

    return sentence1_tokens_dataset, sentence2_tokens_dataset, label_list, all_tokens

#val set tokens
# print ("Tokenizing val data")
# val_data = open("/content/drive/My Drive/NLP_HW/HW1/snli_val.tsv","r")
# sentence1_val_tokens_dataset, sentence2_val_tokens_dataset, snli_val_label_list, _ = tokenize_dataset(val_data)
# pkl.dump(sentence1_val_tokens_dataset, open("/content/drive/My Drive/NLP_HW/HW1/snli_val_sentence1_tokens.p", "wb"))
# pkl.dump(sentence2_val_tokens_dataset, open("/content/drive/My Drive/NLP_HW/HW1/snli_val_sentence2_tokens.p", "wb"))
# pkl.dump(snli_val_label_list, open("/content/drive/My Drive/NLP_HW/HW1/snli_val_label_list.p", "wb"))

#train set tokens
# print ("Tokenizing train data")
# train_data = open("/content/drive/My Drive/NLP_HW/HW1/snli_train.tsv","r")
# sentence1_train_tokens_dataset, sentence2_train_tokens_dataset, snli_train_label_list, snli_train_tokens = tokenize_dataset(train_data)
# pkl.dump(sentence1_train_tokens_dataset, open("/content/drive/My Drive/NLP_HW/HW1/snli_train_sentence1_tokens.p", "wb"))
# pkl.dump(sentence2_train_tokens_dataset, open("/content/drive/My Drive/NLP_HW/HW1/snli_train_sentence2_tokens.p", "wb"))
# pkl.dump(snli_train_label_list, open("/content/drive/My Drive/NLP_HW/HW1/snli_train_label_list.p", "wb"))
# pkl.dump(snli_train_tokens, open("/content/drive/My Drive/NLP_HW/HW1/snli_train_tokens.p", "wb"))

In [0]:
sentence1_train_tokens_dataset = pkl.load(open("/content/drive/My Drive/NLP_HW/HW1/snli_train_sentence1_tokens.p", "rb"))
sentence2_train_tokens_dataset = pkl.load(open("/content/drive/My Drive/NLP_HW/HW1/snli_train_sentence2_tokens.p", "rb"))
snli_train_label_list = pkl.load(open("/content/drive/My Drive/NLP_HW/HW1/snli_train_label_list.p", "rb"))
snli_train_tokens = pkl.load(open("/content/drive/My Drive/NLP_HW/HW1/snli_train_tokens.p", "rb"))

sentence1_val_tokens_dataset = pkl.load(open("/content/drive/My Drive/NLP_HW/HW1/snli_val_sentence1_tokens.p", "rb"))
sentence2_val_tokens_dataset = pkl.load(open("/content/drive/My Drive/NLP_HW/HW1/snli_val_sentence2_tokens.p", "rb"))
snli_val_label_list = pkl.load(open("/content/drive/My Drive/NLP_HW/HW1/snli_val_label_list.p", "rb"))

In [0]:
train_target = []
val_target = []
target_dict = {'contradiction':0, 'entailment':1, 'neutral':2}
for target in snli_train_label_list[1:]:
    train_target.append(target_dict[target])
for target in snli_val_label_list[1:]:
    val_target.append(target_dict[target])

In [0]:
MAX_SENTENCE_LENGTH = 20

In [0]:
import numpy as np
import torch
from torch.utils.data import Dataset

class SNLIDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, sentence1_list, sentence2_list, target_list):
        """
        @param data_list: list of newsgroup tokens 
        @param target_list: list of newsgroup targets 

        """
        self.sentence1_list = sentence1_list
        self.sentence2_list = sentence2_list
        self.target_list = target_list
        assert (len(self.sentence1_list) == len(self.target_list))

    def __len__(self):
        return len(self.target_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        
        token1_idx = self.sentence1_list[key][:MAX_SENTENCE_LENGTH]
        token2_idx = self.sentence2_list[key][:MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
        return [token1_idx, token2_idx, len(token1_idx), len(token2_idx), label]



In [0]:
from collections import Counter

# save index 0 for unk and 1 for pad
PAD_IDX = 0
UNK_IDX = 1

def build_vocab(all_tokens, max_vocab_size = 10**4):
    # Returns:
    # id2token: list of tokens, where id2token[i] returns token that corresponds to token i
    # token2id: dictionary where keys represent tokens and corresponding values represent indices
    token_counter = Counter(all_tokens)
    vocab, count = zip(*token_counter.most_common(max_vocab_size))
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(2,2+len(vocab)))) 
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

token2id, id2token = build_vocab(snli_train_tokens)

In [0]:
# Lets check the dictionary by loading random token from it
import random

random_token_id = random.randint(0, len(id2token)-1)
random_token = id2token[random_token_id]

print ("Token id {} ; token {}".format(random_token_id, id2token[random_token_id]))
print ("Token {}; token id {}".format(random_token, token2id[random_token]))

Token id 5074 ; token islamic
Token islamic; token id 5074


In [0]:
# convert token to id in the dataset
def token2index_dataset(tokens_data):
    indices_data = []
    for tokens in tokens_data:
        index_list = [token2id[token] if token in token2id else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data

sent1_train_indices = token2index_dataset(sentence1_train_tokens_dataset)
sent2_train_indices = token2index_dataset(sentence2_train_tokens_dataset)
sent1_val_indices = token2index_dataset(sentence1_val_tokens_dataset)
sent2_val_indices = token2index_dataset(sentence2_val_tokens_dataset)

# double checking
print ("Train dataset size is {}".format(len(sent1_train_indices)))
print ("Train dataset size is {}".format(len(sentence1_train_tokens_dataset)))
print ("Train dataset size is {}".format(len(sent2_train_indices)))
print ("Train dataset size is {}".format(len(sentence2_train_tokens_dataset)))

Train dataset size is 100001
Train dataset size is 100001
Train dataset size is 100001
Train dataset size is 100001


In [0]:
train_dataset = SNLIDataset(sent1_train_indices[1:], sent2_train_indices[1:], train_target)

In [0]:
def snli_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    sent1_list = []
    sent2_list = []
    label_list = []
    length_list_sent1 = []
    length_list_sent2 = []
    #print("collate batch: ", batch[0][0])
    #batch[0][0] = batch[0][0][:MAX_SENTENCE_LENGTH]
    for datum in batch:
        label_list.append(datum[4])
        length_list_sent1.append(datum[2])
        length_list_sent2.append(datum[3])
    # padding
    for datum in batch:
        padded_vec1 = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[2])), 
                                mode="constant", constant_values=0)
        sent1_list.append(padded_vec1)
        padded_vec2 = np.pad(np.array(datum[1]), 
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[3])), 
                                mode="constant", constant_values=0)
        sent2_list.append(padded_vec2)
    return [torch.from_numpy(np.array(sent1_list)), torch.from_numpy(np.array(sent2_list)), torch.LongTensor(length_list_sent1),torch.LongTensor(length_list_sent2), torch.LongTensor(label_list)]

In [0]:
BATCH_SIZE = 256

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=snli_collate_func,
                                           shuffle=True)

val_dataset = SNLIDataset(sent1_val_indices[1:], sent2_val_indices[1:], val_target)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=snli_collate_func,
                                           shuffle=True)

In [0]:
# First import torch related libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

class BagOfWords_Model(nn.Module):
    """
    BagOfWords classification model
    """
    def __init__(self, vocab_size, emb_dim, nn_ind = False, interaction_type = 'cat'):
        """
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding
        """
        super(BagOfWords_Model, self).__init__()
        # pay attention to padding_idx 
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.nn_ind = nn_ind
        self.interaction_type = interaction_type
        if self.interaction_type =='cat':
            if nn_ind:
                self.linear1 = nn.Linear(2*emb_dim, emb_dim)
                self.relu1 = nn.ReLU()
                self.linear2 = nn.Linear(emb_dim, int(emb_dim/2))
                self.relu2 = nn.ReLU()
                self.linear3 = nn.Linear(int(emb_dim/2), 3)
            else:
                self.linear = nn.Linear(2*emb_dim,3)
        else:
            if nn_ind:
                self.linear1 = nn.Linear(emb_dim, int(emb_dim/2))
                self.relu1 = nn.ReLU()
                self.linear2 = nn.Linear(int(emb_dim/2),int(emb_dim/4))
                self.relu2 = nn.ReLU()
                self.linear3 = nn.Linear(int(emb_dim/4), 3)
            else:
                self.linear = nn.Linear(emb_dim,3)
    
    def interaction_func(self, sent1, sent2):
        if self.interaction_type == 'sum':
            out = sent1.float() + sent2.float()
        elif self.interaction_type == 'cat':
            out = torch.cat((sent1.float(),sent2.float()),dim=1)
        else:
            out = sent1.float()*sent2.float()
        return out
    
    def forward(self, sent1, sent2, length1, length2):
        """
        
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        """
        out1 = self.embed(sent1)
        out1 = torch.sum(out1, dim=1)
        out1 /= length1.view(length1.size()[0],1).expand_as(out1).float()
        out2 = self.embed(sent2)
        out2 = torch.sum(out2, dim=1)
        out2 /= length2.view(length2.size()[0],1).expand_as(out2).float()
        out = self.interaction_func(out1, out2)

        # return logits
        
        if self.nn_ind:
            out = self.relu1(self.linear1(out))
            out = self.relu2(self.linear2(out))
            out = self.linear3(out)            
        else:
            out = self.linear(out)
        return out

In [0]:
num_epochs = 10 # number epoch to train

# Function for testing the model
def test_model(loader, model, criterion):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total_loss = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for sent1, sent2, length1, length2, labels in loader:
            sent1_batch, sent2_batch, length1_batch, length2_batch, label_batch = sent1.to(device), sent2.to(device), length1.to(device), length2.to(device), labels.to(device)
            outputs = F.softmax(model(sent1_batch, sent2_batch, length1_batch, length2_batch), dim=1).to(device)
            _, predicted = torch.max(outputs.data, 1)
            loss = criterion(outputs, label_batch)
            total_loss += loss.item()
            total += label_batch.size(0)
            correct += predicted.eq(label_batch.view_as(predicted)).sum().item()
    return (100 * correct / total), (total_loss/total)

def train_model(train_loader, val_loader, model, optimizer, criterion, num_epochs):
    train_acc_list = []
    train_loss_list = []
    val_acc_list = []
    val_loss_list = []
    for epoch in range(num_epochs):
        for sent1, sent2, length1, length2, labels in train_loader:
            model.train()
            sent1_batch, sent2_batch, length1_batch, length2_batch, label_batch = sent1.to(device), sent2.to(device), length1.to(device), length2.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(sent1_batch, sent2_batch, length1_batch, length2_batch)
            loss = criterion(outputs, label_batch)
            loss.backward()
            optimizer.step()
        train_acc, train_loss = test_model(train_loader, model, criterion)
        val_acc, val_loss = test_model(val_loader, model, criterion)
        train_acc_list.append(train_acc)
        train_loss_list.append(train_loss)
        val_acc_list.append(val_acc)
        val_loss_list.append(val_loss)
        print("Epoch:{}, Validation Accuracy:{}, Training Acc: {}".format(epoch+1, val_acc, train_acc))
    return train_acc_list, train_loss_list, val_acc_list, val_loss_list
        

In [0]:
from itertools import product

vocab_size_list = [1000, 5000, 10000]
emb_dim_list = [100, 150, 200]
nn_ind_list = [0, 1]
interaction_list = ['cat','sum','mult']
results_dict = {}

for vocab_size, emb_dim, nn_ind, interaction_type in product(vocab_size_list, emb_dim_list, nn_ind_list, interaction_list):
    token2id, id2token = build_vocab(snli_train_tokens, max_vocab_size = 2*10**4)
    sent1_train_indices = token2index_dataset(sentence1_train_tokens_dataset)
    sent2_train_indices = token2index_dataset(sentence2_train_tokens_dataset)
    sent1_val_indices = token2index_dataset(sentence1_val_tokens_dataset)
    sent2_val_indices = token2index_dataset(sentence2_val_tokens_dataset)
    train_dataset = SNLIDataset(sent1_train_indices[1:], sent2_train_indices[1:], train_target)
    BATCH_SIZE = 256

    train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                               batch_size=BATCH_SIZE,
                                               collate_fn=snli_collate_func,
                                               shuffle=True)

    val_dataset = SNLIDataset(sent1_val_indices[1:], sent2_val_indices[1:], val_target)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                               batch_size=BATCH_SIZE,
                                               collate_fn=snli_collate_func,
                                               shuffle=True)
    model = BagOfWords_Model(len(id2token), emb_dim, nn_ind=nn_ind, interaction_type = interaction_type).to(device)
    criterion = torch.nn.CrossEntropyLoss(reduction='sum')  

    learning_rate = 0.01
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    combination_dict_key = str(vocab_size) + ", " + str(emb_dim) + ", " + str(nn_ind) + ", " + str(interaction_type)
    print(combination_dict_key)
    train_acc_list, train_loss_list, val_acc_list, val_loss_list = train_model(train_loader, val_loader, model, optimizer, criterion, num_epochs)
    results_dict[combination_dict_key] = [train_acc_list, train_loss_list, val_acc_list, val_loss_list]
pkl.dump(results_dict, open("/content/drive/My Drive/NLP_HW/HW1/Hyperparameter_Search_Results_Dictionary.p", "wb"))

1000, 100, 0, cat
Epoch:1, Validation Accuracy:62.5, Training Acc: 66.258
Epoch:2, Validation Accuracy:62.1, Training Acc: 69.704
Epoch:3, Validation Accuracy:61.3, Training Acc: 71.624
Epoch:4, Validation Accuracy:59.6, Training Acc: 73.599
Epoch:5, Validation Accuracy:61.4, Training Acc: 74.65
Epoch:6, Validation Accuracy:61.9, Training Acc: 74.926
Epoch:7, Validation Accuracy:60.8, Training Acc: 75.855
Epoch:8, Validation Accuracy:60.8, Training Acc: 76.117
Epoch:9, Validation Accuracy:61.5, Training Acc: 76.605
Epoch:10, Validation Accuracy:60.0, Training Acc: 76.878
1000, 100, 0, sum
Epoch:1, Validation Accuracy:56.1, Training Acc: 59.501
Epoch:2, Validation Accuracy:56.4, Training Acc: 62.795
Epoch:3, Validation Accuracy:57.1, Training Acc: 64.06
Epoch:4, Validation Accuracy:55.8, Training Acc: 65.05
Epoch:5, Validation Accuracy:55.7, Training Acc: 66.006
Epoch:6, Validation Accuracy:56.2, Training Acc: 66.378
Epoch:7, Validation Accuracy:55.0, Training Acc: 66.648
Epoch:8, Valid

In [0]:
import pandas as pd
import itertools

def tokenize_mnli(dataset):
    sentence1_tokens_dataset = {}
    sentence2_tokens_dataset = {}
    # we are keeping track of all tokens in dataset 
    # in order to create vocabulary later
    all_tokens = {}
    label_dict = {}
    genre_list = dataset['genre'].unique()
    tqdm.pandas()
    for genre in genre_list:
        sentence1_tokens_dataset[genre] = dataset.loc[dataset['genre']==genre, 'sentence1'].progress_apply(tokenize).tolist()
        sentence2_tokens_dataset[genre] = dataset.loc[dataset['genre']==genre, 'sentence2'].progress_apply(tokenize).tolist()
        label_dict[genre] = dataset.loc[dataset['genre']==genre, 'label'].tolist()

    return sentence1_tokens_dataset, sentence2_tokens_dataset, label_dict

print ("Tokenizing val data")
val_data = pd.read_csv("/content/drive/My Drive/NLP_HW/HW1/mnli_val.tsv",delimiter = "\t")
mnli_sentence1_val_tokens_dataset, mnli_sentence2_val_tokens_dataset, mnli_val_label_list = tokenize_mnli(val_data)
pkl.dump(mnli_sentence1_val_tokens_dataset, open("/content/drive/My Drive/NLP_HW/HW1/mnli_val_sentence1_tokens.p", "wb"))
pkl.dump(mnli_sentence2_val_tokens_dataset, open("/content/drive/My Drive/NLP_HW/HW1/mnli_val_sentence2_tokens.p", "wb"))
pkl.dump(mnli_val_label_list, open("/content/drive/My Drive/NLP_HW/HW1/mnli_val_label_list.p", "wb"))

print ("Tokenizing train data")
train_data = pd.read_csv("/content/drive/My Drive/NLP_HW/HW1/mnli_train.tsv",delimiter = "\t")
mnli_sentence1_train_tokens_dataset, mnli_sentence2_train_tokens_dataset, mnli_train_label_list = tokenize_mnli(train_data)
pkl.dump(mnli_sentence1_train_tokens_dataset, open("/content/drive/My Drive/NLP_HW/HW1/mnli_train_sentence1_tokens.p", "wb"))
pkl.dump(mnli_sentence2_train_tokens_dataset, open("/content/drive/My Drive/NLP_HW/HW1/mnli_train_sentence2_tokens.p", "wb"))
pkl.dump(mnli_train_label_list, open("/content/drive/My Drive/NLP_HW/HW1/mnli_train_label_list.p", "wb"))




  0%|          | 0/995 [00:00<?, ?it/s][A[A

  1%|          | 11/995 [00:00<00:09, 108.27it/s][A[A

Tokenizing val data




  2%|▏         | 21/995 [00:00<00:09, 104.29it/s][A[A

  3%|▎         | 32/995 [00:00<00:09, 103.68it/s][A[A

  4%|▍         | 43/995 [00:00<00:09, 104.29it/s][A[A

  5%|▌         | 53/995 [00:00<00:09, 102.31it/s][A[A

  6%|▋         | 63/995 [00:00<00:09, 100.45it/s][A[A

  7%|▋         | 73/995 [00:00<00:09, 99.71it/s] [A[A

  8%|▊         | 83/995 [00:00<00:09, 97.12it/s][A[A

  9%|▉         | 94/995 [00:00<00:09, 99.83it/s][A[A

 11%|█         | 105/995 [00:01<00:08, 101.20it/s][A[A

 12%|█▏        | 115/995 [00:01<00:08, 98.15it/s] [A[A

 13%|█▎        | 125/995 [00:01<00:08, 96.71it/s][A[A

 14%|█▎        | 135/995 [00:01<00:09, 95.33it/s][A[A

 15%|█▍        | 145/995 [00:01<00:08, 95.75it/s][A[A

 16%|█▌        | 155/995 [00:01<00:08, 93.38it/s][A[A

 17%|█▋        | 165/995 [00:01<00:08, 94.48it/s][A[A

 18%|█▊        | 175/995 [00:01<00:08, 92.30it/s][A[A

 19%|█▊        | 185/995 [00:01<00:08, 92.85it/s][A[A

 20%|█▉        | 196/995 [00:

Tokenizing train data




  0%|          | 10/4270 [00:00<00:50, 84.71it/s][A[A

  0%|          | 19/4270 [00:00<00:51, 83.30it/s][A[A

  1%|          | 27/4270 [00:00<00:52, 80.83it/s][A[A

  1%|          | 36/4270 [00:00<00:51, 81.73it/s][A[A

  1%|          | 46/4270 [00:00<00:49, 85.51it/s][A[A

  1%|▏         | 54/4270 [00:00<00:50, 83.25it/s][A[A

  1%|▏         | 63/4270 [00:00<00:50, 83.14it/s][A[A

  2%|▏         | 72/4270 [00:00<00:50, 82.99it/s][A[A

  2%|▏         | 81/4270 [00:00<00:50, 83.61it/s][A[A

  2%|▏         | 90/4270 [00:01<00:51, 81.51it/s][A[A

  2%|▏         | 100/4270 [00:01<00:49, 84.44it/s][A[A

  3%|▎         | 109/4270 [00:01<00:48, 85.09it/s][A[A

  3%|▎         | 118/4270 [00:01<00:50, 83.00it/s][A[A

  3%|▎         | 127/4270 [00:01<00:50, 82.40it/s][A[A

  3%|▎         | 137/4270 [00:01<00:49, 84.21it/s][A[A

  3%|▎         | 147/4270 [00:01<00:47, 86.86it/s][A[A

  4%|▎         | 156/4270 [00:01<00:49, 83.31it/s][A[A

  4%|▍         | 165/4

In [0]:
results_dict = pkl.load(open("/content/drive/My Drive/NLP_HW/HW1/Hyperparameter_Search_Results_Dictionary.p", "rb"))

['cook',
 'and',
 'then',
 'the',
 'next',
 'time',
 'it',
 'would',
 'be',
 'my',
 'turn',
 'and',
 'i',
 "'d",
 'try',
 'to',
 'outdo',
 'him',
 'and',
 'then',
 'he',
 "'d",
 'try',
 'to',
 'outdo',
 'me',
 'and',
 'we',
 'we',
 'was',
 'really',
 'a',
 'lot',
 'of',
 'fun',
 'and']

In [0]:
sentence2_val_tokens_dataset['telephone'][0]

['i',
 'would',
 'cook',
 'and',
 'then',
 'the',
 'next',
 'turn',
 'would',
 'be',
 'his',
 'and',
 'we',
 'would',
 'try',
 'to',
 'outdo',
 'each',
 'other',
 'but',
 'sometimes',
 'we',
 'would',
 'get',
 'in',
 'a',
 'fight',
 'over',
 'things']

In [0]:
list_k[1]

'fiction'