In [1]:
import numpy as np
import pandas as pd
import os
import pickle
from collections import Counter

import re
from unidecode import unidecode
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import string

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset

import matplotlib
import matplotlib.pyplot as plt

In [2]:
# Globals
PROJECT_DIR = '/home/mihir/Desktop/GitHub/nyu/nyu_1011/homeworks/hw1/'
DATA_DIR = os.path.join(PROJECT_DIR, 'data')
PLOTS_DIR = os.path.join(PROJECT_DIR, 'plots')
NUM_VAL = 5000
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

BATCH_SIZE = 64     # input batch size for training
N_EPOCHS = 50       # number of epochs to train
LR = 0.01           # learning rate
VOCAB_SIZE = 10000  # max vocab size
MAX_SENTENCE_LENGTH = 200
EMB_DIM = 100       # size of embedding


# Save index 0 for unk and 1 for pad
PAD_IDX = 0
UNK_IDX = 1

In [3]:
def load_dataset(dataset='train'):
    data_path = os.path.join(DATA_DIR, dataset)
    data = []
    for sentiment in ['pos', 'neg']:
        target = 1 if sentiment == 'pos' else 0
        data_target_path = os.path.join(data_path, sentiment)
        for file in os.listdir(data_target_path):
            file_path = os.path.join(data_target_path, file)
            if os.path.isfile(file_path):
                with open(file_path, 'r') as file_text:
                    text = file_text.readlines()[0].replace(')', ' ').replace('(', ' ')
                    text = re.sub('<[^<]+?>', '', text)
                    data.append([text, target])
    data = pd.DataFrame(data, columns=['text', 'sentiment'])
    data['text'] = data['text'].astype(str)
    data['sentiment'] = data['sentiment'].astype(int)
    return data

In [4]:
def split_train_val(train_data):
    train_data.sample(frac=1, random_state=1337)
    val_data = train_data[:NUM_VAL]
    train_data = train_data[NUM_VAL:]
    return train_data, val_data

In [5]:
def load_train_val_datasets(force=False):
    train_data_path = os.path.join(DATA_DIR, 'train.pkl')
    val_data_path = os.path.join(DATA_DIR, 'val.pkl')
    if not force and os.path.exists(train_data_path) and os.path.exists(val_data_path):
        train_data = pickle.load(open(train_data_path, 'rb'))
        val_data = pickle.load(open(val_data_path, 'rb'))
    else:
        train_data = load_dataset('train')
        train_data, val_data = split_train_val(train_data)
        pickle.dump(train_data, open(train_data_path, 'wb'))
        pickle.dump(val_data, open(val_data_path, 'wb'))
    return train_data.reset_index(drop=True), val_data.reset_index(drop=True)

In [6]:
def load_test_dataset(force=False):
    test_data_path = os.path.join(DATA_DIR, 'test.pkl')
    if not force and os.path.exists(test_data_path):
        test_data = pickle.load(open(test_data_path, 'rb'))
    else:
        test_data = load_dataset('test')
        pickle.dump(test_data, open(test_data_path, 'wb'))
    return test_data

In [7]:
train_data, val_data = load_train_val_datasets()
test_data = load_test_dataset()

In [8]:
print("Train dataset size is {}".format(len(train_data)))
print("Val dataset size is {}".format(len(val_data)))
print("Test dataset size is {}".format(len(test_data)))

Train dataset size is 20000
Val dataset size is 5000
Test dataset size is 25000


In [9]:
# Random sample from train dataset
print(train_data.iloc[np.random.randint(0, len(train_data)-1)])

text         It had all the clichés of movies of this type ...
sentiment                                                    0
Name: 16444, dtype: object


In [10]:
def prepare_stopwords():
    NEGATE = ["aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
     "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
     "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
     "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
     "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere","no",
     "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
     "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
     "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"]

    stopwords = STOP_WORDS.copy()
    for word in STOP_WORDS:
        if word in NEGATE:
            stopwords.remove(word)

    return stopwords

In [11]:
def clean_data(tokens, stopwords, punctuations):
    tokens = [tok.lemma_.lower().strip() for tok in tokens]
    tokens = [unidecode(tok) for tok in tokens if (tok not in stopwords and tok not in punctuations)]
    return tokens

In [12]:
# Load English tokenizer+tagger+parser+NER+word vectors, and punctuations and stopwords
tokenizer = spacy.load('en_core_web_lg', disable=['parser', 'tagger', 'ner'])
punctuations = string.punctuation
stopwords = prepare_stopwords()

In [13]:
def tokenize_dataset(data, tokenizer, stopwords, punctuations, dataset='train', force=False):
    tokens_data_path = os.path.join(DATA_DIR, '{}_tokenized.pkl'.format(dataset))
    all_train_tokens_path = os.path.join(DATA_DIR, 'all_train_tokens.pkl')
    if not force and os.path.exists(tokens_data_path):
        tokens_data = pickle.load(open(tokens_data_path, 'rb'))
        if dataset == 'train':
            all_train_tokens = pickle.load(open(all_train_tokens_path, 'rb'))
            return tokens_data, all_train_tokens
        return tokens_data
    else:
        parsed_data = tokenizer.pipe(data['text'], batch_size=512, n_threads=-1)
        tokens_data = pd.Series(parsed_data).apply(clean_data, args=(stopwords, punctuations))
        pickle.dump(tokens_data, open(tokens_data_path, 'wb'))
        if dataset == 'train':
            all_train_tokens = np.hstack(tokens_data)
            pickle.dump(all_train_tokens, open(all_train_tokens_path, 'wb'))
            return tokens_data, all_train_tokens
    return tokens_data

In [14]:
train_data_tokens, all_train_tokens = tokenize_dataset(train_data, tokenizer, stopwords, punctuations, dataset='train')
val_data_tokens = tokenize_dataset(val_data, tokenizer, stopwords, punctuations, dataset='val')
test_data_tokens = tokenize_dataset(test_data, tokenizer, stopwords, punctuations, dataset='test')

In [15]:
train_data_tokens.head()

0    [film, watch, high, school, spanish, class, fa...
1    [know, absolutely, nothing, ireland, love, lef...
2    [watch, star, josie, lawrence, knewfrom, line,...
3    [example, film, not, good, receive, stand, rai...
4    [horror, not, educational, film, genre, huh, t...
dtype: object

In [16]:
all_train_tokens

array(['film', 'watch', 'high', ..., 'wrong', 'val', 'kilmer'],
      dtype='<U74')

In [17]:
print("Total number of tokens in train dataset = {}".format(len(all_train_tokens)))

Total number of tokens in train dataset = 2175805


In [18]:
def build_vocabulary(all_train_tokens, vocab_size):
    '''
    Returns:
    id2token: list of tokens, where id2token[i] returns token that corresponds to token i
    token2id: dictionary where keys represent tokens and corresponding values represent indices
    '''
    
    token_counter = Counter(all_train_tokens)
    vocabulary, count = zip(*token_counter.most_common(vocab_size))
    id2token = list(vocabulary)
    token2id = dict(zip(vocabulary, range(2, 2+len(vocabulary)))) 
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

In [19]:
token2id, id2token = build_vocabulary(all_train_tokens, VOCAB_SIZE)

In [20]:
# Check the dictionary by loading random token from it

random_token_id = np.random.randint(0, len(id2token)-1)
random_token = id2token[random_token_id]

print("Token id {} ; token {}".format(random_token_id, id2token[random_token_id]))
print("Token {}; token id {}".format(random_token, token2id[random_token]))

Token id 2566 ; token usa
Token usa; token id 2566


In [21]:
# Convert token to id in the dataset
def token2index_dataset(tokens_data):
    indices_data = tokens_data.apply(lambda tokens: [token2id[token] if token in token2id else UNK_IDX \
                                                     for token in tokens])
    return indices_data

In [22]:
train_data_indices = token2index_dataset(train_data_tokens)
val_data_indices = token2index_dataset(val_data_tokens)
test_data_indices = token2index_dataset(test_data_tokens)

In [23]:
class IMDBReviewsDataset(Dataset):
    def __init__(self, data_list, target_list):
        """
        @param data_list: list of review tokens
        @param target_list: list of review targets
        """
        self.data_list = data_list
        self.target_list = target_list
        assert (len(self.data_list) == len(self.target_list))

    def __getitem__(self, key):
        """
        Triggered when dataset[i] is called
        """
        token_idx = self.data_list[key][:MAX_SENTENCE_LENGTH]
        label = self.target_list[key]
        return [token_idx, len(token_idx), label]

    def __len__(self):
        return len(self.data_list)

In [24]:
def imdbreviews_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    data_list = []
    label_list = []
    length_list = []
    #print("collate batch: ", batch[0][0])
    #batch[0][0] = batch[0][0][:MAX_SENTENCE_LENGTH]
    for datum in batch:
        label_list.append(datum[2])
        length_list.append(datum[1])
    # padding
    for datum in batch:
        padded_vec = np.pad(np.array(datum[0]),
                            pad_width=((0, MAX_SENTENCE_LENGTH-datum[1])),
                            mode="constant", constant_values=0)
        data_list.append(padded_vec)
    return [torch.from_numpy(np.array(data_list)), torch.LongTensor(length_list), torch.LongTensor(label_list)]

In [25]:
train_dataset = IMDBReviewsDataset(train_data_indices, train_data['sentiment'])
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=imdbreviews_collate_func,
                                           shuffle=True)

val_dataset = IMDBReviewsDataset(val_data_indices, val_data['sentiment'])
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                         batch_size=BATCH_SIZE,
                                         collate_fn=imdbreviews_collate_func,
                                         shuffle=True)

test_dataset = IMDBReviewsDataset(test_data_indices, test_data['sentiment'])
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=BATCH_SIZE,
                                          collate_fn=imdbreviews_collate_func,
                                          shuffle=False)

In [26]:
class BagOfWords(nn.Module):
    """
    BagOfWords classification model
    """
    def __init__(self, vocab_size, emb_dim):
        """
        @param vocab_size: size of the vocabulary
        @param emb_dim: size of the word embedding
        """
        super(BagOfWords, self).__init__()

        # Pay attention to padding_idx
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim, 2)

    def forward(self, data, length):
        """
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        """
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        out /= length.view(length.size()[0], 1).expand_as(out).float()

        # Return logits
        out = self.linear(out.float())
        return out

In [27]:
# Model, Criterion, and Optimizer
model = BagOfWords(len(id2token), EMB_DIM).to(DEVICE)
criterion = nn.CrossEntropyLoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

In [28]:
# Function for testing the model
def test_model(dataloader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0.
    total = 0.
    model.eval()
    with torch.no_grad():
        for data_batch, lengths_batch, labels_batch in dataloader:
            data_batch, lengths_batch, labels_batch = data_batch.to(DEVICE), lengths_batch.to(DEVICE), labels_batch.to(DEVICE)
            outputs = nn.functional.softmax(model(data_batch, lengths_batch), dim=1)
            predicted = outputs.max(1, keepdim=True)[1]

            total += labels_batch.size(0)
            correct += predicted.eq(labels_batch.view_as(predicted)).sum().item()
    return (100 * correct / total)

In [29]:
def run_training(model, train_loader, val_loader, criterion, optimizer, n_epochs):
    train_loss_history, val_accuracies = [], []
    for epoch in range(1, n_epochs+1):
        for batch_idx, (data_batch, lengths_batch, labels_batch) in enumerate(train_loader):
            data_batch, lengths_batch, labels_batch = data_batch.to(DEVICE), lengths_batch.to(DEVICE), labels_batch.to(DEVICE)
            model.train()
            optimizer.zero_grad()
            outputs = model(data_batch, lengths_batch)
            loss = criterion(outputs, labels_batch)
            loss.backward()
            optimizer.step()

            train_loss_history.append(loss.item())

            if batch_idx == len(train_loader)-1:    # validate every 100 iterations
                val_accuracy = test_model(val_loader, model)
                print('Epoch: [{}/{}], Step: [{}/{}], Validation Accuracy: {}'\
                      .format(epoch, n_epochs, batch_idx+1, len(train_loader), val_accuracy))
                val_accuracies.append(val_accuracy)
    return train_loss_history, val_accuracies

In [None]:
train_loss_history, val_accuracies = run_training(model, train_loader, val_loader, criterion, optimizer, N_EPOCHS)

In [None]:
train_loss_history = pd.DataFrame({
    'train': train_loss_history
})
train_loss_history.plot(alpha=0.5)

In [None]:
val_accuracies = pd.DataFrame({
    'val': val_accuracies
})
val_accuracies.plot(alpha=0.5)

In [None]:
print("After training for {} epochs:".format(N_EPOCHS))
print("Val Accuracy: {}".format(test_model(val_loader, model)))
print("Test Accuracy: {}".format(test_model(test_loader, model)))

In [30]:
# BATCH_SIZEs = pd.DataFrame([16, 32, 64, 128], columns=['batch_size'])
BATCH_SIZEs = pd.DataFrame([64], columns=['batch_size'])
LRs = pd.DataFrame([1e-2, 1e-3, 1e-5], columns=['lr'])
EMB_DIMs = pd.DataFrame([100, 256, 512, 800, 1024], columns=['emb_dim'])
VOCAB_SIZEs = pd.DataFrame([10000, 20000, 50000], columns=['vocab_size'])
OPTIMIZERS = pd.DataFrame(['adam', 'sgd'], columns=['optimizer'])
# MAX_SENTENCE_LENGTHs = pd.DataFrame([100, 256, 512], columns=['max_sent_length'])
MAX_SENTENCE_LENGTHs = pd.DataFrame([100], columns=['max_sent_length'])

In [34]:
def hyperparameter_tuning():
    try:
        cv_results = pd.DataFrame(columns=['batch_size', 'lr', 'emb_dim', 'vocab_size', \
                                           'max_sent_length', 'optimizer', \
                                           'train_loss_hist', 'val_accuracies', 'max_accuracy'])

        params = pd.DataFrame([1]*len(EMB_DIMs), columns=['key'])
        for df in BATCH_SIZEs, LRs, EMB_DIMs, VOCAB_SIZEs, MAX_SENTENCE_LENGTHs, OPTIMIZERS:
            df['key'] = 1
            params = pd.merge(params, df, on='key')
        params = params.drop('key', axis=1).drop_duplicates()

        for row in params.iterrows():
            print('\n', params.iloc[row[0]:row[0]+1])

            batch_size, emb_dim, vocab_size, max_sent_length = int(row[1]['batch_size']), \
                int(row[1]['emb_dim']), int(row[1]['vocab_size']), int(row[1]['max_sent_length'])
            lr = row[1]['lr']

            token2id, id2token = build_vocabulary(all_train_tokens, vocab_size)

            train_data_indices = token2index_dataset(train_data_tokens)
            val_data_indices = token2index_dataset(val_data_tokens)

            train_dataset = IMDBReviewsDataset(train_data_indices, train_data['sentiment'])
            train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                       batch_size=batch_size,
                                                       collate_fn=imdbreviews_collate_func,
                                                       shuffle=True)

            val_dataset = IMDBReviewsDataset(val_data_indices, val_data['sentiment'])
            val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                                     batch_size=batch_size,
                                                     collate_fn=imdbreviews_collate_func,
                                                     shuffle=True)

            model = BagOfWords(len(id2token), emb_dim).to(DEVICE)
            criterion = nn.CrossEntropyLoss()
            optimizer = {'adam': torch.optim.Adam(model.parameters(), lr=lr), \
                          'sgd': torch.optim.SGD(model.parameters(), lr=lr)}[row[1]['optimizer']]
                
            train_loss_history, val_accuracies = run_training(model, train_loader, val_loader, \
                                                              criterion, optimizer, N_EPOCHS)
            max_accuracy = np.max(val_accuracies)

            result = pd.DataFrame([[batch_size, lr, emb_dim, vocab_size, max_sent_length, train_loss_history, \
                                    val_accuracies, max_accuracy]], columns=cv_results.columns)
            cv_results = cv_results.append(result)
        
    except KeyboardInterrupt:
        return cv_results
    
    return cv_results

In [33]:
cv_results = hyperparameter_tuning()

(90, 6)
> <ipython-input-32-7759add29f4a>(13)hyperparameter_tuning()
-> for row in params.iterrows():
(Pdb) params.head()
   batch_size    lr  emb_dim  vocab_size  max_sent_length optimizer
0          64  0.01      100       10000              100      adam
1          64  0.01      100       10000              100       sgd
2          64  0.01      100       20000              100      adam
3          64  0.01      100       20000              100       sgd
4          64  0.01      100       50000              100      adam
(Pdb) exit()


BdbQuit: 

In [None]:
cv_results

In [None]:
cv_results = hyperparameter_tuning()
pickle.dump(cv_results, open(os.path.join(DATA_DIR, 'cv_results.pkl'), 'wb'))