In [1]:
import os
from argparse import Namespace
from collections import Counter
import json
import re
import string

import numpy as np
import pandas as pd
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm.notebook import tqdm_notebook

In [2]:
from helper import Helper
from cbow_dataset import CBOWDataset
from cbow_classifier import CBOWClassifier
from cbow_vectorizer import CBOWVectorizer

In [3]:
args = Namespace(
    # Data and Path information
    cbow_csv="../Data/frankenstein_with_splits.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="model/",
    # Model hyper parameters
    embedding_size=50,
    # Training hyper parameters
    seed=1337,
    num_epochs=100,
    learning_rate=0.001,
    batch_size=32,
    early_stopping_criteria=5,
    # Runtime options
    cuda=True,
    catch_keyboard_interrupt=True,
    reload_from_files=False,
    expand_filepaths_to_save_dir=True
)

if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))
    

# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda:0" if args.cuda else "cpu")
    
print("Using CUDA: {}".format(args.cuda))
print("Using CUDA: {}".format(args.device))


# Set seed for reproducibility
Helper.set_seed_everywhere(args.seed, args.cuda)

# handle dirs
Helper.handle_dirs(args.save_dir)

Expanded filepaths: 
	model/vectorizer.json
	model/model.pth
Using CUDA: True
Using CUDA: cuda:0


Initializations

In [4]:
if args.reload_from_files:
    print("Loading dataset and loading vectorizer")
    dataset = CBOWDataset.load_dataset_and_load_vectorizer(args.cbow_csv,
                                                           args.vectorizer_file)
else:
    print("Loading dataset and creating vectorizer")
    dataset = CBOWDataset.load_dataset_and_make_vectorizer(args.cbow_csv)
    dataset.save_vectorizer(args.vectorizer_file)
    
vectorizer = dataset.get_vectorizer()

classifier = CBOWClassifier(vocabulary_size=len(vectorizer.cbow_vocab), 
                            embedding_size=args.embedding_size)

Loading dataset and creating vectorizer


Training loop

In [5]:
classifier = classifier.to(args.device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)

scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='min', factor=0.5, patience=1)

train_state = Helper.make_train_state(args)

epoch_bar = tqdm_notebook(desc='Training routine', total=args.num_epochs, position=0)

dataset.set_split('train')
train_bar = tqdm_notebook(desc='split=train',
                          total=dataset.get_num_batches(args.batch_size), 
                          position=1, 
                          leave=True)

dataset.set_split('val')
val_bar = tqdm_notebook(desc='split=val',
                        total=dataset.get_num_batches(args.batch_size), 
                        position=1, 
                        leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_ndex'] = epoch_index
        
        # iterate over training ds
        dataset.set_split('train')
        batch_generator = Helper.generate_batches(dataset, batch_size=args.batch_size, device=args.device)
        
        running_loss = 0.0
        running_acc = 0.0
        
        # train mode on
        classifier.train()
        
        for batch_index, batch_dict in enumerate(batch_generator):
            # training routine in these five steps
            
            # ---------------------------------------------------
            
            #step 1 -> zero the grads
            optimizer.zero_grad()
            
            # step 2 -> get logits or y_pred
            y_pred = classifier(x=batch_dict['x_data'])
            
            # step 3 -> compute loss 
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)
            
            # step 4 -> use optimzzer to take gradient step
            optimizer.step()
            
            # compute the accuracy
            acc_t = Helper.compute_accuracy(y_pred,batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)
            
            # update the bar
            train_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
            train_bar.update()
        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)
        
        # iterating over val dataset
        dataset.set_split('val')
        batch_generator = Helper.generate_batches(dataset, batch_size=args.batch_size, device=args.device)
        running_loss = 0.
        running_acc = 0.
        
        classifier.eval()
        
        for batch_index, batch_dict in enumerate(batch_generator):

            # compute the output
            y_pred =  classifier(x=batch_dict['x_data']) 

            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # compute the accuracy
            acc_t = Helper.compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)
            
            val_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
            val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = Helper.update_train_state(args=args, model=classifier, train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
        
        
except KeyboardInterrupt:
    print('Exiting Loop')

Training routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/1984 [00:00<?, ?it/s]

split=val:   0%|          | 0/425 [00:00<?, ?it/s]

Exiting Loop


In [6]:
#compute the loss & accuracy on the test set using the best available model

classifier.load_state_dict(torch.load(train_state['model_filename']))
classifier = classifier.to(args.device)
loss_func = nn.CrossEntropyLoss()

dataset.set_split('test')
batch_generator = Helper.generate_batches(dataset, batch_size=args.batch_size, device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred =  classifier(x=batch_dict['x_data'])
    
    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # compute the accuracy
    acc_t = Helper.compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [7]:
print("Test loss: {};".format(train_state['test_loss']))
print("Test Accuracy: {}".format(train_state['test_acc']))

Test loss: 10.068104887569643;
Test Accuracy: 0.0


Trained embeddings 

In [8]:
word = input('Enter a word: ')
embeddings = classifier.embedding.weight.data
word_to_idx = vectorizer.cbow_vocab._token_to_idx
Helper.pretty_print(Helper.get_closest(word, word_to_idx, embeddings, n=5))

...[7.60] - permit
...[7.92] - kid
...[7.95] - saw
...[8.01] - truly
...[8.04] - ultimately
...[8.07] - confused


In [9]:
embeddings[word_to_idx['Monster'.lower()]]

tensor([ 0.8318,  1.1497, -1.3499, -1.1848, -1.7517, -2.0525, -0.5165, -0.3856,
         1.6346, -2.3498,  1.2516,  1.2691, -0.2066,  0.3899,  0.8535, -0.0461,
         1.4486,  0.6616,  1.2856,  1.0166, -0.4041, -2.2251, -0.7742,  0.0843,
        -0.5036,  0.4254, -1.4129, -0.2728,  2.2198, -1.7673,  1.3507,  1.7860,
         1.2807,  0.0284, -0.5660,  0.6268, -0.3849, -1.0777, -1.3215, -0.5572,
        -1.4466,  0.2905, -1.1010, -0.0777, -0.7591,  0.7025,  1.4143, -0.3745,
         0.5387,  0.5286], device='cuda:0')

In [12]:
target_words = ['frankenstein', 'monster', 'science', 'sickness', 'lonely', 'happy']

embeddings = classifier.embedding.weight.data
word_to_idx = vectorizer.cbow_vocab._token_to_idx

for target_word in target_words: 
    print(f"======={target_word}=======")
    if target_word not in word_to_idx:
        print("Not in vocabulary")
        continue
    Helper.pretty_print(Helper.get_closest(target_word, word_to_idx, embeddings, n=5))

...[7.58] - prejudiced
...[7.87] - shrivelled
...[7.92] - liable
...[7.92] - gush
...[7.93] - tour
...[7.95] - recollect
...[7.92] - kid
...[7.95] - saw
...[8.01] - truly
...[8.04] - ultimately
...[8.07] - confused
...[8.08] - cares
...[7.13] - impression
...[7.22] - mutual
...[7.25] - darkened
...[7.26] - mist
...[7.39] - swelling
...[7.45] - tempted
...[6.42] - while
...[6.56] - literally
...[6.61] - foundations
...[6.63] - probabilities
...[6.70] - consoles
...[6.71] - awoke
...[6.97] - moonlight
...[7.00] - unveiled
...[7.27] - heartily
...[7.28] - ought
...[7.32] - undiscovered
...[7.35] - bed
...[6.56] - bottom
...[6.59] - chimney
...[6.61] - injury
...[6.72] - evening
...[6.74] - lingered
...[6.79] - chivalry
