<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports-&amp;-Inits" data-toc-modified-id="Imports-&amp;-Inits-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports &amp; Inits</a></span></li><li><span><a href="#Data-&amp;-Model" data-toc-modified-id="Data-&amp;-Model-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data &amp; Model</a></span></li><li><span><a href="#Training" data-toc-modified-id="Training-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Training</a></span><ul class="toc-item"><li><span><a href="#Results" data-toc-modified-id="Results-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Results</a></span></li></ul></li><li><span><a href="#Testing" data-toc-modified-id="Testing-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Testing</a></span><ul class="toc-item"><li><span><a href="#Ignite-Testing" data-toc-modified-id="Ignite-Testing-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Ignite Testing</a></span></li><li><span><a href="#NLPBook-Testing" data-toc-modified-id="NLPBook-Testing-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>NLPBook Testing</a></span></li></ul></li><li><span><a href="#Trained-Embeddings" data-toc-modified-id="Trained-Embeddings-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Trained Embeddings</a></span></li></ul></div>

# CBOW Training with Frankenstein Text

## Imports & Inits

In [None]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# re-imported before each cell execution and edits to them take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import torch

from torch import nn
from torch import optim
from torch.utils.data import DataLoader

In [None]:
from ignite.engine import Events, create_supervised_evaluator
from ignite.metrics import Accuracy, Loss
from ignite.contrib.handlers import ProgressBar

In [None]:
from consts import consts
from cbow.dataset import CBOWDataset, DataContainer
from cbow.model import CBOWClassifier, ModelContainer
from cbow.trainer import IgniteTrainer
consts

## Data & Model

In [None]:
# Read the processed dataset CSV and hand it to the project's DataContainer.
# NOTE(review): `is_load=True` presumably loads an existing vectorizer from
# `consts.vectorizer_file` rather than building one -- confirm in cbow.dataset.
df = pd.read_csv(consts.proc_dataset_csv)
dc = DataContainer(df, consts.vectorizer_file, consts.batch_size, is_load=True)

# Model, loss and optimizer. The classifier is sized from the container's
# vocabulary size and the configured embedding size.
classifier = CBOWClassifier(dc.vocabulary_size, consts.embedding_size)
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=consts.learning_rate)
# Halve the learning rate (factor=0.5) once the monitored quantity has stopped
# decreasing (mode='min') for more than one epoch (patience=1).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='min', factor=0.5, patience=1)

# Bundle the training objects for the trainer.
mc = ModelContainer(classifier, optimizer, loss_func, scheduler)

# Ignite progress bar and the metrics shared by training and testing cells;
# `metrics` is reused later when the test evaluator is created.
pbar = ProgressBar(persist=True)
metrics = {'accuracy': Accuracy(), 'loss': Loss(loss_func)}

## Training

In [None]:
# Build the project's Ignite-based trainer from the model/data containers,
# the config, the progress bar and the metrics, then start the run.
# NOTE(review): the later cells read `consts.metric_file` and
# `consts.work_dir/'cbow_classifier.pth'`; presumably this run writes both --
# confirm in cbow.trainer.
ig = IgniteTrainer(mc, dc, consts, pbar, metrics)
ig.run()

### Results

In [None]:
# Load the per-epoch metrics written during training and keep every row
# except the last one.
# NOTE(review): the reason the final row is discarded is not visible here --
# confirm against the code that writes `consts.metric_file`.
training_metrics = pd.read_csv(consts.metric_file).iloc[:-1]

In [None]:
# Side-by-side loss and accuracy curves over the training epochs.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15,5))

# One entry per panel: (columns to draw, panel title, y-axis label).
panels = [
  (['training_loss', 'validation_loss'], 'Training and validation loss', 'loss'),
  (['training_acc', 'validation_acc'], 'Training and validation accuracy', 'accuracy'),
]

for metric_ax, (columns, title, ylabel) in zip(axes, panels):
  training_metrics.plot(x='epoch', y=columns, kind='line', title=title, ax=metric_ax)
  # Label the y-axis so each panel can be read on its own; the x-axis label
  # ('epoch') is already set by pandas from the `x` column.
  metric_ax.set_ylabel(ylabel)

# Ending the cell with a statement (the loop) rather than a plot call keeps the
# stray Axes repr out of the cell output; the figure itself still renders.

## Testing

### Ignite Testing

In [None]:
# Restore the saved classifier weights and build an Ignite evaluator around the
# model, reusing the notebook-level `metrics` dict.
# `map_location='cpu'` deserializes the tensors onto the CPU first, so a
# checkpoint written on a GPU machine still loads on a CPU-only one;
# `load_state_dict` then copies the values into the classifier's existing
# parameters, wherever those live.
state_dict = torch.load(consts.work_dir/'cbow_classifier.pth', map_location='cpu')
classifier.load_state_dict(state_dict)
evaluator = create_supervised_evaluator(classifier, metrics=metrics)

@evaluator.on(Events.COMPLETED)
def log_testing_results(engine):
  """
    Print the test loss and accuracy when the evaluator run completes

    engine: the Ignite engine that fired the event; its `state.metrics`
            mapping is read for the 'loss' and 'accuracy' entries
  """
  # Local name chosen so it does not shadow the notebook-level `metrics` dict.
  results = engine.state.metrics
  print(f"Test loss: {results['loss']:0.3f}")
  print(f"Test accuracy: {results['accuracy']:0.3f}")

In [None]:
# Run the evaluator over the test data loader; the COMPLETED handler
# registered on the evaluator prints the resulting loss and accuracy.
evaluator.run(dc.test_dl)

### NLPBook Testing

In [None]:
def compute_accuracy(y_pred, y_target):
  """
    Percentage of rows whose highest-scoring column matches the target

    y_pred:   2-D tensor of scores, one row per sample (reduced over dim=1)
    y_target: tensor of target indices, compared element-wise to the
              predicted indices
    returns:  accuracy as a float in the range 0-100
  """
  predicted = torch.max(y_pred, dim=1).indices
  hits = (predicted == y_target).sum().item()
  return hits / len(predicted) * 100

In [None]:
# Manual evaluation over the test loader, keeping running means of the
# per-batch loss and accuracy.
running_loss = 0.
running_acc = 0.

classifier.eval()
# Gradients are not needed for evaluation; disabling autograd avoids building
# a graph for every batch and does not change the computed values.
with torch.no_grad():
  for i, batch in enumerate(dc.test_dl):
    x,y = batch
    y_pred = classifier(x_in=x)

    loss = loss_func(y_pred, y)
    loss_t = loss.item()
    # Incremental mean over batches: after batch i this equals the average of
    # the first i+1 batch losses. Every batch is weighted equally, regardless
    # of how many samples it holds.
    running_loss += (loss_t-running_loss)/(i+1)

    acc_t = compute_accuracy(y_pred, y)
    running_acc += (acc_t-running_acc)/(i+1)

In [None]:
# Report the running means from the manual test loop, one per line.
print(f"Test loss: {running_loss:0.3f}", f"Test acc: {running_acc:0.3f}%", sep="\n")

## Trained Embeddings

In [None]:
def pretty_print(results):
  """
    Print one line per result as "...<distance> - <word>"

    results: iterable of items where item[0] is the word and item[1] is its
             distance (formatted with two decimals)
  """
  for entry in results:
    word, distance = entry[0], entry[1]
    print(f"...{distance:0.2f} - {word}")
    
def get_closest(target_word, idx_word_bidict, embedings, n=5):
  """
    Get the n closest words to the target word

    target_word:     word to look up; it is lower-cased before the lookup
    idx_word_bidict: index -> word mapping that also exposes the reverse
                     word -> index mapping through `.inverse`
    embedings:       embedding matrix indexed by word index (the parameter
                     name keeps its original spelling so existing callers
                     keep working)
    n:               maximum number of neighbours to return

    returns: list of at most n (word, distance) tuples sorted by increasing
             distance (as computed by torch.dist); the '<MASK>' token and the
             target word itself are never included
    raises:  KeyError if the lower-cased target word is not in the vocabulary
  """
  # Use the lower-cased token for both the lookup and the self-exclusion so a
  # capitalised query cannot match itself at distance 0.
  target_token = target_word.lower()
  # Read the matrix that was passed in, not a notebook-level global.
  target_embedding = embedings[idx_word_bidict.inverse[target_token]]

  # Distance from the target to every other real word.
  distances = [
    (word, torch.dist(target_embedding, embedings[idx]))
    for idx, word in idx_word_bidict.items()
    if word != '<MASK>' and word != target_token
  ]

  # The target is already excluded above, so the nearest neighbour is the first
  # sorted entry and exactly n entries are kept.
  return sorted(distances, key=lambda pair: pair[1])[:n]

In [None]:
# Show the nearest neighbours in embedding space for a few hand-picked words.
target_words = ['frankenstein', 'monster', 'hello', 'science', 'sickness', 'lonely', 'happy']
embeddings = classifier.embedding.weight.data
idx_word_bidict = dc.vocabulary.idx_token_bidict

for target_word in target_words:
  print(f"========={target_word}============")
  try:
    # Probe with the lower-cased word, which is what get_closest looks up.
    idx_word_bidict.inverse[target_word.lower()]
  except KeyError:
    # Only a missing vocabulary entry is expected here; any other error
    # should surface instead of being reported as "not in vocabulary".
    print(f"Word {target_word} not in vocabulary")
    continue
  pretty_print(get_closest(target_word, idx_word_bidict, embeddings, n=5))

In [None]:
# Interactive lookup: ask for a word and print its nearest neighbours.
word = input("Enter a word: ")
embeddings = classifier.embedding.weight.data
idx_word_bidict = dc.vocabulary.idx_token_bidict

try:
  # Probe with the lower-cased word, which is what get_closest looks up.
  idx_word_bidict.inverse[word.lower()]
except KeyError:
  # Report an unknown word instead of ending the cell with a traceback.
  print(f"Word {word} not in vocabulary")
else:
  pretty_print(get_closest(word, idx_word_bidict, embeddings))