<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports-&amp;-Inits" data-toc-modified-id="Imports-&amp;-Inits-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports &amp; Inits</a></span></li><li><span><a href="#Setup" data-toc-modified-id="Setup-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Setup</a></span><ul class="toc-item"><li><span><a href="#Data" data-toc-modified-id="Data-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Data</a></span></li><li><span><a href="#Model" data-toc-modified-id="Model-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Model</a></span></li></ul></li><li><span><a href="#Training" data-toc-modified-id="Training-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Training</a></span></li><li><span><a href="#Testing" data-toc-modified-id="Testing-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Testing</a></span><ul class="toc-item"><li><span><a href="#Ignite-Testing" data-toc-modified-id="Ignite-Testing-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Ignite Testing</a></span></li><li><span><a href="#NLPBook-Testing" data-toc-modified-id="NLPBook-Testing-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>NLPBook Testing</a></span></li></ul></li><li><span><a href="#Inference" data-toc-modified-id="Inference-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Inference</a></span><ul class="toc-item"><li><span><a href="#Predict-Rating" data-toc-modified-id="Predict-Rating-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Predict Rating</a></span></li></ul></li><li><span><a href="#Interpretablity" data-toc-modified-id="Interpretablity-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Interpretablity</a></span></li></ul></div>

# Yelp Review Classifier from NLP Book

## Imports & Inits

In [None]:
# Reload edited project modules automatically before each cell executes,
# so changes under `yelp/` are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import torch
import pdb

from pathlib import Path
from torch import nn
from torch.nn import functional as F
from torch import optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

In [None]:
from ignite.engine import Events, create_supervised_evaluator
from ignite.metrics import Accuracy, Loss
from ignite.contrib.handlers import ProgressBar

In [None]:
from yelp.dataset import ProjectDataset
from yelp.trainer import YelpTrainer
from yelp.model import Classifier
from yelp.args import args

In [None]:
# Locations of the Yelp data; file and directory names come from `args`.
path = Path('./data/yelp')
review_csv = path/args.sample_file
scratch = path/args.workdir_name
vectorizer_path = scratch/args.vectorizer_fname
# Stored on the shared args object — presumably read by the trainer when saving (TODO confirm).
args.save_dir = scratch

# Full review table; its `split` column is used later to select train/val/test rows.
df = pd.read_csv(review_csv)

In [None]:
# Override the configured epoch count for this run, then display the resulting args.
args.num_epochs=2
args

## Setup

### Data

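Run the next cell only once, to create the vectorizer from the training split and save it to `vectorizer_path`; afterwards leave it commented out, since the cell below loads the saved vectorizer.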

In [None]:
# train_ds = ProjectDataset.load_data_and_create_vectorizer(df.loc[df['split'] == 'train'])
# train_ds.save_vectorizer(vectorizer_path)

In [None]:
# Training split: reshuffled by the loader, and the trailing partial batch is
# dropped so every optimisation step sees a full batch.
train_df = df.loc[df['split'] == 'train']
train_ds = ProjectDataset.load_data_and_vectorizer(train_df, vectorizer_path)
vectorizer = train_ds.get_vectorizer()
train_dl = DataLoader(train_ds, args.batch_size, shuffle=True, drop_last=True)

# Evaluation splits: keep every sample (drop_last=False) and a fixed order
# (shuffle=False) so reported metrics cover the whole split and are repeatable.
# NOTE(review): if the model squeezes its output, a trailing batch of size 1
# would produce a 0-d prediction — confirm the loss/metric code copes with that.
val_df = df.loc[df['split'] == 'val']
val_ds = ProjectDataset.load_data_and_vectorizer(val_df, vectorizer_path)
val_dl = DataLoader(val_ds, args.batch_size, shuffle=False, drop_last=False)

test_df = df.loc[df['split'] == 'test']
test_ds = ProjectDataset.load_data_and_vectorizer(test_df, vectorizer_path)
test_dl = DataLoader(test_ds, args.batch_size, shuffle=False, drop_last=False)

### Model

In [None]:
def bce_logits_wrapper(output):
    """Turn a (logits, targets) pair into (hard 0/1 predictions, targets).

    Args:
        output: two-item sequence ``(y_pred, y)`` where ``y_pred`` holds raw logits.

    Returns:
        tuple: ``(labels, y)`` where ``labels`` is a long tensor that is 1
        wherever ``sigmoid(y_pred) > 0.5`` and 0 elsewhere; ``y`` is passed
        through untouched.
    """
    logits, targets = output
    probabilities = torch.sigmoid(logits)
    hard_labels = probabilities.gt(0.5).to(torch.long)
    return hard_labels, targets

In [None]:
# Model sized to the review vocabulary, with its optimizer, LR scheduler and loss.
classifier = Classifier(num_features=len(vectorizer.review_vocab))
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=1,
)
loss_func = nn.BCEWithLogitsLoss()

# Progress bar and the metrics shared by training and evaluation.
pbar = ProgressBar(persist=True)
metrics = {
    'accuracy': Accuracy(output_transform=bce_logits_wrapper),
    'loss': Loss(loss_func),
}

## Training

In [None]:
# Hand the model, optimizer, loss, data loaders, config, progress bar and
# metrics to the project trainer and start training.
# NOTE(review): `scheduler` is created earlier but not passed here — confirm
# whether it is meant to be unused.
yelp_trainer = YelpTrainer(classifier, optimizer, loss_func, train_dl, val_dl, args, pbar, metrics)
yelp_trainer.run()

## Testing

### Ignite Testing

In [None]:
# Restore saved weights. map_location='cpu' lets a checkpoint written from a
# CUDA model load on a machine without a GPU; load_state_dict then copies the
# values into the classifier's existing parameters.
state_dict = torch.load(scratch/'yelp_classifier_lite', map_location='cpu')
classifier.load_state_dict(state_dict)

# Ignite evaluator that runs the classifier and computes the `metrics` dict.
evaluator = create_supervised_evaluator(classifier, metrics=metrics)

In [None]:
@evaluator.on(Events.COMPLETED)
def log_testing_results(engine):
  """Print the loss and accuracy stored on the engine once its run completes.

  Args:
      engine: the engine that fired the event; its ``state.metrics`` mapping
          is read for the 'loss' and 'accuracy' entries.
  """
  final_metrics = engine.state.metrics
  print(f"Test loss: {final_metrics['loss']:0.3f}")
  print(f"Test accuracy: {final_metrics['accuracy']:0.3f}")

In [None]:
# Evaluate on the test loader; the COMPLETED handler registered on the
# evaluator prints the resulting loss and accuracy.
evaluator.run(test_dl)

### NLPBook Testing

In [None]:
def compute_accuracy(y_pred, y):
  """Return the percentage of thresholded predictions that equal the labels.

  Args:
      y_pred (torch.Tensor): raw logits; a prediction counts as positive when
          ``sigmoid(logit) > 0.5``.
      y (torch.Tensor): labels, cast to uint8 before comparison.

  Returns:
      float: percent of matching elements, or 0.0 when there are no elements.

  Raises:
      ValueError: if ``y_pred`` and ``y`` do not hold the same number of elements.
  """
  # Flatten both sides: a 0-d prediction has no len(), and comparing (N, 1)
  # with (N,) would silently broadcast to (N, N) and inflate the count.
  labels = y.reshape(-1).type(torch.uint8)
  predicted = (torch.sigmoid(y_pred) > 0.5).reshape(-1)
  if predicted.numel() != labels.numel():
    raise ValueError(
      f"prediction/label count mismatch: {predicted.numel()} vs {labels.numel()}"
    )
  if predicted.numel() == 0:
    return 0.0
  n_correct = torch.eq(predicted, labels).sum().item()
  return n_correct / predicted.numel() * 100

In [None]:
# Replace the classifier's weights with those from this saved checkpoint.
# map_location='cpu' lets a checkpoint written from a CUDA model load on a
# machine without a GPU.
state_dict = torch.load(scratch/'yelp_classifier_54.pth', map_location='cpu')
classifier.load_state_dict(state_dict)

In [None]:
# Running means of the test loss and accuracy; updated batch by batch in the
# evaluation loop and printed afterwards.
running_loss = 0.
running_acc = 0.

In [None]:
# Put the model in evaluation mode before the manual test loop.
classifier.eval()

In [None]:
# Manual evaluation over the test loader. Gradients are not needed here, so
# the forward passes run under no_grad. The running means are weighted by
# batch size so a smaller final batch does not skew them; with equal-sized
# batches this reduces to the plain 1/(i+1) update.
# Assumes loss_func returns a per-batch mean — TODO confirm if it is changed.
n_seen = 0
with torch.no_grad():
  for batch in test_dl:
    x, y = batch
    y_pred = classifier(x_in=x.float())

    loss_t = loss_func(y_pred, y.float()).item()
    acc_t = compute_accuracy(y_pred, y)

    batch_n = len(y)
    n_seen += batch_n
    # On the first batch batch_n == n_seen, so any stale value is overwritten.
    running_loss += (loss_t - running_loss) * batch_n / n_seen
    running_acc += (acc_t - running_acc) * batch_n / n_seen

In [None]:
# Report the running means accumulated by the manual test loop.
print(f"Test loss: {running_loss:0.3f}")
print(f"Test acc: {running_acc:0.3f}")

## Inference

In [None]:
import re

In [None]:
def preprocess_text(text):
  """Normalise raw review text.

  Lower-cases the input, surrounds each of ``. , ! ?`` with spaces, then
  replaces every run of characters other than letters and those four
  punctuation marks with a single space.

  Args:
      text (str): raw text.

  Returns:
      str: the normalised text.
  """
  lowered = text.lower()
  spaced = re.sub(r"([.,!?])", r" \1 ", lowered)
  return re.sub(r"[^a-zA-Z.,!?]+", r" ", spaced)

### Predict Rating

In [None]:
def predict_rating(review, classifier, vectorizer, decision_threshold=0.5, verbose=True):
  """Predict the rating of a review

  Args:
      review (str): the text of the review
      classifier: the trained model; called on a tensor reshaped to (1, -1)
          and expected to return a single logit
      vectorizer: the corresponding vectorizer; must provide ``vectorize`` and
          ``rating_vocab.lookup_idx``
      decision_threshold (float): The numerical boundary which separates the rating classes
      verbose (bool): when True (the default), print the preprocessed text,
          the vectorized review, the model output and the probability

  Returns:
      The value of ``vectorizer.rating_vocab.lookup_idx(1)`` when the sigmoid
      of the model output is at least ``decision_threshold``, otherwise
      ``vectorizer.rating_vocab.lookup_idx(0)``.
  """
  review = preprocess_text(review)
  vectorized_review = torch.tensor(vectorizer.vectorize(review))

  # Pure inference: skip autograd bookkeeping for the forward pass.
  with torch.no_grad():
    result = classifier(vectorized_review.view(1, -1))
  probability_value = torch.sigmoid(result).item()

  if verbose:
    print(review)
    print(vectorized_review)
    print(result)
    print(probability_value)

  index = 0 if probability_value < decision_threshold else 1
  return vectorizer.rating_vocab.lookup_idx(index)

In [None]:
# Sample review used to exercise predict_rating end to end.
test_review = "While the begining of this book is great, the ending sucks"

# predict_rating builds its input tensor without moving it to a device, so
# make sure the model is on the CPU as well.
classifier = classifier.cpu()
prediction = predict_rating(test_review, classifier, vectorizer, decision_threshold=0.5)
print(f"{test_review} -> {prediction}")

## Interpretablity

In [None]:
# Show the shape of the fc1 weight tensor whose first row is ranked below.
classifier.fc1.weight.shape

In [None]:
# sort weights
fc1_weights = classifier.fc1.weight.detach()[0]
_, idxs = torch.sort(fc1_weights, dim=0, descending=True)
idxs = idxs.numpy().tolist()

In [None]:
# Top 20 words
# Words with the largest weights (front of the descending-sorted index list).
# Slicing instead of indexing 0..19 keeps this working when the vocabulary
# has fewer than 20 entries.
print("Influential words in Positive Reviews:")
print("--------------------------------------")
for idx in idxs[:20]:
    print(vectorizer.review_vocab.lookup_idx(idx))

print("====\n\n\n")

In [None]:
# Top 20 words
# Words with the smallest weights (tail of the descending-sorted index list).
# A reversed copy is used instead of idxs.reverse(), so `idxs` stays in
# descending order and re-running this or the positive-words cell gives the
# same output every time.
print("Influential words in Negative Reviews:")
print("--------------------------------------")
for idx in idxs[::-1][:20]:
    print(vectorizer.review_vocab.lookup_idx(idx))

print("====\n\n\n")