In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import torch
import pdb

from pathlib import Path
from torch import nn
from torch.nn import functional as F
from torch import optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

In [None]:
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Accuracy, Loss
from ignite.contrib.handlers import ProgressBar

In [None]:
from yelp.dataset import ProjectDataset

In [None]:
# Locations of the Yelp review CSV and of the serialized vectorizer.
path = Path('./data/yelp')
scratch = path.joinpath('scratch')
review_csv = path.joinpath('reviews_with_splits_lite.csv')
vectorizer_path = scratch.joinpath('vectorizer.json')

# Full review table; later cells select rows through its `split` column.
df = pd.read_csv(review_csv)

In [None]:
# train_ds = ProjectDataset.load_data_and_create_vectorizer(df.loc[df['split'] == 'train'])
# train_ds.save_vectorizer(vectorizer_path)

In [None]:
# Training split: shuffled every epoch, trailing partial batch dropped.
train_df = df.loc[df['split'] == 'train']
train_ds = ProjectDataset.load_data_and_vectorizer(train_df, vectorizer_path)
vectorizer = train_ds.get_vectorizer()
train_dl = DataLoader(train_ds, batch_size=128, shuffle=True, drop_last=True)

# Validation split: evaluated in a fixed order and in full. Dropping the last
# partial batch (as the training loader does) would silently exclude up to 127
# reviews from every reported validation metric, and shuffling adds nothing
# when the loader is only used for evaluation.
val_df = df.loc[df['split'] == 'val']
val_ds = ProjectDataset.load_data_and_vectorizer(val_df, vectorizer_path)
val_dl = DataLoader(val_ds, batch_size=128, shuffle=False, drop_last=False)

In [None]:
def set_all_seed(seed, cuda):
  """
  Seed the NumPy and PyTorch random number generators.

  Args:
    seed (int): value used to seed every generator.
    cuda (bool): when true, also seed the CUDA generators explicitly.

  NOTE(review): no call to this helper is visible in the notebook; it has to
  be invoked before the model and the data loaders are created to have any
  effect on reproducibility.
  """
  np.random.seed(seed)
  torch.manual_seed(seed)
  if cuda:
    # Seed every GPU, not only the current device: the notebook trains on a
    # non-default device, which `torch.cuda.manual_seed` alone would not cover.
    torch.cuda.manual_seed_all(seed)

In [None]:
# dataset = ProjectDataset.load_data_and_create_vectorizer(review_csv)
# dataset.save_vectorizer(vectorizer_path)

In [None]:
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device=None):
  """
  A generator function which wraps the PyTorch DataLoader. It casts each
    batch to float and, when `device` is given, ensures each tensor is on
    the right device location.

  Args:
    dataset: map-style dataset whose items are dicts holding the keys
      'x_data' and 'y_target'.
    batch_size (int): number of samples per batch.
    shuffle (bool): reshuffle the data on every pass.
    drop_last (bool): drop the trailing batch if it is smaller than
      `batch_size`.
    device (str or torch.device, optional): destination for the yielded
      tensors; `None` (the default) leaves them where the DataLoader put them.

  Yields:
    tuple: `(x_data, y_target)` float tensors for one batch.
  """
  dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                          shuffle=shuffle, drop_last=drop_last)

  for data_dict in dataloader:
    x_data = data_dict['x_data'].float()
    y_target = data_dict['y_target'].float()
    if device is not None:
      x_data = x_data.to(device)
      y_target = y_target.to(device)
    yield x_data, y_target

In [None]:
# NOTE(review): this call hands the CSV *path* to load_data_and_vectorizer,
# while the train/val loader cell hands a DataFrame to the same method —
# confirm which signature the current ProjectDataset actually supports.
dataset = ProjectDataset.load_data_and_vectorizer(review_csv, vectorizer_path)
# Rebinds `vectorizer`, which the loader cell also assigns from train_ds.
vectorizer = dataset.get_vectorizer()

# Point the dataset at its 'train' split before batches are drawn from it.
dataset.set_split('train')
# train_dl = DataLoader(dataset, batch_size=128, shuffle=True, drop_last=True)
# dataset.set_split('val')
# val_dl = DataLoader(dataset, batch_size=128, shuffle=True, drop_last=True)

In [None]:
# Lazy batch generator over `dataset` with batch size 64; generate_batches
# defaults apply (shuffle=True, drop_last=True).
batches = generate_batches(dataset, 64)

In [None]:
# generate_batches is a generator function, so iter() returns the same
# generator object; nothing is loaded until next() is called.
itr = iter(batches)

In [None]:
# Pull one (features, targets) pair; generate_batches casts both to float.
x,y = next(itr)

In [None]:
# Display the feature batch for a quick visual check.
x

In [None]:
class ReviewClassifier(nn.Module):
  """Single linear layer mapping a feature vector to one logit per sample."""

  def __init__(self, num_features):
    """
    Args:
      num_features (int): size of each input feature vector.
    """
    super().__init__()
    self.fc1 = nn.Linear(in_features=num_features, out_features=1)

  def forward(self, x_in, apply_sigmoid=False):
    """
    Run the forward pass.

    Args:
      x_in (torch.Tensor): input of shape (batch, num_features).
      apply_sigmoid (bool): when true, squash the logits through a sigmoid;
        leave false when the loss already applies it (BCEWithLogitsLoss).

    Returns:
      torch.Tensor: tensor of shape (batch,).
    """
    # The linear layer emits (batch, 1); drop the trailing singleton axis.
    logits = self.fc1(x_in).squeeze(dim=1)
    return torch.sigmoid(logits) if apply_sigmoid else logits

In [None]:
# One input feature per entry of the review vocabulary.
classifier = ReviewClassifier(num_features=len(vectorizer.review_vocab))
optimizer = optim.Adam(classifier.parameters(), lr=0.001)
# NOTE(review): ReduceLROnPlateau only changes the learning rate when
# scheduler.step(<monitored metric>) is called after each evaluation.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,
    mode='min',
    factor=0.5,
    patience=1,
)
# Works on raw logits, matching ReviewClassifier.forward(apply_sigmoid=False).
loss_func = nn.BCEWithLogitsLoss()

In [None]:
def bce_logits_wrapper(output):
    """
    Turn raw logits into hard 0/1 predictions for a metric.

    Args:
        output (tuple): `(y_pred, y)` pair where `y_pred` holds logits.

    Returns:
        tuple: `(labels, y)`; `labels` is a long tensor that is 1 where
        sigmoid(logit) exceeds 0.5 and 0 elsewhere, `y` is passed through.
    """
    logits, targets = output
    labels = torch.sigmoid(logits).gt(0.5).long()
    return labels, targets

In [None]:
# Resolve the compute device instead of hard-coding 'cuda:3', which raises on
# machines with fewer than four GPUs: keep GPU 3 when it exists, otherwise use
# the first GPU, otherwise the CPU.
if torch.cuda.is_available():
  device = 'cuda:3' if torch.cuda.device_count() > 3 else 'cuda:0'
else:
  device = 'cpu'

# Move the model explicitly: depending on the ignite version, the `device`
# argument below may only be applied to the batches, not to the model.
classifier.to(device)

# The trainer performs one optimisation step per batch with `loss_func`.
trainer = create_supervised_trainer(classifier, optimizer, loss_func, device=device)
# The evaluator reports accuracy (logits thresholded by bce_logits_wrapper)
# and the mean BCE-with-logits loss.
evaluator = create_supervised_evaluator(
    classifier,
    metrics={'accuracy': Accuracy(bce_logits_wrapper),
             'bce': Loss(loss_func)},
    device=device)

In [None]:
# Progress bar that stays visible after each epoch finishes; the trainer's
# per-iteration output is shown under the 'loss' label.
pbar = ProgressBar(persist=True)
pbar.attach(trainer, output_transform=lambda loss: {'loss': loss})

In [None]:
# @trainer.on(Events.EPOCH_COMPLETED)
# def log_training_results(engine):
#   evaluator.run(batches)
#   metrics = evaluator.state.metrics
#   pbar.log_message(f"Training Results - Epoch: {engine.state.epoch}\
#                     Avg accuracy: {metrics['accuracy']:0.2f}\
#                     Avg loss: {metrics['bce']:0.2f}")

def _evaluate_and_log(engine, data_loader, label):
  """Run the shared evaluator over `data_loader`, log its metrics and return them."""
  evaluator.run(data_loader)
  metrics = evaluator.state.metrics
  # Adjacent f-strings replace the backslash continuations, which embedded the
  # source indentation (a long run of spaces) inside the logged message.
  pbar.log_message(
      f"{label} Results - Epoch: {engine.state.epoch}  "
      f"Avg accuracy: {metrics['accuracy']:0.2f}  "
      f"Avg loss: {metrics['bce']:0.2f}")
  return metrics


@trainer.on(Events.EPOCH_COMPLETED)
def log_training_results(engine):
  """After each epoch, evaluate on the training loader and log accuracy/loss."""
  _evaluate_and_log(engine, train_dl, 'Training')


@trainer.on(Events.EPOCH_COMPLETED)
def log_validation_results(engine):
  """After each epoch, evaluate on the validation loader, log accuracy/loss
  and feed the validation loss to the learning-rate scheduler."""
  metrics = _evaluate_and_log(engine, val_dl, 'Validation')
  # The ReduceLROnPlateau scheduler (mode='min') was created but never
  # stepped, so it could never lower the learning rate; drive it with the
  # validation loss it is meant to monitor.
  scheduler.step(metrics['bce'])

In [None]:
# Train for two epochs over the training loader; the EPOCH_COMPLETED handlers
# registered on `trainer` run after each epoch.
trainer.run(train_dl, max_epochs=2)

In [None]:
# Fresh iterator over the training DataLoader; rebinds `itr`, which earlier
# held the generate_batches generator.
itr = iter(train_dl)

In [None]:
# Manual sanity check: one forward pass and loss computation on a single batch.
x, y = next(itr)
# DataLoader batches are created on the CPU, while the classifier may have
# been moved to a GPU for training; put the batch wherever the parameters
# live so the forward pass does not fail with a device mismatch.
model_device = next(classifier.parameters()).device
x, y = x.to(model_device), y.to(model_device)
y_pred = classifier(x)
loss = loss_func(y_pred, y)