<a href="https://colab.research.google.com/github/serfsup/thinkful-final-capstone/blob/master/final_capstone_04_build_and_train_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
cd 'drive/My Drive/Colab Datasets'

/content/drive/My Drive/Colab Datasets


In [0]:
!pip install pytorch_lightning==0.5.2 -q

In [0]:
from collections import Counter
import json
import os

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F

In [0]:
print('Pandas:', pd.__version__)
print('PyTorch:', torch.__version__)

Pandas: 0.25.3
PyTorch: 1.3.1


In [0]:
!pip list | grep pytorch-lightning

pytorch-lightning        0.5.2      


In [0]:
train_target = pd.read_csv('./train_clean.tsv', sep='\t', index_col=0)

  mask |= (ar1 == a)


In [0]:
eval_target = pd.read_csv('./eval_clean.tsv', sep='\t', index_col=0,
                          error_bad_lines=False, engine='python')

In [0]:
train_target = train_target.loc[:, 'target'].values

In [0]:
eval_target = eval_target.loc[:, 'target'].values

In [0]:
# Subtracting 1 to have a zero class to aid in cross-entropy loss function.
train_target -= 1
eval_target -= 1
assert train_target.max() == 4 and train_target.min() == 0

In [0]:
def compute_accuracy(y_pred, y_true):
  _, y_pred_indices = y_pred.max(dim=1)
  n_correct = torch.eq(y_pred_indices, y_true).sum()
  return torch.tensor(
      n_correct / len(y_pred_indices) * 100, dtype=torch.float32)

In [0]:
def get_learning_rate(optimizer):
  for param_group in optimizer.param_groups:
    return param_group['lr']

In [0]:
def compute_class_weights(arr):
  counts = Counter(arr)
  weights = {k: (v / len(arr)) for k, v in counts.items()}
  weight_tuples = sorted([(k, v) for k, v in weights.items()])
  sorted_weights = [weight for _, weight in weight_tuples]
  return torch.tensor(sorted_weights)

In [0]:
def load_vocab(file_path):
  """Loads a json file and converts it to a dictionary."""
  with open(file_path, 'r') as d:
    return json.load(d)

In [0]:
def get_padding_index(padding_value, vocab):
  """Takes a string and returns an int that corresponds to the padding index."""
  return vocab[padding_value]

In [0]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, path, chunksize, n_samples):
        self.path = path
        self.chunksize = chunksize
        self.length = int(n_samples / self.chunksize)

    def __getitem__(self, index):
        data = next(pd.read_csv(self.path, skiprows=(index * self.chunksize), 
                                chunksize=self.chunksize, sep=' ', header=None,
                                engine='python')) 
        y = data.iloc[:, -1:].values
        y -= 1  # to aid with cross-entropy loss
        y = torch.tensor(y, dtype=torch.int64)
        y = F.one_hot(y, num_classes=5)
        x = data.iloc[:, :-1].values
        x = torch.tensor(x, dtype=torch.int64)
        return x, y

    def __len__(self):
        return self.length

In [0]:
vocab = load_vocab('word_to_index.json')

In [0]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device 

'cuda'

In [0]:
!nvidia-smi

Thu Dec 12 16:39:24 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.36       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    28W / 250W |     10MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
weights = compute_class_weights(train_target).to(device)
weights

tensor([0.0568, 0.0391, 0.0767, 0.1804, 0.6469], device='cuda:0')

In [0]:
padding_index = get_padding_index('<pad>', vocab)
assert padding_index

In [0]:
train_dataset = TextDataset('train_numeric.tsv', 16, 6024321)

In [0]:
eval_dataset = TextDataset('eval_numeric.tsv', 16, 1290927)

In [0]:
class TextModelMLP(pl.LightningModule):
    def __init__(self, vocab_size=len(vocab.keys()), embedding_size=100,
               dropout=0.25, num_classes=len(weights),
               h1=512, h2=256, h3=128, h4=32):
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size,
                                       embedding_dim=embedding_size,
                                       padding_idx = padding_index,
                                       sparse=False)
        self.linear1 = nn.Linear(in_features=embedding_size, out_features=h1)
        self.linear2 = nn.Linear(in_features=h1, out_features=h2)
        self.linear3 = nn.Linear(in_features=h2, out_features=h3)
        self.linear4 = nn.Linear(in_features=h3, out_features=h4)
        self.out = nn.Linear(in_features=h4, out_features=num_classes)
        self.dropout_p = dropout

    def forward(self, x):
        x = F.relu(self.linear1(self.embeddings(x)))
        x = F.dropout(x, p=self.dropout_p)
        x = F.relu(self.linear2(x))
        x = F.dropout(x, p=self.dropout_p)
        x = F.relu(self.linear3(x))
        x = F.dropout(x, p=self.dropout_p)
        x = F.relu(self.linear4(x))
        x = F.dropout(x, p=self.dropout_p)
        x = self.out(x)
        return x

    @pl.data_loader
    def train_dataloader(self):
        return torch.utils.data.DataLoader(train_dataset, batch_size=64,
                                           shuffle=False, pin_memory=True,
                                           drop_last=True, num_workers=4)
        
    @pl.data_loader
    def val_dataloader(self):
        return torch.utils.data.DataLoader(eval_dataset, batch_size=64,
                                           shuffle=False, pin_memory=True,
                                           drop_last=True, num_workers=4)
        
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-5)

    def training_step(self, batch, batch_nb):
        x, y = batch
        x = x.view(1024, 478)
        y = y.view(1024, 5)
        y_pred = self.forward(x)
        acc_t = compute_accuracy(y_pred, y)
        loss = F.cross_entropy(y_pred, y)
        self.logger.experiment.log(
            {'loss': loss.item(),
             'train_acc': acc_t.item(),
             'batch_nb': batch_nb})
        return {'loss': loss, 'train_acc': acc_t}

    def training_end(self, outputs):
        average_loss = torch.stack([x['loss'] for x in outputs]).mean()
        average_accuracy = torch.stack([x['train_acc'] for x in outputs]).mean()
        return {'mean_train_loss': average_loss,
                'mean_train_accuracy': average_accuracy}

    def validation_step(self, batch, batch_nb):
        x, y = batch
        x = x.view(1024, 478)
        y = y.view(1024, 5)
        y_pred = self.forward(x)
        loss = F.cross_entropy(y_pred, y)
        acc_t = compute_accuracy(y_pred, y)
        self.logger.experiment.log(
            {'val_loss': loss.item(),
             'val_acc': acc_t.item(),
             'batch_nb': batch_nb})
        return {'val_loss': loss, 'val_acc': acc_t}

    def validation_end(self, outputs):
        average_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        average_accuracy = \
            torch.stack([x['val_acc'] for x in outputs]).mean()
        return {'mean_val_loss': average_loss, 
                'mean_val_accuracy': average_accuracy}
                

In [0]:
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    filepath=f'{os.getcwd()}/mlp_model', save_best_only=True, verbose=True,
    monitor='mean_val_loss', mode='min'
)

In [0]:
early_stopping_callback = pl.callbacks.EarlyStopping(
    monitor='mean_val_loss', min_delta=0.0005, patience=3, verbose=True
)

In [0]:
logger = pl.logging.TestTubeLogger(
    save_dir=f'{os.getcwd()}/train_logs/mlp_model', name='mlp_model'
)

In [0]:
trainer = pl.Trainer(logger=logger, checkpoint_callback=checkpoint_callback,
                     max_nb_epochs=25, min_nb_epochs=10, gpus=1, 
                     early_stop_callback=early_stopping_callback,
                     fast_dev_run=False)

gpu available: True, used: True
VISIBLE GPUS: 0


In [0]:
model = TextModelMLP()
model

TextModelMLP(
  (embeddings): Embedding(4516760, 100, padding_idx=4337549)
  (linear1): Linear(in_features=100, out_features=512, bias=True)
  (linear2): Linear(in_features=512, out_features=256, bias=True)
  (linear3): Linear(in_features=256, out_features=128, bias=True)
  (linear4): Linear(in_features=128, out_features=32, bias=True)
  (out): Linear(in_features=32, out_features=5, bias=True)
)

In [0]:
trainer.fit(model)

  0%|          | 0/5 [00:00<?, ?it/s]

         Name       Type Params
0  embeddings  Embedding  451 M
1     linear1     Linear   51 K
2     linear2     Linear  131 K
3     linear3     Linear   32 K
4     linear4     Linear    4 K
5         out     Linear  165  


  """
201it [9:37:42, 305.13s/it, batch_nb=200, epoch=0, gpu=0, loss=6.099, v_nb=20]