In [1]:
from collections import Counter
import json
import os

from google.cloud import storage
import pandas as pd
import pytorch_lightning as pl
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Record the library versions this notebook was executed with.
for label, module in (('pandas', pd), ('pytorch', torch)):
    print(label, module.__version__)

pandas 1.0.0
pytorch 1.3.1


In [3]:
# Shell out to the NVIDIA driver tool to show the GPU state before training.
!nvidia-smi

Sun Feb  9 02:00:29 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.50       Driver Version: 430.50       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    26W / 250W |      0MiB / 16280MiB |      3%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [4]:
# Both files are space-delimited, header-less tables of 32-bit integers
# (despite the .tsv suffix), so they share the same reader options.
csv_options = dict(sep=' ', header=None, dtype=np.int32)
train_df = pd.read_csv('gs://amazon_bucket/train_numeric_dl.tsv', **csv_options)
eval_df = pd.read_csv('gs://amazon_bucket/eval_numeric.tsv', **csv_options)

In [5]:
# Preview the first rows of the training frame (all columns, label included).
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,469,470,471,472,473,474,475,476,477,478
0,3521059,1189115,4352091,4664800,2091395,475580,2055385,1267201,2832609,3754422,...,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5
1,4570507,390981,2964667,5205699,544392,2164037,2964667,4368892,4873022,2111775,...,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5
2,415758,362915,1054491,660717,3376710,4235417,572327,1552755,475580,2700358,...,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5
3,1735072,349092,4500667,4302042,5073616,349092,2111775,1033319,2090218,3611929,...,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5
4,3852213,3910364,127502,1267201,4525159,475580,2964667,2557625,390981,2964667,...,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5


In [6]:
def _features_as_long(frame):
    """Return every column of ``frame`` except the last as an int64 tensor."""
    return torch.tensor(frame.iloc[:, :-1].values, dtype=torch.long)


X_train_tensor = _features_as_long(train_df)
X_eval_tensor = _features_as_long(eval_df)

In [7]:
# The last column of each frame holds the label; convert it to an int64 tensor.
y_train_tensor, y_eval_tensor = (
    torch.tensor(frame.iloc[:, -1].values, dtype=torch.long)
    for frame in (train_df, eval_df)
)

In [8]:
# First five rows restricted to the feature columns (every column but the last).
train_df.iloc[:5, :-1]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,468,469,470,471,472,473,474,475,476,477
0,3521059,1189115,4352091,4664800,2091395,475580,2055385,1267201,2832609,3754422,...,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337
1,4570507,390981,2964667,5205699,544392,2164037,2964667,4368892,4873022,2111775,...,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337
2,415758,362915,1054491,660717,3376710,4235417,572327,1552755,475580,2700358,...,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337
3,1735072,349092,4500667,4302042,5073616,349092,2111775,1033319,2090218,3611929,...,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337
4,3852213,3910364,127502,1267201,4525159,475580,2964667,2557625,390981,2964667,...,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337,5256337


In [9]:
# Shape of the evaluation feature tensor: (rows, feature columns).
X_eval_tensor.size()

torch.Size([1290927, 478])

In [10]:
# Shift the labels down by one so they become 0-based class indices, which
# aids with cross-entropy loss.  The tensors are rebuilt from the dataframes
# instead of being decremented in place, so re-running this cell can never
# shift the labels a second time.
y_train_tensor = torch.tensor(train_df.iloc[:, -1].values, dtype=torch.long) - 1
y_eval_tensor = torch.tensor(eval_df.iloc[:, -1].values, dtype=torch.long) - 1
assert y_train_tensor.max() == 4 and y_train_tensor.min() == 0
# Evaluation labels must fall inside the label range seen during training.
assert y_eval_tensor.min() >= 0 and y_eval_tensor.max() <= y_train_tensor.max()

In [11]:
# Distinct label values present in the training labels.  Despite its name,
# `n_classes` is the *set* of labels; the number of classes is its length.
n_classes = {label for label in y_train_tensor.tolist()}
print(len(n_classes))

5


In [12]:
def _one_hot_column(labels, class_count):
    """One-hot encode ``labels`` after viewing them as a single column.

    The result gains a trailing dimension of size ``class_count``.
    """
    return F.one_hot(labels.view(-1, 1), num_classes=class_count)


# NOTE(review): F.cross_entropy takes class indices rather than one-hot
# targets -- confirm that downstream code reduces these back with an arg-max.
y_train_tensor, y_eval_tensor = (
    _one_hot_column(labels, len(n_classes))
    for labels in (y_train_tensor, y_eval_tensor)
)

In [13]:
def compute_accuracy(y_pred, y_true):
    """Return the percentage of predictions whose top-scoring class is correct.

    Args:
        y_pred: Tensor of class scores with the class dimension at ``dim=1``,
            e.g. shape ``(batch, num_classes)``.
        y_true: Ground-truth labels, given either as class indices (shaped
            like the arg-max of ``y_pred``; extra singleton dimensions are
            tolerated) or one-hot encoded with a trailing ``num_classes``
            dimension, e.g. ``(batch, num_classes)`` or
            ``(batch, 1, num_classes)``.

    Returns:
        A 0-dim float tensor holding the accuracy in percent.

    Raises:
        ZeroDivisionError: If ``y_pred`` contains no predictions.
    """
    _, y_pred_indices = y_pred.max(dim=1)
    # One-hot labels carry one more dimension than the predicted indices and
    # end in the class dimension; collapse them to class indices.  Comparing
    # them directly would broadcast indices against 0/1 entries.
    if y_true.dim() > y_pred_indices.dim() and y_true.shape[-1] == y_pred.shape[1]:
        y_true = y_true.max(dim=-1)[1]
    # Drop singleton dimensions such as (batch, 1) so the comparison is
    # element-wise instead of broadcast.
    y_true = y_true.reshape(y_pred_indices.shape)
    n_correct = torch.eq(y_pred_indices, y_true).sum().item()
    return torch.tensor(n_correct / y_pred_indices.numel() * 100)

In [14]:
def get_learning_rate(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Args:
        optimizer: Object exposing ``param_groups``, an iterable of dicts
            that each contain an ``'lr'`` entry.

    Returns:
        The ``'lr'`` value of the first group, or ``None`` when there are no
        parameter groups.
    """
    rates = (group['lr'] for group in optimizer.param_groups)
    return next(rates, None)

In [15]:
def compute_class_weight(series):
    """Return the relative frequency of every distinct label in ``series``.

    Args:
        series: Sized iterable of hashable, mutually sortable labels.

    Returns:
        A 1-D tensor with one entry per distinct label, ordered by ascending
        label value; each entry is ``count(label) / len(series)``.

    NOTE(review): these are class *frequencies*.  If they are ever passed as
    the ``weight`` of a loss, inverse frequencies are probably what is wanted
    -- confirm before use.
    """
    frequency = Counter(series)
    return torch.tensor(
        [frequency[label] / len(series) for label in sorted(frequency)]
    )

In [16]:
def get_padding_index(padding_value, vocab):
    """Look up the index that the vocabulary assigns to the padding token.

    Args:
        padding_value: The padding token, used as a key into ``vocab``.
        vocab: Mapping from token to index.

    Returns:
        The value stored in ``vocab`` under ``padding_value``.
    """
    index = vocab[padding_value]
    return index

In [17]:
def load_vocab(file_path):
    """Load a JSON file and return its contents as a Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document (a dict for a token -> index vocabulary).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Read as UTF-8 explicitly instead of the locale default, so non-ASCII
    # tokens decode the same way on every platform.
    with open(file_path, 'r', encoding='utf-8') as vocab_file:
        return json.load(vocab_file)

In [18]:
# Download the word -> index vocabulary from the GCS bucket to a local file.
client = storage.Client()
bucket = client.get_bucket('amazon_bucket')
blob = bucket.get_blob('word_to_index.json')
if blob is None:
    # get_blob returns None instead of raising when the object is missing;
    # fail here with a clear message rather than an AttributeError below.
    raise FileNotFoundError(
        "'word_to_index.json' was not found in bucket 'amazon_bucket'")
# download_to_filename writes the file and returns None, so `json_data`
# carries no data; the name is kept only in case other cells refer to it.
json_data = blob.download_to_filename('test.json')

In [19]:
# Reuse the notebook's JSON helper instead of repeating the open/load logic.
word_to_index = load_vocab('test.json')

In [20]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [21]:
# Recover class indices from the one-hot training labels, then measure how
# often each class occurs.
train_label_indices = y_train_tensor.squeeze().max(dim=1)[1]
weights = compute_class_weight(train_label_indices.numpy())

In [22]:
# Vocabulary index of the '<pad>' token; the model passes it to nn.Embedding
# as `padding_idx`.
padding_index = get_padding_index('<pad>', word_to_index)

In [23]:
# Pair each feature tensor with its label tensor so a DataLoader can batch them.
train_torch, eval_torch = (
    torch.utils.data.TensorDataset(features, labels)
    for features, labels in ((X_train_tensor, y_train_tensor),
                             (X_eval_tensor, y_eval_tensor))
)

In [24]:
class TextModelMLP(pl.LightningModule):
    """Bag-of-embeddings MLP text classifier.

    Each example is a sequence of token ids padded with ``padding_index``.
    The token embeddings are averaged over the non-padding positions, and the
    pooled vector is passed through four ReLU + dropout hidden layers and a
    linear output layer that yields one logit per class.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_size: Width of each embedding vector.
        dropout: Dropout probability applied after every hidden layer.
        num_classes: Number of output logits.
        h1, h2, h3, h4: Widths of the four hidden layers.

    The defaults for ``vocab_size`` and ``num_classes`` are read from the
    notebook-level ``word_to_index`` and ``weights`` when this class is
    defined.  The padding index comes from the notebook-level
    ``padding_index``, and the data loaders wrap the notebook-level
    ``train_torch`` / ``eval_torch`` datasets.
    """

    def __init__(self, vocab_size=len(word_to_index), embedding_size=60,
                 dropout=0.25, num_classes=len(weights),
                 h1=512, h2=256, h3=128, h4=32):
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size,
                                       embedding_dim=embedding_size,
                                       padding_idx=padding_index,
                                       sparse=False)
        self.linear1 = nn.Linear(in_features=embedding_size, out_features=h1)
        self.linear2 = nn.Linear(in_features=h1, out_features=h2)
        self.linear3 = nn.Linear(in_features=h2, out_features=h3)
        self.linear4 = nn.Linear(in_features=h3, out_features=h4)
        self.out = nn.Linear(in_features=h4, out_features=num_classes)
        self.dropout_p = dropout

    def _pool_embeddings(self, token_ids):
        """Average the embeddings of the non-padding tokens of each example.

        Args:
            token_ids: Integer tensor of shape ``(batch, seq_len)``.

        Returns:
            Float tensor of shape ``(batch, embedding_size)``.
        """
        embedded = self.embeddings(token_ids)  # (batch, seq_len, embedding_size)
        padding_idx = self.embeddings.padding_idx
        if padding_idx is None:
            return embedded.mean(dim=1)
        keep = (token_ids != padding_idx).unsqueeze(-1).to(embedded.dtype)
        # Clamp so an all-padding row divides by 1 instead of 0.
        n_tokens = keep.sum(dim=1).clamp(min=1.0)
        return (embedded * keep).sum(dim=1) / n_tokens

    @staticmethod
    def _as_class_indices(y):
        """Return labels as a 1-D tensor of class indices.

        Accepts class indices of shape ``(batch,)`` / ``(batch, 1)`` or
        one-hot labels of shape ``(batch, C)`` / ``(batch, 1, C)``.
        ``F.cross_entropy`` requires class indices, not one-hot vectors.
        """
        if y.dim() > 1 and y.shape[-1] > 1:
            y = y.argmax(dim=-1)
        return y.reshape(-1)

    def forward(self, x):
        """Map token ids ``(batch, seq_len)`` to logits ``(batch, num_classes)``."""
        # Pool over the sequence axis first; without it the network would
        # emit one set of logits per token instead of one per example.
        x = self._pool_embeddings(x)
        for layer in (self.linear1, self.linear2, self.linear3, self.linear4):
            x = F.relu(layer(x))
            # F.dropout defaults to training=True, so pass the module's mode
            # explicitly to switch dropout off during evaluation.
            x = F.dropout(x, p=self.dropout_p, training=self.training)
        return self.out(x)

    @pl.data_loader
    def train_dataloader(self):
        """Shuffled training batches of 512; an incomplete last batch is dropped."""
        return torch.utils.data.DataLoader(train_torch, batch_size=512,
                                           shuffle=True, pin_memory=True,
                                           drop_last=True)

    @pl.data_loader
    def val_dataloader(self):
        """Validation batches of 512 covering every example, in a fixed order."""
        # No shuffling and no dropped batch: validation metrics should be
        # computed over the entire evaluation set.
        return torch.utils.data.DataLoader(eval_torch, batch_size=512,
                                           shuffle=False, pin_memory=True,
                                           drop_last=False)

    def configure_optimizers(self):
        """Adam over all parameters with a learning rate of 1e-4."""
        return torch.optim.Adam(self.parameters(), lr=1e-4)

    def training_step(self, batch, batch_nb):
        """Compute loss and accuracy for one training batch and log both."""
        x, y = batch
        y = self._as_class_indices(y)
        y_pred = self.forward(x)
        acc_t = compute_accuracy(y_pred, y)
        loss = F.cross_entropy(y_pred, y)
        self.logger.experiment.log(
            {'loss': loss.item(),
             'train_acc': acc_t.item(),
             'batch_nb': batch_nb})
        return {'loss': loss, 'train_acc': acc_t}

    def validation_step(self, batch, batch_nb):
        """Compute loss and accuracy for one validation batch and log both."""
        x, y = batch
        y = self._as_class_indices(y)
        y_pred = self.forward(x)
        loss = F.cross_entropy(y_pred, y)
        acc_t = compute_accuracy(y_pred, y)
        self.logger.experiment.log(
            {'val_loss': loss.item(),
             'val_acc': acc_t.item(),
             'batch_nb': batch_nb})
        return {'val_loss': loss, 'val_acc': acc_t}

    def validation_end(self, outputs):
        """Average the per-batch validation loss and accuracy.

        The returned keys are the ones monitored by the checkpoint and
        early-stopping callbacks.
        """
        average_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        average_accuracy = \
            torch.stack([x['val_acc'] for x in outputs]).mean()
        return {'mean_val_loss': average_loss,
                'mean_val_accuracy': average_accuracy}

In [25]:
# Keep only the best checkpoint, judged by the lowest 'mean_val_loss'.
checkpoint_path = f'{os.getcwd()}/mlp_model'
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='mean_val_loss',
    mode='min',
    save_best_only=True,
    verbose=True,
)

In [26]:
# Early stopping watches the same metric as the checkpoint callback.
early_stopping_callback = pl.callbacks.EarlyStopping(
    monitor='mean_val_loss',
    min_delta=0.0005,  # smallest change that counts as an improvement
    patience=3,
    verbose=True,
)

In [27]:
# Training logs are written beneath the current working directory.
log_directory = f'{os.getcwd()}/train_logs/mlp_model'
logger = pl.logging.TestTubeLogger(save_dir=log_directory, name='mlp_model')

In [28]:
# Train for 10 to 50 epochs with checkpointing and early stopping.
trainer = pl.Trainer(
    logger=logger,
    checkpoint_callback=checkpoint_callback,
    early_stop_callback=early_stopping_callback,
    max_nb_epochs=50,
    min_nb_epochs=10,
    # Request a GPU only when one is usable, mirroring the `device` fallback
    # computed earlier; None (the Trainer default) trains on the CPU.
    gpus=1 if torch.cuda.is_available() else None,
    fast_dev_run=False,
    use_amp=False,
)

In [29]:
# Build the model with its default hyper-parameters and display its layers.
model = TextModelMLP()
model

TextModelMLP(
  (embeddings): Embedding(5490692, 60, padding_idx=5256337)
  (linear1): Linear(in_features=60, out_features=512, bias=True)
  (linear2): Linear(in_features=512, out_features=256, bias=True)
  (linear3): Linear(in_features=256, out_features=128, bias=True)
  (linear4): Linear(in_features=128, out_features=32, bias=True)
  (out): Linear(in_features=32, out_features=5, bias=True)
)

In [None]:
# Run training; the data loaders and optimizer come from the model itself.
trainer.fit(model)

In [None]:
# Check the GPU state again after training.
!nvidia-smi