# Toxic comment classification convolutional model run on instance


In [1]:
import os
# Folder where the prepared data, iterators and vocabulary are stored.
data_dir = './data_to_s3'

# Only warn when the folder is missing; nothing is created here.
data_dir_missing = not os.path.exists(data_dir)
if data_dir_missing:
    print('Go back and create data at directory: ', data_dir)

In [2]:
#import sagemaker

#sagemaker_session = sagemaker.Session()
#bucket = sagemaker_session.default_bucket()
#prefix = 'toxic/data'
#role = sagemaker.get_execution_role()

## Step 4: Build and Train the PyTorch Model

In the XGBoost notebook we discussed what a model is in the SageMaker framework. In particular, a model comprises three objects

 - Model Artifacts,
 - Training Code, and
 - Inference Code,
 
each of which interacts with the others. In the XGBoost example we used training and inference code that was provided by Amazon. Here we will still be using containers provided by Amazon, with the added benefit of being able to include our own custom code.

We will start by implementing our own neural network in PyTorch along with a training script. For the purposes of this project, the necessary model object is provided in the `model_nlp.py` file, inside the `train` folder. You can see the provided implementation by running the cell below.

In [3]:
# Print the syntax-highlighted source of train/model_nlp.py using the Pygments command-line tool.
!pygmentize train/model_nlp.py

[34mimport[39;49;00m [04m[36mtorch.nn[39;49;00m [34mas[39;49;00m [04m[36mnn[39;49;00m
[34mimport[39;49;00m [04m[36mtorch.nn.functional[39;49;00m [34mas[39;49;00m [04m[36mF[39;49;00m
[34mimport[39;49;00m [04m[36mtorch[39;49;00m

[34mclass[39;49;00m [04m[32mCNN[39;49;00m(nn.Module):
    [34mdef[39;49;00m [32m__init__[39;49;00m([36mself[39;49;00m, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        [36msuper[39;49;00m().[32m__init__[39;49;00m()
        
        [36mself[39;49;00m.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        [36mself[39;49;00m.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = [34m1[39;49;00m, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    [34mfor

In [4]:
# import utilities to load iterators
from train.utils import Data_iterator, Test_iterator

In [5]:
# Build the training iterator first and the validation iterator second,
# one Data_iterator per split name.
iterator_train, iterator_val = (
    Data_iterator(split_name) for split_name in ('train', 'val')
)

## Training methods


In [6]:
import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import roc_auc_score
from torch.functional import F



def train(model, iterator, optimizer, criterion):
    """Run one training epoch and report the mean loss and the ROC AUC.

    Args:
        model: Module called as ``model(batch_X)``; switched to training mode here.
        iterator: Iterable yielding ``(batch_X, batch_y)`` tensor pairs.
        optimizer: Optimizer whose gradients are reset and stepped once per batch.
        criterion: Loss callable applied to ``(predictions, batch_y)``.

    Returns:
        Tuple ``(mean_loss, roc_auc)``: the summed per-batch loss divided by the
        number of batches, and ``roc_auc_score`` computed over the labels and
        the sigmoid of the predictions collected across the whole epoch.
    """
    epoch_loss = 0

    model.train()
    preds_list = []
    labels_list = []

    # Batches are moved to wherever the model's parameters live so the loop
    # keeps working after ``model.to(device)``; a model without parameters
    # leaves the batches where they are.
    first_param = next(model.parameters(), None)
    device = None if first_param is None else first_param.device

    iterations = 0
    for batch_X, batch_y in iterator:
        iterations += 1

        if device is not None:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        optimizer.zero_grad()

        predictions = model(batch_X).squeeze(1)

        loss = criterion(predictions, batch_y)

        loss.backward()

        optimizer.step()

        # Tensor.numpy() only works on CPU tensors, so copy to the host first
        # (.cpu() returns the tensor itself when it is already on the CPU).
        preds_list.append(torch.sigmoid(predictions).detach().cpu().numpy())
        labels_list.append(batch_y.detach().cpu().numpy())

        epoch_loss += loss.item()

    return epoch_loss / iterations, roc_auc_score(np.vstack(labels_list), np.vstack(preds_list))

def evaluate(model, iterator, criterion):
    """Evaluate the model over an iterator without updating its weights.

    Args:
        model: Module called as ``model(batch_X)``; switched to eval mode here.
        iterator: Iterable yielding ``(batch_X, batch_y)`` tensor pairs.
        criterion: Loss callable applied to ``(predictions, batch_y)``.

    Returns:
        Tuple ``(mean_loss, roc_auc)``: the summed per-batch loss divided by the
        number of batches, and ``roc_auc_score`` computed over the labels and
        the sigmoid of the predictions collected across the whole pass.
    """
    epoch_loss = 0

    model.eval()

    preds_list = []
    labels_list = []

    # Batches are moved to wherever the model's parameters live so the loop
    # keeps working after ``model.to(device)``; a model without parameters
    # leaves the batches where they are.
    first_param = next(model.parameters(), None)
    device = None if first_param is None else first_param.device

    with torch.no_grad():
        iterations = 0
        for batch_X, batch_y in iterator:
            iterations += 1

            if device is not None:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)

            predictions = model(batch_X).squeeze(1)

            loss = criterion(predictions, batch_y)

            epoch_loss += loss.item()

            # Tensor.numpy() only works on CPU tensors, so copy to the host
            # first (.cpu() returns the tensor itself when already on the CPU).
            preds_list.append(torch.sigmoid(predictions).detach().cpu().numpy())
            labels_list.append(batch_y.detach().cpu().numpy())

    return epoch_loss / iterations, roc_auc_score(np.vstack(labels_list), np.vstack(preds_list))


In [7]:
from train.model_nlp import CNN

# Hyper-parameters handed to the CNN constructor below.
INPUT_DIM = 20002            # len(TEXT.vocab)
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [2, 3, 4]
OUTPUT_DIM = 6
DROPOUT = 0.5
PAD_IDX = 1                  # TEXT.vocab.stoi[TEXT.pad_token]

# Arguments are passed positionally, in the order of the CNN signature.
model = CNN(
    INPUT_DIM,
    EMBEDDING_DIM,
    N_FILTERS,
    FILTER_SIZES,
    OUTPUT_DIM,
    DROPOUT,
    PAD_IDX,
)

In [8]:
import json
import torch
# Read the vocabulary vectors from the shared data folder (`data_dir`, set in
# the first cell) rather than repeating the literal path here.
vocab_vectors_path = os.path.join(data_dir, 'untrained_vocab_vectors_list.json')
with open(vocab_vectors_path, 'r') as f:
    vocab_vectors = json.load(f)

# Overwrite the embedding weights in place with the loaded vectors.
pretrained_embeddings = torch.tensor(vocab_vectors)
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.2161, -0.4992,  0.4119,  ...,  0.0555,  0.1958,  0.8141],
        [-0.3152,  0.3180,  0.0812,  ..., -0.0243,  0.2619, -0.6031],
        [-0.4617, -0.3862, -0.4489,  ..., -0.8185,  0.4885,  0.4705]])

In [9]:
# Zero the embedding rows of the unknown and padding tokens.
# The padding row is addressed through PAD_IDX so it cannot drift from the
# value given to the model; the unknown token occupies row 0.
UNK_IDX = 0
for special_idx in (UNK_IDX, PAD_IDX):
    model.embedding.weight.data[special_idx] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.2161, -0.4992,  0.4119,  ...,  0.0555,  0.1958,  0.8141],
        [-0.3152,  0.3180,  0.0812,  ..., -0.0243,  0.2619, -0.6031],
        [-0.4617, -0.3862, -0.4489,  ..., -0.8185,  0.4885,  0.4705]])


In [10]:
import torch.optim as optim

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model (and the loss module) to the target device first: the
# torch.optim documentation advises moving a model before constructing the
# optimizer for it.
model = model.to(device)
criterion = nn.BCEWithLogitsLoss().to(device)

optimizer = optim.Adam(model.parameters())

In [11]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps into whole minutes and seconds.

    Args:
        start_time: Start timestamp in seconds.
        end_time: End timestamp in seconds.

    Returns:
        Tuple ``(minutes, seconds)``, both truncated to integers with ``int``.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    # Remainder after removing the whole minutes, truncated toward zero.
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [12]:
# Keep the embedding layer trainable so its weights are updated during training.
model.embedding.weight.requires_grad = True

N_EPOCHS = 4
MODEL_STATE_PATH = './model_state/model_state.pt'

# Create the checkpoint folder up front so the first torch.save call does not
# fail when the folder is missing.
os.makedirs(os.path.dirname(MODEL_STATE_PATH), exist_ok=True)

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    # NOTE: the second value returned by train/evaluate is a ROC AUC score,
    # even though the variables and printed labels call it "Acc".
    train_loss, train_acc = train(model, iterator_train, optimizer, criterion)

    valid_loss, valid_acc = evaluate(model, iterator_val, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only when the validation loss improves on the best so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_STATE_PATH)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 3m 38s
	Train Loss: 0.083 | Train Acc: 88.46%
	 Val. Loss: 0.052 |  Val. Acc: 96.33%
Epoch: 02 | Epoch Time: 3m 36s
	Train Loss: 0.055 | Train Acc: 96.26%
	 Val. Loss: 0.048 |  Val. Acc: 97.54%
Epoch: 03 | Epoch Time: 3m 36s
	Train Loss: 0.049 | Train Acc: 97.56%
	 Val. Loss: 0.047 |  Val. Acc: 97.99%
Epoch: 04 | Epoch Time: 3m 37s
	Train Loss: 0.045 | Train Acc: 98.03%
	 Val. Loss: 0.047 |  Val. Acc: 98.06%
