## Preparing Data

In [1]:


%matplotlib inline
import numpy as np 
import pandas as pd 
import torch
import torchtext
from torchtext import data
import spacy
import os
import re


# NOTE(review): this is set after numpy/torch were imported in the lines above;
# OpenMP runtimes typically read the variable at load time, so confirm it still
# takes effect here or move it above the imports.
os.environ['OMP_NUM_THREADS'] = '4'


SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Comment text: lower-cased, tokenised with spaCy; include_lengths=True makes
# every batch expose a (token ids, lengths) pair.
TEXT = data.Field(lower=True, include_lengths=True, tokenize='spacy')

# Labels are already numeric, so no vocabulary or special tokens are needed;
# float dtype is what the BCE-with-logits loss used later expects as targets.
LABEL = data.Field(sequential=False,
                   use_vocab=False,
                   pad_token=None,
                   unk_token=None,
                   dtype=torch.float)

# JSON key -> (attribute name on each example, field that processes it)
dataFields = {"comment_text": ("comment_text", TEXT),
              'toxic': ("toxic", LABEL),
              'severe_toxic': ("severe_toxic", LABEL),
              'threat': ("threat", LABEL),
              'obscene': ("obscene", LABEL),
              'insult': ("insult", LABEL),
              'identity_hate': ("identity_hate", LABEL)}

# skip_header is deliberately left at its default (False): a JSON-lines file
# has no header row, and skip_header=True would drop the first line of the
# file, i.e. a real record.
dataset = data.TabularDataset(path='./data/train.json',
                              format='json',
                              fields=dataFields)

In [2]:
import random
SEED = 3

# Seed Python's RNG and hand its state to split() explicitly. The previous
# form passed random_state=random.seed(SEED), i.e. None, and only worked
# because seeding happened as a side effect of evaluating the argument.
# The split was also computed twice; the first result was never used (and its
# `train` name is later rebound to the training function).
random.seed(SEED)
train_data, val_data = dataset.split(split_ratio=0.5, random_state=random.getstate())

In [3]:
# Upper bound on the number of distinct tokens kept in the vocabulary. The
# embedding matrix printed further down has 20002 rows, i.e. this cap plus two
# extra entries (presumably the <unk> and <pad> specials).
MAX_VOCAB_SIZE = 20_000

# The vocabulary is built from the training split only. The 100-d GloVe vectors
# are attached to it, and unk_init is passed so that tokens without a
# pretrained vector get a normally distributed one (per torchtext's Vocab
# documentation) rather than a fixed default.
TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

In [4]:
# Number of examples per batch.
BATCH_SIZE = 512

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Batches group comments of similar length (sort_key = token count).
# sort_within_batch=True orders the examples inside each batch by that key,
# which the model's pack_padded_sequence call depends on.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, val_data), 
    batch_size = BATCH_SIZE,
    sort_key=lambda x: len(x.comment_text),
    sort_within_batch = True,
    device = device)

In [5]:
# Label attributes, in the column order used for the target matrix.
yFields = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

# Walk the validation iterator and stop on the 20th batch, keeping it in `aux`
# (and its stacked labels in `aux2`) for the inspection cells that follow.
iaux = 0
for iaux, batch in enumerate(valid_iterator, start=1):
    aux = batch
    aux2 = torch.stack([getattr(batch, label) for label in yFields])
    if iaux == 20:
        break

In [6]:

        
# Stack the six label columns of the saved batch and transpose so that each
# row is one example and each column one label.
torch.transpose( torch.stack([getattr(aux, y) for y in yFields]),0,1)

tensor([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        ...,
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])

In [7]:
# comment_text is a (token ids, lengths) pair because TEXT was created with
# include_lengths=True; element 0 is the token tensor, [sent len, batch size].
aux.comment_text[0].size()

torch.Size([10, 512])

In [8]:
# Each label attribute is a 1-D tensor with one value per example in the batch.
aux.toxic.size()

torch.Size([512])

## Build model

In [38]:
import torch.nn as nn
import torch.nn.functional as F


class RNN(nn.Module):
    """GRU text classifier: embedding -> (bi)GRU -> dense + ReLU -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: GRU hidden size per direction.
        dense_dim: width of the intermediate dense layer.
        output_dim: number of output logits per example.
        n_layers: number of stacked GRU layers.
        bidirectional: whether the GRU reads the sequence in both directions.
        dropout: dropout probability used on the embeddings, between GRU
            layers, on the final hidden state and after the dense layer.
        pad_idx: vocabulary index of the padding token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, dense_dim ,output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):

        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)

        # nn.GRU applies dropout only between stacked layers; with a single
        # layer a non-zero value has no effect and just triggers a warning.
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers=n_layers,
                          bidirectional=bidirectional,
                          dropout=dropout if n_layers > 1 else 0.0)

        self.dropout = nn.Dropout(dropout)

        # The dense head consumes one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc1 = nn.Linear(hidden_dim * num_directions, dense_dim)

        self.dropout_dense = nn.Dropout(dropout)

        self.fc2 = nn.Linear(dense_dim, output_dim)

    def forward(self, text, text_lengths):
        """Return raw logits of shape [batch size, output dim].

        Args:
            text: token ids, [sent len, batch size].
            text_lengths: true length of every sequence in the batch, sorted
                in decreasing order (pack_padded_sequence's default).
        """

        #text = [sent len, batch size]

        embedded = self.dropout(self.embedding(text))

        #embedded = [sent len, batch size, emb dim]

        # pack_padded_sequence expects the lengths on the CPU, even when the
        # batch itself lives on the GPU.
        lengths = text_lengths.cpu() if torch.is_tensor(text_lengths) else text_lengths
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths)

        # Only the final hidden state is used, so the packed per-step outputs
        # are not unpacked.
        _, hidden = self.rnn(packed_embedded)

        #hidden = [num layers * num directions, batch size, hid dim]

        if self.rnn.bidirectional:
            # last layer: forward state is hidden[-2], backward is hidden[-1]
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim = 1)
        else:
            hidden = hidden[-1, :, :]

        hidden = self.dropout(hidden)

        #hidden = [batch size, hid dim * num directions]

        dense = self.fc1(hidden)

        dense = F.relu(dense)

        dense = self.dropout_dense(dense)

        #returned logits = [batch size, output dim]
        return self.fc2(dense)

In [57]:
# Hyperparameters for the RNN defined above.
# INPUT_DIM is the embedding-table size and must equal the vocabulary size.
INPUT_DIM = len(TEXT.vocab)
# Must match the 100-d GloVe vectors that are copied into the embedding layer.
EMBEDDING_DIM = 100
HIDDEN_DIM = 30 #256
DENSE_DIM = 30
# One logit per entry of yFields (six labels).
OUTPUT_DIM = 6 #1 
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
# Index of the padding token, passed to nn.Embedding as padding_idx.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM,
            DENSE_DIM,
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

In [58]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # frozen parameters (requires_grad == False) are not counted
        if param.requires_grad:
            total += param.numel()
    return total

# Report the model size with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,042,536 trainable parameters


In [59]:
# Vectors attached to the vocabulary by build_vocab: one row per vocabulary
# entry, EMBEDDING_DIM columns.
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)

torch.Size([20002, 100])


In [60]:
# Overwrite the randomly initialised embedding weights, in place, with the
# pretrained vectors (the shapes must match: vocabulary size x EMBEDDING_DIM).
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.4833, -0.1757,  0.7039,  ..., -0.6125,  0.6385,  0.6922],
        [-0.7999,  0.1235,  0.8337,  ..., -0.0232,  1.0104, -0.2413],
        [ 0.2338,  0.5541,  0.6862,  ..., -0.4598, -0.1484,  0.8151]])

In [61]:
# Index of the unknown-word token in the vocabulary.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Zero the <unk> and <pad> rows so that the values copied from the pretrained
# matrix for these two special tokens are not used as their starting point.
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.4833, -0.1757,  0.7039,  ..., -0.6125,  0.6385,  0.6922],
        [-0.7999,  0.1235,  0.8337,  ..., -0.0232,  1.0104, -0.2413],
        [ 0.2338,  0.5541,  0.6862,  ..., -0.4598, -0.1484,  0.8151]])


## Train our model

In [62]:
import torch.optim as optim

# Adam over all model parameters, with the library's default hyperparameters.
optimizer = optim.Adam(model.parameters())

In [63]:
# Binary cross-entropy on raw logits: the model's last layer has no sigmoid,
# and the train/evaluate functions apply torch.sigmoid themselves only to
# compute the metric.
criterion = nn.BCEWithLogitsLoss()

# Move the model and the loss to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [64]:
import numpy
from sklearn.metrics import roc_auc_score
def roc_auc(preds, y):
    """
    Returns the ROC AUC of the scores `preds` against the true labels `y`,
    exactly as computed by sklearn's roc_auc_score (note the swapped argument
    order: sklearn takes the labels first).
    """
    return roc_auc_score(y, preds)

In [65]:


def train(model, iterator, optimizer, criterion):
    """Train `model` for one pass over `iterator`.

    Args:
        model: network called as model(text, text_lengths), returning logits
            of shape [batch size, n labels].
        iterator: yields batches with a `comment_text` (tokens, lengths) pair
            and one attribute per name in the module-level `yFields`.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss taking (logits, targets).

    Returns:
        (mean loss per batch, ROC AUC over all predictions of the epoch).
    """

    epoch_loss = 0

    model.train()
    preds_list = []
    labels_list = []

    for batch in iterator:

        optimizer.zero_grad()

        text, text_lengths = batch.comment_text

        # Logits stay [batch size, n labels]; no squeeze, so the shape also
        # matches the targets when there is a single label column.
        predictions = model(text, text_lengths)

        # One column per label: [batch size, n labels]
        batch_labels = torch.stack([getattr(batch, y) for y in yFields], dim=1)

        loss = criterion(predictions, batch_labels)

        loss.backward()

        optimizer.step()

        # .cpu() is required before .numpy() when the batch lives on the GPU.
        preds_list.append(torch.sigmoid(predictions).detach().cpu().numpy())
        labels_list.append(batch_labels.detach().cpu().numpy())

        epoch_loss += loss.item()

    return epoch_loss / len(iterator), roc_auc(np.vstack(preds_list), np.vstack(labels_list))

In [66]:


def evaluate(model, iterator, criterion):
    """Evaluate `model` on every batch of `iterator` without updating it.

    Args:
        model: network called as model(text, text_lengths), returning logits
            of shape [batch size, n labels].
        iterator: yields batches with a `comment_text` (tokens, lengths) pair
            and one attribute per name in the module-level `yFields`.
        criterion: loss taking (logits, targets).

    Returns:
        (mean loss per batch, ROC AUC over all predictions).
    """

    epoch_loss = 0

    model.eval()

    preds_list = []
    labels_list = []

    with torch.no_grad():

        for batch in iterator:

            text, text_lengths = batch.comment_text

            # Logits stay [batch size, n labels]; no squeeze, so the shape
            # also matches the targets when there is a single label column.
            predictions = model(text, text_lengths)

            # One column per label: [batch size, n labels]
            batch_labels = torch.stack([getattr(batch, y) for y in yFields], dim=1)

            loss = criterion(predictions, batch_labels)

            epoch_loss += loss.item()

            # .cpu() is required before .numpy() when the batch lives on the GPU.
            preds_list.append(torch.sigmoid(predictions).cpu().numpy())
            labels_list.append(batch_labels.cpu().numpy())

    return epoch_loss / len(iterator), roc_auc(np.vstack(preds_list), np.vstack(labels_list))



In [67]:
from torchsummary import summary

In [68]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into (minutes, seconds).

    Both parts are truncated to integers.
    """
    total = end_time - start_time
    whole_minutes = int(total / 60)
    leftover = total - (whole_minutes * 60)
    return whole_minutes, int(leftover)

In [69]:


# Number of passes over the training data in this cell.
N_EPOCHS = 6

# Lowest validation loss seen so far; the model is checkpointed whenever an
# epoch improves on it.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    # NOTE: the second value returned by train/evaluate is the ROC AUC from
    # roc_auc(), not an accuracy, even though it is stored in *_acc and
    # printed under the label "Acc" below.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Keep only the weights of the best epoch (by validation loss) on disk.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')



Epoch: 01 | Epoch Time: 3m 51s
	Train Loss: 0.257 | Train Acc: 61.81%
	 Val. Loss: 0.089 |  Val. Acc: 93.53%
Epoch: 02 | Epoch Time: 3m 56s
	Train Loss: 0.085 | Train Acc: 91.75%
	 Val. Loss: 0.057 |  Val. Acc: 96.43%
Epoch: 03 | Epoch Time: 4m 14s
	Train Loss: 0.067 | Train Acc: 94.24%
	 Val. Loss: 0.053 |  Val. Acc: 96.74%
Epoch: 04 | Epoch Time: 4m 27s
	Train Loss: 0.062 | Train Acc: 95.02%
	 Val. Loss: 0.052 |  Val. Acc: 97.08%
Epoch: 05 | Epoch Time: 4m 42s
	Train Loss: 0.059 | Train Acc: 95.76%
	 Val. Loss: 0.053 |  Val. Acc: 97.19%
Epoch: 06 | Epoch Time: 3m 58s
	Train Loss: 0.056 | Train Acc: 96.23%
	 Val. Loss: 0.050 |  Val. Acc: 97.37%


In [70]:
#Let us train for one more epoch
N_EPOCHS = 1

# best_valid_loss is intentionally NOT reset here: it carries over from the
# previous training cell, so the checkpoint on disk is only replaced when this
# extra epoch actually beats the best validation loss seen so far. Resetting
# it to infinity made every continuation cell overwrite the saved model, even
# with a worse one.

for epoch in range(N_EPOCHS):

    start_time = time.time()

    # The second returned value is the ROC AUC (printed under the label "Acc").
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')


Epoch: 01 | Epoch Time: 4m 3s
	Train Loss: 0.054 | Train Acc: 96.59%
	 Val. Loss: 0.051 |  Val. Acc: 97.38%


In [71]:
#Let us train for one more epoch
N_EPOCHS = 1

# best_valid_loss is intentionally NOT reset here: it carries over from the
# earlier training cells, so the checkpoint on disk is only replaced when this
# extra epoch actually beats the best validation loss seen so far. Resetting
# it to infinity made every continuation cell overwrite the saved model, even
# with a worse one.

for epoch in range(N_EPOCHS):

    start_time = time.time()

    # The second returned value is the ROC AUC (printed under the label "Acc").
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 3m 59s
	Train Loss: 0.053 | Train Acc: 96.83%
	 Val. Loss: 0.051 |  Val. Acc: 97.41%


In [72]:
#Let us train for two more epochs
N_EPOCHS = 2

# best_valid_loss is intentionally NOT reset here: it carries over from the
# earlier training cells, so the checkpoint on disk is only replaced when one
# of these extra epochs actually beats the best validation loss seen so far.
# Resetting it to infinity made every continuation cell overwrite the saved
# model, even with a worse one.

for epoch in range(N_EPOCHS):

    start_time = time.time()

    # The second returned value is the ROC AUC (printed under the label "Acc").
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 4m 2s
	Train Loss: 0.051 | Train Acc: 97.00%
	 Val. Loss: 0.052 |  Val. Acc: 97.45%
Epoch: 02 | Epoch Time: 4m 1s
	Train Loss: 0.050 | Train Acc: 97.16%
	 Val. Loss: 0.050 |  Val. Acc: 97.48%


In [100]:
# NOTE(review): var_preds is not defined in any cell visible here (and the
# execution count jumps from 72 to 100), so this cell fails on a fresh
# Restart & Run All. Presumably it holds the stacked validation predictions;
# confirm and restore the cell that creates it.
var_preds.shape

(79785, 6)

In [173]:
# NOTE(review): var_y is not defined in any cell visible here; presumably it
# is the label matrix matching var_preds. Confirm and restore the cell that
# creates it.
var_y

tensor([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        ...,
        [1., 1., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])