## Preparing Data

In [167]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np 
import pandas as pd 
import torch
import torchtext
from torchtext import data
import spacy
import os
import re

from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
import pandas as pd

spacy_en = spacy.load('en')

_stopwords = spacy.lang.en.stop_words.STOP_WORDS

os.environ['OMP_NUM_THREADS'] = '4'


SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

#FILTER_SIZES = [2,3,4]
def tokenizer(text):
    token = [t.text for t in spacy_en.tokenizer(text)]
#    if len(token) < FILTER_SIZES[-1]:
    for i in range(1, 7 ):
        token.append('<PAD>')
    return token


TEXT = data.Field(lower=True,include_lengths=False,tokenize=tokenizer, stop_words = _stopwords)

LABEL = data.Field(sequential=False, 
                         use_vocab=False, 
                         pad_token=None, 
                            unk_token=None, dtype = torch.float)

dataFields = {"comment_text": ("comment_text", TEXT), 
              'label_int': ("toxic", LABEL)}



In [168]:
data_dir = './data/sampled_train/'
data_name = 'train.json'

In [169]:
dataset= data.TabularDataset(path=data_dir+data_name, 
                                            format='json',
                                            fields=dataFields, 
                                            skip_header=True)

In [170]:
import random
SEED = 3 
train_data, val_data = dataset.split(split_ratio=0.8,random_state = random.seed(SEED))

In [171]:
MAX_VOCAB_SIZE = 20_000

TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

In [172]:
yFields = ['toxic']

In [173]:
#import pickle

#pickle.dump(TEXT, open('./custom_embeddings/train_data_field', 'wb'))

In [174]:
TEXT.vocab

<torchtext.vocab.Vocab at 0x1a2c559f28>

In [175]:
BATCH_SIZE = 64


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, val_data), 
    batch_size = BATCH_SIZE,
    sort_key=lambda x: len(x.comment_text),
    sort_within_batch = True,
    device = device)

In [176]:
for i in train_iterator:
    aux = i
    break

## Build model

In [177]:
aux.comment_text[0]

tensor([ 293,  579,  223, 3829,   76,   16, 4658,  518,  235,  293,  521,  330,
         229, 1166, 1852,  519,   66,   17,  156,  391,   34, 1969,  715,   63,
         327, 2075,  324,  240, 1181,  594,    8,    3,   18,  124,   46, 1726,
          55,  935,  265,  146,  120, 2923,  801,   65, 2581,   75, 2378,  159,
        1059,  560,   90,   79, 2912,  253,  454,  771,   15, 5069,   75,   46,
        3832, 1352,  443,  339])

In [178]:


import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        
        #text = [sent len, batch size]
        
        text = text.permute(1, 0)
                
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conv_n = [batch size, n_filters, sent len - filter_sizes[n]]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)




In [261]:
INPUT_DIM = len(TEXT.vocab) # 20002
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [2,3,4,5,6]
OUTPUT_DIM = 1
DROPOUT = 0.8
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] # 1

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES ,OUTPUT_DIM, DROPOUT, PAD_IDX)

In [262]:

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 756,701 trainable parameters


In [263]:
for i in train_iterator:
    aux=i
    break

from torchsummaryX import summary
print(aux.comment_text.size())
summary(model, aux.comment_text )

torch.Size([18, 64])
                      Kernel Shape      Output Shape  Params Mult-Adds
Layer                                                                 
0_embedding            [100, 5557]     [64, 18, 100]  555.7k    555.7k
1_convs.Conv2d_0  [1, 100, 2, 100]  [64, 100, 17, 1]   20.1k    340.0k
2_convs.Conv2d_1  [1, 100, 3, 100]  [64, 100, 16, 1]   30.1k    480.0k
3_convs.Conv2d_2  [1, 100, 4, 100]  [64, 100, 15, 1]   40.1k    600.0k
4_convs.Conv2d_3  [1, 100, 5, 100]  [64, 100, 14, 1]   50.1k    700.0k
5_convs.Conv2d_4  [1, 100, 6, 100]  [64, 100, 13, 1]   60.1k    780.0k
6_dropout                        -         [64, 500]       -         -
7_fc                      [500, 1]           [64, 1]   501.0     500.0
-------------------------------------------------------------------------
                        Totals
Total params          756.701k
Trainable params      756.701k
Non-trainable params       0.0
Mult-Adds              3.4562M


Unnamed: 0_level_0,Kernel Shape,Output Shape,Params,Mult-Adds
Layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_embedding,"[100, 5557]","[64, 18, 100]",555700.0,555700.0
1_convs.Conv2d_0,"[1, 100, 2, 100]","[64, 100, 17, 1]",20100.0,340000.0
2_convs.Conv2d_1,"[1, 100, 3, 100]","[64, 100, 16, 1]",30100.0,480000.0
3_convs.Conv2d_2,"[1, 100, 4, 100]","[64, 100, 15, 1]",40100.0,600000.0
4_convs.Conv2d_3,"[1, 100, 5, 100]","[64, 100, 14, 1]",50100.0,700000.0
5_convs.Conv2d_4,"[1, 100, 6, 100]","[64, 100, 13, 1]",60100.0,780000.0
6_dropout,-,"[64, 500]",,
7_fc,"[500, 1]","[64, 1]",501.0,500.0


In [264]:
model.embedding.weight.data.copy_(TEXT.vocab.vectors)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-1.0889,  0.1550,  0.3195,  ..., -0.5389, -0.0420, -0.2176],
        ...,
        [-1.2307, -1.2536, -0.7483,  ..., -0.1161,  1.8930,  0.1692],
        [-1.1252,  1.4676, -0.5903,  ..., -0.5557,  0.6070,  0.4536],
        [ 0.9452,  0.8951, -0.3782,  ...,  0.4434, -0.8536, -0.1599]])

In [265]:
#UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
#model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
#model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.requires_grad = True

## Train our model

In [266]:

def print_metrics(pred, labels):
    toxic_labels = ['toxic']
    
    roc_auc_scores= []
    recall_scores=[]
    precision_scores=[]
    accuracy_scores=[]
    f1_scores=[]

    thre = 0.5
    for i,j in enumerate(toxic_labels):
        roc_auc_scores.append(roc_auc_score(labels[:,i], pred[:,i]))
        recall_scores.append(recall_score(labels[:,i], pred[:,i]>=thre))
        accuracy_scores.append(accuracy_score(labels[:,i], pred[:,i]>=thre))
        precision_scores.append(precision_score(labels[:,i], pred[:,i]>=thre))
        f1_scores.append(f1_score(labels[:,i], pred[:,i]>=thre))
    return pd.DataFrame(
    {'Label': toxic_labels,
     'accuracy': accuracy_scores,
     'recall': recall_scores,
     'precision': precision_scores,
     'f1': f1_scores,
     'roc_auc': roc_auc_scores})

In [267]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [268]:
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [269]:
import numpy
from sklearn.metrics import roc_auc_score


In [270]:


def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    preds_list=[]
    labels_list= []
    epoch_loss_hist = []

    
    j = len(iterator)//10
    for i, batch in enumerate(iterator):
        
        optimizer.zero_grad()
        
        text = batch.comment_text
        
        predictions = model(text)
        
        batch_labels=torch.stack([getattr(batch, y) for y in yFields]) #transpose?
        batch_labels = torch.transpose(batch_labels,0,1)
        
        loss = criterion(predictions, batch_labels)
        
        loss.backward()
        
        optimizer.step()
        
        preds_list+=[torch.sigmoid(predictions).detach().numpy()]
        labels_list+=[batch_labels.numpy()]
        

                    
        epoch_loss += loss.item()
        
        if i%j ==0:
            epoch_loss_hist.append([loss.item(),
            evaluate(model, valid_iterator, criterion)[0]])
            model.train()
        
    return (epoch_loss / len(iterator) , epoch_loss_hist, 
           roc_auc_score(np.vstack(labels_list), np.vstack(preds_list)))

In [271]:


def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    preds_list=[]
    labels_list= []
    epoch_acc=[]


    
    with torch.no_grad():
        i=0
        for batch in iterator:
            i+=1
            text = batch.comment_text
            
            predictions = model(text)#.squeeze(1)
            
            batch_labels = torch.stack([getattr(batch, y) for y in yFields]) #transpose?
            batch_labels = torch.transpose(batch_labels,0,1)
            
            loss = criterion(predictions, batch_labels)

            epoch_loss += loss.item()

            
            preds_list+=[torch.sigmoid(predictions).detach().numpy()]
            labels_list+=[batch_labels.numpy()]
            
        
    return (epoch_loss / len(iterator),
           roc_auc_score(np.vstack(labels_list), np.vstack(preds_list)) ,
           np.vstack(preds_list), np.vstack(labels_list))



In [272]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [273]:

N_EPOCHS = 3
best_valid_loss = float('inf')
loss_hist= []
for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, _loss_hist, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc , _preds  , _labels  = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    
    loss_hist+= _loss_hist
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    
    


Epoch: 01 | Epoch Time: 0m 3s
	Train Loss: 0.695 | Train Acc: 62.29%
	 Val. Loss: 0.582 |  Val. Acc: 77.51%
Epoch: 02 | Epoch Time: 0m 3s
	Train Loss: 0.580 | Train Acc: 76.27%
	 Val. Loss: 0.547 |  Val. Acc: 81.86%
Epoch: 03 | Epoch Time: 0m 3s
	Train Loss: 0.514 | Train Acc: 83.29%
	 Val. Loss: 0.512 |  Val. Acc: 83.57%


In [274]:
print_metrics(_preds , _labels)

Unnamed: 0,Label,accuracy,recall,precision,f1,roc_auc
0,toxic,0.767782,0.83004,0.755396,0.79096,0.835749
