In [1]:
# Taken from https://www.kaggle.com/abhinav2308/pytorch-toxic-comment-solution


%matplotlib inline
import numpy as np 
import pandas as pd 
import torch
import torchtext
from torchtext import data
import spacy
import os
import re


os.environ['OMP_NUM_THREADS'] = '4'
my_tok = spacy.load('en')
my_stopwords = spacy.lang.en.stop_words.STOP_WORDS
my_stopwords.update(['wikipedia','article','articles','im','page'])

def spacy_tok(x):
    x= re.sub(r'[^a-zA-Z\s]','',x)
    x= re.sub(r'[\n]',' ',x)
    return [tok.text for tok in my_tok.tokenizer(x)]



TEXT = data.Field(lower=True, tokenize=spacy_tok,eos_token='EOS',stop_words=my_stopwords,include_lengths=True)
LABEL = data.Field(sequential=False, 
                         use_vocab=False, 
                         pad_token=None, 
                            unk_token=None)

dataFields = [("id", None),
                 ("comment_text", TEXT), ("toxic", LABEL),
                 ("severe_toxic", LABEL), ("threat", LABEL),
                 ("obscene", LABEL), ("insult", LABEL),
                 ("identity_hate", LABEL)]

dataset= data.TabularDataset(path='./data/train.csv', 
                                            format='csv',
                                            fields=dataFields, 
                                            skip_header=True)

In [None]:
train,val= dataset.split(split_ratio=0.9, random_state=1)

In [4]:
TEXT.build_vocab(train,vectors='fasttext.simple.300d')

.vector_cache/wiki.simple.vec: 293MB [01:24, 3.48MB/s]                               
  0%|          | 0/111051 [00:00<?, ?it/s]Skipping token b'111051' with 1-dimensional vector [b'300']; likely a header
100%|█████████▉| 110590/111051 [00:30<00:00, 11816.39it/s]

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
traindl, valdl = torchtext.data.BucketIterator.splits(datasets=(train, val),
                                            batch_sizes=(128,1024),
                                            sort_key=lambda x: len(x.comment_text),
                                            device=device,
                                            sort_within_batch=True
                                                     )

In [5]:
vectors= train.fields['comment_text'].vocab.vectors

In [6]:
class BatchGenerator:
    def __init__(self, dl):
        self.dl = dl
        self.yFields= ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
        self.x= 'comment_text'
        
    def __len__(self):
        return len(self.dl)
    
    def __iter__(self):
        for batch in self.dl:
            X = getattr(batch, self.x)
            y = torch.transpose( torch.stack([getattr(batch, y) for y in self.yFields]),0,1)
            yield (X,y)

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

In [8]:

class MyModel(nn.Module):
    def __init__(self,op_size,n_tokens,pretrained_vectors,nl=2,bidirectional=True,emb_sz=300,n_hiddenUnits=100):
        super(MyModel, self).__init__()
        self.n_hidden= n_hiddenUnits
        self.embeddings= nn.Embedding(n_tokens,emb_sz)
        self.embeddings.weight.data.copy_(pretrained_vectors)
#         self.embeddings.weight.requires_grad = False
        self.rnn= nn.LSTM(emb_sz,n_hiddenUnits,num_layers=2,bidirectional=True,dropout=0.2)
        self.lArr=[]
        if bidirectional:
            n_hiddenUnits= 2* n_hiddenUnits
        self.bn1 = nn.BatchNorm1d(num_features=n_hiddenUnits)
        for i in range(nl):
            if i==0:
                self.lArr.append(nn.Linear(n_hiddenUnits*3,n_hiddenUnits))
            else:
                self.lArr.append(nn.Linear(n_hiddenUnits,n_hiddenUnits))
        self.lArr= nn.ModuleList(self.lArr)
        self.l1= nn.Linear(n_hiddenUnits,op_size)
        
    def forward(self,data,lengths):
        bs= data.shape[1]
        self.h= self.init_hidden(bs)
        embedded= self.embeddings(data)
        embedded= nn.Dropout()(embedded)
#         embedded = pack_padded_sequence(embedded, torch.as_tensor(lengths))
        rnn_out, self.h = self.rnn(embedded, (self.h,self.h))
#         rnn_out, lengths = pad_packed_sequence(rnn_out,padding_value=1)
        avg_pool= F.adaptive_avg_pool1d(rnn_out.permute(1,2,0),1).view(bs,-1)
        max_pool= F.adaptive_max_pool1d(rnn_out.permute(1,2,0),1).view(bs,-1)
        ipForLinearLayer= torch.cat([avg_pool,max_pool,rnn_out[-1]],dim=1)
        for linearlayer in self.lArr:
            outp= linearlayer(ipForLinearLayer)
            ipForLinearLayer= self.bn1(F.relu(outp))
            ipForLinearLayer= nn.Dropout(p=0.6)(ipForLinearLayer)
        outp = self.l1(ipForLinearLayer)
        del embedded;del rnn_out;del self.h;
        return outp
        
    def init_hidden(self, batch_size):
        return torch.zeros((4,batch_size,self.n_hidden),device=device)


In [9]:
def getValidationLoss(valdl,model,loss_func):
    model.eval() # turn off dropout
    runningLoss=0
    valid_batch_it = BatchGenerator(valdl)
    allPreds= []
    allActualPreds= []
    with torch.no_grad():
        for i,obj in enumerate(valid_batch_it):
            obj= ( (obj[0][0],obj[0][1]),obj[1] )
            preds = model(obj[0][0],obj[0][1])
            loss = loss_func(preds,obj[1].float())
            runningLoss+= loss.item()
            allPreds.append(preds.detach().cpu().numpy())
            allActualPreds.append(obj[1].detach().cpu().numpy())
        rocLoss= roc_auc_score(np.vstack(allActualPreds),np.vstack(allPreds))
        return runningLoss/len(valid_batch_it),rocLoss

In [10]:
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence,pad_packed_sequence
def oneEpoch(lr):
    train_batch_it = BatchGenerator(traindl)
    opt = optim.Adam(model.parameters(),lr)
    runningLoss= 0
    allPreds=[]
    allActualPreds=[]
    for i,obj in enumerate(train_batch_it):
        obj= ( (obj[0][0],obj[0][1]),obj[1] )
        model.train()
        opt.zero_grad()
        preds = model(obj[0][0],obj[0][1])
        loss = loss_func(preds,obj[1].float())
        runningLoss+= loss.item()
        loss.backward()
        opt.step()
        allPreds.append(preds.detach().cpu().numpy())
        allActualPreds.append(obj[1].detach().cpu().numpy())
        del obj;del preds
    trainRocLoss= roc_auc_score(np.vstack(allActualPreds),np.vstack(allPreds))
    runningLoss= runningLoss/len(train_batch_it)
    valLoss,valRocLoss= getValidationLoss(valdl,model,loss_func)
    #torch.cuda.empty_cache()
    return runningLoss,valLoss,trainRocLoss,valRocLoss

In [16]:


epochs= 2
trainLossArr=[]
valLossArr=[]
rocTrainLoss=[]
rocValLoss=[]
model= MyModel(6,len(TEXT.vocab),vectors,1)
loss_func= torch.nn.BCEWithLogitsLoss()

for i in range(epochs):
    %time tLoss,vLoss,tRocLoss,vRocLoss= oneEpoch(1e-4)
    print(f"Epoch - {i}")
    print(f"Train Loss - {tLoss} vs Val Loss is {vLoss}")
    print(f"Train ROC - {tRocLoss} vs Val ROC is {vRocLoss}")
    trainLossArr.append(tLoss)
    valLossArr.append(vLoss)
    rocTrainLoss.append(tRocLoss)
    rocValLoss.append(vRocLoss)



CPU times: user 43min 27s, sys: 3min 43s, total: 47min 10s
Wall time: 14min 24s
Epoch - 0
Train Loss - 0.4710391234491289 vs Val Loss is 0.21521919458470445
Train ROC - 0.7470686711865436 vs Val ROC is 0.7962379929400899
CPU times: user 43min 36s, sys: 3min 42s, total: 47min 19s
Wall time: 14min 39s
Epoch - 1
Train Loss - 0.12424225927696468 vs Val Loss is 0.09827515562164023
Train ROC - 0.8207382337705623 vs Val ROC is 0.8896778625310509
