*This is my first kernel on Pytorch-Lightning*

Please UpVote if you like this kernel...

*Import  the required Libraries*

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as pe
from wordcloud import WordCloud, STOPWORDS 

from sklearn import model_selection

from transformers import (BertTokenizer,BertModel,AdamW,get_linear_schedule_with_warmup)

import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader,Dataset
from pytorch_lightning.metrics.functional.classification import auroc
import warnings
from pylab import rcParams

*General Configuration parameters*

In [None]:
warnings.filterwarnings('ignore')
%matplotlib inline
%config InlineBackend.figure_format='retina'

# Single seed value fed to both the NumPy and the PyTorch generators below.
RANDOM_SEED =42
# Plot styling: seaborn theme, a custom six-colour palette and the default figure size.
sns.set(style='whitegrid',palette='muted',font_scale=1.2)
HAPPY_COLORS_PALETTE = ['#f0d407','#fbec7e','#04345b','#596e3e','#948304','#2f524f']
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize']=12,8

# Seed the global NumPy and PyTorch random number generators.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Pretrained checkpoint name and the training hyper-parameters used further down.
BERT_MODEL_NAME = 'bert-base-cased'
BATCH_SIZE = 32
N_EPOCHS = 5

*Let's check the files present for this competition*

In [None]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
PARENT_DIR = '../input/jigsaw-toxic-comment-classification-challenge'

*Importing the datasets* 

In [None]:
# Read the four competition archives in a fixed order and unpack them into
# the frames used by the rest of the notebook.
train_df, test_df, test_lab_df, sample_df = (
    pd.read_csv(os.path.join(PARENT_DIR, archive_name))
    for archive_name in (
        'train.csv.zip',
        'test.csv.zip',
        'test_labels.csv.zip',
        'sample_submission.csv.zip',
    )
)

*Lets take a look into the data*

In [None]:
train_df.head(7)

In [None]:
test_df.head(2)

In [None]:
test_lab_df.head()

In [None]:
sample_df.head()

*Below are some sample toxic comments*

In [None]:
# Rows among the first 100 comments that carry at least three labels.
# The mask is computed on the very same 100-row slice it filters (the original
# built it from a 5000-row slice, forcing pandas to realign the index), and only
# numeric columns are summed so the string columns never enter the row sum.
# NOTE(review): assumes the label columns are the only numeric columns of train_df.
sample_toxic_words = train_df.head(100).loc[
    lambda rows: rows.select_dtypes(include='number').sum(axis=1) >= 3
]
sample_toxic_words

*Lets build a wordcloud for a visual representation of toxic words*

In [None]:
def wordcloud(df):
    """Draw a word cloud built from the ``comment_text`` column of ``df``.

    Each comment is cast to ``str``, split on whitespace and lower-cased; the
    words listed in ``STOPWORDS`` are excluded from the image. The figure is
    shown with matplotlib and nothing is returned.
    """
    # One lower-cased, space-joined chunk per comment, each followed by one space.
    corpus = "".join(
        " ".join(token.lower() for token in str(comment).split()) + " "
        for comment in df.comment_text
    )

    cloud = WordCloud(
        width=800,
        height=800,
        background_color='white',
        stopwords=set(STOPWORDS),
        min_font_size=10,
    ).generate(corpus)

    # Plot the generated image without axes or padding.
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(cloud)
    plt.axis("off")
    plt.tight_layout(pad=0)

    plt.show()

In [None]:
wordcloud(sample_toxic_words)

*Let's check whether there are multiple comments from the same ID*

In [None]:
# Show the ids that occur more than once (an empty result means one comment per id).
train_df.id.value_counts().loc[lambda counts: counts > 1]

*Now let's split the training dataset into training and validation sets*

In [None]:
# Hold out 5% of the rows as `val_df`; `train_df` is rebound to the remaining rows,
# so re-running this cell splits the already-reduced frame again.
# NOTE(review): no random_state is passed here — confirm the split is reproducible as intended.
train_df,val_df = model_selection.train_test_split(train_df,test_size=0.05)

In [None]:
train_df.shape,val_df.shape

*Lets create a list of labels for modelling purpose*

In [None]:
LABEL_COLUMNS = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

*Lets check the distribution of different type of toxic comments*

In [None]:
# Per-label positive counts as a two-column frame (Type, Count), shown as a bar chart.
lab_c = (
    train_df[LABEL_COLUMNS]
    .sum()
    .rename_axis('Type')
    .reset_index(name='Count')
)
fig = pe.bar(lab_c, x='Type', y='Count')
fig.show()

*Let's handle the imbalance in the dataset*

In [None]:
# Per-label positive counts, plus one extra row with the number of clean comments.
lab_toxic_clean_mix = pd.DataFrame(train_df[LABEL_COLUMNS].sum()).reset_index()
lab_toxic_clean_mix.columns = ['Type','Count']
# A comment can carry several labels at once, so subtracting the summed label
# counts from the row count would undercount clean comments. Count the rows
# whose labels are all zero instead.
lab_toxic_clean_mix.loc[len(lab_toxic_clean_mix.index)] = [
    'Clean',
    int((train_df[LABEL_COLUMNS].sum(axis=1) == 0).sum()),
]

*Lets plot toxic and clean data count*

*In Below plot we can clearly notice that there is a huge count gap between the toxic and regular comments*

In [None]:
fig = pe.bar(lab_toxic_clean_mix, x='Type', y='Count')
fig.show()

*I am creating a dataframe with a roughly equal number of samples from clean and toxic labels*

In [None]:
# Split the training rows by whether any label is set, then keep every toxic
# row together with a random draw of 15,000 clean rows.
toxic_df = train_df.loc[train_df[LABEL_COLUMNS].sum(axis=1).gt(0)]
clean_df = train_df.loc[train_df[LABEL_COLUMNS].sum(axis=1).eq(0)]

train_df = pd.concat([toxic_df, clean_df.sample(n=15_000)])

In [None]:
train_df.shape

*Lets Experiment with a single comment*

In [None]:
# Pick one comment to walk through the tokenizer with.
sample_row = train_df[train_df.id=='325cd3656d865766']
# The random split and the down-sampling above may have removed this particular
# id from train_df; fall back to the first training row instead of raising.
sample_row = sample_row.iloc[0] if len(sample_row) else train_df.iloc[0]
sample_comment = sample_row.comment_text
sample_labels = sample_row[LABEL_COLUMNS]

print(sample_comment)
print()
print(sample_labels.to_dict())

*Defining the tokenizer*

In [None]:
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

In [None]:
# Encode the sample comment to a fixed 512-token tensor pair.
# truncation=True keeps comments longer than max_length from producing an
# over-long sequence, matching what ToxicCommentsDataset does below.
encoding  = tokenizer.encode_plus(sample_comment,
                     add_special_tokens=True,
                     max_length=512,
                     return_token_type_ids=False,
                     padding = 'max_length',
                     truncation = True,
                     return_attention_mask=True,
                     return_tensors='pt'
                     )

In [None]:
# The encoding consists of input_ids and an attention mask
encoding.keys()

In [None]:
encoding['input_ids'].shape,encoding['attention_mask'].shape

In [None]:
print(encoding['input_ids'].squeeze()[:50])
print(encoding['attention_mask'].squeeze()[:50])

In [None]:
print(tokenizer.convert_ids_to_tokens(encoding['input_ids'].squeeze()[:50]))

*Before starting the dataset creation, let's find out the comment lengths in the dataframe*

In [None]:
from nltk.tokenize import word_tokenize

# Number of word_tokenize tokens in every training comment.
length = [len(word_tokenize(comment)) for comment in train_df.comment_text]

In [None]:
sns.distplot(length)

*Here we can notice that the mean length is quite small, so I am going to use a max token length of 128*

In [None]:
np.mean(length)

*Now lets create our dataset for modeling , We can override few functions as per our need to create a custom dataset*

In [None]:
class ToxicCommentsDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Args:
        data: frame with a ``comment_text`` column and the ``LABEL_COLUMNS`` columns.
        tokenizer: tokenizer exposing ``encode_plus`` (a ``BertTokenizer`` in this notebook).
        max_token_len: every comment is padded or truncated to this many tokens.
    """

    def __init__(self,data:pd.DataFrame,tokenizer:"BertTokenizer",max_token_len:int=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self,idx:int):
        """Return the raw text, token ids, attention mask and labels of row ``idx``.

        Returns:
            dict with ``comment_text`` (the untouched text), ``input_ids`` and
            ``attention_mask`` (1-D tensors of length ``max_token_len``) and
            ``labels`` (1-D float32 tensor, one entry per label column).
        """
        data_row = self.data.iloc[idx]
        comment_text = data_row.comment_text
        # A row of the mixed-type frame is an object Series indexed by column
        # name. Convert it to a float32 array explicitly rather than letting
        # torch.FloatTensor iterate over the Series by integer position.
        labels = data_row[LABEL_COLUMNS].to_numpy(dtype="float32")

        encoding = self.tokenizer.encode_plus(
                 comment_text,
                 add_special_tokens=True,
                 max_length=self.max_token_len,
                 return_token_type_ids=False,
                 padding = 'max_length',
                 truncation = True,
                 return_attention_mask=True,
                 return_tensors='pt'
                 )

        return dict(
            comment_text = comment_text,
            # encode_plus returns (1, max_token_len) tensors; drop the batch axis.
            input_ids = encoding['input_ids'].flatten(),
            attention_mask = encoding['attention_mask'].flatten(),
            labels = torch.tensor(labels)
            )

In [None]:
train_dataset = ToxicCommentsDataset(train_df,tokenizer)
sample_row = train_dataset[0]

*Lets see the size of our return values*

In [None]:
sample_row['input_ids'].shape,sample_row['attention_mask'].shape,sample_row['labels'].shape

In [None]:
# Unpack the four fields of the sample item and print them, one blank line apart.
comment_text, input_id, attention_m, text_lab = (
    sample_row['comment_text'],
    sample_row['input_ids'],
    sample_row['attention_mask'],
    sample_row['labels'],
)

print(comment_text, input_id, attention_m, text_lab, sep='\n\n')

*Let's define our BERT model for testing. The section below is not required and can be ignored.*

In [None]:
bert_model = BertModel.from_pretrained(BERT_MODEL_NAME,return_dict = True)

In [None]:
bert_model

In [None]:
sample_prediction = bert_model(sample_row['input_ids'].unsqueeze(dim=0),sample_row['attention_mask'].unsqueeze(dim=0))

In [None]:
sample_prediction.last_hidden_state.shape,sample_prediction.pooler_output.shape

*Below is the code to create dataloader which will take Dataset class as input*

In [None]:
class ToxicCommentDataModule(pl.LightningDataModule):
    """Lightning data module wrapping the training and hold-out comment frames.

    Args:
        train_df: frame used for training batches.
        test_df: frame used for both the validation and the test loader.
        tokenizer: tokenizer handed to every ``ToxicCommentsDataset``.
        batch_size: batch size of all three loaders.
        max_token_len: token length every comment is padded or truncated to.
    """

    def __init__(self,train_df,test_df,tokenizer,batch_size=8,max_token_len=128):
        super().__init__()

        self.train_df = train_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_token_len=max_token_len

    def setup(self,stage=None):
        """Build the datasets.

        ``stage`` is accepted (and ignored) because Lightning passes it when the
        trainer calls this hook itself. Calling ``setup()`` with no argument
        still works.
        """
        self.train_dataset = ToxicCommentsDataset(
            self.train_df,
            self.tokenizer,
            self.max_token_len
        )
        self.test_dataset = ToxicCommentsDataset(
            self.test_df,
            self.tokenizer,
            self.max_token_len
        )

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return DataLoader(
            self.train_dataset,
            batch_size = self.batch_size,
            shuffle=True,
            num_workers = 4
        )

    def val_dataloader(self):
        """Loader over the hold-out dataset, in its stored order."""
        # Use the configured batch size: one sample per forward pass only makes
        # evaluation slower.
        return DataLoader(self.test_dataset,batch_size = self.batch_size,num_workers = 4)

    def test_dataloader(self):
        """Loader over the same hold-out dataset, used by ``trainer.test()``."""
        return DataLoader(self.test_dataset,batch_size = self.batch_size,num_workers = 4)

In [None]:
data_module = ToxicCommentDataModule(train_df,val_df,tokenizer,batch_size=BATCH_SIZE)
data_module.setup()

*Let's create a class for the actual model configuration, loss and metrics*

In [None]:
class ToxicCommentClassifier(pl.LightningModule):
    """BERT encoder with a linear multi-label head trained with binary cross-entropy.

    Args:
        n_classes: number of output labels (one sigmoid probability each).
        steps_per_epoch: optimizer steps per epoch, used to size the LR schedule.
        n_epochs: number of training epochs, used to size the LR schedule.
    """

    def __init__(self,n_classes:int,steps_per_epoch:int=None,n_epochs=None):
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_MODEL_NAME,return_dict = True)
        self.classifier = torch.nn.Linear(self.bert.config.hidden_size,n_classes)

        self.steps_per_epoch = steps_per_epoch
        self.n_epochs = n_epochs

        self.criterion = torch.nn.BCELoss()

    def forward(self,input_ids,attention_mask,labels=None):
        """Return ``(loss, probabilities)``; ``loss`` is 0 when no labels are given."""
        output = self.bert(input_ids,attention_mask=attention_mask)
        output = self.classifier(output.pooler_output)
        output = torch.sigmoid(output)

        loss = 0
        if labels is not None:
            loss = self.criterion(output,labels)
        return loss ,output

    def training_step(self,batch,batch_idx):
        """Compute and log the training loss; keep predictions/labels for the epoch-end AUROC."""
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        loss,outputs = self(input_ids,attention_mask,labels)
        self.log('train_loss',loss,prog_bar=True,logger=True)
        # Only the loss needs gradients. The predictions are kept solely for the
        # epoch-end metric, so detach them to avoid holding every batch's
        # autograd graph until the epoch finishes.
        return {"loss":loss,"predictions":outputs.detach(),"labels":labels}

    def validation_step(self,batch,batch_idx):
        """Compute and log the validation loss of one batch."""
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        loss,outputs = self(input_ids,attention_mask,labels)
        self.log('val_loss',loss,prog_bar=True,logger=True)
        return loss

    def test_step(self,batch,batch_idx):
        """Compute and log the test loss of one batch."""
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        loss,outputs = self(input_ids,attention_mask,labels)
        self.log('test_loss',loss,prog_bar=True,logger=True)
        return loss

    def training_epoch_end(self,outputs):
        """Log one training ROC-AUC scalar per label column for the finished epoch."""
        # Concatenate the per-batch tensors along the sample axis.
        labels = torch.cat([out['labels'].detach().cpu() for out in outputs])
        predictions = torch.cat([out['predictions'].detach().cpu() for out in outputs])

        for i,name in enumerate(LABEL_COLUMNS):
            roc_score = auroc(predictions[:,i],labels[:,i])
            self.logger.experiment.add_scalar(f"{name}_roc_auc/Train",roc_score,self.current_epoch)

    def configure_optimizers(self):
        """Return AdamW plus a linear warmup/decay schedule that is stepped per batch."""
        optimizer =  AdamW(self.parameters(), lr=2e-5)

        # Without the schedule sizes (e.g. a model restored only for inference)
        # there is nothing to build a schedule from.
        if self.steps_per_epoch is None or self.n_epochs is None:
            return [optimizer],[]

        warmup_steps = self.steps_per_epoch // 3
        # The schedule expects the TOTAL number of training steps, warmup
        # included. Subtracting the warmup would drive the learning rate to zero
        # before training ends.
        total_steps = self.steps_per_epoch * self.n_epochs

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            warmup_steps,
            total_steps
        )

        # The schedule is expressed in optimizer steps, so Lightning must step it
        # after every batch. Its default is once per epoch, which would leave the
        # learning rate near zero for the whole run.
        return [optimizer],[dict(scheduler=scheduler,interval='step')]

In [None]:
# One output per label column. steps_per_epoch uses ceiling division because the
# training DataLoader does not drop the final partial batch, so that batch is
# an optimizer step too.
model = ToxicCommentClassifier(n_classes=len(LABEL_COLUMNS),
                               steps_per_epoch=(len(train_df) + BATCH_SIZE - 1)//BATCH_SIZE,
                              n_epochs=N_EPOCHS
                              )

*Let's run our trainer. Enable fast_dev_run if you just want to do a quick run; it helps in debugging the code.*

In [None]:
# Train on one GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook still runs end-to-end on a machine without a GPU.
trainer = pl.Trainer(max_epochs=N_EPOCHS,
                     gpus=1 if torch.cuda.is_available() else 0,
                     progress_bar_refresh_rate=30
#                      fast_dev_run=True
                    )

In [None]:
trainer.fit(model,data_module)

In [None]:
# Load the extension and start TensorBoard
# %load_ext tensorboard
# %tensorboard --logdir ./lightning_logs
# %reload_ext tensorboard


In [None]:
trainer.test()

*Save the best model*

In [None]:
trainer.save_checkpoint("final_checkpoint.ckpt")

*Load the model*

In [None]:
# Restore the fine-tuned weights from the checkpoint written above.
trained_model = ToxicCommentClassifier.load_from_checkpoint('./final_checkpoint.ckpt',
                                                            n_classes=6,
                                                            steps_per_epoch=len(train_df)//BATCH_SIZE)
# Put the restored model in inference mode: eval() disables dropout so repeated
# predictions on the same text agree, and freeze() turns off gradient tracking.
trained_model.eval()
trained_model.freeze()

# Work in progress below i have just checked for a single comment

*Lets test on a small sample *

In [None]:
test_comment = "Nonsense cocksucker Fuck? Kiss off,geek.What i said is true.I will have you account terminated."

In [None]:
# Encode the test comment exactly as the training data was encoded: padded to
# 128 tokens and truncated if longer, so the input has the shape the model was
# trained on.
encoding  = tokenizer.encode_plus(test_comment,
                     add_special_tokens=True,
                     max_length=128,
                     return_token_type_ids=False,
                     padding = 'max_length',
                     truncation = True,
                     return_attention_mask=True,
                     return_tensors='pt'
                     )

In [None]:
encoding.keys()

In [None]:
_,prediction = trained_model(encoding['input_ids'],encoding['attention_mask'])
prediction

In [None]:
test_prediction = prediction.detach().numpy()
test_prediction

In [None]:
# test_prediction holds one row of probabilities per encoded comment, and a
# single comment was encoded above. Index that row explicitly so each
# comparison is between scalars rather than relying on the truth value of an
# array slice.
prediction_labels = [
    label
    for label, label_prob in zip(LABEL_COLUMNS, test_prediction[0])
    if label_prob > 0.5
]

In [None]:
prediction_labels