### Importing libraries

In [None]:
import zipfile
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import BertTokenizer, BertModel, logging
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning.metrics.functional import accuracy
from collections import defaultdict

logging.set_verbosity_error()

### Dataset path

This notebook uses the training data from the Jigsaw Toxic Comment Classification Challenge:
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data

In [None]:
# Input file locations (relative paths; adjust them if the data lives elsewhere).
# train_csv_path  -> labelled training comments
# sample_sub_path -> sample submission file
# test_csv_path   -> comments that have to be scored
train_csv_path = '../input/d/julian3833/jigsaw-toxic-comment-classification-challenge/train.csv'
sample_sub_path = '../input/jigsaw-toxic-severity-rating/sample_submission.csv'
test_csv_path = '../input/jigsaw-toxic-severity-rating/comments_to_score.csv'

In [None]:
# Load the labelled training comments and preview the first rows.
train_df = pd.read_csv(train_csv_path)
train_df.head()

In [None]:
# Load the comments that have to be scored and preview the first rows.
test_df = pd.read_csv(test_csv_path)
test_df.head()

In [None]:
# Load the sample submission file and preview the first rows.
sample_sub_df = pd.read_csv(sample_sub_path)
sample_sub_df.head()

### Checking NaN

In [None]:
# Number of missing values in the training comment text.
train_df.comment_text.isnull().sum()

In [None]:
# Number of missing values in the text column of the comments to score.
test_df.text.isnull().sum()

### Checking for imbalance in the training dataset

In [None]:
def add_all(row):
    """Return 1 if any value from position 2 onward in ``row`` is set, else 0.

    The first two positions of the row are skipped; the remaining values are
    summed and the row is flagged when that sum is positive.
    """
    return 1 if row[2:].sum() > 0 else 0

# Flag each row (1/0) by running add_all across the columns of that row.
train_df['toxic_nontoxic'] = train_df.apply(add_all, axis=1)
train_df.head()

In [None]:
# Class balance of the raw training data.
# The Series is passed as `x=`: recent seaborn releases deprecate passing the
# variable positionally and read the first positional argument as `data`.
ax = sns.countplot(x=train_df.toxic_nontoxic)
ax.set_xlabel('Toxic vs Non toxic')
ax.set_xticklabels(['Non Toxic', 'Toxic']);

In [None]:
# Row counts per flag value (1 = the label columns summed to more than zero, 0 = otherwise).
train_df.toxic_nontoxic.value_counts()

The dataset is highly imbalanced: only around 16k comments are toxic. To build a balanced training set, we keep 16k toxic comments and sample 16k non-toxic comments.


In [None]:
# Split the frame by the 0/1 flag computed above and compare the group sizes.
toxicity_flag = train_df['toxic_nontoxic']
toxic_train_df = train_df[toxicity_flag == 1]
nontoxic_train_df = train_df[toxicity_flag == 0]
toxic_train_df.shape, nontoxic_train_df.shape

In [None]:
# Build a balanced frame: `sample` toxic rows plus `sample` non-toxic rows.
sample = 16000
seed = 42   # fixed seed so every shuffle below (and therefore the dataset) is reproducible

# Shuffle each group before slicing so the kept rows are a random subset.
toxic_train_df = toxic_train_df.sample(frac=1, random_state=seed).reset_index(drop=True)
nontoxic_train_df = nontoxic_train_df.sample(frac=1, random_state=seed).reset_index(drop=True)

new_train_df = pd.concat([toxic_train_df[:sample], nontoxic_train_df[:sample]])
# Shuffle once more so the two classes are interleaved, and rebuild a 0..n-1 index
# (later cells address rows with .loc[i]).
new_train_df = new_train_df.sample(frac=1, random_state=seed).reset_index(drop=True)
new_train_df.head()

In [None]:
# Class balance after resampling.
# The Series is passed as `x=`: recent seaborn releases deprecate passing the
# variable positionally and read the first positional argument as `data`.
ax = sns.countplot(x=new_train_df.toxic_nontoxic)
ax.set_xlabel('Toxic vs Non toxic')
ax.set_xticklabels(['Non Toxic', 'Toxic']);

In [None]:
# The helper flag was only needed for balancing; remove it before building the score.
new_train_df = new_train_df.drop(columns=['toxic_nontoxic'])
new_train_df.head()

### Create a score column for each comment text

In [None]:
# Weight assigned to each label column. The label columns are multiplied by
# these values and then averaged into a single `score` per comment, so a
# larger weight makes that category contribute more to the score.
weights_per_category = {'toxic': 0.5,
                        'severe_toxic': 1.5,
                        'obscene': 0.25,
                        'threat': 1.5,
                        'insult': 0.8,
                        'identity_hate': 1.5}

In [None]:
# Scale every label column by its weight.
# NOTE: this overwrites the columns, so running the cell twice applies the weights twice.
for category, weight in weights_per_category.items():
    new_train_df[category] = new_train_df[category] * weight

In [None]:
# Score = row-wise mean over the label columns from 'toxic' through 'identity_hate'.
label_block = new_train_df.loc[:, 'toxic':'identity_hate']
new_train_df['score'] = label_block.mean(axis=1)
new_train_df.head()

### Determining the max length of the tokens to be encoded from comment_text (using BertTokenizer)

In [None]:
# Tokenizer for the pretrained "bert-base-uncased" checkpoint.
# The same checkpoint name is used when the BertModel is loaded later on.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Encode one sample comment to see what the tokenizer returns.
idx = 2
sample_text = new_train_df.loc[idx, 'comment_text']
print('Sample Text: ')
print(sample_text)

# encode_plus reference:
# https://huggingface.co/docs/transformers/v4.15.0/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.encode_plus

encoded_sample = tokenizer.encode_plus(
    sample_text,
    add_special_tokens=True,     # adds the [CLS] and [SEP] tokens
    max_length=100,              # placeholder; the real value is determined next
    truncation=True,             # without this a longer comment is not cut down to max_length
    return_token_type_ids=False,
    padding='max_length',        # shorter comments are filled up with [PAD]
    return_attention_mask=True,
    return_tensors='pt'          # pytorch tensors
)

# encoded_sample behaves like a dictionary; show which entries it holds
encoded_sample.keys()

In [None]:
# Token ids produced for the sample text (a PyTorch tensor, since return_tensors='pt' was requested).
print('Encoded tokens of sample text:')
print(encoded_sample['input_ids'])
print(f"Shape of encoded tokens: {encoded_sample['input_ids'].shape}")

In [None]:
# Attention mask: 1 marks a token to be considered, 0 marks a token to be ignored (padding).
attention_mask = encoded_sample['attention_mask']
print(attention_mask)
print(f'Shape of attention mask: {attention_mask.shape}')

In [None]:
# Map the ids back to their token strings to inspect how the sample text was split.
encoded_tokens = tokenizer.convert_ids_to_tokens(encoded_sample['input_ids'].squeeze())
print(encoded_tokens)

In [None]:
# Token count of every comment; .encode returns only the token ids, capped at 512 here.
token_lens = [
    len(tokenizer.encode(text, max_length=512, truncation=True))
    for text in tqdm(new_train_df['comment_text'])
]

In [None]:
# Distribution of token counts. `distplot` is deprecated in seaborn, so the
# same histogram + KDE view is drawn with `histplot`.
ax = sns.histplot(x=token_lens, kde=True, stat='density')
ax.set_xlim([0, 512])
ax.set_xlabel('Token count');

From the distribution above, a maximum length of about 350 tokens looks like a good fit.

In [None]:
# Sequence length passed to the datasets: comments are padded or truncated to this many tokens.
MAX_LEN = 350

### Splitting the newly sampled dataframe into training and validation dataframes

In [None]:
# 80/20 train/validation split with a fixed seed so the split is reproducible.
# NOTE: this rebinds `train_df`, which until now held the raw training CSV.
train_df, val_df = train_test_split(new_train_df, test_size=0.2, random_state=42)
train_df.shape, val_df.shape

### Building Custom Dataset


In [None]:
class CustomDataset(Dataset):
    """Dataset that turns rows of a dataframe into (input_ids, attention_mask, target).

    Columns are read by position: the text is taken from column 1 and the
    regression target from the last column of ``df``.

    Args:
        df: dataframe holding the text in column 1 and the target in the last column.
        tokenizer: object exposing ``encode_plus`` (e.g. a BertTokenizer).
        max_len: every text is padded/truncated to exactly this many tokens.
    """

    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, target)`` for the row at position ``index``.

        ``input_ids`` and ``attention_mask`` have shape ``(max_len,)``; ``target``
        is a 0-dim float32 tensor.
        """
        comment_text = self.df.iloc[index, 1]
        target = self.df.iloc[index, -1]

        encoding = self.tokenizer.encode_plus(comment_text,
                                              add_special_tokens=True,
                                              max_length=self.max_len,
                                              return_token_type_ids=False,
                                              padding='max_length',
                                              truncation=True,
                                              return_attention_mask=True,
                                              return_tensors='pt'
                                              )

        # The tokenizer returns tensors of shape (1, max_len); drop only that
        # leading dimension (a bare squeeze() would also drop max_len if it were 1).
        input_ids = encoding['input_ids'].squeeze(0)              # Shape: (max_len)
        attention_mask = encoding['attention_mask'].squeeze(0)    # Shape: (max_len)
        # float32 matches the dtype of the model output; a float64 target would
        # otherwise have to be cast by every consumer before computing the loss.
        target = torch.tensor(target, dtype=torch.float32)

        return input_ids, attention_mask, target

In [None]:
# Wrap both splits; they share the tokenizer and the sequence length.
train_dataset = CustomDataset(df=train_df, tokenizer=tokenizer, max_len=MAX_LEN)
val_dataset = CustomDataset(df=val_df, tokenizer=tokenizer, max_len=MAX_LEN)

In [None]:
# Fetch the first training example and check the shapes returned by CustomDataset.__getitem__.
input_ids, attention_mask, targets = train_dataset[0]
input_ids.shape, attention_mask.shape, targets.shape

In [None]:
# Same shape check on the first validation example.
input_ids, attention_mask, targets = val_dataset[0]
input_ids.shape, attention_mask.shape, targets.shape

### Building Train and Val DataLoaders

In [None]:
# Number of examples per batch, used by both DataLoaders.
BATCH_SIZE = 16

In [None]:
# Only the training loader shuffles; the validation loader keeps the dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, pin_memory=True)

In [None]:
# Pull a single batch from the training loader and look at the batched shapes.
input_ids, attention_mask, targets = next(iter(train_loader))
input_ids.shape, attention_mask.shape, targets.shape

### Let's try running a sample batch on BertModel

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Load the pretrained BERT encoder (same 'bert-base-uncased' checkpoint as the tokenizer).
bert_model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
# Forward the sample batch through BERT just to inspect the outputs.
# No gradients are needed for this inspection, so skip building the autograd graph.
with torch.no_grad():
    output = bert_model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        return_dict=True)
output.keys()

In [None]:
# The model output holds two tensors, last_hidden_state and pooler_output;
# pull both out and compare their dimensions.
last_hidden_state = output['last_hidden_state']
pooler_output = output['pooler_output']

last_hidden_state.shape, pooler_output.shape

**last_hidden_state** contains the hidden representations for each token in each sequence of the batch. So the size is (batch_size, seq_len, hidden_size)

**pooler_output** contains a "representation" of each sequence in the batch, and is of size (batch_size, hidden_size)

768 is the hidden size

[More about bert model output](https://github.com/huggingface/transformers/issues/7540)

In [None]:
# Hidden size as reported by the loaded model's config.
bert_model.config.hidden_size

We are interested in the pooler_output,
so we will feed it to a linear layer that outputs a single score.

In [None]:
# Map each pooled sequence representation to one score. The input width is
# read from the model config instead of hard-coding 768, so it always matches
# the loaded checkpoint (and matches how Net builds its head).
linear_layer = nn.Linear(bert_model.config.hidden_size, 1)
output = linear_layer(pooler_output)
output.shape

### Awesome! Now let's define the loss function and print the loss value corresponding to the sample batch we used

In [None]:
# Mean squared error between predicted and target scores.
criterion = nn.MSELoss()

In [None]:
# Drop only the trailing size-1 dimension and cast both sides to float32,
# the same way train_epoch / val_epoch prepare their loss inputs.
loss = criterion(output.squeeze(-1).float(), targets.float())
print(f'Loss on sample batch: {loss.item()}')

### Double Awesome!!

### Now let's build the model architecture

In [None]:
class Net(nn.Module):
    """BERT encoder followed by a single linear unit that outputs one score per sequence.

    Args:
        bert_model: encoder whose ``config.hidden_size`` sets the input width of
            the linear head and whose output provides ``'pooler_output'``.
    """

    def __init__(self, bert_model):
        super().__init__()
        self.bert_model = bert_model
        hidden_size = bert_model.config.hidden_size
        self.fcdense = nn.Linear(hidden_size, 1)
        # Instantiated but not applied in forward(): the head returns the raw linear output.
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return the linear head applied to the encoder's pooled output, shape (batch_size, 1)."""
        encoded = self.bert_model(input_ids, attention_mask, return_dict=True)
        pooled = encoded['pooler_output']    # (batch_size, hidden_size)
        return self.fcdense(pooled)          # (batch_size, 1)

In [None]:
# Build the full model on DEVICE and run the sample batch through it as a smoke test.
model = Net(bert_model=bert_model).to(DEVICE)
output = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))
print(output.shape)

In [None]:
# Freeze the BERT layers: only parameters whose name contains 'fcdense'
# (the linear head) stay trainable.
for name, param in model.named_parameters():
    param.requires_grad = 'fcdense' in name

### Let's train the model

In [None]:
def train_epoch(model, train_loader, criterion, optimizer, DEVICE):
    """Run one training pass over ``train_loader`` and return the mean batch loss.

    Args:
        model: module called as ``model(input_ids, attention_mask)``.
        train_loader: iterable yielding ``(input_ids, attention_mask, targets)`` batches.
        criterion: loss called as ``criterion(predictions, targets)``.
        optimizer: optimizer stepped once per batch.
        DEVICE: device the batch tensors are moved to.

    Returns:
        Mean of the per-batch loss values.
    """
    model.train()

    losses = []

    for input_ids, attention_mask, targets in tqdm(train_loader):
        input_ids = input_ids.to(DEVICE)   # (batch_size, seq_len)
        attention_mask = attention_mask.to(DEVICE)   # (batch_size, seq_len)
        targets = targets.to(DEVICE)  # (batch_size,)

        output = model(input_ids, attention_mask)   # (batch_size, 1)

        # Flatten to (batch_size,). A bare squeeze() would also drop the batch
        # dimension when a batch holds a single example, leaving a 0-dim
        # prediction against a (1,) target.
        loss = criterion(output.reshape(-1).float(), targets.reshape(-1).float())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(loss.item())

    return np.mean(losses)

In [None]:
def val_epoch(model, val_loader, criterion, DEVICE):
    """Evaluate ``model`` on ``val_loader`` without gradients and return the mean batch loss.

    Args:
        model: module called as ``model(input_ids, attention_mask)``.
        val_loader: iterable yielding ``(input_ids, attention_mask, targets)`` batches.
        criterion: loss called as ``criterion(predictions, targets)``.
        DEVICE: device the batch tensors are moved to.

    Returns:
        Mean of the per-batch loss values.
    """
    model.eval()

    losses = []

    with torch.no_grad():
        for input_ids, attention_mask, targets in tqdm(val_loader):
            input_ids = input_ids.to(DEVICE)   # (batch_size, seq_len)
            attention_mask = attention_mask.to(DEVICE)   # (batch_size, seq_len)
            targets = targets.to(DEVICE)  # (batch_size,)

            output = model(input_ids, attention_mask)   # (batch_size, 1)

            # Flatten to (batch_size,). A bare squeeze() would also drop the
            # batch dimension when a batch holds a single example, leaving a
            # 0-dim prediction against a (1,) target.
            loss = criterion(output.reshape(-1).float(), targets.reshape(-1).float())

            losses.append(loss.item())

    return np.mean(losses)

In [None]:
EPOCHS = 25
LEARNING_RATE = 2e-5

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# Lowest validation loss seen so far; a checkpoint is written whenever it improves.
best_val_loss = np.inf

for epoch in range(EPOCHS):
    print(f'Epoch: {epoch+1}/{EPOCHS}')
    print('-' * 10)

    print('Training')
    train_loss = train_epoch(model=model,
                             train_loader=train_loader,
                             criterion=criterion,
                             optimizer=optimizer,
                             DEVICE=DEVICE)

    print('Validating')
    val_loss = val_epoch(model=model,
                         val_loader=val_loader,
                         criterion=criterion,
                         DEVICE=DEVICE)

    print(f'Train Loss: {train_loss}\t Val Loss: {val_loss}')

    # Nothing to save unless this epoch beat the best validation loss.
    if not val_loss < best_val_loss:
        continue

    best_val_loss = val_loss
    torch.save(model.state_dict(), 'toxicity_best_model.pth.tar')

### Save the bert_model and bert_tokenizer

In [None]:
# Write the tokenizer to ./tokenizer_pretrained/.
tokenizer.save_pretrained('./tokenizer_pretrained/')

In [None]:
# Write the BERT encoder to ./bert_model_pretrained/.
# The linear head (fcdense) belongs to Net, not to bert_model; it is stored
# in the state_dict checkpoint written during training.
bert_model.save_pretrained('./bert_model_pretrained/')

In [None]:
# Marker printed once every cell above has run.
print('Done!')