#### Original work: https://www.kaggle.com/debarshichanda/pytorch-w-b-jigsaw-starter
#### This notebook applies the same strategy, augmenting the training data with other Jigsaw-related datasets (Toxic Comment Classification, Ruddit, and Jigsaw Unintended Bias) by creating (less toxic, more toxic) pairs and training RoBERTa on the augmented data.

### LB score: 0.834


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file below the Kaggle input directory, one full path per line.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import gc
import copy
import time
import random
import string

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

# Utils
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold

# For Transformer Models
from transformers import AutoTokenizer, AutoModel, AdamW

# For colored terminal text
from colorama import Fore, Back, Style
# Short aliases for the terminal colour codes used in the training log messages.
b_, y_, sr_ = Fore.BLUE, Fore.YELLOW, Style.RESET_ALL

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
import wandb

# Log in to Weights & Biases with the Kaggle secret labelled "w_b".
# `anony` records the outcome: None after a successful login, "must" when the
# login could not be performed and anonymous logging has to be used instead.
try:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("w_b")
    wandb.login(key=api_key)
    anony = None
except Exception:
    # Deliberately best-effort (missing package, missing secret, failed login),
    # but `Exception` rather than a bare `except` so that KeyboardInterrupt and
    # SystemExit are not swallowed.
    anony = "must"
    print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as w_b. \nGet your W&B access token from here: https://wandb.ai/authorize')

In [None]:
def id_generator(size=12, chars=string.ascii_lowercase + string.digits):
    """Return a random string of `size` characters drawn from `chars`.

    Characters are chosen with `random.SystemRandom`, so the result does not
    depend on (and does not disturb) the seeded global `random` state.
    """
    rng = random.SystemRandom()
    picked = []
    for _ in range(size):
        picked.append(rng.choice(chars))
    return ''.join(picked)


# Random tag for this execution of the notebook; used to name and group runs.
HASH_NAME = id_generator(size=12)
print(HASH_NAME)

In [None]:
# Central run configuration. The whole dict is also passed to `wandb.init`
# as the run config by the fold loop.
# NOTE: the 'cosineAnnealingWarmRestarts' branch of `fetch_scheduler` reads
# CONFIG['T_0'], which is not defined here; add it before selecting that scheduler.
CONFIG = {
    "seed": 42,
    "num_epochs": 3,
    "train_batch_size": 16,
    "val_batch_size": 32,
    "model_name": "roberta-base",
    "learning_rate": 1e-4,
    "scheduler": None,       # None -> constant learning rate (see fetch_scheduler)
    "min_lr": 1e-6,
    "n_fold": 3,
    "weight_decay": 1e-6,
    "T_max": 500,
    "num_classes": 1,        # output width of the model head
    "margin": 0.5,           # margin of the ranking loss (see criterion)
    "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    "hash_name": HASH_NAME,
    "max_length": 256,       # padded / truncated token length per comment
}

CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG["model_name"])
# Bug fix: the `f` prefix used to sit inside the quotes ('f{HASH_NAME}-Baseline'),
# so the group name was that literal text instead of containing the run hash.
CONFIG["group"] = f'{HASH_NAME}-Baseline'

In [None]:
def set_seed(seed=42):
    '''Seed every random number generator used by the notebook.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) and
    switches cuDNN to deterministic mode so repeated runs give the same results.

    Args:
        seed: Integer seed applied to all generators.
    '''
    # Python's own generator was previously left unseeded.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # `manual_seed_all` covers every visible GPU, not just the current one;
    # it is a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed. Python reads this variable at
    # interpreter start-up, so it applies to processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Apply the configured seed before any data shuffling or model initialisation.
set_seed(seed=CONFIG['seed'])

In [None]:
# Competition validation pairs; the cells below use its "less_toxic" and "more_toxic" columns.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv")
df.head(n=5)

In [None]:
# Number of rows (pairs), then the number of distinct comments across both columns.
print(df.shape[0])
np.unique(np.concatenate([df["less_toxic"], df["more_toxic"]])).size

### Prepare Ruddit data

In [None]:
# Load the Ruddit comments and discard rows whose text is the "[deleted]" placeholder.
df_ruddit = pd.read_csv(filepath_or_buffer="/kaggle/input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")
df_ruddit = df_ruddit.loc[df_ruddit["txt"].ne("[deleted]")]
df_ruddit.shape[0]

In [None]:
# Min-max scale the offensiveness score to the [0, 1] range.
df_ruddit["offensiveness_score"] = df_ruddit["offensiveness_score"].pipe(
    lambda scores: (scores - scores.min()) / (scores.max() - scores.min())
)

In [None]:

# Build (less_toxic, more_toxic) text pairs from Ruddit.
# For each comment, up to two partners scoring at least 0.3 lower and up to two
# scoring at least 0.3 higher are drawn (four are sampled, the first two are kept),
# provided at least four candidates exist on that side.
comment_pairs = []
ruddit_scores = df_ruddit["offensiveness_score"]
for idx, record in df_ruddit.iterrows():
    anchor_text = record["txt"]
    anchor_score = record["offensiveness_score"]

    milder = df_ruddit[ruddit_scores <= (anchor_score - 0.3)]
    if len(milder) >= 4:
        # Seed derived from the row label keeps the draw reproducible per comment.
        drawn = milder.sample(n=4, random_state=idx + 1)["txt"].tolist()
        comment_pairs.extend((txt, anchor_text) for txt in drawn[:2])

    harsher = df_ruddit[ruddit_scores >= (anchor_score + 0.3)]
    if len(harsher) >= 4:
        drawn = harsher.sample(n=4, random_state=idx + 2)["txt"].tolist()
        comment_pairs.extend((anchor_text, txt) for txt in drawn[:2])
    

In [None]:
# Tabular view of the Ruddit pairs collected so far (for inspection).
df_ruddit_final = pd.DataFrame(data=comment_pairs, columns=["less_toxic", "more_toxic"])
df_ruddit_final

### Prepare toxic classification data

In [None]:


# Training set of the Jigsaw Toxic Comment Classification challenge.
df_classification = pd.read_csv(filepath_or_buffer="/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv")
df_classification.head(n=5)

In [None]:
## Remove classification comments that also appear in the severity validation pairs

# Distinct comments in the severity validation data
df_val = pd.read_csv(filepath_or_buffer="../input/jigsaw-toxic-severity-rating/validation_data.csv")
print(df_val.shape)
tot_unique_comments = np.unique(np.concatenate([df_val["less_toxic"], df_val["more_toxic"]]))
print("total unique: ", len(tot_unique_comments))

# Inner-join each side of the pairs against the classification texts to find
# the comments present in both datasets.
df_val_1 = df_val.merge(df_classification[["comment_text"]],
                        how="inner",
                        left_on="less_toxic",
                        right_on="comment_text")

df_val_2 = df_val.merge(df_classification[["comment_text"]],
                        how="inner",
                        left_on="more_toxic",
                        right_on="comment_text")

tot_unique_common = np.unique(np.concatenate([df_val_1["comment_text"], df_val_2["comment_text"]]))
print("total common: ", len(tot_unique_common))

# Keep only the classification rows whose text is not shared with the validation data
df_classification_u = df_classification.loc[~df_classification["comment_text"].isin(tot_unique_common)]
print("total uncommon :", len(df_classification_u))

In [None]:
# Take an explicit copy: `df_classification_u` is a boolean-filtered slice of
# `df_classification`, and adding a column to such a slice is unreliable in pandas.
df_classification_u = df_classification_u.copy()
# A row is "neutral" when none of the six labels is set.
df_classification_u["neutral"] = 1 - df_classification_u[["toxic","severe_toxic","obscene","threat","insult","identity_hate"]].max(axis=1)
# "More toxic" = at least two of (severe_toxic, threat, toxic) are set.
# Bug fix: this used `.max(axis=1) >= 2`. The labels are treated as 0/1 flags
# (see the `neutral` computation above), so the maximum never reaches 2 and the
# selection was always empty; counting the set flags with `.sum` is what `>= 2` needs.
more_toxic = df_classification_u[df_classification_u[["severe_toxic","threat", "toxic"]].sum(axis=1)>=2]["comment_text"]
# Bug fix: select the text column. Previously `less_toxic` stayed a DataFrame, so the
# pairing loop's `zip(less_toxic, more_toxic)` would have iterated column names.
# Only the first 5 * len(more_toxic) samples are consumed by that loop (zip truncates).
less_toxic = df_classification_u[df_classification_u["neutral"]==1].sample(n = 10*len(more_toxic), random_state = CONFIG["seed"])["comment_text"]
len(less_toxic), len(more_toxic)



In [None]:
# Repeat every "more toxic" entry five times in a row, then pair the entries
# positionally with `less_toxic`; zip stops at the shorter of the two.
more_toxic = more_toxic.repeat(5)

comment_pairs.extend(zip(less_toxic, more_toxic))
    

### Prepare Jigsaw unintended Bias data

In [None]:
# Show up to 500 columns when displaying frames, then load the unintended-bias training set.
pd.set_option("display.max_columns", 500)
df_multi = pd.read_csv(filepath_or_buffer="/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv")
df_multi.head(n=5)

In [None]:
# Row-wise sum over the positional column slice 8:-13.
# NOTE(review): presumably these are the identity annotation columns — verify against the CSV header.
df_multi["identity_associated"] = df_multi.iloc[:, 8:-13].sum(axis="columns")

In [None]:
# A comment is neutral when all six toxicity scores sum to exactly zero.
df_multi["neutral"] = df_multi[["toxic","severe_toxicity","obscene","threat","insult","identity_attack"]].sum(axis=1).eq(0)

# "More toxic" pool: union (deduplicated) of three high-score selections.
more_toxic_1 = df_multi[df_multi[["severe_toxicity","threat"]].sum(axis=1).gt(0.2)]["comment_text"]
more_toxic_2 = df_multi[df_multi["toxic"].ge(0.8)]["comment_text"]
more_toxic_3 = df_multi[df_multi["identity_attack"].ge(0.8)]["comment_text"]
more_toxic = np.unique(np.concatenate([more_toxic_1, more_toxic_2, more_toxic_3]))

# "Less toxic" pool: neutral comments, four per toxic comment without any
# identity association plus one per toxic comment with an identity association.
less_toxic_1 = (
    df_multi.loc[df_multi["neutral"].eq(1) & df_multi["identity_associated"].eq(0), :]
    .sample(n=4*len(more_toxic), random_state=CONFIG["seed"])["comment_text"]
)
less_toxic_2 = (
    df_multi.loc[df_multi["neutral"].eq(1) & df_multi["identity_associated"].gt(0), :]
    .sample(n=len(more_toxic), random_state=CONFIG["seed"])["comment_text"]
)

less_toxic = np.concatenate([less_toxic_1, less_toxic_2])
len(less_toxic), 5*len(more_toxic)

In [None]:
# Repeat every "more toxic" comment five times in a row, then pair the entries
# positionally with the sampled "less toxic" comments.
more_toxic = more_toxic.repeat(5)

comment_pairs.extend(zip(less_toxic, more_toxic))

In [None]:
# Total number of (less_toxic, more_toxic) pairs collected from the auxiliary datasets.
len(comment_pairs)

### Combining all Together

In [None]:
# Stack the competition validation pairs on top of the generated pairs.
df_2 = pd.DataFrame(data=comment_pairs, columns=["less_toxic", "more_toxic"])
combined_data = pd.concat(objs=[df.loc[:, ["less_toxic", "more_toxic"]], df_2], axis=0)
combined_data.info()

In [None]:
# Summary statistics of the character length of the "less_toxic" comments.
combined_data["less_toxic"].str.len().describe() 

In [None]:
# Shuffle all pairs, renumber the rows, and attach the constant ranking target
# (1 for every row).
df = (
    combined_data
    .sample(frac=1)
    .reset_index(drop=True)
    .assign(target=1)
)

In [None]:

import re

# Ordered (pattern, replacement) rewrites applied by `clean` before the text is stripped.
# Patterns are pre-compiled so that they are always treated as regular expressions,
# whatever the default of `Series.str.replace(regex=...)` is in the installed pandas.
_CLEAN_BEFORE_STRIP = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # Expand common contractions.
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        # NOTE: never matches, because the "'s" rule above has already consumed "'s".
        (r"\'scuse", " excuse "),
        # Collapse every whitespace run (newlines included) to one space.
        (r'\s+', ' '),
        # Clean some punctuations.
        # NOTE: a no-op here, the whitespace collapse above already removed newlines.
        ('\n', ' \n '),
        # Remove ip address
        (r'(([0-9]+\.){2,}[0-9]+)', ' '),
        # Remove website
        (r'https?://\S+|www\.\S+', ' '),
        # Separate words glued together by one of / ! ? .
        (r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)', r'\1 \2 \3'),
        # Replace repeating characters more than 3 times to length of 3
        (r'([*!?\'])\1\1{2,}', r'\1\1\1'),
        # Letter repeated 3+ times at the end of a word -> 2 letters
        (r'([a-zA-Z])\1{2,}\b', r'\1\1'),
        # Letter repeated 4+ times inside a word -> 3 letters
        (r'([a-zA-Z])\1\1{2,}\B', r'\1\1\1'),
        (r'[ ]{2,}', ' '),
    )
)

# Rewrites applied after stripping leading / trailing whitespace.
_CLEAN_AFTER_STRIP = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # Add space around repeating characters
        (r'([*!?\'"]+)', r' \1 '),
        # Remove multiple white spaces
        (r' +', ' '),
        # Remove html tags
        (r'<[^<]+?>', ' '),
    )
)


def clean(data, col):
    """Normalise the text in ``data[col]`` and return ``data``.

    The column is lower-cased, then rewritten by an ordered list of regular
    expression substitutions (contractions, whitespace, IP addresses, URLs,
    repeated characters, punctuation spacing, HTML tags).

    Every substitution is explicitly run as a regular expression. The previous
    code relied on the implicit ``regex`` default of ``Series.str.replace``,
    which changed to ``False`` in pandas 2.0 and silently turned the patterns
    into literal text that never matches.

    Args:
        data: DataFrame holding the column; it is modified in place.
        col: Name of the string column to clean.

    Returns:
        The same ``data`` object, with ``col`` replaced by the cleaned text.
    """
    text = data[col].str.lower()

    for pattern, replacement in _CLEAN_BEFORE_STRIP:
        text = text.str.replace(pattern, replacement, regex=True)

    text = text.str.strip()

    for pattern, replacement in _CLEAN_AFTER_STRIP:
        text = text.str.replace(pattern, replacement, regex=True)

    data[col] = text
    return data
                     


In [None]:
# Normalise the text on both sides of every pair.
for text_col in ("less_toxic", "more_toxic"):
    df = clean(df, text_col)

### Fold prep and training

In [None]:
# Assign every row to exactly one validation fold, stored in the "val_set" column.
skf = StratifiedKFold(
    n_splits=CONFIG["n_fold"],
    shuffle=True,
    random_state=CONFIG["seed"],
)

df["val_set"] = 0
for fold, (_, val_ind) in enumerate(skf.split(X=df, y=df["target"])):
    # Rows in this split's held-out part validate in fold `fold`.
    df.loc[val_ind, "val_set"] = fold

df["val_set"] = df["val_set"].astype(int)

In [None]:
class JigsawDataset(Dataset):
    """Map-style dataset yielding tokenised (less toxic, more toxic) comment pairs.

    Subclasses `torch.utils.data.Dataset` so it is a proper map-style dataset
    for `DataLoader`.

    Args:
        df: DataFrame with the text columns "less_toxic" and "more_toxic".
        tokenizer: Callable invoked as ``tokenizer(text=..., truncation=True,
            padding="max_length", add_special_tokens=True, max_length=...)``;
            its result must provide "input_ids" and "attention_mask".
        max_length: Length every sequence is padded / truncated to.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.more_toxic = df["more_toxic"].values
        self.less_toxic = df["less_toxic"].values
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def _encode(self, text):
        """Tokenise one comment and return its (input_ids, attention_mask) tensors."""
        encoded = self.tokenizer(
            text=text,
            truncation=True,
            padding="max_length",
            add_special_tokens=True,
            max_length=self.max_length,
        )
        return torch.tensor(encoded["input_ids"]), torch.tensor(encoded["attention_mask"])

    def __getitem__(self, index):
        """Return the tensors for pair `index`; "target" is always 1."""
        more_toxic_ids, more_toxic_mask = self._encode(self.more_toxic[index])
        less_toxic_ids, less_toxic_mask = self._encode(self.less_toxic[index])

        return {
            "less_toxic_ids": less_toxic_ids,
            "less_toxic_mask": less_toxic_mask,
            "more_toxic_ids": more_toxic_ids,
            "more_toxic_mask": more_toxic_mask,
            # Constant target: the first loss input ("more toxic") should rank higher.
            "target": torch.tensor(1),
        }
                                               
                                               
    
        
        
        
        

In [None]:
class JigsawModel(nn.Module):
    """Pretrained transformer backbone with dropout and a linear scoring head.

    Args:
        model_name: Name or path passed to `AutoModel.from_pretrained`.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(p=0.2)
        # Size the head from the backbone's config instead of hard-coding 768,
        # so backbones with a different hidden size (e.g. "-large" variants) work too.
        self.fc = nn.Linear(self.model.config.hidden_size, CONFIG["num_classes"])

    def forward(self, ids, mask):
        """Score a batch of token ids.

        Args:
            ids: Token id tensor passed to the backbone as `input_ids`.
            mask: Attention mask tensor passed as `attention_mask`.

        Returns:
            Output of the linear head, with CONFIG["num_classes"] values per sample.
        """
        out = self.model(input_ids=ids, attention_mask=mask, output_hidden_states=False)
        # out[1] is the second element of the backbone output
        # (presumably the pooled sentence representation — confirm for the chosen backbone).
        out = self.dropout(out[1])
        outputs = self.fc(out)
        return outputs
        

In [None]:
### Loss Function

def criterion(outputs1, outputs2, targets):
    """Margin ranking loss between two batches of scores.

    Args:
        outputs1: Scores that should rank higher where the target is 1.
        outputs2: Scores that should rank lower where the target is 1.
        targets: Tensor of 1 / -1 labels, one per sample.

    Returns:
        Scalar loss tensor, using the margin from CONFIG["margin"].
    """
    # The model emits (batch, 1) scores while the dataset yields (batch,) targets.
    # Left as-is this either broadcasts to a (batch, batch) matrix or is rejected
    # by newer PyTorch releases, so give the targets the scores' shape and dtype.
    if targets.shape != outputs1.shape and targets.numel() == outputs1.numel():
        targets = targets.reshape(outputs1.shape)
    targets = targets.to(dtype=outputs1.dtype)
    return nn.MarginRankingLoss(margin = CONFIG["margin"])(outputs1, outputs2, targets)

## Training

In [None]:
def train_one_epoch(model, optimizer, dataloader, scheduler, device, epoch):
    """Train `model` for one pass over `dataloader`.

    Args:
        model: Network called as ``model(ids, mask)``.
        optimizer: Optimizer stepped after every batch.
        dataloader: Yields dicts with the keys "less_toxic_ids", "less_toxic_mask",
            "more_toxic_ids", "more_toxic_mask" and "target".
        scheduler: Optional LR scheduler, stepped after every batch when truthy.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, only shown in the progress bar.

    Returns:
        Sample-weighted mean training loss over the epoch.
    """
    model.train()

    samples_seen = 0
    loss_sum = 0.0
    progress = tqdm(enumerate(dataloader), total=len(dataloader))
    for _, batch in progress:
        ids_less, mask_less, ids_more, mask_more, targets = (
            batch[key].to(device)
            for key in ("less_toxic_ids", "less_toxic_mask",
                        "more_toxic_ids", "more_toxic_mask", "target")
        )
        n_samples = ids_less.size(0)

        # Score both comments of each pair with the same network.
        scores_less = model(ids_less, mask_less)
        scores_more = model(ids_more, mask_more)

        batch_loss = criterion(scores_more, scores_less, targets)
        batch_loss.backward()

        optimizer.step()
        optimizer.zero_grad()
        if scheduler:
            scheduler.step()

        loss_sum += batch_loss.item() * n_samples
        samples_seen += n_samples

        # Running mean, so the bar shows the epoch average so far.
        epoch_loss = loss_sum / samples_seen

        progress.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                             LR=optimizer.param_groups[0]['lr'])
    gc.collect()

    return epoch_loss
        
    

In [None]:
@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
    """Evaluate `model` on `dataloader` without computing gradients.

    Args:
        model: Network called as ``model(ids, mask)``.
        dataloader: Yields dicts with the keys "less_toxic_ids", "less_toxic_mask",
            "more_toxic_ids", "more_toxic_mask" and "target".
        device: Device the batch tensors are moved to.
        epoch: Epoch number, only shown in the progress bar.

    Returns:
        Sample-weighted mean validation loss over the epoch.
    """
    model.eval()
    dataset_size = 0
    running_loss = 0.0
    bar = tqdm(enumerate(dataloader), total = len(dataloader))
    for step, data in bar:
        less_toxic_ids = data["less_toxic_ids"].to(device)
        less_toxic_mask = data["less_toxic_mask"].to(device)
        more_toxic_ids = data["more_toxic_ids"].to(device)
        more_toxic_mask = data["more_toxic_mask"].to(device)
        targets = data["target"].to(device)

        batch_size = less_toxic_ids.size(0)

        less_toxic_outputs = model(less_toxic_ids, less_toxic_mask)
        more_toxic_outputs = model(more_toxic_ids, more_toxic_mask)

        loss = criterion(more_toxic_outputs, less_toxic_outputs, targets)

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        # The bar used to be labelled "Train_Loss" and showed the learning rate
        # of a global `optimizer` that is not a parameter of this function;
        # validation needs neither.
        bar.set_postfix(Epoch=epoch, Valid_Loss=epoch_loss)
    gc.collect()

    return epoch_loss
        
            


In [None]:
def run_training(model, optimizer, scheduler, num_epochs, device, folds, train_loader, val_loader):
    """Train for `num_epochs`, checkpointing whenever the validation loss improves.

    Args:
        model: Network to train.
        optimizer: Optimizer used by `train_one_epoch`.
        scheduler: Optional LR scheduler (may be None).
        num_epochs: Number of epochs to run.
        device: Device the batches are moved to.
        folds: Index of the current fold; used in the checkpoint file name.
        train_loader: DataLoader for the training split.
        val_loader: DataLoader for the validation split.

    Returns:
        Tuple ``(model, history)``: the model with the best weights loaded, and
        a dict holding the "train_loss" and "val_loss" lists (one entry per epoch).
    """
    wandb.watch(model, log_freq=10)

    if torch.cuda.is_available():
        print("Using GPU :) ==> {}".format(torch.cuda.get_device_name()))

    start = time.time()
    # Fix: this initial copy used to be bound to `best_model_weights`, while the
    # rest of the function used `best_model_wts`; if no epoch ever improved
    # (e.g. a NaN loss) the final load raised NameError.
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_loss = np.inf
    history = defaultdict(list)

    for epoch in range(1, num_epochs + 1):
        gc.collect()

        # Use the `device` argument instead of reaching into the global CONFIG.
        train_epoch_loss = train_one_epoch(model, optimizer, dataloader=train_loader,
                                           scheduler=scheduler, device=device, epoch=epoch)

        val_epoch_loss = valid_one_epoch(model, dataloader=val_loader, device=device, epoch=epoch)

        history["train_loss"].append(train_epoch_loss)
        history["val_loss"].append(val_epoch_loss)

        wandb.log({"Train Loss": train_epoch_loss})
        wandb.log({"Valid Loss": val_epoch_loss})

        if val_epoch_loss <= best_epoch_loss:
            print(f"{b_}Validation Loss Improved ({best_epoch_loss} ---> {val_epoch_loss})")
            best_epoch_loss = val_epoch_loss
            # `wandb.run` is the active run; this no longer needs a global `run`.
            if wandb.run is not None:
                wandb.run.summary["Best Loss"] = best_epoch_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            # Name the checkpoint after the `folds` argument, not the global `fold`.
            PATH = f"Loss-Fold-{folds}.bin"
            torch.save(model.state_dict(), PATH)
            # Save a model file from the current directory
            print(f"Model Saved{sr_}")

        print()

    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best Loss: {:.4f}".format(best_epoch_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, history
    

In [None]:
def prepare_loaders(fold):
    """Build the train / validation DataLoaders for one fold.

    Rows of the global `df` whose "val_set" equals `fold` form the validation
    split; all other rows form the training split.

    Args:
        fold: Index of the fold held out for validation.

    Returns:
        Tuple ``(train_loader, val_loader)``.
    """
    df_train = df[df["val_set"] != fold]
    df_val = df[df["val_set"] == fold]

    train_dataset = JigsawDataset(df_train, CONFIG["tokenizer"], CONFIG["max_length"])
    val_dataset = JigsawDataset(df_val, CONFIG["tokenizer"], CONFIG["max_length"])

    train_loader = DataLoader(train_dataset, batch_size=CONFIG['train_batch_size'],
                              num_workers=2, shuffle=True, pin_memory=True, drop_last=True)

    # Validation must cover every held-out sample in a fixed order. The loader
    # previously used shuffle=True and drop_last=True, which dropped a different
    # set of trailing samples each epoch and made the validation loss (and the
    # checkpoint chosen from it) noisy.
    val_loader = DataLoader(val_dataset, batch_size=CONFIG['val_batch_size'],
                            num_workers=2, shuffle=False, pin_memory=True, drop_last=False)

    return train_loader, val_loader


In [None]:
def fetch_scheduler(optimizer):
    """Create the LR scheduler selected by CONFIG['scheduler'].

    Args:
        optimizer: Optimizer the scheduler will drive.

    Returns:
        A `CosineAnnealingLR` or `CosineAnnealingWarmRestarts` instance, or
        None when CONFIG['scheduler'] is None.

    Raises:
        ValueError: If CONFIG['scheduler'] is not a recognised name (this used
            to surface as an UnboundLocalError on the return statement).
        KeyError: If the chosen scheduler needs a CONFIG entry that is missing
            (e.g. 'T_0' for warm restarts).
    """
    name = CONFIG['scheduler']
    if name is None:
        return None
    if name == 'cosineAnnealingLR':
        return lr_scheduler.CosineAnnealingLR(optimizer, T_max=CONFIG['T_max'],
                                              eta_min=CONFIG['min_lr'])
    if name == 'cosineAnnealingWarmRestarts':
        return lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=CONFIG['T_0'],
                                                        eta_min=CONFIG['min_lr'])
    raise ValueError(
        f"Unknown scheduler {name!r}; expected None, 'cosineAnnealingLR' "
        "or 'cosineAnnealingWarmRestarts'"
    )

In [None]:
# Train one model per fold; each fold is logged as its own W&B run inside a shared group.
for fold in range(0, CONFIG['n_fold']):
    print(f"{y_}====== Fold: {fold} ======{sr_}")
    run = wandb.init(project='Jigsaw',
                     config=CONFIG,
                     job_type='Train',
                     group=CONFIG['group'],
                     tags=['roberta-base', f'{HASH_NAME}', 'margin-loss'],
                     name=f'{HASH_NAME}-fold-{fold}',
                     # Use the outcome of the login cell ("must" only when no
                     # credentials were available) instead of a hard-coded 'must',
                     # which ignored a successful login.
                     anonymous=anony)

    # Create Dataloaders
    train_loader, valid_loader = prepare_loaders(fold=fold)

    # A fresh model per fold, so no weights leak between folds.
    model = JigsawModel(CONFIG['model_name'])
    model.to(CONFIG['device'])

    # Define Optimizer and Scheduler
    optimizer = AdamW(model.parameters(), lr=CONFIG['learning_rate'], weight_decay=CONFIG['weight_decay'])
    scheduler = fetch_scheduler(optimizer)

    model, history = run_training(model, optimizer, scheduler,
                                  device=CONFIG['device'],
                                  num_epochs=CONFIG['num_epochs'],
                                  folds=fold,
                                  train_loader=train_loader, val_loader=valid_loader)

    run.finish()

    # Drop the references to this fold's objects before the next fold starts.
    del model, history, train_loader, valid_loader
    _ = gc.collect()
    print()