In [1]:
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning import Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification,AutoModel
import wandb
from sklearn.model_selection import StratifiedKFold
import time
import os

In [2]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb_key")

In [3]:
wandb.login(key=secret_value_0)

[34m[1mwandb[0m: W&B API key is configured (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
class CONFIG:
    model_name = "distilroberta-base"
    seed = 101
    k_fold = 5
    train_batch_size = 32
    val_batch_size = 32
    no_class = 1
    lr = 1e-4
    weight_decay = 1e-6
    min_lr = 1e-6
    T_max = 500
    max_len = 128
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    criterion = nn.MarginRankingLoss(margin=0.5)
    group = model_name+'-'+str(int(time.time()))

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [5]:
def set_seed(seed=42):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed(CONFIG.seed)

In [6]:
df = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
df.head()

Unnamed: 0,worker,less_toxic,more_toxic
0,313,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...
1,188,"""And yes, people should recognize that but the...",Daphne Guinness \n\nTop of the mornin' my fav...
2,82,"Western Media?\n\nYup, because every crime in...","""Atom you don't believe actual photos of mastu..."
3,347,And you removed it! You numbskull! I don't car...,You seem to have sand in your vagina.\n\nMight...
4,539,smelly vagina \n\nBluerasberry why don't you ...,"hey \n\nway to support nazis, you racist"


In [7]:
#create folds
skf = StratifiedKFold(n_splits=CONFIG.k_fold, shuffle=True, random_state=CONFIG.seed)

for fold, ( _, val_) in enumerate(skf.split(X=df, y=df.worker)):
    df.loc[val_ , "kfold"] = int(fold)
    
df["kfold"] = df["kfold"].astype(int)
df.head()



Unnamed: 0,worker,less_toxic,more_toxic,kfold
0,313,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...,4
1,188,"""And yes, people should recognize that but the...",Daphne Guinness \n\nTop of the mornin' my fav...,1
2,82,"Western Media?\n\nYup, because every crime in...","""Atom you don't believe actual photos of mastu...",4
3,347,And you removed it! You numbskull! I don't car...,You seem to have sand in your vagina.\n\nMight...,0
4,539,smelly vagina \n\nBluerasberry why don't you ...,"hey \n\nway to support nazis, you racist",4


In [8]:
class JigsawDataset(Dataset):
    def __init__(self, df, tokenizer, max_seq_len):
        self.df = df
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenizer
        self.more_toxic = df['more_toxic'].values
        self.less_toxic = df['less_toxic'].values
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        more_toxic = self.more_toxic[index]
        less_toxic = self.less_toxic[index]
        inputs_more_toxic = self.tokenizer.encode_plus(
                                more_toxic,
                                truncation=True,
                                add_special_tokens=True,
                                max_length=self.max_seq_len,
                                padding='max_length'
                            )
        inputs_less_toxic = self.tokenizer.encode_plus(
                                less_toxic,
                                truncation=True,
                                add_special_tokens=True,
                                max_length=self.max_seq_len,
                                padding='max_length'
                            )
        target = 1
        
        more_toxic_ids = inputs_more_toxic['input_ids']
        more_toxic_mask = inputs_more_toxic['attention_mask']
        
        less_toxic_ids = inputs_less_toxic['input_ids']
        less_toxic_mask = inputs_less_toxic['attention_mask']
        
        
        return {
            'more_toxic_ids': torch.tensor(more_toxic_ids, dtype=torch.long),
            'more_toxic_mask': torch.tensor(more_toxic_mask, dtype=torch.long),
            'less_toxic_ids': torch.tensor(less_toxic_ids, dtype=torch.long),
            'less_toxic_mask': torch.tensor(less_toxic_mask, dtype=torch.long),
            'target': torch.tensor(target, dtype=torch.long)
        }

In [9]:
def prepare_loaders(fold):
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    train_dataset = JigsawDataset(df_train, tokenizer=CONFIG.tokenizer, max_seq_len=CONFIG.max_len)
    valid_dataset = JigsawDataset(df_valid, tokenizer=CONFIG.tokenizer, max_seq_len=CONFIG.max_len)

    train_loader = DataLoader(train_dataset, batch_size=CONFIG.train_batch_size, 
                              num_workers=2, shuffle=True, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset, batch_size=CONFIG.val_batch_size, 
                              num_workers=2, shuffle=False, pin_memory=True)
    
    return train_loader, valid_loader

In [10]:
class jigsaw_toxicbert(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = AutoModel.from_pretrained("distilroberta-base")
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(768,128)
        self.dropout_2 = nn.Dropout(0.3)
        self.output = nn.Linear(128,CONFIG.no_class)
    def forward(self,ids,mask):
        x = self.model(input_ids=ids,attention_mask=mask,output_hidden_states=False)
        x = self.dropout(x[1])
        x=  self.linear(x)
        x=  self.dropout_2(x)
        x = self.output(x)
        return x
    
    def configure_optimizers(self):
        optimiser = optim.AdamW(self.model.parameters(), lr=CONFIG.lr, weight_decay=CONFIG.weight_decay)
        scheduler = lr_scheduler.CosineAnnealingLR(optimiser,eta_min=CONFIG.min_lr,T_max = CONFIG.T_max)
        return [optimiser],[scheduler]
    
    def training_step(self, batch, batch_idx):
        more_toxic_id = batch["more_toxic_ids"]
        more_toxic_mask = batch["more_toxic_mask"]
        less_toxic_id =batch["less_toxic_ids"]
        less_toxic_mask =batch["less_toxic_mask"]
        target = batch["target"]
        more_toxic_pred = self(more_toxic_id,more_toxic_mask)
        less_toxic_pred = self(less_toxic_id,less_toxic_mask)
        loss = CONFIG.criterion(more_toxic_pred,less_toxic_pred,target)
        self.log('train_margin_loss',loss,on_step=False, on_epoch=True,prog_bar=True)
        return loss  
    def validation_step(self, batch, batch_idx):
        more_toxic_id = batch["more_toxic_ids"]
        more_toxic_mask = batch["more_toxic_mask"]
        less_toxic_id =batch["less_toxic_ids"]
        less_toxic_mask =batch["less_toxic_mask"]
        target = batch["target"]
        more_toxic_pred = self(more_toxic_id,more_toxic_mask)
        less_toxic_pred = self(less_toxic_id,less_toxic_mask)
        loss = CONFIG.criterion(more_toxic_pred,less_toxic_pred,target)
        self.log('val_margin_loss',loss,on_step=False, on_epoch=True,prog_bar=True)
        return loss 

        

In [11]:
for fold in range(0,CONFIG.k_fold):
    wandb_run = wandb.init(project='Jigsaw Rate Severity of Toxic Comments', 
                     job_type='Train',
                     group=CONFIG.group,
                     tags=[f'{CONFIG.model_name}', f'{CONFIG.group}', 'margin-loss'],
                     name=f'{CONFIG.model_name}-{CONFIG.group}-fold-{fold}',)
    train_loader,valid_loader = prepare_loaders(fold)
    model = jigsaw_toxicbert()
    checkpoint_callback = ModelCheckpoint(monitor="val_margin_loss",
    dirpath=".",
    filename="toxicbert_{val_margin_loss:.5f}"+"_fold_{}".format(str(fold)),
    save_top_k=2,
    mode="min",
    save_last=False)
    wandb_logger = WandbLogger()
    wandb_logger.watch(model, log='gradients', log_freq=100)          
    trainer = Trainer(gpus=1,max_epochs = 3,precision=16,callbacks=[checkpoint_callback],logger=wandb_logger)
    trainer.fit(model,train_loader,valid_loader)
    wandb_run.finish()

[34m[1mwandb[0m: Currently logged in as: [33mpoipii[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade

CondaEnvException: Unable to determine environment

Please re-run this command with one of the following options:

* Provide an environment name via --name or -n
* Re-run this command inside an activated conda environment.



Downloading:   0%|          | 0.00/331M [00:00<?, ?B/s]

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Validation sanity check: 0it [00:00, ?it/s]

Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▅▅██
train_margin_loss,█▄▁
trainer/global_step,▁▁▅▅██
val_margin_loss,▁▆█

0,1
epoch,2.0
train_margin_loss,0.30198
trainer/global_step,2255.0
val_margin_loss,0.37157


[34m[1mwandb[0m: wandb version 0.12.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade



CondaEnvException: Unable to determine environment

Please re-run this command with one of the following options:

* Provide an environment name via --name or -n
* Re-run this command inside an activated conda environment.



Validation sanity check: 0it [00:00, ?it/s]

Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▅▅██
train_margin_loss,█▄▁
trainer/global_step,▁▁▅▅██
val_margin_loss,█▁▃

0,1
epoch,2.0
train_margin_loss,0.30578
trainer/global_step,2255.0
val_margin_loss,0.34491


[34m[1mwandb[0m: wandb version 0.12.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade



CondaEnvException: Unable to determine environment

Please re-run this command with one of the following options:

* Provide an environment name via --name or -n
* Re-run this command inside an activated conda environment.



Validation sanity check: 0it [00:00, ?it/s]

Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▅▅██
train_margin_loss,█▄▁
trainer/global_step,▁▁▅▅██
val_margin_loss,▅█▁

0,1
epoch,2.0
train_margin_loss,0.31247
trainer/global_step,2255.0
val_margin_loss,0.33599


[34m[1mwandb[0m: wandb version 0.12.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade

CondaEnvException: Unable to determine environment

Please re-run this command with one of the following options:

* Provide an environment name via --name or -n
* Re-run this command inside an activated conda environment.



Validation sanity check: 0it [00:00, ?it/s]

Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▅▅██
train_margin_loss,█▄▁
trainer/global_step,▁▁▅▅██
val_margin_loss,▁▁█

0,1
epoch,2.0
train_margin_loss,0.30585
trainer/global_step,2255.0
val_margin_loss,0.35258


[34m[1mwandb[0m: wandb version 0.12.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade

CondaEnvException: Unable to determine environment

Please re-run this command with one of the following options:

* Provide an environment name via --name or -n
* Re-run this command inside an activated conda environment.



Validation sanity check: 0it [00:00, ?it/s]

Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▅▅██
train_margin_loss,█▄▁
trainer/global_step,▁▁▅▅██
val_margin_loss,▇▁█

0,1
epoch,2.0
train_margin_loss,0.30754
trainer/global_step,2255.0
val_margin_loss,0.35335


In [12]:
# model = jigsaw_toxicbert()

In [13]:
# checkpoint_callback = ModelCheckpoint(
#     monitor="val_margin_loss",
#     dirpath=".",
#     filename="toxicbert-{val_margin_loss:.5f}"+"fold-{}".format(1),
#     save_top_k=1,
#     mode="min",
#     save_last=False
# )

In [14]:
# trainer = Trainer(gpus=1,max_epochs = 1,precision=16,callbacks=[checkpoint_callback])
# trainer.fit(model,train_loader,valid_loader)

In [15]:
# !ls -al