In [None]:
!pip install -q '/kaggle/input/libraries/pytorch_lightning-1.5.4-py3-none-any.whl'
!pip freeze | grep 'pytorch-lightning'

In [None]:
import os
import gc
import pandas as pd
from argparse import Namespace

import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
from pytorch_lightning import LightningDataModule, LightningModule, Trainer

from transformers import AutoConfig, AutoModel, AutoTokenizer, AdamW
from sklearn.model_selection import StratifiedKFold

pl.__version__

In [None]:
# Central experiment configuration. Each nested Namespace is unpacked with
# ``vars(...)`` into the component it is named after (Trainer, model, data).
config = Namespace(
    seed=7,       # shared by pl.seed_everything and the fold splitter
    n_folds=5,

    # Keyword arguments forwarded verbatim to pytorch_lightning.Trainer.
    trainer=Namespace(
        precision=32,
        max_epochs=1,
        # Use GPUs when CUDA is available, otherwise fall back to CPU.
        gpus=-1 if torch.cuda.is_available() else 0,
        enable_checkpointing=False,
        # fast_dev_run=2,  # uncomment for a quick smoke test of the pipeline
    ),

    # Hyperparameters consumed by the LightningModule.
    model=Namespace(
        loss_margin=0.5,

        # model layers
        model_path='/kaggle/input/roberta-base',
        dropout=0.2,
        num_classes=1,
        hidden_size=256,

        # scheduler
        T_max=500,
        min_lr=1e-6,

        # optimizer
        lr=1e-4,
        weight_decay=1e-6,
    ),

    # Hyperparameters consumed by the LightningDataModule.
    data=Namespace(
        # tokenizer
        tokenizer_path='/kaggle/input/roberta-base',
        max_length=128,

        # dataloader
        train_batch_size=32,
        val_batch_size=64,
        predict_batch_size=64,
        # os.cpu_count() may return None when the count cannot be determined;
        # DataLoader needs an int, so fall back to 0 (load in the main process).
        num_workers=os.cpu_count() or 0,
    ),
)

In [None]:
class ToxicDataset(Dataset):
    """Pairwise dataset yielding a tokenized (less toxic, more toxic) comment pair.

    Args:
        df: DataFrame providing the ``less_toxic`` and ``more_toxic`` text columns.
        tokenizer: object exposing ``encode_plus`` whose result contains
            ``input_ids`` and ``attention_mask`` entries.
        max_length: length every sequence is truncated / padded to.
    """

    def __init__(self, df, tokenizer, max_length):
        super().__init__()
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return self.df.shape[0]

    def _encode(self, text):
        """Tokenize ``text`` and return ``(input_ids, attention_mask)`` as long tensors."""
        encoded = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
        )
        return (
            torch.tensor(encoded['input_ids'], dtype=torch.long),
            torch.tensor(encoded['attention_mask'], dtype=torch.long),
        )

    def __getitem__(self, idx):
        """Return the tensors for the pair stored at positional index ``idx``."""
        # Read each column on its own: selecting ``df[[a, b]]`` would build a
        # fresh two-column frame over the whole dataset for every sample.
        less_ids, less_mask = self._encode(self.df['less_toxic'].iloc[idx])
        more_ids, more_mask = self._encode(self.df['more_toxic'].iloc[idx])

        return {
            'less_toxic_input_ids': less_ids,
            'less_toxic_attention_mask': less_mask,
            'more_toxic_input_ids': more_ids,
            'more_toxic_attention_mask': more_mask,
            # Constant label: the "more toxic" score should always rank higher.
            'target': torch.tensor(1, dtype=torch.long)
        }
    
    
class ToxicDatasetPredict(Dataset):
    """Inference dataset: one tokenized comment per row of the ``text`` column."""

    def __init__(self, df, tokenizer, max_length):
        super().__init__()
        self.df, self.tokenizer, self.max_length = df, tokenizer, max_length

    def __len__(self):
        # Number of rows in the backing frame.
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize the comment at positional index ``idx`` into long tensors."""
        encoding = self.tokenizer.encode_plus(
            self.df['text'].iloc[idx],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
        )

        sample = {}
        sample['toxic_input_ids'] = torch.tensor(encoding['input_ids'], dtype=torch.long)
        sample['toxic_attention_mask'] = torch.tensor(encoding['attention_mask'], dtype=torch.long)
        return sample
        

class ToxicDataModule(LightningDataModule):
    """Builds the train / validation / predict datasets and loaders for one fold.

    Args:
        df: pairwise training frame; must carry a ``fold`` column. Rows whose
            ``fold`` equals ``fold`` form the validation split, the rest train.
        predict_df: frame with the comments to score at inference time.
        fold: id of the held-out validation fold.
        **kwargs: stored as hyperparameters; the methods below read
            ``tokenizer_path``, ``max_length``, ``train_batch_size``,
            ``val_batch_size``, ``predict_batch_size`` and ``num_workers``.
    """

    def __init__(self, df, predict_df, fold, **kwargs):
        super().__init__()
        self.df = df
        self.predict_df = predict_df
        self.fold = fold
        # Keep the DataFrames and fold id out of the saved hyperparameters.
        self.save_hyperparameters(ignore=['df', 'predict_df', 'fold'])

    def setup(self, stage=None):
        """Create the datasets needed for ``stage``.

        ``stage=None`` (a manual ``dm.setup()`` call) prepares everything.
        ``'validate'`` is handled like ``'fit'`` so the validation dataset
        exists when only validation is run.
        """
        tokenizer = AutoTokenizer.from_pretrained(self.hparams.tokenizer_path)
        max_length = self.hparams.max_length

        if stage in (None, 'fit', 'validate'):
            in_val_fold = self.df['fold'] == self.fold
            self.train_ds = ToxicDataset(self.df.loc[~in_val_fold], tokenizer, max_length)
            self.valid_ds = ToxicDataset(self.df.loc[in_val_fold], tokenizer, max_length)
        if stage in (None, 'predict'):
            self.predict_ds = ToxicDatasetPredict(self.predict_df, tokenizer, max_length)

    def train_dataloader(self):
        return DataLoader(self.train_ds, batch_size=self.hparams.train_batch_size, shuffle=True, num_workers=self.hparams.num_workers)

    def val_dataloader(self):
        return DataLoader(self.valid_ds, batch_size=self.hparams.val_batch_size, shuffle=False, num_workers=self.hparams.num_workers)

    def predict_dataloader(self):
        # Order must be preserved so predictions line up with predict_df rows.
        return DataLoader(self.predict_ds, batch_size=self.hparams.predict_batch_size, shuffle=False, num_workers=self.hparams.num_workers)

In [None]:
class ToxicModule(LightningModule):
    """Transformer encoder with a small regression head, trained with a pairwise margin ranking loss.

    Args:
        **kwargs: stored as hyperparameters; this class reads ``model_path``,
            ``dropout``, ``hidden_size``, ``num_classes``, ``loss_margin``,
            ``lr``, ``weight_decay``, ``min_lr`` and ``T_max``.
    """

    def __init__(self, **kwargs):
        super().__init__()
        self.save_hyperparameters()

        base_model_config = AutoConfig.from_pretrained(self.hparams.model_path)
        # return_dict=False makes the backbone return a tuple, unpacked in forward().
        self.base_model = AutoModel.from_pretrained(self.hparams.model_path, return_dict=False)
        self.layer_norm = nn.LayerNorm(base_model_config.hidden_size)
        self.dropout = nn.Dropout(self.hparams.dropout)
        self.dense = nn.Sequential(
            nn.Linear(base_model_config.hidden_size, self.hparams.hidden_size),
            nn.LeakyReLU(negative_slope=0.01),
            nn.Dropout(self.hparams.dropout),
            nn.Linear(self.hparams.hidden_size, self.hparams.num_classes)
        )

        self.loss = nn.MarginRankingLoss(margin=self.hparams.loss_margin)

    def forward(self, input_ids, attention_mask):
        """Score a batch of tokenized comments.

        Returns the output of the dense head, presumably shaped
        ``(batch, num_classes)`` -- TODO confirm against the backbone's pooled output.
        """
        _, pooled_output = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.layer_norm(pooled_output)
        pooled_output = self.dropout(pooled_output)
        preds = self.dense(pooled_output)
        return preds

    def _pairwise_loss(self, batch):
        """Margin ranking loss asking the more toxic comment to outscore the less toxic one."""
        # MarginRankingLoss expects both inputs and the target to share one
        # shape. Drop the trailing singleton dimension of the logits so they
        # line up with the per-sample target instead of relying on broadcasting.
        less_toxic_logits = self(batch['less_toxic_input_ids'], batch['less_toxic_attention_mask']).squeeze(-1)
        more_toxic_logits = self(batch['more_toxic_input_ids'], batch['more_toxic_attention_mask']).squeeze(-1)
        # The dataset emits an integer target; match the dtype of the logits.
        target = batch['target'].to(more_toxic_logits.dtype)
        return self.loss(more_toxic_logits, less_toxic_logits, target)

    def training_step(self, batch, batch_idx):
        loss = self._pairwise_loss(batch)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self._pairwise_loss(batch)
        self.log('val_loss', loss)

    def predict_step(self, batch, batch_idx):
        """Return the raw head output for a batch of comments to score."""
        return self(batch['toxic_input_ids'], batch['toxic_attention_mask'])

    def configure_optimizers(self):
        # NOTE(review): AdamW here is the one imported from transformers, which
        # is reportedly deprecated in later releases -- confirm before upgrading.
        opt = AdamW(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay)
        sched = optim.lr_scheduler.CosineAnnealingLR(optimizer=opt, eta_min=self.hparams.min_lr, T_max=self.hparams.T_max)
        # 'interval': 'step' -> the scheduler advances per optimizer step, so T_max counts steps.
        lr_sched_dict = {'scheduler': sched, 'interval': 'step'}
        return {'optimizer': opt, 'lr_scheduler': lr_sched_dict}

In [None]:
# Fix the random seeds before any stochastic work.
pl.seed_everything(config.seed)

# Pairwise annotations used for training, and the comments that must be scored.
df = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv')
predict_df = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv')

# -1 marks "not assigned yet"; the loop below overwrites each split's rows.
df['fold'] = -1
skf = StratifiedKFold(
    n_splits=config.n_folds,
    shuffle=True,
    random_state=config.seed,
)

# Stratify on the annotator id; only the validation indices of each split are used.
for fold, (_, val_idxs) in enumerate(skf.split(X=df, y=df['worker'])):
    df.loc[df.index[val_idxs], 'fold'] = fold

# Percentage of rows per fold, as a quick sanity check.
df['fold'].value_counts(normalize=True).mul(100)

In [None]:
# Accumulates the fold-averaged score per comment; None until the first fold is done.
final_preds = None

for fold in range(config.n_folds):
    print(f"{'#'*50} FOLD: {fold+1} {'#'*50}")

    # Fresh data module, model and trainer for every fold.
    dm = ToxicDataModule(df, predict_df, fold=fold, **vars(config.data))
    model = ToxicModule(**vars(config.model))
    trainer = Trainer(**vars(config.trainer))

    trainer.fit(model, datamodule=dm)

    # predict() yields one tensor per batch; flatten them and pre-divide by
    # the fold count so the running sum ends up as the mean over folds.
    preds = trainer.predict(model, datamodule=dm)
    preds = torch.cat(preds).view(-1) / config.n_folds
    final_preds = preds if final_preds is None else final_preds + preds

    # Release this fold's objects before the next fold starts.
    del trainer
    del dm
    del model
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Attach the averaged scores, turn them into ranks (ties broken by order of
# appearance), and drop the text column before writing the submission.
predict_df = (
    predict_df
    .assign(score=final_preds.numpy())
    .assign(score=lambda frame: frame['score'].rank(method='first'))
    .drop(columns='text')
)
predict_df.to_csv("submission.csv", header=True, index=False)

print(predict_df.shape)
predict_df.head()