In [None]:
# Install the PyTorch Lightning 1.5.4 wheel from a local file under /kaggle/input,
# then list the installed 'pytorch-lightning' entry to confirm the version in use.
!pip install -q '/kaggle/input/libraries/pytorch_lightning-1.5.4-py3-none-any.whl'
!pip freeze | grep 'pytorch-lightning'

In [None]:
import os
import gc
import pandas as pd
from argparse import Namespace

import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from pathlib import Path

from sklearn.model_selection import StratifiedKFold

import pytorch_lightning as pl
from pytorch_lightning import Trainer, LightningModule, LightningDataModule
from pytorch_lightning.callbacks import ModelCheckpoint

from transformers import AutoTokenizer, AutoModel, AutoConfig


# Set the TOKENIZERS_PARALLELISM environment variable to 'false' for this process.
# NOTE(review): presumably this avoids the tokenizers fork warning when DataLoader
# worker processes are used — confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
# Display the imported PyTorch Lightning version as the cell output.
pl.__version__

In [None]:
# Central configuration for the notebook.
#   - top level : random seed and number of cross-validation folds
#   - trainer   : keyword arguments forwarded to pytorch_lightning.Trainer
#   - model     : keyword arguments forwarded to ToxicModule (layers, scheduler, optimizer)
#   - data      : keyword arguments forwarded to ToxicDataModule (tokenizer, dataloaders)
config = Namespace(
    seed = 7,
    n_folds = 5,
    
    trainer = Namespace(
        precision = 32,
        max_epochs = 1,
        # Use a single GPU when one is visible, otherwise run on CPU.
        gpus = 1 if torch.cuda.is_available() else 0,
        enable_checkpointing=False,
#         fast_dev_run=2
    ),
    
    model = Namespace(
        # model layers
        model_path = '/kaggle/input/roberta-base',
        dropout = 0.2,
        num_classes = 1,
        hidden_size = 256,
        
        # scheduler
        T_max = 500,
        min_lr = 1e-6,
        
        # optimizer
        lr = 1e-4,
        weight_decay = 1e-6,
    ),
    
    data = Namespace(
        # tokenizer
        tokenizer_path = '/kaggle/input/roberta-base',
        max_length = 128,
        
        # dataloader
        train_batch_size = 32,
        val_batch_size = 64,
        predict_batch_size = 64,
        # os.cpu_count() may return None when the count cannot be determined;
        # fall back to 0 (load data in the main process) so DataLoader always
        # receives an integer.
        num_workers = os.cpu_count() or 0,
    ),
)

In [None]:
class ToxicDatasetFit(Dataset):
    """Dataset over a pre-tokenized DataFrame that also carries a target score.

    Each row of ``df`` is read through the 'input_ids', 'attention_mask' and
    'score' columns.
    """

    def __init__(self, df):
        self.df = df

    def __len__(self):
        # One sample per DataFrame row.
        return len(self.df.index)

    def __getitem__(self, idx):
        """Return the row at position ``idx`` as a dict of tensors."""
        row = self.df.iloc[idx]
        # Token columns become int64 tensors, in the order the keys are listed.
        sample = {
            column: torch.tensor(row[column], dtype=torch.long)
            for column in ('input_ids', 'attention_mask')
        }
        # The regression target is a float tensor.
        sample['score'] = torch.tensor(row['score'], dtype=torch.float)
        return sample
    
class ToxicDatasetPredict(Dataset):
    """Dataset over a pre-tokenized DataFrame without targets (inference only).

    Each row of ``df`` is read through the 'input_ids' and 'attention_mask'
    columns.
    """

    def __init__(self, df):
        self.df = df

    def __len__(self):
        # One sample per DataFrame row.
        return len(self.df.index)

    def __getitem__(self, idx):
        """Return the row at position ``idx`` as a dict of int64 tensors."""
        row = self.df.iloc[idx]
        return {
            column: torch.tensor(row[column], dtype=torch.long)
            for column in ('input_ids', 'attention_mask')
        }

In [None]:
class ToxicDataModule(LightningDataModule):
    """Lightning data module that serves fold-based train/validation splits and a prediction set.

    Args:
        df: Training DataFrame with a 'fold' column plus the columns read by
            ToxicDatasetFit. May be None when the module is only used for prediction.
        predict_df: DataFrame with the columns read by ToxicDatasetPredict.
        fold: Fold id used as the validation split; all other folds form the training split.
        **kwargs: Dataloader settings stored in ``self.hparams``. The dataloaders read
            ``train_batch_size``, ``val_batch_size``, ``predict_batch_size`` and ``num_workers``.
    """

    def __init__(self, df, predict_df, fold, **kwargs):
        super().__init__()
        self.df = df
        self.predict_df = predict_df
        self.fold = fold
        # Keep the (potentially large) frames and the fold id out of the saved hyperparameters.
        self.save_hyperparameters(ignore=['df', 'predict_df', 'fold'])

    def setup(self, stage=None):
        """Build the datasets needed for ``stage``.

        Args:
            stage: 'fit' or 'validate' builds the train/validation datasets,
                'predict' builds the prediction dataset. ``None`` (e.g. a manual
                ``dm.setup()`` call) builds every dataset whose source frame is available.
        """
        # An explicit stage always builds (and fails loudly on missing data);
        # stage=None only builds what can actually be built.
        build_fit = stage in ('fit', 'validate') or (stage is None and self.df is not None)
        build_predict = stage == 'predict' or (stage is None and self.predict_df is not None)

        if build_fit:
            is_valid_fold = self.df['fold'] == self.fold
            self.train_ds = ToxicDatasetFit(self.df.loc[~is_valid_fold].drop('fold', axis=1))
            self.valid_ds = ToxicDatasetFit(self.df.loc[is_valid_fold].drop('fold', axis=1))
        if build_predict:
            self.predict_ds = ToxicDatasetPredict(self.predict_df)

    def train_dataloader(self):
        """Shuffled loader over the training folds."""
        return DataLoader(self.train_ds, batch_size=self.hparams.train_batch_size, shuffle=True, num_workers=self.hparams.num_workers)

    def val_dataloader(self):
        """Ordered loader over the held-out fold."""
        return DataLoader(self.valid_ds, batch_size=self.hparams.val_batch_size, shuffle=False, num_workers=self.hparams.num_workers)

    def predict_dataloader(self):
        """Ordered loader over the prediction frame, so outputs line up with its rows."""
        return DataLoader(self.predict_ds, batch_size=self.hparams.predict_batch_size, shuffle=False, num_workers=self.hparams.num_workers)

In [None]:
class ToxicModule(LightningModule):
    """Pretrained transformer encoder with a small dense head, trained with MSE loss.

    All keyword arguments are stored in ``self.hparams``. The module reads:
        model_path: Path passed to ``AutoConfig``/``AutoModel.from_pretrained``.
        dropout: Dropout probability used before and inside the dense head.
        hidden_size: Width of the hidden layer in the dense head.
        num_classes: Output width of the dense head.
        lr, weight_decay: AdamW settings.
        T_max, min_lr: CosineAnnealingLR settings.
    """

    def __init__(self, **kwargs):
        super().__init__()
        self.save_hyperparameters()
        
        base_model_config = AutoConfig.from_pretrained(self.hparams.model_path)
        # return_dict=False makes the encoder return a tuple, which forward() unpacks.
        self.base_model = AutoModel.from_pretrained(self.hparams.model_path, return_dict=False)
        self.layer_norm = nn.LayerNorm(base_model_config.hidden_size)
        self.dropout = nn.Dropout(self.hparams.dropout)
        self.dense = nn.Sequential(
            nn.Linear(base_model_config.hidden_size, self.hparams.hidden_size),
            nn.LeakyReLU(negative_slope=0.01),
            nn.Dropout(self.hparams.dropout),
            nn.Linear(self.hparams.hidden_size, self.hparams.num_classes)
        )
        
        self.loss = nn.MSELoss()
        
    def forward(self, input_ids, attention_mask):
        """Run the encoder and the dense head; returns the head's raw output.

        The second element of the encoder's output tuple is normalized,
        passed through dropout and fed to the dense head.
        """
        # NOTE: a leftover `breakpoint()` used to sit here; it dropped every
        # forward pass into the debugger and stalled training/prediction.
        _, pooled_output = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.layer_norm(pooled_output)
        pooled_output = self.dropout(pooled_output)
        preds = self.dense(pooled_output)
        return preds
    
    def training_step(self, batch, batch_idx):
        """Compute and log the MSE between flattened predictions and batch['score']."""
        logits = self(batch['input_ids'], batch['attention_mask'])
        target = batch['score']
        # Flatten predictions so they line up element-wise with the targets.
        loss = self.loss(logits.view(-1), target)
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        """Log the MSE on a validation batch under 'val_loss'."""
        logits = self(batch['input_ids'], batch['attention_mask'])
        target = batch['score']
        loss = self.loss(logits.view(-1), target)
        self.log('val_loss', loss)
    
    def predict_step(self, batch, batch_idx):
        """Return the raw model output for a prediction batch."""
        return self(batch['input_ids'], batch['attention_mask'])
    
    def configure_optimizers(self):
        """AdamW with a cosine-annealing schedule that is advanced every optimizer step."""
        opt = optim.AdamW(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay)
        sched = optim.lr_scheduler.CosineAnnealingLR(optimizer=opt, eta_min=self.hparams.min_lr, T_max=self.hparams.T_max)
        # 'interval': 'step' -> scheduler steps per batch rather than per epoch.
        lr_sched_dict = {'scheduler': sched, 'interval': 'step'}
        return {'optimizer': opt, 'lr_scheduler': lr_sched_dict}

In [None]:
# df = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
# # df['severe_toxic'] = df['severe_toxic'] * 2
# cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# df['score'] = df[cols].sum(axis=1)
# cols += ['id']
# df = df.drop(cols, axis=1)

In [None]:
# Load the comments that must be scored for the submission.
pred_df = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/comments_to_score.csv')
# Load the tokenizer from the path configured in config.data.tokenizer_path.
tokenizer = AutoTokenizer.from_pretrained(config.data.tokenizer_path)
# Seed the random number generators through Lightning's helper, using config.seed.
pl.seed_everything(config.seed)

In [None]:
# df = pd.concat([df.loc[df['score'] > 0], df.loc[df['score'] == 0].sample(frac=1.).head(4000)], ignore_index=True)
# df['input_ids'] = df['comment_text'].apply(lambda x: tokenizer.encode_plus(x, truncation=True, add_special_tokens=True, max_length=config.data.max_length, padding='max_length'))
# df['attention_mask'] = df['input_ids'].apply(lambda x: x['attention_mask'])
# df['input_ids'] = df['input_ids'].apply(lambda x: x['input_ids'])
# df = df.drop('comment_text', axis=1)
# df.head()

In [None]:
# Tokenize all comments in a single batched call instead of one legacy
# `encode_plus` call per row. Every sequence is truncated/padded to
# config.data.max_length, so each row ends up with fixed-length lists.
encoded = tokenizer(
    pred_df['text'].tolist(),
    truncation=True,
    add_special_tokens=True,
    max_length=config.data.max_length,
    padding='max_length',
)
# Store one list of token ids / mask values per row.
pred_df['input_ids'] = list(encoded['input_ids'])
pred_df['attention_mask'] = list(encoded['attention_mask'])
# The raw text is no longer needed once it has been tokenized.
pred_df = pred_df.drop(['text'], axis=1)
pred_df.head()

In [None]:
# skf = StratifiedKFold(n_splits=config.n_folds, shuffle=True, random_state=config.seed)
# df['fold'] = -1

# for fold, (_, val_idxs) in enumerate(skf.split(X=df, y=df['score'])):
#     df.loc[val_idxs , 'fold'] = fold
    
# df['score'] = df['score'] / df['score'].max()
# df['fold'].value_counts(normalize=True)*100

In [None]:
!tar -xf '/kaggle/input/2021-severity-toxic-mse-ce-rank/mse.tar'
ckpt_files = list(Path('.').iterdir())
ckpt_files = [f for f in ckpt_files if f.suffix == '.ckpt']
ckpt_files

In [None]:
# Average the predictions of every checkpoint found in `ckpt_files`.
# This notebook is inference-only: the data module receives df=None and only its
# prediction frame is used. To retrain instead, build the module with
# ToxicModule(**vars(config.model)), pass the training frame to ToxicDataModule
# and call trainer.fit(model, datamodule=dm) before predicting.
if not ckpt_files:
    # Fail here with a clear message rather than later on `final_preds` being None.
    raise FileNotFoundError("No '.ckpt' files found in the working directory; nothing to predict with.")

# Divide by the number of checkpoints actually loaded (not config.n_folds), so the
# result is a true mean even when the archive holds a different number of models.
n_models = len(ckpt_files)
final_preds = None

for fold, ckpt_file in enumerate(ckpt_files):
    print(f"{'#'*50} FOLD: {fold+1} {'#'*50}")

    dm = ToxicDataModule(None, pred_df, fold=fold, **vars(config.data))
    model = ToxicModule.load_from_checkpoint(ckpt_file, **vars(config.model))
    trainer = Trainer(**vars(config.trainer))

    # trainer.predict returns one tensor per batch; join and flatten them to 1-D.
    preds = trainer.predict(model, datamodule=dm)
    preds = torch.cat(preds).view(-1) / n_models

    if final_preds is None:
        final_preds = preds
    else:
        final_preds += preds

    # Release the model and trainer before loading the next checkpoint.
    del trainer, dm, model
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Turn the averaged predictions into ranks (ties broken by order of appearance),
# drop the tokenizer columns and write the remaining columns as the submission file.
ranked_scores = pd.Series(final_preds.numpy(), index=pred_df.index).rank(method='first')
pred_df = pred_df.assign(score=ranked_scores).drop(columns=['input_ids', 'attention_mask'])
pred_df.to_csv("submission.csv", header=True, index=False)

print(pred_df.shape)
pred_df.head()