# MST8823

## EXP029

In [None]:
"""
pseudo_label
toxic-xlm-roberta
RMSE
Dropout=0.0
"""
class Config:
    # Experiment configuration; every attribute is read as a plain class
    # attribute (no instance is created in the visible code).

    # --- identity / bookkeeping ---
    author = "mst8823"
    wandb_entity = "mst8823"

    competition = "jigsaw-toxic-severity-rating"
    name = "Exp-029-toxic-xlm-roberta-Pseudo-Ruddit"
    debug = False
    inference_only = True
    use_pretrain_model = False
    target_cols = ["pseudo_label"]

    # --- model / tokenisation ---
    model_name = "unitary/multilingual-toxic-xlm-roberta"
    hidden_size = 768
    head = 256  # tokens kept from the start of a sequence
    tail = 0    # tokens kept from the end of a sequence (0 disables head+tail)
    max_length = head + tail

    # --- cross validation ---
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]
    seed = 2022

    # --- optimisation ---
    max_epochs = 4
    gradient_clip_val = 100
    accumulate_grad_batches = 1
    early_stopping = False
    optimizer = {
        "optimizer": "AdamW",
        "lr": 1e-5,
        "weight_decay": 2e-5,
    }
    scheduler = {
        "interval": "step",
        "scheduler": "CosineAnnealingWarmupRestarts",
        "max_lr": 1e-5,
        "min_lr": 1e-6,
        "T_mult": 1,
        "warmup_steps": 10,
        "gamma": 1,
    }
    # Step scheduler example:
    #   scheduler = dict(
    #       interval = "step",
    #       scheduler="get_cosine_schedule_with_warmup",
    #       num_warmup_steps=256,
    #       num_cycles=0.5)

    # --- data loading ---
    train_batch_size = 8
    valid_batch_size = 32
    num_workers = 4
    resume_from_checkpoint = None

    # --- Google Colab / Drive locations ---
    colab_dir = "/content/drive/Shareddrives/Jigsaw-Rate-Severity-of-Toxic-Comments"
    drive_path = f"{colab_dir}/{author}"
    api_path = f"{drive_path}/kaggle.json"

    # --- Kaggle ---
    upload_from_colab = False
    kaggle_dataset_path = "../input/exp-029-toxic-xlm-roberta-pseudo-ruddit"

In [None]:
import os
import re
import sys
import logging
import shutil
import json
import datetime
import requests
import itertools
import functools
import warnings
import joblib
import gc
import random
import string
import re
import collections

import pandas as pd
import numpy as np
import nltk

from tqdm.auto import tqdm
from sklearn.model_selection import KFold, StratifiedKFold
from scipy.special import softmax
from bs4 import BeautifulSoup

import torch
import torch.nn as nn
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import (
    CosineAnnealingWarmRestarts,
    CosineAnnealingLR,
    MultiStepLR, 
    ReduceLROnPlateau
    )
from torch.utils.data import Dataset, DataLoader

In [None]:
# =========================
# Utils
# =========================
class Logger:
    """Timestamped logger that writes to the console and to ``<path>/Experiment.log``.

    ref) https://github.com/ghmagazine/kagglebook/blob/master/ch04-model-interface/code/util.py
    """
    def __init__(self, path):
        """Attach console and file handlers to the logger named ``path``.

        ``logging.getLogger`` returns the same object for the same name, so the
        handlers are created only the first time. Creating the ``FileHandler``
        unconditionally would open ``Experiment.log`` again on every
        construction and leave that handle unused and unclosed.

        Args:
            path: Existing directory that receives ``Experiment.log``; it is
                also used as the logger name.
        """
        self.general_logger = logging.getLogger(path)
        if not self.general_logger.handlers:
            self.general_logger.addHandler(logging.StreamHandler())
            self.general_logger.addHandler(
                logging.FileHandler(os.path.join(path, 'Experiment.log')))
            self.general_logger.setLevel(logging.INFO)

    def info(self, message):
        """Log ``message`` at INFO level, prefixed with the current time."""
        self.general_logger.info('[{}] - {}'.format(self.now_string(), message))

    @staticmethod
    def now_string():
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))


def seed_everything(seed=2022):
    """Seed Python, NumPy and PyTorch RNGs and force deterministic cuDNN settings.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # The individual generators are independent, so they can be seeded in a loop.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


def setup(cfg):
    cfg.COLAB = "google.colab" in sys.modules
    if cfg.COLAB:
        print("This environment is Google Colab")
        
        # mount
        from google.colab import drive
        if not os.path.isdir("/content/drive"):
            drive.mount('/content/drive') 
        
        # import library
        ! pip install --quiet pytorch_lightning
        ! pip install --quiet transformers
        ! pip install --quiet wandb
        ! pip install --quiet sentencepiece
        ! pip install --quiet 'git+https://github.com/katsura-jp/pytorch-cosine-annealing-with-warmup'

        # use kaggle api (need kaggle token)
        f = open(cfg.api_path, 'r')
        json_data = json.load(f) 
        os.environ["KAGGLE_USERNAME"] = json_data["username"]
        os.environ["KAGGLE_KEY"] = json_data["key"]
        
        # set dirs
        cfg.DRIVE = cfg.drive_path
        cfg.EXP = (cfg.name if cfg.name is not None 
            else requests.get("http://172.28.0.2:9000/api/sessions").json()[0]["name"][:-6])
        cfg.INPUT = os.path.join(cfg.DRIVE, "Input")
        cfg.OUTPUT = os.path.join(cfg.DRIVE, "Output")
        cfg.SUBMISSION = os.path.join(cfg.DRIVE, "Submission")
        cfg.OUTPUT_EXP = os.path.join(cfg.OUTPUT, cfg.EXP) 
        cfg.EXP_MODEL = os.path.join(cfg.OUTPUT_EXP, "model")
        cfg.EXP_FIG = os.path.join(cfg.OUTPUT_EXP, "fig")
        cfg.EXP_PREDS = os.path.join(cfg.OUTPUT_EXP, "preds")

        # input data
        cfg.INPUT_JIGSAW_01 = os.path.join(cfg.INPUT, "jigsaw-toxic-comment-classification-challenge")
        cfg.INPUT_JIGSAW_02 = os.path.join(cfg.INPUT, "jigsaw-unintended-bias-in-toxicity-classification")
        cfg.INPUT_JIGSAW_03 = os.path.join(cfg.INPUT, "jigsaw-multilingual-toxic-comment-classification")
        cfg.INPUT_JIGSAW_04 = os.path.join(cfg.INPUT, "jigsaw-toxic-severity-rating")
        cfg.INPUT_RUDDIT = os.path.join(cfg.INPUT, "ruddit-jigsaw-dataset")
        cfg.jigsaw_inputs = [cfg.INPUT_JIGSAW_01, cfg.INPUT_JIGSAW_02, cfg.INPUT_JIGSAW_03, cfg.INPUT_JIGSAW_04, 
                             cfg.INPUT_RUDDIT]

        # make dirs
        for d in [cfg.INPUT, cfg.SUBMISSION, cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS] + cfg.jigsaw_inputs:
            os.makedirs(d, exist_ok=True)

        if not os.path.isfile(os.path.join(cfg.INPUT_JIGSAW_04, "comments_to_score.csv")):
            print("load dataset")
            ! pip install --upgrade --force-reinstall --no-deps kaggle
            ! kaggle competitions download -c jigsaw-toxic-comment-classification-challenge -p $cfg.INPUT_JIGSAW_01 
            ! kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification -p $cfg.INPUT_JIGSAW_02 
            ! kaggle competitions download -c jigsaw-multilingual-toxic-comment-classification -p $cfg.INPUT_JIGSAW_03 
            ! kaggle competitions download -c jigsaw-toxic-severity-rating -p $cfg.INPUT_JIGSAW_04 
            ! kaggle datasets download -d rajkumarl/ruddit-jigsaw-dataset -p $cfg.INPUT_RUDDIT

            for input_path in cfg.jigsaw_inputs:
                filepath = f'{input_path}/{input_path.split("/")[-1]}'
                ! unzip -d $input_path $filepath

    else:
        print("This environment is Kaggle Kernel")
        if not cfg.inference_only:
            ! pip install --quiet pytorch_lightning==1.5.8 

        # set dirs
        cfg.INPUT = f"../input"

        # input data
        cfg.INPUT_JIGSAW_01 = os.path.join(cfg.INPUT, "jigsaw-toxic-comment-classification-challenge")
        cfg.INPUT_JIGSAW_02 = os.path.join(cfg.INPUT, "jigsaw-unintended-bias-in-toxicity-classification")
        cfg.INPUT_JIGSAW_03 = os.path.join(cfg.INPUT, "jigsaw-multilingual-toxic-comment-classification")
        cfg.INPUT_JIGSAW_04 = os.path.join(cfg.INPUT, "jigsaw-toxic-severity-rating")
        cfg.INPUT_RUDDIT = os.path.join(cfg.INPUT, "ruddit-jigsaw-dataset")
        cfg.jigsaw_inputs = [cfg.INPUT_JIGSAW_01, cfg.INPUT_JIGSAW_02, cfg.INPUT_JIGSAW_03, cfg.INPUT_JIGSAW_04, 
                             cfg.INPUT_RUDDIT]

        cfg.EXP = cfg.OUTPUT_EXP = "./"
        if cfg.kaggle_dataset_path is not None:
            cfg.EXP_MODEL = os.path.join(cfg.kaggle_dataset_path, "model")
        else:
            cfg.EXP_MODEL = os.path.join(cfg.EXP, "model")

        cfg.SUBMISSION = "./"
        cfg.EXP_FIG = os.path.join(cfg.EXP, "fig")
        cfg.EXP_PREDS = os.path.join(cfg.EXP, "preds")

        # make dirs
        make_dirs = [cfg.EXP_FIG, cfg.EXP_PREDS]
        if not cfg.inference_only:
            make_dirs.append(cfg.EXP_MODEL)
        for d in make_dirs:
            os.makedirs(d, exist_ok=True)

    # set device    
    cfg.DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    warnings.filterwarnings("ignore")
    seed_everything(cfg.seed)

    cfg.logger = Logger(cfg.OUTPUT_EXP)

    return cfg


# =========================
# SetUp
# =========================
# Runs the environment setup defined above; `Config` is rebound to the same
# object, now carrying the derived paths, DEVICE and logger.
Config = setup(Config)

# 2nd import
# These imports come after setup() because, on Colab, setup() is what
# pip-installs pytorch_lightning, transformers and wandb.
# NOTE(review): on Kaggle, pytorch_lightning is only installed when
# inference_only is False, yet it is imported unconditionally here —
# presumably the Kaggle image already ships it; confirm.
import pytorch_lightning as pl
import wandb

from transformers import (AutoConfig, AutoModel, AutoTokenizer)
from transformers import (get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup)

# The warmup-restart scheduler is only imported for training runs.
# NOTE(review): setup() installs this package on Colab only — confirm it is
# available when training on Kaggle.
if not Config.inference_only:
    from cosine_annealing_warmup import CosineAnnealingWarmupRestarts

# wandb setting
# Colab: interactive login. Kaggle: log in with the stored secret, and only
# when the run actually trains.
if Config.COLAB:
    wandb.login()
elif not Config.inference_only:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("WANDB_API")
    wandb.login(key=api_key)

In [None]:
# =============================
# Dataset
# =============================
class JigsawTrainDataset(Dataset):
    """Dataset yielding ``(tokenized inputs, float targets)`` pairs.

    Args:
        cfg: Configuration providing ``target_cols`` (and the tokenisation
            settings consumed by ``prepare_input``).
        df: DataFrame holding the text column and the target columns.
        tokenizer: Tokenizer forwarded to ``prepare_input``.
        text_col: Name of the text column in ``df``.
    """

    def __init__(self, cfg, df, tokenizer, text_col):
        self.cfg, self.tokenizer = cfg, tokenizer
        self.comment_text = df[text_col].values
        self.targets = df[cfg.target_cols].values

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, idx):
        # str() guards against non-string cells (e.g. NaN) in the text column.
        encoded = prepare_input(self.cfg, str(self.comment_text[idx]), self.tokenizer)
        label = torch.tensor(self.targets[idx]).float()
        return encoded, label


class JigsawTestDataset(Dataset):
    """Inference dataset yielding tokenized inputs only (no targets).

    Missing texts are replaced by the string ``"none"``.

    Args:
        cfg: Configuration forwarded to ``prepare_input``.
        df: DataFrame holding the text column.
        tokenizer: Tokenizer forwarded to ``prepare_input``.
        text_col: Name of the text column in ``df``.
    """

    def __init__(self, cfg, df, tokenizer, text_col):
        self.cfg, self.tokenizer = cfg, tokenizer
        self.comment_text = df[text_col].fillna("none").values

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, idx):
        return prepare_input(self.cfg, str(self.comment_text[idx]), self.tokenizer)


def prepare_input(cfg, text, tokenizer):
    """Tokenize ``text`` into fixed-length ``torch.long`` tensors.

    Two modes, selected by ``cfg.tail``:

    * ``cfg.tail == 0``: the tokenizer truncates and pads to
      ``cfg.max_length`` itself.
    * otherwise: sequences longer than ``cfg.max_length`` keep their first
      ``cfg.head`` and last ``cfg.tail`` tokens; shorter ones are right-padded
      (``tokenizer.pad_token_id`` for ``input_ids``, ``0`` for other fields).

    Args:
        cfg: Configuration providing ``head``, ``tail`` and ``max_length``.
        text: Text to encode.
        tokenizer: Object exposing ``encode_plus`` and ``pad_token_id``.

    Returns:
        The mapping returned by ``tokenizer.encode_plus`` with every value
        replaced by a ``torch.long`` tensor of length ``cfg.max_length``.
    """
    if cfg.tail == 0:
        inputs = tokenizer.encode_plus(
            text,
            return_tensors=None,
            add_special_tokens=True,
            max_length=cfg.max_length,
            # `padding="max_length"` is the supported spelling of the
            # deprecated `pad_to_max_length=True`.
            padding="max_length",
            truncation=True)

        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

    # NOTE(review): with truncation=True and no max_length, Hugging Face
    # tokenizers truncate to the model's maximum input length, so the "tail"
    # is taken from the already-truncated sequence — confirm this is intended.
    inputs = tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        truncation=True)

    for k, v in inputs.items():
        if len(v) > cfg.max_length:
            # keep the beginning and the end of an over-long sequence
            v = np.hstack([v[:cfg.head], v[-cfg.tail:]])

        fill_value = tokenizer.pad_token_id if k == 'input_ids' else 0
        padded = np.full(cfg.max_length, fill_value, dtype=np.int64)
        padded[:len(v)] = v
        inputs[k] = torch.tensor(padded, dtype=torch.long)

    return inputs


class JigsawDataModule(pl.LightningDataModule):
    """Lightning data module wrapping the train/validation DataFrames.

    Args:
        cfg: Configuration providing batch sizes and ``num_workers``.
        tokenizer: Tokenizer forwarded to the datasets.
        train_df: Training rows.
        valid_df: Validation rows (must also contain the target columns).
        text_col: Name of the text column in both frames.
    """

    def __init__(self, cfg, tokenizer, train_df, valid_df, text_col):
        # `super(JigsawDataModule).__init__()` (without `self`) builds an
        # unbound super object and never runs the LightningDataModule
        # initialiser; the zero-argument form calls it properly.
        super().__init__()

        self.cfg = cfg
        self.text_col = text_col
        self.tokenizer = tokenizer
        self.train_df = train_df
        self.valid_df = valid_df

        # populated by setup()
        self.train_dataset = None
        self.val_dataset = None

    def setup(self, stage=None):
        """Build the datasets; validation also uses the dataset with targets."""
        self.train_dataset = JigsawTrainDataset(
            cfg=self.cfg, df=self.train_df, tokenizer=self.tokenizer, text_col=self.text_col)
        self.val_dataset = JigsawTrainDataset(
            cfg=self.cfg, df=self.valid_df, tokenizer=self.tokenizer, text_col=self.text_col)

    def train_dataloader(self):
        """Return the shuffled training loader (incomplete last batch dropped)."""
        train_dataloader = DataLoader(
            self.train_dataset,
            batch_size=self.cfg.train_batch_size,
            shuffle=True,
            num_workers=self.cfg.num_workers,
            pin_memory=True,
            drop_last=True)

        return train_dataloader

    def val_dataloader(self):
        """Return the ordered validation loader (all samples kept)."""
        val_dataloader = DataLoader(
            self.val_dataset,
            batch_size=self.cfg.valid_batch_size,
            shuffle=False,
            num_workers=self.cfg.num_workers,
            pin_memory=True,
            drop_last=False)

        return val_dataloader

In [None]:
# =============================
# Model
# =============================
def get_optimizer(cfg, parameters):
    """Build the optimizer described by ``cfg.optimizer``.

    Args:
        cfg: Configuration whose ``optimizer`` dict has the keys
            ``"optimizer"`` (``"AdamW"`` or ``"Adam"``), ``"lr"`` and
            ``"weight_decay"``.
        parameters: Parameters to optimise.

    Returns:
        The constructed optimizer.

    Raises:
        NotImplementedError: If the optimizer name is not supported.
    """
    settings = cfg.optimizer
    supported = {"AdamW": AdamW, "Adam": Adam}

    optimizer_cls = supported.get(settings["optimizer"])
    if optimizer_cls is None:
        raise NotImplementedError

    return optimizer_cls(
        parameters,
        lr=settings["lr"],
        weight_decay=settings["weight_decay"],
    )


def get_scheduler(cfg, optimizer, num_train_steps):
    """Build the learning-rate scheduler described by ``cfg.scheduler``.

    Args:
        cfg: Configuration whose ``scheduler`` dict names the scheduler under
            ``"scheduler"`` and carries its hyper-parameters.
        optimizer: Optimizer to schedule.
        num_train_steps: Total number of training steps.

    Returns:
        The constructed scheduler.

    Raises:
        NotImplementedError: If the scheduler name is not supported.
    """
    params = cfg.scheduler
    kind = params["scheduler"]

    if kind == "get_linear_schedule_with_warmup":
        return get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=params["num_warmup_steps"],
            num_training_steps=num_train_steps)

    if kind == "get_cosine_schedule_with_warmup":
        return get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=params["num_warmup_steps"],
            num_training_steps=num_train_steps,
            num_cycles=params["num_cycles"])

    if kind == "MultiStepLR":
        return MultiStepLR(
            optimizer,
            milestones=params["milestones"],
            gamma=params["gamma"])

    if kind == "CosineAnnealingWarmupRestarts":
        # NOTE(review): steps-per-epoch multiplied by the batch size — this
        # looks like it yields roughly the sample count rather than a step
        # count; confirm the intended cycle length.
        steps_per_epoch = num_train_steps // cfg.max_epochs
        first_cycle_steps = steps_per_epoch * cfg.train_batch_size
        print(first_cycle_steps)
        return CosineAnnealingWarmupRestarts(
            optimizer,
            first_cycle_steps=int(first_cycle_steps),
            cycle_mult=params['T_mult'],
            max_lr=params["max_lr"],
            min_lr=params['min_lr'],
            warmup_steps=params['warmup_steps'],
            gamma=params['gamma'])

    raise NotImplementedError


class JigsawModel(pl.LightningModule):
    """Transformer backbone with a linear regression head on the first token.

    Args:
        cfg: Configuration providing ``hidden_size``, ``target_cols`` and the
            optimizer/scheduler settings.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.total_steps = None   # computed in setup()
        self.dataset_size = None  # may be set by the caller before fitting

        self.backborn = get_backborn(cfg)
        self.out = nn.Linear(cfg.hidden_size, len(cfg.target_cols))

    def forward(self, inputs):
        """Return the head output for the first-position hidden state."""
        hidden_states = self.backborn(**inputs)[0]
        first_token = hidden_states[:, 0, :]
        return self.out(first_token)

    def _batch_loss(self, batch):
        """Run the forward pass on an ``(inputs, targets)`` batch and return the loss."""
        inputs, targets = batch
        return self.loss(self.forward(inputs), targets)

    def training_step(self, batch, batch_idx):
        loss = self._batch_loss(batch)
        self.log("train_loss", loss, on_step=True, logger=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self._batch_loss(batch)
        self.log("val_loss", loss, on_step=True, logger=True, prog_bar=True)
        return loss

    def loss(self, outputs, targets):
        """Mean squared error (the square root is not applied)."""
        return nn.MSELoss()(outputs, targets)

    def setup(self, stage=None):
        """Compute ``total_steps`` for the scheduler; only done for ``stage == "fit"``."""
        if stage != "fit":
            return

        # calculate total steps
        if self.dataset_size is None:
            train_loader = self.trainer._data_connector._train_dataloader_source.dataloader()
            self.dataset_size = len(train_loader)

        # Clamp to at least one device; per the original note, the gpus=-1
        # setting would otherwise be reflected in this value.
        num_devices = max(1, self.trainer.num_gpus, self.trainer.num_processes)
        effective_batch_size = (
            self.cfg.train_batch_size * self.trainer.accumulate_grad_batches * num_devices)
        print(self.dataset_size, effective_batch_size)

        steps_per_epoch = self.dataset_size // effective_batch_size
        self.total_steps = steps_per_epoch * self.cfg.max_epochs

    def configure_optimizers(self):
        """Return the optimizer, plus the scheduler config when one is configured."""
        optimizer = get_optimizer(self.cfg, parameters=self.parameters())
        if self.cfg.scheduler is None:
            return [optimizer]

        scheduler = get_scheduler(self.cfg, optimizer, num_train_steps=self.total_steps)
        scheduler_config = {"scheduler": scheduler, "interval": self.cfg.scheduler["interval"]}
        return [optimizer], [scheduler_config]

In [None]:
# =============================
# Metrics
# ============================= 
def get_validation_data_hat(cfg, tokenizer, filename, validation_data):
    """Attach model scores to both sides of every validation pair.

    Every distinct text of the ``less_toxic`` / ``more_toxic`` columns is
    scored once and the scores are merged back onto a copy of
    ``validation_data``.

    Args:
        cfg: Configuration forwarded to the prediction helpers.
        tokenizer: Tokenizer forwarded to the prediction helpers.
        filename: Checkpoint name for single-model prediction, or ``None`` to
            average the configured folds via ``predict_cv``.
        validation_data: DataFrame with ``less_toxic`` and ``more_toxic`` columns.

    Returns:
        Copy of ``validation_data`` with ``less_toxic_preds`` and
        ``more_toxic_preds`` columns added.
    """
    scored_pairs = validation_data.copy()
    distinct_texts = (set(scored_pairs["less_toxic"].unique())
                      | set(scored_pairs["more_toxic"].unique()))
    df = pd.DataFrame({"text": sorted(distinct_texts)})

    if filename is not None:
        preds = predict(cfg, df, tokenizer, filename, text_col="text")
    else:
        preds = predict_cv(cfg, df, tokenizer, text_col="text")

    # Several target columns are collapsed into one score by averaging.
    df["preds"] = np.mean(preds, axis=1) if np.ndim(preds) > 1 else preds.reshape(-1)

    for side in ("less_toxic", "more_toxic"):
        merged = pd.merge(scored_pairs, df, left_on=side, right_on="text", how="left")
        scored_pairs = merged.rename(columns={"preds": f"{side}_preds"}).drop("text", axis=1)

    return scored_pairs


def get_score(validation_data_hat):
    """Return the fraction of pairs where the ``more_toxic`` text scores strictly higher."""
    ranked_correctly = (
        validation_data_hat["more_toxic_preds"] > validation_data_hat["less_toxic_preds"])
    return np.mean(ranked_correctly)

In [None]:
# =============================
# Train & Predict
# =============================
def class2dict(f):
    """Return ``{name: value}`` for every attribute of ``f`` not starting with ``__``."""
    return {
        attr: getattr(f, attr)
        for attr in dir(f)
        if not attr.startswith('__')
    }


def train_fold(cfg, train_df, valid_df, tokenizer, filename, text_col):
    """Train one fold with PyTorch Lightning and log the run to Weights & Biases.

    The best checkpoint (lowest ``val_loss``) is written to ``cfg.EXP_MODEL``
    under ``filename``.

    Args:
        cfg: Experiment configuration.
        train_df: Training rows of this fold.
        valid_df: Validation rows of this fold.
        tokenizer: Tokenizer forwarded to the data module.
        filename: Checkpoint file name (without extension); its last two
            ``-``-separated parts form the W&B run name.
        text_col: Name of the text column.
    """
    run_name = "_".join(filename.split("-")[-2:])
    wandb_logger = pl.loggers.WandbLogger(
        project=cfg.competition,
        config=class2dict(cfg),
        group=f"{cfg.author}_{cfg.name}",
        name=run_name,
        job_type="train",
        reinit=True,
        anonymous=None,
        entity=cfg.wandb_entity,
    )

    datamodule = JigsawDataModule(
        cfg=cfg,
        tokenizer=tokenizer,
        train_df=train_df,
        valid_df=valid_df,
        text_col=text_col,
    )

    model = JigsawModel(cfg=cfg)
    # Set explicitly so that JigsawModel.setup does not have to derive it from
    # the trainer (the original note suggests that path was unreliable).
    model.dataset_size = len(train_df)

    callbacks = [
        pl.callbacks.ModelCheckpoint(
            dirpath=cfg.EXP_MODEL,
            filename=filename,
            save_top_k=1,
            verbose=True,
            monitor="val_loss",
            mode="min",
        ),
        pl.callbacks.LearningRateMonitor(logging_interval="step"),
    ]
    if cfg.early_stopping:
        callbacks.append(
            pl.callbacks.EarlyStopping(
                monitor="val_loss",
                min_delta=0.0,
                patience=8,
                mode='min',
            )
        )

    trainer = pl.Trainer(
        max_epochs=cfg.max_epochs,
        callbacks=callbacks,
        logger=[wandb_logger],
        gradient_clip_val=cfg.gradient_clip_val,
        accumulate_grad_batches=cfg.accumulate_grad_batches,
        resume_from_checkpoint=cfg.resume_from_checkpoint,
        deterministic=False,
        gpus=-1,
        precision=16,
    )
    trainer.fit(model, datamodule=datamodule)

    # Close the W&B run and release cached GPU memory before the next fold.
    wandb.finish(quiet=True)
    torch.cuda.empty_cache()


def get_filname_listdir(dirctory):
    """Return the entries of ``dirctory`` with their final extension stripped."""
    return [os.path.splitext(entry)[0] for entry in os.listdir(dirctory)]


def train_cv(cfg, df, tokenizer, text_col=None, validation_data=None, get_oof=True):
    """Run cross validation and collect out-of-fold predictions.

    For every fold listed in ``cfg.trn_fold`` the model is trained unless a
    checkpoint with the fold's name already exists in ``cfg.EXP_MODEL``.
    Optionally the pairwise validation score is logged and the fold's
    validation rows are predicted.

    The result is returned after *all* folds have been processed. Previously
    the ``return`` sat inside the loop, so only the first processed fold was
    ever trained and predicted.

    Args:
        cfg: Experiment configuration.
        df: Training data with a ``fold`` column.
        tokenizer: Tokenizer forwarded to training/prediction.
        text_col: Name of the text column.
        validation_data: Optional pairwise validation frame; when given, the
            score of each fold is logged through ``cfg.logger``.
        get_oof: Whether to compute out-of-fold predictions.

    Returns:
        DataFrame of out-of-fold predictions (one column per target, zeros for
        rows of folds that were not processed) when ``get_oof`` is true,
        otherwise ``None``.
    """
    oof_df = pd.DataFrame(np.zeros((len(df), len(cfg.target_cols))), columns=cfg.target_cols)

    for i_fold in range(cfg.n_fold):
        if i_fold not in cfg.trn_fold:
            continue

        filename = f"{cfg.name}-seed{cfg.seed}-fold{i_fold}"

        # A positional boolean array keeps the OOF assignment independent of
        # whatever index ``df`` carries.
        val_mask = (df["fold"] == i_fold).to_numpy(dtype=bool)
        train_df = df[~val_mask].reset_index(drop=True)
        valid_df = df[val_mask].reset_index(drop=True)

        # Skip training when this fold's checkpoint is already on disk.
        if filename not in get_filname_listdir(cfg.EXP_MODEL):
            print(f"# --------- # Start Training Fold={i_fold} # --------- #")
            # training
            train_fold(
                cfg=cfg,
                train_df=train_df,
                valid_df=valid_df,
                tokenizer=tokenizer,
                filename=filename,
                text_col=text_col
                )

        # get validation data score
        if validation_data is not None:
            validation_data_hat = get_validation_data_hat(cfg, tokenizer, filename, validation_data)
            val_score = get_score(validation_data_hat)
            log = f"{cfg.name}-seed{cfg.seed}-fold{i_fold}: validation data score={val_score:.4f}"
            cfg.logger.info(log)

        # get validation prediction
        if get_oof:
            preds = predict(
                cfg=cfg,
                df=valid_df,
                tokenizer=tokenizer,
                filename=filename,
                text_col=text_col)

            oof_df.loc[val_mask] = preds

    return oof_df if get_oof else None


def predict(cfg, df, tokenizer, filename, text_col):
    """Predict every row of ``df`` with a single checkpoint.

    Args:
        cfg: Configuration providing ``DEVICE``, ``EXP_MODEL``,
            ``valid_batch_size``, ``num_workers`` and ``target_cols``.
        df: DataFrame holding the texts to score.
        tokenizer: Tokenizer forwarded to the dataset.
        filename: Checkpoint name (without ``.ckpt``) inside ``cfg.EXP_MODEL``.
        text_col: Name of the text column in ``df``.

    Returns:
        ``numpy`` array of shape ``(len(df), len(cfg.target_cols))``.
    """
    test_dataset = JigsawTestDataset(
        cfg=cfg, tokenizer=tokenizer, df=df, text_col=text_col)

    test_dataloader = DataLoader(
        test_dataset,
        batch_size=cfg.valid_batch_size,
        shuffle=False,
        num_workers=cfg.num_workers,
        pin_memory=True,
        drop_last=False
        )

    lightning_model = JigsawModel(cfg=cfg).to(cfg.DEVICE).eval()
    checkpoint_path = os.path.join(cfg.EXP_MODEL, filename + ".ckpt")
    # map_location lets a checkpoint saved on GPU load on the CPU fallback
    # device; without it torch.load tries to restore onto the original device.
    checkpoint = torch.load(checkpoint_path, map_location=cfg.DEVICE)
    lightning_model.load_state_dict(checkpoint['state_dict'])

    num_targets = len(cfg.target_cols)
    preds = np.zeros((len(df), num_targets))  # N * num targets
    fill_start_idx = 0

    for inputs in tqdm(test_dataloader, total=len(test_dataloader)):
        # move the batch to the model's device
        for k, v in inputs.items():
            inputs[k] = v.to(cfg.DEVICE)

        with torch.no_grad():
            pred = lightning_model(inputs)
            pred = pred.cpu().numpy()  # bs * num targets

        # write the batch into its slice of the output buffer
        fill_end_idx = pred.shape[0] + fill_start_idx  # bs + idx
        preds[fill_start_idx:fill_end_idx] = pred
        fill_start_idx = fill_end_idx

    # free the model and loader before the next checkpoint is loaded
    del test_dataset, test_dataloader, lightning_model
    gc.collect()

    return preds


def predict_cv(cfg, df, tokenizer, text_col):
    """Average the predictions of every fold listed in ``cfg.trn_fold``.

    Args:
        cfg: Configuration providing ``name``, ``seed``, ``n_fold`` and ``trn_fold``.
        df: DataFrame holding the texts to score.
        tokenizer: Tokenizer forwarded to ``predict``.
        text_col: Name of the text column in ``df``.

    Returns:
        Element-wise mean of the per-fold prediction arrays.
    """
    fold_preds = [
        predict(cfg, df, tokenizer, f"{cfg.name}-seed{cfg.seed}-fold{i_fold}", text_col)
        for i_fold in range(cfg.n_fold)
        if i_fold in cfg.trn_fold
    ]
    return np.mean(fold_preds, axis=0)  # fold mean

In [None]:
# =============================
# Load Model
# =============================
def get_tokenizer(cfg):
    """Load the tokenizer, caching it under ``<cfg.EXP_MODEL>/Pretrain``.

    If ``tokenizer_config.json`` is already present in the cache directory the
    tokenizer is loaded from there; otherwise it is loaded from
    ``cfg.model_name`` and saved into the cache directory.
    """
    pretrained_dir = os.path.join(cfg.EXP_MODEL, "Pretrain")
    marker_file = os.path.join(pretrained_dir, "tokenizer_config.json")  # tokenizer.json??

    if os.path.isfile(marker_file):
        return AutoTokenizer.from_pretrained(pretrained_dir)

    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
    tokenizer.save_pretrained(pretrained_dir)
    return tokenizer


def get_backborn(cfg):
    """Build the transformer backbone with dropout disabled.

    * No ``pytorch_model.bin`` under ``<cfg.EXP_MODEL>/Pretrain``: load config
      and weights from ``cfg.model_name`` and save the model there.
    * Otherwise: load the config from that directory, then either load the
      saved weights (``cfg.use_pretrain_model``) or only instantiate the
      architecture from the config.
    """
    pretrained_dir = os.path.join(cfg.EXP_MODEL, "Pretrain")
    has_local_copy = os.path.isfile(os.path.join(pretrained_dir, "pytorch_model.bin"))

    config_source = pretrained_dir if has_local_copy else cfg.model_name
    model_config = AutoConfig.from_pretrained(config_source)

    # No dropout
    model_config.attention_probs_dropout_prob = 0.0
    model_config.hidden_dropout_prob = 0.0

    if not has_local_copy:
        backborn = AutoModel.from_pretrained(cfg.model_name, config=model_config)
        backborn.save_pretrained(pretrained_dir)
        return backborn

    if cfg.use_pretrain_model:
        return AutoModel.from_pretrained(pretrained_dir, config=model_config)

    # Pretrained weights are not needed at inference time
    # (cfg.use_pretrain_model=False): only the architecture is built here.
    return AutoModel.from_config(model_config)

In [None]:
# =============================
# Create Data
# =============================
def read_csv(filepath, **kwargs):
    """Read a CSV file, falling back to ``<filepath>.zip`` when it is missing.

    If ``filepath`` is a directory, the file inside it that carries the
    directory's own name is read instead.

    Only a missing file triggers the ``.zip`` fallback. Other failures (bad
    ``usecols``, parse errors, ...) propagate unchanged instead of being
    replaced by a misleading "file not found" for the ``.zip`` path.

    Args:
        filepath: Path to a CSV file, or a directory as described above.
        **kwargs: Forwarded to ``pandas.read_csv``.

    Returns:
        The loaded DataFrame.

    Raises:
        FileNotFoundError: If neither ``filepath`` nor ``filepath + ".zip"`` exists.
    """
    if os.path.isdir(filepath):
        filename = filepath.split("/")[-1]
        filepath = os.path.join(filepath, filename)

    try:
        return pd.read_csv(filepath, **kwargs)
    except FileNotFoundError:
        return pd.read_csv(filepath + ".zip", **kwargs)


def text_cleaning(text):
    '''
    ref) # https://www.kaggle.com/manabendrarout/pytorch-roberta-ranking-baseline-jrstc-train

    Cleans text into a basic form for NLP. Operations, in order:
    1. Removes embedded URL links
    2. Removes HTML tags
    3. Removes emojis
    4. Collapses runs of spaces
    5. Removes IP addresses
    6. Replaces "!" with a space and removes newlines, quotes, "|" and "="
    7. Expands a few obfuscated or abbreviated words
    8. Strips leading/trailing whitespace

    (Removal of all special characters is disabled.)

    text - Text piece to be cleaned.
    '''
    template = re.compile(r'https?://\S+|www\.\S+') #Removes website links
    text = template.sub(r'', text)

    soup = BeautifulSoup(text, 'lxml') #Removes HTML tags
    text = soup.get_text()

    # NOTE(review): the last range (U+24C2..U+1F251) is very wide and also
    # covers e.g. CJK ideographs, so far more than emojis is removed — confirm
    # this is intended before using the function on multilingual text.
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    text = re.sub(' +', ' ', text) #Remove Extra Spaces
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    ip_pattern = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}') # Removes IP address
    text = ip_pattern.sub(r'', text)
    text = re.sub('!', ' ', text) # Replaces exclamation marks with a space

    # Literal replacements, applied in order. The " u " / " U " entries are
    # applied twice on purpose: one pass cannot replace overlapping matches
    # such as " u u ".
    replacements = (
        ('\n', ''),
        ("'", ""),
        ("|", ""),
        ("=", ""),
        ("F**K", "FUCK"),
        ("F__K", "FUCK"),
        ("f**k", "fuck"),
        ("f__k", "fuck"),
        ("f*ck", "fuck"),
        ("S$X", "SEX"),
        ("s$x", "sex"),
        (" u ", " you "),
        (" u ", " you "),
        (" U ", " you "),
        (" U ", " you "),
        ("YOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOUUUUUUUUUUUUUUUUUUUU", "YOU"),
    )
    for old, new in replacements:
        text = text.replace(old, new)

    return text.strip() # remove spaces at the beginning and at the end of string


def text_normalization(s:pd.Series):
    """Apply ``text_cleaning`` to every element of ``s`` and return the result."""
    return s.apply(text_cleaning)


def get_jigsaw_01_dataset(cfg):
    """
    jigsaw-toxic-comment-classification-challenge
    - text_col : "comment_text"
    - target_cols : ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

    Returns the train set concatenated with the labelled part of the test set
    (test rows whose "toxic" label is -1 are dropped).
    """
    base_dir = cfg.INPUT_JIGSAW_01
    train_part, test_part, test_labels = (
        read_csv(os.path.join(base_dir, csv_name))
        for csv_name in ("train.csv", "test.csv", "test_labels.csv")
    )

    # NOTE(review): the mask is built from the label file and applied to the
    # test file as well, which assumes both share the same row order/index.
    has_label = test_labels["toxic"] != -1
    labelled_test = pd.merge(test_part[has_label], test_labels[has_label], on="id", how="left")

    return pd.concat([train_part, labelled_test], axis=0).reset_index(drop=True)


def get_jigsaw_02_dataset(cfg, cat_threshold=0.5):
    """
    jigsaw-unintended-bias-in-toxicity-classification
    - text_col : "comment_text"
    - target_cols : ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

    Individual annotations are averaged per comment id. When cat_threshold is
    not None the averages are binarised (1 if >= cat_threshold else 0).
    Rows with missing values are dropped, "identity_attack" is renamed to
    "identity_hate", and "sexual_explicit" / "worker" are removed.
    """
    input_dir = cfg.INPUT_JIGSAW_02
    comments = read_csv(os.path.join(input_dir, "all_data.csv"), usecols=["id", "comment_text"])
    annotations = read_csv(os.path.join(input_dir, "toxicity_individual_annotations.csv"))

    mean_labels = annotations.groupby(["id"]).agg("mean")
    if cat_threshold is not None:
        binarised = np.where(mean_labels >= cat_threshold, 1, 0)
        mean_labels = pd.DataFrame(
            binarised,
            index=mean_labels.index,
            columns=mean_labels.columns)

    merged = pd.merge(comments, mean_labels, on="id", how="left")
    merged = merged.dropna(axis=0).reset_index(drop=True)
    renamed = merged.rename(columns={"identity_attack":"identity_hate"})

    return renamed.drop(["sexual_explicit", "worker"], axis=1)


def get_ruddit_dataset(cfg):
    """
    Ruddit Dataset
    - text_col : "comment_text"
    - target_cols : "offensiveness_score"

    Rows whose text is "[deleted]" or "[removed]" are dropped; the "txt"
    column is replaced by "comment_text" (missing values become "none").
    """
    csv_path = os.path.join(cfg.INPUT_RUDDIT, "Dataset", "ruddit_with_text.csv")
    raw = read_csv(csv_path)

    is_placeholder = raw["txt"].isin(["[deleted]", "[removed]"])
    kept = raw[~is_placeholder].reset_index(drop=True)

    # Text normalisation is intentionally not applied here.
    kept["comment_text"] = kept["txt"].fillna("none")
    return kept.drop("txt", axis=1)


def get_fold_idx(cfg, df):
    """Add a ``fold`` column (0 .. ``cfg.n_fold`` - 1) to ``df`` in place and return it.

    Folds come from a shuffled ``KFold`` seeded with ``cfg.seed``.
    """
    df["fold"] = -1
    target_sum = df[cfg.target_cols].sum(axis=1)
    splitter = KFold(n_splits=cfg.n_fold, shuffle=True, random_state=cfg.seed)

    # NOTE(review): the split yields positions while .loc selects by label,
    # so this presumably relies on ``df`` having a default RangeIndex.
    for fold_no, (_, valid_idx) in enumerate(splitter.split(X=df, y=target_sum)):
        df.loc[valid_idx, "fold"] = fold_no

    return df


def get_custom_jigsaw_dataset(cfg, train_data, validation_data):
    """
    ref) https://www.kaggle.com/toru59er/0-866-tfidf-ridge-simple-baseline

    Balance a multi-label toxicity frame and remove validation leakage.

    1. A per-row score is computed as the plain sum of ``cfg.target_cols``.
    2. Undersampling: all rows with a positive score are kept, and an equally
       sized random sample (seeded by ``cfg.seed``) of the zero-score rows is
       added.  If fewer zero-score rows exist than positive ones, all of them
       are used instead of raising.
    3. Rows whose ``comment_text`` also appears in ``validation_data``
       (``less_toxic`` or ``more_toxic``) are removed.

    ``train_data`` itself is left untouched; a new frame with a fresh
    RangeIndex and the same columns is returned.
    """
    # Keep the score in a local Series so the caller's frame is not mutated.
    toxic_score = train_data[cfg.target_cols].sum(axis=1)

    toxic_rows = train_data[toxic_score > 0]
    clean_rows = train_data[toxic_score == 0]

    # undersample: never ask for more clean rows than are available
    n_sample = min(len(toxic_rows), len(clean_rows))
    sampled_clean = clean_rows.sample(n=n_sample, random_state=cfg.seed)
    balanced = pd.concat([toxic_rows, sampled_clean]).reset_index(drop=True)

    # drop every training comment that is also part of the validation pairs
    val_comment_unq = np.unique(
        validation_data['less_toxic'].tolist() + validation_data['more_toxic'].tolist())
    duplicate_idx = np.isin(balanced['comment_text'], val_comment_unq)

    return balanced.iloc[~duplicate_idx].reset_index(drop=True)

In [None]:
# Driver cell: load the data, optionally train + validate, then run inference
# with the fold checkpoints and keep the result in `mst029`.
print("# ------------------ # Load Data # ------------------ #")

# load tokenizer
tokenizer = get_tokenizer(Config)

comments_to_score = read_csv(os.path.join(Config.INPUT_JIGSAW_04 , "comments_to_score.csv"))

# When the file has exactly 7537 rows only the first 100 are scored.
# NOTE(review): presumably a shortcut for the publicly visible test file -- confirm.
if len(comments_to_score) == 7537:
    comments_to_score = comments_to_score.iloc[:100]

# comments_to_score["text"] = text_normalization(comments_to_score["text"])
sample_submission = read_csv(os.path.join(Config.INPUT_JIGSAW_04 , "sample_submission.csv"))

# Training and validation only run when Config.inference_only is False.
if not Config.inference_only:

    # load validation data
    validation_data = read_csv(os.path.join(Config.INPUT_JIGSAW_04 , "validation_data.csv"))

    # load train data
    # (hard-coded Drive path; rows with placeholder text are removed before the folds are assigned)
    train_data = read_csv("/content/drive/Shareddrives/Jigsaw-Rate-Severity-of-Toxic-Comments/mst8823/Input/PseudoLabelDataset-Ruddit.csv")
    train_data = train_data[~train_data["txt"].isin(["[deleted]", "[removed]"])].reset_index(drop=True)
    train_data = get_fold_idx(cfg=Config, df=train_data)

    # train_data["comment_text"] = text_normalization(train_data["comment_text"])
    # validation_data["less_toxic"] = text_normalization(validation_data["less_toxic"])
    # validation_data["more_toxic"] = text_normalization(validation_data["more_toxic"])

    print("# ------------------ # Training # ------------------ #")
    # training
    # The text is read from the "txt" column; out-of-fold predictions are not requested.
    train_cv(
        cfg=Config, 
        df=train_data, 
        tokenizer=tokenizer, 
        text_col="txt",  #comment_text
        validation_data=validation_data, 
        get_oof=False)

    print("# ------------------ # Validation # ------------------ #")
    # validation
    # filename=None makes get_validation_data_hat average over the folds (predict_cv).
    validation_data_hat = get_validation_data_hat(
        cfg=Config, 
        tokenizer=tokenizer, 
        filename=None, 
        validation_data=validation_data
        )
    filepath = os.path.join(Config.EXP_PREDS, "validation_data.csv")
    validation_data_hat.to_csv(filepath, index=False)
    score = get_score(validation_data_hat)
    Config.logger.info(f"validation score = {score:.4f}")

print("# ------------------ # Inference # ------------------ #")
# Fold-averaged predictions for the comments to score.
preds = predict_cv(
    cfg=Config, 
    df=comments_to_score, 
    tokenizer=tokenizer, 
    text_col="text")

print(preds.shape)
# Collapse the target axis (if any) so that there is one score per comment.
if np.ndim(preds) > 1:
    mst029 = np.mean(preds, axis=1)  # mean of target
else:
    mst029 = preds

## MST030

In [None]:
"""
pseudo_label
toxic-xlm-roberta
RMSE
Dropout=0.0
"""
class Config:
    """Experiment settings for Exp-030; setup() later attaches paths, device and logger to this class."""
    author = "mst8823"
    wandb_entity = "mst8823"
    
    competition = "jigsaw-toxic-severity-rating"
    # Experiment name: used for checkpoint file names and, on Colab, the output directory.
    name = "Exp-030-toxic-xlm-roberta-Pseudo-Jigsaw1"
    debug = False  # NOTE(review): not referenced in the visible code
    # True: the driver cell skips training/validation and only runs inference.
    inference_only = True
    # False: when a local "Pretrain" copy exists, get_backborn builds the backbone from its config only.
    use_pretrain_model = False
    # Target columns read by the training dataset; their count is the output width of the head.
    target_cols = ["pseudo_label"]
    
    model_name = "unitary/multilingual-toxic-xlm-roberta"
    hidden_size = 768  # input width of the linear head in JigsawModel
    # head/tail: number of leading/trailing tokens kept by prepare_input when tail != 0;
    # with tail == 0 the tokenizer truncates/pads to max_length.
    head = 256
    tail = 0
    max_length = head + tail

    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]  # folds that are actually trained / used for prediction
    seed = 2022

    # Values below are passed to pl.Trainer in train_fold.
    max_epochs = 5
    gradient_clip_val = 100
    accumulate_grad_batches = 1
    early_stopping = False  # True adds an EarlyStopping callback on "val_loss"
    # Consumed by get_optimizer.
    optimizer = dict(
        optimizer="AdamW", 
        lr=1e-5, 
        weight_decay=2e-5
        )
    # Consumed by get_scheduler; "interval" is forwarded to Lightning.
    scheduler = dict(
        interval = "step",
        scheduler="CosineAnnealingWarmupRestarts",
        max_lr=1e-5,
        min_lr=1e-6,
        T_mult=1,
        warmup_steps=10,
        gamma=1)
    
    # DataLoader settings.
    train_batch_size = 8
    valid_batch_size = 32
    num_workers = 2
    resume_from_checkpoint = None

    # Colab paths; api_path is the kaggle.json read by setup() for the Kaggle credentials.
    colab_dir = "/content/drive/Shareddrives/Jigsaw-Rate-Severity-of-Toxic-Comments"
    drive_path = colab_dir + f"/{author}"
    api_path = drive_path + "/kaggle.json"

    upload_from_colab = False  # NOTE(review): not referenced in the visible code
    # On Kaggle, checkpoints are loaded from "<kaggle_dataset_path>/model" (see setup()).
    kaggle_dataset_path = "../input/exp-030-toxic-xlm-roberta-pseudo-jigsaw1"

    """
    - step scheduler example
    scheduler = dict(
        interval = "step",
        scheduler="get_cosine_schedule_with_warmup",
        num_warmup_steps=256, 
        num_cycles=0.5)

    """

In [None]:
import os
import re
import sys
import logging
import shutil
import json
import datetime
import requests
import itertools
import functools
import warnings
import joblib
import gc
import random
import string
import re
import collections

import pandas as pd
import numpy as np
import nltk

from tqdm.auto import tqdm
from sklearn.model_selection import KFold, StratifiedKFold
from scipy.special import softmax
from bs4 import BeautifulSoup

import torch
import torch.nn as nn
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import (
    CosineAnnealingWarmRestarts,
    CosineAnnealingLR,
    MultiStepLR, 
    ReduceLROnPlateau
    )
from torch.utils.data import Dataset, DataLoader

In [None]:
# =========================
# Utils
# =========================
class Logger:
    """ ref) https://github.com/ghmagazine/kagglebook/blob/master/ch04-model-interface/code/util.py

    Thin wrapper around a named ``logging`` logger that writes timestamped
    INFO messages to the console and to ``<path>/Experiment.log``.

    The logger is looked up by ``path``, so constructing ``Logger`` again
    with the same path reuses the already configured handlers.  Handlers
    (and therefore the log file) are only created on first construction,
    which avoids opening a new, never-closed file on every instantiation.
    """
    def __init__(self, path):
        self.general_logger = logging.getLogger(path)
        if len(self.general_logger.handlers) == 0:
            stream_handler = logging.StreamHandler()
            file_general_handler = logging.FileHandler(os.path.join(path, 'Experiment.log'))
            self.general_logger.addHandler(stream_handler)
            self.general_logger.addHandler(file_general_handler)
            self.general_logger.setLevel(logging.INFO)

    def info(self, message):
        """Log ``message`` at INFO level, prefixed with the current time."""
        # display time
        self.general_logger.info('[{}] - {}'.format(self.now_string(), message))

    @staticmethod
    def now_string():
        """Return the current local time formatted as ``YYYY-mm-dd HH:MM:SS``."""
        return str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))


def seed_everything(seed=2022):
    """Seed the Python, NumPy and PyTorch (CPU + CUDA) RNGs and pin cuDNN to deterministic mode."""
    os.environ["PYTHONHASHSEED"] = str(seed)

    # every seeding function takes the same integer seed
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    # disable cuDNN autotuning and request deterministic kernels
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


def setup(cfg):
    cfg.COLAB = "google.colab" in sys.modules
    if cfg.COLAB:
        print("This environment is Google Colab")
        
        # mount
        from google.colab import drive
        if not os.path.isdir("/content/drive"):
            drive.mount('/content/drive') 
        
        # import library
        ! pip install --quiet pytorch_lightning
        ! pip install --quiet transformers
        ! pip install --quiet wandb
        ! pip install --quiet sentencepiece
        ! pip install --quiet 'git+https://github.com/katsura-jp/pytorch-cosine-annealing-with-warmup'

        # use kaggle api (need kaggle token)
        f = open(cfg.api_path, 'r')
        json_data = json.load(f) 
        os.environ["KAGGLE_USERNAME"] = json_data["username"]
        os.environ["KAGGLE_KEY"] = json_data["key"]
        
        # set dirs
        cfg.DRIVE = cfg.drive_path
        cfg.EXP = (cfg.name if cfg.name is not None 
            else requests.get("http://172.28.0.2:9000/api/sessions").json()[0]["name"][:-6])
        cfg.INPUT = os.path.join(cfg.DRIVE, "Input")
        cfg.OUTPUT = os.path.join(cfg.DRIVE, "Output")
        cfg.SUBMISSION = os.path.join(cfg.DRIVE, "Submission")
        cfg.OUTPUT_EXP = os.path.join(cfg.OUTPUT, cfg.EXP) 
        cfg.EXP_MODEL = os.path.join(cfg.OUTPUT_EXP, "model")
        cfg.EXP_FIG = os.path.join(cfg.OUTPUT_EXP, "fig")
        cfg.EXP_PREDS = os.path.join(cfg.OUTPUT_EXP, "preds")

        # input data
        cfg.INPUT_JIGSAW_01 = os.path.join(cfg.INPUT, "jigsaw-toxic-comment-classification-challenge")
        cfg.INPUT_JIGSAW_02 = os.path.join(cfg.INPUT, "jigsaw-unintended-bias-in-toxicity-classification")
        cfg.INPUT_JIGSAW_03 = os.path.join(cfg.INPUT, "jigsaw-multilingual-toxic-comment-classification")
        cfg.INPUT_JIGSAW_04 = os.path.join(cfg.INPUT, "jigsaw-toxic-severity-rating")
        cfg.INPUT_RUDDIT = os.path.join(cfg.INPUT, "ruddit-jigsaw-dataset")
        cfg.jigsaw_inputs = [cfg.INPUT_JIGSAW_01, cfg.INPUT_JIGSAW_02, cfg.INPUT_JIGSAW_03, cfg.INPUT_JIGSAW_04, 
                             cfg.INPUT_RUDDIT]

        # make dirs
        for d in [cfg.INPUT, cfg.SUBMISSION, cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS] + cfg.jigsaw_inputs:
            os.makedirs(d, exist_ok=True)

        if not os.path.isfile(os.path.join(cfg.INPUT_JIGSAW_04, "comments_to_score.csv")):
            print("load dataset")
            ! pip install --upgrade --force-reinstall --no-deps kaggle
            ! kaggle competitions download -c jigsaw-toxic-comment-classification-challenge -p $cfg.INPUT_JIGSAW_01 
            ! kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification -p $cfg.INPUT_JIGSAW_02 
            ! kaggle competitions download -c jigsaw-multilingual-toxic-comment-classification -p $cfg.INPUT_JIGSAW_03 
            ! kaggle competitions download -c jigsaw-toxic-severity-rating -p $cfg.INPUT_JIGSAW_04 
            ! kaggle datasets download -d rajkumarl/ruddit-jigsaw-dataset -p $cfg.INPUT_RUDDIT

            for input_path in cfg.jigsaw_inputs:
                filepath = f'{input_path}/{input_path.split("/")[-1]}'
                ! unzip -d $input_path $filepath

    else:
        print("This environment is Kaggle Kernel")
        if not cfg.inference_only:
            ! pip install --quiet pytorch_lightning==1.5.8 

        # set dirs
        cfg.INPUT = f"../input"

        # input data
        cfg.INPUT_JIGSAW_01 = os.path.join(cfg.INPUT, "jigsaw-toxic-comment-classification-challenge")
        cfg.INPUT_JIGSAW_02 = os.path.join(cfg.INPUT, "jigsaw-unintended-bias-in-toxicity-classification")
        cfg.INPUT_JIGSAW_03 = os.path.join(cfg.INPUT, "jigsaw-multilingual-toxic-comment-classification")
        cfg.INPUT_JIGSAW_04 = os.path.join(cfg.INPUT, "jigsaw-toxic-severity-rating")
        cfg.INPUT_RUDDIT = os.path.join(cfg.INPUT, "ruddit-jigsaw-dataset")
        cfg.jigsaw_inputs = [cfg.INPUT_JIGSAW_01, cfg.INPUT_JIGSAW_02, cfg.INPUT_JIGSAW_03, cfg.INPUT_JIGSAW_04, 
                             cfg.INPUT_RUDDIT]

        cfg.EXP = cfg.OUTPUT_EXP = "./"
        if cfg.kaggle_dataset_path is not None:
            cfg.EXP_MODEL = os.path.join(cfg.kaggle_dataset_path, "model")
        else:
            cfg.EXP_MODEL = os.path.join(cfg.EXP, "model")

        cfg.SUBMISSION = "./"
        cfg.EXP_FIG = os.path.join(cfg.EXP, "fig")
        cfg.EXP_PREDS = os.path.join(cfg.EXP, "preds")

        # make dirs
        make_dirs = [cfg.EXP_FIG, cfg.EXP_PREDS]
        if not cfg.inference_only:
            make_dirs.append(cfg.EXP_MODEL)
        for d in make_dirs:
            os.makedirs(d, exist_ok=True)

    # set device    
    cfg.DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    warnings.filterwarnings("ignore")
    seed_everything(cfg.seed)

    cfg.logger = Logger(cfg.OUTPUT_EXP)

    return cfg


# =========================
# SetUp
# =========================
# Run the environment setup first: on Colab (and on Kaggle when training) it
# pip-installs packages that the imports below rely on.
Config = setup(Config)

# 2nd import
import pytorch_lightning as pl
import wandb

from transformers import (AutoConfig, AutoModel, AutoTokenizer)
from transformers import (get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup)

# The warmup-restart scheduler package is only imported when training.
if not Config.inference_only:
    from cosine_annealing_warmup import CosineAnnealingWarmupRestarts

# wandb setting
# Kaggle: log in with the "WANDB_API" secret, and only when training.
# Colab: plain wandb.login().
if not Config.COLAB:
    if  not Config.inference_only:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        api_key = user_secrets.get_secret("WANDB_API")
        wandb.login(key=api_key)
else:
    wandb.login()

In [None]:
# =============================
# Dataset
# =============================
class JigsawTrainDataset(Dataset):
    """Dataset yielding ``(tokenized inputs, float target tensor)`` for each row of ``df``.

    Text is taken from ``df[text_col]`` and targets from ``df[cfg.target_cols]``.
    """

    def __init__(self, cfg, df, tokenizer, text_col):
        self.cfg = cfg
        self.tokenizer = tokenizer
        self.comment_text = df[text_col].values
        self.targets = df[cfg.target_cols].values

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, idx):
        # str() guards against non-string cells
        encoded = prepare_input(self.cfg, str(self.comment_text[idx]), self.tokenizer)
        label = torch.tensor(self.targets[idx]).float()
        return encoded, label


class JigsawTestDataset(Dataset):
    """Inference dataset yielding only the tokenized inputs of ``df[text_col]``.

    Missing text values are replaced by the string "none".
    """

    def __init__(self, cfg, df, tokenizer, text_col):
        self.cfg = cfg
        self.tokenizer = tokenizer
        self.comment_text = df[text_col].fillna("none").values

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, idx):
        return prepare_input(self.cfg, str(self.comment_text[idx]), self.tokenizer)


def prepare_input(cfg, text, tokenizer):
    """
    Tokenize ``text`` into fixed-length ``torch.long`` tensors.

    - ``cfg.tail == 0``: the tokenizer truncates and pads to ``cfg.max_length``.
    - otherwise: the text is tokenized without padding; sequences longer than
      ``cfg.max_length`` keep their first ``cfg.head`` and last ``cfg.tail``
      entries, shorter ones are right-padded (``tokenizer.pad_token_id`` for
      ``input_ids``, 0 for every other field).

    The object returned by ``tokenizer.encode_plus`` is updated in place and returned.
    """
    if cfg.tail == 0:
        inputs = tokenizer.encode_plus(
            text,
            return_tensors=None,
            add_special_tokens=True,
            max_length=cfg.max_length,
            pad_to_max_length=True,
            truncation=True)

        for key in list(inputs.keys()):
            inputs[key] = torch.tensor(inputs[key], dtype=torch.long)
        return inputs

    inputs = tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        truncation=True)

    for key in list(inputs.keys()):
        values = inputs[key]
        n_values = len(values)

        # too long: keep the head and the tail, drop the middle
        if n_values > cfg.max_length:
            values = np.hstack([values[:cfg.head], values[-cfg.tail:]])

        padded = (np.ones(cfg.max_length) * tokenizer.pad_token_id
                  if key == 'input_ids'
                  else np.zeros(cfg.max_length))

        # the slice is clipped to max_length, so this also works for truncated sequences
        padded[:n_values] = values
        inputs[key] = torch.tensor(padded, dtype=torch.long)

    return inputs


class JigsawDataModule(pl.LightningDataModule):
    """
    LightningDataModule serving the train / validation split of one fold.

    Both splits are wrapped in ``JigsawTrainDataset`` (validation needs the
    targets too); the datasets are created in ``setup``.
    """
    def __init__(self, cfg, tokenizer, train_df, valid_df, text_col):
        # Zero-argument super(): actually runs LightningDataModule.__init__
        # (``super(JigsawDataModule)`` alone is an unbound proxy and skipped it).
        super().__init__()

        self.cfg = cfg
        self.text_col = text_col
        self.tokenizer = tokenizer
        self.train_df = train_df
        self.valid_df = valid_df

        self.train_dataset = None
        self.val_dataset = None

    def setup(self, stage=None):
        """Build the train and validation datasets; ``stage`` is not used."""
        self.train_dataset = JigsawTrainDataset(
            cfg=self.cfg, df=self.train_df, tokenizer=self.tokenizer, text_col=self.text_col)
        self.val_dataset = JigsawTrainDataset(
            cfg=self.cfg, df=self.valid_df, tokenizer=self.tokenizer, text_col=self.text_col)
        
    def train_dataloader(self):
        """Shuffled training loader; the last incomplete batch is dropped."""
        train_dataloader = DataLoader(
            self.train_dataset, 
            batch_size=self.cfg.train_batch_size, 
            shuffle=True, 
            num_workers=self.cfg.num_workers, 
            pin_memory=True, 
            drop_last=True)
        
        return train_dataloader

    def val_dataloader(self):
        """Validation loader in fixed order, keeping every sample."""
        val_dataloader = DataLoader(
            self.val_dataset,
            batch_size=self.cfg.valid_batch_size,
            shuffle=False,
            num_workers=self.cfg.num_workers, 
            pin_memory=True, 
            drop_last=False)

        return val_dataloader

In [None]:
# =============================
# Model
# =============================
def get_optimizer(cfg, parameters):
    """
    Create the optimizer described by ``cfg.optimizer``.

    ``cfg.optimizer`` is a dict with the keys ``optimizer`` ("AdamW" or
    "Adam"), ``lr`` and ``weight_decay``.  Any other optimizer name raises
    ``NotImplementedError``.
    """
    settings = cfg.optimizer
    kind = settings["optimizer"]

    if kind not in ("AdamW", "Adam"):
        raise NotImplementedError

    optimizer_cls = AdamW if kind == "AdamW" else Adam
    return optimizer_cls(
        parameters,
        lr=settings["lr"],
        weight_decay=settings["weight_decay"],
    )


def get_scheduler(cfg, optimizer, num_train_steps):
    """
    Create the learning-rate scheduler described by ``cfg.scheduler``.

    Supported values of ``cfg.scheduler["scheduler"]``:
    "get_linear_schedule_with_warmup", "get_cosine_schedule_with_warmup",
    "MultiStepLR" and "CosineAnnealingWarmupRestarts".  The remaining keys of
    the dict supply the scheduler-specific arguments.  Any other name raises
    ``NotImplementedError``.
    """
    settings = cfg.scheduler
    kind = settings["scheduler"]

    if kind == "get_linear_schedule_with_warmup":
        return get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=settings["num_warmup_steps"],
            num_training_steps=num_train_steps)

    if kind == "get_cosine_schedule_with_warmup":
        return get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=settings["num_warmup_steps"],
            num_training_steps=num_train_steps,
            num_cycles=settings["num_cycles"])

    if kind == "MultiStepLR":
        return MultiStepLR(
            optimizer,
            milestones=settings["milestones"],
            gamma=settings["gamma"])

    if kind == "CosineAnnealingWarmupRestarts":
        # NOTE(review): (steps per epoch) * batch size -- looks like it yields a
        # cycle several epochs long rather than one epoch; confirm this is intended.
        steps_per_epoch = num_train_steps // cfg.max_epochs
        first_cycle_steps = steps_per_epoch * cfg.train_batch_size
        print(first_cycle_steps)
        return CosineAnnealingWarmupRestarts(
            optimizer,
            first_cycle_steps=int(first_cycle_steps),
            cycle_mult=settings['T_mult'],
            max_lr=settings["max_lr"],
            min_lr=settings['min_lr'],
            warmup_steps=settings['warmup_steps'],
            gamma=settings['gamma'])

    raise NotImplementedError


class JigsawModel(pl.LightningModule):
    """
    Regression model: transformer backbone + one linear head.

    The head maps the hidden state at sequence position 0 to
    ``len(cfg.target_cols)`` outputs and is trained with MSE loss.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        # filled in by setup() / by the caller before fitting
        self.total_steps = None
        self.dataset_size = None

        self.backborn = get_backborn(cfg)
        self.out = nn.Linear(cfg.hidden_size, len(cfg.target_cols))

    def forward(self, inputs):
        # first output of the backbone, taken at sequence position 0
        first_token_state = self.backborn(**inputs)[0][:, 0, :]
        return self.out(first_token_state)

    def training_step(self, batch, batch_idx):
        inputs, targets = batch
        loss = self.loss(self.forward(inputs), targets)
        self.log("train_loss", loss, on_step=True, logger=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        inputs, targets = batch
        loss = self.loss(self.forward(inputs), targets)
        self.log("val_loss", loss, on_step=True, logger=True, prog_bar=True)
        return loss

    def loss(self, outputs, targets):
        # plain MSE; the square root (RMSE) variant is not applied
        criterion = nn.MSELoss()
        return criterion(outputs, targets)

    def setup(self, stage=None):
        """Compute ``self.total_steps`` for the scheduler; only done for the "fit" stage."""
        if stage == "fit":
            # calculate total steps
            if self.dataset_size is None:
                # NOTE(review): len() of a dataloader counts batches, not samples -- verify
                # before relying on this fallback (train_fold sets dataset_size itself).
                loader = self.trainer._data_connector._train_dataloader_source.dataloader()
                self.dataset_size = len(loader)

            # with gpus=-1 that value would otherwise end up in the device count
            num_devices = max(1, self.trainer.num_gpus, self.trainer.num_processes)
            effective_batch_size = (
                self.cfg.train_batch_size * self.trainer.accumulate_grad_batches * num_devices)
            print(self.dataset_size, effective_batch_size)

            steps_per_epoch = self.dataset_size // effective_batch_size
            self.total_steps = steps_per_epoch * self.cfg.max_epochs

    def configure_optimizers(self):
        optimizer = get_optimizer(self.cfg, parameters=self.parameters())

        if self.cfg.scheduler is None:
            return [optimizer]

        scheduler = get_scheduler(self.cfg, optimizer, num_train_steps=self.total_steps)
        scheduler_cfg = {"scheduler": scheduler, "interval": self.cfg.scheduler["interval"]}
        return [optimizer], [scheduler_cfg]

In [None]:
# =============================
# Metrics
# ============================= 
def get_validation_data_hat(cfg, tokenizer, filename, validation_data):
    """
    Score every comment of the validation pairs and attach the predictions.

    Each unique text of ``less_toxic`` / ``more_toxic`` is predicted once:
    with the fold average (``predict_cv``) when ``filename`` is None,
    otherwise with the single checkpoint ``filename`` (``predict``).
    Multi-target predictions are averaged over the targets.

    Returns a copy of ``validation_data`` with the extra columns
    ``less_toxic_preds`` and ``more_toxic_preds``.
    """
    pairs = validation_data.copy()

    unique_texts = set(pairs["less_toxic"].unique()) | set(pairs["more_toxic"].unique())
    df = pd.DataFrame({"text": sorted(unique_texts)})

    preds = (predict_cv(cfg, df, tokenizer, text_col="text")
             if filename is None
             else predict(cfg, df, tokenizer, filename, text_col="text"))

    # mean of targets when there is a target axis
    df["preds"] = np.mean(preds, axis=1) if np.ndim(preds) > 1 else preds.reshape(-1)

    # look the score up once for each side of the pair
    for side in ("less_toxic", "more_toxic"):
        pairs = (pd.merge(pairs, df, left_on=side, right_on="text", how="left")
                 .rename(columns={"preds": f"{side}_preds"})
                 .drop("text", axis=1))

    return pairs


def get_score(validation_data_hat):
    """Return the fraction of pairs whose ``more_toxic_preds`` is strictly above ``less_toxic_preds``."""
    more_toxic = validation_data_hat["more_toxic_preds"]
    less_toxic = validation_data_hat["less_toxic_preds"]
    correctly_ordered = more_toxic > less_toxic
    return np.mean(correctly_ordered)

In [None]:
# =============================
# Train & Predict
# =============================
def class2dict(f):
    """Return a dict of every attribute of ``f`` whose name does not start with a double underscore."""
    return {
        attr: getattr(f, attr)
        for attr in dir(f)
        if not attr.startswith('__')
    }


def train_fold(cfg, train_df, valid_df, tokenizer, filename, text_col):
    """
    Train one fold with PyTorch Lightning and log the run to Weights & Biases.

    The checkpoint with the lowest ``val_loss`` is saved to
    ``cfg.EXP_MODEL`` under the name ``filename``.  Nothing is returned.
    """
    # the last two "-"-separated parts of the filename name the run
    run_name = "_".join(filename.split("-")[-2:])
    wandblogger = pl.loggers.WandbLogger(
        project=cfg.competition,
        config=class2dict(cfg),
        group=f"{cfg.author}_{cfg.name}",
        name=run_name,
        job_type="train",
        reinit=True,
        anonymous=None,
        entity=cfg.wandb_entity,
    )

    datamodule = JigsawDataModule(
        cfg=cfg,
        tokenizer=tokenizer,
        train_df=train_df,
        valid_df=valid_df,
        text_col=text_col,
    )

    model = JigsawModel(cfg=cfg)
    # set explicitly so that JigsawModel.setup does not need to query the dataloader
    model.dataset_size = len(train_df)

    callbacks = [
        pl.callbacks.ModelCheckpoint(
            dirpath=cfg.EXP_MODEL,
            filename=filename,
            save_top_k=1,
            verbose=True,
            monitor="val_loss",
            mode="min",
        ),
        pl.callbacks.LearningRateMonitor(logging_interval="step"),
    ]
    if cfg.early_stopping:
        callbacks.append(
            pl.callbacks.EarlyStopping(
                monitor="val_loss",
                min_delta=0.0,
                patience=8,
                mode='min',
            )
        )

    trainer = pl.Trainer(
        max_epochs=cfg.max_epochs,
        callbacks=callbacks,
        logger=[wandblogger],
        gradient_clip_val=cfg.gradient_clip_val,
        accumulate_grad_batches=cfg.accumulate_grad_batches,
        resume_from_checkpoint=cfg.resume_from_checkpoint,
        deterministic=False,
        gpus=-1,
        precision=16,
    )
    trainer.fit(model, datamodule=datamodule)

    # close the wandb run and release cached GPU memory before the next fold
    wandb.finish(quiet=True)
    torch.cuda.empty_cache()


def get_filname_listdir(dirctory):
    """Return the entries of ``dirctory`` with their last file extension removed."""
    stems = []
    for entry in os.listdir(dirctory):
        stem, _ext = os.path.splitext(entry)
        stems.append(stem)
    return stems


def train_cv(cfg, df, tokenizer, text_col=None, validation_data=None, get_oof=True):
    """cross validation & get oof

    For every fold in ``cfg.trn_fold``:

    - train the fold unless a checkpoint with the fold's name already exists
      in ``cfg.EXP_MODEL``;
    - when ``validation_data`` is given, log the pairwise validation score of
      the fold's checkpoint;
    - when ``get_oof`` is True, predict the fold's validation rows.

    Returns the out-of-fold prediction frame (one column per target, rows of
    untrained folds stay 0) when ``get_oof`` is True, otherwise None.  The
    frame is returned only after *all* folds have been processed.
    """
    oof_df = pd.DataFrame(np.zeros((len(df), len(cfg.target_cols))), columns=cfg.target_cols)

    for i_fold in range(cfg.n_fold):
        if i_fold not in cfg.trn_fold:
            continue

        filename = f"{cfg.name}-seed{cfg.seed}-fold{i_fold}"
        filelist = get_filname_listdir(cfg.EXP_MODEL)

        val_mask = (df["fold"] == i_fold).astype(bool)
        train_df = df[~val_mask].reset_index(drop=True)
        valid_df = df[val_mask].reset_index(drop=True)

        # skip training when the checkpoint of this fold is already on disk
        if filename not in filelist:
            print(f"# --------- # Start Training Fold={i_fold} # --------- #")
            # training
            train_fold(
                cfg=cfg,
                train_df=train_df,
                valid_df=valid_df,
                tokenizer=tokenizer,
                filename=filename,
                text_col=text_col
                )

        # get validation data score
        if validation_data is not None:
            validation_data_hat = get_validation_data_hat(cfg, tokenizer, filename, validation_data)
            val_score = get_score(validation_data_hat)
            log = f"{cfg.name}-seed{cfg.seed}-fold{i_fold}: validation data score={val_score:.4f}"
            cfg.logger.info(log)

        # get validation prediction
        if get_oof:
            preds = predict(
                cfg=cfg,
                df=valid_df,
                tokenizer=tokenizer,
                filename=filename,
                text_col=text_col)

            # positional boolean mask: independent of the index labels of ``df``
            oof_df.loc[val_mask.values] = preds

    return oof_df if get_oof else None


def predict(cfg, df, tokenizer, filename, text_col):
    """
    Predict ``df[text_col]`` with the checkpoint ``<cfg.EXP_MODEL>/<filename>.ckpt``.

    Returns a float array of shape ``(len(df), len(cfg.target_cols))`` whose
    rows follow the order of ``df``.

    The checkpoint is loaded with ``map_location=cfg.DEVICE`` so that weights
    saved on a GPU can also be loaded on a CPU-only machine.
    """
    test_dataset = JigsawTestDataset(
        cfg=cfg, tokenizer=tokenizer, df=df, text_col=text_col)

    test_dataloader = DataLoader(
        test_dataset,
        batch_size=cfg.valid_batch_size,
        shuffle=False,
        num_workers=cfg.num_workers,
        pin_memory=True,
        drop_last=False
        )

    lightning_model = JigsawModel(cfg=cfg).to(cfg.DEVICE).eval()
    checkpoint_path = os.path.join(cfg.EXP_MODEL, filename + ".ckpt")
    checkpoint = torch.load(checkpoint_path, map_location=cfg.DEVICE)
    lightning_model.load_state_dict(checkpoint['state_dict'])

    num_targets = len(cfg.target_cols)
    preds = np.zeros((len(df), num_targets))  # N * num targets
    fill_start_idx = 0

    for inputs in tqdm(test_dataloader, total=len(test_dataloader)):
        # move the batch to the model's device
        for k, v in inputs.items():
            inputs[k] = v.to(cfg.DEVICE)

        with torch.no_grad():
            pred = lightning_model(inputs)
            pred = pred.cpu().numpy()  # bs * num targets

        # write the batch into its slot; the loader is not shuffled, so order is kept
        fill_end_idx = pred.shape[0] + fill_start_idx  # bs + idx
        preds[fill_start_idx:fill_end_idx] = pred
        fill_start_idx = fill_end_idx

    # free the model and loader before the next checkpoint is loaded
    del test_dataset, test_dataloader, lightning_model
    gc.collect()

    return preds


def predict_cv(cfg, df, tokenizer, text_col):
    """
    Average the predictions of all fold checkpoints listed in ``cfg.trn_fold``.

    Each fold is predicted with ``predict`` using the checkpoint name
    ``f"{cfg.name}-seed{cfg.seed}-fold{i_fold}"``; the element-wise mean over
    the folds is returned.
    """
    fold_preds = [
        predict(cfg, df, tokenizer, f"{cfg.name}-seed{cfg.seed}-fold{i_fold}", text_col)
        for i_fold in range(cfg.n_fold)
        if i_fold in cfg.trn_fold
    ]

    return np.mean(fold_preds, axis=0)  # fold mean

In [None]:
# =============================
# Load Model
# =============================
def get_tokenizer(cfg):
    """
    Return the tokenizer for ``cfg.model_name``, cached under ``<cfg.EXP_MODEL>/Pretrain``.

    When ``tokenizer_config.json`` already exists in that directory the
    tokenizer is loaded from there; otherwise it is loaded by model name and
    saved into the directory.
    """
    pretrained_dir = os.path.join(cfg.EXP_MODEL, "Pretrain")
    cached_marker = os.path.join(pretrained_dir, "tokenizer_config.json")  # tokenizer.json??

    if os.path.isfile(cached_marker):
        return AutoTokenizer.from_pretrained(pretrained_dir)

    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
    tokenizer.save_pretrained(pretrained_dir)
    return tokenizer


def get_backborn(cfg):
    """
    Build the transformer backbone with all dropout disabled.

    - No ``pytorch_model.bin`` under ``<cfg.EXP_MODEL>/Pretrain``: the model
      is loaded by ``cfg.model_name`` and saved into that directory.
    - Otherwise the saved config is used, and the weights are loaded from the
      directory only when ``cfg.use_pretrain_model`` is True; else the model
      is created from the config alone.
    """
    pretrained_dir = os.path.join(cfg.EXP_MODEL, "Pretrain")
    has_local_copy = os.path.isfile(os.path.join(pretrained_dir, "pytorch_model.bin"))

    config_source = pretrained_dir if has_local_copy else cfg.model_name
    model_config = AutoConfig.from_pretrained(config_source)

    # No dropout
    model_config.attention_probs_dropout_prob = 0.0
    model_config.hidden_dropout_prob = 0.0

    if not has_local_copy:
        backborn = AutoModel.from_pretrained(cfg.model_name, config=model_config)
        backborn.save_pretrained(pretrained_dir)
        return backborn

    if cfg.use_pretrain_model:
        return AutoModel.from_pretrained(pretrained_dir, config=model_config)

    # at inference time the pretrained weights are not needed: cfg.use_pretrain_model=False
    return AutoModel.from_config(model_config)

In [None]:
# =============================
# Create Data
# =============================
def read_csv(filepath, **kwargs):
    """
    Read a CSV with ``pd.read_csv``, tolerating two layouts of the input data.

    - ``filepath`` is a directory: the file of the same name inside it is read.
    - ``filepath`` does not exist: ``filepath + ".zip"`` is read instead.

    Only a missing file triggers the ``.zip`` fallback; any other error
    (bad ``usecols``, parse errors, interrupts, ...) is raised unchanged
    instead of being hidden behind a misleading "zip not found" error.

    ``kwargs`` are forwarded to ``pd.read_csv``.
    """
    if os.path.isdir(filepath):
        # normpath makes the lookup work with a trailing separator as well
        filename = os.path.basename(os.path.normpath(filepath))
        filepath = os.path.join(filepath, filename)

    try:
        return pd.read_csv(filepath, **kwargs)
    except FileNotFoundError:
        return pd.read_csv(filepath + ".zip", **kwargs)


def text_cleaning(text):
    '''
    ref) # https://www.kaggle.com/manabendrarout/pytorch-roberta-ranking-baseline-jrstc-train

    Cleans text into a basic form for NLP. Operations, in order:
    1. Removes embedded URL links
    2. Removes HTML tags
    3. Removes emojis
    4. Collapses runs of spaces
    5. Removes IP addresses and replaces "!" with a space
    6. Removes newlines, quotes, "|" and "=", and expands a few obfuscated
       words / abbreviations

    text - Text piece to be cleaned.
    Returns the cleaned text with leading/trailing whitespace stripped.
    '''
    template = re.compile(r'https?://\S+|www\.\S+') #Removes website links
    text = template.sub(r'', text)

    soup = BeautifulSoup(text, 'lxml') #Removes HTML tags
    text = soup.get_text()

    # NOTE(review): the last range (U+24C2..U+1F251) is very wide and also covers
    # e.g. CJK ideographs; kept unchanged to preserve the existing behaviour.
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    # text = re.sub(r"[^a-zA-Z\d]", " ", text) #Remove special Characters
    text = re.sub(' +', ' ', text) #Remove Extra Spaces
    # raw string: "\d" / "\." in a normal string literal are invalid escape sequences
    ipPattern = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}') # Removes IP address
    text = ipPattern.sub(r'', text)
    bikkuri = re.compile('!') # Removes exclamation marks ("bikkuri")
    text = bikkuri.sub(r' ', text)

    # Literal replacements, applied strictly in this order.  The " u " / " U "
    # entries appear twice on purpose: str.replace does not handle overlapping
    # matches, so " u u " needs a second pass.
    replacements = (
        ('\n', ''),
        ("\'", ""),
        ("|", ""),
        ("=", ""),
        ("F**K", "FUCK"),
        ("F__K", "FUCK"),
        ("f**k", "fuck"),
        ("f__k", "fuck"),
        ("f*ck", "fuck"),
        ("S$X", "SEX"),
        ("s$x", "sex"),
        (" u ", " you "),
        (" u ", " you "),
        (" U ", " you "),
        (" U ", " you "),
        ("YOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOUUUUUUUUUUUUUUUUUUUU", "YOU"),
    )
    for old, new in replacements:
        text = text.replace(old, new)

    text = text.strip() # remove spaces at the beginning and at the end of string
    return text


def text_normalization(s:pd.Series):
    """Apply ``text_cleaning`` to every element of ``s`` and return the resulting Series."""
    return s.apply(text_cleaning)


def get_jigsaw_01_dataset(cfg):
    """
    jigsaw-toxic-comment-classification-challenge
    - text_col : "comment_text"
    - target_cols : ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

    Reads ``train.csv``, ``test.csv`` and ``test_labels.csv`` from
    ``cfg.INPUT_JIGSAW_01``.  Test rows whose ``toxic`` label is -1 are
    discarded; the remaining test rows are merged with their labels on ``id``
    and appended to the training rows.
    """
    base_dir = cfg.INPUT_JIGSAW_01
    train_part = read_csv(os.path.join(base_dir, "train.csv"))
    test_part = read_csv(os.path.join(base_dir, "test.csv"))
    test_labels = read_csv(os.path.join(base_dir, "test_labels.csv"))

    # the mask comes from the label file and is applied to both frames
    scored = test_labels["toxic"] != -1
    labelled_test = pd.merge(test_part[scored], test_labels[scored], on="id", how="left")

    return pd.concat([train_part, labelled_test], axis=0).reset_index(drop=True)


def get_jigsaw_02_dataset(cfg, cat_threshold=0.5):
    """
    jigsaw-unintended-bias-in-toxicity-classification
    - text_col : "comment_text"
    - target_cols : ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

    The individual annotations are averaged per "id". When ``cat_threshold``
    is not None the averages are binarised (1 if >= threshold, else 0).
    Comments without a complete set of labels are dropped after the join.
    """
    root = cfg.INPUT_JIGSAW_02
    comments = read_csv(os.path.join(root, "all_data.csv"), usecols=["id", "comment_text"])
    annotations = read_csv(os.path.join(root, "toxicity_individual_annotations.csv"))
    mean_labels = annotations.groupby(["id"]).agg("mean")

    if cat_threshold is not None:
        binarised = np.where(mean_labels >= cat_threshold, 1, 0)
        mean_labels = pd.DataFrame(
            binarised,
            index=mean_labels.index,
            columns=mean_labels.columns)

    merged = pd.merge(comments, mean_labels, on="id", how="left")
    merged = merged.dropna(axis=0).reset_index(drop=True)
    merged = merged.rename(columns={"identity_attack": "identity_hate"})

    return merged.drop(["sexual_explicit", "worker"], axis=1)


def get_ruddit_dataset(cfg):
    """
    Ruddit Dataset
    - text_col : "comment_text"
    - target_cols : "offensiveness_score"

    Rows whose "txt" is exactly "[deleted]" or "[removed]" are discarded.
    The remaining text is copied to "comment_text" (missing values become
    "none") and the original "txt" column is dropped. No text normalization
    is applied here.
    """
    csv_path = os.path.join(cfg.INPUT_RUDDIT, "Dataset", "ruddit_with_text.csv")
    ruddit_df = read_csv(csv_path)

    is_placeholder = ruddit_df["txt"].isin(["[deleted]", "[removed]"])
    ruddit_df = ruddit_df[~is_placeholder].reset_index(drop=True)

    ruddit_df["comment_text"] = ruddit_df["txt"].fillna("none")
    return ruddit_df.drop("txt", axis=1)


def get_fold_idx(cfg, df):
    """Assign a cross-validation fold id to every row of ``df``.

    A "fold" column is added to ``df`` in place (existing values are
    overwritten) and filled with the index of the validation fold each row
    falls into, using a shuffled ``KFold`` seeded with ``cfg.seed``.

    Args:
        cfg: config object providing ``n_fold``, ``seed`` and ``target_cols``.
        df: frame to split; it is modified in place.

    Returns:
        The same ``df`` object, now carrying the "fold" column.
    """
    df["fold"] = -1
    # Passed to split() as y. NOTE(review): plain KFold should not use y for
    # the split itself; presumably kept so a stratified splitter can be
    # swapped in - confirm.
    y = df[cfg.target_cols].sum(axis=1)
    cv_strategy = KFold(n_splits=cfg.n_fold, shuffle=True, random_state=cfg.seed)

    fold_col = df.columns.get_loc("fold")
    for i_fold, (_, va_idx) in enumerate(cv_strategy.split(X=df, y=y)):
        # split() yields *positional* row indices, so they must be written by
        # position; label-based .loc would hit the wrong rows (or create new
        # ones) whenever df does not have a 0..n-1 index.
        df.iloc[va_idx, fold_col] = i_fold

    return df


def get_custom_jigsaw_dataset(cfg, train_data, validation_data):
    """
    ref) https://www.kaggle.com/toru59er/0-866-tfidf-ridge-simple-baseline

    Build an undersampled training frame that does not overlap the
    validation comments.

    Steps:
    1. A per-row score is computed as the sum of ``cfg.target_cols``.
    2. All rows with score > 0 are kept; rows with score == 0 are randomly
       undersampled (seeded with ``cfg.seed``) to at most the same count.
    3. Rows whose "comment_text" appears in ``validation_data`` (either the
       "less_toxic" or the "more_toxic" column) are removed.

    Args:
        cfg: config object providing ``target_cols`` and ``seed``.
        train_data: frame with "comment_text" and the target columns.
            It is not modified.
        validation_data: frame with "less_toxic" and "more_toxic" text columns.

    Returns:
        A new frame with a fresh 0..n-1 index and no "toxic_score" column.
    """
    # Kept as a standalone Series so the caller's frame is not mutated.
    toxic_score = train_data[cfg.target_cols].sum(axis=1)

    # undersample
    toxic_mask = (toxic_score > 0).astype(bool)
    nontoxic_rows = train_data[toxic_score == 0]
    # Never ask for more rows than exist: sample() raises otherwise, which
    # happened whenever zero-score rows were the minority.
    n_sample = min(int(np.sum(toxic_mask)), len(nontoxic_rows))
    sampled_data = nontoxic_rows.sample(n=n_sample, random_state=cfg.seed)

    balanced = pd.concat([train_data[toxic_mask], sampled_data]).reset_index(drop=True)
    # A pre-existing "toxic_score" column is still removed from the result.
    balanced = balanced.drop(columns="toxic_score", errors="ignore")

    # drop every training comment that also occurs in the validation pairs
    val_comment_unq = np.unique(
        validation_data['less_toxic'].tolist() + validation_data['more_toxic'].tolist())
    duplicate_idx = np.isin(balanced['comment_text'], val_comment_unq)

    return balanced.iloc[~duplicate_idx].reset_index(drop=True)

In [None]:
# Notebook driver cell: load the scoring data, optionally train and validate,
# then run fold-averaged inference on comments_to_score.csv.
print("# ------------------ # Load Data # ------------------ #")

# load tokenizer
tokenizer = get_tokenizer(Config)

comments_to_score = read_csv(os.path.join(Config.INPUT_JIGSAW_04 , "comments_to_score.csv"))

# Only the first 100 rows are scored when the file has exactly 7537 rows.
# NOTE(review): 7537 is presumably the row count of the public
# comments_to_score.csv, making this a shortcut for non-submission runs - confirm.
if len(comments_to_score) == 7537:
    comments_to_score = comments_to_score.iloc[:100]
    
# comments_to_score["text"] = text_normalization(comments_to_score["text"])
sample_submission = read_csv(os.path.join(Config.INPUT_JIGSAW_04 , "sample_submission.csv"))

# Training and validation are skipped entirely when Config.inference_only is set.
if not Config.inference_only:

    # load validation data
    validation_data = read_csv(os.path.join(Config.INPUT_JIGSAW_04 , "validation_data.csv"))

    # load train data
    # NOTE(review): absolute Drive path is hard-coded here instead of being
    # derived from Config - it only resolves where that shared drive is mounted.
    train_data = read_csv("/content/drive/Shareddrives/Jigsaw-Rate-Severity-of-Toxic-Comments/mst8823/Input/PseudoLabelDataset-Jigsaw1.csv")
    train_data = get_custom_jigsaw_dataset(Config, train_data, validation_data)
    train_data = get_fold_idx(cfg=Config, df=train_data)

    # train_data["comment_text"] = text_normalization(train_data["comment_text"])
    # validation_data["less_toxic"] = text_normalization(validation_data["less_toxic"])
    # validation_data["more_toxic"] = text_normalization(validation_data["more_toxic"])

    print("# ------------------ # Training # ------------------ #")
    # training
    train_cv(
        cfg=Config, 
        df=train_data, 
        tokenizer=tokenizer, 
        text_col="comment_text",  #comment_text
        validation_data=validation_data, 
        get_oof=False)

    print("# ------------------ # Validation # ------------------ #")
    # validation
    # filename=None makes get_validation_data_hat use predict_cv (all folds)
    validation_data_hat = get_validation_data_hat(
        cfg=Config, 
        tokenizer=tokenizer, 
        filename=None, 
        validation_data=validation_data
        )
    filepath = os.path.join(Config.EXP_PREDS, "validation_data.csv")
    validation_data_hat.to_csv(filepath, index=False)
    score = get_score(validation_data_hat)
    Config.logger.info(f"validation score = {score:.4f}")

print("# ------------------ # Inference # ------------------ #")
preds = predict_cv(
    cfg=Config, 
    df=comments_to_score, 
    tokenizer=tokenizer, 
    text_col="text")

print(preds.shape)
# Collapse multi-target predictions to a single score per comment.
if np.ndim(preds) > 1:
    mst030 = np.mean(preds, axis=1)  # mean of target
else:
    mst030 = preds

## MSTtweet

In [None]:
"""
pseudo labeling
tweet dataset
"""
class Config:
    # Settings for the tweet pseudo-labeling experiment.
    # The class is used as a plain attribute namespace; setup() attaches
    # further attributes (directories, device, logger) to it at runtime.

    # --- identity / bookkeeping ---
    author = "mst8823"
    wandb_entity = "mst8823"
    competition = "jigsaw-toxic-severity-rating"
    name = "Pseudo-Labeling-001"

    # --- run mode ---
    debug = False
    inference_only = True
    use_pretrain_model = False
    target_cols = ["pseudo_label"]

    # --- backbone / tokenization ---
    model_name = "unitary/multilingual-toxic-xlm-roberta"
    hidden_size = 768
    head = 256
    tail = 0
    max_length = head + tail

    # --- cross validation ---
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]
    seed = 2022

    # --- optimisation ---
    max_epochs = 4
    gradient_clip_val = 100
    accumulate_grad_batches = 1
    early_stopping = False
    optimizer = {
        "optimizer": "AdamW",
        "lr": 1e-5,
        "weight_decay": 2e-5,
    }
    scheduler = {
        "interval": "step",
        "scheduler": "CosineAnnealingWarmupRestarts",
        "max_lr": 1e-5,
        "min_lr": 1e-6,
        "T_mult": 1,
        "warmup_steps": 10,
        "gamma": 1,
    }

    # --- data loading ---
    train_batch_size = 8
    valid_batch_size = 32
    num_workers = 4
    resume_from_checkpoint = None

    # --- Colab / Drive locations ---
    colab_dir = "/content/drive/Shareddrives/Jigsaw-Rate-Severity-of-Toxic-Comments"
    drive_path = f"{colab_dir}/{author}"
    api_path = f"{drive_path}/kaggle.json"

    # --- Kaggle side ---
    upload_from_colab = False
    kaggle_dataset_path = "../input/pseudo-labeling-001-code-fit"

    """
    - step scheduler example
    scheduler = dict(
        interval = "step",
        scheduler="get_cosine_schedule_with_warmup",
        num_warmup_steps=256, 
        num_cycles=0.5)

    """

In [None]:
import os
import re
import sys
import logging
import shutil
import json
import datetime
import requests
import itertools
import functools
import warnings
import joblib
import gc
import random
import string
import re
import collections

import pandas as pd
import numpy as np
import nltk

from tqdm.auto import tqdm
from sklearn.model_selection import KFold, StratifiedKFold
from scipy.special import softmax
from bs4 import BeautifulSoup

import torch
import torch.nn as nn
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import (
    CosineAnnealingWarmRestarts,
    CosineAnnealingLR,
    MultiStepLR, 
    ReduceLROnPlateau
    )
from torch.utils.data import Dataset, DataLoader

In [None]:
# =========================
# Utils
# =========================
class Logger:
    """Timestamping logger that writes to the console and to ``<path>/Experiment.log``.

    ref) https://github.com/ghmagazine/kagglebook/blob/master/ch04-model-interface/code/util.py
    """
    def __init__(self, path):
        """Create (or re-attach to) the logger named after ``path``.

        Args:
            path: directory that receives ``Experiment.log``. It is also used
                as the ``logging`` logger name, so constructing several
                ``Logger`` objects with the same path shares one underlying
                logger and one set of handlers.
        """
        self.general_logger = logging.getLogger(path)
        # Handlers are built only when the logger has none yet. FileHandler
        # opens its file on construction, so creating it unconditionally would
        # leave an unused, never-closed file handle behind on every repeated
        # instantiation (e.g. when the setup cell is re-run).
        if not self.general_logger.handlers:
            stream_handler = logging.StreamHandler()
            file_general_handler = logging.FileHandler(os.path.join(path, 'Experiment.log'))
            self.general_logger.addHandler(stream_handler)
            self.general_logger.addHandler(file_general_handler)
            self.general_logger.setLevel(logging.INFO)

    def info(self, message):
        """Log ``message`` at INFO level, prefixed with the current time."""
        # display time
        self.general_logger.info('[{}] - {}'.format(self.now_string(), message))

    @staticmethod
    def now_string():
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))


def seed_everything(seed=2022):
    """Seed the Python, NumPy and PyTorch random generators with ``seed``.

    Also exports ``PYTHONHASHSEED`` and sets cuDNN to
    ``deterministic=True`` / ``benchmark=False``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


def setup(cfg):
    cfg.COLAB = "google.colab" in sys.modules
    if cfg.COLAB:
        print("This environment is Google Colab")
        
        # mount
        from google.colab import drive
        if not os.path.isdir("/content/drive"):
            drive.mount('/content/drive') 
        
        # import library
        ! pip install --quiet pytorch_lightning
        ! pip install --quiet transformers
        ! pip install --quiet wandb
        ! pip install --quiet sentencepiece
        ! pip install --quiet 'git+https://github.com/katsura-jp/pytorch-cosine-annealing-with-warmup'

        # use kaggle api (need kaggle token)
        f = open(cfg.api_path, 'r')
        json_data = json.load(f) 
        os.environ["KAGGLE_USERNAME"] = json_data["username"]
        os.environ["KAGGLE_KEY"] = json_data["key"]
        
        # set dirs
        cfg.DRIVE = cfg.drive_path
        cfg.EXP = (cfg.name if cfg.name is not None 
            else requests.get("http://172.28.0.2:9000/api/sessions").json()[0]["name"][:-6])
        cfg.INPUT = os.path.join(cfg.DRIVE, "Input")
        cfg.OUTPUT = os.path.join(cfg.DRIVE, "Output")
        cfg.SUBMISSION = os.path.join(cfg.DRIVE, "Submission")
        cfg.OUTPUT_EXP = os.path.join(cfg.OUTPUT, cfg.EXP) 
        cfg.EXP_MODEL = os.path.join(cfg.OUTPUT_EXP, "model")
        cfg.EXP_FIG = os.path.join(cfg.OUTPUT_EXP, "fig")
        cfg.EXP_PREDS = os.path.join(cfg.OUTPUT_EXP, "preds")

        # input data
        cfg.INPUT_JIGSAW_01 = os.path.join(cfg.INPUT, "jigsaw-toxic-comment-classification-challenge")
        cfg.INPUT_JIGSAW_02 = os.path.join(cfg.INPUT, "jigsaw-unintended-bias-in-toxicity-classification")
        cfg.INPUT_JIGSAW_03 = os.path.join(cfg.INPUT, "jigsaw-multilingual-toxic-comment-classification")
        cfg.INPUT_JIGSAW_04 = os.path.join(cfg.INPUT, "jigsaw-toxic-severity-rating")
        cfg.INPUT_RUDDIT = os.path.join(cfg.INPUT, "ruddit-jigsaw-dataset")
        cfg.jigsaw_inputs = [cfg.INPUT_JIGSAW_01, cfg.INPUT_JIGSAW_02, cfg.INPUT_JIGSAW_03, cfg.INPUT_JIGSAW_04, 
                             cfg.INPUT_RUDDIT]

        # make dirs
        for d in [cfg.INPUT, cfg.SUBMISSION, cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS] + cfg.jigsaw_inputs:
            os.makedirs(d, exist_ok=True)

        if not os.path.isfile(os.path.join(cfg.INPUT_JIGSAW_04, "comments_to_score.csv")):
            print("load dataset")
            ! pip install --upgrade --force-reinstall --no-deps kaggle
            ! kaggle competitions download -c jigsaw-toxic-comment-classification-challenge -p $cfg.INPUT_JIGSAW_01 
            ! kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification -p $cfg.INPUT_JIGSAW_02 
            ! kaggle competitions download -c jigsaw-multilingual-toxic-comment-classification -p $cfg.INPUT_JIGSAW_03 
            ! kaggle competitions download -c jigsaw-toxic-severity-rating -p $cfg.INPUT_JIGSAW_04 
            ! kaggle datasets download -d rajkumarl/ruddit-jigsaw-dataset -p $cfg.INPUT_RUDDIT

            for input_path in cfg.jigsaw_inputs:
                filepath = f'{input_path}/{input_path.split("/")[-1]}'
                ! unzip -d $input_path $filepath

    else:
        print("This environment is Kaggle Kernel")
        if not cfg.inference_only:
            ! pip install --quiet pytorch_lightning==1.5.8 
            ! pip install --quiet 'git+https://github.com/katsura-jp/pytorch-cosine-annealing-with-warmup'

        # set dirs
        cfg.INPUT = f"../input"

        # input data
        cfg.INPUT_JIGSAW_01 = os.path.join(cfg.INPUT, "jigsaw-toxic-comment-classification-challenge")
        cfg.INPUT_JIGSAW_02 = os.path.join(cfg.INPUT, "jigsaw-unintended-bias-in-toxicity-classification")
        cfg.INPUT_JIGSAW_03 = os.path.join(cfg.INPUT, "jigsaw-multilingual-toxic-comment-classification")
        cfg.INPUT_JIGSAW_04 = os.path.join(cfg.INPUT, "jigsaw-toxic-severity-rating")
        cfg.INPUT_RUDDIT = os.path.join(cfg.INPUT, "ruddit-jigsaw-dataset")
        cfg.jigsaw_inputs = [cfg.INPUT_JIGSAW_01, cfg.INPUT_JIGSAW_02, cfg.INPUT_JIGSAW_03, cfg.INPUT_JIGSAW_04, 
                             cfg.INPUT_RUDDIT]

        cfg.EXP = cfg.OUTPUT_EXP = "./"
        if cfg.kaggle_dataset_path is not None:
            cfg.EXP_MODEL = os.path.join(cfg.kaggle_dataset_path, "model")
        else:
            cfg.EXP_MODEL = os.path.join(cfg.EXP, "model")

        cfg.SUBMISSION = "./"
        cfg.EXP_FIG = os.path.join(cfg.EXP, "fig")
        cfg.EXP_PREDS = os.path.join(cfg.EXP, "preds")

        # make dirs
        make_dirs = [cfg.EXP_FIG, cfg.EXP_PREDS]
        if not cfg.inference_only:
            make_dirs.append(cfg.EXP_MODEL)
        for d in make_dirs:
            os.makedirs(d, exist_ok=True)

    # set device    
    cfg.DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    warnings.filterwarnings("ignore")
    seed_everything(cfg.seed)

    cfg.logger = Logger(cfg.OUTPUT_EXP)

    return cfg


# =========================
# SetUp
# =========================
# Run the environment setup; it returns the same Config with paths, device
# and logger attached.
Config = setup(Config)

# 2nd import
# These packages are imported only after setup() has run, since setup()
# pip-installs some of them.
import pytorch_lightning as pl
import wandb

from transformers import (AutoConfig, AutoModel, AutoTokenizer)
from transformers import (get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup)

# The warmup-restart scheduler package is only imported for training runs.
if not Config.inference_only:
    from cosine_annealing_warmup import CosineAnnealingWarmupRestarts

# wandb setting
# Outside Colab: log in with the "WANDB_API" secret, and only when training.
# On Colab: wandb.login() is called unconditionally (also for inference-only runs).
if not Config.COLAB:
    if  not Config.inference_only:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        api_key = user_secrets.get_secret("WANDB_API")
        wandb.login(key=api_key)
else:
    wandb.login()

In [None]:
# =============================
# Dataset
# =============================
class JigsawTrainDataset(Dataset):
    """Dataset yielding ``(tokenized inputs, float targets)`` for one row.

    Texts come from ``df[text_col]`` and targets from ``df[cfg.target_cols]``.
    """
    def __init__(self, cfg, df, tokenizer, text_col):
        self.cfg = cfg
        self.tokenizer = tokenizer
        self.comment_text = df[text_col].values
        self.targets = df[cfg.target_cols].values

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, idx):
        # str() guards against non-string cells
        encoded = prepare_input(self.cfg, str(self.comment_text[idx]), self.tokenizer)
        return encoded, torch.tensor(self.targets[idx]).float()


class JigsawTestDataset(Dataset):
    """Dataset yielding only the tokenized inputs of ``df[text_col]``.

    Missing texts are replaced by the string "none" at construction time.
    """
    def __init__(self, cfg, df, tokenizer, text_col):
        self.cfg = cfg
        self.tokenizer = tokenizer
        self.comment_text = df[text_col].fillna("none").values

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, idx):
        return prepare_input(self.cfg, str(self.comment_text[idx]), self.tokenizer)


def prepare_input(cfg, text, tokenizer):
    """Tokenize ``text`` into fixed-length ``torch.long`` tensors.

    Two modes, selected by ``cfg.tail``:

    * ``cfg.tail == 0``: the tokenizer itself truncates and pads every field
      to ``cfg.max_length``.
    * otherwise: the text is tokenized without padding; any field longer than
      ``cfg.max_length`` is reduced to its first ``cfg.head`` and last
      ``cfg.tail`` entries, and shorter fields are right-padded
      (``tokenizer.pad_token_id`` for "input_ids", 0 for everything else).

    Args:
        cfg: config providing ``head``, ``tail`` and ``max_length``.
        text: the string to encode.
        tokenizer: object exposing ``encode_plus`` and ``pad_token_id``.

    Returns:
        The tokenizer output with every value replaced by a 1-D long tensor
        of length ``cfg.max_length``.
    """
    if cfg.tail == 0:
        inputs = tokenizer.encode_plus(
            text,
            return_tensors=None,
            add_special_tokens=True,
            max_length=cfg.max_length,
            # padding="max_length" is the supported spelling of the
            # deprecated pad_to_max_length=True
            padding="max_length",
            truncation=True)

        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)

        return inputs

    inputs = tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        truncation=True)

    for k, v in inputs.items():
        if len(v) > cfg.max_length:
            # keep the beginning and the end of an over-long sequence
            v = np.hstack([v[:cfg.head], v[-cfg.tail:]])

        fill_value = tokenizer.pad_token_id if k == 'input_ids' else 0
        new_v = np.full(cfg.max_length, fill_value)
        # use the length *after* cutting so the slice always matches v
        new_v[:len(v)] = v
        inputs[k] = torch.tensor(new_v, dtype=torch.long)

    return inputs


class JigsawDataModule(pl.LightningDataModule):
    """LightningDataModule serving the train/validation split of one fold.

    Both splits are wrapped in ``JigsawTrainDataset`` (inputs + targets).
    """
    def __init__(self, cfg, tokenizer, train_df, valid_df, text_col):
        """
        Args:
            cfg: config providing batch sizes, ``num_workers`` and the
                settings used by the datasets.
            tokenizer: tokenizer handed to the datasets.
            train_df: rows used for training.
            valid_df: rows used for validation.
            text_col: name of the text column in both frames.
        """
        # The previous `super(JigsawDataModule).__init__()` created an unbound
        # super object and initialised *that*, so the LightningDataModule
        # initializer never ran for this instance.
        super().__init__()

        self.cfg = cfg
        self.text_col = text_col
        self.tokenizer = tokenizer
        self.train_df = train_df
        self.valid_df = valid_df

        # populated by setup()
        self.train_dataset = None
        self.val_dataset = None

    def setup(self, stage=None):
        """Build both datasets (``stage`` is not inspected)."""
        self.train_dataset = JigsawTrainDataset(
            cfg=self.cfg, df=self.train_df, tokenizer=self.tokenizer, text_col=self.text_col)
        self.val_dataset = JigsawTrainDataset(
            cfg=self.cfg, df=self.valid_df, tokenizer=self.tokenizer, text_col=self.text_col)
        
    def train_dataloader(self):
        """Shuffled loader over the training set; incomplete last batch is dropped."""
        train_dataloader = DataLoader(
            self.train_dataset, 
            batch_size=self.cfg.train_batch_size, 
            shuffle=True, 
            num_workers=self.cfg.num_workers, 
            pin_memory=True, 
            drop_last=True)
        
        return train_dataloader

    def val_dataloader(self):
        """Ordered loader over the validation set; every row is kept."""
        val_dataloader = DataLoader(
            self.val_dataset,
            batch_size=self.cfg.valid_batch_size,
            shuffle=False,
            num_workers=self.cfg.num_workers, 
            pin_memory=True, 
            drop_last=False)

        return val_dataloader

In [None]:
# =============================
# Model
# =============================
def get_optimizer(cfg, parameters):
    """Build the optimizer described by ``cfg.optimizer``.

    ``cfg.optimizer`` is a mapping with the keys "optimizer" ("AdamW" or
    "Adam"), "lr" and "weight_decay".

    Raises:
        NotImplementedError: for any other optimizer name.
    """
    opt = cfg.optimizer
    supported = (("AdamW", AdamW), ("Adam", Adam))

    for name, optimizer_cls in supported:
        if opt["optimizer"] == name:
            return optimizer_cls(
                parameters,
                lr=opt["lr"],
                weight_decay=opt["weight_decay"]
                )

    raise NotImplementedError


def get_scheduler(cfg, optimizer, num_train_steps):
    """Build the LR scheduler described by ``cfg.scheduler``.

    ``cfg.scheduler["scheduler"]`` selects one of
    "get_linear_schedule_with_warmup", "get_cosine_schedule_with_warmup",
    "MultiStepLR" or "CosineAnnealingWarmupRestarts"; the remaining keys of
    the mapping supply that scheduler's arguments.

    Raises:
        NotImplementedError: for any other scheduler name.
    """
    sch = cfg.scheduler
    kind = sch["scheduler"]

    if kind == "get_linear_schedule_with_warmup":
        return get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=sch["num_warmup_steps"],
            num_training_steps=num_train_steps)

    if kind == "get_cosine_schedule_with_warmup":
        return get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=sch["num_warmup_steps"],
            num_training_steps=num_train_steps,
            num_cycles=sch["num_cycles"]
            )

    if kind == "MultiStepLR":
        return MultiStepLR(
            optimizer,
            milestones=sch["milestones"],
            gamma=sch["gamma"]
        )

    if kind == "CosineAnnealingWarmupRestarts":
        # NOTE(review): the cycle length is (steps per epoch) * train_batch_size;
        # confirm the batch-size factor is intended for a step-based schedule.
        steps_per_epoch = num_train_steps // cfg.max_epochs
        first_cycle_steps = steps_per_epoch * cfg.train_batch_size
        print(first_cycle_steps)
        return CosineAnnealingWarmupRestarts(
            optimizer,
            first_cycle_steps=int(first_cycle_steps),
            cycle_mult=sch['T_mult'],
            max_lr=sch["max_lr"],
            min_lr=sch['min_lr'],
            warmup_steps=sch['warmup_steps'],
            gamma=sch['gamma']
        )

    raise NotImplementedError


class JigsawModel(pl.LightningModule):
    """Transformer backbone followed by one linear regression head.

    The head maps ``cfg.hidden_size`` features to ``len(cfg.target_cols)``
    outputs and is trained with mean squared error.
    """
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        # filled in by setup() / the training code
        self.total_steps = None
        self.dataset_size = None

        self.backborn = get_backborn(cfg)
        self.out = nn.Linear(cfg.hidden_size, len(cfg.target_cols))

    def forward(self, inputs):
        """Return the head output computed from the first sequence position."""
        # element 0 of the backbone output, indexed as (batch, position, feature)
        # - presumably the last hidden state; confirm against the backbone used
        sequence_output = self.backborn(**inputs)[0]
        first_position = sequence_output[:, 0, :]
        return self.out(first_position)

    def training_step(self, batch, batch_idx):
        inputs, targets = batch
        loss = self.loss(self.forward(inputs), targets)
        self.log("train_loss", loss, on_step=True, logger=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        inputs, targets = batch
        loss = self.loss(self.forward(inputs), targets)
        self.log("val_loss", loss, on_step=True, logger=True, prog_bar=True)
        return loss

    def loss(self, outputs, targets):
        """Mean squared error (no square root is taken)."""
        return nn.MSELoss()(outputs, targets)

    def setup(self, stage=None):
        """Compute ``self.total_steps`` when entering the "fit" stage."""
        if stage == "fit":
            # calculate total steps
            if self.dataset_size is None:
                dataset = self.trainer._data_connector._train_dataloader_source.dataloader()
                self.dataset_size = len(dataset)
            # (translated from Japanese) with gpus=-1 that setting ends up reflected here
            num_devices = max(1, self.trainer.num_gpus, self.trainer.num_processes)
            effective_batch_size = (
                self.cfg.train_batch_size * self.trainer.accumulate_grad_batches * num_devices)
            print(self.dataset_size, effective_batch_size)
            steps_per_epoch = self.dataset_size // effective_batch_size
            self.total_steps = steps_per_epoch * self.cfg.max_epochs

    def configure_optimizers(self):
        """Return the optimizer, plus the scheduler config when one is set."""
        optimizer = get_optimizer(self.cfg, parameters=self.parameters())

        if self.cfg.scheduler is None:
            return [optimizer]

        scheduler = get_scheduler(self.cfg, optimizer, num_train_steps=self.total_steps)
        scheduler_cfg = {"scheduler": scheduler, "interval": self.cfg.scheduler["interval"]}
        return [optimizer], [scheduler_cfg]

In [None]:
# =============================
# Metrics
# ============================= 
def get_validation_data_hat(cfg, tokenizer, filename, validation_data):
    """Attach model predictions to both sides of the validation pairs.

    Every distinct comment found in "less_toxic" or "more_toxic" is scored
    once - with all folds (``predict_cv``) when ``filename`` is None,
    otherwise with the single checkpoint ``filename`` - and the score is
    joined back as "less_toxic_preds" and "more_toxic_preds".

    Returns:
        A copy of ``validation_data`` with the two prediction columns added.
    """
    pairs = validation_data.copy()
    unique_texts = set(pairs["less_toxic"].unique()) | set(pairs["more_toxic"].unique())
    df = pd.DataFrame({"text": sorted(unique_texts)})

    if filename is None:
        preds = predict_cv(cfg, df, tokenizer, text_col="text")
    else:
        preds = predict(cfg, df, tokenizer, filename, text_col="text")

    # mean of targets when the model predicts several columns
    df["preds"] = np.mean(preds, axis=1) if np.ndim(preds) > 1 else preds.reshape(-1)

    sides = (("less_toxic", "less_toxic_preds"), ("more_toxic", "more_toxic_preds"))
    for text_side, pred_col in sides:
        joined = pd.merge(pairs, df, left_on=text_side, right_on="text", how="left")
        pairs = joined.rename(columns={"preds": pred_col}).drop("text", axis=1)

    return pairs


def get_score(validation_data_hat):
    """Return the fraction of rows where "more_toxic_preds" > "less_toxic_preds"."""
    ranked_correctly = (
        validation_data_hat["more_toxic_preds"] > validation_data_hat["less_toxic_preds"])
    return np.mean(ranked_correctly)

In [None]:
# =============================
# Train & Predict
# =============================
def class2dict(f):
    """Map every attribute name of ``f`` not starting with "__" to its value."""
    return {name: getattr(f, name) for name in dir(f) if not name.startswith('__')}


def train_fold(cfg, train_df, valid_df, tokenizer, filename, text_col):
    """Train one fold and keep the checkpoint with the lowest ``val_loss``.

    Metrics go to Weights & Biases; the checkpoint is written to
    ``cfg.EXP_MODEL`` under the name ``filename``. Nothing is returned.
    """
    # run name = last two "-"-separated parts of filename
    run_name = "_".join(filename.split("-")[-2:])
    wandblogger = pl.loggers.WandbLogger(
        project=cfg.competition,
        config=class2dict(cfg),
        group=f"{cfg.author}_{cfg.name}",
        name=run_name,
        job_type="train",
        reinit=True,
        anonymous=None,
        entity=cfg.wandb_entity
        )

    lightning_datamodule = JigsawDataModule(
        cfg=cfg,
        tokenizer=tokenizer,
        train_df=train_df,
        valid_df=valid_df,
        text_col=text_col
        )

    lightning_model = JigsawModel(cfg=cfg)
    # set up front so JigsawModel.setup does not query the trainer's dataloader
    # (original note: "cuz setup donot work?")
    lightning_model.dataset_size = len(train_df)

    callbacks = [
        pl.callbacks.ModelCheckpoint(
            dirpath=cfg.EXP_MODEL,
            filename=filename,
            save_top_k=1,
            verbose=True,
            monitor="val_loss",
            mode="min",
        ),
        pl.callbacks.LearningRateMonitor(logging_interval="step"),
    ]
    if cfg.early_stopping:
        callbacks.append(
            pl.callbacks.EarlyStopping(
                monitor="val_loss",
                min_delta=0.0,
                patience=8,
                mode='min',
            )
        )

    trainer = pl.Trainer(
        max_epochs=cfg.max_epochs,
        callbacks=callbacks,
        logger=[wandblogger],
        gradient_clip_val=cfg.gradient_clip_val,
        accumulate_grad_batches=cfg.accumulate_grad_batches,
        resume_from_checkpoint=cfg.resume_from_checkpoint,
        deterministic=False,
        gpus=-1,
        precision=16,
    )
    trainer.fit(lightning_model, datamodule=lightning_datamodule)

    # close the wandb run and release cached GPU memory before the next fold
    wandb.finish(quiet=True)
    torch.cuda.empty_cache()


def get_filname_listdir(dirctory):
    """List the entries of ``dirctory`` with their last extension removed."""
    return [os.path.splitext(entry)[0] for entry in os.listdir(dirctory)]


def train_cv(cfg, df, tokenizer, text_col=None, validation_data=None, get_oof=True):
    """cross validation & get oof

    For every fold listed in ``cfg.trn_fold``: train it unless a checkpoint
    with the fold's file name already exists in ``cfg.EXP_MODEL``, optionally
    log its score on ``validation_data``, and optionally collect its
    out-of-fold predictions.

    Args:
        cfg: experiment config.
        df: training frame carrying a "fold" column.
        tokenizer: tokenizer passed on to training/prediction.
        text_col: name of the text column in ``df``.
        validation_data: pairwise validation frame, or None to skip scoring.
        get_oof: when True, predict each fold's validation rows.

    Returns:
        When ``get_oof`` is True, a frame with one row per row of ``df`` and
        one column per target holding the out-of-fold predictions (rows of
        folds that were not processed stay 0); otherwise None.
    """
    oof_df = pd.DataFrame(np.zeros((len(df), len(cfg.target_cols))), columns=cfg.target_cols)

    for i_fold in range(cfg.n_fold):
        if i_fold not in cfg.trn_fold:
            continue

        filename = f"{cfg.name}-seed{cfg.seed}-fold{i_fold}"
        filelist = get_filname_listdir(cfg.EXP_MODEL)

        val_mask = (df["fold"] == i_fold).astype(bool)
        train_df = df[~val_mask].reset_index(drop=True)
        valid_df = df[val_mask].reset_index(drop=True)

        # an existing checkpoint means this fold was trained already
        if filename not in filelist:
            print(f"# --------- # Start Training Fold={i_fold} # --------- #")
            # training
            train_fold(
                cfg=cfg, 
                train_df=train_df, 
                valid_df=valid_df, 
                tokenizer=tokenizer, 
                filename=filename, 
                text_col=text_col
                )

        # get validation data score
        if validation_data is not None:
            validation_data_hat = get_validation_data_hat(cfg, tokenizer, filename, validation_data)
            val_score = get_score(validation_data_hat)
            log = f"{cfg.name}-seed{cfg.seed}-fold{i_fold}: validation data score={val_score:.4f}"
            cfg.logger.info(log)

        # get validation prediction
        if get_oof:
            preds = predict(
                cfg=cfg,
                df=valid_df, 
                tokenizer=tokenizer, 
                filename=filename, 
                text_col=text_col)

            # positional mask: oof_df has its own 0..n-1 index, which need not
            # match the index of df
            oof_df.loc[val_mask.values] = preds

    # Returning only here lets every fold run; the return used to sit inside
    # the loop and ended the function after the first processed fold.
    return oof_df if get_oof else None


def predict(cfg, df, tokenizer, filename, text_col):
    """Predict all targets for ``df[text_col]`` with one saved checkpoint.

    Args:
        cfg: config providing ``EXP_MODEL``, ``DEVICE``, ``valid_batch_size``,
            ``num_workers`` and ``target_cols``.
        df: frame holding the texts.
        tokenizer: tokenizer used by the dataset.
        filename: checkpoint name without the ".ckpt" extension.
        text_col: name of the text column.

    Returns:
        Array of shape ``(len(df), len(cfg.target_cols))`` in row order.
    """
    test_dataset = JigsawTestDataset(
        cfg=cfg, tokenizer=tokenizer, df=df, text_col=text_col)
    
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=cfg.valid_batch_size,
        shuffle=False,
        num_workers=cfg.num_workers, 
        pin_memory=True, 
        drop_last=False
        ) 
    
    lightning_model = JigsawModel(cfg=cfg).to(cfg.DEVICE).eval()
    checkpoint_path = os.path.join(cfg.EXP_MODEL, filename + ".ckpt") 
    # map_location="cpu" lets GPU-trained checkpoints load on CPU-only
    # machines; load_state_dict then copies the weights onto the model's
    # device. Only the state dict is kept from the checkpoint.
    state_dict = torch.load(checkpoint_path, map_location="cpu")['state_dict']
    lightning_model.load_state_dict(state_dict)
    del state_dict

    num_targets = len(cfg.target_cols)
    preds = np.zeros((len(df), num_targets))  # N * num targets
    fill_start_idx = 0

    for inputs in tqdm(test_dataloader,total=len(test_dataloader)):
        # get predicted labels by batch
        for k, v in inputs.items():
            inputs[k] = v.to(cfg.DEVICE)

        with torch.no_grad():
            pred = lightning_model(inputs)
            pred = pred.cpu().numpy()  # bs * num targets
        
        # batches arrive in order (shuffle=False), so fill consecutive rows
        fill_end_idx = pred.shape[0] + fill_start_idx  # bs + idx
        preds[fill_start_idx:fill_end_idx] = pred
        fill_start_idx = fill_end_idx
        
    
    del test_dataset, test_dataloader, lightning_model
    gc.collect()

    return preds


def predict_cv(cfg, df, tokenizer, text_col):
    """Average the predictions of every fold listed in ``cfg.trn_fold``.

    Each fold's checkpoint name is "<cfg.name>-seed<cfg.seed>-fold<i>".
    """
    fold_preds = [
        predict(cfg, df, tokenizer, f"{cfg.name}-seed{cfg.seed}-fold{i_fold}", text_col)
        for i_fold in range(cfg.n_fold)
        if i_fold in cfg.trn_fold
    ]
    return np.mean(fold_preds, axis=0)  # fold mean

In [None]:
# =============================
# Load Model
# =============================
def get_tokenizer(cfg):
    """Load the tokenizer, caching it under ``<cfg.EXP_MODEL>/Pretrain``.

    When "tokenizer_config.json" already exists in that directory the
    tokenizer is loaded from there; otherwise it is loaded by
    ``cfg.model_name`` and saved into the directory.
    """
    pretrained_dir = os.path.join(cfg.EXP_MODEL, "Pretrain")
    marker_file = os.path.join(pretrained_dir, "tokenizer_config.json")  # tokenizer.json??

    if os.path.isfile(marker_file):
        return AutoTokenizer.from_pretrained(pretrained_dir)

    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
    tokenizer.save_pretrained(pretrained_dir)
    return tokenizer


def get_backborn(cfg):
    """Create the transformer backbone with both dropout probabilities set to 0.

    The first call (no "pytorch_model.bin" under ``<cfg.EXP_MODEL>/Pretrain``)
    loads the weights by ``cfg.model_name`` and saves them into that
    directory. Later calls read the config from the directory and either load
    the saved weights (``cfg.use_pretrain_model``) or build the architecture
    from the config alone.
    """
    pretrained_dir = os.path.join(cfg.EXP_MODEL, "Pretrain")
    cached = os.path.isfile(os.path.join(pretrained_dir, "pytorch_model.bin"))

    model_config = AutoConfig.from_pretrained(pretrained_dir if cached else cfg.model_name)

    # No dropout
    model_config.attention_probs_dropout_prob = 0.0
    model_config.hidden_dropout_prob = 0.0

    if not cached:
        backborn = AutoModel.from_pretrained(cfg.model_name, config=model_config)
        backborn.save_pretrained(pretrained_dir)
        return backborn

    if cfg.use_pretrain_model:
        return AutoModel.from_pretrained(pretrained_dir, config=model_config)

    # pretrained weights are not needed at inference time: cfg.use_pretrain_model=False
    return AutoModel.from_config(model_config)

In [None]:
# =============================
# Create Data
# =============================
def read_csv(filepath, **kwargs):
    """Read a CSV file, falling back to ``<filepath>.zip`` when it is missing.

    When ``filepath`` is a directory, the file inside it that carries the
    directory's own name is read instead.

    Args:
        filepath: path of the CSV file (or of a directory as described above).
        **kwargs: forwarded unchanged to ``pandas.read_csv``.

    Returns:
        The loaded DataFrame.

    Raises:
        FileNotFoundError: when neither the file nor its ".zip" variant exists.
        Any other ``pandas.read_csv`` error is raised as-is.
    """
    if os.path.isdir(filepath):
        filename = filepath.split("/")[-1]
        filepath = os.path.join(filepath, filename)

    try:
        return pd.read_csv(filepath, **kwargs)
    except FileNotFoundError:
        # Only a missing file triggers the fallback. Parse errors, bad
        # keyword arguments or KeyboardInterrupt are no longer swallowed and
        # then masked by a misleading "…​.zip not found" error.
        return pd.read_csv(filepath + ".zip", **kwargs)


def text_cleaning(text):
    '''
    ref) # https://www.kaggle.com/manabendrarout/pytorch-roberta-ranking-baseline-jrstc-train

    Normalise a raw comment. Steps, in order:
    1. Remove URLs (http(s)://... and www....)
    2. Strip HTML markup with BeautifulSoup (lxml parser)
    3. Remove characters matched by the emoji pattern
    4. Collapse runs of spaces into one space
    5. Remove IPv4-looking sequences
    6. Replace "!" with a space
    7. Apply a fixed list of literal replacements (newlines, quotes, "|", "=",
       a few obfuscated swear words, " u " -> " you ")
    8. Strip leading/trailing whitespace

    Non-alphanumeric characters are otherwise kept.

    text - Text piece to be cleaned.
    '''
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # website links

    text = BeautifulSoup(text, 'lxml').get_text()  # HTML tags

    # NOTE(review): the last range spans U+24C2..U+1F251, which covers far more
    # than emoji — confirm that this is intended.
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    text = re.sub(' +', ' ', text)  # extra spaces
    # Raw string: "\d" and "\." are invalid escapes in a normal string literal.
    text = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', '', text)  # IP address
    text = text.replace('!', ' ')  # exclamation marks ("bikkuri")

    # Literal replacements, applied in this exact order.  " u " / " U " appear
    # twice on purpose: str.replace does not match overlapping occurrences, so
    # a second pass is needed for cases such as " u u ".
    replacements = (
        ('\n', ''),
        ("'", ""),
        ("|", ""),
        ("=", ""),
        ("F**K", "FUCK"),
        ("F__K", "FUCK"),
        ("f**k", "fuck"),
        ("f__k", "fuck"),
        ("f*ck", "fuck"),
        ("S$X", "SEX"),
        ("s$x", "sex"),
        (" u ", " you "),
        (" u ", " you "),
        (" U ", " you "),
        (" U ", " you "),
        ("YOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOUUUUUUUUUUUUUUUUUUUU", "YOU"),
    )
    for old, new in replacements:
        text = text.replace(old, new)

    return text.strip()  # remove spaces at the beginning and at the end


def text_normalization(s:pd.Series):
    """Apply ``text_cleaning`` to every element of ``s`` and return the result."""
    return s.apply(text_cleaning)


def get_jigsaw_01_dataset(cfg):
    """
    jigsaw-toxic-comment-classification-challenge
    - text_col : "comment_text"
    - target_cols : ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

    Reads train.csv, test.csv and test_labels.csv from ``cfg.INPUT_JIGSAW_01``,
    keeps only the test rows whose "toxic" label is not -1, joins them with
    their labels on "id", and appends them to the training rows.
    """
    train_df, test_df, test_labels = (
        read_csv(os.path.join(cfg.INPUT_JIGSAW_01, name))
        for name in ("train.csv", "test.csv", "test_labels.csv")
    )

    # The same boolean mask is applied to both test frames, so this relies on
    # them sharing row order/index — presumably true for the raw files; verify.
    is_scored = test_labels["toxic"] != -1
    labelled_test = pd.merge(
        test_df[is_scored], test_labels[is_scored], on="id", how="left"
    )

    return pd.concat([train_df, labelled_test], axis=0).reset_index(drop=True)


def get_jigsaw_02_dataset(cfg, cat_threshold=0.5):
    """
    jigsaw-unintended-bias-in-toxicity-classification
    - text_col : "comment_text"
    - target_cols : ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

    Individual annotations are averaged per comment id.  When ``cat_threshold``
    is not None the averages are binarised (1 if >= threshold, else 0).
    Comments with missing values after the join are dropped,
    "identity_attack" is renamed to "identity_hate", and the
    "sexual_explicit" and "worker" columns are removed.
    """
    texts = read_csv(
        os.path.join(cfg.INPUT_JIGSAW_02, "all_data.csv"),
        usecols=["id", "comment_text"],
    )
    annotations = read_csv(
        os.path.join(cfg.INPUT_JIGSAW_02, "toxicity_individual_annotations.csv")
    )
    labels = annotations.groupby(["id"]).agg("mean")

    if cat_threshold is not None:
        binarised = np.where(labels >= cat_threshold, 1, 0)
        labels = pd.DataFrame(binarised, index=labels.index, columns=labels.columns)

    merged = pd.merge(texts, labels, on="id", how="left")
    merged = merged.dropna(axis=0).reset_index(drop=True)
    merged = merged.rename(columns={"identity_attack": "identity_hate"})
    return merged.drop(["sexual_explicit", "worker"], axis=1)


def get_ruddit_dataset(cfg):
    """
    Ruddit Dataset
    - text_col : "comment_text"
    - target_cols : "offensiveness_score"

    Rows whose text is exactly "[deleted]" or "[removed]" are dropped; the
    "txt" column is replaced by "comment_text" with missing text filled with
    the string "none".
    """
    csv_path = os.path.join(cfg.INPUT_RUDDIT, "Dataset", "ruddit_with_text.csv")
    ruddit_df = read_csv(csv_path)

    is_placeholder = ruddit_df["txt"].isin(["[deleted]", "[removed]"])
    kept = ruddit_df[~is_placeholder].reset_index(drop=True)

    kept["comment_text"] = kept["txt"].fillna("none")
    return kept.drop("txt", axis=1)


def get_fold_idx(cfg, df):
    """Assign every row of ``df`` to a validation fold.

    A shuffled ``KFold`` with ``cfg.n_fold`` splits and ``cfg.seed`` is used.
    The fold number is written to a new/overwritten "fold" column.

    Args:
        cfg: Object providing ``target_cols``, ``n_fold`` and ``seed``.
        df: Frame to annotate; it is modified in place.

    Returns:
        The same ``df`` object, now carrying the integer "fold" column.
    """
    # Passed to split() for API symmetry only; plain KFold does not use y.
    y = df[cfg.target_cols].sum(axis=1)
    cv_strategy = KFold(n_splits=cfg.n_fold, shuffle=True, random_state=cfg.seed)

    # split() yields POSITIONAL indices, so collect them in a positional array
    # rather than writing through label-based .loc (which is wrong whenever
    # df does not have a default 0..n-1 index).
    fold_of_row = np.full(len(df), -1, dtype=np.int64)
    for i_fold, (_, va_idx) in enumerate(cv_strategy.split(X=df, y=y)):
        fold_of_row[va_idx] = i_fold

    df["fold"] = fold_of_row
    return df


def get_custom_jigsaw_dataset(cfg, train_data, validation_data):
    """
    ref) https://www.kaggle.com/toru59er/0-866-tfidf-ridge-simple-baseline

    Build a balanced training frame:

    1. A row is "toxic" when the sum of ``cfg.target_cols`` is > 0.
    2. Non-toxic rows are undersampled (seed ``cfg.seed``) to the number of
       toxic rows; if fewer non-toxic rows exist, all of them are kept.
    3. Rows whose "comment_text" also appears in the "less_toxic" or
       "more_toxic" columns of ``validation_data`` are removed.

    Args:
        cfg: Object providing ``target_cols`` and ``seed``.
        train_data: Frame with "comment_text" and the target columns.
            It is NOT modified.
        validation_data: Frame with "less_toxic" and "more_toxic" columns.

    Returns:
        A new frame (toxic rows first, then the sampled non-toxic rows) with a
        fresh 0..n-1 index and the same columns as ``train_data``.
    """
    # Keep the score in a separate Series so the caller's frame is untouched.
    toxic_score = train_data[cfg.target_cols].sum(axis=1)
    toxic_mask = toxic_score > 0

    # undersample
    non_toxic = train_data[toxic_score == 0]
    n_sample = min(int(toxic_mask.sum()), len(non_toxic))
    sampled_data = non_toxic.sample(n=n_sample, random_state=cfg.seed)
    balanced = pd.concat([train_data[toxic_mask], sampled_data]).reset_index(drop=True)

    # Drop comments that also occur in the validation pairs.
    val_comments = set(validation_data['less_toxic']) | set(validation_data['more_toxic'])
    is_duplicate = balanced['comment_text'].isin(val_comments)
    return balanced[~is_duplicate].reset_index(drop=True)

In [None]:
# Driver for the EXP029 cell: load the competition files, optionally train and
# validate (skipped when Config.inference_only is true), then score
# comments_to_score with predict_cv and store the result in `msttweet`.
print("# ------------------ # Load Data # ------------------ #")

# load tokenizer
tokenizer = get_tokenizer(Config)

comments_to_score = read_csv(os.path.join(Config.INPUT_JIGSAW_04 , "comments_to_score.csv"))

# NOTE(review): 7537 is presumably the row count of the public
# comments_to_score.csv; when it matches, only the first 100 rows are scored.
# Confirm this shortcut is intended for non-submission runs only.
if len(comments_to_score) == 7537:
    comments_to_score = comments_to_score.iloc[:100]

# comments_to_score["text"] = text_normalization(comments_to_score["text"])
sample_submission = read_csv(os.path.join(Config.INPUT_JIGSAW_04 , "sample_submission.csv"))

if not Config.inference_only:

    # load validation data
    validation_data = read_csv(os.path.join(Config.INPUT_JIGSAW_04 , "validation_data.csv"))

    # load train data
    train_data = read_csv("../input/jigsaw-pseudo-toxictweetsdataset/PseudoLabelDataset.csv")
    # adds the "fold" column used for cross-validation
    train_data = get_fold_idx(cfg=Config, df=train_data)

#     train_data["comment_text"] = text_normalization(train_data["comment_text"])
#     validation_data["less_toxic"] = text_normalization(validation_data["less_toxic"])
#     validation_data["more_toxic"] = text_normalization(validation_data["more_toxic"])

    print("# ------------------ # Training # ------------------ #")
    # training
    train_cv(
        cfg=Config, 
        df=train_data, 
        tokenizer=tokenizer, 
        text_col="tweet",  # comment_text
        validation_data=validation_data, 
        get_oof=False)

    print("# ------------------ # Validation # ------------------ #")
    # validation: predictions are written to <Config.EXP_PREDS>/validation_data.csv
    # and the resulting score is logged
    validation_data_hat = get_validation_data_hat(
        cfg=Config, 
        tokenizer=tokenizer, 
        filename=None, 
        validation_data=validation_data
        )
    filepath = os.path.join(Config.EXP_PREDS, "validation_data.csv")
    validation_data_hat.to_csv(filepath, index=False)
    score = get_score(validation_data_hat)
    Config.logger.info(f"validation score = {score:.4f}")

print("# ------------------ # Inference # ------------------ #")
preds = predict_cv(
    cfg=Config, 
    df=comments_to_score, 
    tokenizer=tokenizer, 
    text_col="text")

print(preds.shape)
# Reduce multi-column predictions to one value per row by averaging over axis 1;
# one-dimensional predictions are used unchanged.
if np.ndim(preds) > 1:
    msttweet = np.mean(preds, axis=1)  # mean of target
else:
    msttweet = preds

# COLUM2131

In [None]:
import os
import gc
import copy
import time
import random
import string
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from glob import glob
from tqdm.notebook import tqdm

from collections import defaultdict
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AutoConfig, AdamW

def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    switches cuDNN to deterministic mode with benchmarking disabled, and sets
    the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # also covers multi-GPU machines
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Only affects interpreters started after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)

In [None]:
class JigsawDataset(Dataset):
    """Inference dataset that tokenises the "text" column one row at a time.

    Each item is a dict with two ``torch.long`` tensors:
    ``'ids'`` (the tokenizer's ``input_ids``) and ``'mask'`` (its
    ``attention_mask``), both padded/truncated to ``max_length``.
    """

    # (key in the returned item, key in the tokenizer output)
    _FIELDS = (('ids', 'input_ids'), ('mask', 'attention_mask'))

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        self.text = df['text'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        encoded = self.tokenizer.encode_plus(
            self.text[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length'
        )
        return {
            out_key: torch.tensor(encoded[src_key], dtype=torch.long)
            for out_key, src_key in self._FIELDS
        }


@torch.no_grad()
def valid_fn(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and return all predictions.

    Args:
        model: Module called as ``model(ids, mask)``; put into eval mode here.
        dataloader: Yields dicts with ``'ids'`` and ``'mask'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        1-D ``numpy.ndarray`` with the flattened outputs of every batch,
        concatenated in iteration order.
    """
    model.eval()

    batch_preds = []
    for data in tqdm(dataloader, total=len(dataloader)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)

        outputs = model(ids, mask)
        batch_preds.append(outputs.view(-1).cpu().detach().numpy())

    preds = np.concatenate(batch_preds)
    gc.collect()

    return preds


def inference(model_paths, dataloader, device):
    """Average the predictions of several fold checkpoints.

    For each path a fresh ``JigsawModel(CONFIG['model_name'])`` is built,
    moved to ``device``, loaded with the checkpoint's state dict and run
    through ``valid_fn``.

    Args:
        model_paths: Sequence of checkpoint file paths (one per fold).
        dataloader: Loader passed to ``valid_fn`` for every model.
        device: Device used for the model, the checkpoint tensors and the
            batches.

    Returns:
        ``numpy.ndarray`` with the element-wise mean over all models.

    Raises:
        ValueError: ``model_paths`` is empty.
    """
    if len(model_paths) == 0:
        raise ValueError("model_paths must contain at least one checkpoint path")

    fold_preds = []
    for i, path in enumerate(model_paths):
        model = JigsawModel(CONFIG['model_name'])
        model.to(device)
        # map_location lets GPU-saved checkpoints load on whichever device is in use.
        model.load_state_dict(torch.load(path, map_location=device))

        print(f"Getting predictions for model {i+1}")
        fold_preds.append(valid_fn(model, dataloader, device))

        # Free this fold's weights before the next model is built.
        del model
        gc.collect()

    return np.mean(np.array(fold_preds), axis=0)

## COLUM014

In [None]:
###############
# CONFIG
###############

# Inference settings for the COLUM014 (roberta-base) fold ensemble.
CONFIG = dict(
    seed=42,
    model_name='../input/jigsaw-roberta-base/model',
    test_batch_size=64,
    max_length=128,
    num_classes=1,
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
)

# The tokenizer is loaded from the same directory as the backbone.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])
set_seed(CONFIG['seed'])


MAIN_PATH = '../input/jigsaw-exp014-roberta-base'

# One checkpoint per fold.  MAIN_PATH already contains the '../input/' prefix,
# so it must not be prepended a second time.
MODEL_PATHS = [f'{MAIN_PATH}/Loss-Fold-{fold}.bin' for fold in range(5)]


###############
# MODEL
###############

class JigsawModel(nn.Module):
    """Transformer backbone followed by one linear output layer.

    The backbone is loaded from ``model_name`` with both dropout
    probabilities set to 0.  The linear layer maps the last hidden state at
    position 0 of the sequence axis (presumably the CLS/<s> token) to
    ``CONFIG['num_classes']`` outputs.
    """

    def __init__(self, model_name):
        super().__init__()

        config = AutoConfig.from_pretrained(model_name)
        config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "attention_probs_dropout_prob": 0.0,
        })
        self.model = AutoModel.from_pretrained(model_name, config=config)
        # Head width follows the backbone instead of a hard-coded constant.
        self.linear = nn.Linear(config.hidden_size, CONFIG['num_classes'])

    def forward(self, ids, mask):
        """Return the head output for token ids ``ids`` and attention ``mask``."""
        out = self.model(
            input_ids=ids,
            attention_mask=mask,
        )
        outputs = self.linear(out.last_hidden_state[:, 0, :])
        return outputs


###############
# INFERENCE
###############

# Score the competition comments with the COLUM014 fold ensemble -> `colum014`.
comments = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
# NOTE(review): 7537 is presumably the size of the public file; in that case
# only the first 100 rows are scored — confirm.
comments = comments.iloc[:100] if len(comments) == 7537 else comments

test_dataset = JigsawDataset(comments, CONFIG['tokenizer'], max_length=CONFIG['max_length'])
test_loader = DataLoader(
    test_dataset,
    shuffle=False,  # keep row order aligned with `comments`
    batch_size=CONFIG['test_batch_size'],
    pin_memory=True,
    num_workers=2,
)

colum014 = inference(MODEL_PATHS, test_loader, CONFIG['device'])

# Release the inputs before the next experiment cell runs.
del comments, test_dataset, test_loader
gc.collect()

## COLUM015

In [None]:
###############
# CONFIG
###############

# Inference settings for the COLUM015 (roberta-large) fold ensemble.
CONFIG = dict(
    seed=42,
    model_name='../input/jigsaw-roberta-large/model',
    test_batch_size=32,
    max_length=128,
    num_classes=1,
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
)

# The tokenizer is loaded from the same directory as the backbone.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])
set_seed(CONFIG['seed'])


MAIN_PATH = '../input/jigsaw-exp015-roberta-large'

# One checkpoint per fold.  MAIN_PATH already contains the '../input/' prefix,
# so it must not be prepended a second time.
MODEL_PATHS = [f'{MAIN_PATH}/Loss-Fold-{fold}.bin' for fold in range(5)]


###############
# MODEL
###############

class JigsawModel(nn.Module):
    """Transformer backbone followed by one linear output layer.

    The backbone is loaded from ``model_name`` with both dropout
    probabilities set to 0.  The linear layer maps the last hidden state at
    position 0 of the sequence axis (presumably the CLS/<s> token) to
    ``CONFIG['num_classes']`` outputs.
    """

    def __init__(self, model_name):
        super().__init__()

        config = AutoConfig.from_pretrained(model_name)
        config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "attention_probs_dropout_prob": 0.0,
        })
        self.model = AutoModel.from_pretrained(model_name, config=config)
        # Head width follows the backbone instead of a hard-coded constant.
        self.linear = nn.Linear(config.hidden_size, CONFIG['num_classes'])

    def forward(self, ids, mask):
        """Return the head output for token ids ``ids`` and attention ``mask``."""
        out = self.model(
            input_ids=ids,
            attention_mask=mask,
        )
        outputs = self.linear(out.last_hidden_state[:, 0, :])
        return outputs

    
###############
# INFERENCE
###############

# Score the competition comments with the COLUM015 fold ensemble -> `colum015`.
comments = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
# NOTE(review): 7537 is presumably the size of the public file; in that case
# only the first 100 rows are scored — confirm.
comments = comments.iloc[:100] if len(comments) == 7537 else comments

test_dataset = JigsawDataset(comments, CONFIG['tokenizer'], max_length=CONFIG['max_length'])
test_loader = DataLoader(
    test_dataset,
    shuffle=False,  # keep row order aligned with `comments`
    batch_size=CONFIG['test_batch_size'],
    pin_memory=True,
    num_workers=2,
)

colum015 = inference(MODEL_PATHS, test_loader, CONFIG['device'])

# Release the inputs before the next experiment cell runs.
del comments, test_dataset, test_loader
gc.collect()

## COLUM016

In [None]:
###############
# CONFIG
###############

# Inference settings for the COLUM016 (deberta-v3-base) fold ensemble.
CONFIG = dict(
    seed=42,
    model_name='../input/jigsaw-deberta-v3-base/model',
    test_batch_size=32,
    max_length=128,
    num_classes=1,
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
)

# The tokenizer is loaded from the same directory as the backbone.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])
set_seed(CONFIG['seed'])


MAIN_PATH = '../input/jigsaw-exp016-deberta-v3-base'

# One checkpoint per fold.  MAIN_PATH already contains the '../input/' prefix,
# so it must not be prepended a second time.
MODEL_PATHS = [f'{MAIN_PATH}/Loss-Fold-{fold}.bin' for fold in range(5)]


###############
# MODEL
###############

class JigsawModel(nn.Module):
    """Transformer backbone followed by one linear output layer.

    The backbone is loaded from ``model_name`` with both dropout
    probabilities set to 0.  The linear layer maps the last hidden state at
    position 0 of the sequence axis (presumably the CLS token) to
    ``CONFIG['num_classes']`` outputs.
    """

    def __init__(self, model_name):
        super().__init__()

        config = AutoConfig.from_pretrained(model_name)
        config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "attention_probs_dropout_prob": 0.0,
        })
        self.model = AutoModel.from_pretrained(model_name, config=config)
        # Head width follows the backbone instead of a hard-coded constant.
        self.linear = nn.Linear(config.hidden_size, CONFIG['num_classes'])

    def forward(self, ids, mask):
        """Return the head output for token ids ``ids`` and attention ``mask``."""
        out = self.model(
            input_ids=ids,
            attention_mask=mask,
        )
        outputs = self.linear(out.last_hidden_state[:, 0, :])
        return outputs


###############
# INFERENCE
###############

# Score the competition comments with the COLUM016 fold ensemble -> `colum016`.
comments = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
# NOTE(review): 7537 is presumably the size of the public file; in that case
# only the first 100 rows are scored — confirm.
comments = comments.iloc[:100] if len(comments) == 7537 else comments

test_dataset = JigsawDataset(comments, CONFIG['tokenizer'], max_length=CONFIG['max_length'])
test_loader = DataLoader(
    test_dataset,
    shuffle=False,  # keep row order aligned with `comments`
    batch_size=CONFIG['test_batch_size'],
    pin_memory=True,
    num_workers=2,
)

colum016 = inference(MODEL_PATHS, test_loader, CONFIG['device'])

# Release the inputs before the next experiment cell runs.
del comments, test_dataset, test_loader
gc.collect()

## COLUM018

In [None]:
###############
# CONFIG
###############

# Inference settings for the COLUM018 (unbiased-toxic-roberta) fold ensemble.
CONFIG = dict(
    seed=42,
    model_name='../input/jigsaw-unbiased-toxic-roberta/model',
    test_batch_size=64,
    max_length=128,
    num_classes=1,
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
)

# The tokenizer is loaded from the same directory as the backbone.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])
set_seed(CONFIG['seed'])


MAIN_PATH = '../input/jigsaw-exp018-unbiased-toxic-roberta'

# One checkpoint per fold.  MAIN_PATH already contains the '../input/' prefix,
# so it must not be prepended a second time.
MODEL_PATHS = [f'{MAIN_PATH}/Loss-Fold-{fold}.bin' for fold in range(5)]


###############
# MODEL
###############

class JigsawModel(nn.Module):
    """Transformer backbone followed by one linear output layer.

    The backbone is loaded from ``model_name`` with both dropout
    probabilities set to 0.  The linear layer maps the last hidden state at
    position 0 of the sequence axis (presumably the CLS/<s> token) to
    ``CONFIG['num_classes']`` outputs.
    """

    def __init__(self, model_name):
        super().__init__()

        config = AutoConfig.from_pretrained(model_name)
        config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "attention_probs_dropout_prob": 0.0,
        })
        self.model = AutoModel.from_pretrained(model_name, config=config)
        # Head width follows the backbone instead of a hard-coded constant.
        self.linear = nn.Linear(config.hidden_size, CONFIG['num_classes'])

    def forward(self, ids, mask):
        """Return the head output for token ids ``ids`` and attention ``mask``."""
        out = self.model(
            input_ids=ids,
            attention_mask=mask,
        )
        outputs = self.linear(out.last_hidden_state[:, 0, :])
        return outputs


###############
# INFERENCE
###############

# Score the competition comments with the COLUM018 fold ensemble -> `colum018`.
comments = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
# NOTE(review): 7537 is presumably the size of the public file; in that case
# only the first 100 rows are scored — confirm.
comments = comments.iloc[:100] if len(comments) == 7537 else comments

test_dataset = JigsawDataset(comments, CONFIG['tokenizer'], max_length=CONFIG['max_length'])
test_loader = DataLoader(
    test_dataset,
    shuffle=False,  # keep row order aligned with `comments`
    batch_size=CONFIG['test_batch_size'],
    pin_memory=True,
    num_workers=2,
)

colum018 = inference(MODEL_PATHS, test_loader, CONFIG['device'])

# Release the inputs before the next experiment cell runs.
del comments, test_dataset, test_loader
gc.collect()

## COLUM019

In [None]:
###############
# CONFIG
###############

# Inference settings for the COLUM019 (multilingual toxic-xlm-roberta) fold ensemble.
CONFIG = dict(
    seed=42,
    model_name='../input/jigsaw-multilingual-toxic-xlm-roberta/model',
    test_batch_size=64,
    max_length=128,
    num_classes=1,
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
)

# The tokenizer is loaded from the same directory as the backbone.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])
set_seed(CONFIG['seed'])

MAIN_PATH = '../input/jigsaw-exp019-toxic-xlm-roberta'
# One checkpoint per fold.  MAIN_PATH already contains the '../input/' prefix,
# so it must not be prepended a second time.
MODEL_PATHS = [f'{MAIN_PATH}/Loss-Fold-{fold}.bin' for fold in range(5)]


###############
# MODEL
###############

class JigsawModel(nn.Module):
    """Transformer backbone followed by one linear output layer.

    The backbone is loaded from ``model_name`` with both dropout
    probabilities set to 0.  The linear layer maps the last hidden state at
    position 0 of the sequence axis (presumably the CLS/<s> token) to
    ``CONFIG['num_classes']`` outputs.
    """

    def __init__(self, model_name):
        super().__init__()

        config = AutoConfig.from_pretrained(model_name)
        config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "attention_probs_dropout_prob": 0.0,
        })
        self.model = AutoModel.from_pretrained(model_name, config=config)
        # Head width follows the backbone instead of a hard-coded constant.
        self.linear = nn.Linear(config.hidden_size, CONFIG['num_classes'])

    def forward(self, ids, mask):
        """Return the head output for token ids ``ids`` and attention ``mask``."""
        out = self.model(
            input_ids=ids,
            attention_mask=mask,
        )
        outputs = self.linear(out.last_hidden_state[:, 0, :])
        return outputs


###############
# INFERENCE
###############

# Score the competition comments with the COLUM019 fold ensemble -> `colum019`.
comments = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
# NOTE(review): 7537 is presumably the size of the public file; in that case
# only the first 100 rows are scored — confirm.
comments = comments.iloc[:100] if len(comments) == 7537 else comments

test_dataset = JigsawDataset(comments, CONFIG['tokenizer'], max_length=CONFIG['max_length'])
test_loader = DataLoader(
    test_dataset,
    shuffle=False,  # keep row order aligned with `comments`
    batch_size=CONFIG['test_batch_size'],
    pin_memory=True,
    num_workers=2,
)

colum019 = inference(MODEL_PATHS, test_loader, CONFIG['device'])

# Release the inputs before the next experiment cell runs.
del comments, test_dataset, test_loader
gc.collect()

## COLUM020

In [None]:
###############
# CONFIG
###############

# Inference settings for the COLUM020 (toxic-bert) fold ensemble.
CONFIG = dict(
    seed=42,
    model_name='../input/jigsaw-toxic-bert/model',
    test_batch_size=64,
    max_length=128,
    num_classes=1,
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
)

# The tokenizer is loaded from the same directory as the backbone.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])
set_seed(CONFIG['seed'])

MAIN_PATH = '../input/jigsaw-exp020-toxic-bert'
# One checkpoint per fold.  MAIN_PATH already contains the '../input/' prefix,
# so it must not be prepended a second time.
MODEL_PATHS = [f'{MAIN_PATH}/Loss-Fold-{fold}.bin' for fold in range(5)]


###############
# MODEL
###############

class JigsawModel(nn.Module):
    """Transformer backbone followed by one linear output layer.

    The backbone is loaded from ``model_name`` with both dropout
    probabilities set to 0.  The linear layer maps the last hidden state at
    position 0 of the sequence axis (presumably the CLS token) to
    ``CONFIG['num_classes']`` outputs.
    """

    def __init__(self, model_name):
        super().__init__()

        config = AutoConfig.from_pretrained(model_name)
        config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "attention_probs_dropout_prob": 0.0,
        })
        self.model = AutoModel.from_pretrained(model_name, config=config)
        # Head width follows the backbone instead of a hard-coded constant.
        self.linear = nn.Linear(config.hidden_size, CONFIG['num_classes'])

    def forward(self, ids, mask):
        """Return the head output for token ids ``ids`` and attention ``mask``."""
        out = self.model(
            input_ids=ids,
            attention_mask=mask,
        )
        outputs = self.linear(out.last_hidden_state[:, 0, :])
        return outputs


###############
# INFERENCE
###############

# Score the competition comments with the COLUM020 fold ensemble -> `colum020`.
comments = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
# NOTE(review): 7537 is presumably the size of the public file; in that case
# only the first 100 rows are scored — confirm.
comments = comments.iloc[:100] if len(comments) == 7537 else comments

test_dataset = JigsawDataset(comments, CONFIG['tokenizer'], max_length=CONFIG['max_length'])
test_loader = DataLoader(
    test_dataset,
    shuffle=False,  # keep row order aligned with `comments`
    batch_size=CONFIG['test_batch_size'],
    pin_memory=True,
    num_workers=2,
)

colum020 = inference(MODEL_PATHS, test_loader, CONFIG['device'])

# Release the inputs before the next experiment cell runs.
del comments, test_dataset, test_loader
gc.collect()

# CALPIS10000

## CALPIS001

In [None]:
# ----------------------------------------------
# Load Libraries
# ----------------------------------------------
import os
import math
import random
import time
import pathlib
from pathlib import Path
import sys

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AdamW
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import BertForSequenceClassification, BertConfig, BertModel
from transformers import get_cosine_schedule_with_warmup

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split

import gc
gc.enable()


# ----------------------------------------------
# Set Config
# ----------------------------------------------
class Config:
    """Settings for the CALPIS001 cell (roberta-large, inference only)."""
    # Input directories: this competition and the two earlier Jigsaw competitions.
    INPUT_DIR_0 = Path('../input/jigsaw-toxic-severity-rating/')
    INPUT_DIR_1 = Path('../input/jigsaw-toxic-comment-classification-challenge/')
    INPUT_DIR_2 = Path('../input/jigsaw-unintended-bias-in-toxicity-classification/')
    SEED = 2021
    # Number of fold models; also the first dimension of the prediction buffer.
    FOLDS = 5
    BATCH_SIZE = 32
    BATCH_SIZE_PRED = 512
    NUM_EPOCHS = 3
    # Token length used for padding/truncation in the dataset class.
    MAX_LEN = 128
    # NOTE(review): presumably a typo for LEARNING_RATE; the name is kept
    # because code outside this view may reference it.
    LEANING_RATE = 1e-5
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # fall back to the CPU when CUDA is unavailable
    DEBUG = False
    TRAIN = False
    RUN_VALID = False
    # Local directory of the pretrained backbone; the tokenizer is loaded from the same place.
    PRETRAINED = '../input/roberta-transformers-pytorch/roberta-large'
    TOKENIZER = PRETRAINED

# Environment switch read by the HuggingFace tokenizers library.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# ----------------------------------------------
# Load Data
# ----------------------------------------------
# INPUT_0: This Competition
submission = pd.read_csv(Config.INPUT_DIR_0/'sample_submission.csv')
val_data = pd.read_csv(Config.INPUT_DIR_0/'validation_data.csv')
test = pd.read_csv(Config.INPUT_DIR_0/'comments_to_score.csv')

# NOTE(review): 7537 is presumably the row count of the public
# comments_to_score.csv; in that case only the first 100 rows are kept — confirm.
if len(test) == 7537:
    test = test.iloc[:100]

print('load data: this competition')

# INPUT_1: 1st Competition
#train_1st = pd.read_csv(INPUT_DIR_1/'train.csv')
#test_1st = pd.read_csv(INPUT_DIR_1/'test.csv')
#test_labels_1st = pd.read_csv(INPUT_DIR_1/'test_labels.csv')
#print('load data: 1st competition')

# INPUT_2: 2nd Competition
#train_2nd = pd.read_csv(INPUT_DIR_2/'train.csv')
#test_2nd = pd.read_csv(INPUT_DIR_2/'test.csv')
#idt_indiv_anno = pd.read_csv(INPUT_DIR_2/'identity_individual_annotations.csv')
#tox_indiv_anno = pd.read_csv(INPUT_DIR_2/'toxicity_individual_annotations.csv')
#print('load data: 2nd competition')


# ----------------------------------------------
# Set SEED
# ----------------------------------------------
# seed
def set_seed(SEED):
    """Seed Python, NumPy and PyTorch random number generators with ``SEED``.

    Also sets ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        SEED: Integer seed.  The argument itself is used (not the global
            ``Config.SEED``), so the function can be called with any value.
    """
    random.seed(SEED)
    np.random.seed(SEED)
    # Only affects interpreters started after this point.
    os.environ['PYTHONHASHSEED'] = str(SEED)

    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    
# Seed all random number generators before any model or data object is created.
set_seed(Config.SEED)


# ----------------------------------------------
# Create Tokenizer
# ----------------------------------------------
# Module-level tokenizer; Jigsaw1stDataset reads this global directly.
tokenizer = AutoTokenizer.from_pretrained(Config.TOKENIZER)
print('create: tokenizer')


# ----------------------------------------------
# Preprocess func
# ----------------------------------------------
# Preprocess
import string
import re
import collections

import nltk
from bs4 import BeautifulSoup

# https://www.kaggle.com/manabendrarout/pytorch-roberta-ranking-baseline-jrstc-train
def text_cleaning(text):
    '''
    Reduce a raw comment to plain alphanumeric words. Steps, in order:
    1. Drop URLs (http(s)://... and www....)
    2. Strip HTML markup with BeautifulSoup (lxml parser)
    3. Drop characters matched by the emoji pattern
    4. Replace every character that is not a-z, A-Z or a digit with a space
    5. Collapse runs of spaces and trim both ends

    text - Text piece to be cleaned.
    '''
    # NOTE(review): the last emoji range spans U+24C2..U+1F251, which covers
    # far more than emoji — confirm that this is intended.
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)

    without_links = re.sub(r'https?://\S+|www\.\S+', r'', text)
    without_html = BeautifulSoup(without_links, 'lxml').get_text()
    without_emoji = emoji_pattern.sub(r'', without_html)

    alnum_only = re.sub(r"[^a-zA-Z\d]", " ", without_emoji)
    single_spaced = re.sub(' +', ' ', alnum_only)
    return single_spaced.strip()


def text_normalization(s:pd.Series):
    """Apply ``text_cleaning`` to every element of ``s`` and return the result."""
    return s.apply(text_cleaning)


# ----------------------------------------------
# Dataset Class
# ----------------------------------------------
class Jigsaw1stDataset(Dataset):
    """Dataset that tokenises the whole "comment_text" column up front.

    The text is cleaned with ``text_normalization`` and encoded with the
    module-level ``tokenizer`` (padded/truncated to ``Config.MAX_LEN``) once,
    in the constructor.

    Args:
        df: Frame with a "comment_text" column and, unless
            ``inference_only`` is true, the ``toxic_cols`` label columns.
        inference_only: When true, no targets are read or returned.
    """

    def __init__(self, df, inference_only=False):
        # The parentheses matter: without them the base initialiser is never run.
        super().__init__()

        self.df = df
        self.inference_only = inference_only

        if not self.inference_only:
            self.target = torch.tensor(df[toxic_cols].values, dtype=torch.float32)

        self.encoded = tokenizer.batch_encode_plus(
            text_normalization(df['comment_text']).tolist(),
            padding='max_length',
            max_length=Config.MAX_LEN,
            truncation=True,
            return_attention_mask=True
        )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return the encoded row ``index`` as a dict of tensors.

        Keys are 'input_ids' and 'attention_mask', plus 'target' when the
        dataset was not built with ``inference_only=True``.
        """
        item = {
            'input_ids': torch.tensor(self.encoded['input_ids'][index]),
            'attention_mask': torch.tensor(self.encoded['attention_mask'][index]),
        }
        if not self.inference_only:
            item['target'] = self.target[index]
        return item


# ----------------------------------------------
# Model Class
# ----------------------------------------------
class AttentionHead(nn.Module):
    """Attention pooling over axis 1 of the input.

    A score per position is computed as ``V(tanh(W(x)))``, normalised with a
    softmax over axis 1, and used to form a weighted sum of the input along
    that axis.  ``num_targets`` is accepted but not used.
    """

    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.out_features = hidden_dim
        # Scoring network: in_features -> hidden_dim -> 1
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        # assumes features is (batch, seq, in_features) — TODO confirm
        scores = self.V(torch.tanh(self.W(features)))
        weights = torch.softmax(scores, dim=1)
        return (weights * features).sum(dim=1)

class Jigsaw1stModel(nn.Module):
    """Backbone from ``Config.PRETRAINED`` + attention pooling + 6-output head.

    Pipeline: backbone ``last_hidden_state`` -> ``AttentionHead`` ->
    dropout (p=0.3) -> linear layer with 6 outputs.
    """

    def __init__(self):
        super().__init__()
        hidden_size = AutoConfig.from_pretrained(Config.PRETRAINED).hidden_size
        self.pre_model = AutoModel.from_pretrained(Config.PRETRAINED)
        self.head = AttentionHead(hidden_size, hidden_size, 1)
        self.dropout = nn.Dropout(0.3)
        self.regressor = nn.Linear(hidden_size, 6)

    def forward(self, input_ids, attention_mask):
        backbone_out = self.pre_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.head(backbone_out['last_hidden_state'])
        return self.regressor(self.dropout(pooled))
    

# ----------------------------------------------
# predict func
# ----------------------------------------------
def predict(model, dataloader):
    """Run ``model`` over ``dataloader`` and collect its 6-column outputs.

    The model is put into eval mode and called without gradient tracking.
    Batches are moved to ``Config.DEVICE``; outputs are written, in iteration
    order, into a float array of shape ``(len(dataloader.dataset), 6)``.
    """
    model.eval()
    result = np.zeros((len(dataloader.dataset), 6))

    offset = 0
    with torch.no_grad():
        for data in dataloader:
            output = model(
                data['input_ids'].to(Config.DEVICE),
                data['attention_mask'].to(Config.DEVICE),
            )
            stop = offset + output.shape[0]
            result[offset:stop, :] = output.to('cpu')
            offset = stop

    return result

In [None]:
# Label columns read by Jigsaw1stDataset when targets are requested.
toxic_cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
# Every entry of the CALPIS001 directory is treated as a checkpoint, in sorted order.
model_path = Path('../input/jigsaw-calpis-001')
models = sorted(str(entry) for entry in model_path.iterdir())
print(models)

In [None]:
# Predict with every CALPIS001 checkpoint and average the results.
# Buffer layout: (fold, row of `test`, 6 model outputs).
# NOTE(review): the buffer has Config.FOLDS slots, so `models` must not
# contain more entries than that — confirm the directory holds only checkpoints.
test_preds = np.zeros((Config.FOLDS, len(test), 6))

# The dataset class expects the text in a "comment_text" column.
test_dataset = Jigsaw1stDataset(test.rename(columns={'text':'comment_text'}),
                                inference_only=True)

test_loader = DataLoader(test_dataset, batch_size=Config.BATCH_SIZE_PRED,
                         drop_last=False, shuffle=False, num_workers=2)

for i, model_ in enumerate(models):
    print(i)
    
    model = Jigsaw1stModel()
    model.to(Config.DEVICE)
    model.load_state_dict(torch.load(model_))    # load the weights from the corresponding checkpoint
    test_preds[i, :] = predict(model, test_loader)

del test_dataset, test_loader, model
gc.collect()

# Mean over the fold axis -> one row of 6 values per comment.
calpis001 = test_preds.mean(axis=0)

In [None]:
# Blend the 6 prediction columns of `calpis001` into one score per row.
# Weights are listed in column order (column 0 first).
# NOTE(review): the weights are presumably the result of an offline
# optimisation; their origin is not visible here.
column_weights = (
    0.9002713914265349,
    0.6604849251407883,
    0.37099326391227044,
    0.12224998906148228,
    0.2487358556500514,
    0.3088187717206416,
)
opt_calpis001 = np.zeros(len(calpis001))
for column, weight in enumerate(column_weights):
    opt_calpis001 += calpis001[:, column] * weight
print(opt_calpis001.mean(), opt_calpis001.std())

## CALPIS011

In [None]:
# ----------------------------------------------
# Load Libraries
# ----------------------------------------------
import os
import math
import random
import time
import pathlib
from pathlib import Path
import sys

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AdamW
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import BertForSequenceClassification, BertConfig, BertModel
from transformers import get_cosine_schedule_with_warmup

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split

import gc
gc.enable()


# ----------------------------------------------
# Set Config
# ----------------------------------------------
class Config:
    """Constants for the CALPIS011 cells (paths, batch sizes, model locations)."""

    # --- input datasets ---
    INPUT_DIR_0 = Path('../input/jigsaw-toxic-severity-rating/')
    INPUT_DIR_1 = Path('../input/jigsaw-toxic-comment-classification-challenge/')
    INPUT_DIR_2 = Path('../input/jigsaw-unintended-bias-in-toxicity-classification/')

    # --- run switches ---
    DEBUG = False
    TRAIN = False
    RUN_VALID = False

    # --- numeric settings ---
    SEED = 2021
    FOLDS = 5
    BATCH_SIZE = 32
    BATCH_SIZE_PRED = 512
    NUM_EPOCHS = 6
    NUM_CLASSES = 1
    MAX_LEN = 128
    LEANING_RATE = 2e-5

    # Fall back to the CPU when CUDA is not available.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    # --- model locations; the tokenizer is read from the same path as the backbone ---
    PRETRAINED = '../input/jigsaw-multilingual-toxic-xlm-roberta/model'
    TOKENIZER = PRETRAINED
    TRAINED_MODELS = Path('../input/jigsaw-calpis-011')

# Sets the TOKENIZERS_PARALLELISM environment variable for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "true"


# ----------------------------------------------
# Load Data
# ----------------------------------------------
# INPUT_0: This Competition
submission = pd.read_csv(Config.INPUT_DIR_0/'sample_submission.csv')
val_data = pd.read_csv(Config.INPUT_DIR_0/'validation_data.csv')
test = pd.read_csv(Config.INPUT_DIR_0/'comments_to_score.csv')

# When the comments file has exactly 7537 rows, only the first 100 are kept.
# NOTE(review): presumably 7537 is the size of the public file and this is a
# speed-up for non-scoring runs -- confirm.
if len(test) == 7537:
    test = test.iloc[:100]

print('load data: this competition')

# INPUT_1: 1st Competition
#train_1st = pd.read_csv(INPUT_DIR_1/'train.csv')
#test_1st = pd.read_csv(INPUT_DIR_1/'test.csv')
#test_labels_1st = pd.read_csv(INPUT_DIR_1/'test_labels.csv')
#print('load data: 1st competition')

# INPUT_2: 2nd Competition
#train_2nd = pd.read_csv(INPUT_DIR_2/'train.csv')
#test_2nd = pd.read_csv(INPUT_DIR_2/'test.csv')
#idt_indiv_anno = pd.read_csv(INPUT_DIR_2/'identity_individual_annotations.csv')
#tox_indiv_anno = pd.read_csv(INPUT_DIR_2/'toxicity_individual_annotations.csv')
#print('load data: 2nd competition')


# ----------------------------------------------
# Set SEED
# ----------------------------------------------
# seed
def set_seed(SEED):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also exports the seed as PYTHONHASHSEED and turns on cuDNN's
    deterministic mode.

    Args:
        SEED: integer seed.  It is now actually used; the previous body
            ignored this argument and always read Config.SEED.
    """
    random.seed(SEED)
    np.random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)
    
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    
# Seed the random number generators with Config.SEED.
set_seed(Config.SEED)


# ----------------------------------------------
# Create Tokenizer
# ----------------------------------------------
# Module-level tokenizer loaded from Config.TOKENIZER; JigsawDataset reads it
# as a global.
tokenizer = AutoTokenizer.from_pretrained(Config.TOKENIZER)
print('create: tokenizer')


# ----------------------------------------------
# Dataset Class
# ----------------------------------------------
class JigsawDataset(Dataset):
    """Tokenised comments, optionally paired with a float target.

    The whole frame is tokenised once in the constructor using the
    module-level `tokenizer`; items are turned into tensors on access.

    Args:
        df: DataFrame with a 'comment_text' column, plus a 'target' column
            unless `inference_only` is set.
        inference_only: when True, no target is read or returned.
    """
    def __init__(self, df, inference_only=False):
        # The previous code wrote `super().__init__` without parentheses,
        # which only looks the method up and never calls it.
        super().__init__()
        
        self.df = df
        self.inference_only = inference_only
        
        if not self.inference_only:
            self.target = torch.tensor(df['target'].values, dtype=torch.float32)
        
        # Every text is padded/truncated to exactly Config.MAX_LEN tokens.
        self.encoded = tokenizer.batch_encode_plus(
            df['comment_text'].tolist(),
            padding='max_length',
            max_length=Config.MAX_LEN,
            truncation=True,
            return_attention_mask=True
        )
        
    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)
    
    def __getitem__(self, index):
        """Return a dict with 'input_ids' and 'attention_mask' tensors for row
        `index`, plus 'target' when the dataset was built with targets."""
        input_ids = torch.tensor(self.encoded['input_ids'][index])
        attention_mask = torch.tensor(self.encoded['attention_mask'][index])
        
        if self.inference_only:
            return {'input_ids': input_ids,
                    'attention_mask': attention_mask
                    }
        else:
            target = self.target[index]
            return {'input_ids': input_ids,
                    'attention_mask': attention_mask, 
                    'target': target}


# ----------------------------------------------
# Model Class
# ----------------------------------------------
class AttentionHead(nn.Module):
    """Pool a sequence of feature vectors into one vector with learned weights.

    A score per position is computed as V(tanh(W(x))), the scores are
    soft-maxed over dim 1, and the input features are summed over dim 1 with
    those weights.

    Args:
        in_features: size of the last dimension of the input.
        hidden_dim: width of the intermediate projection W.
        num_targets: accepted for signature compatibility; not used.
    """
    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.out_features = hidden_dim
        # Layer names W and V are part of the checkpoint's state-dict keys.
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        """Return the weighted sum of `features` over dim 1."""
        scores = self.V(torch.tanh(self.W(features)))
        weights = torch.softmax(scores, dim=1)
        return (weights * features).sum(dim=1)

class JigsawModel(nn.Module):
    """Pretrained encoder + AttentionHead pooling + linear output layer.

    The encoder is loaded from Config.PRETRAINED with both of its dropout
    probabilities set to 0.0.  The output layer has Config.NUM_CLASSES units.
    """
    def __init__(self):
        super().__init__()
        encoder_cfg = AutoConfig.from_pretrained(Config.PRETRAINED)
        encoder_cfg.attention_probs_dropout_prob = 0.0
        encoder_cfg.hidden_dropout_prob = 0.0
        self.config = encoder_cfg
        self.pre_model = AutoModel.from_pretrained(Config.PRETRAINED, config=encoder_cfg)

        width = encoder_cfg.hidden_size
        self.head = AttentionHead(width, width, 1)
        self.dropout = nn.Dropout(0.3)
        self.regressor = nn.Linear(width, Config.NUM_CLASSES)
    
    def forward(self, input_ids, attention_mask):
        """Encode, pool the last hidden state, and project to the output size.

        `self.dropout` is defined on the module but is not applied here.
        """
        encoded = self.pre_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.head(encoded['last_hidden_state'])
        return self.regressor(pooled)
    

# ----------------------------------------------
# predict func
# ----------------------------------------------
def predict(model, dataloader):
    """Run `model` over every batch of `dataloader` and collect its outputs.

    The model is switched to eval mode and gradients are disabled.  Each batch
    must provide 'input_ids' and 'attention_mask'; both are moved to
    Config.DEVICE before the forward pass.

    Returns:
        NumPy array with one row per dataset item and Config.NUM_CLASSES
        columns, filled in the order the loader yields batches.
    """
    model.eval()
    n_rows = len(dataloader.dataset)
    result = np.zeros((n_rows, Config.NUM_CLASSES))
    start = 0

    with torch.no_grad():
        for data in dataloader:
            ids = data['input_ids'].to(Config.DEVICE)
            mask = data['attention_mask'].to(Config.DEVICE)

            batch_out = model(ids, mask)
            stop = start + batch_out.shape[0]
            # Copy this batch's rows into the next free slice of the buffer.
            result[start:stop, :] = batch_out.to('cpu')
            start = stop

    return result

In [None]:
# Every entry of the trained-weights directory, as path strings in sorted order.
model_path = Config.TRAINED_MODELS
models = sorted(str(entry) for entry in model_path.iterdir())
print(models)

In [None]:
# Predict the test set with every checkpoint in `models` and average the results.
# The buffer has one slot per checkpoint actually found, so a directory with
# fewer files than Config.FOLDS cannot leave all-zero rows in the mean and a
# directory with more cannot index out of bounds.
test_preds = np.zeros((len(models), len(test), Config.NUM_CLASSES))

test_dataset = JigsawDataset(test.rename(columns={'text':'comment_text'}),
                                inference_only=True)

test_loader = DataLoader(test_dataset, batch_size=Config.BATCH_SIZE_PRED,
                         drop_last=False, shuffle=False, num_workers=2)

for i, model_ in enumerate(models):
    print(i)
    
    model = JigsawModel()
    model.to(Config.DEVICE)
    # Load the weights of the corresponding checkpoint.  map_location keeps
    # this working when the file was saved from a GPU but Config.DEVICE is "cpu".
    model.load_state_dict(torch.load(model_, map_location=Config.DEVICE))
    test_preds[i, :] = predict(model, test_loader)
    
del test_dataset, test_loader, model
gc.collect()

# Mean over checkpoints, then flattened to a 1-D array.
calpis011 = test_preds.mean(axis=0)
calpis011 = calpis011.flatten()

## CALPIS012

In [None]:
# ----------------------------------------------
# Load Libraries
# ----------------------------------------------
import os
import math
import random
import time
import pathlib
from pathlib import Path
import sys

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AdamW
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import BertForSequenceClassification, BertConfig, BertModel
from transformers import get_cosine_schedule_with_warmup

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split

import gc
gc.enable()


# ----------------------------------------------
# Set Config
# ----------------------------------------------
class Config:
    """Constants for the CALPIS012 cells (paths, batch sizes, model locations)."""

    # --- input datasets ---
    INPUT_DIR_0 = Path('../input/jigsaw-toxic-severity-rating/')
    INPUT_DIR_1 = Path('../input/jigsaw-toxic-comment-classification-challenge/')
    INPUT_DIR_2 = Path('../input/jigsaw-unintended-bias-in-toxicity-classification/')

    # --- run switches ---
    DEBUG = False
    TRAIN = False
    RUN_VALID = False

    # --- numeric settings ---
    SEED = 2021
    FOLDS = 5
    BATCH_SIZE = 32
    BATCH_SIZE_PRED = 512
    NUM_EPOCHS = 6
    NUM_CLASSES = 1
    MAX_LEN = 128
    LEANING_RATE = 2e-5

    # Fall back to the CPU when CUDA is not available.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    # --- model locations; the tokenizer is read from the same path as the backbone ---
    PRETRAINED = '../input/jigsaw-multilingual-toxic-xlm-roberta/model'
    TOKENIZER = PRETRAINED
    TRAINED_MODELS = Path('../input/jigsaw-calpis-012')

# Sets the TOKENIZERS_PARALLELISM environment variable for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "true"


# ----------------------------------------------
# Load Data
# ----------------------------------------------
# INPUT_0: This Competition
submission = pd.read_csv(Config.INPUT_DIR_0/'sample_submission.csv')
val_data = pd.read_csv(Config.INPUT_DIR_0/'validation_data.csv')
test = pd.read_csv(Config.INPUT_DIR_0/'comments_to_score.csv')

# When the comments file has exactly 7537 rows, only the first 100 are kept.
# NOTE(review): presumably 7537 is the size of the public file and this is a
# speed-up for non-scoring runs -- confirm.
if len(test) == 7537:
    test = test.iloc[:100]

print('load data: this competition')

# INPUT_1: 1st Competition
#train_1st = pd.read_csv(INPUT_DIR_1/'train.csv')
#test_1st = pd.read_csv(INPUT_DIR_1/'test.csv')
#test_labels_1st = pd.read_csv(INPUT_DIR_1/'test_labels.csv')
#print('load data: 1st competition')

# INPUT_2: 2nd Competition
#train_2nd = pd.read_csv(INPUT_DIR_2/'train.csv')
#test_2nd = pd.read_csv(INPUT_DIR_2/'test.csv')
#idt_indiv_anno = pd.read_csv(INPUT_DIR_2/'identity_individual_annotations.csv')
#tox_indiv_anno = pd.read_csv(INPUT_DIR_2/'toxicity_individual_annotations.csv')
#print('load data: 2nd competition')


# ----------------------------------------------
# Set SEED
# ----------------------------------------------
# seed
def set_seed(SEED):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also exports the seed as PYTHONHASHSEED and turns on cuDNN's
    deterministic mode.

    Args:
        SEED: integer seed.  It is now actually used; the previous body
            ignored this argument and always read Config.SEED.
    """
    random.seed(SEED)
    np.random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)
    
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    
# Seed the random number generators with Config.SEED.
set_seed(Config.SEED)


# ----------------------------------------------
# Create Tokenizer
# ----------------------------------------------
# Module-level tokenizer loaded from Config.TOKENIZER; JigsawDataset reads it
# as a global.
tokenizer = AutoTokenizer.from_pretrained(Config.TOKENIZER)
print('create: tokenizer')


# ----------------------------------------------
# Dataset Class
# ----------------------------------------------
class JigsawDataset(Dataset):
    """Tokenised comments, optionally paired with a float target.

    The whole frame is tokenised once in the constructor using the
    module-level `tokenizer`; items are turned into tensors on access.

    Args:
        df: DataFrame with a 'comment_text' column, plus a 'target' column
            unless `inference_only` is set.
        inference_only: when True, no target is read or returned.
    """
    def __init__(self, df, inference_only=False):
        # The previous code wrote `super().__init__` without parentheses,
        # which only looks the method up and never calls it.
        super().__init__()
        
        self.df = df
        self.inference_only = inference_only
        
        if not self.inference_only:
            self.target = torch.tensor(df['target'].values, dtype=torch.float32)
        
        # Every text is padded/truncated to exactly Config.MAX_LEN tokens.
        self.encoded = tokenizer.batch_encode_plus(
            df['comment_text'].tolist(),
            padding='max_length',
            max_length=Config.MAX_LEN,
            truncation=True,
            return_attention_mask=True
        )
        
    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)
    
    def __getitem__(self, index):
        """Return a dict with 'input_ids' and 'attention_mask' tensors for row
        `index`, plus 'target' when the dataset was built with targets."""
        input_ids = torch.tensor(self.encoded['input_ids'][index])
        attention_mask = torch.tensor(self.encoded['attention_mask'][index])
        
        if self.inference_only:
            return {'input_ids': input_ids,
                    'attention_mask': attention_mask
                    }
        else:
            target = self.target[index]
            return {'input_ids': input_ids,
                    'attention_mask': attention_mask, 
                    'target': target}


# ----------------------------------------------
# Model Class
# ----------------------------------------------
class AttentionHead(nn.Module):
    """Pool a sequence of feature vectors into one vector with learned weights.

    A score per position is computed as V(tanh(W(x))), the scores are
    soft-maxed over dim 1, and the input features are summed over dim 1 with
    those weights.

    Args:
        in_features: size of the last dimension of the input.
        hidden_dim: width of the intermediate projection W.
        num_targets: accepted for signature compatibility; not used.
    """
    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.out_features = hidden_dim
        # Layer names W and V are part of the checkpoint's state-dict keys.
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        """Return the weighted sum of `features` over dim 1."""
        scores = self.V(torch.tanh(self.W(features)))
        weights = torch.softmax(scores, dim=1)
        return (weights * features).sum(dim=1)

class JigsawModel(nn.Module):
    """Pretrained encoder + AttentionHead pooling + linear output layer.

    The encoder is loaded from Config.PRETRAINED with both of its dropout
    probabilities set to 0.0.  The output layer has Config.NUM_CLASSES units.
    """
    def __init__(self):
        super().__init__()
        encoder_cfg = AutoConfig.from_pretrained(Config.PRETRAINED)
        encoder_cfg.attention_probs_dropout_prob = 0.0
        encoder_cfg.hidden_dropout_prob = 0.0
        self.config = encoder_cfg
        self.pre_model = AutoModel.from_pretrained(Config.PRETRAINED, config=encoder_cfg)

        width = encoder_cfg.hidden_size
        self.head = AttentionHead(width, width, 1)
        self.dropout = nn.Dropout(0.3)
        self.regressor = nn.Linear(width, Config.NUM_CLASSES)
    
    def forward(self, input_ids, attention_mask):
        """Encode, pool the last hidden state, and project to the output size.

        `self.dropout` is defined on the module but is not applied here.
        """
        encoded = self.pre_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.head(encoded['last_hidden_state'])
        return self.regressor(pooled)
    

# ----------------------------------------------
# predict func
# ----------------------------------------------
def predict(model, dataloader):
    """Run `model` over every batch of `dataloader` and collect its outputs.

    The model is switched to eval mode and gradients are disabled.  Each batch
    must provide 'input_ids' and 'attention_mask'; both are moved to
    Config.DEVICE before the forward pass.

    Returns:
        NumPy array with one row per dataset item and Config.NUM_CLASSES
        columns, filled in the order the loader yields batches.
    """
    model.eval()
    n_rows = len(dataloader.dataset)
    result = np.zeros((n_rows, Config.NUM_CLASSES))
    start = 0

    with torch.no_grad():
        for data in dataloader:
            ids = data['input_ids'].to(Config.DEVICE)
            mask = data['attention_mask'].to(Config.DEVICE)

            batch_out = model(ids, mask)
            stop = start + batch_out.shape[0]
            # Copy this batch's rows into the next free slice of the buffer.
            result[start:stop, :] = batch_out.to('cpu')
            start = stop

    return result

In [None]:
# Every entry of the trained-weights directory, as path strings in sorted order.
model_path = Config.TRAINED_MODELS
models = sorted(str(entry) for entry in model_path.iterdir())
print(models)

In [None]:
# Predict the test set with every checkpoint in `models` and average the results.
# The buffer has one slot per checkpoint actually found, so a directory with
# fewer files than Config.FOLDS cannot leave all-zero rows in the mean and a
# directory with more cannot index out of bounds.
test_preds = np.zeros((len(models), len(test), Config.NUM_CLASSES))

test_dataset = JigsawDataset(test.rename(columns={'text':'comment_text'}),
                                inference_only=True)

test_loader = DataLoader(test_dataset, batch_size=Config.BATCH_SIZE_PRED,
                         drop_last=False, shuffle=False, num_workers=2)

for i, model_ in enumerate(models):
    print(i)
    
    model = JigsawModel()
    model.to(Config.DEVICE)
    # Load the weights of the corresponding checkpoint.  map_location keeps
    # this working when the file was saved from a GPU but Config.DEVICE is "cpu".
    model.load_state_dict(torch.load(model_, map_location=Config.DEVICE))
    test_preds[i, :] = predict(model, test_loader)
    
del test_dataset, test_loader, model
gc.collect()

# Mean over checkpoints, then flattened to a 1-D array.
calpis012 = test_preds.mean(axis=0)
calpis012 = calpis012.flatten()

# NAOISM

## NAOISM1004

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import math
import time
import random
import shutil
import copy
import glob
import collections
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from tqdm.auto import tqdm
from functools import partial
import torch
import torch.nn as nn
from torch.nn import MarginRankingLoss
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingLR
import transformers
from transformers import (AutoModel, AutoTokenizer)
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
import warnings
warnings.filterwarnings("ignore")
from torch.cuda.amp import autocast, GradScaler
import re
from bs4 import BeautifulSoup
import gc
tqdm.pandas()

# Use the GPU when CUDA is available, otherwise the CPU.
device = ("cuda" if torch.cuda.is_available() else "cpu")


# Competition data directory and the directory holding the trained checkpoints.
INPUT_PATH = "../input/jigsaw-toxic-severity-rating/"
MODEL_PATH = "../input/jigsaw2021-models-naoism/"
# All .pth files exactly one directory level below MODEL_PATH.
MODEL_PATHS = glob.glob(MODEL_PATH + "*/*.pth")
# Used as the prefix of init_logger's default log-file path.
OUTPUT_DIR = '/' 
print(MODEL_PATHS)

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Settings for the exp1004 inference cells.

    Further attributes (model_path, base_model_name, base_model_path) are
    attached to this class by the statements that follow it.
    """

    # ---- globals ----
    seed = 0
    n_model_seed = 3
    exp_name = "exp1004"
    print_freq = 100
    toxic_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    # ---- dataset: tokens kept from the start and from the end of a long text ----
    head = 64
    tail = 64
    max_length = head + tail

    # ---- loaders ----
    batch_size = 64
    num_workers = 8

    # ---- model ----
    pretrained = False
    # One output per entry of toxic_cols.
    num_classes = len(toxic_cols)
    # Width of the backbone's hidden state (large: 1024, base: 768).
    hidden_node = 1024

# Checkpoints of this experiment, in sorted order; the first one names the backbone.
model_path_list = sorted(mp for mp in MODEL_PATHS if CFG.exp_name in mp)
print("Model PATHs:", model_path_list)
CFG.model_path = model_path_list[0]

# The backbone name is what remains of the path after removing the experiment
# directory prefix and the "_best_score.pth" suffix.
_prefix_len = len(f"../input/jigsaw2021-models-naoism/{CFG.exp_name}/")
_suffix_len = len("_best_score.pth")
CFG.base_model_name = CFG.model_path[_prefix_len:-_suffix_len]
print("Base_Model_Name:", CFG.base_model_name)

# Known backbones and where their pretrained files live.  An unknown name
# leaves CFG.base_model_path unset.
_base_model_dirs = {
    "bertweet-large": "../input/hugging-face-bertweet-large/model",
    "roberta-large-squad2": "../input/hugging-face-roberta-large-squad2/model",
    "roberta-large": "../input/hugging-face-roberta-large/model",
    "albert-large-v2": "../input/hugging-face-albert-large-v2/model",
}
if CFG.base_model_name in _base_model_dirs:
    CFG.base_model_path = _base_model_dirs[CFG.base_model_name]

In [None]:
# ====================================================
# Utils
# ====================================================

def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Also exports `seed` as PYTHONHASHSEED and disables cuDNN benchmarking.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def init_logger(log_file=OUTPUT_DIR+"train.log"):
    """Return this module's logger, writing bare messages to the console and to `log_file`.

    `getLogger(__name__)` hands back the same logger object on every call, so
    handlers left by an earlier call are removed (and closed) first.  Without
    that, each additional call would add another pair of handlers and every
    message would be emitted multiple times.

    Args:
        log_file: path of the log file; the default is evaluated once, when
            the function is defined.

    Returns:
        The configured logger, at INFO level.
    """
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()
    # Both handlers are created before either is attached, as before.
    for handler in (StreamHandler(), FileHandler(filename=log_file)):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger
    
LOGGER = init_logger()

def get_score(more_toxic_preds, less_toxic_preds):
    """Return the fraction of positions where the 'more toxic' prediction is
    strictly greater than the 'less toxic' one."""
    return np.mean(more_toxic_preds > less_toxic_preds)



def get_result(df):
    """Score the 'more_toxic_preds' column against 'less_toxic_preds' with
    get_score, log the value to four decimals, and return it."""
    score = get_score(df["more_toxic_preds"].values, df["less_toxic_preds"].values)
    LOGGER.info(f"Score: {score:<.4f}")
    return score


def read_data():
    """Read the three competition CSVs from INPUT_PATH.

    Returns:
        (validation_data, test, sub) DataFrames, read in that order.
    """
    file_names = ("validation_data.csv", "comments_to_score.csv", "sample_submission.csv")
    validation_data, test, sub = [pd.read_csv(INPUT_PATH + name) for name in file_names]
    return validation_data, test, sub



def text_cleaning(text):
    """Normalise a raw comment string.

    Steps, in order: drop website links, replace e-mail addresses with '.',
    drop dotted-quad IP addresses, apply a fixed list of literal
    substitutions, collapse runs of spaces, and strip both ends.

    Args:
        text: the comment as a str.

    Returns:
        The cleaned string.
    """
    # All patterns are raw strings: '\d' in a plain literal is an invalid
    # escape sequence.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", '.', text)
    text = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', '', text)

    # Literal substitutions; order matters.  " u " / " U " appear twice on
    # purpose: one pass cannot rewrite two adjacent occurrences because they
    # share a space.
    substitutions = (
        ('!', ' '),
        ('\n', ''),
        ("'", ''),
        ('|', ''),
        ('=', ''),
        ("F**K", "FUCK"),
        ("F__K", "FUCK"),
        ("f**k", "fuck"),
        ("f__k", "fuck"),
        ("f*ck", "fuck"),
        ("S$X", "SEX"),
        ("s$x", "sex"),
        (" u ", " you "),
        (" u ", " you "),
        (" U ", " you "),
        (" U ", " you "),
        ("YOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOUUUUUUUUUUUUUUUUUUUU", "YOU"),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    text = re.sub(' +', ' ', text)
    return text.strip()


def prepare_input(text, tokenizer):
    """Encode `text` into torch.long tensors of length CFG.max_length.

    With CFG.tail == 0 the tokenizer itself truncates and pads.  Otherwise the
    text is encoded without an explicit max_length, any field longer than
    CFG.max_length is rebuilt from its first CFG.head and last CFG.tail
    entries, and shorter fields are right-padded (tokenizer.pad_token_id for
    'input_ids', 0 for every other field).

    Returns:
        The tokenizer's output mapping with every value replaced by a
        torch.long tensor.
    """
    if CFG.tail == 0:
        encoded = tokenizer.encode_plus(
            text, 
            return_tensors=None, 
            add_special_tokens=True, 
            max_length=CFG.max_length,
            pad_to_max_length=True,
            truncation=True
            )
        for key, values in encoded.items():
            encoded[key] = torch.tensor(values, dtype=torch.long)
        return encoded

    encoded = tokenizer.encode_plus(
        text,
        return_tensors=None, 
        add_special_tokens=True, 
        truncation=True
        )
    for key, values in encoded.items():
        original_len = len(values)
        if original_len > CFG.max_length:
            values = np.hstack([values[:CFG.head], values[-CFG.tail:]])
        if key == 'input_ids':
            padded = np.ones(CFG.max_length) * tokenizer.pad_token_id
        else:
            padded = np.zeros(CFG.max_length)
        # original_len is the length before head+tail trimming; when it exceeds
        # the buffer the slice simply covers the whole buffer.
        padded[:original_len] = values
        encoded[key] = torch.tensor(padded, dtype=torch.long)
    return encoded


class TestDataset(Dataset):
    """Inference dataset that tokenises the 'text' column on access.

    Args:
        df: DataFrame with a 'text' column.
        tokenizer: tokenizer handed to prepare_input for every item.
        max_length: stored on the instance; prepare_input takes its length
            from CFG rather than from this value.
    """
    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text = df["text"].values
        
    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)
    
    def __getitem__(self, idx):   
        """Return {'ids', 'mask'} long tensors for row `idx`."""
        text = self.text[idx]
        inputs = prepare_input(str(text), self.tokenizer)
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        # prepare_input already returns long tensors; torch.as_tensor passes
        # them through, whereas torch.tensor(<tensor>) makes a redundant copy
        # through the discouraged copy-construct path.
        return {
            'ids': torch.as_tensor(ids, dtype=torch.long),
            'mask': torch.as_tensor(mask, dtype=torch.long),
        }


class Model(nn.Module):
    """Pretrained backbone, mean over dim 1 of its first output, then a linear layer.

    Args:
        modelname_or_path: passed to AutoModel.from_pretrained.
    """
    def __init__(self, modelname_or_path):
        super().__init__()
        self.base_model = AutoModel.from_pretrained(modelname_or_path)
        self.fc = nn.Linear(CFG.hidden_node, CFG.num_classes)
        self.dropout = nn.Dropout(p=0.5)
        
    def feature(self, input_ids, attention_mask):
        """Return the backbone's first output averaged over dim 1.

        The average runs over every position; attention_mask is only passed
        to the backbone.
        """
        hidden = self.base_model(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False
        )[0]
        return hidden.mean(dim=1)

    def forward(self, input_ids, attention_mask=None):
        """Return fc(dropout(feature(...))) with CFG.num_classes outputs."""
        pooled = self.feature(input_ids, attention_mask)
        return self.fc(self.dropout(pooled))

In [None]:
# ====================================================
# inference
# ====================================================
def inference_fn(test_loader, model, device):
    """Run `model` in eval mode over `test_loader` and stack the outputs.

    Each batch must provide 'ids' and 'mask'; both are moved to `device`, as
    is the model.

    Returns:
        NumPy array: the per-batch outputs concatenated along axis 0.
    """
    model.eval()
    model.to(device)
    collected = []
    with torch.no_grad():
        for batch in tqdm(test_loader, total=len(test_loader)):
            outputs = model(batch['ids'].to(device), batch['mask'].to(device))
            collected.append(outputs.to('cpu').numpy())
    return np.concatenate(collected)


def main():
    """Clean the test comments, predict with the first CFG.n_model_seed
    checkpoints of model_path_list, and return the mean prediction array."""
    seed_torch(seed=CFG.seed)

    _validation_data, test, _submission = read_data()
    # A 7537-row comments file is cut down to its first 100 rows.
    if len(test) == 7537:
        test = test.iloc[:100]

    print("Text cleaning...")
    test['text'] = test['text'].progress_apply(text_cleaning)

    tokenizer = AutoTokenizer.from_pretrained(CFG.base_model_path)
    print("tokenizer:", tokenizer)

    test_loader = DataLoader(
        TestDataset(test, tokenizer, CFG.max_length),
        batch_size=CFG.batch_size,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    per_checkpoint = []
    for ckpt_idx in range(CFG.n_model_seed):
        model = Model(CFG.base_model_path)
        # Checkpoints are read onto the CPU; inference_fn moves the model to `device`.
        state = torch.load(model_path_list[ckpt_idx], map_location=torch.device('cpu'))
        model.load_state_dict(state['model'])
        per_checkpoint.append(inference_fn(test_loader, model, device))
        # Free the model before the next checkpoint is loaded.
        del model, state
        gc.collect()
        torch.cuda.empty_cache()

    return np.array(np.mean(per_checkpoint, axis=0))

In [None]:
# Mean prediction array of the exp1004 checkpoints, as returned by main().
naoism1004 = main()

In [None]:
# Collapse the six output columns into one score per comment using fixed
# per-column weights, accumulated column by column in index order.
_naoism1004_weights = (
    0.23555360425248387,
    0.935995774834193,
    0.3951396313555483,
    0.10302749719202853,
    0.3105753465044702,
    0.4543676174582658,
)
opt_naoism1004 = np.zeros(len(naoism1004))
for _col, _weight in enumerate(_naoism1004_weights):
    opt_naoism1004 += naoism1004[:, _col] * _weight
print(opt_naoism1004.mean(), opt_naoism1004.std())

## NAOISM4001

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import math
import time
import random
import shutil
import copy
import glob
import collections
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from tqdm.auto import tqdm
from functools import partial
import torch
import torch.nn as nn
from torch.nn import MarginRankingLoss
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingLR
import transformers
from transformers import (AutoModel, AutoTokenizer)
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
import warnings
warnings.filterwarnings("ignore")
from torch.cuda.amp import autocast, GradScaler
import re
from bs4 import BeautifulSoup
import gc
tqdm.pandas()

# Use the GPU when CUDA is available, otherwise the CPU.
device = ("cuda" if torch.cuda.is_available() else "cpu")


# Competition data directory and the directory holding the trained checkpoints.
INPUT_PATH = "../input/jigsaw-toxic-severity-rating/"
MODEL_PATH = "../input/jigsaw2021-models-naoism/"
# All .pth files exactly one directory level below MODEL_PATH.
MODEL_PATHS = glob.glob(MODEL_PATH + "*/*.pth")
# Used as the prefix of init_logger's default log-file path.
OUTPUT_DIR = '/' 
print(MODEL_PATHS)

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Settings for the exp4001 inference cells.

    A base_model_path attribute is attached to this class by the statements
    that follow it.
    """

    # ---- globals ----
    seed = 0
    n_model_seed = 1
    exp_name = "exp4001"
    print_freq = 100
    target_col = "target"
    base_model_name = "multilingual-toxic-xlm-roberta"

    # ---- dataset: tokens kept from the start and from the end of a long text ----
    head = 64
    tail = 64
    max_length = head + tail

    # ---- loaders ----
    batch_size = 64
    num_workers = 8

    # ---- model ----
    pretrained = False
    # A single output unit.
    num_classes = 1
    # Width of the backbone's hidden state (large: 1024, base: 768).
    hidden_node = 768

# Checkpoints of this experiment, in sorted order.
model_path_list = sorted(mp for mp in MODEL_PATHS if CFG.exp_name in mp)
print("Model PATHs:", model_path_list)
print("Base_Model_Name:", CFG.base_model_name)

# Known backbones and where their pretrained files live.  An unknown name
# leaves CFG.base_model_path unset.
_base_model_dirs = {
    "bertweet-large": "../input/hugging-face-bertweet-large/model",
    "roberta-large-squad2": "../input/hugging-face-roberta-large-squad2/model",
    "roberta-large": "../input/hugging-face-roberta-large/model",
    "albert-large-v2": "../input/hugging-face-albert-large-v2/model",
    "multilingual-toxic-xlm-roberta": "../input/jigsaw-multilingual-toxic-xlm-roberta/model",
}
if CFG.base_model_name in _base_model_dirs:
    CFG.base_model_path = _base_model_dirs[CFG.base_model_name]

In [None]:
# ====================================================
# Utils
# ====================================================

def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Also exports `seed` as PYTHONHASHSEED and disables cuDNN benchmarking.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def init_logger(log_file=OUTPUT_DIR+"train.log"):
    """Return this module's logger, writing bare messages to the console and to `log_file`.

    `getLogger(__name__)` hands back the same logger object on every call, so
    handlers left by an earlier call are removed (and closed) first.  Without
    that, each additional call would add another pair of handlers and every
    message would be emitted multiple times.

    Args:
        log_file: path of the log file; the default is evaluated once, when
            the function is defined.

    Returns:
        The configured logger, at INFO level.
    """
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()
    # Both handlers are created before either is attached, as before.
    for handler in (StreamHandler(), FileHandler(filename=log_file)):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger
    
LOGGER = init_logger()

def get_score(more_toxic_preds, less_toxic_preds):
    """Return the fraction of positions where the 'more toxic' prediction is
    strictly greater than the 'less toxic' one."""
    return np.mean(more_toxic_preds > less_toxic_preds)



def get_result(df):
    """Score the 'more_toxic_preds' column against 'less_toxic_preds' with
    get_score, log the value to four decimals, and return it."""
    score = get_score(df["more_toxic_preds"].values, df["less_toxic_preds"].values)
    LOGGER.info(f"Score: {score:<.4f}")
    return score


def read_data():
    """Read the three competition CSVs from INPUT_PATH.

    Returns:
        (validation_data, test, sub) DataFrames, read in that order.
    """
    file_names = ("validation_data.csv", "comments_to_score.csv", "sample_submission.csv")
    validation_data, test, sub = [pd.read_csv(INPUT_PATH + name) for name in file_names]
    return validation_data, test, sub



def text_cleaning(text):
    """Lightly normalise a raw comment string.

    Steps, in order: drop website links, replace e-mail addresses with '.',
    drop dotted-quad IP addresses, delete newlines, and strip both ends.

    Args:
        text: the comment as a str.

    Returns:
        The cleaned string.
    """
    # All patterns are raw strings: '\d' in a plain literal is an invalid
    # escape sequence.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", '.', text)
    text = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', '', text)
    # Newlines are deleted outright, not replaced by a space.
    text = text.replace('\n', '')
    return text.strip()


def prepare_input(text, tokenizer):
    """Encode `text` into torch.long tensors of length CFG.max_length.

    With CFG.tail == 0 the tokenizer itself truncates and pads.  Otherwise the
    text is encoded without an explicit max_length, any field longer than
    CFG.max_length is rebuilt from its first CFG.head and last CFG.tail
    entries, and shorter fields are right-padded (tokenizer.pad_token_id for
    'input_ids', 0 for every other field).

    Returns:
        The tokenizer's output mapping with every value replaced by a
        torch.long tensor.
    """
    if CFG.tail == 0:
        encoded = tokenizer.encode_plus(
            text, 
            return_tensors=None, 
            add_special_tokens=True, 
            max_length=CFG.max_length,
            pad_to_max_length=True,
            truncation=True
            )
        for key, values in encoded.items():
            encoded[key] = torch.tensor(values, dtype=torch.long)
        return encoded

    encoded = tokenizer.encode_plus(
        text,
        return_tensors=None, 
        add_special_tokens=True, 
        truncation=True
        )
    for key, values in encoded.items():
        original_len = len(values)
        if original_len > CFG.max_length:
            values = np.hstack([values[:CFG.head], values[-CFG.tail:]])
        if key == 'input_ids':
            padded = np.ones(CFG.max_length) * tokenizer.pad_token_id
        else:
            padded = np.zeros(CFG.max_length)
        # original_len is the length before head+tail trimming; when it exceeds
        # the buffer the slice simply covers the whole buffer.
        padded[:original_len] = values
        encoded[key] = torch.tensor(padded, dtype=torch.long)
    return encoded


class TestDataset(Dataset):
    """Inference dataset that tokenizes the ``text`` column on demand.

    Args:
        df: DataFrame with a ``text`` column.
        tokenizer: Tokenizer forwarded to ``prepare_input``.
        max_length: Stored on the instance; not read by this class
            (``prepare_input`` takes its lengths from ``CFG``).
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text = df["text"].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{'ids': ..., 'mask': ...}`` long tensors for row ``idx``."""
        inputs = prepare_input(str(self.text[idx]), self.tokenizer)
        # as_tensor reuses a tensor that already has the requested dtype, so
        # no second copy is made and no copy-construct warning is raised.
        return {
            'ids': torch.as_tensor(inputs["input_ids"], dtype=torch.long),
            'mask': torch.as_tensor(inputs["attention_mask"], dtype=torch.long),
        }


class Model_reg(nn.Module):
    """Transformer encoder followed by a single linear head.

    The head maps ``CFG.hidden_node`` features to ``CFG.num_classes``
    outputs. Dropout is instantiated with probability 0.

    Args:
        modelname_or_path: Identifier handed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, modelname_or_path):
        super().__init__()
        self.base_model = AutoModel.from_pretrained(modelname_or_path)
        self.fc = nn.Linear(CFG.hidden_node, CFG.num_classes)
        self.dropout = nn.Dropout(p=0.)

    def feature(self, input_ids, attention_mask):
        """Return the encoder's first output averaged over dimension 1.

        The average runs over every position; ``attention_mask`` is only
        passed to the encoder and is not used to weight the mean.
        """
        encoder_out = self.base_model(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False,
        )
        token_states = encoder_out[0]
        return token_states.mean(dim=1)

    def forward(self, input_ids, attention_mask=None):
        """Pool the encoder output and apply dropout and the linear head."""
        pooled = self.feature(input_ids, attention_mask)
        return self.fc(self.dropout(pooled))

In [None]:
# ====================================================
# inference
# ====================================================
def inference_fn(test_loader, model, device):
    """Run ``model`` over every batch of ``test_loader`` without gradients.

    Args:
        test_loader: Iterable of batches with ``'ids'`` and ``'mask'`` tensors.
        model: Module called as ``model(ids, mask)``; switched to eval mode
            and moved to ``device``.
        device: Target device for the model and the batch tensors.

    Returns:
        NumPy array with all batch outputs concatenated along the first axis.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    progress = tqdm(test_loader, total=len(test_loader))
    for batch in progress:
        input_ids = batch['ids'].to(device)
        attention_mask = batch['mask'].to(device)
        with torch.no_grad():
            output = model(input_ids, attention_mask)
        batch_outputs.append(output.cpu().numpy())
    return np.concatenate(batch_outputs)


def main():
    """Score the comments with every selected checkpoint and average.

    Relies on module-level state: ``CFG`` (seed, model/tokenizer path, loader
    settings, ``n_model_seed``), ``model_path_list`` (checkpoint files) and
    ``device``.

    Returns:
        ``numpy.ndarray`` with the element-wise mean, over the first
        ``CFG.n_model_seed`` checkpoints, of the arrays returned by
        ``inference_fn``.

    Raises:
        IndexError: If ``model_path_list`` holds fewer than
            ``CFG.n_model_seed`` checkpoints.
    """
    seed_torch(seed=CFG.seed)

    # Fail before any heavy work instead of part-way through the model loop.
    if len(model_path_list) < CFG.n_model_seed:
        raise IndexError(
            f"CFG.n_model_seed={CFG.n_model_seed} but only "
            f"{len(model_path_list)} checkpoint(s) matched: {model_path_list}"
        )

    # Only the comments to score are used here.
    _, test, _ = read_data()
    if len(test) == 7537:
        # NOTE(review): 7537 rows is presumably the public
        # comments_to_score.csv, for which only 100 rows are scored -- confirm.
        # .copy() makes the frame independent of the slice it came from so
        # the column assignment below is unambiguous.
        test = test.iloc[:100].copy()

    print("Text cleaning...")
    test['text'] = test['text'].progress_apply(text_cleaning)

    tokenizer = AutoTokenizer.from_pretrained(CFG.base_model_path)
    print("tokenizer:", tokenizer)

    test_dataset = TestDataset(test, tokenizer, CFG.max_length)
    test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False,
                             num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    predictions = []
    for model_path in model_path_list[:CFG.n_model_seed]:
        model = Model_reg(CFG.base_model_path)
        # Load on CPU first; inference_fn moves the model to `device`.
        state = torch.load(model_path, map_location=torch.device('cpu'))
        model.load_state_dict(state['model'])
        predictions.append(inference_fn(test_loader, model, device))
        # Release the model before the next checkpoint is loaded.
        del model, state
        gc.collect()
        torch.cuda.empty_cache()

    return np.array(np.mean(predictions, axis=0))

## NAOISM4001 SEED0

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for experiment ``exp4001_pseudo_jigsaw1st_seed0``."""

    # ---- Globals ----
    seed = 0
    n_model_seed = 1  # number of checkpoints main() loads
    exp_name = "exp4001_pseudo_jigsaw1st_seed0"  # substring used to pick checkpoints
    print_freq = 100
    target_col = "target"
    base_model_name = "multilingual-toxic-xlm-roberta"

    # ---- Dataset ----
    head = 64  # tokens kept from the start of an over-long text
    tail = 64  # tokens kept from the end of an over-long text
    max_length = head + tail

    # ---- Augmentation ----

    # ---- Loaders ----
    batch_size = 64
    num_workers = 8

    # ---- Model ----
    pretrained = False
    num_classes = 1  # Binary
    hidden_node = 768  # large: 1024, base: 768

# Checkpoints whose path contains this experiment's name, in sorted order.
model_path_list = sorted(mp for mp in MODEL_PATHS if CFG.exp_name in mp)
print("Model PATHs:", model_path_list)
print("Base_Model_Name:", CFG.base_model_name)

# Backbone name -> local directory with its config/tokenizer. A name missing
# from this table leaves CFG.base_model_path unset.
_BASE_MODEL_DIRS = {
    "bertweet-large": "../input/hugging-face-bertweet-large/model",
    "roberta-large-squad2": "../input/hugging-face-roberta-large-squad2/model",
    "roberta-large": "../input/hugging-face-roberta-large/model",
    "albert-large-v2": "../input/hugging-face-albert-large-v2/model",
    "multilingual-toxic-xlm-roberta": "../input/jigsaw-multilingual-toxic-xlm-roberta/model",
}
if CFG.base_model_name in _BASE_MODEL_DIRS:
    CFG.base_model_path = _BASE_MODEL_DIRS[CFG.base_model_name]


# Run inference and collapse the result to one dimension.
naoism4001_seed0 = main().flatten()

## NAOISM4001 SEED1

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for experiment ``exp4001_pseudo_jigsaw1st_seed1``."""

    # ---- Globals ----
    seed = 0
    n_model_seed = 1  # number of checkpoints main() loads
    exp_name = "exp4001_pseudo_jigsaw1st_seed1"  # substring used to pick checkpoints
    print_freq = 100
    target_col = "target"
    base_model_name = "multilingual-toxic-xlm-roberta"

    # ---- Dataset ----
    head = 64  # tokens kept from the start of an over-long text
    tail = 64  # tokens kept from the end of an over-long text
    max_length = head + tail

    # ---- Augmentation ----

    # ---- Loaders ----
    batch_size = 64
    num_workers = 8

    # ---- Model ----
    pretrained = False
    num_classes = 1  # Binary
    hidden_node = 768  # large: 1024, base: 768

# Checkpoints whose path contains this experiment's name, in sorted order.
model_path_list = sorted(mp for mp in MODEL_PATHS if CFG.exp_name in mp)
print("Model PATHs:", model_path_list)
print("Base_Model_Name:", CFG.base_model_name)

# Backbone name -> local directory with its config/tokenizer. A name missing
# from this table leaves CFG.base_model_path unset.
_BASE_MODEL_DIRS = {
    "bertweet-large": "../input/hugging-face-bertweet-large/model",
    "roberta-large-squad2": "../input/hugging-face-roberta-large-squad2/model",
    "roberta-large": "../input/hugging-face-roberta-large/model",
    "albert-large-v2": "../input/hugging-face-albert-large-v2/model",
    "multilingual-toxic-xlm-roberta": "../input/jigsaw-multilingual-toxic-xlm-roberta/model",
}
if CFG.base_model_name in _BASE_MODEL_DIRS:
    CFG.base_model_path = _BASE_MODEL_DIRS[CFG.base_model_name]


# Run inference and collapse the result to one dimension.
naoism4001_seed1 = main().flatten()

## NAOISM4001 SEED2

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for experiment ``exp4001_pseudo_jigsaw1st_seed2``."""

    # ---- Globals ----
    seed = 0
    n_model_seed = 1  # number of checkpoints main() loads
    exp_name = "exp4001_pseudo_jigsaw1st_seed2"  # substring used to pick checkpoints
    print_freq = 100
    target_col = "target"
    base_model_name = "multilingual-toxic-xlm-roberta"

    # ---- Dataset ----
    head = 64  # tokens kept from the start of an over-long text
    tail = 64  # tokens kept from the end of an over-long text
    max_length = head + tail

    # ---- Augmentation ----

    # ---- Loaders ----
    batch_size = 64
    num_workers = 8

    # ---- Model ----
    pretrained = False
    num_classes = 1  # Binary
    hidden_node = 768  # large: 1024, base: 768

# Checkpoints whose path contains this experiment's name, in sorted order.
model_path_list = sorted(mp for mp in MODEL_PATHS if CFG.exp_name in mp)
print("Model PATHs:", model_path_list)
print("Base_Model_Name:", CFG.base_model_name)

# Backbone name -> local directory with its config/tokenizer. A name missing
# from this table leaves CFG.base_model_path unset.
_BASE_MODEL_DIRS = {
    "bertweet-large": "../input/hugging-face-bertweet-large/model",
    "roberta-large-squad2": "../input/hugging-face-roberta-large-squad2/model",
    "roberta-large": "../input/hugging-face-roberta-large/model",
    "albert-large-v2": "../input/hugging-face-albert-large-v2/model",
    "multilingual-toxic-xlm-roberta": "../input/jigsaw-multilingual-toxic-xlm-roberta/model",
}
if CFG.base_model_name in _BASE_MODEL_DIRS:
    CFG.base_model_path = _BASE_MODEL_DIRS[CFG.base_model_name]


# Run inference and collapse the result to one dimension.
naoism4001_seed2 = main().flatten()

In [None]:
# Stack the three seed runs and average them element-wise.
naoism4001 = np.array([
    naoism4001_seed0,
    naoism4001_seed1,
    naoism4001_seed2,
]).mean(axis=0)

# NAOISM4003

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import math
import time
import random
import shutil
import copy
import glob
import collections
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from tqdm.auto import tqdm
from functools import partial
import torch
import torch.nn as nn
from torch.nn import MarginRankingLoss
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingLR
import transformers
from transformers import (AutoModel, AutoTokenizer)
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
import warnings
warnings.filterwarnings("ignore")
from torch.cuda.amp import autocast, GradScaler
import re
from bs4 import BeautifulSoup
import gc
tqdm.pandas()

# Use the GPU when PyTorch can see one, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"


# Competition data directory and the directory holding the checkpoints.
INPUT_PATH = "../input/jigsaw-toxic-severity-rating/"
MODEL_PATH = "../input/jigsaw2021-models-naoism/"
# Every .pth file exactly one sub-directory below MODEL_PATH.
MODEL_PATHS = glob.glob(f"{MODEL_PATH}*/*.pth")
OUTPUT_DIR = '/'
print(MODEL_PATHS)

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Inference settings for experiment ``exp4003``."""

    # ---- Globals ----
    seed = 0
    n_model_seed = 3  # number of checkpoints main() loads
    exp_name = "exp4003"  # substring used to pick checkpoints
    print_freq = 100
    target_col = "target"
    base_model_name = "multilingual-toxic-xlm-roberta"

    # ---- Dataset ----
    head = 64  # tokens kept from the start of an over-long text
    tail = 64  # tokens kept from the end of an over-long text
    max_length = head + tail

    # ---- Augmentation ----

    # ---- Loaders ----
    batch_size = 64
    num_workers = 8

    # ---- Model ----
    pretrained = False
    num_classes = 1  # Binary
    hidden_node = 768  # large: 1024, base: 768

# Checkpoints whose path contains this experiment's name, in sorted order.
model_path_list = sorted(mp for mp in MODEL_PATHS if CFG.exp_name in mp)
print("Model PATHs:", model_path_list)
print("Base_Model_Name:", CFG.base_model_name)

# Backbone name -> local directory with its config/tokenizer. A name missing
# from this table leaves CFG.base_model_path unset.
_BASE_MODEL_DIRS = {
    "bertweet-large": "../input/hugging-face-bertweet-large/model",
    "roberta-large-squad2": "../input/hugging-face-roberta-large-squad2/model",
    "roberta-large": "../input/hugging-face-roberta-large/model",
    "albert-large-v2": "../input/hugging-face-albert-large-v2/model",
    "multilingual-toxic-xlm-roberta": "../input/jigsaw-multilingual-toxic-xlm-roberta/model",
}
if CFG.base_model_name in _BASE_MODEL_DIRS:
    CFG.base_model_path = _BASE_MODEL_DIRS[CFG.base_model_name]

In [None]:
# ====================================================
# Utils
# ====================================================

def seed_torch(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and pin cuDNN behaviour.

    Args:
        seed: Value given to every generator and exported, as a string, in
            the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    # Deterministic kernels on, autotuner off.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def init_logger(log_file=OUTPUT_DIR+"train.log"):
    """Return this module's logger, writing to the console and ``log_file``.

    Messages are formatted as the bare message text at level INFO. Handlers
    left on the logger by an earlier call are removed and closed first, so
    calling this function again does not multiply the output.

    Args:
        log_file: Path of the log file to write to.

    Returns:
        The configured ``logging.Logger``.
    """
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger(__name__) hands back the same object on every call; without
    # this cleanup each re-run would add another pair of handlers.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    for handler in (StreamHandler(), FileHandler(filename=log_file)):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger


LOGGER = init_logger()

def get_score(more_toxic_preds, less_toxic_preds):
    """Return the mean of ``more_toxic_preds > less_toxic_preds``.

    For array inputs this is the fraction of pairs in which the "more toxic"
    prediction is strictly greater than the "less toxic" one.
    """
    correctly_ordered = more_toxic_preds > less_toxic_preds
    return np.mean(correctly_ordered)



def get_result(df):
    """Score the prediction columns of ``df``, log the score and return it.

    Args:
        df: DataFrame with ``more_toxic_preds`` and ``less_toxic_preds``
            columns.

    Returns:
        The value computed by ``get_score`` for the two columns.
    """
    score = get_score(
        df["more_toxic_preds"].values,
        df["less_toxic_preds"].values,
    )
    LOGGER.info(f"Score: {score:<.4f}")
    return score


def read_data():
    """Load the three competition CSV files found under ``INPUT_PATH``.

    Returns:
        Tuple ``(validation_data, test, sub)`` of DataFrames read from
        ``validation_data.csv``, ``comments_to_score.csv`` and
        ``sample_submission.csv`` respectively.
    """
    csv_names = (
        "validation_data.csv",
        "comments_to_score.csv",
        "sample_submission.csv",
    )
    validation_data, test, sub = (
        pd.read_csv(INPUT_PATH + csv_name) for csv_name in csv_names
    )
    return validation_data, test, sub



def text_cleaning(text):
    """Strip links, e-mail addresses, dotted-quad numbers and newlines.

    Steps, applied in this order:
      1. ``http://``/``https://`` and ``www.`` links are deleted.
      2. E-mail addresses are replaced by a single ``.``.
      3. Dotted quads such as ``192.168.0.1`` are deleted.
      4. Newline characters are deleted (nothing is inserted in their place).
      5. Leading and trailing whitespace is stripped.

    Args:
        text: Raw comment string.

    Returns:
        The cleaned string.
    """
    # Website links.
    text = re.sub(r'https?://\S+|www\.\S+', r'', text)
    # E-mail addresses collapse to a lone dot.
    text = re.sub(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", r'.', text)
    # IP addresses. Raw string so that \d and \. reach the regex engine
    # verbatim instead of depending on Python tolerating unknown escapes.
    text = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', r'', text)
    text = text.replace('\n', '')
    return text.strip()


def prepare_input(text, tokenizer):
    """Tokenize ``text`` into fixed-length ``torch.long`` tensors.

    Reads ``CFG.head``, ``CFG.tail`` and ``CFG.max_length``.

    * ``CFG.tail == 0``: the tokenizer itself truncates and pads to
      ``CFG.max_length``.
    * otherwise: the text is tokenized without padding; a sequence longer
      than ``CFG.max_length`` is reduced to its first ``CFG.head`` and last
      ``CFG.tail`` tokens, and every field is right-padded into a buffer of
      ``CFG.max_length`` entries (``input_ids`` with
      ``tokenizer.pad_token_id``, every other field with 0).

    Args:
        text: String to encode.
        tokenizer: Object exposing ``encode_plus`` and ``pad_token_id``.

    Returns:
        The mapping returned by ``encode_plus`` with each value replaced by
        a 1-D ``torch.long`` tensor.
    """
    if CFG.tail == 0:
        inputs = tokenizer.encode_plus(
            text,
            return_tensors=None,
            add_special_tokens=True,
            max_length=CFG.max_length,
            # Replacement for the deprecated `pad_to_max_length=True`.
            padding="max_length",
            truncation=True,
        )
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

    inputs = tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        truncation=True,
    )
    for k, v in inputs.items():
        if len(v) > CFG.max_length:
            # Keep the beginning and the end of an over-long sequence.
            v = list(v[:CFG.head]) + list(v[-CFG.tail:])
        pad_value = tokenizer.pad_token_id if k == 'input_ids' else 0
        new_v = np.full(CFG.max_length, pad_value, dtype=np.int64)
        # Measure the length *after* truncation so the filled span always
        # matches the data being written.
        new_v[:len(v)] = v
        inputs[k] = torch.tensor(new_v, dtype=torch.long)
    return inputs


class TestDataset(Dataset):
    """Inference dataset that tokenizes the ``text`` column on demand.

    Args:
        df: DataFrame with a ``text`` column.
        tokenizer: Tokenizer forwarded to ``prepare_input``.
        max_length: Stored on the instance; not read by this class
            (``prepare_input`` takes its lengths from ``CFG``).
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text = df["text"].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{'ids': ..., 'mask': ...}`` long tensors for row ``idx``."""
        inputs = prepare_input(str(self.text[idx]), self.tokenizer)
        # as_tensor reuses a tensor that already has the requested dtype, so
        # no second copy is made and no copy-construct warning is raised.
        return {
            'ids': torch.as_tensor(inputs["input_ids"], dtype=torch.long),
            'mask': torch.as_tensor(inputs["attention_mask"], dtype=torch.long),
        }


class Model_reg(nn.Module):
    """Transformer encoder followed by a single linear head.

    The head maps ``CFG.hidden_node`` features to ``CFG.num_classes``
    outputs. Dropout is instantiated with probability 0.

    Args:
        modelname_or_path: Identifier handed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, modelname_or_path):
        super().__init__()
        self.base_model = AutoModel.from_pretrained(modelname_or_path)
        self.fc = nn.Linear(CFG.hidden_node, CFG.num_classes)
        self.dropout = nn.Dropout(p=0.)

    def feature(self, input_ids, attention_mask):
        """Return the encoder's first output averaged over dimension 1.

        The average runs over every position; ``attention_mask`` is only
        passed to the encoder and is not used to weight the mean.
        """
        encoder_out = self.base_model(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False,
        )
        token_states = encoder_out[0]
        return token_states.mean(dim=1)

    def forward(self, input_ids, attention_mask=None):
        """Pool the encoder output and apply dropout and the linear head."""
        pooled = self.feature(input_ids, attention_mask)
        return self.fc(self.dropout(pooled))

In [None]:
# ====================================================
# inference
# ====================================================
def inference_fn(test_loader, model, device):
    """Run ``model`` over every batch of ``test_loader`` without gradients.

    Args:
        test_loader: Iterable of batches with ``'ids'`` and ``'mask'`` tensors.
        model: Module called as ``model(ids, mask)``; switched to eval mode
            and moved to ``device``.
        device: Target device for the model and the batch tensors.

    Returns:
        NumPy array with all batch outputs concatenated along the first axis.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    progress = tqdm(test_loader, total=len(test_loader))
    for batch in progress:
        input_ids = batch['ids'].to(device)
        attention_mask = batch['mask'].to(device)
        with torch.no_grad():
            output = model(input_ids, attention_mask)
        batch_outputs.append(output.cpu().numpy())
    return np.concatenate(batch_outputs)


def main():
    """Score the comments with every selected checkpoint and average.

    Relies on module-level state: ``CFG`` (seed, model/tokenizer path, loader
    settings, ``n_model_seed``), ``model_path_list`` (checkpoint files) and
    ``device``.

    Returns:
        ``numpy.ndarray`` with the element-wise mean, over the first
        ``CFG.n_model_seed`` checkpoints, of the arrays returned by
        ``inference_fn``.

    Raises:
        IndexError: If ``model_path_list`` holds fewer than
            ``CFG.n_model_seed`` checkpoints.
    """
    seed_torch(seed=CFG.seed)

    # Fail before any heavy work instead of part-way through the model loop.
    if len(model_path_list) < CFG.n_model_seed:
        raise IndexError(
            f"CFG.n_model_seed={CFG.n_model_seed} but only "
            f"{len(model_path_list)} checkpoint(s) matched: {model_path_list}"
        )

    # Only the comments to score are used here.
    _, test, _ = read_data()
    if len(test) == 7537:
        # NOTE(review): 7537 rows is presumably the public
        # comments_to_score.csv, for which only 100 rows are scored -- confirm.
        # .copy() makes the frame independent of the slice it came from so
        # the column assignment below is unambiguous.
        test = test.iloc[:100].copy()

    print("Text cleaning...")
    test['text'] = test['text'].progress_apply(text_cleaning)

    tokenizer = AutoTokenizer.from_pretrained(CFG.base_model_path)
    print("tokenizer:", tokenizer)

    test_dataset = TestDataset(test, tokenizer, CFG.max_length)
    test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False,
                             num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    predictions = []
    for model_path in model_path_list[:CFG.n_model_seed]:
        model = Model_reg(CFG.base_model_path)
        # Load on CPU first; inference_fn moves the model to `device`.
        state = torch.load(model_path, map_location=torch.device('cpu'))
        model.load_state_dict(state['model'])
        predictions.append(inference_fn(test_loader, model, device))
        # Release the model before the next checkpoint is loaded.
        del model, state
        gc.collect()
        torch.cuda.empty_cache()

    return np.array(np.mean(predictions, axis=0))

In [None]:
# Run inference for exp4003 and collapse the result to one dimension.
naoism4003 = main().flatten()

# Ensemble

In [None]:
# Put every model's predictions on a comparable scale: subtract a hard-coded
# offset and divide by a hard-coded scale, rebinding each name in place.
# NOTE(review): the constants are presumably each model's mean and standard
# deviation on some reference predictions -- confirm where they came from.
opt_calpis001 = (opt_calpis001 - (-6.704694566371928)) / 6.580703873327574
calpis011 = (calpis011 - 0.8956230733023224) / 2.425622948718177
calpis012 = (calpis012 - 0.6895453671041332) / 2.5203015957033124
mst029 = (mst029 - 0.1526331402436699) / 0.3893276159801335
mst030 = (mst030 - 0.14825907568890248) / 0.3999030418536887
msttweet = (msttweet - 0.12312389583278482) / 0.4091648306519364
colum014 = (colum014 - 0.24127438913680962) / 0.44425866838169137
colum015 = (colum015 - (-0.2577407630301099)) / 0.5301155853075479
colum016 = (colum016 - 0.5858787377958661) / 0.8508980633132797
colum018 = (colum018 - 0.22228450849477926) / 0.42716001301281553
colum019 = (colum019 - 0.14911354891415332) / 0.4663198934837049
colum020 = (colum020 - (-0.14148513577689115)) / 0.4680944916684828
naoism4001 = (naoism4001 - 0.17912335923882833) / 0.4534633684561525
naoism4003 = (naoism4003 - 0.1490397266759139) /  0.406784531557102
opt_naoism1004 = (opt_naoism1004 - (-10.102940974843643)) / 6.497813296296825

# Collect the rescaled predictions side by side, one column per model, for
# inspection and for saving to disk.
pred_df = pd.DataFrame({
    'opt_calpis001': opt_calpis001,
    'calpis011': calpis011,
    'calpis012': calpis012,
    'mst029': mst029,
    'mst030': mst030,
    'msttweet': msttweet,
    'colum014': colum014,
    'colum015': colum015,
    'colum016': colum016,
    'colum018': colum018,
    'colum019': colum019,
    'colum020': colum020,
    'naoism4001': naoism4001,
    'naoism4003': naoism4003,
    'opt_naoism1004': opt_naoism1004
})

# `display` is not defined in this section; presumably the notebook built-in.
display(pred_df.describe())
pred_df.to_csv('pred_df.csv', index=False)

# Same 7537 -> first-100-rows shortcut that main() applies to the comments,
# so the submission frame lines up with the prediction vectors.
sub = pd.read_csv('../input/jigsaw-toxic-severity-rating/sample_submission.csv')
if len(sub) == 7537:
    sub = sub.iloc[:100]
    
# Final score: weighted sum of the rescaled predictions with hard-coded
# per-model weights.
# NOTE(review): the weights are presumably the result of an offline search --
# confirm before changing any of them.
sub['score'] = (
    opt_calpis001 * 0.8907374041352457 +
    calpis011 * 0.46794326158461463 +
    calpis012 * 0.0613928819716386 +
    mst029 * 0.14599159334648742 +
    mst030 * 0.9400258577765264 +
    msttweet * 0.2296755364508963 +
    colum014 * 0.46435656956718 +
    colum015 * 0.8156998001053446 +
    colum016 * 0.6003326557886719 +
    colum018 * 0.08731982956387017 +
    colum019 * 0.02050220970508744 +
    colum020 * 0.8002815649764881 +
    naoism4001 * 0.9085892865154106 +
    naoism4003 * 0.0312185722129308 +
    opt_naoism1004 * 0.9998908148400331
)
# Replace the blended value by its rank; method='first' gives tied values
# distinct ranks in order of appearance.
sub['score'] = sub['score'].rank(method='first')
sub.to_csv("submission.csv", index=False)
# Bare expression: shows the frame as the cell output.
sub