- Multi-label learning using the jigsaw-toxic-comment-classification-challenge training dataset
- References
    - https://www.kaggle.com/yasufuminakama/jigsaw4-luke-base-starter-train/notebook
    - https://www.kaggle.com/quincyqiang/download-huggingface-pretrain-for-kaggle
    - https://www.kaggle.com/kishalmandal/jigsaw-fit-multi-label-comment-classifier
 
 
The training notebook is [here](https://www.kaggle.com/mst8823/roberta-base-with-jigsaw-toxic-comment-train).


In [None]:
# ========================================
# Config
# ========================================
class Config:
    # Central experiment settings, read everywhere as class attributes
    # (Config.<name>); the visible code never instantiates this class.

    # Experiment name: used for the Colab output folder, checkpoint file names
    # and log messages.
    name = "roberta-base-with-Jigsaw-Toxic-Comments"  
    # True: the setup cell skips the pretrained-weight download and the main
    # cell skips training/validation and only predicts.
    only_inference = True
    # Passed to AutoModel.from_pretrained / AutoTokenizer.from_pretrained.
    model_name = "../input/roberta-base-with-jigsaw-toxic-comment-train/roberta-base"
    # train_fn / valid_fn print a progress line every `verbose` steps.
    verbose = 200
    # Tokenizer truncation / padding length.
    max_length = 256
    # AdamW learning rate and weight decay.
    lr = 1e-5
    weight_decay = 1e-5
    epochs = 2
    # Number of batches whose gradients are accumulated per optimizer step.
    gradient_accumulation_steps = 1
    # Threshold handed to nn.utils.clip_grad_norm_.
    max_grad_norm = 1000 
    # True: train_fn steps the LR scheduler after every optimizer step.
    batch_scheduler = True
    
    # Scheduler settings consumed by get_scheduler().
    # NOTE: the "scheduler" value is matched verbatim there, so the
    # "scedule" spelling must be kept in sync with get_scheduler.
    # num_training_steps=None means training() fills it in.
    scheduler = dict(
        scheduler="cosine_scedule_with_warmup", 
        num_warmup_steps=100, 
        num_training_steps=None)

    # DataLoader batch sizes.
    train_batch_size = 32
    valid_batch_size = 256
    test_batch_size = 256
    
    # DataLoader worker processes.
    num_workers = 2
    # Direction used by training() when deciding whether to save a checkpoint.
    is_higher_score_better = True  #  Jigsaw01 metrics
    # Number of CV splits, and which of those folds are actually used.
    n_fold = 2
    trn_fold = [0, 1]
    seed = 2022
    # Label columns of the Jigsaw01 training data; one model output per column.
    target_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    # True: train on a 1000-row sample and apply the overrides below the class.
    debug = False

    # Colab Env
    # NOTE(review): submit_from_colab is not read anywhere in the visible code.
    submit_from_colab = False
    # True: the last cell uploads the output folder as a Kaggle dataset.
    upload_from_colab = False
    # kaggle.json holding "username" and "key".
    api_path = "/content/drive/MyDrive/competition/kaggle.json"
    # Root folder on Google Drive for inputs / outputs / submissions.
    drive_path = "/content/drive/MyDrive/competition/Jigsaw-Rate-Severity-of-Toxic-Comments"
    
    # Kaggle Env
    # Dataset whose "model" and "preds" folders are copied into the working dir.
    kaggle_dataset_path = "../input/roberta-base-with-jigsaw-toxic-comment-train"

# Debug overrides: restrict the run to the first fold only.
if Config.debug:
    # Same value as the class default above, so this line currently changes nothing.
    Config.epochs = 2
    Config.trn_fold = [0]

In [None]:
# ========================================
# Library
# ========================================
import os
import json
import warnings
import shutil
import logging
import joblib
import random
import datetime
import sys
import math
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import (
    CosineAnnealingWarmRestarts,
    CosineAnnealingLR, 
    ReduceLROnPlateau
    )

In [None]:
# ========================================
# Utils
# ========================================
class Logger:
    """Write timestamped INFO messages to the console and to a log file.

    Messages go to a stream handler and to ``<path>/Experiment.log``.

    Args:
        path: Existing directory that receives ``Experiment.log``. It is also
            used as the name of the underlying ``logging`` logger, so two
            ``Logger`` objects built with the same path share one logger.
    """

    def __init__(self, path):
        self.general_logger = logging.getLogger(path)
        # logging.getLogger returns the same object for the same name, so
        # handlers are attached only once. Building the FileHandler inside the
        # guard matters: constructing it opens the log file, and doing that on
        # every instantiation would leak one open file handle per call.
        if not self.general_logger.handlers:
            self.general_logger.addHandler(logging.StreamHandler())
            self.general_logger.addHandler(
                logging.FileHandler(os.path.join(path, 'Experiment.log')))
            self.general_logger.setLevel(logging.INFO)

    def info(self, message):
        """Log ``message`` at INFO level, prefixed with the current time."""
        self.general_logger.info('[{}] - {}'.format(self.now_string(), message))

    @staticmethod
    def now_string():
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    
    
def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every generator; it is also exported as
            the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Each entry is an independent "set the seed" call for one RNG.
    for seeder in (random.seed, np.random.seed,
                   torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True


def read_csv(filepath, **kwargs):
    """Read a CSV file, falling back to ``<filepath>.zip`` if it is missing.

    Args:
        filepath: Path to the CSV. If it points at a directory, the file of
            the same name inside that directory is read instead
            (``.../train.csv/`` -> ``.../train.csv/train.csv``).
        **kwargs: Forwarded unchanged to ``pandas.read_csv``.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        FileNotFoundError: If neither ``filepath`` nor ``filepath + ".zip"``
            exists.
    """
    if os.path.isdir(filepath):
        # normpath strips a trailing separator, so the basename is never "".
        filename = os.path.basename(os.path.normpath(filepath))
        filepath = os.path.join(filepath, filename)

    try:
        return pd.read_csv(filepath, **kwargs)
    except FileNotFoundError:
        # Only a missing file triggers the zip fallback; parse errors and
        # other failures now surface instead of being hidden behind a
        # misleading ".zip not found" error.
        return pd.read_csv(filepath + ".zip", **kwargs)

In [None]:
# ========================================
# SetUp
# ========================================
COLAB = "google.colab" in sys.modules

if COLAB:
    print("This environment is Google Colab")
    # import library
    ! pip install --quiet transformers
    ! pip install --quiet iterative-stratification

    # mount
    from google.colab import drive
    if not os.path.isdir("/content/drive"):
        drive.mount('/content/drive') 

    # use kaggle api (need kaggle token)
    f = open(Config.api_path, 'r')
    json_data = json.load(f) 
    os.environ["KAGGLE_USERNAME"] = json_data["username"]
    os.environ["KAGGLE_KEY"] = json_data["key"]

    DRIVE = Config.drive_path
    EXP = (Config.name if Config.name is not None 
           else get("http://172.28.0.2:9000/api/sessions").json()[0]["name"][:-6])  # get notebook name
    INPUT = os.path.join(DRIVE, "Input")
    OUTPUT = os.path.join(DRIVE, "Output")
    SUBMISSION = os.path.join(DRIVE, "Submission")
    OUTPUT_EXP = os.path.join(OUTPUT, EXP) 
    EXP_MODEL = os.path.join(OUTPUT_EXP, "model")
    EXP_FIG = os.path.join(OUTPUT_EXP, "fig")
    EXP_PREDS = os.path.join(OUTPUT_EXP, "preds")

    # all jigsaw input data
    INPUT_JIGSAW_01 = os.path.join(INPUT, "jigsaw-toxic-comment-classification-challenge")
    INPUT_JIGSAW_02 = os.path.join(INPUT, "jigsaw-unintended-bias-in-toxicity-classification")
    INPUT_JIGSAW_03 = os.path.join(INPUT, "jigsaw-multilingual-toxic-comment-classification")
    INPUT_JIGSAW_04 = os.path.join(INPUT, "jigsaw-toxic-severity-rating")
    jigsaw_inputs = [INPUT_JIGSAW_01, INPUT_JIGSAW_02, INPUT_JIGSAW_03, INPUT_JIGSAW_04]

    # make dirs
    for d in [INPUT, SUBMISSION, EXP_MODEL, EXP_FIG, EXP_PREDS] + jigsaw_inputs:
        os.makedirs(d, exist_ok=True)

    if not os.path.isfile(os.path.join(INPUT_JIGSAW_04, "comments_to_score.csv.zip")):
        # load dataset
        ! kaggle competitions download -c jigsaw-toxic-comment-classification-challenge -p $INPUT_JIGSAW_01 
        ! kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification -p $INPUT_JIGSAW_02 
        ! kaggle competitions download -c jigsaw-multilingual-toxic-comment-classification -p $INPUT_JIGSAW_03 
        ! kaggle competitions download -c jigsaw-toxic-severity-rating -p $INPUT_JIGSAW_04 
    
    # utils
    logger = Logger(OUTPUT_EXP)

else:
    print("This environment is Kaggle Kernel")
    ! pip install --quiet ../input/iterative-stratification/iterative-stratification-master
    
    # download pretrain weight(https://www.kaggle.com/quincyqiang/download-huggingface-pretrain-for-kaggle/notebook)
    if not Config.only_inference:
        !curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
        !apt-get install -y --allow-unauthenticated git-lfs
        !git lfs install
        !git clone https://huggingface.co/$Config.model_name
        !GIT_LFS_SKIP_SMUDGE=1

    INPUT = "../input"
    INPUT_JIGSAW_01 = os.path.join(INPUT, "jigsaw-toxic-comment-classification-challenge")
    INPUT_JIGSAW_02 = os.path.join(INPUT, "jigsaw-unintended-bias-in-toxicity-classification")
    INPUT_JIGSAW_03 = os.path.join(INPUT, "jigsaw-multilingual-toxic-comment-classification")
    INPUT_JIGSAW_04 = os.path.join(INPUT, "jigsaw-toxic-severity-rating")

    EXP, OUTPUT, SUBMISSION = "./", "./", "./"
    EXP_MODEL = os.path.join(EXP, "model")
    EXP_FIG = os.path.join(EXP, "fig")
    EXP_PREDS = os.path.join(EXP, "preds")

    if Config.kaggle_dataset_path is not None:
        KD_MODEL = os.path.join(Config.kaggle_dataset_path, "model")
        KD_EXP_PREDS = os.path.join(Config.kaggle_dataset_path, "preds")
        shutil.copytree(KD_MODEL, EXP_MODEL)
        shutil.copytree(KD_EXP_PREDS, EXP_PREDS)

    # make dirs
    for d in [EXP_MODEL, EXP_FIG, EXP_PREDS]:
        os.makedirs(d, exist_ok=True)
        
    # utils
    logger = Logger(EXP)

# utils
# Silence all Python warnings, set the seaborn theme and seed every RNG.
warnings.filterwarnings("ignore")
sns.set(style='whitegrid')
seed_everything(seed=Config.seed)

# 2nd import
# These packages are pip-installed by the setup branch above, so they can only
# be imported once that branch has run.
# NOTE(review): recent transformers releases deprecate/remove `AdamW`; confirm
# against the installed version.
from transformers import AutoTokenizer, AutoModel, AdamW
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from transformers import (
    get_cosine_schedule_with_warmup, 
    get_cosine_with_hard_restarts_schedule_with_warmup,
    get_linear_schedule_with_warmup
)

# set device
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# ========================================
# Load Data
# ========================================
# Jigsaw04 (this competition): texts to score, pairwise validation data and
# the submission template. Jigsaw01: multi-label training data.
comments_to_score = read_csv(os.path.join(INPUT_JIGSAW_04 , "comments_to_score.csv"))
validation_data = read_csv(os.path.join(INPUT_JIGSAW_04 , "validation_data.csv"))
sample_submission = read_csv(os.path.join(INPUT_JIGSAW_04 , "sample_submission.csv"))
train_jigsaw_01 = read_csv(os.path.join(INPUT_JIGSAW_01 , "train.csv"))

# Debug runs train on a 1000-row random sample.
if Config.debug:
    train_jigsaw_01 = train_jigsaw_01.sample(1000).reset_index(drop=True)

# first, add fold index
# Rows keep fold == -1 unless they fall in the held-out part of a fold listed
# in Config.trn_fold.
train_jigsaw_01["fold"] = -1
for i, lst in enumerate(
    MultilabelStratifiedKFold(
        n_splits=Config.n_fold, 
        shuffle=True,
        random_state=Config.seed)
    .split(X=train_jigsaw_01, y=train_jigsaw_01[Config.target_cols])):

    # lst is one split; lst[1] is its second index array (the held-out
    # indices, assuming the scikit-learn (train, test) splitter convention).
    if i in Config.trn_fold:
        train_jigsaw_01.loc[lst[1].tolist(), "fold"] = i
    
display(train_jigsaw_01)

In [None]:
# ========================================
# Dataset
# ========================================
class TrainDataset(Dataset):
    """Labelled text dataset: tokenizes one comment per item.

    Args:
        data: DataFrame holding ``text_col`` and every ``Config.target_cols``
            column.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer).
        text_col: Name of the column that contains the raw text.
    """

    def __init__(self, data, tokenizer, text_col):
        self.tokenizer = tokenizer
        self.comment_text = data[text_col].values
        self.targets = data[Config.target_cols].values

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, targets)`` tensors for row ``idx``."""
        # Every sample is truncated/padded to exactly Config.max_length tokens.
        encoded = self.tokenizer.encode_plus(
            self.comment_text[idx],
            padding='max_length',
            max_length=Config.max_length,
            truncation=True,
        )
        return (
            torch.LongTensor(encoded["input_ids"]),
            torch.BoolTensor(encoded["attention_mask"]),
            torch.tensor(self.targets[idx]).float(),
        )


class TestDataset(Dataset):
    """Unlabelled text dataset: tokenizes one comment per item.

    Args:
        data: DataFrame holding ``text_col``.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer).
        text_col: Name of the column that contains the raw text.
    """

    def __init__(self, data, tokenizer, text_col):
        self.tokenizer = tokenizer
        self.comment_text = data[text_col].values

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` tensors for row ``idx``."""
        # Every sample is truncated/padded to exactly Config.max_length tokens.
        encoded = self.tokenizer.encode_plus(
            self.comment_text[idx],
            padding='max_length',
            max_length=Config.max_length,
            truncation=True,
        )
        return (
            torch.LongTensor(encoded["input_ids"]),
            torch.BoolTensor(encoded["attention_mask"]),
        )

In [None]:
# ========================================
# Model
# ========================================
class Jigsaw01Model(nn.Module):
    """Pretrained transformer backbone with a linear multi-label head.

    The backbone is loaded from ``Config.model_name``; the head emits one
    logit per entry of ``Config.target_cols`` (no sigmoid is applied here).
    """

    def __init__(self, ):
        super(Jigsaw01Model, self).__init__()

        self.model = AutoModel.from_pretrained(Config.model_name)
        self.drop = nn.Dropout(p=0.2)
        # Size the head from the backbone instead of hard-coding 768, so other
        # checkpoints (e.g. "large" variants) work; roberta-base still yields
        # 768, which keeps existing state dicts loadable.
        self.head = nn.Linear(self.model.config.hidden_size, len(Config.target_cols))

    def forward(self, input_ids, attention_mask):
        """Return logits computed from the first token of the backbone output.

        Args:
            input_ids: Token id tensor fed to the backbone.
            attention_mask: Mask tensor fed to the backbone.

        Returns:
            Tensor with ``len(Config.target_cols)`` logits per sample.
        """
        x = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # x[0] is the first backbone output; keep position 0 of every sequence.
        x = x[0][:, 0, :]
        x = self.drop(x)
        x = self.head(x)
        return x

In [None]:
# ========================================
# Funcs
# ========================================
def get_score(y_true, y_pred, verbose=False):
    """Return the mean of the per-column ROC AUC over ``Config.target_cols``.

    Args:
        y_true: 2-D array of labels, one column per target.
        y_pred: 2-D array of scores with the same layout as ``y_true``.
        verbose: When True, print the AUC of every column.

    Returns:
        The average of the column-wise ROC AUC values.
    """
    print(y_true.shape, y_pred.shape)
    if Config.debug:
        # Debug samples are small; forcing the first row to all ones gives
        # every column a positive example. Work on a copy so the caller's
        # label array is not modified.
        y_true = np.array(y_true, copy=True)
        y_true[0] = 1

    scores = []
    for i, col_name in enumerate(Config.target_cols):
        score_i = roc_auc_score(y_true[:, i], y_pred[:, i])
        if verbose:
            print(f"{col_name}={score_i:.4f}")
        scores.append(score_i)
    return np.mean(scores)


class Meter(object):
    """Running weighted average plus elapsed/remaining-time formatting."""

    def __init__(self):
        # Latest value, running mean, weighted sum and total weight.
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times and refresh the running mean."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def time_since(self, since, percent):
        """Describe elapsed and estimated remaining time.

        Args:
            since: Start timestamp as returned by ``time.time()``.
            percent: Fraction of the work finished so far (must be non-zero).
        """
        elapsed = time.time() - since
        # Projected total duration if the current pace holds.
        projected_total = elapsed / percent
        remaining = projected_total - elapsed
        return f"{self.as_minutes(elapsed)} (remain {self.as_minutes(remaining)})"

    @staticmethod
    def as_minutes(s):
        """Format a number of seconds as ``"<m>m <s>s"``."""
        minutes = int(math.floor(s / 60))
        seconds = s - minutes * 60
        return f"{minutes}m {seconds:.1f}s"


def train_fn(
    train_loader, 
    model, 
    criterion, 
    optimizer, 
    epoch, 
    scheduler, 
    device):
    """Train ``model`` for one epoch and return the average training loss.

    Args:
        train_loader: Iterable of ``(input_ids, attention_mask, targets)``.
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        criterion: Loss function applied to ``(predictions, targets)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        epoch: Zero-based epoch index, used only in the progress output.
        scheduler: LR scheduler; stepped after every optimizer step when
            ``Config.batch_scheduler`` is True.
        device: Device the batches are moved to.

    Returns:
        Sample-weighted mean of the (unscaled) batch losses.
    """
    model.train()
    losses = Meter()
    start = time.time()
    accum_steps = Config.gradient_accumulation_steps
    n_steps = len(train_loader)
    # Reported as nan until the first optimizer step has measured it.
    grad_norm = float("nan")
    # Drop gradients left over from an epoch whose length was not a multiple
    # of accum_steps, so they cannot leak into this epoch's first update.
    optimizer.zero_grad()

    for step, (input_ids, attention_mask, targets) in enumerate(train_loader):
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        targets = targets.to(device)
        batch_size = targets.size(0)

        y_preds = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(y_preds, targets)

        # record loss (before scaling, so the reported value is per batch)
        losses.update(loss.item(), batch_size)
        if accum_steps > 1:
            loss = loss / accum_steps

        # backward propagation
        loss.backward()

        if (step + 1) % accum_steps == 0:
            # Clip once per update, on the fully accumulated gradient, rather
            # than on every partial micro-batch gradient.
            grad_norm = nn.utils.clip_grad_norm_(model.parameters(), Config.max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()
            # ReduceLROnPlateau needs a metric and is stepped per epoch by the
            # caller, so it is skipped here.
            if Config.batch_scheduler and not isinstance(scheduler, ReduceLROnPlateau):
                scheduler.step()

        if step % Config.verbose == 0 or step == (n_steps - 1):
            # Read the LR from the optimizer: this works for every scheduler
            # type and avoids the deprecated scheduler.get_lr().
            current_lr = optimizer.param_groups[0]["lr"]
            print(f"Epoch: [{epoch+1}][{step}/{n_steps}] "
                  f"Elapsed {losses.time_since(start, float(step+1)/n_steps)} "
                  f"Loss: {losses.val:.4f}({losses.avg:.4f}) "
                  f"Grad: {grad_norm:.4f} "
                  f"LR: {current_lr:.6f}"
                  )
    return losses.avg



def valid_fn(valid_loader, model, criterion, device):
    """Evaluate ``model`` on ``valid_loader`` without computing gradients.

    Args:
        valid_loader: Iterable of ``(input_ids, attention_mask, targets)``.
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        criterion: Loss function applied to ``(predictions, targets)``.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(average loss, predictions)`` where ``predictions`` is the
        concatenation of the sigmoid outputs of every batch, in loader order.
    """
    model.eval()
    losses = Meter()
    preds = []
    start = time.time()
    n_steps = len(valid_loader)

    # Nothing in this loop needs autograd, the loss included.
    with torch.no_grad():
        for step, (input_ids, attention_mask, targets) in enumerate(valid_loader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            targets = targets.to(device)
            batch_size = targets.size(0)

            # compute loss
            y_preds = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(y_preds, targets)
            losses.update(loss.item(), batch_size)

            # record predictions
            preds.append(y_preds.sigmoid().to("cpu").numpy())

            if step % Config.verbose == 0 or step == (n_steps - 1):
                print(f"Eval: [{step}/{n_steps}] "
                      f"Elapsed {losses.time_since(start, float(step+1)/n_steps)} "
                      f"Loss: {losses.val:.4f}({losses.avg:.4f}) "
                      )
    predictions = np.concatenate(preds)
    return losses.avg, predictions

In [None]:
def training(train_df, valid_df, model, filepath, text_col):
    """Fine-tune ``model`` and save the best-scoring epoch to ``filepath``.

    Args:
        train_df: DataFrame with ``text_col`` and ``Config.target_cols``.
        valid_df: DataFrame with the same columns, scored after every epoch.
        model: Module to train; it is moved to ``DEVICE``.
        filepath: Destination of the best ``state_dict``.
        text_col: Name of the text column in both frames.
    """
    # model & tokenizer
    model.to(DEVICE)
    tokenizer = AutoTokenizer.from_pretrained(Config.model_name)

    va_targets = valid_df[Config.target_cols].values
    train_dataset = TrainDataset(train_df, tokenizer, text_col)
    valid_dataset = TrainDataset(valid_df, tokenizer, text_col)

    train_loader = DataLoader(
        train_dataset,
        batch_size=Config.train_batch_size,
        shuffle=True,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=True)

    valid_loader = DataLoader(
        valid_dataset,
        batch_size=Config.valid_batch_size,
        shuffle=False,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=False
    )

    # optimizer & scheduler
    optimizer = AdamW(
        model.parameters(),
        lr=Config.lr,
        weight_decay=Config.weight_decay)

    # Fill in num_training_steps when the config leaves it as None.
    # The warmup schedulers expect the TOTAL number of optimizer steps (warmup
    # included), so the warmup steps must not be subtracted; with gradient
    # accumulation there is one optimizer step per `accum` batches.
    auto_steps = ("num_training_steps" in Config.scheduler
                  and Config.scheduler["num_training_steps"] is None)
    if auto_steps:
        steps_per_epoch = len(train_loader) // Config.gradient_accumulation_steps
        Config.scheduler["num_training_steps"] = steps_per_epoch * Config.epochs
    try:
        scheduler = get_scheduler(optimizer)
    finally:
        # Restore None so the next call (e.g. another fold with a different
        # number of rows) recomputes the value instead of reusing a stale one.
        if auto_steps:
            Config.scheduler["num_training_steps"] = None

    # loop
    criterion = nn.BCEWithLogitsLoss()
    best_score = -np.inf if Config.is_higher_score_better else np.inf

    for epoch in range(Config.epochs):
        start_time = time.time()

        # train
        avg_loss = train_fn(
            train_loader=train_loader,
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            epoch=epoch,
            scheduler=scheduler,
            device=DEVICE)

        # eval
        avg_val_loss, preds = valid_fn(
            valid_loader=valid_loader,
            model=model,
            criterion=criterion,
            device=DEVICE)

        # Epoch-level scheduler step. Schedulers that train_fn already steps
        # per batch must not be advanced a second time here.
        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(avg_val_loss)
        elif not Config.batch_scheduler:
            scheduler.step()

        # scoring
        score = get_score(va_targets, preds)

        elapsed = time.time() - start_time
        print(f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} "
              f"avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s")
        print(f"Epoch {epoch+1} - Score: {score:.4f}")

        # Ties count as an improvement when higher is better (>=), but not
        # when lower is better (<).
        judge = (score >= best_score if Config.is_higher_score_better
                 else score < best_score)
        if judge:
            best_score = score
            print(f"Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model")
            torch.save(model.state_dict(), filepath)


def inference(test_df, model, filepath, text_col):
    """Load weights from ``filepath`` into ``model`` and predict ``test_df``.

    Args:
        test_df: DataFrame containing ``text_col``.
        model: Module whose ``state_dict`` matches the saved checkpoint.
        filepath: Checkpoint written by ``torch.save(model.state_dict(), ...)``.
        text_col: Name of the text column to score.

    Returns:
        Array of sigmoid probabilities, one row per row of ``test_df``.
    """
    model.eval()
    # Load onto the CPU first, then move the whole model to DEVICE.
    weights = torch.load(filepath, map_location=torch.device("cpu"))
    model.load_state_dict(weights)
    model.to(DEVICE)

    tokenizer = AutoTokenizer.from_pretrained(Config.model_name)
    loader = DataLoader(
        TestDataset(test_df, tokenizer, text_col),
        batch_size=Config.test_batch_size,
        shuffle=False,
        num_workers=Config.num_workers,
        pin_memory=True,
        drop_last=False)

    batch_outputs = []
    for input_ids, attention_mask in tqdm(loader, total=len(loader)):
        with torch.no_grad():
            logits = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))
        batch_outputs.append(logits.sigmoid().detach().cpu().numpy())
    return np.concatenate(batch_outputs)


def get_scheduler(optimizer):
    """Build the LR scheduler selected by ``Config.scheduler["scheduler"]``.

    Args:
        optimizer: Optimizer the scheduler is attached to.

    Returns:
        The scheduler instance, configured from the other keys of
        ``Config.scheduler``.

    Raises:
        NotImplementedError: If the configured name is not one of the four
            supported schedulers.
    """
    settings = Config.scheduler
    kind = settings["scheduler"]

    # The two warmup schedules take identical arguments.
    if kind in ("cosine_scedule_with_warmup", "get_linear_schedule_with_warmup"):
        if kind == "cosine_scedule_with_warmup":
            build = get_cosine_schedule_with_warmup
        else:
            build = get_linear_schedule_with_warmup
        return build(
            optimizer,
            num_warmup_steps=settings["num_warmup_steps"],
            num_training_steps=settings["num_training_steps"],
        )

    if kind == "CosineAnnealingWarmRestarts":
        return CosineAnnealingWarmRestarts(
            optimizer,
            T_0=settings["T_0"],
            T_mult=1,
            eta_min=settings["min_lr"],
            last_epoch=-1)

    if kind == "ReduceLROnPlateau":
        return ReduceLROnPlateau(
            optimizer,
            mode="min",
            factor=settings["factor"],
            patience=settings["patience"],
            verbose=True,
            eps=settings["eps"])

    raise NotImplementedError


def train_cv(train, text_col):
    """Train (or reuse) one model per selected fold and collect OOF predictions.

    Args:
        train: DataFrame with ``text_col``, ``Config.target_cols`` and a
            ``fold`` column.
        text_col: Name of the text column.

    Returns:
        DataFrame of out-of-fold probabilities with one column per target and
        one row per row of ``train``; rows outside the selected folds stay 0.
    """
    oof_df = pd.DataFrame(np.zeros((len(train), len(Config.target_cols))), columns=Config.target_cols)
    for i_fold in range(Config.n_fold):
        if i_fold not in Config.trn_fold:
            continue

        filepath = os.path.join(
            EXP_MODEL,
            f"{Config.name}-seed{Config.seed}-fold{i_fold}.pth")

        # Positional boolean mask: independent of how `train` is indexed, so
        # it can be applied to both `train` and `oof_df`.
        valid_mask = (train["fold"] == i_fold).values
        tr_df = train[~valid_mask].reset_index(drop=True)
        va_df = train[valid_mask].reset_index(drop=True)

        if not os.path.isfile(filepath):  # if trained model, no training
            training(tr_df, va_df, Jigsaw01Model(), filepath, text_col)

        # Always predict with a fresh model loaded from the saved checkpoint.
        preds = inference(va_df, Jigsaw01Model(), filepath, text_col)
        oof_df.loc[valid_mask, Config.target_cols] = preds

        # fold score
        score = get_score(va_df[Config.target_cols].values, preds)
        logger.info(f"{Config.name}-seed{Config.seed}-fold{i_fold} >>>>> Score={score:.4f}")

    # overall score: computed on every row that belongs to a selected fold
    # (previously this re-scored only the last fold's predictions).
    trained_mask = train["fold"].isin(Config.trn_fold).values
    if trained_mask.any():
        score = get_score(
            train.loc[trained_mask, Config.target_cols].values,
            oof_df.loc[trained_mask, Config.target_cols].values)
        logger.info(f"{Config.name}-seed{Config.seed}-OOF-Score >>>>> Score={score:.4f}")

    return oof_df.reset_index(drop=True)


def predict_cv(test, text_col):
    """Predict ``test`` with every selected fold model and average the folds.

    Args:
        test: DataFrame containing ``text_col``.
        text_col: Name of the text column to score.

    Returns:
        Tuple ``(mean_preds, per_fold_df)``: the element-wise mean of the fold
        predictions, and a DataFrame holding every fold's predictions in
        columns named ``FOLDxx=<target>``.
    """
    # One model object is reused; inference() loads each fold's weights into it.
    model = Jigsaw01Model()

    fold_outputs = []
    preds_fold_df = pd.DataFrame()
    for i_fold in range(Config.n_fold):
        if i_fold not in Config.trn_fold:
            continue

        checkpoint = os.path.join(
            EXP_MODEL,
            f"{Config.name}-seed{Config.seed}-fold{i_fold}.pth")
        fold_preds = inference(test, model, checkpoint, text_col)
        fold_outputs.append(fold_preds)

        fold_frame = pd.DataFrame(fold_preds, columns=Config.target_cols)
        fold_frame = fold_frame.add_prefix(f"FOLD{i_fold:02}=")
        preds_fold_df = pd.concat([preds_fold_df, fold_frame], axis=1)

    return np.mean(fold_outputs, axis=0), preds_fold_df


def predict_cv_jigsaw04_validation(validation_data):
    """Score both text columns of the Jigsaw04 validation pairs.

    For ``less_toxic`` and ``more_toxic`` a ``preds=<column>`` column is added
    to ``validation_data`` (modified in place), and the fold-averaged and
    per-fold predictions are written to CSV files in ``EXP_PREDS``.

    Args:
        validation_data: DataFrame with ``less_toxic`` and ``more_toxic`` text
            columns.

    Returns:
        The same ``validation_data`` object with the two new columns.
    """
    for text_col in ("less_toxic", "more_toxic"):
        mean_preds, per_fold_df = predict_cv(validation_data, text_col)

        # Toxicity score = sum over every target column of every fold.
        validation_data[f"preds={text_col}"] = per_fold_df.sum(axis=1)

        # Persist the fold average and the raw per-fold predictions.
        mean_path = os.path.join(EXP_PREDS, f"{text_col}-preds.csv")
        pd.DataFrame(mean_preds, columns=Config.target_cols).to_csv(mean_path, index=False)
        per_fold_path = os.path.join(EXP_PREDS, f"{text_col}-preds_fold_df.csv")
        per_fold_df.to_csv(per_fold_path, index=False)

    return validation_data


def jigsaw04_metrics(preds_less_toxic, preds_more_toxic):
    """Flag every pair where the "less toxic" text got the lower score.

    Pairs are compared by position, so pandas Series with any index, lists
    and arrays are all accepted.

    Args:
        preds_less_toxic: Scores of the texts annotated as less toxic.
        preds_more_toxic: Scores of the texts annotated as more toxic, in the
            same order and of the same length.

    Returns:
        Float array of the same length: 1.0 where
        ``preds_less_toxic < preds_more_toxic`` (strictly), else 0.0.
    """
    less = np.asarray(preds_less_toxic)
    more = np.asarray(preds_more_toxic)
    # Vectorised comparison; ties (and NaN) count as incorrect, as before.
    return (less < more).astype(np.float64)

In [None]:
# ========================================
# Main
# ========================================
# Training and validation are skipped entirely in inference-only mode.
if not Config.only_inference:
    # training
    print("# ---------- # Start Training # ---------- #")
    oof_df = train_cv(train_jigsaw_01, text_col="comment_text")
    oof_df.to_csv(os.path.join(EXP_PREDS, "oof.csv"), index=False)

    # score (Jigsaw01)
    # Only rows that belong to a selected fold have real OOF predictions.
    fold_mask = train_jigsaw_01["fold"].isin(Config.trn_fold)
    score = get_score(
        train_jigsaw_01.loc[fold_mask, Config.target_cols].values, 
        oof_df.loc[fold_mask, Config.target_cols].values, 
        verbose=True)
    logger.info(f"Jigsaw01-MCWROC={score:.4f}")

    # validation (Jigsaw04[This Compe])
    # Fraction of validation pairs where the "less toxic" text scored lower.
    print("# ---------- # Start Validation # ---------- #")
    validation_preds_df = predict_cv_jigsaw04_validation(validation_data)
    validation_preds_df.to_csv(os.path.join(EXP_PREDS, f"validation_preds_df.csv"), index=False)
    scores = jigsaw04_metrics(preds_less_toxic=validation_preds_df["preds=less_toxic"], 
                            preds_more_toxic=validation_preds_df["preds=more_toxic"])
    score = np.mean(scores)
    logger.info(f"Jigsaw04-Jigsaw-Rate-Severity={score:.4f}")

# prediction
# Score the competition texts with every fold model; store the fold average
# and the raw per-fold predictions.
print("# ---------- # Start Inference # ---------- #")
preds, fold_preds_df = predict_cv(comments_to_score, text_col="text")
(pd.DataFrame(preds, columns=Config.target_cols).
 to_csv(os.path.join(EXP_PREDS, f"comments_to_score_preds_df.csv"), index=False))
fold_preds_df.to_csv(os.path.join(EXP_PREDS, f"comments_to_score_fold_preds_df.csv"), index=False)

# Final toxicity score = sum over every target column of every fold (a sum,
# not an average).
comments_to_score["preds"] = fold_preds_df.sum(axis=1)
comments_to_score.to_csv(os.path.join(EXP_PREDS, f"comments_to_score_df.csv"), index=False)

# make submission
# The submitted score is the rank of the summed prediction; method='first'
# breaks ties by order of appearance, so every rank is unique.
print("# ---------- # Make Submission # ---------- #")
sample_submission["score"] = comments_to_score["preds"].rank(method='first')
display(sample_submission)
sample_submission.to_csv(os.path.join(SUBMISSION, "submission.csv"), index=False)
print("# ---------- # Finish Experiment!! # ---------- #")

In [None]:
# upload output folder to kaggle dataset
if Config.upload_from_colab:
    from kaggle.api.kaggle_api_extended import KaggleApi

    def dataset_create_new(dataset_name, upload_dir):
        """Create a new Kaggle dataset from the contents of ``upload_dir``.

        Writes ``dataset-metadata.json`` into ``upload_dir`` (owner taken from
        the ``KAGGLE_USERNAME`` environment variable, CC0-1.0 license), then
        uploads the folder through the Kaggle API with directories archived
        as tar.

        Args:
            dataset_name: Used as both the dataset title and its id suffix.
            upload_dir: Folder to upload.
        """
        dataset_metadata = {}
        dataset_metadata['id'] = f'{os.environ["KAGGLE_USERNAME"]}/{dataset_name}'
        dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
        dataset_metadata['title'] = dataset_name
        with open(os.path.join(upload_dir, 'dataset-metadata.json'), 'w') as f:
            json.dump(dataset_metadata, f, indent=4)
        api = KaggleApi()
        api.authenticate()
        api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

    # Names of 50+ characters are shortened to their first 7 characters, as
    # before; shorter names are used unchanged. Previously dataset_name was
    # never assigned for short names, which raised NameError.
    dataset_name = EXP[:7] if len(EXP) >= 50 else EXP

    dataset_create_new(dataset_name=dataset_name, upload_dir=OUTPUT_EXP)