1. This notebook implements a pairwise toxicity-ranking model trained with a margin ranking loss, using TF / Keras.
2. It uses a TPU for training (and a GPU for inference).
3. The backbone model is roberta-base.

- reference
    - https://www.kaggle.com/yasufuminakama/jigsaw4-luke-base-starter-train/notebook
    - https://www.kaggle.com/quincyqiang/download-huggingface-pretrain-for-kaggle
    - https://www.kaggle.com/its7171/jigsaw-cv-strategy

The training notebook is [here](https://www.kaggle.com/mst8823/tf-keras-pairwise-toxic-model-tpu-train).

In [None]:
# ========================================
# Config
# ========================================
class Config:
    """Experiment settings, read as plain class attributes by the rest of the notebook."""

    # ---- experiment identity / run mode ----
    name = "TF-Pairwise-Toxic-Model"
    only_inference = True
    model_name = "../input/roberta-base"
    margin = 0.5

    # ---- tokenisation: keep the first `head` and the last `tail` tokens ----
    head = 60
    tail = 48
    max_length = head + tail

    # ---- optimisation ----
    lr = 1e-5
    weight_decay = 1e-5
    steps_per_epochs = 32

    # Learning-rate schedule settings; set to None to train without a scheduler.
    scheduler = {
        "scheduler": "get_schedule_with_warmup",
        "num_warmup_steps": 0,
        "num_train_steps": None,
        "min_lr_ratio": 0.1,
        "power": 1.,
    }

    # Early-stopping settings; set to None to disable early stopping.
    early_stop = {
        "monitor": "val_loss",
        "min_delta": 0.,
        "patience": 4,
        "mode": "min",
    }

    epochs = 16
    train_batch_size = 64
    valid_batch_size = 256
    test_batch_size = 256

    # ---- cross validation ----
    n_fold = 10
    trn_fold = list(range(10))
    seed = 2022
    target_col = ["target"]
    debug = False

    # ---- Colab environment ----
    upload_from_colab = False
    api_path = "/content/drive/Shareddrives/Jigsaw-Rate-Severity-of-Toxic-Comments/mst8823/kaggle.json"
    drive_path = "/content/drive/Shareddrives/Jigsaw-Rate-Severity-of-Toxic-Comments/mst8823"

    # ---- Kaggle environment ----
    kaggle_dataset_path = "../input/tf-keras-pairwise-toxic-model-tpu-train"

# Debug mode: shrink the run (2 epochs, a single fold out of 2, batch size 1).
if Config.debug:
    Config.epochs = 2
    Config.n_fold = 2
    Config.trn_fold = [0]
    Config.train_batch_size = 1
    # None makes training_v2 derive the step count from the training-set size.
    Config.steps_per_epochs = None

# Show the length of the experiment name.
print(len(Config.name))

In [None]:
# ========================================
# Library
# ========================================
import os
import json
import warnings
import shutil
import logging
import joblib
import random
import datetime
import sys
import math
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score

import torch
import tensorflow as tf
from tensorflow.keras import backend as K

In [None]:
# ========================================
# Utils
# ========================================
class Logger:
    """Log timestamped INFO messages to the console and to ``<path>/Experiment.log``.

    The underlying ``logging`` logger is keyed by ``path``, so creating several
    ``Logger`` objects for the same path reuses one logger and one set of
    handlers.

    Args:
        path: Existing directory in which ``Experiment.log`` is written.
    """

    def __init__(self, path):
        self.general_logger = logging.getLogger(path)
        # Build the handlers only when they are actually going to be attached.
        # Creating the FileHandler up front would reopen the log file on every
        # re-instantiation (e.g. re-running the cell) and then drop it unclosed.
        if not self.general_logger.handlers:
            self.general_logger.addHandler(logging.StreamHandler())
            self.general_logger.addHandler(
                logging.FileHandler(os.path.join(path, 'Experiment.log')))
            self.general_logger.setLevel(logging.INFO)

    def info(self, message):
        """Log ``message`` at INFO level, prefixed with the current time."""
        self.general_logger.info('[{}] - {}'.format(self.now_string(), message))

    @staticmethod
    def now_string():
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    
    
def seed_everything(seed=42):
    """Seed Python, NumPy, PyTorch and TensorFlow random generators with ``seed``."""
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python / NumPy
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch (CPU and CUDA), and request deterministic cuDNN kernels
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

    # TensorFlow
    tf.random.set_seed(seed)


def read_csv(filepath, **kwargs):
    """Read a CSV file, falling back to ``<filepath>.zip`` when it is missing.

    Args:
        filepath: Path to the CSV file. If it is a directory, the file of the
            same name inside that directory is read instead
            (``dir/name`` -> ``dir/name/name``).
        **kwargs: Forwarded unchanged to ``pandas.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        FileNotFoundError: If neither ``filepath`` nor ``filepath + ".zip"``
            exists. Any other error (e.g. a parse error) is raised as is
            instead of being hidden behind the zip fallback.
    """
    if os.path.isdir(filepath):
        # normpath strips a trailing separator, so the basename is never empty.
        filename = os.path.basename(os.path.normpath(filepath))
        filepath = os.path.join(filepath, filename)

    try:
        return pd.read_csv(filepath, **kwargs)
    except FileNotFoundError:
        # Only a missing file triggers the zipped-copy fallback.
        return pd.read_csv(filepath + ".zip", **kwargs)


def change_dict_key(d, old_key, new_key, default_value=None):
    """Rename ``old_key`` to ``new_key`` in ``d`` in place.

    When ``old_key`` is absent, ``new_key`` is set to ``default_value``.
    """
    value = d.pop(old_key, default_value)
    d[new_key] = value


def auto_select_accelerator(tpu, is_colab=True):
    """Build and return a TPU distribution strategy.

    Args:
        tpu: TPU cluster resolver; only used on the Colab path.
        is_colab: True to connect/initialise the given resolver (Colab),
            False to create a fresh resolver via ``connect()`` (Kaggle kernel).

    Returns:
        The created ``TPUStrategy``.
    """
    if not is_colab:
        # tpu in kaggle kernel: the `tpu` argument is ignored here
        kaggle_resolver = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
        strategy = tf.distribute.experimental.TPUStrategy(kaggle_resolver)
    else:
        # tpu in colab
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.experimental.TPUStrategy(tpu)
        print("Running on TPU:", tpu.master())

    print(f"Running on {strategy.num_replicas_in_sync} replicas")
    return strategy


class GroupKFold:
    """
    GroupKFold with random shuffle with a sklearn-like structure (by katsu1110)

    The unique group ids are split with ``KFold``; every row then follows its
    group, so one group never appears in both the train and validation part.

    Args:
        n_splits: Number of folds.
        shuffle: Whether the unique group ids are shuffled before splitting.
        random_state: Seed for the shuffle; ignored when ``shuffle`` is False.
    """
    def __init__(self, n_splits=4, shuffle=True, random_state=42):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def get_n_splits(self, X=None, y=None, group=None):
        """Return the number of folds (all arguments are ignored)."""
        return self.n_splits

    def split(self, X=None, y=None, groups=None):
        """Yield ``(train_idx, val_idx)`` positional index arrays per fold.

        Args:
            X: Ignored, present for sklearn API compatibility.
            y: Ignored, present for sklearn API compatibility.
            groups: Group id of every row (pandas Series or any 1-D array-like).

        Raises:
            ValueError: If ``groups`` is None.
        """
        if groups is None:
            raise ValueError("The 'groups' parameter should not be None.")
        # Work on a positional Series so lists / arrays are accepted as well.
        groups = pd.Series(np.asarray(groups))

        # KFold rejects a random_state when shuffle is False, so only pass it
        # along when it is actually used.
        kf = KFold(
            n_splits=self.n_splits,
            shuffle=self.shuffle,
            random_state=self.random_state if self.shuffle else None,
        )
        unique_ids = groups.unique()
        for tr_group_idx, va_group_idx in kf.split(unique_ids):
            # split group
            tr_group, va_group = unique_ids[tr_group_idx], unique_ids[va_group_idx]
            train_idx = np.where(groups.isin(tr_group))[0]
            val_idx = np.where(groups.isin(va_group))[0]
            yield train_idx, val_idx


def get_leak_free_valid_idx(train):
    """Assign every comparison pair to a leak-free group (idea: tito notebook).

    Two texts are connected when they appear together in a row of ``train``.
    Each connected set of texts forms one group, so splitting by group keeps
    every text (and everything it was compared with) inside a single fold.

    The groups are found with an iterative union-find over the pairs. This
    avoids the quadratic pair matrix and the deep recursion of a matrix-based
    traversal, which fails with ``RecursionError`` on long chains of pairs.

    Args:
        train: DataFrame with ``less_toxic`` and ``more_toxic`` text columns.
            It is not modified.

    Returns:
        Integer ``pandas.Series`` named ``less_gid`` with the same index as
        ``train``; rows share a value exactly when their texts are connected.
        Ids are numbered in order of first appearance. Only the partition is
        meaningful, not the concrete id values.
    """
    less_texts = train["less_toxic"].tolist()
    more_texts = train["more_toxic"].tolist()

    # Map every distinct text to a dense integer id (first-seen order).
    text2id = {}
    less_ids = [text2id.setdefault(text, len(text2id)) for text in less_texts]
    more_ids = [text2id.setdefault(text, len(text2id)) for text in more_texts]

    parent = list(range(len(text2id)))

    def find(node):
        # Locate the representative of the set containing `node` ...
        root = node
        while parent[root] != root:
            root = parent[root]
        # ... then point every visited node directly at it (path compression).
        while parent[node] != root:
            parent[node], node = root, parent[node]
        return root

    # Merge the two texts of every pair into the same set.
    for less_id, more_id in zip(less_ids, more_ids):
        less_root, more_root = find(less_id), find(more_id)
        if less_root != more_root:
            parent[more_root] = less_root

    # Both texts of a row share a set, so the less-toxic side identifies it.
    root2gid = {}
    group_ids = [root2gid.setdefault(find(less_id), len(root2gid)) for less_id in less_ids]
    return pd.Series(group_ids, index=train.index, name="less_gid", dtype="int64")

In [None]:
# ========================================
# SetUp
# ========================================
COLAB = "google.colab" in sys.modules

if COLAB:
    print("This environment is Google Colab")
    # import library
    ! pip install --quiet transformers
    ! pip install --quiet iterative-stratification
    ! pip install --quiet tensorflow-addons

    # mount
    from google.colab import drive
    if not os.path.isdir("/content/drive"):
        drive.mount('/content/drive') 

    # use kaggle api (need kaggle token)
    f = open(Config.api_path, 'r')
    json_data = json.load(f) 
    os.environ["KAGGLE_USERNAME"] = json_data["username"]
    os.environ["KAGGLE_KEY"] = json_data["key"]

    DRIVE = Config.drive_path
    EXP = (Config.name if Config.name is not None 
           else get("http://172.28.0.2:9000/api/sessions").json()[0]["name"][:-6])  # get notebook name
    INPUT = os.path.join(DRIVE, "Input")
    OUTPUT = os.path.join(DRIVE, "Output")
    SUBMISSION = os.path.join(DRIVE, "Submission")
    OUTPUT_EXP = os.path.join(OUTPUT, EXP) 
    EXP_MODEL = os.path.join(OUTPUT_EXP, "model")
    EXP_FIG = os.path.join(OUTPUT_EXP, "fig")
    EXP_PREDS = os.path.join(OUTPUT_EXP, "preds")

    # all jigsaw input data
    INPUT_JIGSAW_01 = os.path.join(INPUT, "jigsaw-toxic-comment-classification-challenge")
    INPUT_JIGSAW_02 = os.path.join(INPUT, "jigsaw-unintended-bias-in-toxicity-classification")
    INPUT_JIGSAW_03 = os.path.join(INPUT, "jigsaw-multilingual-toxic-comment-classification")
    INPUT_JIGSAW_04 = os.path.join(INPUT, "jigsaw-toxic-severity-rating")
    jigsaw_inputs = [INPUT_JIGSAW_01, INPUT_JIGSAW_02, INPUT_JIGSAW_03, INPUT_JIGSAW_04]

    # make dirs
    for d in [INPUT, SUBMISSION, EXP_MODEL, EXP_FIG, EXP_PREDS] + jigsaw_inputs:
        os.makedirs(d, exist_ok=True)

    if not os.path.isfile(os.path.join(INPUT_JIGSAW_04, "comments_to_score.csv.zip")):
        # load dataset
        ! kaggle competitions download -c jigsaw-toxic-comment-classification-challenge -p $INPUT_JIGSAW_01 
        ! kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification -p $INPUT_JIGSAW_02 
        ! kaggle competitions download -c jigsaw-multilingual-toxic-comment-classification -p $INPUT_JIGSAW_03 
        ! kaggle competitions download -c jigsaw-toxic-severity-rating -p $INPUT_JIGSAW_04 
    
    # utils
    logger = Logger(OUTPUT_EXP)

else:
    print("This environment is Kaggle Kernel")
    ! pip install --quiet ../input/iterative-stratification/iterative-stratification-master
    INPUT = "../input"
    INPUT_JIGSAW_01 = os.path.join(INPUT, "jigsaw-toxic-comment-classification-challenge")
    INPUT_JIGSAW_02 = os.path.join(INPUT, "jigsaw-unintended-bias-in-toxicity-classification")
    INPUT_JIGSAW_03 = os.path.join(INPUT, "jigsaw-multilingual-toxic-comment-classification")
    INPUT_JIGSAW_04 = os.path.join(INPUT, "jigsaw-toxic-severity-rating")

    EXP, OUTPUT, SUBMISSION = "./", "./", "./"
    EXP_MODEL = os.path.join(EXP, "model")
    EXP_FIG = os.path.join(EXP, "fig")
    EXP_PREDS = os.path.join(EXP, "preds")

    if Config.kaggle_dataset_path is not None:
        KD_MODEL = os.path.join(Config.kaggle_dataset_path, "model")
        KD_EXP_PREDS = os.path.join(Config.kaggle_dataset_path, "preds")
        shutil.copytree(KD_MODEL, EXP_MODEL)
        shutil.copytree(KD_EXP_PREDS, EXP_PREDS)

    # make dirs
    for d in [EXP_MODEL, EXP_FIG, EXP_PREDS]:
        os.makedirs(d, exist_ok=True)
        
    # utils
    logger = Logger(EXP)

# utils
# Suppress warnings, set the seaborn plot style and seed all random generators.
warnings.filterwarnings("ignore")
sns.set(style='whitegrid')
seed_everything(seed=Config.seed)

# 2nd import
from transformers import AutoTokenizer, TFAutoModel, WarmUp
import tensorflow_addons as tfa

# set device
# torch device: CUDA when available, otherwise CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# tpu
# Try to resolve a TPU and read its replica count; when this raises ValueError,
# fall back to TPU = None and a single replica.
try:
    TPU = tf.distribute.cluster_resolver.TPUClusterResolver()
    REPLICAS = auto_select_accelerator(TPU, COLAB).num_replicas_in_sync
    
except ValueError:
    TPU = None
    REPLICAS = 1

In [None]:
# ========================================
# Load Data
# ========================================
# Texts to score and the submission template are always needed.
test = read_csv(os.path.join(INPUT_JIGSAW_04 , "comments_to_score.csv"))
sample_submission = read_csv(os.path.join(INPUT_JIGSAW_04 , "sample_submission.csv"))

# The paired training data is only loaded when training is requested.
if not Config.only_inference:
    train = read_csv(os.path.join(INPUT_JIGSAW_04 , "validation_data.csv"))
    if Config.debug:
        # small random subset; the index is reset so positions equal labels below
        train = train.sample(100).reset_index(drop=True)
    # group id per row, so connected texts stay in the same fold
    groups = get_leak_free_valid_idx(train)

    # first, add fold index (-1 = row not assigned to any trained fold)
    train["fold"] = -1
    for i, lst in enumerate(
        GroupKFold(
            n_splits=Config.n_fold,
            shuffle=True, 
            random_state=Config.seed
            )
        .split(X=train, y=train, groups=groups)):

        # lst is (train_idx, val_idx); rows in the validation part get fold i
        if i in Config.trn_fold:
            train.loc[lst[1].tolist(), "fold"] = i
        
    # add target (constant 1 for every pair)
    train[Config.target_col] = 1
    display(train)

In [None]:
# ========================================
# Data Set
# ========================================
def prepare_input(text, tokenizer, head=None, tail=None, max_length=None):
    """Tokenize ``text`` into fixed-length model inputs.

    With ``tail == 0`` the tokenizer itself pads / truncates to ``max_length``.
    Otherwise every sequence longer than ``max_length`` is shortened to its
    first ``head`` and last ``tail`` tokens, and shorter sequences are padded
    on the right (``tokenizer.pad_token_id`` for ``input_ids``, 0 for every
    other key).

    Args:
        text: List of strings to encode.
        tokenizer: Object providing ``batch_encode_plus`` and ``pad_token_id``.
        head: Number of leading tokens to keep; defaults to ``Config.head``.
        tail: Number of trailing tokens to keep; defaults to ``Config.tail``.
        max_length: Output length; defaults to ``Config.max_length``, or to
            ``head + tail`` when ``head`` or ``tail`` is given explicitly.

    Returns:
        Dict mapping each tokenizer output key to one sequence per text
        (integer numpy arrays in the head/tail branch, the tokenizer's own
        lists in the ``tail == 0`` branch).
    """
    window_overridden = head is not None or tail is not None
    head = Config.head if head is None else head
    tail = Config.tail if tail is None else tail
    if max_length is None:
        max_length = head + tail if window_overridden else Config.max_length

    if tail == 0:
        # A zero tail cannot be expressed as a slice (v[-0:] is the whole
        # sequence), so let the tokenizer pad / truncate directly.
        encoded = tokenizer.batch_encode_plus(
            text,
            return_tensors=None,
            add_special_tokens=True,
            max_length=max_length,
            padding="max_length",  # replaces the deprecated pad_to_max_length=True
            truncation=True)
        return dict(encoded)

    encoded = dict(tokenizer.batch_encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        truncation=True))

    for key, sequences in encoded.items():
        fill_value = tokenizer.pad_token_id if key == 'input_ids' else 0

        fixed_length = []
        for seq in sequences:
            if len(seq) > max_length:
                # keep the beginning and the end of an over-long sequence
                seq = list(seq[:head]) + list(seq[-tail:])

            # `int` is what the removed alias `np.int` used to stand for
            padded = np.full(max_length, fill_value, dtype=int)
            padded[:len(seq)] = seq
            fixed_length.append(padded)

        encoded[key] = fixed_length

    return encoded


def get_dataset(X, y=None, dataset="test"):
    """Wrap model inputs in a batched, prefetched ``tf.data.Dataset``.

    Args:
        X: Model inputs accepted by ``from_tensor_slices``.
        y: Targets; used for "train" and "valid" only.
        dataset: "train" (shuffled, repeated when ``Config.steps_per_epochs``
            is set), "valid" or "test" (inputs only).

    Returns:
        The dataset, or None for any other ``dataset`` value.
    """
    autotune = tf.data.experimental.AUTOTUNE

    if dataset == "test":
        unlabelled = tf.data.Dataset.from_tensor_slices(X)
        return unlabelled.batch(Config.test_batch_size * REPLICAS).prefetch(autotune)

    if dataset == "valid":
        labelled = tf.data.Dataset.from_tensor_slices((X, y))
        return labelled.batch(Config.valid_batch_size * REPLICAS).prefetch(autotune)

    if dataset == "train":
        labelled = tf.data.Dataset.from_tensor_slices((X, y))
        pipeline = (
            labelled
            .shuffle(2048)
            .batch(Config.train_batch_size * REPLICAS)
            .prefetch(autotune)
        )
        # a fixed number of steps per epoch needs an endless stream of batches
        if Config.steps_per_epochs is not None:
            pipeline = pipeline.repeat()
        return pipeline

    return None

In [None]:
# ========================================
# Model
# ========================================
def build_toxic_model():
    """Build the single-text scoring model on top of a TFAutoModel.

    Inputs are ``input_ids`` and ``attention_mask`` of length
    ``Config.max_length``; the output is one unbounded score per text.

    Returns:
        The (uncompiled) ``tf.keras.Model``.
    """
    transformer = TFAutoModel.from_pretrained(Config.model_name)
    input_word_ids = tf.keras.layers.Input(shape=(Config.max_length, ), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(Config.max_length, ), dtype=tf.int32, name='attention_mask')

    x = transformer(input_word_ids, attention_mask=attention_mask)
    # second element of the transformer output — presumably the pooled
    # sentence representation; verify for the chosen model
    x = x[1]
    x = tf.keras.layers.Dropout(0.2)(x)

    # linear head producing a single score
    output = tf.keras.layers.Dense(1)(x)
    model = tf.keras.Model(inputs=[input_word_ids, attention_mask],
                                  outputs=[output])
    
    return model


def build_pairwise_toxic_model(toxic_model, optimizer=None):
    """Wrap ``toxic_model`` into a pairwise model that outputs a score difference.

    The same ``toxic_model`` (shared weights) scores the less-toxic and the
    more-toxic text; the output is ``more_toxic_score - less_toxic_score``.

    Args:
        toxic_model: Single-text scoring model taking a dict with
            ``input_ids`` and ``attention_mask``.
        optimizer: When given, the model is compiled with it, using
            ``margin_ranking_loss`` and ``custom_accuracy``.

    Returns:
        The pairwise ``tf.keras.Model`` (compiled only if ``optimizer`` is set).
    """
    less_toxic_input_ids = tf.keras.layers.Input(shape=(Config.max_length, ), dtype=tf.int32, name='less_toxic_input_ids')
    less_toxic_attention_mask = tf.keras.layers.Input(shape=(Config.max_length, ), dtype=tf.int32, name='less_toxic_attention_mask')
    less_toxic_inputs = {"input_ids":less_toxic_input_ids, "attention_mask":less_toxic_attention_mask}

    more_toxic_input_ids = tf.keras.layers.Input(shape=(Config.max_length, ), dtype=tf.int32, name='more_toxic_input_ids')
    more_toxic_attention_mask = tf.keras.layers.Input(shape=(Config.max_length, ), dtype=tf.int32, name='more_toxic_attention_mask')
    more_toxic_inputs = {"input_ids":more_toxic_input_ids, "attention_mask":more_toxic_attention_mask}

    # both sides go through the same scoring model
    less_toxic_preds = toxic_model(less_toxic_inputs)
    more_toxic_preds = toxic_model(more_toxic_inputs)
    # positive when the more-toxic text received the higher score
    rank_diff = more_toxic_preds - less_toxic_preds

    model = tf.keras.Model(
        inputs=[less_toxic_input_ids, less_toxic_attention_mask, more_toxic_input_ids, more_toxic_attention_mask], 
        outputs=[rank_diff]
        )

    if optimizer is not None:
        model.compile(optimizer=optimizer, loss=margin_ranking_loss, metrics=custom_accuracy)

    return model


def margin_ranking_loss(true_rank_diff, preds_rank_diff):
    """Element-wise hinge loss ``max(margin - target * predicted_diff, 0)``.

    ``target`` is ``true_rank_diff`` cast to float32 and the margin is
    ``Config.margin``.
    """
    target = tf.cast(true_rank_diff, tf.float32)
    signed_diff = -target * preds_rank_diff
    return tf.math.maximum(Config.margin + signed_diff, 0)


def custom_accuracy(t, rank_diff): 
    """Fraction of pairs whose predicted score difference is positive.

    ``t`` (the targets) is accepted for the Keras metric signature but unused.
    """
    is_positive = rank_diff > 0
    hits = tf.math.reduce_sum(tf.where(is_positive, 1, 0))
    misses = tf.math.reduce_sum(tf.where(is_positive, 0, 1))
    return hits / (hits + misses)


def get_tokenizer():
    """Load the tokenizer that belongs to ``Config.model_name``."""
    tokenizer = AutoTokenizer.from_pretrained(Config.model_name)
    return tokenizer


In [None]:
def get_score_v2(df):
    """Return the share of rows where ``less_toxic_pred < more_toxic_pred``."""
    correctly_ordered = df["less_toxic_pred"] < df["more_toxic_pred"]
    return int(correctly_ordered.sum()) / len(df)


def get_scheduler():
    """Create the learning-rate scheduler described by ``Config.scheduler``.

    Supported ``Config.scheduler["scheduler"]`` values:

    * ``"ReduceLROnPlateau"`` -> returns a Keras callback.
    * ``"CosineDecayRestarts"`` -> returns a ``LearningRateScheduler`` callback.
    * ``"get_schedule_with_warmup"`` -> returns ``(schedule, optimizer)``, where
      the optimizer is an AdamW instance driven by a polynomial-decay schedule
      (optionally preceded by a warm-up phase).

    Side effect: when ``Config.scheduler["num_train_steps"]`` is None it is
    filled in with ``Config.steps_per_epochs * Config.epochs``.

    Raises:
        NotImplementedError: For any other scheduler name.
    """
    settings = Config.scheduler
    scheduler_name = settings["scheduler"]

    # Set the total number of training steps if it is None. Warm-up steps are
    # subtracted once, when decay_steps is computed below, not here as well.
    if "num_train_steps" in settings and settings["num_train_steps"] is None:
        settings["num_train_steps"] = Config.steps_per_epochs * Config.epochs

    if scheduler_name == "ReduceLROnPlateau":
        return tf.keras.callbacks.ReduceLROnPlateau(
            monitor="val_loss",
            factor=settings["factor"],
            patience=settings["patience"],
            min_lr=settings["min_lr"],
            verbose=1
            )

    # Compare the scheduler *name*; comparing the whole settings dict with a
    # string is always False and made this branch unreachable.
    if scheduler_name == "CosineDecayRestarts":
        cosine_decay_restarts = tf.keras.experimental.CosineDecayRestarts(
            Config.lr,
            first_decay_steps=settings["first_decay_step"],
            t_mul=settings["t_mul"],
            m_mul=settings["m_mul"],
            alpha=settings["alpha"]
            )
        return tf.keras.callbacks.LearningRateScheduler(cosine_decay_restarts, verbose=1)

    if scheduler_name == "get_schedule_with_warmup":
        scheduler = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=Config.lr,
            decay_steps=settings["num_train_steps"] - settings["num_warmup_steps"],
            end_learning_rate=Config.lr * settings["min_lr_ratio"],
            power=settings["power"],
        )

        if settings["num_warmup_steps"] > 0:
            scheduler = WarmUp(
                initial_learning_rate=Config.lr,
                decay_schedule_fn=scheduler,
                warmup_steps=settings["num_warmup_steps"],
            )
        optimizer = tfa.optimizers.AdamW(learning_rate=scheduler, weight_decay=Config.weight_decay)
        return scheduler, optimizer

    raise NotImplementedError


def get_pairwise_tocix_inputs(df, tokenizer):
    """Tokenize both texts of every pair and merge them into one input dict.

    The ``input_ids`` / ``attention_mask`` entries of each side are renamed
    with a ``less_toxic_`` / ``more_toxic_`` prefix. Missing texts are
    replaced by "none" before tokenizing.
    """
    pairwise_inputs = {}
    for side in ("less_toxic", "more_toxic"):
        encoded = prepare_input(df[side].fillna("none").tolist(), tokenizer=tokenizer)
        for key in ("input_ids", "attention_mask"):
            change_dict_key(encoded, key, f"{side}_{key}")
        pairwise_inputs.update(encoded)

    return pairwise_inputs


def training_v2(train_df, valid_df, filepath):
    """Train the pairwise model on one fold and store the scorer weights.

    The best pairwise checkpoint (lowest ``val_loss``) is written to
    ``filepath`` during training. Afterwards it is reloaded and ``filepath``
    is overwritten with the weights of the single-text scoring model only.

    Args:
        train_df: Training pairs (``less_toxic``, ``more_toxic`` and the
            ``Config.target_col`` columns).
        valid_df: Validation pairs with the same columns.
        filepath: Weights file to write.
    """
    if Config.steps_per_epochs is None:
        # One training step consumes one *global* batch (per-replica batch size
        # times REPLICAS, as batched in get_dataset), so divide by that; keep
        # at least one step for data sets smaller than a global batch.
        global_batch_size = Config.train_batch_size * REPLICAS
        Config.steps_per_epochs = max(1, len(train_df) // global_batch_size)

    # model setting
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath,
        monitor="val_loss",
        verbose=1,
        save_best_only=True,
        save_weights_only=True,
        mode="min")

    callbacks = [checkpoint]
    optimizer = tfa.optimizers.AdamW(learning_rate=Config.lr, weight_decay=Config.weight_decay)
    if Config.scheduler is not None:
        if Config.scheduler["scheduler"] == "get_schedule_with_warmup":
            # this schedule lives inside the optimizer, so take the new optimizer
            _, optimizer = get_scheduler()
        else:
            # every other scheduler is a Keras callback
            scheduler = get_scheduler()
            callbacks += [scheduler]

    if Config.early_stop is not None:
        early_stop = tf.keras.callbacks.EarlyStopping(
            monitor=Config.early_stop["monitor"],
            min_delta=Config.early_stop["min_delta"],
            patience=Config.early_stop["patience"],
            mode=Config.early_stop["mode"],
            verbose=1
        )
        callbacks += [early_stop]

    # get model :TPU setting
    if TPU:
        tpu_strategy = auto_select_accelerator(TPU, COLAB)
        with tpu_strategy.scope():
            toxic_model = build_toxic_model()
            pairwise_toxic_model = build_pairwise_toxic_model(toxic_model, optimizer)

    else:
        toxic_model = build_toxic_model()
        pairwise_toxic_model = build_pairwise_toxic_model(toxic_model, optimizer)

    # tf dataset
    tokenizer = get_tokenizer()
    tr_x_inputs = get_pairwise_tocix_inputs(train_df, tokenizer)
    va_x_inputs = get_pairwise_tocix_inputs(valid_df, tokenizer)
    tr_y = train_df[Config.target_col].values
    va_y = valid_df[Config.target_col].values

    tr_dataset = get_dataset(X=tr_x_inputs, y=tr_y, dataset="train")
    va_dataset = get_dataset(X=va_x_inputs, y=va_y, dataset="valid")

    # training pairwise toxic model
    pairwise_toxic_model.fit(
        tr_dataset,
        epochs=Config.epochs,
        verbose=1,
        callbacks=callbacks,
        validation_data=va_dataset,
        steps_per_epoch=Config.steps_per_epochs)

    # save toxic model weights: rebuild fresh models, load the best pairwise
    # checkpoint into them, then overwrite the file with the scorer weights only
    toxic_model = build_toxic_model()
    pairwise_toxic_model = build_pairwise_toxic_model(toxic_model)
    pairwise_toxic_model.load_weights(filepath)  # use check point weights
    toxic_model.save_weights(filepath)
    

def inference_v2(text, filepath):
    """Score ``text`` with the single-text model whose weights are in ``filepath``.

    Args:
        text: List of strings to score.
        filepath: Weights file of the scoring model.

    Returns:
        The output of ``model.predict`` on the tokenized texts.
    """
    tokenizer = get_tokenizer()

    # get model :TPU setting
    if not TPU:
        toxic_model = build_toxic_model()
    else:
        strategy = auto_select_accelerator(TPU, COLAB)
        with strategy.scope():
            toxic_model = build_toxic_model()

    toxic_model.load_weights(filepath)

    encoded_text = prepare_input(text, tokenizer)
    te_dataset = get_dataset(encoded_text, dataset="test")
    return toxic_model.predict(te_dataset)


def train_cv_v2(train):
    """Train (if needed) and evaluate every fold in ``Config.trn_fold``.

    For each selected fold the model is trained unless its weights file
    already exists, the unique validation texts are scored, and the scores are
    joined back onto the validation pairs to fill the out-of-fold table.

    Args:
        train: Pair DataFrame with ``worker``, ``less_toxic``, ``more_toxic``
            and ``fold`` columns.

    Returns:
        Out-of-fold DataFrame with one row per row of ``train``; rows of folds
        that were not processed keep their initial zeros.
    """

    oof_cols = ["worker", "less_toxic", "more_toxic", "fold", 
                "less_toxic_pred", "more_toxic_pred"]
    # NOTE(review): the table starts as float zeros and later receives text
    # columns; recent pandas versions may warn about this dtype change — confirm.
    oof_df = pd.DataFrame(np.zeros((len(train), len(oof_cols))), columns=oof_cols)

    for i_fold in range(Config.n_fold):
        if i_fold in Config.trn_fold:
            K.clear_session()
            filepath = os.path.join(
                        EXP_MODEL,
                        f"{Config.name}-seed{Config.seed}-fold{i_fold}.h5")
            
            # rows of the current fold are validation, everything else is training
            tr_df, va_df = (train[train["fold"] != i_fold].reset_index(drop=True),
                            train[train["fold"] == i_fold].reset_index(drop=True))
                        
            if not os.path.isfile(filepath):  # if trained model, no training
                training_v2(tr_df, va_df, filepath)
            
            # score every distinct validation text once
            va_text = list(sorted(set(va_df["less_toxic"].unique()) | set(va_df["more_toxic"].unique())))
            va_preds = inference_v2(va_text, filepath)
            _df = pd.DataFrame({"text":va_text, "pred":np.concatenate(va_preds)})
 
            # attach the score of the less-toxic text ...
            preds_df = pd.merge(
                va_df, 
                _df.rename(columns={"text": "less_toxic", "pred": "less_toxic_pred"}),
                on="less_toxic", how="left")
            
            # ... and of the more-toxic text
            preds_df = pd.merge(
                preds_df, 
                _df.rename(columns={"text": "more_toxic", "pred": "more_toxic_pred"}),
                on="more_toxic", how="left")
            # written positionally: relies on preds_df keeping the row order of va_df
            oof_df.loc[train["fold"] == i_fold, oof_cols] = preds_df[oof_cols].values

            # fold score
            score = get_score_v2(preds_df)
            logger.info(f"{Config.name}-seed{Config.seed}-fold{i_fold} >>>>> Score={score:.4f}")

    # overall score (computed over all rows, including folds that were not processed)
    score = get_score_v2(oof_df[["less_toxic_pred", "more_toxic_pred"]])
    logger.info(f"{Config.name}-seed{Config.seed}-OOF-Score >>>>> Score={score:.4f}")

    return oof_df.reset_index(drop=True)


def predict_cv_v2(text):
    """Score ``text`` with the model of every fold listed in ``Config.trn_fold``.

    Returns:
        Tuple ``(fold_predictions, fold_table)``: the list of raw prediction
        arrays (one per fold) and a DataFrame with one ``FOLD=xx`` column per
        fold.
    """
    K.clear_session()

    fold_predictions = []
    fold_table = pd.DataFrame()

    for fold in range(Config.n_fold):
        if fold not in Config.trn_fold:
            continue

        weights_path = os.path.join(
            EXP_MODEL,
            f"{Config.name}-seed{Config.seed}-fold{fold}.h5")

        fold_pred = inference_v2(text, weights_path)
        fold_predictions.append(fold_pred)
        fold_table[f"FOLD={fold:02}"] = np.concatenate(fold_pred)

    return fold_predictions, fold_table

In [None]:
# ========================================
# Main
# ========================================
# Training (skipped in inference-only mode): run cross validation, save the
# out-of-fold predictions and log the score over the folds that were trained.
if not Config.only_inference:
    # training
    print("# ---------- # Start Training # ---------- #")
    oof_df = train_cv_v2(train)
    oof_df.to_csv(os.path.join(EXP_PREDS, "oof.csv"), index=False)

    # restrict the score to rows that belong to a trained fold
    fold_mask = train["fold"].isin(Config.trn_fold)
    score = get_score_v2(oof_df.loc[fold_mask, ["less_toxic_pred", "more_toxic_pred"]])
    logger.info(f"Jigsaw04-Jigsaw-Rate-Severity={score:.4f}")

# prediction: score the test comments with every fold model and save per-fold scores
print("# ---------- # Start Inference # ---------- #")
preds_fold, preds_fold_df = predict_cv_v2(test["text"].fillna("none").tolist())
preds_fold_df.to_csv(os.path.join(EXP_PREDS, f"comments_to_score_preds_fold_df.csv"), index=False)

# make submission: average the fold predictions, then convert the scores to ranks
print("# ---------- # Make Submission # ---------- #")
sample_submission["score"] = np.mean(preds_fold, axis=0)
sample_submission["score"] = sample_submission["score"].rank(method='first') # to rank
display(sample_submission)
# Colab keeps one file per experiment name; otherwise the file is submission.csv
filename = Config.name + ".csv" if COLAB else "submission.csv"
sample_submission.to_csv(os.path.join(SUBMISSION, filename), index=False)
print("# ---------- # Finish Experiment!! # ---------- #")

In [None]:
# upload output folder to kaggle dataset
if Config.upload_from_colab:
    from kaggle.api.kaggle_api_extended import KaggleApi

    def dataset_create_new(dataset_name, upload_dir):
        """Write ``dataset-metadata.json`` into ``upload_dir`` and create a Kaggle dataset from it.

        Args:
            dataset_name: Used as both the dataset slug (under
                ``KAGGLE_USERNAME``) and its title.
            upload_dir: Directory whose content is uploaded.
        """
        dataset_metadata = {}
        dataset_metadata['id'] = f'{os.environ["KAGGLE_USERNAME"]}/{dataset_name}'
        dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
        dataset_metadata['title'] = dataset_name
        with open(os.path.join(upload_dir, 'dataset-metadata.json'), 'w') as f:
            json.dump(dataset_metadata, f, indent=4)
        api = KaggleApi()
        api.authenticate()
        api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

    # NOTE(review): names of 50+ characters are cut to their first 7
    # characters — presumably a length limit on dataset titles; confirm that
    # 7 (rather than ~50) is the intended cut-off.
    if len(EXP) >= 50:
        dataset_name = EXP[:7]
    else:
        dataset_name = EXP

    # OUTPUT_EXP is only defined on the Colab branch of the setup cell
    dataset_create_new(dataset_name=dataset_name, upload_dir=OUTPUT_EXP)