## Pytorch-Lightning & Longformer Baseline


- **Notebook**
    - pytorch lightning 
    - IO Tagging using custom tokenizer
    - backbone : longformer (max_length : 1024)
    - optimizer : AdamW
    - postprocessing × 2
    
    
- **Reference**
    - https://www.kaggle.com/cdeotte/tensorflow-longformer-ner-cv-0-633
    - https://www.kaggle.com/vuxxxx/tensorflow-longformer-ner-postprocessing


Inference notebook -> https://www.kaggle.com/mst8823/pytorch-lightning-longformer-io-tagging-infer

In [None]:
class Config:
    # Experiment-wide settings, read as plain class attributes.

    # ---- experiment identity ----
    competition = "feedback-prize-2021"
    name = "Longformer-Baseline"
    debug = False
    inference_only = False

    # ---- backbone ----
    model_name = "allenai/longformer-base-4096"
    max_length = 1024
    hidden_size = 768

    # ---- cross validation ----
    n_fold = 10
    trn_fold = [0]
    seed = 2022

    # ---- training loop ----
    max_epochs = 8
    gradient_clip_val = 100
    accumulate_grad_batches = 1
    early_stopping = False
    optimizer = {
        "optimizer": "AdamW",
        "lr": 2e-5,
        "weight_decay": 1e-5,
    }
    scheduler = {
        "interval": "epoch",
        "scheduler": "MultiStepLR",
        "milestones": [2, 3, 4, 5, 6, 7],
        "gamma": 0.5,
    }
    # Step-interval scheduler example:
    #   scheduler = dict(
    #       interval = "step",
    #       scheduler="get_cosine_schedule_with_warmup",
    #       num_warmup_steps=256,
    #       num_cycles=0.5)

    # ---- data loading ----
    train_batch_size = 4
    valid_batch_size = 4
    num_workers = 4
    resume_from_checkpoint = None

    # ---- Colab / Kaggle plumbing ----
    colab_dir = ""
    api_path = f"{colab_dir}kaggle.json"
    drive_path = f"{colab_dir}mst8823"
    upload_from_colab = False

    kaggle_dataset_path = None


if Config.debug:
    # Shrink batch size, epochs and sequence length for a quick smoke run.
    Config.train_batch_size = Config.valid_batch_size = 2
    Config.max_epochs = 2
    Config.max_length = 128

In [None]:
import os
import re
import sys
import logging
import shutil
import json
import datetime
import requests
import itertools
import functools
import warnings

import pandas as pd
import numpy as np
import spacy

from tqdm.auto import tqdm
from sklearn.model_selection import KFold

import torch
import torch.nn as nn
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import (
    CosineAnnealingWarmRestarts,
    CosineAnnealingLR,
    MultiStepLR, 
    ReduceLROnPlateau
    )
from torch.utils.data import Dataset, DataLoader

In [None]:
class Logger:
    """Timestamped logger that writes to the console and to ``<path>/Experiment.log``.

    ref) https://github.com/ghmagazine/kagglebook/blob/master/ch04-model-interface/code/util.py

    The underlying ``logging`` logger is keyed by ``path``, so creating several
    ``Logger`` objects for the same directory reuses one configured logger.
    """
    def __init__(self, path):
        """
        Args:
            path: existing directory that receives ``Experiment.log``; it is
                also used as the name of the ``logging`` logger.
        """
        self.general_logger = logging.getLogger(path)
        # Build the handlers only when the logger is not configured yet.
        # Creating a FileHandler opens the log file, so constructing one
        # unconditionally would leak a file handle on every re-instantiation.
        if not self.general_logger.handlers:
            self.general_logger.addHandler(logging.StreamHandler())
            self.general_logger.addHandler(
                logging.FileHandler(os.path.join(path, 'Experiment.log')))
            self.general_logger.setLevel(logging.INFO)

    def info(self, message):
        """Log ``message`` at INFO level, prefixed with the current time."""
        self.general_logger.info('[{}] - {}'.format(self.now_string(), message))

    @staticmethod
    def now_string():
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))


def setup(cfg):
    
    cfg.COLAB = "google.colab" in sys.modules
    if cfg.COLAB:
        print("This environment is Google Colab")
        
        # mount
        from google.colab import drive
        if not os.path.isdir("/content/drive"):
            drive.mount('/content/drive') 
        
        # import library
        ! pip install --quiet transformers
        ! pip install --quiet tensorflow-addons
        ! pip install --quiet pytorch_lightning
        ! pip install --quiet wandb

        # use kaggle api (need kaggle token)
        f = open(cfg.api_path, 'r')
        json_data = json.load(f) 
        os.environ["KAGGLE_USERNAME"] = json_data["username"]
        os.environ["KAGGLE_KEY"] = json_data["key"]
        
        # set dirs
        cfg.DRIVE = cfg.drive_path
        cfg.EXP = (cfg.name if cfg.name is not None 
            else requests.get("http://172.28.0.2:9000/api/sessions").json()[0]["name"][:-6])
        cfg.INPUT = os.path.join(cfg.DRIVE, "Input")
        cfg.OUTPUT = os.path.join(cfg.DRIVE, "Output")
        cfg.SUBMISSION = os.path.join(cfg.DRIVE, "Submission")
        cfg.OUTPUT_EXP = os.path.join(cfg.OUTPUT, cfg.EXP) 
        cfg.EXP_MODEL = os.path.join(cfg.OUTPUT_EXP, "model")
        cfg.EXP_FIG = os.path.join(cfg.OUTPUT_EXP, "fig")
        cfg.EXP_PREDS = os.path.join(cfg.OUTPUT_EXP, "preds")

        # make dirs
        for d in [cfg.INPUT, cfg.SUBMISSION, cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
            os.makedirs(d, exist_ok=True)

        if not os.path.isfile(os.path.join(cfg.INPUT, "train.csv")):
            # load dataset
            ! pip install --upgrade --force-reinstall --no-deps kaggle
            ! kaggle competitions download -c cfg.competition -p $INPUT
            ! unzip -d $INPUT $INPUT/$cfg.competition.zip 

    else:
        print("This environment is Kaggle Kernel")
        if not cfg.inference_only:
            ! pip install --quiet pytorch_lightning==1.5.8 
        
        # set dirs
        cfg.INPUT = f"../input/{cfg.competition}"
        
        cfg.EXP = cfg.OUTPUT_EXP = cfg.name
        cfg.SUBMISSION = "./"
        cfg.EXP_MODEL = os.path.join(cfg.EXP, "model")
        cfg.EXP_FIG = os.path.join(cfg.EXP, "fig")
        cfg.EXP_PREDS = os.path.join(cfg.EXP, "preds")

        if cfg.kaggle_dataset_path is not None:
            shutil.copytree(cfg.kaggle_dataset_path, cfg.EXP)

        # make dirs
        for d in [cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
            os.makedirs(d, exist_ok=True)

    # set device    
    cfg.DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    warnings.filterwarnings("ignore")

    return cfg


def get_target_labels(cfg):
    """Attach the label vocabulary and both id<->label lookups to ``cfg``.

    Sets ``cfg.LABELS`` (index 0 is the outside tag ``"O"``),
    ``cfg.IDS2LABELS`` and ``cfg.LABELS2IDS``, then returns ``cfg``.
    """
    discourse_types = (
        "Lead",
        "Position",
        "Claim",
        "Counterclaim",
        "Rebuttal",
        "Evidence",
        "Concluding Statement",
    )
    cfg.LABELS = ["O", *discourse_types]

    cfg.IDS2LABELS = dict(enumerate(cfg.LABELS))
    cfg.LABELS2IDS = {name: idx for idx, name in cfg.IDS2LABELS.items()}

    return cfg

In [None]:
# =========================
# SetUp
# =========================
# Resolve the environment and paths first: setup() performs the pip installs
# that the imports further down rely on.
Config = setup(Config)
Config = get_target_labels(cfg=Config)
# Logger writes Experiment.log into the experiment output directory.
Config.logger = Logger(Config.OUTPUT_EXP)

# 2nd import
# These packages are imported only after setup() has run (see above).
from transformers import (LongformerConfig, LongformerModel, LongformerTokenizerFast)
from transformers import (get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup)

import pytorch_lightning as pl
import wandb

# wandb setting
if not Config.COLAB:
    # On Kaggle the API key is read from the kernel secret named "WANDB_API".
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("WANDB_API")
    wandb.login(key=api_key)
else:
    # On Colab wandb.login() is called without an explicit key.
    wandb.login()

In [None]:
# =============================
# Custom Tokenizer
# =============================
class IOTokenizerForNER(LongformerTokenizerFast):
    """Longformer fast tokenizer extended with IO-tagging helpers.

    ref: bert-book (japanese)
    https://github.com/stockmarkteam/bert-book/blob/master/Chapter8.ipynb

    Entities are dicts of the form ``{"span": (char_start, char_end),
    "type_id": int}`` where ``type_id`` 0 means "outside any entity".
    """
    def encode_plus_tagged(self, text, entities, max_length):
        """Tokenize ``text`` and attach one label id per token.

        Args:
            text: raw document text.
            entities: iterable of entity dicts (see class docstring).
            max_length: the encoding is padded/truncated to this length.

        Returns:
            The encoding produced by ``encode_plus`` with an extra ``"labels"``
            list (same length as ``input_ids``) and without
            ``"offset_mapping"``.
        """
        encoding = self.encode_plus(
            text,
            max_length = max_length,
            padding = "max_length",
            truncation = True,
            return_offsets_mapping=True
        )

        offset_df = pd.DataFrame(
            encoding["offset_mapping"],
            columns=["offset_start", "offset_end"]
            )
        offset_df["labels"] = 0

        # Tokens that cover no characters (zero-width offsets such as (0, 0),
        # the same marker convert_output_to_entities() uses to strip special
        # tokens) must stay "O". Without this guard they satisfy the span test
        # of any entity that starts at character 0 and get tagged with it.
        covers_text = offset_df["offset_end"] > offset_df["offset_start"]

        # Entities are applied in order of their start offset; a later entity
        # overwrites an earlier one on tokens they both contain.
        for entity in sorted(entities, key=lambda x:x["span"][0]):

            start = int(entity["span"][0])
            end = int(entity["span"][1])
            label = entity["type_id"]

            # A token is tagged only when it lies completely inside the span.
            entity_mask = (covers_text &
                           (offset_df["offset_start"] >= start) &
                           (offset_df["offset_end"] <= end)).astype(bool)
            offset_df.loc[entity_mask, "labels"] = label

        encoding["labels"] = offset_df["labels"].tolist()
        del encoding["offset_mapping"]
        return encoding

    def encode_plus_untagged(self, text, max_length=None, return_tensors="pt"):
        """Tokenize ``text`` for inference and return the character offsets.

        Args:
            text: raw document text.
            max_length: when given, pad/truncate to this length; otherwise
                neither padding nor truncation is applied.
            return_tensors: ``"pt"`` wraps every field in a tensor with a
                leading batch dimension of 1; any other value keeps lists.

        Returns:
            ``(encoding, spans)`` where ``spans`` holds one
            ``(char_start, char_end)`` pair per token.
        """
        encoding = self.encode_plus(
            text,
            max_length=max_length,
            padding="max_length" if max_length else False,
            truncation=True if max_length else False,
            return_offsets_mapping=True
            )

        sequence_length = len(encoding["input_ids"])
        spans = encoding["offset_mapping"]
        # Pad the offsets with (0, 0) so they match the token count.
        spans = spans + [(0, 0)] * (sequence_length - len(spans))

        del encoding["offset_mapping"]

        if return_tensors == "pt":
            encoding = {k:torch.tensor([v]) for k, v in encoding.items()}

        return encoding, spans

    def convert_output_to_entities(self, text, labels, spans):
        """Collapse per-token labels into character-level entities.

        Args:
            text: the document text (not used by the computation).
            labels: one predicted label id per token.
            spans: one ``(char_start, char_end)`` pair per token.

        Returns:
            A list of ``{"span": [start, end], "type_id": label}`` dicts, one
            per run of consecutive tokens sharing a non-zero label.
        """
        # remove special tokens part from labels & spans
        # (they are recognised by an end offset of 0)
        labels = [label for label, span in zip(labels, spans) if span[1] != 0]
        spans = [span for span in spans if span[1] != 0]

        entities = []
        # groupby yields maximal runs of identical consecutive labels.
        for label, group in itertools.groupby(enumerate(labels), key=lambda x:x[1]):

            group = list(group)
            start = spans[group[0][0]][0]
            end = spans[group[-1][0]][1]

            # Label 0 is "outside": such runs produce no entity.
            if label != 0:
                entity = {
                    "span":[start, end],
                    "type_id":label
                }
                entities.append(entity)

        return entities


# =============================
# Funcs
# =============================
def get_train_entities(cfg, train_df, id_code):
    """Return the annotated entities of one essay.

    Each row of ``train_df`` whose ``id`` equals ``id_code`` becomes a dict
    ``{"span": (discourse_start, discourse_end), "type_id": ...}`` where the
    type id is looked up in ``cfg.LABELS2IDS`` by ``discourse_type``. Row order
    is preserved.
    """
    rows = train_df[train_df["id"] == id_code]
    columns = (
        rows["discourse_start"].tolist(),
        rows["discourse_end"].tolist(),
        rows["discourse_type"].tolist(),
    )
    return [
        {"span": (begin, finish), "type_id": cfg.LABELS2IDS[kind]}
        for begin, finish, kind in zip(*columns)
    ]


def get_full_text(cfg, filename, data="train"):
    """Read and return the text of ``<cfg.INPUT>/<data>/<filename>.txt``.

    Args:
        cfg: config object providing ``INPUT`` (dataset root directory).
        filename: essay id, i.e. the file name without the ``.txt`` suffix.
        data: sub-directory to read from (``"train"`` by default).

    Returns:
        The whole file content as a string.
    """
    # The context manager guarantees the handle is closed; this function is
    # called once per essay, so unclosed handles would accumulate quickly.
    with open(cfg.INPUT + f"/{data}/{filename}.txt", "r") as f:
        return f.read()


def get_text_df(cfg, data="train"):
    """Return a frame with columns ``id`` and ``text`` for every essay in ``data``.

    The frame is cached as ``<data>_text_df.csv`` (inside ``cfg.INPUT`` on
    Colab, in the working directory otherwise). An existing cache file is read
    back; otherwise every file in ``<cfg.INPUT>/<data>`` is loaded and the
    cache is written.
    """
    cache_name = f"{data}_text_df.csv"
    filepath = os.path.join(cfg.INPUT, cache_name) if cfg.COLAB else cache_name

    # Fast path: reuse the cached CSV.
    if os.path.isfile(filepath):
        return pd.read_csv(filepath)

    ids, texts = [], []
    for entry in tqdm(os.listdir(os.path.join(cfg.INPUT, data))):
        id_code, _ext = os.path.splitext(entry)
        texts.append(get_full_text(cfg, filename=id_code, data=data))
        ids.append(id_code)

    text_df = pd.DataFrame({"id": ids, "text": texts})
    text_df.to_csv(filepath, index=False)
    return text_df
# =============================
# Dataset
# =============================
class FeedbackTrainDataset(Dataset):
    """Dataset yielding ``(inputs, labels)`` per essay for IO-tagging training.

    One item corresponds to one unique ``id`` of ``df``. ``inputs`` holds the
    ``input_ids`` and ``attention_mask`` tensors, ``labels`` the per-token
    label ids produced by ``tokenizer.encode_plus_tagged``.
    """

    def __init__(self, cfg, df, tokenizer, directory="train", text_df=None):
        self.cfg = cfg
        self.df = df
        self.tokenizer = tokenizer
        self.directory = directory
        self.text_df = text_df
        # One sample per essay, in order of first appearance.
        self.filenames = df["id"].unique().tolist()

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, idx):
        filename = self.filenames[idx]

        # Prefer the preloaded text table; fall back to reading the file.
        if self.text_df is not None:
            matches = self.text_df["id"] == f"{filename}"
            text = self.text_df.loc[matches, "text"].values[0]
        else:
            text = get_full_text(self.cfg, filename, data=self.directory)

        encoding = self.tokenizer.encode_plus_tagged(
            text,
            entities=get_train_entities(self.cfg, self.df, id_code=filename),
            max_length=self.cfg.max_length,
        )

        labels = torch.tensor(encoding["labels"])
        inputs = {
            key: torch.tensor([encoding[key]]).flatten()
            for key in ("input_ids", "attention_mask")
        }
        return inputs, labels


class FeedbackTestDataset(Dataset):
    """Dataset yielding ``(inputs, spans, text, filename)`` per essay for inference.

    One item corresponds to one unique ``id`` of ``df``; ``spans`` are the
    character offsets returned by ``tokenizer.encode_plus_untagged``.
    """

    def __init__(self, cfg, df, tokenizer, directory, text_df=None):
        self.cfg = cfg
        self.df = df
        self.tokenizer = tokenizer
        self.directory = directory
        self.text_df = text_df
        # One sample per essay, in order of first appearance.
        self.filenames = df["id"].unique().tolist()

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, idx):
        filename = self.filenames[idx]

        # Prefer the preloaded text table; fall back to reading the file.
        if self.text_df is not None:
            matches = self.text_df["id"] == f"{filename}"
            text = self.text_df.loc[matches, "text"].values[0]
        else:
            text = get_full_text(self.cfg, filename, data=self.directory)

        encoded, spans = self.tokenizer.encode_plus_untagged(
            text,
            max_length=self.cfg.max_length,
            return_tensors=None,
        )

        inputs = {}
        for key, value in encoded.items():
            inputs[key] = torch.tensor([value]).flatten()
        return (inputs, spans, text, filename)


class FeedbackDataModule(pl.LightningDataModule):
    """Lightning data module serving the train/validation datasets of one fold.

    Args:
        cfg: config object (``train_batch_size``, ``valid_batch_size`` and
            ``num_workers`` are read here).
        tokenizer: tokenizer handed to the datasets.
        train_df: annotation rows of the training essays.
        valid_df: annotation rows of the validation essays.
        text_df: optional table of essay texts handed to the datasets.
    """
    def __init__(self, cfg, tokenizer, train_df, valid_df, text_df=None):
        # Zero-argument super() actually runs LightningDataModule.__init__;
        # the one-argument form `super(FeedbackDataModule)` creates an unbound
        # proxy and never calls the parent initialiser.
        super().__init__()

        self.cfg = cfg
        self.tokenizer = tokenizer
        self.train_df = train_df
        self.valid_df = valid_df

        self.text_df = text_df
        self.train_dataset = None
        self.val_dataset = None

    def _build_datasets(self):
        """Create the train and validation datasets from the stored frames."""
        self.train_dataset = FeedbackTrainDataset(
            cfg=self.cfg, tokenizer=self.tokenizer, df=self.train_df, directory="train", text_df=self.text_df)
        self.val_dataset = FeedbackTrainDataset(
            cfg=self.cfg, tokenizer=self.tokenizer, df=self.valid_df, directory="train", text_df=self.text_df)

    def prepare_data(self):
        """Build the datasets (kept for callers that invoke this hook directly)."""
        self._build_datasets()

    def setup(self, stage=None):
        """Make sure the datasets exist before the dataloaders are requested.

        Lightning's documentation advises against assigning state in
        ``prepare_data`` because that hook is not run on every process; this
        hook fills in the datasets wherever they are still missing.
        """
        if self.train_dataset is None or self.val_dataset is None:
            self._build_datasets()

    def train_dataloader(self):
        """Shuffled training loader; the last incomplete batch is dropped."""
        train_dataloader = DataLoader(
            self.train_dataset,
            batch_size=self.cfg.train_batch_size,
            shuffle=True,
            num_workers=self.cfg.num_workers,
            pin_memory=True,
            drop_last=True)

        return train_dataloader

    def val_dataloader(self):
        """Ordered validation loader that keeps every sample."""
        val_dataloader = DataLoader(
            self.val_dataset,
            batch_size=self.cfg.valid_batch_size,
            shuffle=False,
            num_workers=self.cfg.num_workers,
            pin_memory=True,
            drop_last=False)

        return val_dataloader


# =============================
# Model
# =============================
def get_optimizer(cfg, parameters):
    """Instantiate the optimizer described by ``cfg.optimizer``.

    ``cfg.optimizer`` is a mapping with the keys ``"optimizer"`` (``"AdamW"``
    or ``"Adam"``), ``"lr"`` and ``"weight_decay"``.

    Raises:
        NotImplementedError: for any other optimizer name.
    """
    settings = cfg.optimizer
    factories = {"AdamW": AdamW, "Adam": Adam}

    factory = factories.get(settings["optimizer"])
    if factory is None:
        raise NotImplementedError

    return factory(
        parameters,
        lr=settings["lr"],
        weight_decay=settings["weight_decay"],
    )


def get_scheduler(cfg, optimizer, num_train_steps):
    """Instantiate the LR scheduler described by ``cfg.scheduler``.

    Supported values of ``cfg.scheduler["scheduler"]``:
    ``"get_linear_schedule_with_warmup"``,
    ``"get_cosine_schedule_with_warmup"`` and ``"MultiStepLR"``.
    ``num_train_steps`` is only used by the two warmup schedules.

    Raises:
        NotImplementedError: for any other scheduler name.
    """
    settings = cfg.scheduler
    kind = settings["scheduler"]

    if kind == "get_linear_schedule_with_warmup":
        return get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=settings["num_warmup_steps"],
            num_training_steps=num_train_steps,
        )

    if kind == "get_cosine_schedule_with_warmup":
        return get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=settings["num_warmup_steps"],
            num_training_steps=num_train_steps,
            num_cycles=settings["num_cycles"],
        )

    if kind == "MultiStepLR":
        return MultiStepLR(
            optimizer,
            milestones=settings["milestones"],
            gamma=settings["gamma"],
        )

    raise NotImplementedError


class FeedbackModel(pl.LightningModule):
    """Token classifier: a backbone encoder followed by one linear layer.

    Args:
        cfg: config object; ``hidden_size``, ``LABELS``, ``train_batch_size``,
            ``optimizer`` and ``scheduler`` are read.
        backborn: encoder module called as ``backborn(**inputs)``; the first
            element of its output is fed to the linear head.
    """
    def __init__(self, cfg, backborn):
        super(FeedbackModel, self).__init__()
        self.cfg = cfg

        self.backborn = backborn
        self.loss = nn.CrossEntropyLoss()
        self.linear = nn.Linear(self.cfg.hidden_size, len(self.cfg.LABELS))

        # Filled in by setup("fit"); consumed by the warmup schedulers.
        self.total_steps = None

    def forward(self, inputs):
        """Return per-token class scores for a batch of encoded inputs."""
        x = self.backborn(**inputs)
        x = x[0]
        x = self.linear(x)
        return x

    def _token_loss(self, batch):
        """Cross-entropy over all token positions of ``batch``."""
        inputs, labels = batch
        outputs = self.forward(inputs)
        # Flatten to (tokens, classes) vs (tokens,) for CrossEntropyLoss.
        return self.loss(outputs.view(-1, len(self.cfg.LABELS)), labels.view(-1))

    def training_step(self, batch, batch_idx):
        loss = self._token_loss(batch)
        self.log("train_loss", loss, on_step=True, logger=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self._token_loss(batch)
        self.log("val_loss", loss, on_step=True, logger=True, prog_bar=True)
        return loss

    def setup(self, stage=None):
        """Compute the total number of optimizer steps when fitting."""
        if stage != "fit":
            return
        # calculate total steps
        train_dataloader = self.trainer._data_connector._train_dataloader_source.dataloader()
        # NOTE(review): with `gpus=-1` this value is -1, so max(1, gpus) counts
        # a single device — confirm for multi-GPU runs.
        gpus = 0 if self.trainer.gpus is None else self.trainer.gpus
        tb_size = self.cfg.train_batch_size * max(1, gpus)
        # Optimizer steps per epoch = batches per epoch / accumulation factor;
        # the total is that figure MULTIPLIED by the number of epochs
        # (dividing by max_epochs, as before, shrinks the schedule).
        steps_per_epoch = (
            (len(train_dataloader.dataset) // tb_size)
            // self.trainer.accumulate_grad_batches
        )
        self.total_steps = steps_per_epoch * self.trainer.max_epochs

    def configure_optimizers(self):
        """Return the optimizer, plus the scheduler config when one is set."""
        optimizer = get_optimizer(self.cfg, parameters=self.parameters())

        if self.cfg.scheduler is None:
            return [optimizer]
        else:
            scheduler = get_scheduler(self.cfg, optimizer, num_train_steps=self.total_steps)
            return [optimizer], [{"scheduler": scheduler, "interval": self.cfg.scheduler["interval"]}]


# =============================
# Train & Predict
# =============================
def class2dict(f):
    """Return every attribute of ``f`` whose name does not start with ``__`` as a dict."""
    return {
        attr: getattr(f, attr)
        for attr in dir(f)
        if not attr.startswith('__')
    }


def get_filname_listdir(dirctory):
    """List the entries of ``dirctory`` with their last extension removed."""
    stems = []
    for entry in os.listdir(dirctory):
        stem, _ext = os.path.splitext(entry)
        stems.append(stem)
    return stems


def get_predition_strings(full_text, discourse_start, discourse_end):
    """Convert a character span into the list of whitespace-word indices it covers.

    https://www.kaggle.com/c/feedback-prize-2021/discussion/297591

    The first index is the number of words in the text before the span; the
    count is the number of words inside the span, capped at the total number
    of words in ``full_text``.
    """
    words_before = len(full_text[:discourse_start].split())
    words_inside = len(full_text[discourse_start:discourse_end].split())
    total_words = len(full_text.split())

    stop = min(words_before + words_inside, total_words)
    return list(range(words_before, stop))


def train_fold(cfg, train_df, valid_df, tokenizer, backborn, filename, text_df):
    """Train one fold with PyTorch Lightning, logging to Weights & Biases.

    The best checkpoint (lowest ``val_loss``) is written to ``cfg.EXP_MODEL``
    under ``filename``. The W&B run is named after the last two
    dash-separated parts of ``filename``.
    """
    run_name = "-".join(filename.split("-")[-2:])
    wandblogger = pl.loggers.WandbLogger(
        project=cfg.competition + "-Public",
        config=class2dict(cfg),
        group=cfg.name,
        name=run_name,
        job_type="train",
        reinit=True,
        anonymous="must",  # public
    )

    datamodule = FeedbackDataModule(
        cfg=cfg,
        train_df=train_df,
        valid_df=valid_df,
        tokenizer=tokenizer,
        text_df=text_df,
    )
    model = FeedbackModel(cfg=cfg, backborn=backborn)

    # Keep only the single best checkpoint according to validation loss.
    callbacks = [
        pl.callbacks.ModelCheckpoint(
            dirpath=cfg.EXP_MODEL,
            filename=filename,
            save_top_k=1,
            verbose=True,
            monitor="val_loss",
            mode="min",
        ),
        pl.callbacks.LearningRateMonitor(logging_interval="step"),
    ]
    if cfg.early_stopping:
        callbacks.append(
            pl.callbacks.EarlyStopping(
                monitor="val_loss",
                min_delta=0.0,
                patience=8,
                mode='min',
            )
        )

    trainer = pl.Trainer(
        max_epochs=cfg.max_epochs,
        callbacks=callbacks,
        logger=[wandblogger],
        gradient_clip_val=cfg.gradient_clip_val,
        accumulate_grad_batches=cfg.accumulate_grad_batches,
        resume_from_checkpoint=cfg.resume_from_checkpoint,
        gpus=-1,
    )
    trainer.fit(model, datamodule=datamodule)

    wandb.finish(quiet=True)

def train_cv(cfg, df, tokenizer, backborn, text_df=None):
    """Run cross validation over ``cfg.trn_fold`` and collect out-of-fold predictions.

    A fold is trained only when no file named after it exists in
    ``cfg.EXP_MODEL``; it is always predicted and scored. The returned frame
    stacks the predictions of all processed folds, with the former index
    exposed as ``sequence_id``.
    """
    oof_df = pd.DataFrame()

    for i_fold in range(cfg.n_fold):
        if i_fold not in cfg.trn_fold:
            continue

        filename = f"{cfg.name}-seed{cfg.seed}-fold{i_fold}"
        existing = get_filname_listdir(cfg.EXP_MODEL)

        in_fold = (df["fold"] == i_fold).astype(bool)
        train_df = df[~in_fold].reset_index(drop=True)
        valid_df = df[in_fold].reset_index(drop=True)

        # Skip training when a checkpoint for this fold is already present.
        if filename not in existing:
            print(f"# --------- # Start Training Fold={i_fold} # --------- #")
            train_fold(
                cfg=cfg,
                train_df=train_df,
                valid_df=valid_df,
                tokenizer=tokenizer,
                backborn=backborn,
                filename=filename,
                text_df=text_df,
            )

        # Predict the held-out essays and log the fold score.
        pred_df = predict(
            cfg=cfg,
            df=valid_df,
            tokenizer=tokenizer,
            backborn=backborn,
            filename=filename,
        )
        val_score = get_score(pred_df=pred_df, true_df=valid_df)
        cfg.logger.info(
            f"{cfg.name}-seed{cfg.seed}-fold{i_fold}:F1 score={val_score:.4f}"
        )

        oof_df = pd.concat([oof_df, pred_df])

    oof_df = oof_df.reset_index()
    return oof_df.rename(columns={"index":"sequence_id"})


def predict(cfg, df, tokenizer, backborn, filename, directory="train", text_df=None):
    """Predict with the checkpoint ``filename`` and return submission-format rows."""
    raw_scores = get_raw_prediction(
        cfg, df, tokenizer, backborn, filename, directory, text_df)

    # Highest-scoring class per token, one array per essay.
    pred_labels = []
    for essay_scores in raw_scores:
        pred_labels.append(np.argmax(essay_scores, axis=-1))

    return get_output_prediction(cfg, pred_labels, df, tokenizer, directory, text_df)


def predict_cv(cfg, df, tokenizer, backborn, directory="test", text_df=None):
    """Average the raw scores of every fold in ``cfg.trn_fold`` and decode them."""
    selected_folds = [i_fold for i_fold in range(cfg.n_fold) if i_fold in cfg.trn_fold]

    fold_preds = [
        get_raw_prediction(
            cfg, df, tokenizer, backborn,
            f"{cfg.name}-seed{cfg.seed}-fold{i_fold}",
            directory, text_df)
        for i_fold in selected_folds
    ]

    # Mean over folds, then the highest-scoring class per token.
    mean_scores = np.mean(fold_preds, axis=0)
    pred_labels = [np.argmax(essay_scores, axis=-1) for essay_scores in mean_scores]

    return get_output_prediction(cfg, pred_labels, df, tokenizer, directory, text_df)


def get_raw_prediction(cfg, df, tokenizer, backborn, filename, directory="train", text_df=None):
    """Run the checkpoint ``<cfg.EXP_MODEL>/<filename>.ckpt`` over every essay of ``df``.

    Returns the model outputs of all batches concatenated along the first
    axis as a numpy array.
    """
    dataset = FeedbackTestDataset(
        cfg=cfg, tokenizer=tokenizer, df=df, directory=directory, text_df=text_df)
    loader = DataLoader(
        dataset,
        batch_size=cfg.valid_batch_size,
        shuffle=False,
        num_workers=cfg.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    checkpoint_path = os.path.join(cfg.EXP_MODEL, filename + ".ckpt")
    lightning_model = FeedbackModel(cfg=cfg, backborn=backborn)
    lightning_model = lightning_model.load_from_checkpoint(
        checkpoint_path, cfg=cfg, backborn=backborn)
    lightning_model.eval()
    lightning_model.to(cfg.DEVICE)

    batch_outputs = []
    for batch in tqdm(loader, total=len(loader)):
        # Only the encoded inputs are needed; move them to the target device.
        inputs = batch[0]
        for key in inputs:
            inputs[key] = inputs[key].to(cfg.DEVICE)

        with torch.no_grad():
            scores = lightning_model(inputs)

        batch_outputs.append(scores.cpu().numpy())

    return np.concatenate(batch_outputs)  # (N * max_length * num_classes)


def get_output_prediction(cfg, pred_labels, df, tokenizer, directory="train", text_df=None):
    """Turn per-token label predictions into submission-format rows.

    Args:
        cfg: config object; ``IDS2LABELS`` maps label ids to class names.
        pred_labels: one sequence of predicted label ids per essay, in the
            order in which ``FeedbackTestDataset`` iterates ``df``.
        df: frame whose unique ``id`` values define the essays.
        tokenizer: provides ``convert_output_to_entities``.
        directory: data sub-directory passed to the dataset.
        text_df: optional preloaded text table passed to the dataset.

    Returns:
        A frame with columns ``id``, ``class`` and ``predictionstring``
        (space-separated word indices), one row per predicted entity. The
        per-essay frames are concatenated without resetting the index.
    """
    test_dataset = FeedbackTestDataset(
        cfg=cfg, tokenizer=tokenizer, df=df, directory=directory, text_df=text_df)
    output_df = pd.DataFrame()

    # The dataset is re-iterated with batch_size=1 so that every essay's spans
    # and text line up with its entry in pred_labels.
    for (_, span, text, id_code), pred in zip(tqdm(DataLoader(test_dataset, batch_size=1)), pred_labels):
        # get submission format predictions
        # text / id_code arrive wrapped by the batch dimension, hence the [0].
        text = text[0]
        entities = tokenizer.convert_output_to_entities(
            text=text,
            labels=pred,
            spans=span)
        
        predstr_by_id, labels_by_id = [], []
        for entity in entities:
            # NOTE: `span` is rebound here to the entity's [start, end] pair.
            span, type_id = entity["span"], entity["type_id"]
            # start/end are unwrapped with .cpu().numpy()[0], i.e. they are
            # expected to be one-element tensors (presumably produced by the
            # DataLoader's collation of the offsets — TODO confirm).
            prediction_strings = get_predition_strings(
                full_text=text, 
                discourse_start=span[0].cpu().numpy()[0], 
                discourse_end=span[1].cpu().numpy()[0])
            
            predstr_by_id.append(" ".join(map(str, prediction_strings)))
            labels_by_id.append(cfg.IDS2LABELS[type_id])
        
        out_df_by_id = pd.DataFrame({
            "id":[id_code[0]]*len(labels_by_id),
            "class":labels_by_id, 
            "predictionstring":predstr_by_id
            })
        output_df = pd.concat([output_df, out_df_by_id])
    return output_df

# =============================
# Metrics
# =============================   
def calc_overlap(row):
    """Return the two overlap ratios between a predicted and a true word set.

    ref: https://www.kaggle.com/robikscube/student-writing-competition-twitch

    Both ``row["predictionstring_pred"]`` and ``row["predictionstring_true"]``
    are split on single spaces. The result is
    ``[shared / len(true), shared / len(pred)]``.
    """
    pred_words = set(row["predictionstring_pred"].split(" "))
    true_words = set(row["predictionstring_true"].split(" "))

    shared = len(true_words & pred_words)

    return [shared / len(true_words), shared / len(pred_words)]


def get_micro_f1_score(pred_df, true_df):
    """Compute the overlap-based F1 score between predictions and ground truth.

    Args:
        pred_df: frame with columns ``id``, ``class``, ``predictionstring``.
        true_df: frame with columns ``id``, ``discourse_type``,
            ``predictionstring``.

    Returns:
        ``TP / (TP + 0.5 * (FP + FN))`` as a float.
    """

    true_df = (true_df[["id", "discourse_type", "predictionstring"]]
               .reset_index(drop=True)
               .copy())
    pred_df = pred_df[["id", "class", "predictionstring"]].reset_index(drop=True).copy()
    pred_df["pred_id"] = pred_df.index
    true_df["true_id"] = true_df.index

    # 1. all ground truths and predictions for a given class are compared.
    joined_df = pred_df.merge(
        true_df,
        left_on=["id", "class"],
        right_on=["id", "discourse_type"],
        how="outer",
        suffixes=("_pred", "_true"),
    )
    # Rows without a partner get a blank prediction string. The result is
    # assigned back explicitly: `fillna(inplace=True)` on a selected column is
    # chained assignment and leaves the frame untouched under pandas
    # Copy-on-Write.
    joined_df["predictionstring_true"] = joined_df["predictionstring_true"].fillna(" ")
    joined_df["predictionstring_pred"] = joined_df["predictionstring_pred"].fillna(" ")
    joined_df["overlaps"] = joined_df.apply(calc_overlap, axis=1)

    # 2. If the overlap between the ground truth and prediction is >= 0.5,
    #    and the overlap between the prediction and the ground truth >= 0.5,
    #    the prediction is a match and considered a true positive.
    #    If multiple matches exist, the match with the highest pair of overlaps is taken.
    # calc_overlap returns a two-element list; index it directly instead of
    # round-tripping through eval(str(...)).
    joined_df["overlap1"] = joined_df["overlaps"].apply(lambda x: x[0])
    joined_df["overlap2"] = joined_df["overlaps"].apply(lambda x: x[1])

    joined_df["potential_TP"] = (joined_df["overlap1"] >= 0.5) & (joined_df["overlap2"] >= 0.5)
    joined_df["max_overlap"] = joined_df[["overlap1", "overlap2"]].max(axis=1)
    tp_pred_ids = (
        joined_df.query("potential_TP")
        .sort_values("max_overlap", ascending=False)
        .groupby(["id", "predictionstring_true"])
        .first()["pred_id"]
        .values
    )

    # 3. Any unmatched ground truths are false negatives
    #    and any unmatched predictions are false positives.
    fp_pred_ids = [p for p in joined_df["pred_id"].unique() if p not in tp_pred_ids]
    matched_gt_ids = joined_df.query("potential_TP")["true_id"].unique()
    unmatched_gt_ids = [c for c in joined_df["true_id"].unique() if c not in matched_gt_ids]

    # Get numbers of each type
    TP = len(tp_pred_ids)
    FP = len(fp_pred_ids)
    FN = len(unmatched_gt_ids)
    # calc microF1
    my_f1_score = TP / (TP + 0.5 * (FP + FN))
    return my_f1_score


def get_score(pred_df, true_df, return_class_scores=False):
    """Average the per-class F1 scores over the discourse types found in ``true_df``.

    Every class score and the final mean are rounded to four decimals. With
    ``return_class_scores=True`` the result is ``(score, class_scores)``,
    otherwise just ``score``.
    """
    pred_df = pred_df[["id", "class", "predictionstring"]].reset_index(drop=True).copy()

    class_scores = {}
    for discourse_type, true_subset in true_df.groupby("discourse_type"):
        # Predictions restricted to the class currently being scored.
        same_class = pred_df["class"] == discourse_type
        pred_subset = pred_df.loc[same_class].reset_index(drop=True).copy()

        class_scores[discourse_type] = np.round(
            get_micro_f1_score(pred_subset, true_subset), decimals=4)

    score = np.round(np.mean(list(class_scores.values())), decimals=4)

    return (score, class_scores) if return_class_scores else score

# =============================
# Load Data
# =============================
def get_tokenizer_and_backborn(cfg):
    """Load the tokenizer and the Longformer backbone.

    When ``<cfg.EXP_MODEL>/Pretrain`` exists both are loaded from it;
    otherwise they are loaded by ``cfg.model_name`` and then saved into that
    directory.

    Returns:
        ``(tokenizer, backborn)``.
    """
    pretrained_dir = os.path.join(cfg.EXP_MODEL, "Pretrain")
    already_saved = os.path.isdir(pretrained_dir)
    source = pretrained_dir if already_saved else cfg.model_name

    model_config = LongformerConfig.from_pretrained(source)
    backborn = LongformerModel.from_pretrained(source, config=model_config)
    tokenizer = IOTokenizerForNER.from_pretrained(source)

    if not already_saved:
        # First use: keep a local copy next to the experiment's models.
        tokenizer.save_pretrained(pretrained_dir)
        backborn.save_pretrained(pretrained_dir)

    return tokenizer, backborn


def get_all_text_df(cfg):
    """Return the train texts followed by the test texts in one re-indexed frame."""
    frames = [get_text_df(cfg, data=split) for split in ("train", "test")]
    combined = pd.concat(frames)
    return combined.reset_index(drop=True)

    
def get_fold_idx(cfg, train_df):
    """Add a ``fold`` column to ``train_df``, assigning folds per essay id.

    The unique ids are split with a shuffled ``KFold`` (``cfg.n_fold`` splits,
    seeded with ``cfg.seed``); every row receives the fold in which its essay
    is held out.
    """
    fold_by_id = pd.DataFrame({"id":train_df["id"].unique()})
    fold_by_id["fold"] = -1

    splitter = KFold(n_splits=cfg.n_fold, shuffle=True, random_state=cfg.seed)
    for i_fold, (_tr_idx, va_idx) in enumerate(splitter.split(X=fold_by_id)):
        fold_by_id.loc[va_idx, "fold"] = i_fold

    return pd.merge(train_df, fold_by_id, on="id", how="left")
    
# =============================
# PostProcess
# =============================
def postprocessing_for_blank(pred_df, num_blank=1):
    """Merge consecutive same-class predictions of an essay that lie close together.

    ``pred_df`` must have the columns ["id", "class", "predictionstring"].

    Within one id, rows are grouped into runs of consecutive rows sharing the
    same class. Inside a run, a row is appended to the accumulated word list
    when ``last accumulated word + num_blank >= first word of the row``.
    Example with ``num_blank=2``:

    - label=Lead, predictionstring = [1, 2, 3]
    - label=Lead, predictionstring = [5, 6, 7]
    =>
    - label=Lead, predictionstring = [1, 2, 3, 5, 6, 7]

    NOTE(review): with the default ``num_blank=1`` the two rows above are NOT
    merged (3 + 1 < 5); only a row starting at word 4 or earlier would be —
    confirm this is the intended meaning of "blank".

    Returns:
        A re-indexed frame with the same three columns; the rows of each id
        are sorted by their word-index lists.
    """

    ids = pred_df["id"].unique().tolist()
    output_df = pd.DataFrame()
    for id_code in tqdm(ids):
        output_by_id = []
        _df = pred_df[pred_df["id"] == id_code].reset_index()

        # get sequence label's predictionstrings
        # groupby only groups *adjacent* rows with an equal class.
        for label, group in itertools.groupby(zip(_df["predictionstring"], _df["class"]), key=lambda x:x[1]):
            group = list(group)
            if len(group) > 1:
                pred_str_lst = []
                for i, g in enumerate(group):
                    strings_lst = list(map(int, g[0].split()))
                    if i == 0:
                        # initial val
                        pred_str_lst += strings_lst

                    elif pred_str_lst[-1] + num_blank >=  strings_lst[0]:
                        # avoid blank
                        pred_str_lst += strings_lst

                    else:
                        # Too far from the accumulated list: emit this row on
                        # its own. pred_str_lst is left unchanged, so later
                        # rows are still compared against the accumulated list.
                        output_by_id_class_no_seq = [id_code, label, strings_lst]
                        output_by_id.append(output_by_id_class_no_seq)
            else:
                pred_str_lst = list(map(int, group[0][0].split()))

            # Emit the accumulated (or single) word list of this run.
            output_by_id_class = [id_code, label, pred_str_lst] 
            output_by_id.append(output_by_id_class)

        output_by_id = sorted(output_by_id, key=lambda x:x[2])  # sort by predictionstrings sequence
        output_by_id_df = pd.DataFrame(
            output_by_id, 
            columns=["id", "class", "predictionstring"])
        # convert list to str
        output_by_id_df["predictionstring"] = (
            output_by_id_df["predictionstring"].apply(lambda x:" ".join(map(str, x))))
        # concat by id
        output_df = pd.concat([output_df, output_by_id_df])

    return output_df.reset_index(drop=True)


def postprocessing_for_min_predstr_length(
    pred_df, 
    map_clip = {
        "Lead":9, 
        "Position":5,
        "Evidence":14,
        "Claim":3,
        "Concluding Statement":11,
        "Counterclaim":6, 
        "Rebuttal":4}):
    """Drop predictions that contain fewer words than their class minimum.

    ref:https://www.kaggle.com/vuxxxx/tensorflow-longformer-ner-postprocessing

    Args:
        pred_df: predictions with at least the columns "class" and
            "predictionstring" (space-separated word indices). Any index is
            accepted; the input frame is not modified.
        map_clip: mapping ``class name -> minimum number of words``. A row is
            removed when its word count is strictly below the minimum of its
            class. Rows whose class is not a key of the mapping are kept.
            The default dict is only read here, never mutated.

    Returns:
        A new DataFrame with the same columns as ``pred_df``, the too-short
        rows removed and the index reset to 0..n-1.
    """
    # Word counts carry pred_df's own index so they stay attached to the right
    # rows even when the caller's index is not 0..n-1 (e.g. after dropna()).
    lengths = pd.Series(
        [len(pred_str.split()) for pred_str in pred_df["predictionstring"]],
        index=pred_df.index,
        dtype="int64",
    )
    # Classes without a configured minimum map to NaN; "length < NaN" is
    # False, so those rows are kept.
    min_lengths = pred_df["class"].map(map_clip).astype("float64")

    # Positional boolean mask: safe even when index labels repeat.
    too_short = (lengths < min_lengths).to_numpy(dtype=bool)
    return pred_df[~too_short].reset_index(drop=True)

# =============================
# EDA
# =============================
def visualize_entity(cfg, id_code, train_df, text_df=None):
    """Render the annotated discourse spans of one essay with spaCy displaCy.

    The essay text is taken from ``text_df`` (columns "id" and "text") when it
    is given, otherwise it is loaded through ``get_full_text`` from the train
    data. Every row of ``train_df`` belonging to ``id_code`` becomes one
    highlighted entity spanning ``discourse_start``..``discourse_end`` and
    labelled with its ``discourse_type``.

    Note: does not work in Colab.
    """
    palette = {
        'Lead': '#EE11D0',
        'Position': '#AB4DE1',
        'Claim': '#1EDE71',
        'Evidence': '#33FAFA',
        'Counterclaim': '#4253C1',
        'Concluding Statement': 'yellow',
        'Rebuttal': 'red'
    }

    if text_df is not None:
        essay_text = text_df[text_df["id"] == id_code]["text"].values[0]
    else:
        essay_text = get_full_text(cfg, filename=id_code, data="train")

    essay_rows = train_df[train_df["id"] == id_code].reset_index(drop=True)

    # One displaCy "manual" entity per annotated discourse element.
    spans = [
        {
            'start': int(record['discourse_start']),
            'end': int(record['discourse_end']),
            'label': record['discourse_type']
        }
        for _, record in essay_rows.iterrows()
    ]

    spacy.displacy.render(
        {"text": essay_text, "ents": spans, "title": id_code},
        style="ent",
        options={"ents": list(palette.keys()), "colors": palette},
        manual=True,
        jupyter=True)

In [None]:
def main():
    """Run the whole pipeline: load data, train/score, predict, post-process.

    Steps, in order:
    1. Read train.csv and sample_submission.csv from ``Config.INPUT`` and load
       the essay texts (debug mode keeps only the first 20 essay ids).
    2. Add fold indices and build the tokenizer and backbone model.
    3. Unless ``Config.inference_only`` is set: run ``train_cv``, log the score
       of its raw predictions and the score after each post-processing stage.
    4. Run ``predict_cv`` on the submission frame and apply the same three
       post-processing stages to its predictions.
    5. If ``Config.upload_from_colab`` is set: upload ``Config.OUTPUT_EXP`` as
       a new Kaggle dataset named ``Config.EXP``.

    Returns nothing; results are displayed/logged.

    NOTE(review): ``Config.INPUT``, ``Config.logger``, ``Config.EXP`` and
    ``Config.OUTPUT_EXP`` are not part of the ``Config`` class body at the top
    of the file -- they are expected to be attached elsewhere before this runs.
    ``display`` is assumed to be the notebook display helper -- confirm.
    """
    
    # load data
    train_df = pd.read_csv(Config.INPUT + "/train.csv")
    if Config.debug:
        # debug: keep only the rows of the first 20 distinct essay ids
        train_df = train_df[train_df["id"].isin(train_df["id"].unique()[:20])].reset_index(drop=True)
    submission_df = pd.read_csv(Config.INPUT + "/sample_submission.csv")
    text_df = get_all_text_df(Config)

    # add fold idx
    train_df = get_fold_idx(cfg=Config, train_df=train_df)

    # load tokenizer & backbone (spelled `backborn` throughout this file)
    tokenizer, backborn = get_tokenizer_and_backborn(Config)
    
    if not Config.inference_only:
        # training (presumably returns out-of-fold predictions -- see train_cv)
        raw_oof_df = train_cv(cfg=Config, df=train_df, tokenizer=tokenizer, backborn=backborn, text_df=text_df)
        # Turn empty prediction strings into NaN so dropna() removes them.
        # Note: dropna() also removes rows that have NaN in any other column.
        raw_oof_df["predictionstring"] = raw_oof_df["predictionstring"].replace("", np.nan)
        raw_oof_df = raw_oof_df.dropna()
        # Score only against the ground truth of ids that have predictions left.
        true_df = train_df[train_df["id"].isin(raw_oof_df["id"].unique())].reset_index(drop=True)

        print("# ----------------# RAW OOF # ----------------#")
        display(raw_oof_df)
        Config.logger.info(f"score={get_score(pred_df=raw_oof_df, true_df=true_df, return_class_scores=True)}")

        # postprocessing for train
        print("\n# ----------------# AFTER PP OOF # ----------------#")
        # pp1: merge neighbouring same-class predictions
        oof_df = postprocessing_for_blank(raw_oof_df, num_blank=1)
        Config.logger.info(f"pp1-score={get_score(pred_df=oof_df, true_df=true_df, return_class_scores=True)}")

        # pp2: drop predictions shorter than the per-class minimum length
        oof_df = postprocessing_for_min_predstr_length(oof_df)
        Config.logger.info(f"pp2-score={get_score(pred_df=oof_df, true_df=true_df, return_class_scores=True)}")

        # pp3: merge again, since dropping rows can make same-class rows adjacent
        oof_df = postprocessing_for_blank(oof_df, num_blank=1)
        Config.logger.info(f"pp3-score={get_score(pred_df=oof_df, true_df=true_df, return_class_scores=True)}")
        display(oof_df)

    # inference for test
    print("\n# ----------------# RAW TEST PREDS # ----------------#")
    raw_preds_df = predict_cv(cfg=Config, df=submission_df, tokenizer=tokenizer, backborn=backborn, text_df=text_df)
    # Same empty-prediction cleanup as for the training predictions above.
    raw_preds_df["predictionstring"] = raw_preds_df["predictionstring"].replace("", np.nan)
    raw_preds_df = raw_preds_df.dropna()
    display(raw_preds_df)

    print("\n# ----------------# AFTER PP TEST PREDS # ----------------#")
    # Same three stages as for training: merge -> min-length filter -> merge.
    preds_df = postprocessing_for_blank(raw_preds_df, num_blank=1)
    preds_df = postprocessing_for_min_predstr_length(preds_df)
    preds_df = postprocessing_for_blank(preds_df, num_blank=1)
    display(preds_df)

    # upload output folder to kaggle dataset
    if Config.upload_from_colab:
        # Imported lazily so the kaggle package is only needed when uploading.
        from kaggle.api.kaggle_api_extended import KaggleApi

        def dataset_create_new(dataset_name, upload_dir):
            """Write dataset-metadata.json into upload_dir and create the dataset."""
            dataset_metadata = {}
            # Requires the KAGGLE_USERNAME environment variable (KeyError otherwise).
            dataset_metadata['id'] = f'{os.environ["KAGGLE_USERNAME"]}/{dataset_name}'
            dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
            dataset_metadata['title'] = dataset_name
            with open(os.path.join(upload_dir, 'dataset-metadata.json'), 'w') as f:
                json.dump(dataset_metadata, f, indent=4)
            api = KaggleApi()
            api.authenticate()
            api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')
        dataset_create_new(dataset_name=Config.EXP, upload_dir=Config.OUTPUT_EXP)

        
# Entry point: run the full pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()