In [None]:
import ast, gc, os, warnings, glob, random, sys, pickle
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from torch import Tensor
from transformers import AutoConfig, AutoModel, AutoTokenizer
from collections import OrderedDict
from scipy import stats
from collections import Counter
from bisect import bisect_left
from joblib import Parallel, delayed
from multiprocessing import Manager
from tqdm.auto import tqdm

import xgboost as xgb
from sklearn.model_selection import cross_val_score, GroupKFold, KFold, train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from skopt.space import Real
from skopt import gp_minimize
sys.path.append("../input/tez-lib/")
sys.path.append("/kaggle/input/acceleratemaster/")
sys.path.append("/kaggle/input/acceleratemaster/src")
import src.accelerate 
import tez

warnings.filterwarnings('ignore', '.*ragged nested sequences*')
os.environ['TOKENIZERS_PARALLELISM'] = "false"
os.environ['LRU_CACHE_CAPACITY'] = "1"

In [None]:
""" 
Configuration Variable for setting this notebook mode, tuning classifier or inference for submission
Variable:
    TRAIN_SEQ_CLASSIFIERS: if you want to tune boosting algorithm, set true
    submit: if you want to inference for making submission, set true
    SUBMISSION: if you want to inference for making submission, set true
"""
TRAIN_SEQ_CLASSIFIERS = True
submit = False
SUBMISSION = False


""" if SUBMISSION is True, build the test dataframe from the test .txt files """
if SUBMISSION:
    test_dir = '../input/feedback-prize-2021/test'
    test_names, test_texts = [], []
    for f in list(os.listdir(test_dir)):
        test_names.append(f.replace('.txt', ''))
        # Context manager so every essay file handle is closed right after reading
        # (the bare open(...).read() left one open handle per test essay).
        with open(test_dir + '/' + f, 'r') as essay_file:
            test_texts.append(essay_file.read())
    test_texts = pd.DataFrame({'id': test_names, 'text': test_texts})

In [None]:
""" Configuration Class for LLM, Classifier such as XGBoost, LightGBM, CatBoost """

class CFG1:
    """
    Inference Configuration Class for DeBERTa-V3-Large Sequence Length 2048

    Used as a plain namespace of class attributes (it is passed around as the
    class itself, never instantiated, in the cells reviewed here).
    Note: the tokenizer is loaded from `model` while the class body executes,
    so defining this class already performs disk I/O.
    """
    # NOTE(review): wandb / n_gpu / gpu_id / reinit / n_folds are not read in the
    # cells reviewed here - presumably carried over from the training config.
    wandb = True
    seed = 42  # seed handed to all_type_seed / torch.Generator
    n_gpu = 1
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')  # falls back to CPU without CUDA
    gpu_id = 0
    num_workers = 0  # DataLoader worker processes
    weight_path = '/kaggle/input/feedbackprize-2-deberta-v3-larrge-baseline-fold5'  # folder globbed for *.pth checkpoints
    model = '/kaggle/input/huggingface-automodel-save/deberta-v3-large/'  # backbone / tokenizer directory
    reinit = True
    tokenizer = AutoTokenizer.from_pretrained(model)  # loaded at class-definition time
    n_folds = 5
    max_len = 2048  # tokenizer max_length; every sample is padded/truncated to this
    val_batch_size = 8  # batch size of the inference DataLoader
    # presumably keyword arguments for an XGBoost classifier - the consumer is not visible here
    xgb_params = {
        'learning_rate': 0.05,
        'n_estimators': 200,
        'max_depth': 7,
        'min_child_weight': 5,
        'gamma': 0,
        'subsample': 0.7,
        'reg_alpha': 0.0005,
        'colsample_bytree': 0.6,
        'scale_pos_weight': 1,
        'use_label_encoder': False,
        'eval_metric': 'logloss',
        'tree_method': 'hist',
        'random_state': 42,
        'n_jobs': -1,
    }

# class CFG2:
#     """ Inference Configuration Class for Longformer Sequence Length 4096 """
#     wandb = True
#     seed = 42
#     n_gpu = 1
#     device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
#     gpu_id = 0
#     num_workers = 0
#     weight_path = '/kaggle/input/longformer-large-v9'
#     model = '/kaggle/input/longformerlarge4096/longformer-large-4096/'
#     reinit = True
#     tokenizer = AutoTokenizer.from_pretrained(model)
#     n_folds = 5
#     max_len = 4096
#     val_batch_size = 8
#     xgb_params = {
#         'learning_rate': 0.05,
#         'n_estimators': 200,
#         'max_depth': 7,
#         'min_child_weight': 5,
#         'gamma': 0,
#         'subsample': 0.7,
#         'reg_alpha': 0.0005,
#         'colsample_bytree': 0.6,
#         'scale_pos_weight': 1,
#         'use_label_encoder': False,
#         'eval_metric': 'logloss',
#         'tree_method': 'hist',
#         'random_state': 42,
#         'n_jobs': -1,
#     }

In [None]:
""" python & pytorch Reproducility Setting """

def check_library(checker: bool) -> tuple:
    """
    Report the cuDNN status of the current PyTorch build.
    Args:
        checker: True when the current device is mps, False when it is cuda with cudnn
    Returns:
        (is_available, enabled, version) of torch.backends.cudnn when checker is False,
        None when checker is True (nothing is inspected for mps)
    """
    if checker:
        return None
    cudnn = torch.backends.cudnn
    return (cudnn.is_available(), cudnn.enabled, cudnn.version())

def class2dict(cfg) -> dict:
    """ Collect every attribute of cfg whose name does not start with a double underscore """
    return {attr: getattr(cfg, attr) for attr in dir(cfg) if not attr.startswith('__')}

def all_type_seed(cfg, checker: bool) -> None:
    """
    Init seed for python, random, numpy, pytorch
    Args:
        cfg: configuration object exposing an integer `seed` attribute
        checker: False => cuda/cudnn device, True => mps device
    """
    # Only affects child processes: the running interpreter has already fixed its hash seed
    os.environ['PYTHONHASHSEED'] = str(cfg.seed)  # python Seed
    random.seed(cfg.seed)  # random module Seed
    np.random.seed(cfg.seed)  # numpy module Seed
    torch.manual_seed(cfg.seed)  # Pytorch CPU Random Seed Maker

    # device == cuda
    if not checker:
        torch.cuda.manual_seed(cfg.seed)  # Pytorch GPU Random Seed Maker
        torch.cuda.manual_seed_all(cfg.seed)  # Pytorch Multi Core GPU Random Seed Maker
        # torch.cudnn seed
        torch.backends.cudnn.deterministic = True
        # benchmark mode auto-tunes (possibly different) convolution algorithms per run,
        # which defeats the deterministic flag above, so it has to be off for reproducibility
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.enabled = True

    # device == mps
    else:
        torch.mps.manual_seed(cfg.seed)

def seed_worker(worker_id) -> None:
    """
    DataLoader worker_init_fn: derive a 32-bit seed from torch's initial seed and
    apply it to numpy and the random module (worker_id itself is not used)
    """
    seed_32bit = torch.initial_seed() % (1 << 32)
    for seed_fn in (np.random.seed, random.seed):
        seed_fn(seed_32bit)

# CFG2 (the Longformer config) is commented out in this notebook, so referencing it raised
# NameError; CFG1 is the only active config. checker=False selects the cuda/cudnn seeding
# branch, matching check_library(False) and the cuda checkpoints loaded later on.
print(check_library(False))
all_type_seed(CFG1, False)
g = torch.Generator()
g.manual_seed(CFG1.seed)

In [None]:
""" Data Preprocessing Utils """

def ner_tokenizing(cfg, text: str):
    """
    Tokenize one text for the NER pipeline.
    Offsets are requested (return_offsets_mapping=True), which requires a fast
    (Rust-backed) tokenizer; the (char_start, char_end) pair of every token is what
    later maps tokens back to whitespace-split words.
    The output is padded/truncated to cfg.max_len and is NOT converted to tensors
    here (return_tensors=None); the Dataset class does that conversion.
    Args:
        cfg: configuration.CFG, provides `tokenizer` and `max_len`
        text: text from dataframe or any other dataset, please pass str type
    Returns:
        whatever cfg.tokenizer returns for this text
    """
    tokenizer_options = dict(
        return_offsets_mapping=True,  # only available for fast tokenizers; keeps newline characters addressable
        max_length=cfg.max_len,
        padding='max_length',
        truncation=True,
        return_tensors=None,
        add_special_tokens=True,
    )
    return cfg.tokenizer(text, **tokenizer_options)

def load_data(data_path: str) -> pd.DataFrame:
    """
    Read a csv file (train.csv, test.csv, val.csv, ...) into a DataFrame
    Args:
        data_path: location handed straight to pandas.read_csv
    """
    return pd.read_csv(data_path)

def labels2ids():
    """
    Encoding labels to ids for neural network with BIO Styles
    'O' is id 0; every discourse type then contributes a B- tag followed by an I- tag:
    {
    'O': 0, 'B-Lead': 1, 'I-Lead': 2, 'B-Position': 3, 'I-Position': 4, 'B-Claim': 5,
    'I-Claim': 6, 'B-Counterclaim': 7, 'I-Counterclaim': 8, 'B-Rebuttal': 9, 'I-Rebuttal': 10,
    'B-Evidence': 11, 'I-Evidence': 12, 'B-Concluding Statement': 13, 'I-Concluding Statement': 14
    }
    """
    discourse_types = (
        'Lead', 'Position', 'Claim', 'Counterclaim', 'Rebuttal', 'Evidence', 'Concluding Statement'
    )
    tags = ['O']
    for disc in discourse_types:
        tags.extend((f'B-{disc}', f'I-{disc}'))
    return {tag: idx for idx, tag in enumerate(tags)}


def ids2labels():
    """
    Decoding ids to labels for neural network with BIO Styles (inverse of the label -> id table)
    {
    0: 'O', 1: 'B-Lead', 2: 'I-Lead', 3: 'B-Position', 4: 'I-Position', 5: 'B-Claim',
    6: 'I-Claim', 7: 'B-Counterclaim', 8: 'I-Counterclaim', 9: 'B-Rebuttal', 10: 'I-Rebuttal',
    11: 'B-Evidence', 12: 'I-Evidence', 13: 'B-Concluding Statement', 14: 'I-Concluding Statement'
    }
    """
    ordered_tags = (
        'O',
        'B-Lead', 'I-Lead',
        'B-Position', 'I-Position',
        'B-Claim', 'I-Claim',
        'B-Counterclaim', 'I-Counterclaim',
        'B-Rebuttal', 'I-Rebuttal',
        'B-Evidence', 'I-Evidence',
        'B-Concluding Statement', 'I-Concluding Statement',
    )
    return dict(enumerate(ordered_tags))


def split_mapping(unsplit):
    """
    Map every character position of `unsplit` to the index of the split() word covering it.
    Whitespace positions keep the value -1.
    """
    char_to_word = np.full(len(unsplit), -1)
    cursor = 0
    for word_number, word in enumerate(unsplit.split()):
        # words come back from split() in text order, so each one is found at or after the cursor
        begin = unsplit.index(word, cursor)
        cursor = begin + len(word)
        char_to_word[begin:cursor] = word_number
    return char_to_word


# class Collate:
#     def __init__(self, tokenizer):
#         self.tokenizer = tokenizer

#     def __call__(self, batch):
#         output = dict()
#         output["ids"] = [sample["ids"] for sample in batch]
#         output["mask"] = [sample["mask"] for sample in batch]

#         # calculate max token length of this batch
#         batch_max = max([len(ids) for ids in output["ids"]])

#         # add padding
#         if self.tokenizer.padding_side == "right":
#             output["ids"] = [s + (batch_max - len(s)) * [self.tokenizer.pad_token_id] for s in output["ids"]]
#             output["mask"] = [s + (batch_max - len(s)) * [0] for s in output["mask"]]
#         else:
#             output["ids"] = [(batch_max - len(s)) * [self.tokenizer.pad_token_id] + s for s in output["ids"]]
#             output["mask"] = [(batch_max - len(s)) * [0] + s for s in output["mask"]]

#         # convert to tensors
#         output["ids"] = torch.tensor(output["ids"], dtype=torch.long)
#         output["mask"] = torch.tensor(output["mask"], dtype=torch.long)

#         return output

    
# def _prepare_test_data_helper(args, tokenizer, ids):
#     test_samples = []
#     for idx in ids:
#         filename = os.path.join(args.input_path, "test", idx + ".txt")
#         with open(filename, "r") as f:
#             text = f.read()

#         encoded_text = tokenizer.encode_plus(
#             text,
#             add_special_tokens=False,
#             return_offsets_mapping=True,
#         )
#         input_ids = encoded_text["input_ids"]
#         offset_mapping = encoded_text["offset_mapping"]

#         sample = {
#             "id": idx,
#             "input_ids": input_ids,
#             "text": text,
#             "offset_mapping": offset_mapping,
#         }

#         test_samples.append(sample)
#     return test_samples


# def prepare_test_data(df, tokenizer, args):
#     test_samples = []
#     ids = df["id"].unique()
#     ids_splits = np.array_split(ids, 4)

#     results = Parallel(n_jobs=4, backend="multiprocessing")(
#         delayed(_prepare_test_data_helper)(args, tokenizer, idx) for idx in ids_splits
#     )
#     for result in results:
#         test_samples.extend(result)

#     return test_samples

In [None]:
""" Trainer Utils & Metric Function """ 

def get_name(cfg) -> str:
    """
    Get a file-system friendly name of the model: cfg.model with every '/' replaced by '-'
    Args:
        cfg: configuration object whose `model` attribute is the model name/path string
    """
    # str.replace never raises ValueError, so the former try/except fallback was unreachable
    return cfg.model.replace('/', '-')

def calc_overlap(row):
    """
    Calculates the overlap between prediction and ground truth.
    Args:
        row: object exposing `predictionstring_pred` and `predictionstring_gt`,
             both space separated word-index strings
    Returns:
        [shared / len(ground truth words), shared / len(predicted words)]
    """
    pred_words = set(row.predictionstring_pred.split(' '))
    gt_words = set(row.predictionstring_gt.split(' '))
    n_shared = len(gt_words & pred_words)
    # split(' ') always yields at least one element, so neither set is ever empty
    return [n_shared / len(gt_words), n_shared / len(pred_words)]


def calculate_f1(pred_df: pd.DataFrame, gt_df: pd.DataFrame) -> float:
    """
    Function for scoring for competition
    Step 1:
        All ground truths and predictions for a given (id, class) are paired with an outer join
    Step 2:
        A pair is a potential true positive when the shared words cover >= 0.5 of the
        ground truth AND >= 0.5 of the prediction.
        If multiple matches exist for one ground truth, the one with the highest overlap is taken.
    Step 3:
        Any unmatched ground truths are false negatives and any unmatched predictions are false positives.
    Step 4:
        F1 = TP / (TP + 0.5 * (FP + FN))
    Args:
        pred_df: needs columns ['id', 'class', 'predictionstring']
        gt_df: needs columns ['id', 'discourse_type', 'predictionstring']
    Returns:
        F1 score, 0.0 when there is nothing to score
    Reference:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation
    """
    gt_df = gt_df[['id', 'discourse_type', 'predictionstring']].reset_index(drop=True).copy()
    pred_df = pred_df[['id', 'class', 'predictionstring']].reset_index(drop=True).copy()
    pred_df['pred_id'] = pred_df.index
    gt_df['gt_id'] = gt_df.index
    # Step 1. all ground truths and predictions for a given class are compared.
    joined = pred_df.merge(gt_df,
                           left_on=['id', 'class'],
                           right_on=['id', 'discourse_type'],
                           how='outer',
                           suffixes=('_pred', '_gt')
                           )
    if joined.empty:
        return 0.0
    joined['predictionstring_gt'] = joined['predictionstring_gt'].fillna(' ')
    joined['predictionstring_pred'] = joined['predictionstring_pred'].fillna(' ')

    # Step 2. overlaps are read straight from the returned lists (no eval(str(...)) round trip)
    overlaps = [
        calc_overlap(row)
        for row in joined[['predictionstring_pred', 'predictionstring_gt']].itertuples(index=False)
    ]
    joined['overlap1'] = [pair[0] for pair in overlaps]
    joined['overlap2'] = [pair[1] for pair in overlaps]

    joined['potential_TP'] = (joined['overlap1'] >= 0.5) & (joined['overlap2'] >= 0.5)
    joined['max_overlap'] = joined[['overlap1', 'overlap2']].max(axis=1)
    candidates = joined[joined['potential_TP']]
    tp_pred_ids = candidates \
        .sort_values('max_overlap', ascending=False) \
        .groupby(['id', 'predictionstring_gt']).first()['pred_id'].values

    # Step 3. The outer join leaves NaN in pred_id for ground truths without any prediction
    # (and NaN in gt_id for predictions without ground truth). Those NaN placeholders are
    # neither predictions nor ground truths, so they are dropped before counting; otherwise
    # each would add one phantom FP / FN.
    tp_lookup = set(tp_pred_ids)
    fp_pred_ids = [p for p in joined['pred_id'].dropna().unique() if p not in tp_lookup]

    matched_gt_ids = set(candidates['gt_id'].unique())
    unmatched_gt_ids = [c for c in joined['gt_id'].dropna().unique() if c not in matched_gt_ids]

    # Get numbers of each type
    TP = len(tp_pred_ids)
    FP = len(fp_pred_ids)
    FN = len(unmatched_gt_ids)
    # Step 4. calc microf1
    denominator = TP + 0.5 * (FP + FN)
    if denominator == 0:
        return 0.0
    return TP / denominator

In [None]:
""" Custom Dataset Class """

class NERDataset(Dataset):
    """
    Custom Dataset Class for NER Task
    Args:
        cfg: configuration class (e.g. CFG1) providing `tokenizer` and `max_len`
        df: dataframe with columns `id`, `text` (and `entities` when is_train is True)
        is_train: True => items carry `labels`, False => items carry `word_ids`
    """
    def __init__(self, cfg, df: pd.DataFrame, is_train: bool = True) -> None:
        # cfg is intentionally un-annotated: the former `CFG2` annotation referenced a class
        # that is commented out, which raised NameError as soon as this class was defined
        self.cfg = cfg
        self.df = df
        self.tokenizer = ner_tokenizing
        self.labels2ids = labels2ids()  # dict: label text -> id
        self.ids2labels = ids2labels()  # dict: id -> label text
        self.is_train = is_train

    def __len__(self) -> int:
        return len(self.df)

    @staticmethod
    def _dominant_word_index(split_idxs: np.ndarray) -> int:
        """
        Pick the split() word a token belongs to from the word index of each of its characters.
        Returns the most frequent non -1 index (smallest one on ties), the single shared value
        when all characters agree, or -1 for a token without any character.
        """
        if len(split_idxs) == 0:
            # zero-width offset: no character to look up, treat as "no word found"
            return -1
        if len(np.unique(split_idxs)) > 1:
            # np.unique is used instead of scipy.stats.mode(...).mode[0], whose result is a
            # scalar (not indexable) on recent SciPy versions; ties resolve to the smallest value
            words, counts = np.unique(split_idxs[split_idxs != -1], return_counts=True)
            return words[np.argmax(counts)]
        return split_idxs[0]

    def __getitem__(self, item: int) -> tuple:
        """
        1) Tokenize the text (offsets requested, padded to cfg.max_len)
        2) Walk the tokens in reverse and map each one to a split() word:
            - special/padding tokens (word id None) and (0, 0) offsets get label -100
            - a token overlapping a word takes that word's label and word index
            - a token overlapping no word continues the following 'I' label, if any,
              otherwise it gets label -100
        3) Return (id, encoding):
            - Train: encoding additionally holds `labels`
            - Validation/Test: encoding additionally holds `word_ids`
              (index of the split() word per token, -1 when none)
           every value of encoding is converted with torch.as_tensor
        """
        ids = self.df.id[item]
        text = self.df.text[item]
        if self.is_train:
            word_labels = ast.literal_eval(self.df.entities[item])

        # 1) Tokenizing input text
        encoding = self.tokenizer(
            self.cfg,
            text,
        )
        word_ids = encoding.word_ids()
        split_word_ids = np.full(len(word_ids), -1)
        offset_to_wordidx = split_mapping(text)  # one entry per character of text
        offsets = encoding['offset_mapping']  # [(src, end), (src, end), ...]

        # 2) Find having same parent token and then label BIO NER Tags
        label_ids = []  # built back-to-front, reversed at the end
        for token_idx, word_idx in reversed(list(enumerate(word_ids))):
            if word_idx is None:
                # special / padding token
                if self.is_train:
                    label_ids.append(-100)
            else:
                if offsets[token_idx] != (0, 0):
                    # Choose the split word that shares the most characters with the token if any
                    split_idxs = offset_to_wordidx[offsets[token_idx][0]:offsets[token_idx][1]]
                    split_index = self._dominant_word_index(split_idxs)
                    if split_index != -1:
                        if self.is_train:
                            label_ids.append(self.labels2ids[word_labels[split_index]])
                        split_word_ids[token_idx] = split_index
                    else:
                        # Even if we don't find a word, continue labeling 'I' tokens until a 'B' token is found
                        if label_ids and label_ids[-1] != -100 and self.ids2labels[label_ids[-1]][0] == 'I':
                            split_word_ids[token_idx] = split_word_ids[token_idx + 1]
                            if self.is_train:
                                label_ids.append(label_ids[-1])
                        else:
                            if self.is_train:
                                label_ids.append(-100)
                else:
                    if self.is_train:
                        label_ids.append(-100)
        if not self.is_train:
            encoding['word_ids'] = torch.as_tensor(split_word_ids)
        else:
            encoding['labels'] = list(reversed(label_ids))
        for k, v in encoding.items():
            encoding[k] = torch.as_tensor(v)
        return ids, encoding
    
    
# class LongformerNERDataset:
#     """ Custom Dataset Class for Longformer """
#     def __init__(self, samples, max_len, tokenizer):
#         self.samples = samples
#         self.max_len = max_len
#         self.tokenizer = tokenizer
#         self.length = len(samples)

#     def __len__(self):
#         return self.length

#     def __getitem__(self, idx):
#         input_ids = self.samples[idx]["input_ids"]
#         # print(input_ids)
#         # print(input_labels)

#         # add start token id to the input_ids
#         input_ids = [self.tokenizer.cls_token_id] + input_ids

#         if len(input_ids) > self.max_len - 1:
#             input_ids = input_ids[: self.max_len - 1]

#         # add end token id to the input_ids
#         input_ids = input_ids + [self.tokenizer.sep_token_id]
#         attention_mask = [1] * len(input_ids)

#         # padding_length = self.max_len - len(input_ids)
#         # if padding_length > 0:
#         #     if self.tokenizer.padding_side == "right":
#         #         input_ids = input_ids + [self.tokenizer.pad_token_id] * padding_length
#         #         attention_mask = attention_mask + [0] * padding_length
#         #     else:
#         #         input_ids = [self.tokenizer.pad_token_id] * padding_length + input_ids
#         #         attention_mask = [0] * padding_length + attention_mask

#         # return {
#         #     "ids": torch.tensor(input_ids, dtype=torch.long),
#         #     "mask": torch.tensor(attention_mask, dtype=torch.long),
#         # }

#         return {
#             "ids": input_ids,
#             "mask": attention_mask,
#         }

In [None]:
""" Custom Model Class """

class DeBERTaModel(nn.Module):
    """
    Token-level classifier on top of a pretrained backbone (no pooling layer).
    The head emits 15 scores per token for the B.I.O. tagging scheme:
    7 discourse types x (B, I) = 14 tags, plus the single 'O' tag.
    Args:
        cfg: configuration.CFG, `cfg.model` is the backbone name/path
    """
    def __init__(self, cfg) -> None:
        super().__init__()
        self.cfg = cfg
        self.auto_cfg = AutoConfig.from_pretrained(self.cfg.model, output_hidden_states=True)
        self.model = AutoModel.from_pretrained(self.cfg.model, config=self.auto_cfg)
        self.fc = nn.Linear(self.auto_cfg.hidden_size, 15)  # BIO Style NER Task

    def feature(self, inputs_ids, attention_mask, token_type_ids):
        """ Run the backbone and return its raw output object """
        return self.model(
            input_ids=inputs_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

    def forward(self, inputs) -> Tensor:
        """
        Project the last hidden state of every token to 15 logits.
        Args:
            inputs: mapping that holds at least input_ids, attention_mask and token_type_ids
        """
        backbone_out = self.feature(
            inputs["input_ids"],
            inputs["attention_mask"],
            inputs["token_type_ids"],
        )
        return self.fc(backbone_out.last_hidden_state)

class LongformerModel(nn.Module):
    """
    Model class for fine-tuned longformer.
    One shared 15-way linear head is applied to five differently dropped-out copies of the
    last hidden state and the five results are averaged.
    Args:
        cfg: configuration.CFG, `cfg.model` is the backbone name/path
    """
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.auto_cfg = AutoConfig.from_pretrained(self.cfg.model)
        self.model = AutoModel.from_pretrained(self.cfg.model)
        self.output = nn.Linear(self.auto_cfg.hidden_size, 15)
        self.drop_out = nn.Dropout(0.1)  # not used by forward
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)

    def feature(self, inputs_ids, attention_mask, token_type_ids):
        """ Run the backbone and return its raw output object """
        return self.model(
            input_ids=inputs_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

    def forward(self, inputs) -> Tensor:
        """
        No Pooling Layer for word-level task
        Args:
            inputs: mapping that holds at least input_ids, attention_mask and token_type_ids
        Returns:
            mean of the head output over the five dropout branches
        """
        hidden = self.feature(
            inputs["input_ids"],
            inputs["attention_mask"],
            inputs["token_type_ids"],
        ).last_hidden_state

        branches = (self.dropout1, self.dropout2, self.dropout3, self.dropout4, self.dropout5)
        total = self.output(branches[0](hidden))
        for branch in branches[1:]:
            total = total + self.output(branch(hidden))
        return total / 5

In [None]:
""" torch.cuda, cudnn, reproducibility setting """

# CFG2 is commented out in this notebook (NameError); CFG1 is the only active config.
# checker=False selects the cuda/cudnn branch, the device the checkpoints are loaded for.
check_library(False)
all_type_seed(CFG1, False)
g = torch.Generator()
g.manual_seed(CFG1.seed)


""" Trainer Class for Make Sequence Dataset for Multiple Label Classification Task Pipeline """

class SequenceDataTrainer:
    """
    Forward-pass-only helper: runs a fine-tuned model over a dataframe and turns the
    token probabilities into per-word probabilities for the sequence dataset.
    Args:
        cfg: configuration class (needs model, device, max_len, val_batch_size, num_workers)
        df: dataframe that is fed to NERDataset in inference mode
        generator: torch.Generator handed to the DataLoader
    """
    def __init__(self, cfg, df: pd.DataFrame, generator: torch.Generator) -> None:
        self.cfg = cfg
        self.model_name = get_name(self.cfg)
        self.generator = generator
        self.df = df

    def make_batch(self, fold: int) -> tuple[torch.utils.data.DataLoader, pd.DataFrame]:
        """
        Build the (unshuffled) inference DataLoader over the whole dataframe.
        Args:
            fold: kept for interface compatibility, not used here
        Returns:
            (loader, the dataframe the loader iterates over)
        """
        # Custom Datasets
        valid_dataset = NERDataset(self.cfg, self.df, is_train=False)
        loader_valid = DataLoader(
            valid_dataset,
            batch_size=self.cfg.val_batch_size,
            shuffle=False,  # order must match the dataframe rows
            worker_init_fn=seed_worker,
            generator=self.generator,
            num_workers=self.cfg.num_workers,
            pin_memory=True,
            drop_last=False,
        )
        return loader_valid, self.df

    @staticmethod
    def _strip_data_parallel_prefix(state_dict) -> OrderedDict:
        """ Drop a leading `module.` (added by nn.DataParallel) from the keys that have one """
        prefix = 'module.'
        cleaned = OrderedDict()
        for key, value in state_dict.items():
            # only strip when the prefix is really there; blindly cutting 7 characters
            # mangled plain keys, which strict=False then ignored silently
            cleaned[key[len(prefix):] if key.startswith(prefix) else key] = value
        return cleaned

    def model_setting(self, model_name: str, path: str, fold: int):
        """
        Build the model and load the fine-tuned weights stored at `path`.
        Args:
            model_name: 'deberta' => DeBERTaModel, anything else => LongformerModel
            path: checkpoint file holding a state dict
            fold: kept for interface compatibility, not used here
        Returns:
            the model, moved to cfg.device
        """
        # map_location follows cfg.device (cuda:0 when available) instead of a hard-coded
        # "cuda:0", so the checkpoint also loads on a machine without CUDA
        if model_name == 'deberta':
            model = DeBERTaModel(self.cfg)
            model.load_state_dict(torch.load(path, map_location=self.cfg.device),)
        else:
            model = LongformerModel(
                self.cfg
            )
            longformer_state_dict = torch.load(path, map_location=self.cfg.device)
            new_state_dict = self._strip_data_parallel_prefix(longformer_state_dict)
            model.load_state_dict(new_state_dict, strict=False)  # strict=False

        model.to(self.cfg.device)
        return model

    @staticmethod
    def _word_level_probs(token_probs: np.ndarray, token_word_ids: np.ndarray) -> list:
        """
        Average the probability rows of consecutive tokens that share a word index.
        Tokens with word index -1 are skipped. One final mean is always appended, so a
        sample without any word token ends with the mean of an empty buffer.
        """
        prediction, prob_buffer, previous_word_idx = [], [], -1
        sub_ids = token_word_ids[token_word_ids != -100]
        for i, word_idx in enumerate(sub_ids):
            if word_idx == -1:
                pass
            elif word_idx != previous_word_idx:
                if prob_buffer:
                    prediction.append(np.mean(prob_buffer, dtype=np.float32, axis=0))
                    prob_buffer = []
                prob_buffer.append(token_probs[i])
                previous_word_idx = word_idx
            else:
                prob_buffer.append(token_probs[i])
        prediction.append(np.mean(prob_buffer, dtype=np.float32, axis=0))
        return prediction

    def inference_fn(self, loader_valid: torch.utils.data.DataLoader, model: nn.Module) -> tuple[list, list]:
        """
        Run the model over the loader and aggregate token probabilities per word.
        Probabilities are kept as they are (no argmax / label decoding).
        Args:
            loader_valid: yields (ids, inputs); inputs must contain `word_ids`
            model: called as model(inputs), expected to return
                   [batch_size, sequence_length(= cfg.max_len), 15] logits
        Returns:
            val_ids_list: sample ids in loader order
            predictions: per sample, a list with one 15-dim probability vector per word
        """
        val_ids_list = []
        n_samples = len(loader_valid.dataset)
        batch_size = self.cfg.val_batch_size
        full_pred = np.zeros((n_samples, self.cfg.max_len, 15), dtype=np.float32)
        word_ids = np.full((n_samples, self.cfg.max_len), -100)
        model.eval()
        with torch.no_grad():
            for step, (ids, inputs) in enumerate(tqdm(loader_valid)):
                for k, v in inputs.items():
                    inputs[k] = v.to(self.cfg.device)  # move batch to the model device

                val_pred = model(inputs)  # [batch_size, sequence_length, num_labels]
                val_prob = F.softmax(val_pred, dim=2).cpu().detach().numpy()  # dim 2 == num_labels dim
                rows = slice(step * batch_size, (step + 1) * batch_size)
                full_pred[rows] += val_prob
                word_ids[rows] = inputs["word_ids"].cpu().detach().numpy()
                val_ids_list.extend(ids)

        # 2) make prediction list, one entry per sample
        predictions = [
            self._word_level_probs(full_pred[idx], word_ids[idx])
            for idx in range(full_pred.shape[0])
        ]
        gc.collect()
        torch.cuda.empty_cache()
        return val_ids_list, predictions

In [None]:
""" Sorting OOF DeBERTa-V3-Large Model weight list """

model_list = glob.glob(f'{CFG1.weight_path}/*.pth')
# Sort key = 5th character of the checkpoint FILE NAME (assumed to be the single-digit fold
# number, e.g. "fold3..." - TODO confirm the naming). os.path.basename replaces
# split('/')[4], which only worked while weight_path sat exactly three directories deep.
sorted_model_list = sorted(
    ([path, os.path.basename(path)[4]] for path in model_list),
    key=lambda pair: pair[1],
)
model_list = [pair[0] for pair in sorted_model_list]

In [None]:
"""
DeBERTa-V3-Large Inference
"""
model_name = 'deberta'
all_id_list, all_pred_list = [], []
tmp_dataframe = load_data('/kaggle/input/fbp2-preprocessed-train-dataframe/final_converted_train_df.csv')
# Reproducible 50% sample of the ids; the dataframe keeps its original row order
random.seed(42)
sample_id_list = random.sample(tmp_dataframe.id.to_list(), int(len(tmp_dataframe) * 0.5))
tmp_dataframe = tmp_dataframe[tmp_dataframe['id'].isin(sample_id_list)].reset_index(drop=True)

for fold, model_path in enumerate(tqdm(model_list)):
    print(f'============== {fold}th Fold forward ==============')
    forward_input = SequenceDataTrainer(CFG1, tmp_dataframe, g)
    loader_valid, valid = forward_input.make_batch(fold)
    fold_model = forward_input.model_setting(model_name, model_path, fold)
    # forward pass: all_id_list is identical for every fold (same unshuffled loader)
    all_id_list, predictions = forward_input.inference_fn(loader_valid, fold_model)
    all_pred_list.append(predictions)
    del fold_model
    gc.collect()
    torch.cuda.empty_cache()

""" Average the word probabilities of all fold models, essay by essay """
# all_pred_list[fold][essay] is a list of per-word 15-dim vectors. The previous
# accumulation (`tmp += all_pred_list[i][j]`) added a list to an int (TypeError) and would
# have summed across essays as well; the mean is taken across folds only, and the divisor
# is the number of models actually run instead of a hard-coded 5.
n_fold_models = len(all_pred_list)
deberta_oof = np.empty(len(all_id_list), dtype=object)  # essays differ in word count => object array
for essay_idx in range(len(all_id_list)):
    deberta_oof[essay_idx] = np.mean(
        [all_pred_list[fold_idx][essay_idx] for fold_idx in range(n_fold_models)],
        axis=0,
    )

In [None]:
"""
Start building the sequence dataset by forwarding each fold's data through that fold's model weights.
This loop produces the prediction list that is used to build the sequence dataframe.
We can tune the 3 boosting algorithms with the "Full Train Dataset".
The code aims to use the full train data but, because of time-out problems, we can't use all of it.
So we use 50% of each fold first and then increase the amount of data.
"""
# model_name = 'longformer'
# longformer_all_id_list, longformer_all_pred_list = [], []
# model_list = glob.glob(f'{CFG2.weight_path}/*.pt')
# for fold, model_path in tqdm(enumerate(model_list)):
#     print(f'============== {fold}th fold forward ==============')
#     forward_input = SequenceDataTrainer(CFG2, g)
#     loader_valid, valid = forward_input.make_batch(fold)
#     fold_model = forward_input.model_setting(model_name, model_path, fold)
#     # forward pass
#     longformer_all_id_list, predictions = forward_input.inference_fn(loader_valid, fold_model)
#     longformer_all_pred_list.append(predictions)
#     del fold_model
#     gc.collect()
#     torch.cuda.empty_cache()

In [None]:
""" Save prediction array for speed up debugging & tuning """

""" DeBERTa-V3-Large """
np.save('sampling_pred_list.npy', deberta_oof)  # save predictions
# Save the ids in LOADER order (all_id_list): that is the row order of the predictions.
# sample_id_list is in random.sample order and does not line up with them.
np.save('sampling_id_list.npy', np.array(all_id_list))  # save id list

# """ Longformer-4096-Large """
# longformer_sampling_predictions = np.array(longformer_all_pred_list)
# np.save('longformer_sampling_pred_list.npy', longformer_sampling_predictions)  # save predictions
# np.save('longformer_sampling_id_list.npy', np.array(longformer_all_id_list))  # save id list

In [None]:
# """ 
# 1) Load DeBERTa-V3-Large, Longformer-Large-4096 inference array
# 2) Ensemble Each inference Result
# """

# """
# Load prediction result for Sampling train dataframe from DeBERTa-V3-Large
# """
# deberta_all_pred_list = np.load(
#     "/kaggle/input/feedbackprize2-sampling-inference-for-tuning/sampling_pred_list.npy",
#     allow_pickle=True
# )
# deberta_all_id_list = np.load(
#     '/kaggle/input/feedbackprize2-sampling-inference-for-tuning/sampling_id_list.npy',
#     allow_pickle=True
# )

# """
# Load prediction result for Sampling train dataframe from Longformer-Large-4096
# """
# longformer_all_pred_list = np.load(
#     "/kaggle/input/feedbackprize2-sampling-inference-for-tuning/sampling_pred_list.npy",
#     allow_pickle=True
# )
# lonformer_all_id_list = np.load(
#     '/kaggle/input/feedbackprize2-sampling-inference-for-tuning/sampling_id_list.npy',
#     allow_pickle=True
# )

""" Ensemble Two Inference Result for Tuning XGBoost """

# One group index per entry of all_pred_list. all_pred_list gets one entry per fold model
# in the DeBERTa inference cell, so this is a range over folds.
# NOTE(review): if the groups are meant to be one per text (essay), this should probably be
# range(len(deberta_oof)) - confirm against the code that consumes uniqueValidGroups.
uniqueValidGroups = range(len(all_pred_list))
uniqueValidGroups

In [None]:
"""
Convert the prediction list into the sequence DataFrame.
This code aims to use the full train data but, because of time-out problems, we can't use all of it.
So we use 10% of each fold's data first and then increase it if that works.
"""
# 1) Make Input
train_df = load_data('/kaggle/input/feedback-prize-2021/train.csv')
# rows of the essays that went through inference
valid_df = train_df[train_df['id'].isin(all_id_list)].reset_index(drop=True)

# discourse type -> (id of its B- tag, id of its I- tag)
disc_type_to_ids = {
    'Evidence': (11, 12),
    'Claim': (5, 6),
    'Lead': (1, 2),
    'Position': (3, 4),
    'Counterclaim': (7, 8),
    'Rebuttal': (9, 10),
    'Concluding Statement': (13, 14),
}

# 2) Minimum Threshold Value of Each Target Classes, Reference from Discussion from competition
MIN_BEGIN_PROB = {
    'Claim': .35,
    'Concluding Statement': .15,
    'Counterclaim': .04,
    'Evidence': .1,
    'Lead': .32,
    'Position': .25,
    'Rebuttal': .01,
}

# Longest sequence considered per discourse type: 99.5% quantile of the word count
train_df['len'] = train_df['predictionstring'].apply(lambda x: len(x.split()))
max_lens = train_df.groupby('discourse_type')['len'].quantile(.995)
MAX_SEQ_LEN = {disc: int(max_lens[disc]) for disc in disc_type_to_ids}

# 3) Custom Dataset Class
class SeqDataset:
    """
    Column-wise container for candidate word sub-sequences; position k of every
    attribute describes the same candidate.

    Args:
        features: per-candidate feature rows (sequence length, position and
            class-probability statistics), stored as float32
        labels: whether the candidate matches a discourse instance exactly
        groups: integer index of the text the candidate was found in, stored as int16
        wordRanges: (start, end) word indices of the candidate in its text, stored as int16
        truePos: whether the candidate matches a discourse instance by the
            competition's true-positive criterion
    Reference:
        https://www.kaggle.com/code/chasembowers/sequence-postprocessing-v2-67-lb
    """
    def __init__(
        self,
        features: list,
        labels: list,
        groups: list,
        wordRanges: list,
        truePos: list
    ) -> None:
        # Every input is copied into its own numpy array so boolean-mask slicing works on all of them.
        self.features = np.array(features, dtype=np.float32)
        self.labels = np.array(labels)
        # int16 keeps the large candidate table compact; values must fit that range.
        self.groups = np.array(groups, dtype=np.int16)
        self.wordRanges = np.array(wordRanges, dtype=np.int16)
        self.truePos = np.array(truePos)

# 4) Making DataFrame Utils Function
def sorted_quantile(array: list, q: float):
    """
    Linearly interpolated quantile(s) of values that are ALREADY sorted ascending,
    so callers that keep a sorted list avoid re-sorting for every sequence.
    Args:
        array: sorted list of elements
        q: cumulative probability in [0, 1]; an array of probabilities is evaluated element-wise
    Returns:
        the interpolated value at q (an array when q is an array)
    Reference:
        https://stackoverflow.com/questions/60467081/linear-interpolation-in-numpy-quantile
        https://www.kaggle.com/code/chasembowers/sequence-postprocessing-v2-67-lb/notebook
    """
    values = np.array(array)
    # Fractional position of the quantile inside the sorted values
    position = (len(values) - 1) * q
    lower = np.floor(position).astype(int)
    weight = position - lower
    # Only step to the next element when the position falls strictly between two elements
    upper = lower + (weight > 0).astype(int)
    low_value, high_value = values[lower], values[upper]
    return low_value + (high_value - low_value) * weight

def sequence_dataset(
    disc_type: str,
    valid_word_preds: np.ndarray,
    dataframe: pd.DataFrame,
    test_word_preds: np.ndarray = None,
    pred_indices=None,
    submit: bool = False,
    text_ids=None
        ):
    """
    Enumerate candidate word sub-sequences for one discourse type and describe each one with
    13 features, turning the word-level (NER) predictions into a sequence-level classification task.

    Feature columns: [0] length in words, [1] start / num_words, [2] end / num_words,
    [3:10] 7 evenly spaced quantiles of P(begin or inside) over the candidate's words,
    [10] P(begin or inside) of the word left of the candidate (0 at the text start),
    [11] the same for the word right of the candidate (0 at the text end),
    [12] P(begin) of the candidate's first word.

    Args:
        disc_type: discourse type, for example 'Claim' or 'Evidence'; selects the class columns,
            the minimum begin probability and the maximum candidate length
        valid_word_preds: per-text word predictions used when submit is False;
            word_preds[text_i] is indexed as [word, class]
        dataframe: ground-truth rows with 'discourse_type' and 'predictionstring' columns
            (plus 'id' when text_ids is given); not read when submit is True
        test_word_preds: per-text word predictions used when submit is True
        pred_indices: optional list/array of text indices to process; None or empty means every text
        submit: if True, use test_word_preds and emit None for labels and truePos
        text_ids: optional sequence aligned with the word predictions, text_ids[text_i] being the
            'id' of text_i in `dataframe`. When given, each text is labelled against its own
            ground-truth rows only.
            NOTE(review): with the default (None) the rows of *every* text in `dataframe` are
            pooled into each text's ground truth, exactly as before; callers that pass a
            multi-text frame should supply text_ids to get per-text labels.
    Returns:
        SeqDataset holding the features, exact-match labels, text indices, (start, end) word
        ranges and true-positive flags of all generated candidates.
    Reference:
        https://www.kaggle.com/code/chasembowers/sequence-postprocessing-v2-67-lb/notebook
    """
    word_preds = valid_word_preds if not submit else test_word_preds

    # `if pred_indices:` raised "truth value of an array is ambiguous" for numpy index arrays.
    if pred_indices is None or len(pred_indices) == 0:
        window = range(len(word_preds))
    else:
        window = pred_indices

    # Per-class settings do not change between texts, so look them up once.
    num_features = 13
    disc_begin, disc_inside = disc_type_to_ids[disc_type]
    min_begin = MIN_BEGIN_PROB[disc_type]
    max_seq_len = MAX_SEQ_LEN[disc_type]
    quants = np.linspace(0, 1, 7)  # for quantile

    def parse_spans(gt_rows):
        # 'predictionstring' holds space-separated word indices; keep (first, last + 1) per row
        spans = []
        for pred_string in gt_rows['predictionstring']:
            splt = pred_string.split()
            spans.append((int(splt[0]), int(splt[-1]) + 1))
        return spans

    if not submit:
        disc_gt = dataframe.loc[dataframe.discourse_type == disc_type]
        # Without text_ids the ground truth is identical for every text: parse it only once.
        pooled_spans = parse_spans(disc_gt) if text_ids is None else None

    X = np.empty((int(1e6), num_features), dtype=np.float32)
    X_ind = 0
    y = []
    truePos = []
    wordRanges = []
    groups = []
    for text_i in tqdm(window):
        text_preds = np.array(word_preds[text_i])
        num_words = len(text_preds)
        if num_words == 0:
            # Nothing to enumerate (and the position features would divide by zero)
            continue

        # The probability that a word corresponds to either a 'B'-egin or 'I'-nside token for a class
        begin_or_inside_probs = 1 - (1 - text_preds[:, disc_begin]) * (1 - text_preds[:, disc_inside])
        prob_begins = text_preds[:, disc_begin]

        if not submit:
            if text_ids is None:
                spans = pooled_spans
            else:
                spans = parse_spans(disc_gt.loc[disc_gt.id == text_ids[text_i]])

            # Represent the discourse instance locations in a hash set and an integer array for speed
            gt_idx = set(spans)
            gt_arr = np.zeros(num_words, dtype=int)
            for span_no, (start, end) in enumerate(spans, start=1):
                gt_arr[start:end] = span_no  # 0 stays "no discourse instance"
            gt_lens = np.bincount(gt_arr)

        # Iterate over every sub-sequence in the text
        for pred_start in range(num_words):
            prob_begin = prob_begins[pred_start]
            if not prob_begin > min_begin:
                continue

            left_neighbour_prob = begin_or_inside_probs[pred_start - 1] if pred_start > 0 else 0
            begin_or_inside = []  # kept sorted so the quantiles need no re-sorting
            last_end = min(num_words, pred_start + max_seq_len)
            for pred_end in range(pred_start + 1, last_end + 1):
                new_prob = begin_or_inside_probs[pred_end - 1]
                begin_or_inside.insert(bisect_left(begin_or_inside, new_prob), new_prob)

                # Generate features for a word sub-sequence

                # The length and position of start/end of the sequence
                seq_len = pred_end - pred_start
                features = [seq_len, pred_start / float(num_words), pred_end / float(num_words)]

                # 7 evenly spaced quantiles of the distribution of relevant class probabilities for this sequence
                features.extend(sorted_quantile(begin_or_inside, quants))

                # The probability that words on either edge of the current sub-sequence belong to the class of interest
                features.append(left_neighbour_prob)
                features.append(begin_or_inside_probs[pred_end] if pred_end < num_words else 0)

                # The probability that the first word corresponds to a 'B'-egin token
                features.append(prob_begin)

                if submit:
                    exact_match = None
                    true_pos = None
                else:
                    exact_match = (pred_start, pred_end) in gt_idx
                    # True positive: some instance covers >= 50% of the candidate AND
                    # the candidate covers >= 50% of that instance
                    true_pos = False
                    for match_cand, count in Counter(gt_arr[pred_start:pred_end]).most_common(2):
                        if (match_cand != 0
                                and count / float(seq_len) >= .5
                                and float(count) / gt_lens[match_cand] >= .5):
                            true_pos = True

                # For efficiency, use a numpy array instead of a list that doubles in size when full to conserve constant "append" time complexity
                if X_ind >= X.shape[0]:
                    new_X = np.empty((X.shape[0] * 2, num_features), dtype=np.float32)
                    new_X[:X.shape[0]] = X
                    X = new_X
                X[X_ind] = features
                X_ind += 1

                y.append(exact_match)
                truePos.append(true_pos)
                wordRanges.append((np.int16(pred_start), np.int16(pred_end)))
                groups.append(np.int16(text_i))

    return SeqDataset(X[:X_ind], y, groups, wordRanges, truePos)

In [None]:
"""
Parallelize part sequence dataset generation for Tuning 3 Boosting Algorithm
Source Code from:
    https://www.kaggle.com/code/chasembowers/sequence-postprocessing-v2-67-lb
"""

# NOTE(review): this re-assigns the mode flags that the configuration cell already sets,
# so a True value chosen there is overridden from this point on.
submit, SUBMISSION = False, False
# Manager process backing the shared dict that the worker processes store their datasets in
manager = Manager()

def generate_sequence_dataset(disc_type: str, submit=False):
    """
    Build the validation sequence dataset of one discourse type and store it in the shared
    `validSeqSets` mapping under that type.

    Does nothing when `submit` is true or the notebook runs in SUBMISSION mode.
    """
    if submit or SUBMISSION:
        return
    validSeqSets[disc_type] = sequence_dataset(disc_type, all_pred_list, valid_df)
    # Submission-time counterpart (currently disabled):
    #     submitSeqSets[disc_type] = sequence_dataset(
    #         disc_type,
    #
    #         submit=True)

print('Making validation sequence datasets...')
# Shared mapping: each worker process stores its finished dataset under the discourse type it handled
validSeqSets = manager.dict()
# One job per discourse type, spread over all available cores
sequence_jobs = (
    delayed(generate_sequence_dataset)(disc_type, False)
    for disc_type in disc_type_to_ids
)
Parallel(n_jobs=-1, backend='multiprocessing')(sequence_jobs)
print('Done.')

# print('Making submit sequence datasets...')
# submitSeqSets = manager.dict()
# Parallel(n_jobs=-1, backend='multiprocessing')(
#         delayed(sequenceDataset)(disc_type, True)
#        for disc_type in disc_type_to_ids
#     )
# print('Done.')

In [None]:
"""
Re-Ranking Function
Downsample negative samples to NEGATIVE_SAMPLE_RATIO negatives per positive (10:1 below) for efficiency/ease. There are many samples, and a performance increase was observed.
"""
# Number of negative samples kept per positive sample when down-sampling
NEGATIVE_SAMPLE_RATIO = 10

def resample(y):
    """
    Pick the row indices of a down-sampled training set: every positive plus at most
    NEGATIVE_SAMPLE_RATIO randomly chosen negatives per positive.

    The global numpy RNG is re-seeded on every call from the call counter and the number of
    negatives, so repeated runs draw the same samples.

    Args:
        y: 1-D array of binary labels (0/1 or bool)
    Returns:
        sorted integer array of the selected positions in y
    """
    global resample_call
    y = np.asarray(y)
    # minlength=2 keeps counts[1] valid when y contains no positive at all
    counts = np.bincount(y, minlength=2)
    np.random.seed((resample_call + counts[0]) % 2**32)

    negatives = np.flatnonzero(y == 0)
    positives = np.flatnonzero(y == 1)
    # Sampling without replacement cannot draw more negatives than exist
    neg_sample_count = min(NEGATIVE_SAMPLE_RATIO * counts[1], len(negatives))
    indices = np.concatenate((
        np.random.choice(negatives, neg_sample_count, replace=False),
        positives
    ))
    indices.sort()
    resample_call += 1
    return indices

# Number of resample() calls so far; part of the seed so successive calls differ deterministically
resample_call = 0

In [None]:
"""
Tune 3 Boosting Algorithm (XGBoost, LightGBM, CatBoost) with Cross Validation
Add more Classifier Algorithm
Original Source Code from:
    https://www.kaggle.com/code/chasembowers/sequence-postprocessing-v2-67-lb
"""

# Number of GroupKFold splits used while tuning
NUM_FOLDS = 8

# Frames the prediction functions read from
test_dataset = valid_df
train_text_df = pd.read_csv('/kaggle/input/fbp2-preprocessed-train-dataframe/final_converted_train_df.csv')

# Mutable state shared by the tuning functions below
seq_cache = {}  # (discourse type, test groups, train indices) -> per-text candidates sorted by score
clfs = []  # every classifier fitted during tuning is appended here
submitSeqSets = []


def xgb_predict_strings(
    disc_type: str,
    probThresh: float,
    test_groups,
    train_ind=None,
    submit=False
    ):
    """
    Predict non-overlapping sub-sequences for a discourse type and set of train/test texts.

    Tuning workflow (submit=False): an XGBoost classifier is fitted on the down-sampled rows
    `train_ind` of the validation sequence dataset, scores the candidates of `test_groups`, and
    the per-text ranking is cached in `seq_cache`, so later calls with the same fold only
    re-apply the threshold. Every fitted classifier is appended to the global `clfs`.
    Submission workflow (submit=True): pickled classifiers are loaded from disk and averaged.

    Args:
        disc_type: discourse type to predict
        probThresh: candidates scored below this true-positive probability are dropped
        test_groups: integer text indices to predict for
        train_ind: row indices of the validation sequence dataset to train on;
            required when submit is False
        submit: switch between the tuning and the submission workflow
    Returns:
        list of (text id, discourse type, predictionstring) tuples
    Raises:
        ValueError: if submit is False and train_ind is None
    """
    def mean_tp_prob(dataset, classifiers):
        # Average the probability predictions of a set of classifiers
        if dataset.features.shape[0] == 0:
            return np.array([])
        return np.mean([clf.predict_proba(dataset.features)[:, 1] for clf in classifiers], axis=0)

    def restrict_to_groups(seq_ds):
        # Keep only the candidates that belong to the requested texts
        mask = np.isin(seq_ds.groups, test_groups)
        return SeqDataset(
            seq_ds.features[mask],
            seq_ds.labels[mask],
            seq_ds.groups[mask],
            seq_ds.wordRanges[mask],
            seq_ds.truePos[mask]
        )

    def rank_by_text(prob_tp, testDs):
        # text index -> [(probability, (start, end)), ...] sorted by descending probability
        ranked = {}
        for text_idx in test_groups:
            in_text = testDs.groups == text_idx
            word_ranges_curr = [tuple(wr) for wr in testDs.wordRanges[in_text]]
            ranked[text_idx] = list(reversed(sorted(zip(prob_tp[in_text], word_ranges_curr))))
        return ranked

    if submit:
        """ Making submission workflow """
        # Point to submission set values
        predict_df = test_texts
        # The former module-level lookup of submitSeqDs was commented out, which left this branch
        # with an undefined name. NOTE(review): this expects `submitSeqSets` to be a mapping keyed
        # by discourse type; this section currently initialises it to an empty list.
        submitSeqDs = submitSeqSets[disc_type]
        testDs = restrict_to_groups(submitSeqDs)

        # Classifiers are always loaded from disc during submission.
        # pickle executes code on load: only use classifier files you produced yourself.
        with open( f"../input/seqclassifiers6/{disc_type}_clf.p", "rb" ) as clfFile:
            classifiers = pickle.load( clfFile )
        text_to_seq = rank_by_text(mean_tp_prob(testDs, classifiers), testDs)
    else:
        """ Classifier Tuning workflow """
        if train_ind is None:
            raise ValueError("train_ind is required when submit=False")
        # Point to validation set values, tuned by validation dataset
        predict_df = test_dataset

        # Cache the classifier predictions to speed up tuning iterations
        seq_key = (
            disc_type,
            tuple(test_groups),
            tuple(train_ind)
        )
        text_to_seq = seq_cache.get(seq_key)
        if text_to_seq is None:
            # Only a cache miss needs the dataset, so a hit skips fetching and masking it.
            validSeqDs = validSeqSets[disc_type]
            clf = xgb.XGBClassifier(
                **CFG1.xgb_params
            )
            resampled = resample(validSeqDs.truePos[train_ind])
            clf.fit(
                validSeqDs.features[train_ind][resampled],
                validSeqDs.truePos[train_ind][resampled]
            )
            clfs.append(clf)
            testDs = restrict_to_groups(validSeqDs)
            text_to_seq = rank_by_text(mean_tp_prob(testDs, [clf]), testDs)
            seq_cache[seq_key] = text_to_seq

    string_preds = []
    # NOTE(review): text indices are resolved positionally through predict_df.id, which assumes
    # row k of predict_df belongs to text k - confirm for the frame in use.
    text_ids = predict_df.id.values
    for text_idx in test_groups:
        text_id = text_ids[text_idx]

        # Start and end word indices of sequence candidates kept in sorted order for efficiency
        starts = []
        ends = []

        # Include the sub-sequence predictions in order of predicted probability
        for prob, wordRange in text_to_seq[text_idx]:
            # Until the predicted probability is lower than the tuned threshold
            if prob < probThresh:
                break

            # Binary search already-placed word sequence intervals, and insert the new word
            # sequence interval if it does not intersect an existing interval.
            insert = bisect_left(starts, wordRange[0])
            clear_of_previous = insert == 0 or ends[insert-1] <= wordRange[0]
            clear_of_next = insert == len(starts) or starts[insert] >= wordRange[1]
            if clear_of_previous and clear_of_next:
                starts.insert(insert, wordRange[0])
                ends.insert(insert, wordRange[1])
                string_preds.append((text_id, disc_type, ' '.join(map(str, range(wordRange[0], wordRange[1])))))
    return string_preds

def sub_df(string_preds):
    """Wrap (id, class, predictionstring) tuples in a DataFrame with those three columns."""
    column_names = ['id', 'class', 'predictionstring']
    return pd.DataFrame(data=string_preds, columns=column_names)
    
# Map the scalar sampled uniformly by skopt to a probability threshold that falls off
# exponentially as the scalar grows: .01 * (100 - e^(100 * x))
def prob_thresh(x):
    decay = np.exp(100 * x)
    return .01 * (100 - decay)

# Inverse mapping: turn a probability threshold back into the scalar supplied by skopt
def skopt_thresh(x):
    remainder = -(x / .01 - 100.)
    return np.log(remainder) / 100.
    
def score_fmin(arr, disc_type):
    """
    Objective evaluated at every tuning iteration.

    Args:
        arr: one-element sequence holding the skopt scalar; prob_thresh() converts it to the
            probability threshold
        disc_type: discourse type being tuned
    Returns:
        the NEGATIVE F1 of the cross-validated predictions (skopt minimises, so a lower
        return value means a better F1)
    """
    validSeqDs = validSeqSets[disc_type]

    # Keep the folds as a plain list: the train/test index arrays differ in length, and wrapping
    # them in np.array builds a ragged array, which NumPy >= 1.24 rejects with a ValueError.
    folds = list(GroupKFold(n_splits=NUM_FOLDS).split(validSeqDs.features, groups=validSeqDs.groups))
    gt_indices = np.concatenate([test_ind for _, test_ind in folds]).astype(int)

    # Texts that have no samples in our dataset for this class, spread evenly over the folds.
    # Sorted so the fold assignment (and with it the prediction cache key) is deterministic;
    # kept as a list of arrays because the chunks may differ in length as well.
    sampled_groups = set(np.unique(validSeqDs.groups))
    unsampled = sorted(set(uniqueValidGroups).difference(sampled_groups))
    unsampled_texts = np.array_split(np.array(unsampled, dtype=int), NUM_FOLDS)

    gt_texts = test_dataset.id.values[np.unique(validSeqDs.groups[gt_indices]).astype(int)]

    # Generate predictions from each fold of the validation predictions
    string_preds = []
    for fold_i, (train_ind, test_ind) in enumerate(folds):
        fold_groups = np.concatenate(
            (np.unique(validSeqDs.groups[test_ind]), unsampled_texts[fold_i])
        ).astype(int)
        string_preds.extend(
            xgb_predict_strings(
                disc_type, prob_thresh(arr[0]),
                fold_groups,
                train_ind
            )
        )
    boost_df = sub_df(list(string_preds))
    is_gt_row = (valid_df['discourse_type'] == disc_type) & valid_df.id.isin(gt_texts)
    gt_df = valid_df.loc[is_gt_row].copy()
    f1 = calculate_f1(boost_df.copy(), gt_df)
    return -f1

def train_seq_clfs(disc_type):
    """
    Find the optimal probability threshold for one discourse type with skopt and persist the
    results: the fold classifiers go to '{disc_type}_clf.p', the tuning summary to
    '{disc_type}_res.p' (both relative to the working directory).

    Only the classifiers fitted during this call are saved. The shared `clfs` list may already
    hold classifiers of other discourse types, for example when one worker process handles
    several types in a row; those must not be averaged into this type's predictions.

    Source code from:
        https://www.kaggle.com/code/chasembowers/sequence-postprocessing-v2-67-lb
    """
    # Optimization bounds on the tuned probability threshold, expressed in skopt's scale
    space_start = skopt_thresh(.999)
    space_end = skopt_thresh(0)
    space  = [Real(space_start,space_end)]

    # Classifiers already in the shared list belong to earlier runs, not to this discourse type
    clfs_before = len(clfs)

    # Minimise the negative F1, i.e. maximise F1
    score_fmin_disc = lambda arr: score_fmin(arr, disc_type)
    res_gp = gp_minimize(
        score_fmin_disc,
        space,
        n_calls=100,
        x0=[skopt_thresh(.5)]
    )

    # Use the gaussian approximation of f(threshold) -> F1 to select the minima
    thresh_cand = np.rot90([np.linspace(0,1,1000)])
    cand_scores = res_gp.models[-1].predict(thresh_cand)
    best_thresh_raw = space_start + (space_end - space_start)*thresh_cand[np.argmin(cand_scores)][0]
    best_thresh = prob_thresh(best_thresh_raw)
    exp_score = -np.min(cand_scores)

    # Make predictions at the inferred function minimum
    pred_thresh_score = -score_fmin_disc([best_thresh_raw])

    # And the best iteration in the optimization run
    best_iter_score = -score_fmin_disc(res_gp.x)

    # Save the trained classifiers to disc. If every fold was served from the prediction cache
    # (nothing new was fitted), fall back to the whole list as before.
    disc_clfs = clfs[clfs_before:] or list(clfs)
    with open( f"{disc_type}_clf.p", "wb" ) as clfFile:
        pickle.dump( disc_clfs, clfFile )

    # Save the tuning run results to file
    with open( f"{disc_type}_res.p", "wb" ) as resFile:
        pickle.dump(
            {
                'pred_thresh': best_thresh,  # The location of the minimum of the gaussian function inferred by skopt
                'min_thresh': prob_thresh(res_gp.x[0]),  # The threshold which produces the best score
                'pred_score': exp_score,  # The minimum of the gaussian function inferred by skopt
                'min_score': best_iter_score, # The best score in the tuning run
                'pred_thresh_score': pred_thresh_score  # The score produced by 'pred_thresh'
            },
            resFile
        )
    print('Done training', disc_type)
    
# Tune one sequence classifier per discourse type in parallel (skipped in submission mode)
if TRAIN_SEQ_CLASSIFIERS and not SUBMISSION:
    print('Training sequence classifiers... (This takes a long time.)')
    training_jobs = (
        delayed(train_seq_clfs)(disc_type)
        for disc_type in disc_type_to_ids
    )
    Parallel(n_jobs=-1, backend='multiprocessing')(training_jobs)
    print('Done training all sequence classifiers.')

In [None]:
""" Check Tuning Result """
# Tuned probability threshold per discourse type, read from the pickled tuning summaries
thresholds = {}
for disc_type in disc_type_to_ids:
    # train_seq_clfs writes '{disc_type}_res.p' relative to the working directory;
    # presumably that is /kaggle/working when run on Kaggle - TODO confirm.
    # pickle executes code on load: only read result files produced by this notebook.
    with open( f"/kaggle/working/{disc_type}_res.p", "rb" ) as res_file:
        train_result = pickle.load( res_file )  
    # 'pred_thresh' is the threshold at the minimum of the function inferred by skopt
    thresholds[disc_type] = train_result['pred_thresh']
    print(f'{disc_type }: {train_result}', end='\n\n')
    
""" Submission Part """
# NOTE(review): `predict_strings` and `uniqueSubmitGroups` are not defined in this part of the
# notebook (the function defined in this section is `xgb_predict_strings`), and this statement
# runs regardless of the SUBMISSION flag - confirm both names exist before running the cell.
sub = pd.concat([sub_df(predict_strings(disc_type, thresholds[disc_type], uniqueSubmitGroups, submit=True)) for disc_type in disc_type_to_ids ]).reset_index(drop=True)