In [17]:
import ast, gc, os, warnings, glob
import wandb
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from torch import Tensor
from transformers import AutoConfig, AutoModel, AutoModelForTokenClassification, AutoTokenizer
from scipy import stats
from collections import Counter
from bisect import bisect_left
from tqdm.auto import tqdm

import dataset_class.dataclass as dataset_class
import model.loss as model_loss
import model.metric as model_metric
import model.model as model_arch
from dataset_class import data_preprocessing
from dataset_class.data_preprocessing import *
from utils.helper import *
from trainer.trainer_utils import *
from model.metric import *
from utils.helper import class2dict

# Suppress every warning category so cell output stays readable.
warnings.filterwarnings(action='ignore')

# Process-wide environment flags, set once before any tokenizer/model work.
# NOTE(review): TOKENIZERS_PARALLELISM=false is presumably here to avoid the
# Hugging Face tokenizers fork warning when DataLoader workers are used — confirm.
os.environ.update({
    'TOKENIZERS_PARALLELISM': "false",
    'LRU_CACHE_CAPACITY': "1",
})

In [12]:
""" Configuration Class for LLM, Classifier such as XGBoost, LightGBM, CatBoost """

class CFG:
    """
    Notebook-wide settings container (plain class attributes, never instantiated here).

    Groups: runtime/device options, the backbone checkpoint with its tokenizer,
    fold and batch sizes, and keyword dictionaries for XGBoost, CatBoost and LightGBM.
    The three booster dictionaries are not consumed in the visible part of the notebook.
    """

    # ---- runtime ----
    wandb = True
    seed = 42  # fed to the torch.Generator created after this class
    n_gpu = 1
    device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
    gpu_id = 0
    num_workers = 4  # DataLoader worker processes

    # ---- backbone / checkpoints ----
    weight_path = './saved/model'  # directory globbed for per-fold *.pth files
    model = 'microsoft/deberta-v3-large'
    # Loaded at class-definition time, so running this cell fetches/reads the tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model)

    # ---- data / validation ----
    n_folds = 5
    max_len = 2048
    val_batch_size = 64

    # ---- XGBoost ----
    xgb_params = dict(
        learning_rate=0.05,
        n_estimators=200,
        max_depth=7,
        min_child_weight=5,
        gamma=0,
        subsample=0.7,
        reg_alpha=0.0005,
        colsample_bytree=0.6,
        scale_pos_weight=1,
        use_label_encoder=False,
        eval_metric='logloss',
        tree_method='hist',
    )

    # ---- CatBoost ----
    cat_params = dict(
        iterations=2000,
        learning_rate=0.07,
        depth=12,
        l2_leaf_reg=8,
        random_strength=0.5,
        loss_function='RMSE',
        eval_metric='RMSE',
        task_type='GPU',
        border_count=128,
        verbose=1000,
        early_stopping_rounds=100,
        use_best_model=True,
    )

    # ---- LightGBM ----
    lgb_params = dict(
        n_estimators=1500,        # large tree budget, relies on early stopping
        max_depth=12,             # cap the depth of each tree
        min_child_samples=20,     # at least 20 observations per leaf
        early_stopping_round=50,  # could also be supplied through the config
        subsample_freq=1,         # could also be supplied through the config
        n_jobs=1,
        importance_type='gain',
        device='gpu',
    )

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
""" torch.cuda, cudnn, reproducibility setting """

# Project helpers: environment check and global seeding driven by CFG.
# NOTE(review): the meaning of the boolean flags is defined inside those helpers,
# which are not visible in this notebook.
check_library(True)
all_type_seed(CFG, True)

# Seeded generator that SequenceDataTrainer forwards to its DataLoader.
# manual_seed() returns the generator itself, so creation and seeding can be chained.
g = torch.Generator().manual_seed(CFG.seed)


""" Trainer Class for Make Sequence Dataset for Multiple Label Classification Task Pipeline """

class SequenceDataTrainer:
    """
    Forward-only runner: pushes one fold's validation split through that fold's
    fine-tuned weights and collects word-level label sequences. No training is done.

    Args:
        cfg: configuration object. This class reads ``dataset``, ``model_arch``,
            ``device``, ``val_batch_size`` and ``num_workers`` from it, and passes it
            to ``get_name``, the dataset class and the model class.
            NOTE(review): the CFG class defined earlier in this notebook does not set
            ``dataset`` or ``model_arch``; they must exist before make_batch /
            model_setting are called.
        generator: seeded torch.Generator handed to the validation DataLoader
    """
    def __init__(self, cfg, generator: torch.Generator) -> None:
        self.cfg = cfg
        self.model_name = get_name(self.cfg)
        self.generator = generator
        # The frame must carry a 'fold' column; make_batch filters on it.
        self.df = load_data('./dataset_class/data_folder/final_converted_train_df.csv')

    def make_batch(self, fold: int) -> tuple[torch.utils.data.DataLoader, pd.DataFrame]:
        """
        Build the validation DataLoader for a single fold.

        Args:
            fold: fold number; rows whose 'fold' column equals it are selected
        Returns:
            (loader_valid, valid): the un-shuffled DataLoader and the selected rows
            with a fresh 0..n-1 index
        """
        valid = self.df[self.df['fold'] == fold].reset_index(drop=True)

        # Dataset class is looked up by name from the project's dataclass module
        valid_dataset = getattr(dataset_class, self.cfg.dataset)(self.cfg, valid, is_train=False)
        loader_valid = DataLoader(
            valid_dataset,
            batch_size=self.cfg.val_batch_size,
            shuffle=False,  # keep row order so predictions line up with `valid`
            worker_init_fn=seed_worker,
            generator=self.generator,
            num_workers=self.cfg.num_workers,
            pin_memory=True,
            drop_last=False,  # every validation row must receive a prediction
        )
        return loader_valid, valid

    def model_setting(self, path: str):
        """
        Instantiate the architecture named by cfg.model_arch and load weights from `path`.

        Args:
            path: file path of a saved state dict (one checkpoint per fold)
        Returns:
            the model, with weights loaded, moved to cfg.device
        """
        model = getattr(model_arch, self.cfg.model_arch)(self.cfg)
        # map_location remaps the stored tensors onto cfg.device, so a checkpoint that was
        # saved on a GPU still loads when cfg.device has fallen back to CPU (or another GPU).
        state_dict = torch.load(path, map_location=self.cfg.device)
        model.load_state_dict(state_dict)
        model.to(self.cfg.device)
        return model


    def valid_fn(self, loader_valid, model) -> tuple[list, list]:
        """
        Run inference over `loader_valid` and reduce token predictions to one label per word.

        Each batch is expected to be an (ids, inputs) pair where `inputs` is a dict of
        tensors that includes 'word_ids'. For every sample, the label of the first token
        of each word is kept; positions whose word id is -1 are skipped.

        Args:
            loader_valid: DataLoader yielding (ids, inputs) batches
            model: callable taking the inputs dict and returning per-token class scores
                (argmax is taken over the last dimension)
        Returns:
            (val_ids_list, val_pred_list): the collected ids and, for each sample in
            loader order, the list of word-level label strings
        """
        ids_to_labels = data_preprocessing.ids2labels()
        val_ids_list, val_pred_list = [], []
        model.eval()
        with torch.no_grad():
            for ids, inputs in tqdm(loader_valid):
                inputs = collate(inputs)
                for key, value in inputs.items():
                    inputs[key] = value.to(self.cfg.device)  # move the whole batch to the target device

                val_ids_list += ids  # keep ids in the same order as the predictions
                val_pred = model(inputs)

                # Bring both arrays to host memory once per batch instead of once per sample
                batch_label_ids = torch.argmax(val_pred, dim=-1).detach().cpu().numpy()
                batch_word_ids = inputs['word_ids'].detach().cpu().numpy()

                for label_ids, word_ids in zip(batch_label_ids, batch_word_ids):
                    token_pred = [ids_to_labels[i] for i in label_ids]
                    prediction = []
                    previous_word_idx = -1
                    for idx, word_idx in enumerate(word_ids):
                        # -1 marks a position with no word; a repeated id is a later
                        # sub-token of a word that has already been labelled
                        if word_idx == -1 or word_idx == previous_word_idx:
                            continue
                        prediction.append(token_pred[idx])
                        previous_word_idx = word_idx
                    val_pred_list.append(prediction)
        gc.collect()
        return val_ids_list, val_pred_list

In [None]:
""" Let's Make Sequence Dataset by forwarding each fold's dataset to fold's model weight """

tmp_valid = pd.read_csv('./dataset_class/data_folder/train.csv')

# glob returns paths in arbitrary order, but the enumerate() index below is used as the
# fold number. Sorting makes the checkpoint <-> fold pairing deterministic.
# NOTE(review): this assumes the checkpoint file names sort in fold order
# (e.g. they embed the fold index) — confirm against the files in CFG.weight_path.
fold_list = sorted(glob.glob(f'{CFG.weight_path}/*.pth'))
if not fold_list:
    raise FileNotFoundError(f'No .pth checkpoints found under {CFG.weight_path}')
all_id_list, all_pred_list = [], []

# Built once: the constructor loads the full training CSV, which does not change per fold.
forward_input = SequenceDataTrainer(CFG, g)

# tqdm wraps the list (not the enumerate object) so the progress bar knows the total.
for fold, model_path in enumerate(tqdm(fold_list)):
    print(f'============== {fold}th Fold forward ==============')
    loader_valid, valid = forward_input.make_batch(fold)
    fold_model = forward_input.model_setting(model_path)

    # forward pass
    val_ids_list, val_pred_list = forward_input.valid_fn(loader_valid, fold_model)
    all_id_list.extend(val_ids_list)
    all_pred_list.extend(val_pred_list)

    # Release this fold's weights before the next checkpoint is loaded.
    del fold_model
    gc.collect()
    torch.cuda.empty_cache()

# Summarise instead of dumping every id and every label sequence into the cell output.
print(f'collected {len(all_id_list)} ids and {len(all_pred_list)} prediction sequences')
print(all_id_list[:5])


In [21]:
""" loop function for making sequence dataset """

def val_loop(cfg: any) -> None:
    """ Base Trainer Loop Function """
    tmp_valid = pd.read_csv('./dataset_class/data_folder/train.csv')
    fold_list = [i for i in range(cfg.n_folds)]
    for fold in tqdm(fold_list[4:5]):
        print(f'============== {fold}th Fold Train & Validation ==============')
        wandb.init(
            project=cfg.name,
            name=f'FBP2_fold{fold}/' + cfg.model,
            config=class2dict(cfg),
            group=f'FBP2/{cfg.model}',
            job_type='train',
            entity="qcqced"
        )
        early_stopping = EarlyStopping(mode=cfg.stop_mode, patience=3)
        early_stopping.detecting_anomaly()

        val_score_max = get_save_thresholds(cfg)
        train_input = getattr(trainer, cfg.name)(cfg, g)  # init object
        loader_train, loader_valid, train, valid = train_input.make_batch(fold)
        model, criterion, val_metrics, optimizer, lr_scheduler = train_input.model_setting(len(train))

        for epoch in range(cfg.epochs):
            print(f'[{epoch + 1}/{cfg.epochs}] Train & Validation')
            train_loss, train_accuracy, train_recall, train_precision = train_input.train_fn(
                loader_train, model, criterion, optimizer, lr_scheduler, val_metrics
            )
            val_ids_list, val_pred_list = train_input.valid_fn(
                loader_valid, model
            )
            # 1) make prediction dataframe
            final_pred = []
            for i in range(len(valid)):
                idx = valid.id.values[i]
                pred = val_pred_list[i]
                tmp_pred = []
                j = 0
                while j < len(pred):
                    cls = pred[j]
                    if cls == 'O':
                        j += 1
                    else:
                        cls = cls.replace('B', 'I')  # spans start with B
                    end = j + 1
                    while end < len(pred) and pred[end] == cls:
                        end += 1

                    if cls != 'O' and cls != '' and end - j > 7:
                        final_pred.append(
                            (idx, cls.replace('I-', ''),
                             ' '.join(map(str, list(range(j, end)))))
                        )
                    j = end
            pred_df = pd.DataFrame(final_pred)
            pred_df.columns = ['id', 'class', 'predictionstring']

            # 2) calculate cross validation score
            batch_valid = tmp_valid.loc[tmp_valid['id'].isin(val_ids_list)].copy()
            f1_list = []
            unique_class = pred_df['class'].unique()
            for i, c in enumerate(unique_class):
                print(f'iteration: {i}, class: {c}')
                subset_pred_df = pred_df.loc[pred_df['class'] == c].copy()
                gt_df = batch_valid.loc[batch_valid['discourse_type'] == c].copy()
                f1_score = calculate_f1(subset_pred_df, gt_df)
                print(c, f1_score)  # print f1 score for each class
                f1_list.append(f1_score)
            final_f1_score = np.mean(f1_list)  # average == 'micro'
