In [2]:
import ast, gc, os, warnings, glob
import wandb
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from torch import Tensor
from transformers import AutoConfig, AutoModel, AutoModelForTokenClassification, AutoTokenizer
from scipy import stats
from collections import Counter
from bisect import bisect_left
from tqdm.auto import tqdm

import dataset_class.dataclass as dataset_class
import model.loss as model_loss
import model.metric as model_metric
import model.model as model_arch
from model.model_utils import *
from dataset_class import data_preprocessing
from dataset_class.data_preprocessing import *
from utils.helper import *
from trainer.trainer_utils import *
from model.metric import *
from utils.helper import class2dict

warnings.filterwarnings('ignore')
os.environ['TOKENIZERS_PARALLELISM'] = "false"
os.environ['LRU_CACHE_CAPACITY'] = "1"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
""" Configuration Class for LLM, Classifier such as XGBoost, LightGBM, CatBoost """

class CFG:
    wandb = True
    seed = 42
    n_gpu = 1
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'mps')
    gpu_id = 0
    num_workers = 0
    weight_path = './saved/model'
    model = 'microsoft/deberta-v3-large'
    reinit = True
    tokenizer = AutoTokenizer.from_pretrained(model)
    n_folds = 5
    max_len = 2048
    val_batch_size = 16
    xgb_params = {
         'learning_rate': 0.05,
        'n_estimators': 200,
        'max_depth': 7,
        'min_child_weight': 5,
        'gamma': 0,
        'subsample': 0.7,
        'reg_alpha': 0.0005,
        'colsample_bytree': 0.6,
        'scale_pos_weight': 1,
        'use_label_encoder': False,
        'eval_metric': 'logloss',
        'tree_method': 'hist'
    }
    cat_params = {
        'iterations': 2000,
        'learning_rate': 0.07,
        'depth': 12,
        'l2_leaf_reg':8 ,
        'random_strength':0.5,
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'task_type': 'GPU',
        'border_count': 128,
        'verbose': 1000,
        'early_stopping_rounds': 100,
        'use_best_model': True,

    }
    lgb_params = {
    'n_estimators': 1500, # use a large number of trees with early stopping
    'max_depth': 12, # restrict the depths of the individual trees
    'min_child_samples': 20, # atleast 20 observations in leaf
    'early_stopping_round': 50, # this can be specified in config as well
    'subsample_freq': 1, # this can be specified in config as well
    'n_jobs': 1,
    'importance_type': 'gain',
    'device': 'gpu'
    }

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
""" Custom Dataset Class """

class NERDataset(Dataset):
    """
    Custom Dataset Class for NER Task
    Args:
        cfg: configuration.CFG
        df: dataframe from .txt file
        is_train: if this param set False, return word_ids from self.df.entities
    """
    def __init__(self, cfg: configuration.CFG, df: pd.DataFrame, is_train: bool = True) -> None:
        self.cfg = cfg
        self.df = df
        self.tokenizer = ner_tokenizing
        self.labels2ids = labels2ids()  # Function for Encoding Labels to ids
        self.ids2labels = ids2labels()  # Function for Decoding ids to Labels
        self.is_train = is_train

    def __len__(self) -> int:
        return len(self.df)

    def __getitem__(self, item: int) -> tuple[list, [dict[Tensor, Tensor, Tensor], Tensor]]:
        """
        1) Tokenizing input text:
            - if you param 'return_offsets_mapping' == True, tokenizer doen't erase \n or \n\n
              but, I don't know this param also applying for DeBERTa Pretrained Tokenizer
        2) Create targets and mapping of tokens to split() words by tokenizer
            - Mapping Labels to split tokens
            - Iterate in reverse to label whitespace tokens until a Begin token is encountered
            - Tokenizer will split word into subsequent of character such as copied => copy, ##ed
            - So, we need to find having same parent token and then label BIO NER Tags
        3) Return dict:
            - Train: dict.keys = [inputs_id, attention_mask, token_type_ids, labels]
            - Validation/Test: dict.keys = [inputs_id, attention_mask, token_type_ids, word_ids]
        """
        ids = self.df.id[item]
        text = self.df.text[item]
        if self.is_train:
            word_labels = ast.literal_eval(self.df.entities[item])

        # 1) Tokenizing input text
        encoding = self.tokenizer(
            self.cfg,
            text,
        )
        word_ids = encoding.word_ids()
        split_word_ids = np.full(len(word_ids), -1)
        offset_to_wordidx = split_mapping(text)  # [1, sequence_length]
        offsets = encoding['offset_mapping']  # [(src, end), (src, end), ...]

        # 2) Find having same parent token and then label BIO NER Tags
        label_ids = []
        for token_idx, word_idx in reversed(list(enumerate(word_ids))):
            if word_idx is None:
                """ for padding token """
                if self.is_train:
                    label_ids.append(-100)
            else:
                if offsets[token_idx] != (0, 0):
                    # Choose the split word that shares the most characters with the token if any
                    split_idxs = offset_to_wordidx[offsets[token_idx][0]:offsets[token_idx][1]]
                    split_index = stats.mode(split_idxs[split_idxs != -1]).mode[0] if len(
                        np.unique(split_idxs)) > 1 else split_idxs[0]
                    if split_index != -1:
                        if self.is_train:
                            label_ids.append(self.labels2ids[word_labels[split_index]])
                        split_word_ids[token_idx] = split_index
                    else:
                        # Even if we don't find a word, continue labeling 'I' tokens until a 'B' token is found
                        if label_ids and label_ids[-1] != -100 and self.ids2labels[label_ids[-1]][0] == 'I':
                            split_word_ids[token_idx] = split_word_ids[token_idx + 1]
                            if self.is_train:
                                label_ids.append(label_ids[-1])
                        else:
                            if self.is_train:
                                label_ids.append(-100)
                else:
                    if self.is_train:
                        label_ids.append(-100)
        if not self.is_train:
            encoding['word_ids'] = torch.as_tensor(split_word_ids)
        else:
            encoding['labels'] = list(reversed(label_ids))
        for k, v in encoding.items():
            encoding[k] = torch.as_tensor(v)
        return ids, encoding

In [5]:
""" Custom Model Class """

class DeBERTaModel(nn.Module):
    """
    Model class For NER Task Pipeline, in this class no pooling layer with backbone named "DeBERTa"
    This pipeline apply B.I.O Style, so the number of classes is 15 which is 7 unique classes original
    Each of 7 unique classes has sub 2 classes (B, I) => 14 classes
    And 1 class for O => 1 class
    14 + 1 = 15 classes
    Args:
        cfg: configuration.CFG
    """
    def __init__(self, cfg: configuration.CFG) -> None:
        super().__init__()
        self.cfg = cfg
        self.auto_cfg = AutoConfig.from_pretrained(
            cfg.model,
            output_hidden_states=True
        )
        self.model = AutoModel.from_pretrained(
            cfg.model,
            config=self.auto_cfg
        )
        self.fc = nn.Linear(self.auto_cfg.hidden_size, 15)  # BIO Style NER Task

    def feature(self, inputs_ids, attention_mask, token_type_ids):
        outputs = self.model(
            input_ids=inputs_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        return outputs

    def forward(self, inputs) -> Tensor:
        """
        No Pooling Layer for word-level task
        Args:
            inputs: Dict type from AutoTokenizer
            => {input_ids, attention_mask, token_type_ids, offset_mapping, labels}
        """
        outputs = self.feature(
            inputs_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            token_type_ids=inputs["token_type_ids"],
        )
        logit = self.fc(outputs.last_hidden_state)
        return logit

In [7]:
""" torch.cuda, cudnn, reproducibility setting """

check_library(True)
all_type_seed(CFG, True)
g = torch.Generator()
g.manual_seed(CFG.seed)


""" Trainer Class for Make Sequence Dataset for Multiple Label Classification Task Pipeline """

class SequenceDataTrainer:
    """
    Only Forward Pass with Validation Dataset for Making Sequence Dataset by whole Competition Data
    """
    def __init__(self, cfg, generator: torch.Generator) -> None:
        self.cfg = cfg
        self.model_name = get_name(self.cfg)
        self.generator = generator
        self.df = load_data('./dataset_class/data_folder/final_converted_train_df.csv')

    def make_batch(self, fold: int) -> tuple[torch.utils.data.DataLoader, pd.DataFrame]:
        """ Make Batch Dataset for main train loop """
        valid = self.df[self.df['fold'] == fold].reset_index(drop=True)

        # Custom Datasets
        valid_dataset = NERDataset(self.cfg, valid, is_train=False)
        loader_valid = DataLoader(
            valid_dataset,
            batch_size=self.cfg.val_batch_size,
            shuffle=False,
            worker_init_fn=seed_worker,
            generator=self.generator,
            num_workers=self.cfg.num_workers,
            pin_memory=True,
            drop_last=False,
        )
        return loader_valid, valid

    def model_setting(self, path: str):
        """ load fine-tuned model's weight, iterate by fold """
        model = DeBERTaModel(self.cfg)
        model.load_state_dict(
            torch.load(path, map_location=torch.device('mps'))
        )
        model.to(self.cfg.device)
        return model

    def inference_fn(self, loader_valid: torch.utils.data.DataLoader, model: nn.Module) -> tuple[list, list]:
        """
        Validation Functions
        Not convert probability to string label text with torch.argmax
        function should return those shape of Tensor: [batch_size, sequence_length, num_labels] == outputs.last_hidden_state
        Variable:
            val_ids_list: list of ids for calculating sequence dataset
            val_prob_list: list of probability for make sequence dataset
            val_label_list: list of labels for calculating CV Score
        """
        ids_to_labels = data_preprocessing.ids2labels()
        val_ids_list, val_prob_list, val_label_list = [], [], []
        model.eval()
        with torch.no_grad():
            for step, (ids, inputs) in enumerate(tqdm(loader_valid)):  # Maybe need to append
                inputs = collate(inputs)
                for k, v in inputs.items():
                    inputs[k] = v.to(self.cfg.device)  # prompt to GPU

                val_pred = model(inputs)  # [batch_size, sequence_length, num_labels]
                val_prob = F.softmax(val_pred, dim=2).cpu().detach().numpy()  # dim 2 == num_labels dim

                val_prob_list.extend(val_prob), val_ids_list.extend(ids)  # make list for sequence dataset

            predictions = []
            for sequence_logit in range(val_prob_list[0]):
        return val_ids_list, val_prob_list


In [8]:
"""
Let's Make Sequence Dataset by forwarding each fold's dataset to fold's model weight
loop function for Sequence Dataset Generate
"""

tmp_valid = pd.read_csv('./dataset_class/data_folder/train.csv')
fold_list = glob.glob(f'{CFG.weight_path}/*.pth')
all_id_list, all_pred_list = [], []

for fold, model_path in tqdm(enumerate(fold_list)):
        print(f'============== {fold}th Fold forward ==============')
        forward_input = SequenceDataTrainer(CFG, g)
        loader_valid, valid = forward_input.make_batch(fold)
        fold_model = forward_input.model_setting(model_path)

        # forward pass
        val_ids_list, val_pred_list = forward_input.inference_fn(loader_valid, fold_model)
        all_id_list.extend(val_ids_list), all_pred_list.extend(val_pred_list)


0it [00:00, ?it/s]



Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/195 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 14.33 GB, other allocations: 2.71 GB, max allowed: 18.13 GB). Tried to allocate 1.57 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
print(len(all_id_list), len(all_pred_list))

In [12]:
print(all_id_list[0], all_pred_list[0])

E1FA876D6E6C ['O', 'O', 'B-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'I-Lead', 'B-Position', 'I-Position', 'I-Position', 'I-Position', 'I-Position', 'I-Position', 'I-Position', 'I-Position', 'I-Position', 'I-Position', 'I-Position', 'I-Position', 'I-Position', 'I-Position', 'I-Position', 'I-Position', 'I-Position', 'I-Position', 'I-Position', 'I-Position', 'B-Claim', 'I-Claim', 'I-Evidence', 'I-Claim', 'I-Evidence', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Claim', 'I-Evidence', 'I-Claim', 'I-Claim', 'B-Evidence', 'I-Evidence', 'I-Evidence', 'I-Evidence', 'I-Evidence', 'I-Evidence', 'I-Evidence', 'I-Evidence', 'I-Evidence', 'I-Evidence', 'I-Evidence', 'I-Evidence', 'I-Evidence', 'I-Evidence', 'I-Evidence', 'I-Evidence', 'I-Evidence', 'I-Evidence', 'I-Evidence', 'I-Evid

In [5]:
test = np.array([])
tmp = np.array([1,2,3,4])

test.extend(tmp)

AttributeError: 'numpy.ndarray' object has no attribute 'extend'