### Goal of This Notebook  
1) Want to experience tuning models' parameter & several strategy experiment  
2) Experience Token Classification Task  
3) Study Top Ranker's Knowledge

In [1]:
from __future__ import annotations
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# !unzip /home/qcqced/nltk_data/corpora/wordnet.zip -d /home/qcqced/nltk_data/corpora/

In [2]:
import wandb
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim.swa_utils as swa
import tokenizers, transformers
import os, sys, gc, time, random, warnings, math, re

from torch.utils.checkpoint import checkpoint
from transformers import AdamW
from torch.optim.swa_utils import AveragedModel, SWALR, update_bn
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint

from numpy import ndarray
from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter

from functools import reduce
from scipy.stats import pearsonr
from transformers import AutoTokenizer, AutoModel, AutoConfig, DataCollatorWithPadding
from transformers import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup
from sklearn.model_selection import StratifiedGroupKFold, GroupKFold
from tqdm.auto import tqdm
# from kaggle_secrets import UserSecretsClient
from glob import glob
warnings.filterwarnings("ignore")
%env TOKENIZERS_PARALLELISM=false

env: TOKENIZERS_PARALLELISM=false


In [3]:
# WandB Login => Copy API Key
secret_value_0 = '8d7716caaaa5afb56e1d02ef5837cabbffe48b41'
!wandb login $secret_value_0

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/qcqced/.netrc


In [4]:
# Step 1.1 Configuration Setting
"""
[Configuration]
    - Pooling: mean, attention, max, weightedlayer, concat (This Pipeline doesn't need to pooling)
    - Optimizer: AdamW, SWA
    - Scheduler: cosine, linear
    - Clip_grad_norm, Gradient Checking: T/F
    - LLRD
    - Re-Init
    - AWP
"""
class CFG:
    """--------[Common]--------"""
    wandb, train, competition, seed, cfg_name = True, True, 'UPPPM', 42, 'CFG'
    device, gpu_id = torch.device('cuda' if torch.cuda.is_available() else 'cpu'), 0
    num_workers = 0
    """ Mixed Precision, Gradient Check Point """
    amp_scaler = True
    gradient_checkpoint = True # save parameter
    output_dir = './output/'
    """ Clipping Grad Norm, Gradient Accumulation """
    clipping_grad = True # clip_grad_norm
    n_gradient_accumulation_steps = 1 # Gradient Accumulation
    max_grad_norm = n_gradient_accumulation_steps * 1000
    """ Model """
    model_name = 'microsoft/deberta-v3-large' 
    tokenizer = AutoTokenizer.from_pretrained(model_name)
#    pooling = 'attention'
    max_len = 512
    """ CV, Epoch, Batch Size """
    n_folds = 4
    epochs = 180
    batch_size = 64
    """ SWA, Loss, Optimizer, Scheduler """
    swa = True
    swa_start = int(epochs*0.75)
    swa_lr = 1e-4
    anneal_epochs = 4
    anneal_strategy = 'cos' # default = cos, available option: linear 
    loss_fn = 'BCE'
    optimizer = 'AdamW' # options: SWA, AdamW
    weight_decay = 1e-2
    scheduler = 'cosine_annealing' # options: cosine, linear, cosine_annealing, linearannealing
    num_cycles = 0.5
#    num_warmup_steps = 0
    warmup_ratio = 0.1 # options: 0.05, 0.1
    batch_scheduler = True
    # encoder_lr = 5e-5
    # decoder_lr = 1e-5
    min_lr = 1e-7
    # eps = 1e-6
    betas = (0.9, 0.999)
    """ LLRD """
    llrd = True
    layerwise_lr = 5e-5
    layerwise_lr_decay = 0.9
    layerwise_weight_decay = 1e-2
    layerwise_adam_epsilon = 1e-6
    layerwise_use_bertadam = False
    """ Re-Init, AWP """
    reinit = True
    num_reinit = 5
    awp = False
    nth_awp_start_epoch = 10
    awp_eps = 1e-2
    awp_lr = 1e-4

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
def check_device() -> bool:
    return torch.backends.mps.is_available()

def check_library(checker: bool) -> tuple:
    """
    1) checker == True 
        - current device is mps
    2) checker == False
        - current device is cuda with cudnn
    """
    if not checker:
        _is_built = torch.backends.cudnn.is_available()
        _is_enable = torch.backends.cudnn.enabledtorch.backends.cudnn.enabled
        version = torch.backends.cudnn.version()
        device = (_is_built, _is_enable, version)
        return device

def class2dict(cfg: CFG) -> dict:
    return dict((name, getattr(cfg, name)) for name in dir(cfg) if not name.startswith('__'))

def all_type_seed(cfg: CFG, checker: bool) -> None:
    # python & torch seed
    os.environ['PYTHONHASHSEED'] = str(cfg.seed)  # python Seed
    random.seed(cfg.seed)  # random module Seed
    np.random.seed(cfg.seed)  # numpy module Seed
    torch.manual_seed(cfg.seed)  # Pytorch CPU Random Seed Maker

    # device == cuda
    if not checker: 
        torch.cuda.manual_seed(cfg.seed)  # Pytorch GPU Random Seed Maker
        torch.cuda.manual_seed_all(cfg.seed)  # Pytorch Multi Core GPU Random Seed Maker
        # torch.cudnn seed
        torch.backends.cudnn.deterministic = False
        torch.backends.cudnn.benchmark = True
        torch.backends.cudnn.enabled = True

    # devide == mps
#     else:
#         torch.mps.manual_seed(cfg.seed)
    
def seed_worker(worker_id) -> None:
    worker_seed = torch.initial_seed() % 2 ** 32
    np.random.seed(worker_seed)
    random.seed(worker_seed)
    
def get_logger(filename: str):
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

if not os.path.exists(CFG.output_dir):
    os.makedirs(CFG.output_dir)

logger = get_logger(filename=CFG.output_dir+'train')
check_library(True)
all_type_seed(CFG, True)    
g = torch.Generator()
g.manual_seed(0)

<torch._C.Generator at 0x7f245eefac70>

## 🗂️ Step 1. Stratified Group K Fold

In [6]:
"""
[Stratified Group K]
같은 그룹은 서로 같은 성질의 데이터 세트에 포함되며, 전체 라벨 분포를 고려한 폴드 구성 전략
1) 점수를 클래스로 치환
    - 0.00: class 0
    - 0.25: class 1
    - 0.50: class 2
    - 0.75: class 3
    - 1.00: class 4
2) 라벨 분포 균형을 위해 StratifiedGroupKFold 사용
3) Group 기준은 'anchor'
4) 원작자는 폴드를 2개로 구성하고 테스트를 돌린듯
"""
def cross_val(cfg):
    train_df = pd.read_csv('./dataset/train.csv')
    kfold = StratifiedGroupKFold(
        n_splits=CFG.n_folds, 
        shuffle=True, 
        random_state=cfg.seed
    )
    train_df["score_class"] = train_df["score"].map({0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4})
    train_df['fold'] = -1
    for num, (tx, vx) in enumerate(kfold.split(train_df, train_df["score_class"], train_df["anchor"])):
        train_df.loc[vx, "fold"] = num
    return train_df

## ✂️ Stage 2. Text Preprocessing: Normalization & Cleaning

In [7]:
"""[Text Normalization]"""
def create_word_normalizer() -> function:
    ps = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    def normalize(word):
        w = word.lower()
        w = lemmatizer.lemmatize(w)
        w = ps.stem(w)
        return w
    return normalize

def __normalize_words(titles: list) -> list:
    stop_words = set(stopwords.words('english'))
    normalizer = create_word_normalizer()
    titles = [normalizer(t) for t in titles if t not in stop_words]
    return titles

def normalize_words(words: ndarray, unique=True):
    if type(words) is str:
        words = [words]
    sep_re = r'[\s\(\){}\[\];,\.]+'
    num_re = r'\d'
    words = re.split(sep_re, ' '.join(words).lower())
    words = [w for w in words if len(w) >= 3 and not re.match(num_re, w)]
    if unique:
        words = list(set(words))
        words = set(__normalize_words(words))
    else:
        words = __normalize_words(words)
    return words

def filter_title(title: str) -> str:
    titles = normalize_words(title, unique=False)
    return ','.join([t for t in titles if t in include_words])

train_df = cross_val(CFG)
cpc_codes = pd.read_csv("./dataset/titles.csv", engine='python')

norm_titles = normalize_words(cpc_codes['title'].to_numpy(), unique=False) # 여기는 big query dataset을 정규화
anchor_targets = train_df['target'].unique().tolist() + train_df['anchor'].unique().tolist() # original train dataset 
norm_anchor_targets = normalize_words(anchor_targets) # Original Train Dataset 정규화
include_words = set(norm_titles) & norm_anchor_targets # Anchor & Target 공통되는 단어 

# 왜 코드 길이가 4개 이상인 것만 가져오는 것인지... 원본 데이터는 sub-class까지 제공 안하는 걸로 아는데
tmp_cpc_codes = cpc_codes.copy()
tmp_cpc_codes = tmp_cpc_codes[cpc_codes['code'].str.len() >= 4] 

"""
1) 원본 데이터처럼 형태를 맞춰주려고 3자리까지 슬라이싱해서 새로운 컬럼을 만든다.
2) section_class 1: ['sub_class 1', 'sub_class 2', 'sub_class 3', .....]
    - sub_class의 sub_class text까지 모두 포함
3) 데이터 변환 검증
4) 리스트 형태 뭉개고, 줄글로 변환 & 정규화
5) Anchor & Target 교집합 데이터 따로 모아 새로운 열에 삽입
    - 이후 CountVectorizer 적용하기 위함인듯
"""
tmp_cpc_codes['section_class'] = tmp_cpc_codes['code'].apply(lambda x: x[:3]) 
title_group_df = tmp_cpc_codes.groupby('section_class', as_index=False)[['title']].agg(list)
title_group_df = title_group_df[title_group_df['section_class'].str.len() == 3] 
title_group_df['title'] = title_group_df['title'].apply(lambda lst: ' '.join(lst))
title_group_df['norm_title'] = title_group_df['title'].agg(filter_title)

"""
1) CountVectorizer 적용
 - 개별 단어의 출현빈도 계산
2) 벡터를 다시 원본 단어로 변환
"""
vectorizer = CountVectorizer()
c_vect = vectorizer.fit_transform(title_group_df['norm_title'])
r = np.argsort(c_vect.toarray(), axis=1)[:, ::-1][::, :400]
vect_words = vectorizer.get_feature_names_out()
t_words = np.vectorize(lambda v: vect_words[v])(r)
norm_title = title_group_df['norm_title'].str.split(',').to_numpy().tolist()

"""
1) 이해가 안감
2) class 정보 추가
    - sub_class 제목 추가
3) context_text: main class + [SEP] + sub class
4) cpc_text => {'A01': ('A01's title [SEP] A01B's title.........)}
"""
res = []
for (n, t) in zip(norm_title, t_words):
    res.append(','.join(set(n) & set(t)))
title_group_df['norm_title'] = res
title_group_df['section'] = title_group_df.section_class.str[0:1]
title_group_df['section_title'] = title_group_df['section'].map(cpc_codes.set_index('code')['title']).str.lower() + ';' + title_group_df['section_class'].map(cpc_codes.set_index('code')['title']).str.lower()
title_group_df['context_text'] = title_group_df['section_title'] + ' [SEP] ' + title_group_df['norm_title']
cpc_texts = dict(title_group_df[['section_class', 'context_text']].to_numpy().tolist())

In [8]:
"""[Merge Two DataFrame]"""
af_dict = {}
for i,r in train_df[['anchor', 'fold']].iterrows():
    af_dict[r.anchor] = r.fold
anchor_context_grouped_target = train_df.groupby(['anchor', 'context'])['target'].apply(list)
anchor_context_grouped_score = train_df.groupby(['anchor', 'context'])['score'].apply(list)
anchor_context_grouped_id = train_df.groupby(['anchor', 'context'])['id'].apply(list)
i = pd.DataFrame(anchor_context_grouped_id).reset_index()
s = pd.DataFrame(anchor_context_grouped_score).reset_index()
t = pd.DataFrame(anchor_context_grouped_target).reset_index()
train_df = s.merge(t, on=['anchor', 'context'])
train_df = train_df.merge(i, on=['anchor', 'context'])
train_df['context_text'] = train_df['context'].map(cpc_texts)
train_df = train_df.rename(columns={'target': 'targets', 'score': 'scores', 'id': 'ids'})
train_df['fold'] = train_df['anchor'].map(af_dict)

In [9]:
# Add Special Token
tar_token = '[TAR]'
special_tokens_dict = {'additional_special_tokens': [f'{tar_token}']}
CFG.tokenizer.add_special_tokens(special_tokens_dict)
tar_token_id = CFG.tokenizer(f'{tar_token}', add_special_tokens=False)['input_ids'][0]
# logger.info(f'tar_token_id: {tar_token_id}')
setattr(CFG.tokenizer, 'tar_token', f'{tar_token}')
setattr(CFG.tokenizer, 'tar_token_id', tar_token_id)
CFG.tokenizer.save_pretrained(f'{CFG.output_dir}tokenizer/')

('./output/tokenizer/tokenizer_config.json',
 './output/tokenizer/special_tokens_map.json',
 './output/tokenizer/spm.model',
 './output/tokenizer/added_tokens.json',
 './output/tokenizer/tokenizer.json')

In [10]:
# Length of Train Data (anchor + targets + context_text)
# token_test = train_df.explode('targets')

# anchor_list = token_test.anchor.to_list()
# targets_list = token_test.targets.to_list()
# context_list = token_test.context_text.to_list()

# text_len, text_token = [], []
# text_list = [anchor_list[idx] + targets_list[idx] + context_list[idx] for idx in range(len(anchor_list))]
# for text in tqdm(text_list):
#     inputs = CFG.tokenizer.encode_plus(
#         text,
#         return_tensors = None,
#         add_special_tokens = False,
#         truncation = False,
# #        max_length = 192,
#         return_token_type_ids = True,
#         return_attention_mask = True    
#     )
#     text_token.append(CFG.tokenizer.decode(inputs.input_ids))
#     text_len.append(len(inputs.input_ids))

## 👩‍👩‍👧‍👦 Step 3. Dataset Class

In [11]:
class UPPPMDataset(Dataset):
    def __init__(self, CFG, df, is_valid=False):
        super().__init__()
        self.anchor_list = df.anchor.to_numpy()
        self.target_list = df.targets.to_numpy()
        self.context_list = df.context_text.to_numpy()
        self.score_list = df.scores.to_numpy()
        self.id_list = df.ids.to_numpy()
        self.cfg = CFG
        self.is_valid = is_valid
        
#     def add_special_token(self):
#         tar_token = '[TAR]'
#         special_tokens_dict = {'additional_special_tokens': [f'{tar_token}']}
#         self.cfg.tokenizer.add_special_tokens(special_tokens_dict)
#         tar_token_id = self.cfg.tokenizer(f'{tar_token}', add_special_tokens=False)['input_ids'][0]
#         # logger.info(f'tar_token_id: {tar_token_id}')
#         setattr(self.cfg.tokenizer, 'tar_token', f'{tar_token}')
#         setattr(self.cfg.tokenizer, 'tar_token_id', tar_token_id)
#         self.cfg.tokenizer.save_pretrained(f'{self.cfg.output_dir}tokenizer/')
    
    def tokenizing(self, text_data: str) -> dict:
        inputs = self.cfg.tokenizer.encode_plus(
            text_data, 
            return_tensors=None, # if true, tf.tensor, pt.tensor, numpy
            add_special_tokens=False,
            truncation=True,
            padding='max_length',
            max_length=self.cfg.max_len
        )
        return inputs
    
    def __len__(self) -> int:
        return len(self.id_list)
    
    def __getitem__(self, idx: int) -> "tuple[dict, Tensor, Tensor]":
        """
        1) make Embedding Shape,
            - Data: [cls]+[anchor]+[sep]+[target]+[tar]+[target]+[tar]...+[tar]+[cpc_text]+[sep]
            - Label: [-1] * self.cfg.max_len, target value의 인덱스 위치에 score_class값 전달
        2) apply data augment
            - shuffle target values
        """
        scores = np.array(self.score_list[idx]) # len(scores) == target count
        target_mask = np.zeros(self.cfg.max_len)
        targets = np.array(self.target_list[idx])
        
        # Data Augment for train stage
        if not self.is_valid:
            indices = list(range(len(scores)))
            random.shuffle(indices)
            scores = scores[indices]
            targets = targets[indices]
        
        text = self.cfg.tokenizer.cls_token + self.anchor_list[idx] + self.cfg.tokenizer.sep_token
        for target in targets:
            text += target + self.cfg.tokenizer.tar_token
        text += self.context_list[idx] + self.cfg.tokenizer.sep_token
        
        # tokenizing & make label list
        inputs = self.tokenizing(text) 
        label = torch.full([self.cfg.max_len], -1, dtype=torch.float)
        # target value의 인덱스(label list) 위치에 score값 자체를 전달
        # 나중에 score_class로 변환해주는 작업이 필요할 듯
        cnt_tar = 0
        cnt_sep = 0
        nth_target = -1
        prev_i = -1
        
        for i, input_id in enumerate(inputs['input_ids']):
            if input_id == self.cfg.tokenizer.tar_token_id:
                cnt_tar += 1
                if cnt_tar == len(targets):
                    break
            if input_id == self.cfg.tokenizer.sep_token_id:
                cnt_sep += 1
            
            if cnt_sep == 1 and input_id not in [self.cfg.tokenizer.pad_token_id, self.cfg.tokenizer.sep_token_id, self.cfg.tokenizer.tar_token_id]:
                if (i-prev_i) > 1:
                    nth_target += 1
                label[i] = scores[nth_target]
                target_mask[i] = 1
                prev_i = i

        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
            
        return inputs, target_mask, label

## 🧠 Step 4. Model Class

In [12]:
class UPPPMModel(nn.Module):
    def __init__(self, CFG, n_vocabs: int):
        super().__init__()
        self.cfg = CFG
        self.auto_cfg = AutoConfig.from_pretrained(CFG.model_name,
                                                   output_hidden_states = True)
        self.model = AutoModel.from_pretrained(CFG.model_name,
                                               config = self.auto_cfg)
        self.model.resize_token_embeddings(n_vocabs)
        """
        1) if your loss function == CrossEntropyLoss, change value 1 to 5
        2) fully_connected == classifier        
        """
        self.fc = nn.Linear(self.auto_cfg.hidden_size, 1)
        self._init_weights(self.fc) # Classifier Layer Init
        # checkpointing
        if self.cfg.gradient_checkpoint:
            self.model.gradient_checkpointing_enable()
        # re-init Top-K Encoder Layer
        if self.cfg.reinit:
            self.reinit_topk_layers()
    
    # Classifier Layer Init
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.auto_cfg.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.auto_cfg.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            
    # Re-Init Top-K Transformers Layer
    def reinit_topk_layers(self):
        """
        Re-initialize the last-k transformer layers.
        Args:
            model: The target transformer model.
            num_layers: The number of layers to be re-initialized.
        """
        self.model.encoder.layer[-self.cfg.num_reinit:].apply(self.model._init_weights) # model class에 있는거
        
    def forward(self, inputs: dict):
        """
        outputs.last_hidden_stats with no pooling => Token-Level Task
        """
        outputs = self.model(**inputs) # inputs from LECRDataset
        embedding = outputs.last_hidden_state
        output = self.fc(embedding).squeeze(-1)
        return output

## 🤝 Step 5. Model & Metric Utils

In [13]:
"""
Adversarial Weight Perturbation
"""
class AWP:
    def __init__(
        self,
        model,
        criterion,
        optimizer,
        awp: bool,
        adv_param: str="weight",
        adv_lr: float=1.0,
        adv_eps: float=0.01
    ) -> None:
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.awp = awp
        self.backup = {}
        self.backup_eps = {}

    def attack_backward(self, inputs: dict, label):
        with torch.cuda.amp.autocast(enabled=self.awp):
            self._save()
            self._attack_step()
            y_preds = self.model(inputs)
            adv_loss = self.criterion(
                y_preds.view(-1, 1), label.view(-1, 1))
            mask = (label.view(-1, 1) != -1)
            adv_loss = torch.masked_select(adv_loss, mask).mean()
            self.optimizer.zero_grad()
        return adv_loss

    def _attack_step(self) -> None:
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                norm1 = torch.norm(param.grad)
                norm2 = torch.norm(param.data.detach())
                if norm1 != 0 and not torch.isnan(norm1):
                    r_at = self.adv_lr * param.grad / (norm1 + e) * (norm2 + e)
                    param.data.add_(r_at)
                    param.data = torch.min(
                        torch.max(
                            param.data, self.backup_eps[name][0]), self.backup_eps[name][1]
                    )

    def _save(self) -> None:
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.data.clone()
                    grad_eps = self.adv_eps * param.abs().detach()
                    self.backup_eps[name] = (
                        self.backup[name] - grad_eps,
                        self.backup[name] + grad_eps,
                    )

    def _restore(self) -> None:
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}
        self.backup_eps = {}

class AverageMeter(object):
    def __init__(self):
        self.reset()
    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count        
        
class RMSELoss(nn.Module):
    def __init__(self, eps=1e-6):
        super().__init__()
        self.mse = nn.MSELoss()
        self.eps = eps # If MSE == 0, We need eps

    def forward(self, yhat, y):
        loss = torch.sqrt(self.mse(yhat, y) + self.eps)
        return loss
    
def collate(inputs: dict) -> dict:
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

# Pearson Correlations Coeffiicient Loss
class PearsonLoss(nn.Module):
    def __init__(self):
        super().__init__()
    
    def forward(self, y_pred, y_true) -> float:
        x = y_pred.clone()
        y = y_true.clone()
        vx = x - torch.mean(x)
        vy = y - torch.mean(y)
        cov = torch.sum(vx * vy)
        corr = cov / (torch.sqrt(torch.sum(vx ** 2)) * torch.sqrt(torch.sum(vy ** 2)) + 1e-12)
        corr = torch.maximum(torch.minimum(corr,torch.tensor(1)), torch.tensor(-1))
        
        return torch.sub(torch.tensor(1), corr ** 2)
    

# F2-Score for each fold
# Multi Classification => Positive Case would be highlighted, So We need to use Recall to Eval Metric
"""
오차행렬 및 관련 지표 정리
1) 모든건 항상 예측값 기준으로 생각
    - 예측값이 Positive && 실제 정답인 경우 => True Positive, TP
    - 예측값이 Positive && 실제 오답인 경우 => False Positive, FP
    - 예측값이 Negative && 실제 오답인 경우 => True Negative, TN
    - 예측값이 Negative && 실제 정답인 경우 ==> False Negative, FN

2) Precision
    - precision = TP / (TP + FP)
      => 정답으로 예측한 것 중에서 실제 정답에 해당 하는 경우
    - Recall = TP / (TP + FN)
      => 실제 정답 중에서 모델이 예측 성공한 것이 몇개 인가
"""
# def pearson_score(y_true, y_pred) -> float:
#     return pearsonr(y_true, y_pred)[0]

def pearson_score(y_true, y_pred) -> float:
    x, y = y_pred, y_true
    vx = x - np.mean(x)
    vy = y - np.mean(y)
    cov = np.sum(vx * vy)
    corr = cov / (np.sqrt(np.sum(vx ** 2)) * np.sqrt(np.sum(vy ** 2)) + 1e-12)
    return corr
    
def recall(y_true, y_pred) -> float:
    y_true = y_true.apply(lambda x: set(x.split()))
    y_pred = y_pred.apply(lambda x: set(x.split()))
    tp = np.array([len(x[0] & x[1]) for x in zip(y_true, y_pred)])
    fp = np.array([len(x[1] - x[0]) for x in zip(y_true, y_pred)])
    fn = np.array([len(x[0] - x[1]) for x in zip(y_true, y_pred)])
    recall = tp / (tp + fn)
    return round(recall.mean(), 4)

def precision(y_true, y_pred) -> float:
    y_true = y_true.apply(lambda x: set(x.split()))
    y_pred = y_pred.apply(lambda x: set(x.split()))
    tp = np.array([len(x[0] & x[1]) for x in zip(y_true, y_pred)])
    fp = np.array([len(x[1] - x[0]) for x in zip(y_true, y_pred)])
    fn = np.array([len(x[0] - x[1]) for x in zip(y_true, y_pred)])
    precision = tp / (tp + fp)
    return round(precision.mean(), 4)

def f2_score(y_true, y_pred) -> float:
    y_true = y_true.apply(lambda x: set(x.split()))
    y_pred = y_pred.apply(lambda x: set(x.split()))
    tp = np.array([len(x[0] & x[1]) for x in zip(y_true, y_pred)])
    fp = np.array([len(x[1] - x[0]) for x in zip(y_true, y_pred)])
    fn = np.array([len(x[0] - x[1]) for x in zip(y_true, y_pred)])
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f2 = tp / (tp + 0.2 * fp + 0.8 * fn)
    return round(f2.mean(), 4)

In [14]:
# Step 2.8 Trainer Input Class
# Later append for AWP, SWA, Re-Init Code
class TrainInput():
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.tokenizer = self.cfg.tokenizer
        self.df = df # return dataset
        if self.cfg.gradient_checkpoint:
            self.save_parameter = f'(best_score) {self.cfg.model_name}_state_dict.pth' # checkpoint
        
    
    # LLRD 
    def get_optimizer_grouped_parameters(self, model, layerwise_lr, layerwise_weight_decay, layerwise_lr_decay):
        no_decay = ["bias", "LayerNorm.weight"]
        # initialize lr for task specific layer
        optimizer_grouped_parameters = [{"params": [p for n, p in model.named_parameters() if "model" not in n],
                                         "weight_decay": 0.0,
                                         "lr": layerwise_lr,
                                        },]
        # initialize lrs for every layer
        layers = [model.model.embeddings] + list(model.model.encoder.layer)
        layers.reverse()
        lr = layerwise_lr
        for layer in layers:
            optimizer_grouped_parameters += [{"params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)],
                                              "weight_decay": layerwise_weight_decay,
                                              "lr": lr,
                                             },
                                             {"params": [p for n, p in layer.named_parameters() if any(nd in n for nd in no_decay)],
                                              "weight_decay": 0.0,
                                              "lr": lr,
                                             },]
            lr *= layerwise_lr_decay
        return optimizer_grouped_parameters
    
    def get_optimizer_params(self, model, encoder_lr, decoder_lr, weight_decay):
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
            'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
            'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
            'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters


    def make_batch(self, fold: int):
        train = self.df[self.df['fold'] != fold].reset_index(drop=True)
        valid = self.df[self.df['fold'] == fold].reset_index(drop=True)

        # Custom Dataset
        train_dataset = UPPPMDataset(self.cfg, train)
        valid_dataset = UPPPMDataset(self.cfg, valid, is_valid=True)
        valid_labels = valid['scores'].explode().to_numpy()

        # DataLoader
        loader_train = DataLoader(
            train_dataset,
            batch_size = self.cfg.batch_size,
            shuffle = True,
            worker_init_fn=seed_worker,
            generator=g,
            num_workers = self.cfg.num_workers,
            pin_memory = True,
            drop_last = True,
        )
        
        loader_valid = DataLoader(
            valid_dataset,
            batch_size = self.cfg.batch_size,
            shuffle = False,
            worker_init_fn=seed_worker,
            generator=g,
            num_workers = self.cfg.num_workers,
            pin_memory = True,
            drop_last = False,
        )
        
        return loader_train, loader_valid, train, valid, valid_labels

    def model_setting(self):
        """
        [model]
        1) Re-Initialze Weights of Encoder
           - DeBERTa => Last Two Layers == EMD        
        2) SWA
           - original model => to.device
           - after calculate, update swa_model
        """
        model = UPPPMModel(self.cfg, n_vocabs=len(self.tokenizer))        
        #model.load_state_dict(torch.load('Token_Classification_Fold0_DeBERTa_V3_Large.pth'))
        model.to(self.cfg.device)
        # SWA: Stochastic Weighted Averaging        
        if self.cfg.swa:
            swa_model = AveragedModel(model)
        else:
            swa_model = 'none'
        
        # Setting Loss_Function
        # Because we don't need to calculate none-target token: Output will be same shape as input
        if self.cfg.loss_fn == 'BCE':
            criterion = nn.BCEWithLogitsLoss(reduction='none') 
        if self.cfg.loss_fn == 'cross_entropy':
            criterion = nn.CrossEntropyLoss()
        if self.cfg.loss_fn == 'pearson':
            criterion = PearsonLoss()
        if self.cfg.loss_fn == 'RMSE':
            criterion = RMSELoss()
            
        # optimizer
        grouped_optimizer_params = self.get_optimizer_grouped_parameters(
            model, 
            self.cfg.layerwise_lr, 
            self.cfg.layerwise_weight_decay, 
            self.cfg.layerwise_lr_decay
        )
        
        optimizer = AdamW(
            grouped_optimizer_params,
            lr = self.cfg.layerwise_lr,
            eps = self.cfg.layerwise_adam_epsilon,
            correct_bias = not self.cfg.layerwise_use_bertadam)
        
        return model, swa_model, criterion, optimizer, self.save_parameter

In [15]:
# Step 3.1 Train & Validation Function
def train_fn(cfg,
             loader_train,
             loader_valid,
             model,
             criterion,
             optimizer,
             scheduler,
             valid,
             valid_labels,
             epoch,
             swa_model = None,
             swa_start = None,
             swa_scheduler = None,):
    # Train Stages
    # torch.amp.gradscaler
    awp = AWP(model, criterion, optimizer, cfg.awp, adv_lr=cfg.awp_lr, adv_eps=cfg.awp_eps)
    if cfg.amp_scaler:
        scaler = torch.cuda.amp.GradScaler(enabled = True)
    global_step, score_list = 0, [] # All Fold's average of mean F2-Score
    losses = AverageMeter()
    model.train()
    for step, (inputs, _, labels) in enumerate(tqdm(loader_train)):
        optimizer.zero_grad()
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(cfg.device) # train to gpu
        labels = labels.to(cfg.device) # label to gpu
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled = cfg.amp_scaler):
            preds = model(inputs)
            loss = criterion(preds.view(-1, 1), labels.view(-1,1))
            mask = (labels.view(-1, 1) != -1)
            loss = torch.masked_select(loss, mask).mean() # reduction = mean
            losses.update(loss, batch_size)
            """
            [gradient_accumlation]
            - GPU VRAM OVER 문제해결을 위해 사용
            - epoch이 사용자 지정 에폭 횟수를 넘을 때까지 Backward 하지 않고 그라디언트 축적
            - 지정 epoch 넘어가면 한 번에 Backward
            """
            if cfg.n_gradient_accumulation_steps > 1:
                loss = loss / cfg.n_gradient_accumulation_steps
                
        scaler.scale(loss).backward()        
        """
        [Adversarial Weight Training]
        """
        if cfg.awp and epoch >= cfg.nth_awp_start_epoch:
            loss = awp.attack_backward(inputs, labels)
            scaler.scale(loss).backward()
            awp._restore()
        """
        1) Clipping Gradient && Gradient Accumlation
        2) Stochastic Weight Averaging
        """
        if cfg.clipping_grad and ((step + 1) % cfg.n_gradient_accumulation_steps == 0 or cfg.n_gradient_accumulation_steps == 1):
            scaler.unscale_(optimizer)      
            grad_norm = torch.nn.utils.clip_grad_norm(
                model.parameters(),
                cfg.max_grad_norm
            )
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            
            if epoch >= int(swa_start):
                swa_model.update_parameters(model)
                swa_scheduler.step()  
            global_step += 1
            scheduler.step()
            
    # Validation Stage
    preds_list, label_list = [], []
    valid_losses = AverageMeter()
    model.eval()
    with torch.no_grad():
        for step, (inputs, target_masks, labels) in enumerate(tqdm(loader_valid)):
            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(cfg.device)
            labels = labels.to(cfg.device)
            batch_size = labels.size(0)
            preds = model(inputs)
            valid_loss = criterion(preds.view(-1, 1), labels.view(-1, 1))
            mask = (labels.view(-1, 1) != -1)
            valid_loss = torch.masked_select(valid_loss, mask).mean()
            valid_losses.update(valid_loss, batch_size)

            y_preds = preds.sigmoid().to('cpu').numpy()
                                          
            anchorwise_preds = []
            for pred, target_mask, in zip(y_preds, target_masks):
                prev_i = -1
                targetwise_pred_scores = []
                for i, (p, tm) in enumerate(zip(pred, target_mask)):
                    if tm != 0:
                        if i-1 == prev_i:
                            targetwise_pred_scores[-1].append(p)
                        else:
                            targetwise_pred_scores.append([p])
                        prev_i = i
                for targetwise_pred_score in targetwise_pred_scores:
                    anchorwise_preds.append(np.mean(targetwise_pred_score))
            preds_list.append(anchorwise_preds)
    # error_list = [[i, preds_list.index(i)] for i in preds_list if i == 'nan' or i == float('inf')]
    # print(error_list)
    epoch_score = pearson_score(valid_labels, np.array(reduce(lambda a, b: a + b, preds_list)))
    return losses.avg, valid_losses.avg, epoch_score, grad_norm, scheduler.get_lr()[0]

In [16]:
def swa_valid(cfg,
              loader_valid,
              swa_model,
              criterion,
              valid_labels):
    swa_preds_list, swa_label_list = [], []
    swa_model.eval()
    swa_valid_losses = AverageMeter()
    
    with torch.no_grad():
        for step, (swa_inputs, target_masks, swa_labels) in enumerate(tqdm(loader_valid)):
            swa_inputs = collate(swa_inputs)
            
            for k, v in swa_inputs.items():
                swa_inputs[k] = v.to(cfg.device)
                
            swa_labels = swa_labels.to(cfg.device)
            batch_size = swa_labels.size(0)
            
            swa_preds = swa_model(swa_inputs)
            
            swa_valid_loss = criterion(swa_preds.view(-1, 1), swa_labels.view(-1, 1))
            mask = (swa_labels.view(-1, 1) != -1)
            swa_valid_loss = torch.masked_select(swa_valid_loss, mask)
            swa_valid_loss = swa_valid_loss.mean()
            swa_valid_losses.update(swa_valid_loss, batch_size)
            
            swa_y_preds = swa_preds.sigmoid().to('cpu').numpy()
                                          
            anchorwise_preds = []
            for pred, target_mask, in zip(swa_y_preds, target_masks):
                prev_i = -1
                targetwise_pred_scores = []
                for i, (p, tm) in enumerate(zip(pred, target_mask)):
                    if tm != 0:
                        if i-1 == prev_i:
                            targetwise_pred_scores[-1].append(p)
                        else:
                            targetwise_pred_scores.append([p])
                        prev_i = i
                for targetwise_pred_score in targetwise_pred_scores:
                    anchorwise_preds.append(np.mean(targetwise_pred_score))
                    
            swa_preds_list.append(anchorwise_preds)
    swa_valid_score = pearson_score(valid_labels, np.array(reduce(lambda a, b: a + b, swa_preds_list)))    
    del swa_preds_list, swa_y_preds, swa_labels, anchorwise_preds
    gc.collect()
    torch.cuda.empty_cache()
    return swa_valid_losses.avg, swa_valid_score

In [None]:
# This is Test Code for Appending Cross-Validaton Strategy (fold to epoch)
cfg_list = [CFG]
for cfg in cfg_list:
    #init wandb
    wandb.init(project="[Append Ver 2]UPPPM Token Classification", 
               name='[Append Version 2.2]' + 'Fold 0' + cfg.model_name,
               config=class2dict(cfg),
               group=cfg.model_name,
               job_type="train",
               entity = "qcqced")
    wandb_config = wandb.config
    print(f'========================= Retriever Model :{cfg.model_name} =========================')
    fold_list, swa_score_max = [i for i in range(cfg.n_folds)], -np.inf
    
    for fold in tqdm(fold_list):
        print(f'============== {fold+1}th Fold Train & Validation ==============')
        val_score_max = -np.inf
        fold_train_loss_list, fold_valid_loss_list, fold_score_list  = [], [], []
        fold_swa_loss, fold_swa_score = [], []
        
        train_input = TrainInput(cfg, train_df) # init object
        model, swa_model, criterion, optimizer, save_parameter = train_input.model_setting()
        loader_train, loader_valid, train, valid, valid_labels = train_input.make_batch(fold)
        
        # Scheduler Setting
        if cfg.swa:
            swa_start = cfg.swa_start
            swa_scheduler = SWALR(
                optimizer,
                swa_lr = cfg.swa_lr, # Later Append
                anneal_epochs=cfg.anneal_epochs, 
                anneal_strategy=cfg.anneal_strategy
            )
            
        if cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, 
                num_warmup_steps=int(len(train)/cfg.batch_size * cfg.epochs/cfg.n_gradient_accumulation_steps) * cfg.warmup_ratio,
                num_training_steps=int(len(train)/cfg.batch_size * cfg.epochs/cfg.n_gradient_accumulation_steps),
                num_cycles = cfg.num_cycles
            )
        elif cfg.scheduler == 'cosine_annealing':
            scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
                optimizer, 
                num_warmup_steps=int(len(train)/cfg.batch_size * cfg.epochs/cfg.n_gradient_accumulation_steps) * cfg.warmup_ratio,
                num_training_steps=int(len(train)/cfg.batch_size * cfg.epochs/cfg.n_gradient_accumulation_steps),
                num_cycles = 8 
            )
        else:
            scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=int(len(train)/cfg.batch_size * cfg.epochs) * cfg.warmup_ratio,
                num_training_steps=int(len(train) /cfg.batch_size * cfg.epochs),
                num_cycles = cfg.num_cycles
            )  

        for epoch in range(cfg.epochs):
            print(f'[{epoch+1}/{cfg.epochs}] Train & Validation')
            if cfg.swa:
                train_loss, valid_loss, score, grad_norm, lr = train_fn(
                    cfg,
                    loader_train,
                    loader_valid,
                    model,
                    criterion,
                    optimizer,
                    scheduler,
                    valid,
                    valid_labels,
                    int(epoch),
                    swa_model=swa_model,
                    swa_start=swa_start,
                    swa_scheduler=swa_scheduler,
            )        
            else:
                train_loss, valid_loss, score = train_fn(
                    cfg,
                    loader_train,
                    loader_valid,
                    model,
                    criterion,
                    optimizer,
                    scheduler,
                    valid,
                    valid_labels,
                    int(epoch),
            )
            
            train_loss = train_loss.detach().cpu().numpy()
            valid_loss = valid_loss.detach().cpu().numpy()
            grad_norm = grad_norm.detach().cpu().numpy()
            
            fold_train_loss_list.append(train_loss)
            fold_valid_loss_list.append(valid_loss)
            fold_score_list.append(score)
           
            wandb.log({
                '<epoch> Train Loss': train_loss,
                '<epoch> Valid Loss': valid_loss,
                '<epoch> Pearson_Score': score,
                '<epoch> Gradient Norm': grad_norm,
                '<epoch> lr': lr
            })
            
            print(f'[{epoch+1}/{cfg.epochs}] Train Loss: {np.round(train_loss, 4)}') 
            print(f'[{epoch+1}/{cfg.epochs}] Valid Loss: {np.round(valid_loss, 4)}')
            print(f'[{epoch+1}/{cfg.epochs}] Pearson Score: {np.round(score, 4)}')
            print(f'[{epoch+1}/{cfg.epochs}] Gradient Norm: {np.round(grad_norm, 4)}')
            print(f'[{epoch+1}/{cfg.epochs}] lr: {lr}')
            
        
            if val_score_max <= score:
                print(f'[Update] Valid Score : ({val_score_max:.4f} => {score:.4f}) Save Parameter')
                print(f'Best Score: {score}')
                torch.save(model.state_dict(),
                           f'Ver2-3_Token_Classification_Fold{fold}_DeBERTa_V3_Large.pth')
                val_score_max = score
            
        del train_loss, valid_loss
        gc.collect()
        torch.cuda.empty_cache()
        
        print(f'================= {fold+1}th Train & Validation =================')            
        fold_train_loss = np.mean(fold_train_loss_list)
        fold_valid_loss = np.mean(fold_valid_loss_list)
        fold_score = np.mean(fold_score_list)
        wandb.log({f'<Fold{fold+1}> Train Loss': fold_train_loss,
                   f'<Fold{fold+1}> Valid Loss': fold_valid_loss,
                   f'<Fold{fold+1}> Pearson_Score': fold_score,})
        print(f'Fold[{fold+1}/{fold_list[-1]+1}] Train Loss: {np.round(fold_train_loss, 4)}')
        print(f'Fold[{fold+1}/{fold_list[-1]+1}] Valid Loss: {np.round(fold_valid_loss, 4)}')
        print(f'Fold[{fold+1}/{fold_list[-1]+1}] Pearson Score: {np.round(fold_score, 4)}')
        
        if cfg.swa:
            update_bn(loader_train, swa_model) # Stochastic Weight Averaging
            fold_swa_loss, fold_swa_score = swa_valid(
                cfg,
                loader_valid,
                swa_model,
                criterion,
                valid_labels,
            )
            fold_swa_loss = fold_swa_loss.detach().cpu().numpy()
            fold_swa_loss = np.mean(fold_swa_loss)
            fold_swa_score = np.mean(fold_swa_score)
            
            wandb.log({
                f'<Fold{fold+1}> SWA Valid Loss': fold_swa_loss,
                f'<Fold{fold+1}> SWA Pearson_Score': fold_swa_score,
            })
            
            print(f'Fold[{fold+1}/{fold_list[-1]+1}] SWA Loss: {np.round(fold_swa_loss, 4)}')
            print(f'Fold[{fold+1}/{fold_list[-1]+1}] SWA Score: {np.round(fold_swa_score, 4)}') 
        
        if val_score_max <= fold_swa_score:
            print(f'[Update] Valid Score : ({val_score_max:.4f} => {fold_swa_score:.4f}) Save Parameter')
            print(f'Best Score: {fold_swa_score}')
            torch.save(model.state_dict(),
                       f'SWA_Ver2-3_Token_Classification_Fold{fold}_DeBERTa_V3_Large.pth')
            val_score_max = fold_score
            
        del fold_swa_loss
        gc.collect()
        torch.cuda.empty_cache()
            
    wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mqcqced[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666878751666445, max=1.0)…



  0%|          | 0/4 [00:00<?, ?it/s]



Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[1/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[1/180] Train Loss: 0.6843000054359436
[1/180] Valid Loss: 0.6578999757766724
[1/180] Pearson Score: 0.4097
[1/180] Gradient Norm: 1780.4930419921875
[1/180] lr: 2.7122321670735012e-06
[Update] Valid Score : (-inf => 0.4097) Save Parameter
Best Score: 0.40971906912197553
[2/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[2/180] Train Loss: 0.6531999707221985
[2/180] Valid Loss: 0.6401000022888184
[2/180] Pearson Score: 0.532
[2/180] Gradient Norm: 2407.181640625
[2/180] lr: 5.4244643341470025e-06
[Update] Valid Score : (0.4097 => 0.5320) Save Parameter
Best Score: 0.5319613917782015
[3/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[3/180] Train Loss: 0.640500009059906
[3/180] Valid Loss: 0.6269999742507935
[3/180] Pearson Score: 0.6087
[3/180] Gradient Norm: 3722.11279296875
[3/180] lr: 8.136696501220503e-06
[Update] Valid Score : (0.5320 => 0.6087) Save Parameter
Best Score: 0.608674825045642
[4/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[4/180] Train Loss: 0.6287000179290771
[4/180] Valid Loss: 0.6136999726295471
[4/180] Pearson Score: 0.6612
[4/180] Gradient Norm: 8064.52001953125
[4/180] lr: 1.0848928668294005e-05
[Update] Valid Score : (0.6087 => 0.6612) Save Parameter
Best Score: 0.661183444323874
[5/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[5/180] Train Loss: 0.6150000095367432
[5/180] Valid Loss: 0.5963000059127808
[5/180] Pearson Score: 0.6958
[5/180] Gradient Norm: 8425.140625
[5/180] lr: 1.3561160835367507e-05
[Update] Valid Score : (0.6612 => 0.6958) Save Parameter
Best Score: 0.6957966041860348
[6/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[6/180] Train Loss: 0.5999000072479248
[6/180] Valid Loss: 0.5839999914169312
[6/180] Pearson Score: 0.7249
[6/180] Gradient Norm: 4018.826904296875
[6/180] lr: 1.6273393002441007e-05
[Update] Valid Score : (0.6958 => 0.7249) Save Parameter
Best Score: 0.724850180184517
[7/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[7/180] Train Loss: 0.5882999897003174
[7/180] Valid Loss: 0.5703999996185303
[7/180] Pearson Score: 0.7546
[7/180] Gradient Norm: 5371.4365234375
[7/180] lr: 1.898562516951451e-05
[Update] Valid Score : (0.7249 => 0.7546) Save Parameter
Best Score: 0.7545719150912469
[8/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[8/180] Train Loss: 0.5796999931335449
[8/180] Valid Loss: 0.5673999786376953
[8/180] Pearson Score: 0.7706
[8/180] Gradient Norm: 6399.50830078125
[8/180] lr: 2.169785733658801e-05
[Update] Valid Score : (0.7546 => 0.7706) Save Parameter
Best Score: 0.7706341882087255
[9/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[9/180] Train Loss: 0.5719000101089478
[9/180] Valid Loss: 0.559499979019165
[9/180] Pearson Score: 0.7849
[9/180] Gradient Norm: 6250.1083984375
[9/180] lr: 2.4410089503661513e-05
[Update] Valid Score : (0.7706 => 0.7849) Save Parameter
Best Score: 0.7848926436915169
[10/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[10/180] Train Loss: 0.5637999773025513
[10/180] Valid Loss: 0.555899977684021
[10/180] Pearson Score: 0.7985
[10/180] Gradient Norm: 6449.2470703125
[10/180] lr: 2.7122321670735013e-05
[Update] Valid Score : (0.7849 => 0.7985) Save Parameter
Best Score: 0.7985043920576239
[11/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[11/180] Train Loss: 0.5575000047683716
[11/180] Valid Loss: 0.5504000186920166
[11/180] Pearson Score: 0.8068
[11/180] Gradient Norm: 5355.22265625
[11/180] lr: 2.9834553837808517e-05
[Update] Valid Score : (0.7985 => 0.8068) Save Parameter
Best Score: 0.8067804918261992
[12/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[12/180] Train Loss: 0.5526000261306763
[12/180] Valid Loss: 0.5532000064849854
[12/180] Pearson Score: 0.813
[12/180] Gradient Norm: 4369.27392578125
[12/180] lr: 3.254678600488201e-05
[Update] Valid Score : (0.8068 => 0.8130) Save Parameter
Best Score: 0.8129845930798877
[13/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[13/180] Train Loss: 0.5468999743461609
[13/180] Valid Loss: 0.5467000007629395
[13/180] Pearson Score: 0.8207
[13/180] Gradient Norm: 3543.90869140625
[13/180] lr: 3.525901817195551e-05
[Update] Valid Score : (0.8130 => 0.8207) Save Parameter
Best Score: 0.8207410904728175
[14/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[14/180] Train Loss: 0.5428000092506409
[14/180] Valid Loss: 0.5479000210762024
[14/180] Pearson Score: 0.8222
[14/180] Gradient Norm: 4250.6513671875
[14/180] lr: 3.797125033902902e-05
[Update] Valid Score : (0.8207 => 0.8222) Save Parameter
Best Score: 0.8221954399992328
[15/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[15/180] Train Loss: 0.5389000177383423
[15/180] Valid Loss: 0.5475000143051147
[15/180] Pearson Score: 0.8251
[15/180] Gradient Norm: 2680.077392578125
[15/180] lr: 4.068348250610252e-05
[Update] Valid Score : (0.8222 => 0.8251) Save Parameter
Best Score: 0.8250705062382038
[16/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[16/180] Train Loss: 0.5353999733924866
[16/180] Valid Loss: 0.5485000014305115
[16/180] Pearson Score: 0.827
[16/180] Gradient Norm: 4286.13134765625
[16/180] lr: 4.339571467317602e-05
[Update] Valid Score : (0.8251 => 0.8270) Save Parameter
Best Score: 0.8269675036996958
[17/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[17/180] Train Loss: 0.5324000120162964
[17/180] Valid Loss: 0.5525000095367432
[17/180] Pearson Score: 0.8294
[17/180] Gradient Norm: 6413.98828125
[17/180] lr: 4.610794684024952e-05
[Update] Valid Score : (0.8270 => 0.8294) Save Parameter
Best Score: 0.8294301481083621
[18/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[18/180] Train Loss: 0.5307999849319458
[18/180] Valid Loss: 0.545799970626831
[18/180] Pearson Score: 0.8343
[18/180] Gradient Norm: 2480.564208984375
[18/180] lr: 4.8820179007323027e-05
[Update] Valid Score : (0.8294 => 0.8343) Save Parameter
Best Score: 0.834254476997923
[19/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[19/180] Train Loss: 0.5299000144004822
[19/180] Valid Loss: 0.5479000210762024
[19/180] Pearson Score: 0.8337
[19/180] Gradient Norm: 1635.8603515625
[19/180] lr: 4.9634645331736365e-05
[20/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[20/180] Train Loss: 0.5249999761581421
[20/180] Valid Loss: 0.5479000210762024
[20/180] Pearson Score: 0.8362
[20/180] Gradient Norm: 3918.79931640625
[20/180] lr: 4.724224290233164e-05
[Update] Valid Score : (0.8343 => 0.8362) Save Parameter
Best Score: 0.8361648769024356
[21/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[21/180] Train Loss: 0.5217000246047974
[21/180] Valid Loss: 0.5519999861717224
[21/180] Pearson Score: 0.8387
[21/180] Gradient Norm: 2828.005859375
[21/180] lr: 4.282391876428358e-05
[Update] Valid Score : (0.8362 => 0.8387) Save Parameter
Best Score: 0.8387472586881518
[22/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[22/180] Train Loss: 0.5181999802589417
[22/180] Valid Loss: 0.5485000014305115
[22/180] Pearson Score: 0.8376
[22/180] Gradient Norm: 3624.087890625
[22/180] lr: 3.678211339207133e-05
[23/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[23/180] Train Loss: 0.5163000226020813
[23/180] Valid Loss: 0.5508000254631042
[23/180] Pearson Score: 0.8412
[23/180] Gradient Norm: 3202.439453125
[23/180] lr: 2.9667141100519634e-05
[Update] Valid Score : (0.8387 => 0.8412) Save Parameter
Best Score: 0.8412145154352871
[24/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[24/180] Train Loss: 0.5146999955177307
[24/180] Valid Loss: 0.550000011920929
[24/180] Pearson Score: 0.84
[24/180] Gradient Norm: 3588.76953125
[24/180] lr: 2.212706498673368e-05
[25/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[25/180] Train Loss: 0.51419997215271
[25/180] Valid Loss: 0.555899977684021
[25/180] Pearson Score: 0.8385
[25/180] Gradient Norm: 5315.53369140625
[25/180] lr: 1.48486684812703e-05
[26/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[26/180] Train Loss: 0.5115000009536743
[26/180] Valid Loss: 0.5515999794006348
[26/180] Pearson Score: 0.8414
[26/180] Gradient Norm: 4035.801513671875
[26/180] lr: 8.494900080343715e-06
[Update] Valid Score : (0.8412 => 0.8414) Save Parameter
Best Score: 0.8414331603470655
[27/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[27/180] Train Loss: 0.511900007724762
[27/180] Valid Loss: 0.5548999905586243
[27/180] Pearson Score: 0.8409
[27/180] Gradient Norm: 4706.0908203125
[27/180] lr: 3.6444890691097577e-06
[28/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[28/180] Train Loss: 0.5116999745368958
[28/180] Valid Loss: 0.5547999739646912
[28/180] Pearson Score: 0.8408
[28/180] Gradient Norm: 2299.37841796875
[28/180] lr: 7.392323026181453e-07
[29/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[29/180] Train Loss: 0.5105999708175659
[29/180] Valid Loss: 0.5533999800682068
[29/180] Pearson Score: 0.8417
[29/180] Gradient Norm: 3829.83349609375
[29/180] lr: 4.995624660278707e-05
[Update] Valid Score : (0.8414 => 0.8417) Save Parameter
Best Score: 0.8417452063837799
[30/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[30/180] Train Loss: 0.5120000243186951
[30/180] Valid Loss: 0.5551999807357788
[30/180] Pearson Score: 0.8417
[30/180] Gradient Norm: 3008.890380859375
[30/180] lr: 4.8378600357061824e-05
[31/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[31/180] Train Loss: 0.5110999941825867
[31/180] Valid Loss: 0.5514000058174133
[31/180] Pearson Score: 0.8404
[31/180] Gradient Norm: 1851.47314453125
[31/180] lr: 4.4671527947390544e-05
[32/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[32/180] Train Loss: 0.5120000243186951
[32/180] Valid Loss: 0.5611000061035156
[32/180] Pearson Score: 0.8379
[32/180] Gradient Norm: 4354.42138671875
[32/180] lr: 3.917268589983689e-05
[33/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[33/180] Train Loss: 0.5091000199317932
[33/180] Valid Loss: 0.5569000244140625
[33/180] Pearson Score: 0.8416
[33/180] Gradient Norm: 8824.7109375
[33/180] lr: 3.23829330318078e-05
[34/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[34/180] Train Loss: 0.5092999935150146
[34/180] Valid Loss: 0.5547000169754028
[34/180] Pearson Score: 0.8418
[34/180] Gradient Norm: 2301.409912109375
[34/180] lr: 2.4920710019096003e-05
[Update] Valid Score : (0.8417 => 0.8418) Save Parameter
Best Score: 0.841761358055445
[35/180] Train & Validation


  0%|          | 0/20 [00:00<?, ?it/s]

[Append Ver 1]
1) Add Gradient Accumulation & Clipping Norm
- 느리지만 확실히 안정적으로 스코어 상승, 다음에 Clipping Norm 설정 끄고 epoch별 Gradient 총량을 구해서 max_norm 값을 accumulation step 숫자에 맞게 수정하자  
- 확실히 max_grad_norm을 올리니까 처음부터 어지간한 성능이 나오는 듯  
- Running Time 생각 안하고 Batch & Epoch 관점에서만 보면 훨씬 성능이 좋아 진 거긴 함  
- learning rate도 accumulation 숫자에 맞게 나눠줘야 하는거 아닌가?? 한번 해보자

2) Add Stochastic Weight Averaging  
- OOM 발생 안하고 잘 돌아간다 
- VRAM이 진짜 10% 정도 상승함 
- 이제 문제는..... AWP  
- epoch 8 설정하고 한번 일반 Validation이랑 얼마나 차이가 나는지 확인해보자  
  일반 모델 폴드 평균: 0.51  
  SWA 모델: 0.60  

3) Adversarial Weight Pertubation  
- token max_len = 400 => 아슬아슬하게 돌아감
- 근데 학습 속도가 너무 느려서 못써먹겠음  
- 이거 같이 쓸거면 Gradient Norm을 올리던 학습률을 올리던 해야겠음  

4) Cosine Scheduler  
- num_cycle: 8  
  [초기]  
  epoch=180, num_accumulation = 4, num_cycle =2
  실제 스케줄러 step은 45회 발생, 실험 결과 에폭 60회(실 에폭 15회) 부터 스코어가 수렴하는 경향성을 보임  
  num_cycle = 2로 설정했기 때문에, 사실상 한 주기의 절반쯤부터 수렴이 발생함  
  단순하게 생각해보면 num_cycle * 2 * = 16으로 새로운 파이프라인의 num_cycle 설정하는 것이 맞아보이지만,  
  학습률의 감소하는 추세를 고려했을때 8정도로 설정하는 것이 일단 좋아 보인다.
  이런 계산 및 가정하는 것이 귀찮을 뿐더러 정확한 학습 결과를 보장하는 것도 아니기 때문에 얼른 파라미터 튜닝 툴을 적용하자  
  
[Append Ver 2]  
1) num_accumulation = 1
- 이거 적용하는 것의 이점이 딱히 없는듯  
- AWP = False => max_len = 512
2) Clipping Grad Norm = 1000  

3) Cosine Annealing Scheduler  
- num_cycle: 8, 16  

4) Re-Init Top Encoder Block  
- num_reinit: 5  

5) Cross Validation  
- num_fold: 4  
  실제 대회 LB는 25%, PB는 75%라서 해당 비율에 맞게 폴드를 구성하자...CV의 중요성은 진짜 100000억번 반복해도 지나치지 않은 것 같다  
  
6) Add LR Tracker  

7) Append LR  
- options: 1e-5, 5e-6

In [None]:
del train_df, model, swa_model
gc.collect()
torch.cuda.empty_cache()