# About this notebook
- Deberta-base starter code
- pip wheels is [here](https://www.kaggle.com/yasufuminakama/nbme-pip-wheels)
- Inference notebook is [here](https://www.kaggle.com/yasufuminakama/nbme-deberta-base-baseline-inference)

If this notebook is helpful, feel free to upvote :)

# Directory settings

In [1]:
# The following is necessary if you want to use the fast tokenizer for deberta v2 or v3
import shutil
from pathlib import Path

transformers_path = Path("/home/gpufs/users/students/iasd22/iasd22_0904/miniconda3/envs/dsa3/lib/python3.9/site-packages/transformers")

input_dir = Path("./fast_token")

convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path/convert_file.name

if conversion_path.exists():
    conversion_path.unlink()

shutil.copy(convert_file, transformers_path)
deberta_v2_path = transformers_path / "models" / "deberta_v2"

for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py', "deberta__init__.py"]:
    if str(filename).startswith("deberta"):
        filepath = deberta_v2_path/str(filename).replace("deberta", "")
    else:
        filepath = deberta_v2_path/filename
    if filepath.exists():
        filepath.unlink()

    shutil.copy(input_dir/filename, filepath)

In [2]:
# ====================================================
# Directory settings
# ====================================================
import os

OUTPUT_DIR = './ouput_deberta_large/'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# CFG

In [3]:
# ====================================================
# CFG
# ====================================================
class CFG:
    wandb=True
    competition='NBME'
    _wandb_kernel='nakama'
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model="microsoft/deberta-v3-large"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=5
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=12
    fc_dropout=0.2
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    train=True
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [4]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:
    
    import wandb

    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        secret_value_0 = user_secrets.get_secret("wandb_api")
        wandb.login(key=secret_value_0)
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='NBME-Public-derbertav3', 
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train",
                     anonymous=anony)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. 
Get your W&B access token from here: https://wandb.ai/authorize


[34m[1mwandb[0m: Currently logged in as: [33manony-moose-234678[0m (use `wandb login --relogin` to force relogin)


# Library

In [5]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
# import joblib
import itertools
import warnings
from IPython.display import display

warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

# os.system('pip uninstall -y transformers')
# os.system('python -m pip install --no-index --find-links=../input/nbme-pip-wheels transformers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.11.6
transformers.__version__: 4.18.0
env: TOKENIZERS_PARALLELISM=true


# Helper functions for scoring

In [6]:
# From https://www.kaggle.com/theoviel/evaluation-metric-folds-baseline

def micro_f1(preds, truths):
    """
    Micro f1 on binary arrays.

    Args:
        preds (list of lists of ints): Predictions.
        truths (list of lists of ints): Ground truths.

    Returns:
        float: f1 score.
    """
    # Micro : aggregating over all instances
    preds = np.concatenate(preds)
    truths = np.concatenate(truths)
    return f1_score(truths, preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans.

    Returns:
        np array [length]: Binarized spans.
    """
    length = np.max(spans) if length is None else length
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 on spans.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds = []
    bin_truths = []
    for pred, truth in zip(preds, truths):
        if not len(pred) and not len(truth):
            continue
        length = max(np.max(pred) if len(pred) else 0, np.max(truth) if len(truth) else 0)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)

In [7]:
def create_labels_for_scoring(df):
    # example: ['0 1', '3 4'] -> ['0 1; 3 4']
    df['location_for_create_labels'] = [ast.literal_eval(f'[]')] * len(df)
    for i in range(len(df)):
        lst = df.loc[i, 'location']
        if lst:
            new_lst = ';'.join(lst)
            df.loc[i, 'location_for_create_labels'] = ast.literal_eval(f'[["{new_lst}"]]')
    # create labels
    truths = []
    for location_list in df['location_for_create_labels'].values:
        truth = []
        if len(location_list) > 0:
            location = location_list[0]
            for loc in [s.split() for s in location.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)
    return truths


def get_char_probs(texts, predictions, tokenizer):
    results = [np.zeros(len(t)) for t in texts]
    for i, (text, prediction) in enumerate(zip(texts, predictions)):
        encoded = tokenizer(text, 
                            add_special_tokens=True,
                            return_offsets_mapping=True)
        for idx, (offset_mapping, pred) in enumerate(zip(encoded['offset_mapping'], prediction)):
            start = offset_mapping[0]
            end = offset_mapping[1]
            results[i][start:end] = pred
    return results


def get_results(char_probs, th=0.5):
    results = []
    for char_prob in char_probs:
        result = np.where(char_prob >= th)[0] + 1
        result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
        result = [f"{min(r)} {max(r)}" for r in result]
        result = ";".join(result)
        results.append(result)
    return results


def get_predictions(results):
    predictions = []
    for result in results:
        prediction = []
        if result != "":
            for loc in [s.split() for s in result.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                prediction.append([start, end])
        predictions.append(prediction)
    return predictions

# Utils

In [8]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    score = span_micro_f1(y_true, y_pred)
    return score


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

# Data Loading

In [9]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv('./nbme-score-clinical-patient-notes/train.csv')
train['annotation'] = train['annotation'].apply(ast.literal_eval)
train['location'] = train['location'].apply(ast.literal_eval)
features = pd.read_csv('./nbme-score-clinical-patient-notes/features.csv')
def preprocess_features(features):
    features.loc[27, 'feature_text'] = "Last-Pap-smear-1-year-ago"
    return features
features = preprocess_features(features)
patient_notes = pd.read_csv('./nbme-score-clinical-patient-notes/patient_notes.csv')

print(f"train.shape: {train.shape}")
display(train.head())
print(f"features.shape: {features.shape}")
display(features.head())
print(f"patient_notes.shape: {patient_notes.shape}")
display(patient_notes.head())

train.shape: (14300, 6)


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724]
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693]
2,00016_002,0,16,2,[chest pressure],[203 217]
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]"
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258]


features.shape: (143, 3)


Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded


patient_notes.shape: (42146, 3)


Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


In [10]:
train = train.merge(features, on=['feature_num', 'case_num'], how='left')
train = train.merge(patient_notes, on=['pn_num', 'case_num'], how='left')
display(train.head())

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,HPI: 17yo M presents with palpitations. Patien...


In [11]:
# incorrect annotation
train.loc[338, 'annotation'] = ast.literal_eval('[["father heart attack"]]')
train.loc[338, 'location'] = ast.literal_eval('[["764 783"]]')

train.loc[621, 'annotation'] = ast.literal_eval('[["for the last 2-3 months"]]')
train.loc[621, 'location'] = ast.literal_eval('[["77 100"]]')

train.loc[655, 'annotation'] = ast.literal_eval('[["no heat intolerance"], ["no cold intolerance"]]')
train.loc[655, 'location'] = ast.literal_eval('[["285 292;301 312"], ["285 287;296 312"]]')

train.loc[1262, 'annotation'] = ast.literal_eval('[["mother thyroid problem"]]')
train.loc[1262, 'location'] = ast.literal_eval('[["551 557;565 580"]]')

train.loc[1265, 'annotation'] = ast.literal_eval('[[\'felt like he was going to "pass out"\']]')
train.loc[1265, 'location'] = ast.literal_eval('[["131 135;181 212"]]')

train.loc[1396, 'annotation'] = ast.literal_eval('[["stool , with no blood"]]')
train.loc[1396, 'location'] = ast.literal_eval('[["259 280"]]')

train.loc[1591, 'annotation'] = ast.literal_eval('[["diarrhoe non blooody"]]')
train.loc[1591, 'location'] = ast.literal_eval('[["176 184;201 212"]]')

train.loc[1615, 'annotation'] = ast.literal_eval('[["diarrhea for last 2-3 days"]]')
train.loc[1615, 'location'] = ast.literal_eval('[["249 257;271 288"]]')

train.loc[1664, 'annotation'] = ast.literal_eval('[["no vaginal discharge"]]')
train.loc[1664, 'location'] = ast.literal_eval('[["822 824;907 924"]]')

train.loc[1714, 'annotation'] = ast.literal_eval('[["started about 8-10 hours ago"]]')
train.loc[1714, 'location'] = ast.literal_eval('[["101 129"]]')

train.loc[1929, 'annotation'] = ast.literal_eval('[["no blood in the stool"]]')
train.loc[1929, 'location'] = ast.literal_eval('[["531 539;549 561"]]')

train.loc[2134, 'annotation'] = ast.literal_eval('[["last sexually active 9 months ago"]]')
train.loc[2134, 'location'] = ast.literal_eval('[["540 560;581 593"]]')

train.loc[2191, 'annotation'] = ast.literal_eval('[["right lower quadrant pain"]]')
train.loc[2191, 'location'] = ast.literal_eval('[["32 57"]]')

train.loc[2553, 'annotation'] = ast.literal_eval('[["diarrhoea no blood"]]')
train.loc[2553, 'location'] = ast.literal_eval('[["308 317;376 384"]]')

train.loc[3124, 'annotation'] = ast.literal_eval('[["sweating"]]')
train.loc[3124, 'location'] = ast.literal_eval('[["549 557"]]')

train.loc[3858, 'annotation'] = ast.literal_eval('[["previously as regular"], ["previously eveyr 28-29 days"], ["previously lasting 5 days"], ["previously regular flow"]]')
train.loc[3858, 'location'] = ast.literal_eval('[["102 123"], ["102 112;125 141"], ["102 112;143 157"], ["102 112;159 171"]]')

train.loc[4373, 'annotation'] = ast.literal_eval('[["for 2 months"]]')
train.loc[4373, 'location'] = ast.literal_eval('[["33 45"]]')

train.loc[4763, 'annotation'] = ast.literal_eval('[["35 year old"]]')
train.loc[4763, 'location'] = ast.literal_eval('[["5 16"]]')

train.loc[4782, 'annotation'] = ast.literal_eval('[["darker brown stools"]]')
train.loc[4782, 'location'] = ast.literal_eval('[["175 194"]]')

train.loc[4908, 'annotation'] = ast.literal_eval('[["uncle with peptic ulcer"]]')
train.loc[4908, 'location'] = ast.literal_eval('[["700 723"]]')

train.loc[6016, 'annotation'] = ast.literal_eval('[["difficulty falling asleep"]]')
train.loc[6016, 'location'] = ast.literal_eval('[["225 250"]]')

train.loc[6192, 'annotation'] = ast.literal_eval('[["helps to take care of aging mother and in-laws"]]')
train.loc[6192, 'location'] = ast.literal_eval('[["197 218;236 260"]]')

train.loc[6380, 'annotation'] = ast.literal_eval('[["No hair changes"], ["No skin changes"], ["No GI changes"], ["No palpitations"], ["No excessive sweating"]]')
train.loc[6380, 'location'] = ast.literal_eval('[["480 482;507 519"], ["480 482;499 503;512 519"], ["480 482;521 531"], ["480 482;533 545"], ["480 482;564 582"]]')

train.loc[6562, 'annotation'] = ast.literal_eval('[["stressed due to taking care of her mother"], ["stressed due to taking care of husbands parents"]]')
train.loc[6562, 'location'] = ast.literal_eval('[["290 320;327 337"], ["290 320;342 358"]]')

train.loc[6862, 'annotation'] = ast.literal_eval('[["stressor taking care of many sick family members"]]')
train.loc[6862, 'location'] = ast.literal_eval('[["288 296;324 363"]]')

train.loc[7022, 'annotation'] = ast.literal_eval('[["heart started racing and felt numbness for the 1st time in her finger tips"]]')
train.loc[7022, 'location'] = ast.literal_eval('[["108 182"]]')

train.loc[7422, 'annotation'] = ast.literal_eval('[["first started 5 yrs"]]')
train.loc[7422, 'location'] = ast.literal_eval('[["102 121"]]')

train.loc[8876, 'annotation'] = ast.literal_eval('[["No shortness of breath"]]')
train.loc[8876, 'location'] = ast.literal_eval('[["481 483;533 552"]]')

train.loc[9027, 'annotation'] = ast.literal_eval('[["recent URI"], ["nasal stuffines, rhinorrhea, for 3-4 days"]]')
train.loc[9027, 'location'] = ast.literal_eval('[["92 102"], ["123 164"]]')

train.loc[9938, 'annotation'] = ast.literal_eval('[["irregularity with her cycles"], ["heavier bleeding"], ["changes her pad every couple hours"]]')
train.loc[9938, 'location'] = ast.literal_eval('[["89 117"], ["122 138"], ["368 402"]]')

train.loc[9973, 'annotation'] = ast.literal_eval('[["gaining 10-15 lbs"]]')
train.loc[9973, 'location'] = ast.literal_eval('[["344 361"]]')

train.loc[10513, 'annotation'] = ast.literal_eval('[["weight gain"], ["gain of 10-16lbs"]]')
train.loc[10513, 'location'] = ast.literal_eval('[["600 611"], ["607 623"]]')

train.loc[11551, 'annotation'] = ast.literal_eval('[["seeing her son knows are not real"]]')
train.loc[11551, 'location'] = ast.literal_eval('[["386 400;443 461"]]')

train.loc[11677, 'annotation'] = ast.literal_eval('[["saw him once in the kitchen after he died"]]')
train.loc[11677, 'location'] = ast.literal_eval('[["160 201"]]')

train.loc[12124, 'annotation'] = ast.literal_eval('[["tried Ambien but it didnt work"]]')
train.loc[12124, 'location'] = ast.literal_eval('[["325 337;349 366"]]')

train.loc[12279, 'annotation'] = ast.literal_eval('[["heard what she described as a party later than evening these things did not actually happen"]]')
train.loc[12279, 'location'] = ast.literal_eval('[["405 459;488 524"]]')

train.loc[12289, 'annotation'] = ast.literal_eval('[["experienced seeing her son at the kitchen table these things did not actually happen"]]')
train.loc[12289, 'location'] = ast.literal_eval('[["353 400;488 524"]]')

train.loc[13238, 'annotation'] = ast.literal_eval('[["SCRACHY THROAT"], ["RUNNY NOSE"]]')
train.loc[13238, 'location'] = ast.literal_eval('[["293 307"], ["321 331"]]')

train.loc[13297, 'annotation'] = ast.literal_eval('[["without improvement when taking tylenol"], ["without improvement when taking ibuprofen"]]')
train.loc[13297, 'location'] = ast.literal_eval('[["182 221"], ["182 213;225 234"]]')

train.loc[13299, 'annotation'] = ast.literal_eval('[["yesterday"], ["yesterday"]]')
train.loc[13299, 'location'] = ast.literal_eval('[["79 88"], ["409 418"]]')

train.loc[13845, 'annotation'] = ast.literal_eval('[["headache global"], ["headache throughout her head"]]')
train.loc[13845, 'location'] = ast.literal_eval('[["86 94;230 236"], ["86 94;237 256"]]')

train.loc[14083, 'annotation'] = ast.literal_eval('[["headache generalized in her head"]]')
train.loc[14083, 'location'] = ast.literal_eval('[["56 64;156 179"]]')

In [12]:
train['annotation_length'] = train['annotation'].apply(len)
display(train['annotation_length'].value_counts())

1    8185
0    4399
2    1292
3     287
4      99
5      27
6       9
7       1
8       1
Name: annotation_length, dtype: int64

# CV split

In [13]:
# ====================================================
# CV split
# ====================================================
Fold = GroupKFold(n_splits=CFG.n_fold)
groups = train['pn_num'].values
for n, (train_index, val_index) in enumerate(Fold.split(train, train['location'], groups)):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    2860
1    2860
2    2860
3    2860
4    2860
dtype: int64

In [14]:
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

# tokenizer

In [15]:
# ====================================================
# tokenizer
# ====================================================
# tokenizer = AutoTokenizer.from_pretrained(CFG.model)
# tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
# CFG.tokenizer = tokenizer

from transformers.models.deberta_v2 import DebertaV2TokenizerFast

tokenizer = DebertaV2TokenizerFast.from_pretrained(CFG.model)
CFG.tokenizer = tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Dataset

In [16]:
# ====================================================
# Define max_len
# ====================================================
for text_col in ['pn_history']:
    pn_history_lengths = []
    tk0 = tqdm(patient_notes[text_col].fillna("").values, total=len(patient_notes))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        pn_history_lengths.append(length)
    LOGGER.info(f'{text_col} max(lengths): {max(pn_history_lengths)}')

for text_col in ['feature_text']:
    features_lengths = []
    tk0 = tqdm(features[text_col].fillna("").values, total=len(features))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        features_lengths.append(length)
    LOGGER.info(f'{text_col} max(lengths): {max(features_lengths)}')

CFG.max_len = max(pn_history_lengths) + max(features_lengths) + 3 # cls & sep & sep
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/42146 [00:00<?, ?it/s]

pn_history max(lengths): 323


  0%|          | 0/143 [00:00<?, ?it/s]

feature_text max(lengths): 28
max_len: 354


In [17]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text, feature_text):
    inputs = cfg.tokenizer(text, feature_text, 
                           add_special_tokens=True,
                           max_length=CFG.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


def create_label(cfg, text, annotation_length, location_list):
    encoded = cfg.tokenizer(text,
                            add_special_tokens=True,
                            max_length=CFG.max_len,
                            padding="max_length",
                            return_offsets_mapping=True)
    offset_mapping = encoded['offset_mapping']
    ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0]
    label = np.zeros(len(offset_mapping))
    label[ignore_idxes] = -1
    if annotation_length != 0:
        for location in location_list:
            for loc in [s.split() for s in location.split(';')]:
                start_idx = -1
                end_idx = -1
                start, end = int(loc[0]), int(loc[1])
                for idx in range(len(offset_mapping)):
                    if (start_idx == -1) & (start < offset_mapping[idx][0]):
                        start_idx = idx - 1
                    if (end_idx == -1) & (end <= offset_mapping[idx][1]):
                        end_idx = idx + 1
                if start_idx == -1:
                    start_idx = end_idx
                if (start_idx != -1) & (end_idx != -1):
                    label[start_idx:end_idx] = 1
    return torch.tensor(label, dtype=torch.float)


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.feature_texts = df['feature_text'].values
        self.pn_historys = df['pn_history'].values
        self.annotation_lengths = df['annotation_length'].values
        self.locations = df['location'].values

    def __len__(self):
        return len(self.feature_texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, 
                               self.pn_historys[item], 
                               self.feature_texts[item])
        label = create_label(self.cfg, 
                             self.pn_historys[item], 
                             self.annotation_lengths[item], 
                             self.locations[item])
        return inputs, label

# Model

In [18]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        return last_hidden_states

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

# Helpler functions

In [21]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [22]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_texts = valid_folds['pn_history'].values
    valid_labels = create_labels_for_scoring(valid_folds)
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler=='linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler=='cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss(reduction="none")
    
    best_score = 0.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        predictions = predictions.reshape((len(valid_folds), CFG.max_len))
        
        # scoring
        char_probs = get_char_probs(valid_texts, predictions, CFG.tokenizer)
        results = get_results(char_probs, th=0.5)
        preds = get_predictions(results)
        score = get_score(valid_labels, preds)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})
        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[i for i in range(CFG.max_len)]] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [23]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = create_labels_for_scoring(oof_df)
        predictions = oof_df[[i for i in range(CFG.max_len)]].values
        char_probs = get_char_probs(oof_df['pn_history'].values, predictions, CFG.tokenizer)
        results = get_results(char_probs, th=0.5)
        preds = get_predictions(results)
        score = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        
    if CFG.wandb:
        wandb.finish()

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/953] Elapsed 0m 2s (remain 46m 57s) Loss: 1.0580(1.0580) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 1m 2s (remain 8m 49s) Loss: 0.0123(0.1062) Grad: 952.1965  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 2m 3s (remain 7m 40s) Loss: 0.0157(0.0644) Grad: 2435.2998  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 3m 3s (remain 6m 38s) Loss: 0.0141(0.0496) Grad: 1368.5153  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 4m 5s (remain 5m 37s) Loss: 0.0411(0.0412) Grad: 4829.8188  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 5m 5s (remain 4m 35s) Loss: 0.0258(0.0364) Grad: 2527.3516  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 6m 4s (remain 3m 33s) Loss: 0.0219(0.0328) Grad: 1419.6333  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 7m 4s (remain 2m 32s) Loss: 0.0040(0.0301) Grad: 394.7768  LR: 0.00001895  
Epoch: [1][800/953] Elapsed 8m 4s (remain 1m 32s) Loss: 0.0162(0.0281) Grad: 2020.5662  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 9m 5s (remain 0m 31s) Loss: 0.0061(0

Epoch 1 - avg_train_loss: 0.0260  avg_val_loss: 0.0116  time: 639s
Epoch 1 - Score: 0.8657
Epoch 1 - Save Best Score: 0.8657 Model


Epoch: [2][0/953] Elapsed 0m 0s (remain 15m 0s) Loss: 0.0087(0.0087) Grad: 10986.2256  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 1m 0s (remain 8m 30s) Loss: 0.0315(0.0108) Grad: 25046.6289  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 1m 59s (remain 7m 28s) Loss: 0.0072(0.0104) Grad: 6624.9873  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 2m 59s (remain 6m 29s) Loss: 0.0085(0.0100) Grad: 9646.6992  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 3m 59s (remain 5m 29s) Loss: 0.0106(0.0098) Grad: 14899.5410  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 4m 59s (remain 4m 29s) Loss: 0.0140(0.0095) Grad: 16617.5078  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 5m 59s (remain 3m 30s) Loss: 0.0028(0.0095) Grad: 4316.6265  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 7m 0s (remain 2m 31s) Loss: 0.0059(0.0094) Grad: 5627.5903  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 8m 1s (remain 1m 31s) Loss: 0.0065(0.0093) Grad: 13237.3896  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 9m 1s (remain 0m 31

Epoch 2 - avg_train_loss: 0.0092  avg_val_loss: 0.0114  time: 634s
Epoch 2 - Score: 0.8796
Epoch 2 - Save Best Score: 0.8796 Model


Epoch: [3][0/953] Elapsed 0m 1s (remain 16m 18s) Loss: 0.0021(0.0021) Grad: 6427.7129  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 1m 1s (remain 8m 37s) Loss: 0.0048(0.0086) Grad: 11697.7881  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 2m 1s (remain 7m 34s) Loss: 0.0059(0.0086) Grad: 21192.3984  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 3m 1s (remain 6m 33s) Loss: 0.0173(0.0084) Grad: 24573.5156  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 4m 1s (remain 5m 32s) Loss: 0.0019(0.0081) Grad: 5197.6636  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 5m 1s (remain 4m 31s) Loss: 0.0012(0.0079) Grad: 2743.8250  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 6m 2s (remain 3m 32s) Loss: 0.0039(0.0080) Grad: 6201.5088  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 7m 2s (remain 2m 31s) Loss: 0.0154(0.0081) Grad: 24646.7461  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 8m 4s (remain 1m 31s) Loss: 0.0091(0.0081) Grad: 17062.3730  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 9m 4s (remain 0m 31s) L

Epoch 3 - avg_train_loss: 0.0080  avg_val_loss: 0.0114  time: 637s
Epoch 3 - Score: 0.8835
Epoch 3 - Save Best Score: 0.8835 Model


Epoch: [4][0/953] Elapsed 0m 1s (remain 16m 25s) Loss: 0.0078(0.0078) Grad: 12653.4395  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 1m 0s (remain 8m 32s) Loss: 0.0077(0.0069) Grad: 11684.5918  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 2m 1s (remain 7m 32s) Loss: 0.0240(0.0070) Grad: 35412.1914  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 3m 0s (remain 6m 31s) Loss: 0.0077(0.0069) Grad: 22152.0938  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 4m 0s (remain 5m 31s) Loss: 0.0173(0.0070) Grad: 26519.7910  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 5m 0s (remain 4m 31s) Loss: 0.0018(0.0069) Grad: 9260.1230  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 6m 0s (remain 3m 31s) Loss: 0.0102(0.0070) Grad: 16802.7520  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 7m 0s (remain 2m 31s) Loss: 0.0093(0.0071) Grad: 8062.0547  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 8m 0s (remain 1m 31s) Loss: 0.0017(0.0072) Grad: 2747.2168  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 9m 0s (remain 0m 31s) 

Epoch 4 - avg_train_loss: 0.0071  avg_val_loss: 0.0117  time: 633s
Epoch 4 - Score: 0.8813


Epoch: [5][0/953] Elapsed 0m 0s (remain 15m 42s) Loss: 0.0042(0.0042) Grad: 8971.9736  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 1m 1s (remain 8m 35s) Loss: 0.0114(0.0064) Grad: 17601.0684  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 2m 0s (remain 7m 31s) Loss: 0.0087(0.0066) Grad: 42419.6484  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 3m 0s (remain 6m 30s) Loss: 0.0019(0.0066) Grad: 5340.3481  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 4m 0s (remain 5m 30s) Loss: 0.0013(0.0066) Grad: 4273.7358  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 5m 0s (remain 4m 31s) Loss: 0.0021(0.0067) Grad: 6060.6943  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 6m 0s (remain 3m 31s) Loss: 0.0041(0.0066) Grad: 10353.0898  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 7m 0s (remain 2m 31s) Loss: 0.0037(0.0065) Grad: 6125.2715  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 8m 0s (remain 1m 31s) Loss: 0.0018(0.0067) Grad: 3899.0017  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 9m 1s (remain 0m 31s) Los

Epoch 5 - avg_train_loss: 0.0066  avg_val_loss: 0.0119  time: 636s
Epoch 5 - Score: 0.8824
Score: 0.8835
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializin

Epoch: [1][0/953] Elapsed 0m 0s (remain 14m 58s) Loss: 0.7165(0.7165) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 1m 1s (remain 8m 36s) Loss: 0.0151(0.0811) Grad: 1298.9277  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 2m 1s (remain 7m 34s) Loss: 0.0221(0.0511) Grad: 2552.1089  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 3m 2s (remain 6m 34s) Loss: 0.0159(0.0406) Grad: 945.3980  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 4m 2s (remain 5m 33s) Loss: 0.0108(0.0344) Grad: 1112.3580  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 5m 2s (remain 4m 32s) Loss: 0.0212(0.0305) Grad: 2789.7830  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 6m 1s (remain 3m 31s) Loss: 0.0124(0.0279) Grad: 785.3531  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 7m 1s (remain 2m 31s) Loss: 0.0228(0.0260) Grad: 1465.6504  LR: 0.00001895  
Epoch: [1][800/953] Elapsed 8m 1s (remain 1m 31s) Loss: 0.0235(0.0247) Grad: 2100.3743  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 9m 1s (remain 0m 31s) Loss: 0.0094(0

Epoch 1 - avg_train_loss: 0.0229  avg_val_loss: 0.0141  time: 634s
Epoch 1 - Score: 0.8561
Epoch 1 - Save Best Score: 0.8561 Model


Epoch: [2][0/953] Elapsed 0m 1s (remain 17m 34s) Loss: 0.0040(0.0040) Grad: 7118.7593  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 1m 1s (remain 8m 37s) Loss: 0.0051(0.0117) Grad: 11191.4141  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 2m 1s (remain 7m 35s) Loss: 0.0039(0.0109) Grad: 7080.8311  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 3m 1s (remain 6m 33s) Loss: 0.0082(0.0103) Grad: 9758.7246  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 4m 1s (remain 5m 32s) Loss: 0.0087(0.0100) Grad: 11663.5771  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 5m 1s (remain 4m 31s) Loss: 0.0095(0.0097) Grad: 20234.1680  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 6m 1s (remain 3m 31s) Loss: 0.0024(0.0095) Grad: 6611.9360  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 7m 1s (remain 2m 31s) Loss: 0.0036(0.0095) Grad: 12851.8799  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 8m 1s (remain 1m 31s) Loss: 0.0054(0.0092) Grad: 7048.3276  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 9m 1s (remain 0m 31s) Lo

Epoch 2 - avg_train_loss: 0.0092  avg_val_loss: 0.0118  time: 634s
Epoch 2 - Score: 0.8768
Epoch 2 - Save Best Score: 0.8768 Model


Epoch: [3][0/953] Elapsed 0m 1s (remain 16m 5s) Loss: 0.0096(0.0096) Grad: 16526.8145  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 1m 0s (remain 8m 30s) Loss: 0.0053(0.0078) Grad: 20915.7969  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 2m 1s (remain 7m 33s) Loss: 0.0081(0.0075) Grad: 24603.4355  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 3m 0s (remain 6m 31s) Loss: 0.0028(0.0075) Grad: 7928.0947  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 4m 1s (remain 5m 31s) Loss: 0.0011(0.0074) Grad: 4870.8975  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 5m 1s (remain 4m 31s) Loss: 0.0253(0.0076) Grad: 29441.5117  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 6m 1s (remain 3m 31s) Loss: 0.0018(0.0076) Grad: 18591.8223  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 7m 1s (remain 2m 31s) Loss: 0.0017(0.0076) Grad: 3412.5107  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 8m 1s (remain 1m 31s) Loss: 0.0038(0.0075) Grad: 7724.5913  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 9m 1s (remain 0m 31s) Lo

Epoch 3 - avg_train_loss: 0.0075  avg_val_loss: 0.0122  time: 636s
Epoch 3 - Score: 0.8802
Epoch 3 - Save Best Score: 0.8802 Model


Epoch: [4][0/953] Elapsed 0m 1s (remain 17m 37s) Loss: 0.0035(0.0035) Grad: 4935.8760  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 1m 0s (remain 8m 32s) Loss: 0.0107(0.0058) Grad: 47331.1172  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 2m 0s (remain 7m 30s) Loss: 0.0086(0.0062) Grad: 21291.1172  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 3m 0s (remain 6m 30s) Loss: 0.0010(0.0063) Grad: 4299.8447  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 4m 0s (remain 5m 30s) Loss: 0.0033(0.0062) Grad: 6572.7480  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 5m 0s (remain 4m 30s) Loss: 0.0023(0.0062) Grad: 5177.1997  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 6m 0s (remain 3m 31s) Loss: 0.0015(0.0063) Grad: 3115.1162  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 7m 0s (remain 2m 31s) Loss: 0.0342(0.0064) Grad: 63588.7070  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 8m 0s (remain 1m 31s) Loss: 0.0043(0.0065) Grad: 6438.6201  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 9m 1s (remain 0m 31s) Los

Epoch 4 - avg_train_loss: 0.0066  avg_val_loss: 0.0125  time: 636s
Epoch 4 - Score: 0.8802


Epoch: [5][0/953] Elapsed 0m 1s (remain 16m 21s) Loss: 0.0012(0.0012) Grad: 2846.8789  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 1m 1s (remain 8m 39s) Loss: 0.0022(0.0067) Grad: 5613.2612  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 2m 1s (remain 7m 36s) Loss: 0.0109(0.0064) Grad: 12277.4395  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 3m 3s (remain 6m 37s) Loss: 0.0115(0.0062) Grad: 14290.9883  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 4m 4s (remain 5m 36s) Loss: 0.0145(0.0061) Grad: 52948.1758  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 5m 6s (remain 4m 36s) Loss: 0.0052(0.0059) Grad: 9359.4092  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 6m 6s (remain 3m 34s) Loss: 0.0032(0.0058) Grad: 7044.5972  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 7m 6s (remain 2m 33s) Loss: 0.0092(0.0059) Grad: 14462.8486  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 8m 7s (remain 1m 32s) Loss: 0.0005(0.0059) Grad: 2038.7532  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 9m 7s (remain 0m 31s) Lo

Epoch 5 - avg_train_loss: 0.0060  avg_val_loss: 0.0130  time: 641s
Epoch 5 - Score: 0.8794
Score: 0.8802
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializin

Epoch: [1][0/953] Elapsed 0m 0s (remain 15m 30s) Loss: 0.8917(0.8917) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 1m 1s (remain 8m 36s) Loss: 0.0099(0.0923) Grad: 851.4504  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 2m 1s (remain 7m 33s) Loss: 0.0108(0.0585) Grad: 802.0851  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 3m 0s (remain 6m 31s) Loss: 0.0232(0.0449) Grad: 1755.0813  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 4m 0s (remain 5m 31s) Loss: 0.0242(0.0378) Grad: 2670.8279  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 4m 59s (remain 4m 30s) Loss: 0.0224(0.0336) Grad: 1646.8302  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 6m 0s (remain 3m 30s) Loss: 0.0119(0.0304) Grad: 934.1456  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 6m 59s (remain 2m 30s) Loss: 0.0022(0.0280) Grad: 156.7932  LR: 0.00001895  
Epoch: [1][800/953] Elapsed 8m 0s (remain 1m 31s) Loss: 0.0054(0.0263) Grad: 895.5369  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 8m 59s (remain 0m 31s) Loss: 0.0095(0

Epoch 1 - avg_train_loss: 0.0245  avg_val_loss: 0.0123  time: 633s
Epoch 1 - Score: 0.8633
Epoch 1 - Save Best Score: 0.8633 Model


Epoch: [2][0/953] Elapsed 0m 1s (remain 18m 6s) Loss: 0.0150(0.0150) Grad: 11923.0752  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 1m 0s (remain 8m 33s) Loss: 0.0033(0.0111) Grad: 5057.2769  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 2m 1s (remain 7m 33s) Loss: 0.0032(0.0098) Grad: 5793.0684  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 3m 0s (remain 6m 31s) Loss: 0.0067(0.0096) Grad: 17109.2773  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 4m 0s (remain 5m 30s) Loss: 0.0043(0.0093) Grad: 7119.6729  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 5m 0s (remain 4m 31s) Loss: 0.0094(0.0090) Grad: 19894.4043  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 6m 0s (remain 3m 31s) Loss: 0.0107(0.0088) Grad: 21512.7305  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 7m 0s (remain 2m 31s) Loss: 0.0090(0.0088) Grad: 14979.6504  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 8m 0s (remain 1m 31s) Loss: 0.0111(0.0090) Grad: 21258.5195  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 9m 1s (remain 0m 31s) L

Epoch 2 - avg_train_loss: 0.0089  avg_val_loss: 0.0115  time: 634s
Epoch 2 - Score: 0.8778
Epoch 2 - Save Best Score: 0.8778 Model


Epoch: [3][0/953] Elapsed 0m 1s (remain 16m 35s) Loss: 0.0041(0.0041) Grad: 6886.4146  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 1m 0s (remain 8m 32s) Loss: 0.0323(0.0081) Grad: 32391.1172  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 2m 0s (remain 7m 29s) Loss: 0.0071(0.0076) Grad: 13324.8008  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 3m 0s (remain 6m 31s) Loss: 0.0018(0.0077) Grad: 3604.1040  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 4m 1s (remain 5m 31s) Loss: 0.0107(0.0075) Grad: 16646.6641  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 5m 1s (remain 4m 32s) Loss: 0.0090(0.0073) Grad: 17367.0078  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 6m 1s (remain 3m 31s) Loss: 0.0028(0.0075) Grad: 6519.6714  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 7m 3s (remain 2m 32s) Loss: 0.0030(0.0076) Grad: 11224.0371  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 8m 3s (remain 1m 31s) Loss: 0.0053(0.0074) Grad: 6833.1060  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 9m 3s (remain 0m 31s) L

Epoch 3 - avg_train_loss: 0.0073  avg_val_loss: 0.0122  time: 637s
Epoch 3 - Score: 0.8796
Epoch 3 - Save Best Score: 0.8796 Model


Epoch: [4][0/953] Elapsed 0m 1s (remain 16m 17s) Loss: 0.0023(0.0023) Grad: 3816.3750  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 1m 0s (remain 8m 31s) Loss: 0.0024(0.0060) Grad: 6084.6421  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 2m 0s (remain 7m 30s) Loss: 0.0059(0.0063) Grad: 10298.4492  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 2m 59s (remain 6m 29s) Loss: 0.0124(0.0063) Grad: 15498.3213  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 3m 59s (remain 5m 30s) Loss: 0.0062(0.0065) Grad: 8510.2432  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 5m 0s (remain 4m 30s) Loss: 0.0042(0.0066) Grad: 13146.5996  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 6m 0s (remain 3m 30s) Loss: 0.0004(0.0065) Grad: 1342.2178  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 7m 0s (remain 2m 31s) Loss: 0.0086(0.0065) Grad: 20002.9238  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 8m 0s (remain 1m 31s) Loss: 0.0099(0.0065) Grad: 19325.9668  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 9m 1s (remain 0m 31s)

Epoch 4 - avg_train_loss: 0.0066  avg_val_loss: 0.0121  time: 636s
Epoch 4 - Score: 0.8797
Epoch 4 - Save Best Score: 0.8797 Model


Epoch: [5][0/953] Elapsed 0m 1s (remain 16m 17s) Loss: 0.0091(0.0091) Grad: 5794.3564  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 1m 0s (remain 8m 34s) Loss: 0.0036(0.0064) Grad: 7647.2710  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 2m 2s (remain 7m 37s) Loss: 0.0017(0.0063) Grad: 6242.7622  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 3m 1s (remain 6m 33s) Loss: 0.0008(0.0061) Grad: 3649.8931  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 4m 2s (remain 5m 33s) Loss: 0.0028(0.0059) Grad: 8802.1611  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 5m 2s (remain 4m 32s) Loss: 0.0010(0.0058) Grad: 4243.6865  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 6m 1s (remain 3m 31s) Loss: 0.0035(0.0057) Grad: 9667.8184  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 7m 1s (remain 2m 31s) Loss: 0.0096(0.0059) Grad: 20081.2051  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 8m 1s (remain 1m 31s) Loss: 0.0040(0.0058) Grad: 7078.5762  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 9m 1s (remain 0m 31s) Loss:

Epoch 5 - avg_train_loss: 0.0060  avg_val_loss: 0.0127  time: 633s
Epoch 5 - Score: 0.8804
Epoch 5 - Save Best Score: 0.8804 Model
Score: 0.8804
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you exp

Epoch: [1][0/953] Elapsed 0m 0s (remain 15m 2s) Loss: 0.5206(0.5206) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 1m 0s (remain 8m 29s) Loss: 0.0195(0.0636) Grad: 8995.1758  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 2m 1s (remain 7m 33s) Loss: 0.0277(0.0423) Grad: 6795.4326  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 3m 0s (remain 6m 31s) Loss: 0.0213(0.0348) Grad: 2001.4686  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 4m 0s (remain 5m 31s) Loss: 0.0194(0.0305) Grad: 1806.4292  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 5m 1s (remain 4m 32s) Loss: 0.0109(0.0274) Grad: 2179.0889  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 6m 1s (remain 3m 31s) Loss: 0.0079(0.0254) Grad: 2192.6482  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 7m 0s (remain 2m 31s) Loss: 0.0441(0.0238) Grad: 4956.6313  LR: 0.00001895  
Epoch: [1][800/953] Elapsed 8m 0s (remain 1m 31s) Loss: 0.0151(0.0226) Grad: 4621.0557  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 9m 0s (remain 0m 31s) Loss: 0.0063(

Epoch 1 - avg_train_loss: 0.0211  avg_val_loss: 0.0127  time: 634s
Epoch 1 - Score: 0.8534
Epoch 1 - Save Best Score: 0.8534 Model


Epoch: [2][0/953] Elapsed 0m 1s (remain 16m 55s) Loss: 0.0171(0.0171) Grad: 13813.7500  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 1m 0s (remain 8m 34s) Loss: 0.0010(0.0094) Grad: 4465.3467  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 2m 1s (remain 7m 33s) Loss: 0.0244(0.0099) Grad: 18217.4629  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 3m 0s (remain 6m 31s) Loss: 0.0022(0.0095) Grad: 4729.8657  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 4m 1s (remain 5m 32s) Loss: 0.0182(0.0091) Grad: 23289.9941  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 5m 1s (remain 4m 32s) Loss: 0.0122(0.0091) Grad: 49046.8828  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 6m 2s (remain 3m 32s) Loss: 0.0030(0.0091) Grad: 5912.8306  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 7m 1s (remain 2m 31s) Loss: 0.0101(0.0092) Grad: 30292.8672  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 8m 1s (remain 1m 31s) Loss: 0.0255(0.0091) Grad: 25780.8223  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 9m 1s (remain 0m 31s) 

Epoch 2 - avg_train_loss: 0.0090  avg_val_loss: 0.0118  time: 635s
Epoch 2 - Score: 0.8759
Epoch 2 - Save Best Score: 0.8759 Model


Epoch: [3][0/953] Elapsed 0m 1s (remain 16m 14s) Loss: 0.0093(0.0093) Grad: 17791.8945  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 1m 2s (remain 8m 43s) Loss: 0.0007(0.0066) Grad: 4061.2236  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 2m 1s (remain 7m 33s) Loss: 0.0025(0.0067) Grad: 3261.8840  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 3m 1s (remain 6m 32s) Loss: 0.0218(0.0070) Grad: 34951.5234  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 4m 0s (remain 5m 30s) Loss: 0.0135(0.0073) Grad: 15765.5186  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 4m 59s (remain 4m 30s) Loss: 0.0034(0.0073) Grad: 23315.2344  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 5m 59s (remain 3m 30s) Loss: 0.0104(0.0071) Grad: 18850.5215  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 6m 59s (remain 2m 30s) Loss: 0.0042(0.0071) Grad: 7979.5762  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 7m 59s (remain 1m 31s) Loss: 0.0210(0.0072) Grad: 33283.8516  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 8m 59s (remain 0m 

Epoch 3 - avg_train_loss: 0.0072  avg_val_loss: 0.0126  time: 632s
Epoch 3 - Score: 0.8780
Epoch 3 - Save Best Score: 0.8780 Model


Epoch: [4][0/953] Elapsed 0m 1s (remain 16m 31s) Loss: 0.0045(0.0045) Grad: 13154.8008  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 1m 1s (remain 8m 37s) Loss: 0.0062(0.0066) Grad: 7350.6050  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 2m 0s (remain 7m 31s) Loss: 0.0034(0.0064) Grad: 7245.1445  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 3m 1s (remain 6m 32s) Loss: 0.0117(0.0063) Grad: 56522.4219  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 4m 1s (remain 5m 32s) Loss: 0.0024(0.0061) Grad: 5737.6841  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 5m 1s (remain 4m 32s) Loss: 0.0008(0.0060) Grad: 2639.5081  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 6m 1s (remain 3m 31s) Loss: 0.0028(0.0060) Grad: 5638.7534  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 7m 1s (remain 2m 31s) Loss: 0.0018(0.0060) Grad: 8634.5127  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 8m 1s (remain 1m 31s) Loss: 0.0097(0.0061) Grad: 10833.2666  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 9m 2s (remain 0m 31s) Los

Epoch 4 - avg_train_loss: 0.0061  avg_val_loss: 0.0129  time: 636s
Epoch 4 - Score: 0.8811
Epoch 4 - Save Best Score: 0.8811 Model


Epoch: [5][0/953] Elapsed 0m 1s (remain 16m 47s) Loss: 0.0033(0.0033) Grad: 10086.5127  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 1m 1s (remain 8m 41s) Loss: 0.0012(0.0049) Grad: 4590.3091  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 2m 1s (remain 7m 35s) Loss: 0.0087(0.0051) Grad: 26499.8809  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 3m 2s (remain 6m 35s) Loss: 0.0034(0.0053) Grad: 8538.2373  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 4m 2s (remain 5m 34s) Loss: 0.0074(0.0054) Grad: 65545.6484  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 5m 2s (remain 4m 32s) Loss: 0.0053(0.0053) Grad: 43801.7891  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 6m 1s (remain 3m 32s) Loss: 0.0048(0.0052) Grad: 11387.3633  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 7m 3s (remain 2m 32s) Loss: 0.0080(0.0053) Grad: 14116.3809  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 8m 3s (remain 1m 31s) Loss: 0.0082(0.0053) Grad: 17640.7246  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 9m 3s (remain 0m 31s)

Epoch 5 - avg_train_loss: 0.0054  avg_val_loss: 0.0133  time: 637s
Epoch 5 - Score: 0.8801
Score: 0.8811
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializin

Epoch: [1][0/953] Elapsed 0m 0s (remain 15m 47s) Loss: 0.8846(0.8846) Grad: inf  LR: 0.00002000  
Epoch: [1][100/953] Elapsed 1m 0s (remain 8m 28s) Loss: 0.0156(0.0940) Grad: 3930.2397  LR: 0.00001998  
Epoch: [1][200/953] Elapsed 1m 59s (remain 7m 27s) Loss: 0.0052(0.0577) Grad: 773.4874  LR: 0.00001991  
Epoch: [1][300/953] Elapsed 2m 59s (remain 6m 29s) Loss: 0.0085(0.0449) Grad: 880.3503  LR: 0.00001980  
Epoch: [1][400/953] Elapsed 3m 59s (remain 5m 29s) Loss: 0.0210(0.0382) Grad: 2488.8479  LR: 0.00001965  
Epoch: [1][500/953] Elapsed 4m 59s (remain 4m 30s) Loss: 0.0194(0.0338) Grad: 1928.1050  LR: 0.00001946  
Epoch: [1][600/953] Elapsed 5m 59s (remain 3m 30s) Loss: 0.0432(0.0308) Grad: 2284.2219  LR: 0.00001923  
Epoch: [1][700/953] Elapsed 7m 0s (remain 2m 31s) Loss: 0.0032(0.0285) Grad: 720.7268  LR: 0.00001895  
Epoch: [1][800/953] Elapsed 8m 1s (remain 1m 31s) Loss: 0.0054(0.0268) Grad: 409.4217  LR: 0.00001864  
Epoch: [1][900/953] Elapsed 9m 1s (remain 0m 31s) Loss: 0.014

Epoch 1 - avg_train_loss: 0.0246  avg_val_loss: 0.0128  time: 635s
Epoch 1 - Score: 0.8582
Epoch 1 - Save Best Score: 0.8582 Model


Epoch: [2][0/953] Elapsed 0m 1s (remain 16m 32s) Loss: 0.0032(0.0032) Grad: 6604.6196  LR: 0.00001809  
Epoch: [2][100/953] Elapsed 1m 0s (remain 8m 27s) Loss: 0.0023(0.0104) Grad: 10047.0469  LR: 0.00001768  
Epoch: [2][200/953] Elapsed 2m 0s (remain 7m 32s) Loss: 0.0052(0.0099) Grad: 9559.6230  LR: 0.00001724  
Epoch: [2][300/953] Elapsed 3m 0s (remain 6m 31s) Loss: 0.0156(0.0100) Grad: 31164.1113  LR: 0.00001677  
Epoch: [2][400/953] Elapsed 4m 1s (remain 5m 32s) Loss: 0.0072(0.0102) Grad: 12340.3438  LR: 0.00001627  
Epoch: [2][500/953] Elapsed 5m 2s (remain 4m 32s) Loss: 0.0068(0.0098) Grad: 9161.6738  LR: 0.00001575  
Epoch: [2][600/953] Elapsed 6m 2s (remain 3m 32s) Loss: 0.0121(0.0097) Grad: 19095.5996  LR: 0.00001520  
Epoch: [2][700/953] Elapsed 7m 2s (remain 2m 31s) Loss: 0.0137(0.0096) Grad: 23316.5508  LR: 0.00001462  
Epoch: [2][800/953] Elapsed 8m 2s (remain 1m 31s) Loss: 0.0123(0.0094) Grad: 11843.2930  LR: 0.00001403  
Epoch: [2][900/953] Elapsed 9m 1s (remain 0m 31s) 

Epoch 2 - avg_train_loss: 0.0093  avg_val_loss: 0.0123  time: 633s
Epoch 2 - Score: 0.8763
Epoch 2 - Save Best Score: 0.8763 Model


Epoch: [3][0/953] Elapsed 0m 1s (remain 17m 11s) Loss: 0.0067(0.0067) Grad: 18944.6641  LR: 0.00001309  
Epoch: [3][100/953] Elapsed 1m 0s (remain 8m 29s) Loss: 0.0026(0.0080) Grad: 4555.4990  LR: 0.00001245  
Epoch: [3][200/953] Elapsed 1m 59s (remain 7m 28s) Loss: 0.0247(0.0086) Grad: 52524.2266  LR: 0.00001181  
Epoch: [3][300/953] Elapsed 3m 0s (remain 6m 31s) Loss: 0.0155(0.0083) Grad: 13800.7803  LR: 0.00001116  
Epoch: [3][400/953] Elapsed 4m 1s (remain 5m 32s) Loss: 0.0168(0.0079) Grad: 48335.7344  LR: 0.00001050  
Epoch: [3][500/953] Elapsed 5m 1s (remain 4m 32s) Loss: 0.0007(0.0079) Grad: 3103.0017  LR: 0.00000984  
Epoch: [3][600/953] Elapsed 6m 2s (remain 3m 32s) Loss: 0.0055(0.0076) Grad: 16534.7773  LR: 0.00000918  
Epoch: [3][700/953] Elapsed 7m 3s (remain 2m 32s) Loss: 0.0036(0.0078) Grad: 13663.6240  LR: 0.00000853  
Epoch: [3][800/953] Elapsed 8m 3s (remain 1m 31s) Loss: 0.0115(0.0078) Grad: 16046.0264  LR: 0.00000788  
Epoch: [3][900/953] Elapsed 9m 5s (remain 0m 31s

Epoch 3 - avg_train_loss: 0.0076  avg_val_loss: 0.0122  time: 639s
Epoch 3 - Score: 0.8797
Epoch 3 - Save Best Score: 0.8797 Model


Epoch: [4][0/953] Elapsed 0m 1s (remain 18m 24s) Loss: 0.0535(0.0535) Grad: 81679.1406  LR: 0.00000691  
Epoch: [4][100/953] Elapsed 1m 1s (remain 8m 38s) Loss: 0.0030(0.0060) Grad: 7613.2305  LR: 0.00000629  
Epoch: [4][200/953] Elapsed 2m 1s (remain 7m 36s) Loss: 0.0062(0.0065) Grad: 10037.9980  LR: 0.00000568  
Epoch: [4][300/953] Elapsed 3m 2s (remain 6m 35s) Loss: 0.0043(0.0066) Grad: 28400.6230  LR: 0.00000510  
Epoch: [4][400/953] Elapsed 4m 2s (remain 5m 33s) Loss: 0.0004(0.0065) Grad: 1295.3612  LR: 0.00000454  
Epoch: [4][500/953] Elapsed 5m 2s (remain 4m 33s) Loss: 0.0058(0.0066) Grad: 16184.3135  LR: 0.00000400  
Epoch: [4][600/953] Elapsed 6m 3s (remain 3m 32s) Loss: 0.0009(0.0067) Grad: 4207.6255  LR: 0.00000348  
Epoch: [4][700/953] Elapsed 7m 2s (remain 2m 32s) Loss: 0.0077(0.0067) Grad: 31194.9824  LR: 0.00000300  
Epoch: [4][800/953] Elapsed 8m 3s (remain 1m 31s) Loss: 0.0005(0.0067) Grad: 1742.8903  LR: 0.00000254  
Epoch: [4][900/953] Elapsed 9m 3s (remain 0m 31s) L

Epoch 4 - avg_train_loss: 0.0067  avg_val_loss: 0.0125  time: 636s
Epoch 4 - Score: 0.8824
Epoch 4 - Save Best Score: 0.8824 Model


Epoch: [5][0/953] Elapsed 0m 1s (remain 16m 25s) Loss: 0.0082(0.0082) Grad: 16681.7715  LR: 0.00000191  
Epoch: [5][100/953] Elapsed 1m 0s (remain 8m 33s) Loss: 0.0082(0.0062) Grad: 19569.7773  LR: 0.00000154  
Epoch: [5][200/953] Elapsed 2m 1s (remain 7m 33s) Loss: 0.0026(0.0063) Grad: 5823.0142  LR: 0.00000121  
Epoch: [5][300/953] Elapsed 3m 0s (remain 6m 31s) Loss: 0.0107(0.0063) Grad: 15956.3389  LR: 0.00000091  
Epoch: [5][400/953] Elapsed 4m 1s (remain 5m 32s) Loss: 0.0049(0.0062) Grad: 12541.1885  LR: 0.00000066  
Epoch: [5][500/953] Elapsed 5m 2s (remain 4m 32s) Loss: 0.0026(0.0061) Grad: 8158.0586  LR: 0.00000044  
Epoch: [5][600/953] Elapsed 6m 2s (remain 3m 32s) Loss: 0.0063(0.0061) Grad: 17582.1113  LR: 0.00000027  
Epoch: [5][700/953] Elapsed 7m 2s (remain 2m 31s) Loss: 0.0033(0.0061) Grad: 10109.5088  LR: 0.00000014  
Epoch: [5][800/953] Elapsed 8m 2s (remain 1m 31s) Loss: 0.0029(0.0061) Grad: 16548.8086  LR: 0.00000005  
Epoch: [5][900/953] Elapsed 9m 3s (remain 0m 31s)

Epoch 5 - avg_train_loss: 0.0060  avg_val_loss: 0.0127  time: 636s
Epoch 5 - Score: 0.8823
Score: 0.8824
Score: 0.8815





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
[fold0] avg_train_loss,█▂▂▁▁
[fold0] avg_val_loss,▄▁▁▆█
[fold0] epoch,▁▃▅▆█
[fold0] loss,▄▃▅▅▄▃▄▆▂▂▂▁▂▂▂▂▂▂▃▂█▁▂▃▂▁▂▁▂▁▃▂▂▂▁▂▁▂▂▁
[fold0] lr,███████▇▇▇▇▇▆▆▆▆▆▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
[fold0] score,▁▆█▇█
[fold1] avg_train_loss,█▂▂▁▁
[fold1] avg_val_loss,█▁▂▃▅
[fold1] epoch,▁▃▅▆█
[fold1] loss,▄▃▃█▄▂▄▂▃▆▂▄▂▁▃▆▂▂▂▂▂▂▁▁▃▂▁▁▂▅▂▃▃▁▁▃▂▃▂▅

0,1
[fold0] avg_train_loss,0.00659
[fold0] avg_val_loss,0.01186
[fold0] epoch,5.0
[fold0] loss,0.00322
[fold0] lr,0.0
[fold0] score,0.88242
[fold1] avg_train_loss,0.00597
[fold1] avg_val_loss,0.01296
[fold1] epoch,5.0
[fold1] loss,0.00389
