# Directory settings

In [1]:
# ====================================================
# Directory settings
# ====================================================

import sys
if "google.colab" in sys.modules:
    from google.colab import drive
    drive.mount("/content/drive")
    base = "/content/drive/MyDrive/colab_notebooks/kaggle/nbme-score-clinical-patient-notes/notebooks"
    %cd {base}


import os
if 'kaggle_web_client' in sys.modules:
    OUTPUT_DIR = './'
else:
    OUTPUT_DIR = './nb001t-token-classifier/'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/colab_notebooks/kaggle/nbme-score-clinical-patient-notes/notebooks


# CFG

In [2]:
# ====================================================
# CFG
# ====================================================
class CFG:
    pseudo=True
    wandb=True
    wandbgroup="nb001t-token-classifier"
    wandbname="case-num-0"
    cv_case_num=False
    competition='NBME'
    _wandb_kernel='riow1983'
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model="microsoft/deberta-base"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=5
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=12
    fc_dropout=0.2
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    train=True
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]
    CFG.wandbname = "debug-" + CFG.wandbname

In [3]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:
    if 'google.colab' in sys.modules:
        !pip install wandb
    import wandb

    try:
        if 'kaggle_web_client' in sys.modules:
            from kaggle_secrets import UserSecretsClient
            user_secrets = UserSecretsClient()
            secret_value_0 = user_secrets.get_secret("wandb_api")
        else:
            import json
            f = open("../../wandb.json", "r")
            json_data = json.load(f)
            secret_value_0 = json_data["wandb_api"]
        wandb.login(key=secret_value_0)
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(dir=OUTPUT_DIR,
                    project='NBME-Public', 
                    name=CFG.wandbname,
                    config=class2dict(CFG),
                    group=CFG.wandbgroup,
                    job_type="train",
                    anonymous=anony)
    print(f"wandb run id: {run.id}")



[34m[1mwandb[0m: Currently logged in as: [33mriow1983[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


wandb run id: hdu15a3r


# Library

In [4]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

os.system('pip uninstall -y transformers')
os.system('python -m pip install --no-index --find-links=../input/nbme-pip-wheels transformers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.11.0
transformers.__version__: 4.16.2
env: TOKENIZERS_PARALLELISM=true


# Helper functions for scoring

In [5]:
# From https://www.kaggle.com/theoviel/evaluation-metric-folds-baseline

def micro_f1(preds, truths):
    """
    Micro f1 on binary arrays.

    Args:
        preds (list of lists of ints): Predictions.
        truths (list of lists of ints): Ground truths.

    Returns:
        float: f1 score.
    """
    # Micro : aggregating over all instances
    preds = np.concatenate(preds)
    truths = np.concatenate(truths)
    return f1_score(truths, preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans.

    Returns:
        np array [length]: Binarized spans.
    """
    length = np.max(spans) if length is None else length
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 on spans.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds = []
    bin_truths = []
    for pred, truth in zip(preds, truths):
        if not len(pred) and not len(truth):
            continue
        length = max(np.max(pred) if len(pred) else 0, np.max(truth) if len(truth) else 0)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)

In [6]:
def create_labels_for_scoring(df):
    # example: ['0 1', '3 4'] -> ['0 1; 3 4']
    df['location_for_create_labels'] = [ast.literal_eval(f'[]')] * len(df)
    for i in range(len(df)):
        lst = df.loc[i, 'location']
        if lst:
            new_lst = ';'.join(lst)
            df.loc[i, 'location_for_create_labels'] = ast.literal_eval(f'[["{new_lst}"]]')
    # create labels
    truths = []
    for location_list in df['location_for_create_labels'].values:
        truth = []
        if len(location_list) > 0:
            location = location_list[0]
            for loc in [s.split() for s in location.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)
    return truths


def get_char_probs(texts, predictions, tokenizer):
    results = [np.zeros(len(t)) for t in texts]
    for i, (text, prediction) in enumerate(zip(texts, predictions)):
        encoded = tokenizer(text, 
                            add_special_tokens=True,
                            return_offsets_mapping=True)
        for idx, (offset_mapping, pred) in enumerate(zip(encoded['offset_mapping'], prediction)):
            start = offset_mapping[0]
            end = offset_mapping[1]
            results[i][start:end] = pred
    return results


def get_results(char_probs, th=0.5):
    results = []
    for char_prob in char_probs:
        result = np.where(char_prob >= th)[0] + 1
        result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
        result = [f"{min(r)} {max(r)}" for r in result]
        result = ";".join(result)
        results.append(result)
    return results


def get_predictions(results):
    predictions = []
    for result in results:
        prediction = []
        if result != "":
            for loc in [s.split() for s in result.split(';')]:
                start, end = int(loc[0]), int(loc[1])
                prediction.append([start, end])
        predictions.append(prediction)
    return predictions

# Utils

In [7]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    score = span_micro_f1(y_true, y_pred)
    return score


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

# Data Loading

In [8]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv('../input/nbme-score-clinical-patient-notes/train.csv')
train['annotation'] = train['annotation'].apply(ast.literal_eval)
train['location'] = train['location'].apply(ast.literal_eval)
features = pd.read_csv('../input/nbme-score-clinical-patient-notes/features.csv')
def preprocess_features(features):
    features.loc[27, 'feature_text'] = "Last-Pap-smear-1-year-ago"
    return features
features = preprocess_features(features)
patient_notes = pd.read_csv('../input/nbme-score-clinical-patient-notes/patient_notes.csv')

print(f"train.shape: {train.shape}")
display(train.head())
print(f"features.shape: {features.shape}")
display(features.head())
print(f"patient_notes.shape: {patient_notes.shape}")
display(patient_notes.head())

train.shape: (14300, 6)


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724]
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693]
2,00016_002,0,16,2,[chest pressure],[203 217]
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]"
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258]


features.shape: (143, 3)


Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded


patient_notes.shape: (42146, 3)


Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


In [9]:
train = train.merge(features, on=['feature_num', 'case_num'], how='left')
train = train.merge(patient_notes, on=['pn_num', 'case_num'], how='left')
display(train.head())

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,HPI: 17yo M presents with palpitations. Patien...


In [10]:
# incorrect annotation
train.loc[338, 'annotation'] = ast.literal_eval('[["father heart attack"]]')
train.loc[338, 'location'] = ast.literal_eval('[["764 783"]]')

train.loc[621, 'annotation'] = ast.literal_eval('[["for the last 2-3 months"]]')
train.loc[621, 'location'] = ast.literal_eval('[["77 100"]]')

train.loc[655, 'annotation'] = ast.literal_eval('[["no heat intolerance"], ["no cold intolerance"]]')
train.loc[655, 'location'] = ast.literal_eval('[["285 292;301 312"], ["285 287;296 312"]]')

train.loc[1262, 'annotation'] = ast.literal_eval('[["mother thyroid problem"]]')
train.loc[1262, 'location'] = ast.literal_eval('[["551 557;565 580"]]')

train.loc[1265, 'annotation'] = ast.literal_eval('[[\'felt like he was going to "pass out"\']]')
train.loc[1265, 'location'] = ast.literal_eval('[["131 135;181 212"]]')

train.loc[1396, 'annotation'] = ast.literal_eval('[["stool , with no blood"]]')
train.loc[1396, 'location'] = ast.literal_eval('[["259 280"]]')

train.loc[1591, 'annotation'] = ast.literal_eval('[["diarrhoe non blooody"]]')
train.loc[1591, 'location'] = ast.literal_eval('[["176 184;201 212"]]')

train.loc[1615, 'annotation'] = ast.literal_eval('[["diarrhea for last 2-3 days"]]')
train.loc[1615, 'location'] = ast.literal_eval('[["249 257;271 288"]]')

train.loc[1664, 'annotation'] = ast.literal_eval('[["no vaginal discharge"]]')
train.loc[1664, 'location'] = ast.literal_eval('[["822 824;907 924"]]')

train.loc[1714, 'annotation'] = ast.literal_eval('[["started about 8-10 hours ago"]]')
train.loc[1714, 'location'] = ast.literal_eval('[["101 129"]]')

train.loc[1929, 'annotation'] = ast.literal_eval('[["no blood in the stool"]]')
train.loc[1929, 'location'] = ast.literal_eval('[["531 539;549 561"]]')

train.loc[2134, 'annotation'] = ast.literal_eval('[["last sexually active 9 months ago"]]')
train.loc[2134, 'location'] = ast.literal_eval('[["540 560;581 593"]]')

train.loc[2191, 'annotation'] = ast.literal_eval('[["right lower quadrant pain"]]')
train.loc[2191, 'location'] = ast.literal_eval('[["32 57"]]')

train.loc[2553, 'annotation'] = ast.literal_eval('[["diarrhoea no blood"]]')
train.loc[2553, 'location'] = ast.literal_eval('[["308 317;376 384"]]')

train.loc[3124, 'annotation'] = ast.literal_eval('[["sweating"]]')
train.loc[3124, 'location'] = ast.literal_eval('[["549 557"]]')

train.loc[3858, 'annotation'] = ast.literal_eval('[["previously as regular"], ["previously eveyr 28-29 days"], ["previously lasting 5 days"], ["previously regular flow"]]')
train.loc[3858, 'location'] = ast.literal_eval('[["102 123"], ["102 112;125 141"], ["102 112;143 157"], ["102 112;159 171"]]')

train.loc[4373, 'annotation'] = ast.literal_eval('[["for 2 months"]]')
train.loc[4373, 'location'] = ast.literal_eval('[["33 45"]]')

train.loc[4763, 'annotation'] = ast.literal_eval('[["35 year old"]]')
train.loc[4763, 'location'] = ast.literal_eval('[["5 16"]]')

train.loc[4782, 'annotation'] = ast.literal_eval('[["darker brown stools"]]')
train.loc[4782, 'location'] = ast.literal_eval('[["175 194"]]')

train.loc[4908, 'annotation'] = ast.literal_eval('[["uncle with peptic ulcer"]]')
train.loc[4908, 'location'] = ast.literal_eval('[["700 723"]]')

train.loc[6016, 'annotation'] = ast.literal_eval('[["difficulty falling asleep"]]')
train.loc[6016, 'location'] = ast.literal_eval('[["225 250"]]')

train.loc[6192, 'annotation'] = ast.literal_eval('[["helps to take care of aging mother and in-laws"]]')
train.loc[6192, 'location'] = ast.literal_eval('[["197 218;236 260"]]')

train.loc[6380, 'annotation'] = ast.literal_eval('[["No hair changes"], ["No skin changes"], ["No GI changes"], ["No palpitations"], ["No excessive sweating"]]')
train.loc[6380, 'location'] = ast.literal_eval('[["480 482;507 519"], ["480 482;499 503;512 519"], ["480 482;521 531"], ["480 482;533 545"], ["480 482;564 582"]]')

train.loc[6562, 'annotation'] = ast.literal_eval('[["stressed due to taking care of her mother"], ["stressed due to taking care of husbands parents"]]')
train.loc[6562, 'location'] = ast.literal_eval('[["290 320;327 337"], ["290 320;342 358"]]')

train.loc[6862, 'annotation'] = ast.literal_eval('[["stressor taking care of many sick family members"]]')
train.loc[6862, 'location'] = ast.literal_eval('[["288 296;324 363"]]')

train.loc[7022, 'annotation'] = ast.literal_eval('[["heart started racing and felt numbness for the 1st time in her finger tips"]]')
train.loc[7022, 'location'] = ast.literal_eval('[["108 182"]]')

train.loc[7422, 'annotation'] = ast.literal_eval('[["first started 5 yrs"]]')
train.loc[7422, 'location'] = ast.literal_eval('[["102 121"]]')

train.loc[8876, 'annotation'] = ast.literal_eval('[["No shortness of breath"]]')
train.loc[8876, 'location'] = ast.literal_eval('[["481 483;533 552"]]')

train.loc[9027, 'annotation'] = ast.literal_eval('[["recent URI"], ["nasal stuffines, rhinorrhea, for 3-4 days"]]')
train.loc[9027, 'location'] = ast.literal_eval('[["92 102"], ["123 164"]]')

train.loc[9938, 'annotation'] = ast.literal_eval('[["irregularity with her cycles"], ["heavier bleeding"], ["changes her pad every couple hours"]]')
train.loc[9938, 'location'] = ast.literal_eval('[["89 117"], ["122 138"], ["368 402"]]')

train.loc[9973, 'annotation'] = ast.literal_eval('[["gaining 10-15 lbs"]]')
train.loc[9973, 'location'] = ast.literal_eval('[["344 361"]]')

train.loc[10513, 'annotation'] = ast.literal_eval('[["weight gain"], ["gain of 10-16lbs"]]')
train.loc[10513, 'location'] = ast.literal_eval('[["600 611"], ["607 623"]]')

train.loc[11551, 'annotation'] = ast.literal_eval('[["seeing her son knows are not real"]]')
train.loc[11551, 'location'] = ast.literal_eval('[["386 400;443 461"]]')

train.loc[11677, 'annotation'] = ast.literal_eval('[["saw him once in the kitchen after he died"]]')
train.loc[11677, 'location'] = ast.literal_eval('[["160 201"]]')

train.loc[12124, 'annotation'] = ast.literal_eval('[["tried Ambien but it didnt work"]]')
train.loc[12124, 'location'] = ast.literal_eval('[["325 337;349 366"]]')

train.loc[12279, 'annotation'] = ast.literal_eval('[["heard what she described as a party later than evening these things did not actually happen"]]')
train.loc[12279, 'location'] = ast.literal_eval('[["405 459;488 524"]]')

train.loc[12289, 'annotation'] = ast.literal_eval('[["experienced seeing her son at the kitchen table these things did not actually happen"]]')
train.loc[12289, 'location'] = ast.literal_eval('[["353 400;488 524"]]')

train.loc[13238, 'annotation'] = ast.literal_eval('[["SCRACHY THROAT"], ["RUNNY NOSE"]]')
train.loc[13238, 'location'] = ast.literal_eval('[["293 307"], ["321 331"]]')

train.loc[13297, 'annotation'] = ast.literal_eval('[["without improvement when taking tylenol"], ["without improvement when taking ibuprofen"]]')
train.loc[13297, 'location'] = ast.literal_eval('[["182 221"], ["182 213;225 234"]]')

train.loc[13299, 'annotation'] = ast.literal_eval('[["yesterday"], ["yesterday"]]')
train.loc[13299, 'location'] = ast.literal_eval('[["79 88"], ["409 418"]]')

train.loc[13845, 'annotation'] = ast.literal_eval('[["headache global"], ["headache throughout her head"]]')
train.loc[13845, 'location'] = ast.literal_eval('[["86 94;230 236"], ["86 94;237 256"]]')

train.loc[14083, 'annotation'] = ast.literal_eval('[["headache generalized in her head"]]')
train.loc[14083, 'location'] = ast.literal_eval('[["56 64;156 179"]]')

In [11]:
train.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,HPI: 17yo M presents with palpitations. Patien...


In [12]:
train['location'][0]

['696 724']

In [13]:
if CFG.pseudo:
    print(train.shape)
    # concatenate train w/ test(pseudo-labeld data)
    pl = pd.read_csv("./nb002i-token-classifier/test.csv")
    pl['annotation'] = pl['annotation'].apply(ast.literal_eval)
    pl['location'] = pl['location'].apply(ast.literal_eval)
    pl['id'] = pl['pn_num'].apply(lambda x: str(x).zfill(5))+'_'+pl['feature_num'].apply(lambda x: str(x).zfill(3))
    pl = pl[train.columns]
    #display(pl.head())
    train['is_pl'] = 0
    pl['is_pl'] = 1
    train = pd.concat([train, pl], axis=0, ignore_index=True)
    del pl
    print(train.shape)

(14300, 8)
(42484, 9)


In [14]:
train['annotation_length'] = train['annotation'].apply(len)
display(train['annotation_length'].value_counts())

1    30279
0    10489
2     1292
3      287
4       99
5       27
6        9
7        1
8        1
Name: annotation_length, dtype: int64

## Merge patient_notes w/ features

In [15]:
print(patient_notes.shape)
patient_notes = patient_notes.merge(features, on=['case_num'], how='left')
print(patient_notes.shape)
display(patient_notes.head())

(42146, 3)
(626902, 5)


Unnamed: 0,pn_num,case_num,pn_history,feature_num,feature_text
0,0,0,"17-year-old male, has come to the student heal...",0,Family-history-of-MI-OR-Family-history-of-myoc...
1,0,0,"17-year-old male, has come to the student heal...",1,Family-history-of-thyroid-disorder
2,0,0,"17-year-old male, has come to the student heal...",2,Chest-pressure
3,0,0,"17-year-old male, has come to the student heal...",3,Intermittent-symptoms
4,0,0,"17-year-old male, has come to the student heal...",4,Lightheaded


## ~~Remove pn_nums which are appeared in train from patient_notes~~

In [16]:
# print(patient_notes.shape)
# patient_notes = patient_notes[~patient_notes["pn_num"].isin(train["pn_num"].unique())].reset_index(drop=True)
# print(patient_notes.shape)

## Select one specific case_num

In [17]:
if CFG.wandbname.split("-")[-1] != "all":
    selected_case_num = int(CFG.wandbname.split("-")[-1])
    print(f"selected_case_num: {selected_case_num}")

    print(train.shape)
    train = train[train["case_num"]==selected_case_num].reset_index(drop=True)
    print(train.shape)

    print()

    print(patient_notes.shape)
    patient_notes = patient_notes[patient_notes["case_num"]==selected_case_num].reset_index(drop=True)
    print(patient_notes.shape)

    print()

    print(features.shape)
    features = features[features["case_num"]==selected_case_num].reset_index(drop=True)
    print(features.shape)

selected_case_num: 0
(42484, 10)
(29484, 10)

(626902, 5)
(29484, 5)

(143, 3)
(13, 3)


# CV split

In [18]:
# ====================================================
# CV split
# ====================================================
Fold = GroupKFold(n_splits=CFG.n_fold)
groups = train['pn_num'].values
for n, (train_index, val_index) in enumerate(Fold.split(train, train['location'], groups)):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)

if CFG.pseudo:
    # remove pseudo labels from val
    train.loc[train['is_pl']==1, 'fold'] = -1

display(train.groupby('fold').size())

fold
-1    28184
 0      273
 1      260
 2      234
 3      416
 4      117
dtype: int64

In [19]:
len(train)

29484

In [20]:
if CFG.debug:
    display(train.groupby('fold').size())
    if len(train) > 2000:
        train = train.sample(n=2000, random_state=0).reset_index(drop=True)
        display(train.groupby('fold').size())

# tokenizer

In [21]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

# Dataset

In [22]:
# ====================================================
# Define max_len
# ====================================================
for text_col in ['pn_history']:
    pn_history_lengths = []
    tk0 = tqdm(patient_notes[text_col].fillna("").values, total=len(patient_notes))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        pn_history_lengths.append(length)
    LOGGER.info(f'{text_col} max(lengths): {max(pn_history_lengths)}')

for text_col in ['feature_text']:
    features_lengths = []
    tk0 = tqdm(features[text_col].fillna("").values, total=len(features))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        features_lengths.append(length)
    LOGGER.info(f'{text_col} max(lengths): {max(features_lengths)}')

CFG.max_len = max(pn_history_lengths) + max(features_lengths) + 3 # cls & sep & sep
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/29484 [00:00<?, ?it/s]

pn_history max(lengths): 386


  0%|          | 0/13 [00:00<?, ?it/s]

feature_text max(lengths): 24
max_len: 413


In [23]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text, feature_text):
    inputs = cfg.tokenizer(text, feature_text, 
                           add_special_tokens=True,
                           max_length=CFG.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


def create_label(cfg, text, annotation_length, location_list):
    encoded = cfg.tokenizer(text,
                            add_special_tokens=True,
                            max_length=CFG.max_len,
                            padding="max_length",
                            return_offsets_mapping=True)
    offset_mapping = encoded['offset_mapping']
    ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0]
    label = np.zeros(len(offset_mapping))
    label[ignore_idxes] = -1
    if annotation_length != 0:
        for location in location_list:
            for loc in [s.split() for s in location.split(';')]:
                start_idx = -1
                end_idx = -1
                start, end = int(loc[0]), int(loc[1])
                for idx in range(len(offset_mapping)):
                    if (start_idx == -1) & (start < offset_mapping[idx][0]):
                        start_idx = idx - 1
                    if (end_idx == -1) & (end <= offset_mapping[idx][1]):
                        end_idx = idx + 1
                if start_idx == -1:
                    start_idx = end_idx
                if (start_idx != -1) & (end_idx != -1):
                    label[start_idx:end_idx] = 1
    return torch.tensor(label, dtype=torch.float)


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.feature_texts = df['feature_text'].values
        self.pn_historys = df['pn_history'].values
        self.annotation_lengths = df['annotation_length'].values
        self.locations = df['location'].values

    def __len__(self):
        return len(self.feature_texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, 
                               self.pn_historys[item], 
                               self.feature_texts[item])
        label = create_label(self.cfg, 
                             self.pn_historys[item], 
                             self.annotation_lengths[item], 
                             self.locations[item])
        return inputs, label

# Model

In [24]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        return last_hidden_states

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

# Helper functions

In [25]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [26]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_texts = valid_folds['pn_history'].values
    valid_labels = create_labels_for_scoring(valid_folds)
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler=='linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler=='cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss(reduction="none")
    
    best_score = 0.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        predictions = predictions.reshape((len(valid_folds), CFG.max_len))
        
        # scoring
        char_probs = get_char_probs(valid_texts, predictions, CFG.tokenizer)
        results = get_results(char_probs, th=0.5)
        preds = get_predictions(results)
        score = get_score(valid_labels, preds)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})
        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[i for i in range(CFG.max_len)]] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [None]:
if __name__ == '__main__':
    
    def get_result(oof_df, cv_score=False, case_num=None):
        if case_num is not None:
            oof_df = oof_df[oof_df["case_num"]==case_num].reset_index(drop=True)
        labels = create_labels_for_scoring(oof_df)
        predictions = oof_df[[i for i in range(CFG.max_len)]].values
        char_probs = get_char_probs(oof_df['pn_history'].values, predictions, CFG.tokenizer)
        results = get_results(char_probs, th=0.5)
        preds = get_predictions(results)
        score = get_score(labels, preds)
        if case_num is not None:
            LOGGER.info(f'Score of case_num {case_num}: {score:<.4f}')
            if cv_score:
                wandb.log({f'CV score of case_num {case_num}': score})
        else:
            LOGGER.info(f'Score: {score:<.4f}')
            if cv_score:
                wandb.log({f'CV score': score})
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        if CFG.cv_case_num:
            for i in range(10):
                get_result(oof_df, cv_score=True, case_num=i)
        else:
            get_result(oof_df, cv_score=True)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        
    if CFG.wandb:
        wandb.finish()


    # Push to LINE
    import requests
    def send_line_notification(message):
        import json
        f = open("../../line.json", "r")
        json_data = json.load(f)
        line_token = json_data["kagglePush"]
        endpoint = 'https://notify-api.line.me/api/notify'
        message = "\n{}".format(message)
        payload = {'message': message}
        headers = {'Authorization': 'Bearer {}'.format(line_token)}
        requests.post(endpoint, data=payload, headers=headers)

    if CFG.wandb:
        send_line_notification(f"Training of {CFG.wandbgroup} has been done. See {run.url}")
    else:
        send_line_notification(f"Training of {CFG.wandbgroup} has been done.")

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/2434] Elapsed 0m 0s (remain 25m 1s) Loss: 0.6152(0.6152) Grad: inf  LR: 0.00002000  
Epoch: [1][100/2434] Elapsed 0m 30s (remain 11m 49s) Loss: 0.0035(0.0630) Grad: 1512.7084  LR: 0.00002000  
Epoch: [1][200/2434] Elapsed 1m 0s (remain 11m 16s) Loss: 0.0070(0.0364) Grad: 2025.6398  LR: 0.00001999  
Epoch: [1][300/2434] Elapsed 1m 31s (remain 10m 45s) Loss: 0.0033(0.0264) Grad: 1445.9966  LR: 0.00001997  
Epoch: [1][400/2434] Elapsed 2m 1s (remain 10m 14s) Loss: 0.0023(0.0212) Grad: 1045.9598  LR: 0.00001995  
Epoch: [1][500/2434] Elapsed 2m 31s (remain 9m 43s) Loss: 0.0055(0.0179) Grad: 1654.0469  LR: 0.00001992  
Epoch: [1][600/2434] Elapsed 3m 1s (remain 9m 13s) Loss: 0.0022(0.0157) Grad: 727.4800  LR: 0.00001988  
Epoch: [1][700/2434] Elapsed 3m 31s (remain 8m 42s) Loss: 0.0051(0.0141) Grad: 1141.0780  LR: 0.00001984  
Epoch: [1][800/2434] Elapsed 4m 1s (remain 8m 12s) Loss: 0.0057(0.0129) Grad: 848.3799  LR: 0.00001979  
Epoch: [1][900/2434] Elapsed 4m 31s (remain 7m 4

Epoch 1 - avg_train_loss: 0.0068  avg_val_loss: 0.0103  time: 737s
Epoch 1 - Score: 0.9118
Epoch 1 - Save Best Score: 0.9118 Model


Epoch: [2][0/2434] Elapsed 0m 0s (remain 21m 39s) Loss: 0.0025(0.0025) Grad: 14835.4883  LR: 0.00001809  
Epoch: [2][100/2434] Elapsed 0m 30s (remain 11m 48s) Loss: 0.0021(0.0017) Grad: 4878.7437  LR: 0.00001793  
Epoch: [2][200/2434] Elapsed 1m 0s (remain 11m 15s) Loss: 0.0017(0.0019) Grad: 5687.4707  LR: 0.00001777  
Epoch: [2][300/2434] Elapsed 1m 30s (remain 10m 44s) Loss: 0.0037(0.0020) Grad: 13379.0488  LR: 0.00001761  
Epoch: [2][400/2434] Elapsed 2m 1s (remain 10m 13s) Loss: 0.0028(0.0021) Grad: 12786.2031  LR: 0.00001744  
Epoch: [2][500/2434] Elapsed 2m 31s (remain 9m 43s) Loss: 0.0001(0.0022) Grad: 390.1593  LR: 0.00001726  
Epoch: [2][600/2434] Elapsed 3m 1s (remain 9m 12s) Loss: 0.0024(0.0023) Grad: 9641.0986  LR: 0.00001709  
Epoch: [2][700/2434] Elapsed 3m 31s (remain 8m 42s) Loss: 0.0002(0.0022) Grad: 1415.0878  LR: 0.00001690  
Epoch: [2][800/2434] Elapsed 4m 1s (remain 8m 12s) Loss: 0.0015(0.0022) Grad: 9092.0205  LR: 0.00001671  
Epoch: [2][900/2434] Elapsed 4m 31s (

Epoch 2 - avg_train_loss: 0.0023  avg_val_loss: 0.0132  time: 737s
Epoch 2 - Score: 0.9051


Epoch: [3][0/2434] Elapsed 0m 0s (remain 22m 59s) Loss: 0.0015(0.0015) Grad: 5710.8066  LR: 0.00001309  
Epoch: [3][100/2434] Elapsed 0m 30s (remain 11m 49s) Loss: 0.0007(0.0021) Grad: 2088.6567  LR: 0.00001284  
Epoch: [3][200/2434] Elapsed 1m 0s (remain 11m 16s) Loss: 0.0001(0.0019) Grad: 1499.9299  LR: 0.00001259  
Epoch: [3][300/2434] Elapsed 1m 31s (remain 10m 45s) Loss: 0.0057(0.0019) Grad: 20860.0352  LR: 0.00001234  
Epoch: [3][400/2434] Elapsed 2m 1s (remain 10m 14s) Loss: 0.0008(0.0018) Grad: 2940.9487  LR: 0.00001209  
Epoch: [3][500/2434] Elapsed 2m 31s (remain 9m 43s) Loss: 0.0007(0.0018) Grad: 3511.8000  LR: 0.00001184  
Epoch: [3][600/2434] Elapsed 3m 1s (remain 9m 13s) Loss: 0.0002(0.0019) Grad: 1474.4937  LR: 0.00001158  
Epoch: [3][700/2434] Elapsed 3m 31s (remain 8m 42s) Loss: 0.0002(0.0019) Grad: 1632.9711  LR: 0.00001133  
Epoch: [3][800/2434] Elapsed 4m 1s (remain 8m 12s) Loss: 0.0015(0.0019) Grad: 4196.3096  LR: 0.00001107  
Epoch: [3][900/2434] Elapsed 4m 31s (r

Epoch 3 - avg_train_loss: 0.0019  avg_val_loss: 0.0152  time: 737s
Epoch 3 - Score: 0.8954


Epoch: [4][0/2434] Elapsed 0m 0s (remain 21m 31s) Loss: 0.0010(0.0010) Grad: 5404.9580  LR: 0.00000691  
Epoch: [4][100/2434] Elapsed 0m 30s (remain 11m 48s) Loss: 0.0009(0.0020) Grad: 10848.4043  LR: 0.00000666  
Epoch: [4][200/2434] Elapsed 1m 0s (remain 11m 15s) Loss: 0.0025(0.0020) Grad: 10619.6367  LR: 0.00000642  
Epoch: [4][300/2434] Elapsed 1m 30s (remain 10m 44s) Loss: 0.0001(0.0020) Grad: 837.3244  LR: 0.00000618  
Epoch: [4][400/2434] Elapsed 2m 0s (remain 10m 13s) Loss: 0.0027(0.0019) Grad: 10119.7061  LR: 0.00000595  
Epoch: [4][500/2434] Elapsed 2m 31s (remain 9m 42s) Loss: 0.0005(0.0018) Grad: 4704.5918  LR: 0.00000571  
Epoch: [4][600/2434] Elapsed 3m 1s (remain 9m 12s) Loss: 0.0075(0.0018) Grad: 27404.3086  LR: 0.00000548  
Epoch: [4][700/2434] Elapsed 3m 31s (remain 8m 42s) Loss: 0.0003(0.0017) Grad: 2371.0254  LR: 0.00000525  
Epoch: [4][800/2434] Elapsed 4m 1s (remain 8m 12s) Loss: 0.0009(0.0017) Grad: 8514.1836  LR: 0.00000502  
Epoch: [4][900/2434] Elapsed 4m 31s 

Epoch 4 - avg_train_loss: 0.0015  avg_val_loss: 0.0156  time: 738s
Epoch 4 - Score: 0.9002


Epoch: [5][0/2434] Elapsed 0m 0s (remain 22m 20s) Loss: 0.0011(0.0011) Grad: 7589.7397  LR: 0.00000191  
Epoch: [5][100/2434] Elapsed 0m 30s (remain 11m 49s) Loss: 0.0001(0.0018) Grad: 837.6344  LR: 0.00000176  
Epoch: [5][200/2434] Elapsed 1m 0s (remain 11m 16s) Loss: 0.0001(0.0016) Grad: 713.2725  LR: 0.00000162  
Epoch: [5][300/2434] Elapsed 1m 31s (remain 10m 45s) Loss: 0.0036(0.0015) Grad: 12574.6582  LR: 0.00000148  
Epoch: [5][400/2434] Elapsed 2m 1s (remain 10m 15s) Loss: 0.0016(0.0014) Grad: 12035.2080  LR: 0.00000135  
Epoch: [5][500/2434] Elapsed 2m 31s (remain 9m 44s) Loss: 0.0026(0.0014) Grad: 9620.1143  LR: 0.00000122  
Epoch: [5][600/2434] Elapsed 3m 1s (remain 9m 14s) Loss: 0.0024(0.0014) Grad: 15980.6475  LR: 0.00000110  
Epoch: [5][700/2434] Elapsed 3m 31s (remain 8m 43s) Loss: 0.0009(0.0013) Grad: 9010.1689  LR: 0.00000099  
Epoch: [5][800/2434] Elapsed 4m 2s (remain 8m 13s) Loss: 0.0000(0.0013) Grad: 90.3062  LR: 0.00000088  
Epoch: [5][900/2434] Elapsed 4m 32s (rem

Epoch 5 - avg_train_loss: 0.0012  avg_val_loss: 0.0166  time: 737s
Epoch 5 - Score: 0.9041
Score: 0.9118
Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/2435] Elapsed 0m 0s (remain 21m 17s) Loss: 0.9100(0.9100) Grad: inf  LR: 0.00002000  
Epoch: [1][100/2435] Elapsed 0m 30s (remain 11m 46s) Loss: 0.0228(0.0828) Grad: 1306.1455  LR: 0.00002000  
Epoch: [1][200/2435] Elapsed 1m 0s (remain 11m 13s) Loss: 0.0159(0.0458) Grad: 2482.2681  LR: 0.00001999  
Epoch: [1][300/2435] Elapsed 1m 30s (remain 10m 43s) Loss: 0.0048(0.0331) Grad: 512.8638  LR: 0.00001997  
Epoch: [1][400/2435] Elapsed 2m 0s (remain 10m 13s) Loss: 0.0031(0.0262) Grad: 579.8724  LR: 0.00001995  
Epoch: [1][500/2435] Elapsed 2m 31s (remain 9m 42s) Loss: 0.0031(0.0220) Grad: 2247.9924  LR: 0.00001992  
Epoch: [1][600/2435] Elapsed 3m 1s (remain 9m 12s) Loss: 0.0004(0.0192) Grad: 155.6343  LR: 0.00001988  
Epoch: [1][700/2435] Elapsed 3m 31s (remain 8m 42s) Loss: 0.0068(0.0171) Grad: 1262.1349  LR: 0.00001984  
Epoch: [1][800/2435] Elapsed 4m 1s (remain 8m 12s) Loss: 0.0083(0.0155) Grad: 757.7378  LR: 0.00001979  
Epoch: [1][900/2435] Elapsed 4m 31s (remain 7m 42

Epoch 1 - avg_train_loss: 0.0075  avg_val_loss: 0.0086  time: 737s
Epoch 1 - Score: 0.9138
Epoch 1 - Save Best Score: 0.9138 Model


Epoch: [2][0/2435] Elapsed 0m 0s (remain 21m 28s) Loss: 0.0004(0.0004) Grad: 1956.0028  LR: 0.00001809  
Epoch: [2][100/2435] Elapsed 0m 30s (remain 11m 48s) Loss: 0.0009(0.0026) Grad: 3948.9243  LR: 0.00001793  
Epoch: [2][200/2435] Elapsed 1m 0s (remain 11m 15s) Loss: 0.0003(0.0025) Grad: 1267.2802  LR: 0.00001777  
Epoch: [2][300/2435] Elapsed 1m 30s (remain 10m 43s) Loss: 0.0000(0.0025) Grad: 164.1956  LR: 0.00001761  
Epoch: [2][400/2435] Elapsed 2m 0s (remain 10m 13s) Loss: 0.0003(0.0026) Grad: 1594.7716  LR: 0.00001744  
Epoch: [2][500/2435] Elapsed 2m 30s (remain 9m 42s) Loss: 0.0025(0.0025) Grad: 24654.3613  LR: 0.00001727  
Epoch: [2][600/2435] Elapsed 3m 1s (remain 9m 12s) Loss: 0.0002(0.0025) Grad: 2213.4104  LR: 0.00001709  
Epoch: [2][700/2435] Elapsed 3m 31s (remain 8m 42s) Loss: 0.0065(0.0025) Grad: 15419.1992  LR: 0.00001690  
Epoch: [2][800/2435] Elapsed 4m 1s (remain 8m 12s) Loss: 0.0004(0.0024) Grad: 1897.6826  LR: 0.00001671  
Epoch: [2][900/2435] Elapsed 4m 31s (r

Epoch 2 - avg_train_loss: 0.0022  avg_val_loss: 0.0101  time: 737s
Epoch 2 - Score: 0.9149
Epoch 2 - Save Best Score: 0.9149 Model


Epoch: [3][0/2435] Elapsed 0m 0s (remain 21m 19s) Loss: 0.0021(0.0021) Grad: 11697.5039  LR: 0.00001309  
Epoch: [3][100/2435] Elapsed 0m 30s (remain 11m 47s) Loss: 0.0002(0.0021) Grad: 2755.4744  LR: 0.00001284  
Epoch: [3][200/2435] Elapsed 1m 0s (remain 11m 14s) Loss: 0.0028(0.0017) Grad: 8900.4297  LR: 0.00001259  
Epoch: [3][300/2435] Elapsed 1m 30s (remain 10m 43s) Loss: 0.0003(0.0016) Grad: 2005.8900  LR: 0.00001234  
Epoch: [3][400/2435] Elapsed 2m 0s (remain 10m 13s) Loss: 0.0016(0.0018) Grad: 10063.6738  LR: 0.00001209  
Epoch: [3][500/2435] Elapsed 2m 30s (remain 9m 42s) Loss: 0.0007(0.0018) Grad: 8543.3154  LR: 0.00001184  
Epoch: [3][600/2435] Elapsed 3m 1s (remain 9m 12s) Loss: 0.0020(0.0018) Grad: 12018.5605  LR: 0.00001159  
Epoch: [3][700/2435] Elapsed 3m 31s (remain 8m 42s) Loss: 0.0009(0.0018) Grad: 4466.8311  LR: 0.00001133  
Epoch: [3][800/2435] Elapsed 4m 1s (remain 8m 12s) Loss: 0.0064(0.0019) Grad: 2497.4958  LR: 0.00001107  
Epoch: [3][900/2435] Elapsed 4m 31s 

Epoch 3 - avg_train_loss: 0.0019  avg_val_loss: 0.0114  time: 736s
Epoch 3 - Score: 0.9131


Epoch: [4][0/2435] Elapsed 0m 0s (remain 22m 22s) Loss: 0.0028(0.0028) Grad: 19933.4375  LR: 0.00000691  
Epoch: [4][100/2435] Elapsed 0m 30s (remain 11m 47s) Loss: 0.0007(0.0015) Grad: 9450.3086  LR: 0.00000666  
Epoch: [4][200/2435] Elapsed 1m 0s (remain 11m 14s) Loss: 0.0021(0.0019) Grad: 10280.6660  LR: 0.00000642  
Epoch: [4][300/2435] Elapsed 1m 30s (remain 10m 43s) Loss: 0.0127(0.0017) Grad: 15692.5020  LR: 0.00000618  
Epoch: [4][400/2435] Elapsed 2m 0s (remain 10m 13s) Loss: 0.0039(0.0016) Grad: 15490.9805  LR: 0.00000595  
Epoch: [4][500/2435] Elapsed 2m 30s (remain 9m 42s) Loss: 0.0047(0.0016) Grad: 13169.4521  LR: 0.00000571  
Epoch: [4][600/2435] Elapsed 3m 1s (remain 9m 12s) Loss: 0.0002(0.0016) Grad: 3249.6440  LR: 0.00000548  
Epoch: [4][700/2435] Elapsed 3m 31s (remain 8m 42s) Loss: 0.0014(0.0017) Grad: 7775.2979  LR: 0.00000525  
Epoch: [4][800/2435] Elapsed 4m 1s (remain 8m 12s) Loss: 0.0101(0.0016) Grad: 16502.4844  LR: 0.00000503  
Epoch: [4][900/2435] Elapsed 4m 3

Epoch 4 - avg_train_loss: 0.0016  avg_val_loss: 0.0125  time: 736s
Epoch 4 - Score: 0.9165
Epoch 4 - Save Best Score: 0.9165 Model


Epoch: [5][0/2435] Elapsed 0m 0s (remain 21m 41s) Loss: 0.0012(0.0012) Grad: 20139.8867  LR: 0.00000191  
Epoch: [5][100/2435] Elapsed 0m 30s (remain 11m 47s) Loss: 0.0010(0.0012) Grad: 7394.3760  LR: 0.00000176  
Epoch: [5][200/2435] Elapsed 1m 0s (remain 11m 15s) Loss: 0.0019(0.0013) Grad: 10858.8828  LR: 0.00000162  
Epoch: [5][300/2435] Elapsed 1m 30s (remain 10m 44s) Loss: 0.0027(0.0013) Grad: 27043.6426  LR: 0.00000148  
Epoch: [5][400/2435] Elapsed 2m 0s (remain 10m 13s) Loss: 0.0026(0.0012) Grad: 9299.3301  LR: 0.00000135  
Epoch: [5][500/2435] Elapsed 2m 31s (remain 9m 42s) Loss: 0.0000(0.0012) Grad: 10.3664  LR: 0.00000122  
Epoch: [5][600/2435] Elapsed 3m 1s (remain 9m 12s) Loss: 0.0049(0.0012) Grad: 17899.8887  LR: 0.00000110  
Epoch: [5][700/2435] Elapsed 3m 31s (remain 8m 42s) Loss: 0.0005(0.0012) Grad: 6523.1587  LR: 0.00000099  
Epoch: [5][800/2435] Elapsed 4m 1s (remain 8m 12s) Loss: 0.0001(0.0012) Grad: 1009.0346  LR: 0.00000088  
Epoch: [5][900/2435] Elapsed 4m 31s (

Epoch 5 - avg_train_loss: 0.0013  avg_val_loss: 0.0130  time: 736s
Epoch 5 - Score: 0.9167
Epoch 5 - Save Best Score: 0.9167 Model
Score: 0.9167
Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/2437] Elapsed 0m 0s (remain 20m 41s) Loss: 1.0451(1.0451) Grad: inf  LR: 0.00002000  
Epoch: [1][100/2437] Elapsed 0m 30s (remain 11m 47s) Loss: 0.0064(0.0964) Grad: 1263.0387  LR: 0.00002000  
Epoch: [1][200/2437] Elapsed 1m 0s (remain 11m 15s) Loss: 0.0071(0.0527) Grad: 1092.0651  LR: 0.00001999  
Epoch: [1][300/2437] Elapsed 1m 30s (remain 10m 44s) Loss: 0.0073(0.0374) Grad: 581.2119  LR: 0.00001997  
Epoch: [1][400/2437] Elapsed 2m 0s (remain 10m 13s) Loss: 0.0037(0.0295) Grad: 506.5844  LR: 0.00001995  
Epoch: [1][500/2437] Elapsed 2m 31s (remain 9m 43s) Loss: 0.0025(0.0247) Grad: 220.4788  LR: 0.00001992  
Epoch: [1][600/2437] Elapsed 3m 1s (remain 9m 13s) Loss: 0.0014(0.0214) Grad: 268.8468  LR: 0.00001988  
Epoch: [1][700/2437] Elapsed 3m 31s (remain 8m 43s) Loss: 0.0032(0.0190) Grad: 391.0432  LR: 0.00001984  
Epoch: [1][800/2437] Elapsed 4m 1s (remain 8m 12s) Loss: 0.0020(0.0172) Grad: 614.0691  LR: 0.00001979  
Epoch: [1][900/2437] Elapsed 4m 31s (remain 7m 42s)

Epoch 1 - avg_train_loss: 0.0081  avg_val_loss: 0.0106  time: 737s
Epoch 1 - Score: 0.9026
Epoch 1 - Save Best Score: 0.9026 Model


Epoch: [2][0/2437] Elapsed 0m 0s (remain 22m 29s) Loss: 0.0025(0.0025) Grad: 4266.8335  LR: 0.00001809  
Epoch: [2][100/2437] Elapsed 0m 30s (remain 11m 50s) Loss: 0.0021(0.0022) Grad: 4593.0693  LR: 0.00001794  
Epoch: [2][200/2437] Elapsed 1m 0s (remain 11m 16s) Loss: 0.0016(0.0023) Grad: 2500.7622  LR: 0.00001778  
Epoch: [2][300/2437] Elapsed 1m 30s (remain 10m 44s) Loss: 0.0000(0.0021) Grad: 768.5548  LR: 0.00001761  
Epoch: [2][400/2437] Elapsed 2m 0s (remain 10m 14s) Loss: 0.0003(0.0020) Grad: 1738.4130  LR: 0.00001744  
Epoch: [2][500/2437] Elapsed 2m 31s (remain 9m 43s) Loss: 0.0001(0.0021) Grad: 351.9653  LR: 0.00001727  
Epoch: [2][600/2437] Elapsed 3m 1s (remain 9m 13s) Loss: 0.0020(0.0021) Grad: 5645.8784  LR: 0.00001709  
Epoch: [2][700/2437] Elapsed 3m 31s (remain 8m 43s) Loss: 0.0025(0.0022) Grad: 5682.8730  LR: 0.00001690  
Epoch: [2][800/2437] Elapsed 4m 1s (remain 8m 13s) Loss: 0.0018(0.0022) Grad: 7308.4727  LR: 0.00001671  
Epoch: [2][900/2437] Elapsed 4m 31s (rema

Epoch 2 - avg_train_loss: 0.0022  avg_val_loss: 0.0147  time: 737s
Epoch 2 - Score: 0.9116
Epoch 2 - Save Best Score: 0.9116 Model


Epoch: [3][0/2437] Elapsed 0m 0s (remain 22m 1s) Loss: 0.0009(0.0009) Grad: 2749.8052  LR: 0.00001309  
Epoch: [3][100/2437] Elapsed 0m 30s (remain 11m 49s) Loss: 0.0041(0.0017) Grad: 16155.7617  LR: 0.00001284  
Epoch: [3][200/2437] Elapsed 1m 0s (remain 11m 15s) Loss: 0.0080(0.0019) Grad: 7560.7065  LR: 0.00001260  
Epoch: [3][300/2437] Elapsed 1m 30s (remain 10m 44s) Loss: 0.0024(0.0018) Grad: 23868.0000  LR: 0.00001235  
Epoch: [3][400/2437] Elapsed 2m 0s (remain 10m 13s) Loss: 0.0000(0.0019) Grad: 342.7188  LR: 0.00001209  
Epoch: [3][500/2437] Elapsed 2m 30s (remain 9m 43s) Loss: 0.0003(0.0020) Grad: 2774.9194  LR: 0.00001184  
Epoch: [3][600/2437] Elapsed 3m 0s (remain 9m 12s) Loss: 0.0040(0.0020) Grad: 10249.4736  LR: 0.00001159  
Epoch: [3][700/2437] Elapsed 3m 31s (remain 8m 42s) Loss: 0.0000(0.0020) Grad: 137.9102  LR: 0.00001133  
Epoch: [3][800/2437] Elapsed 4m 1s (remain 8m 12s) Loss: 0.0025(0.0020) Grad: 14685.1992  LR: 0.00001108  
Epoch: [3][900/2437] Elapsed 4m 31s (r

Epoch 3 - avg_train_loss: 0.0019  avg_val_loss: 0.0166  time: 737s
Epoch 3 - Score: 0.9128
Epoch 3 - Save Best Score: 0.9128 Model


Epoch: [4][0/2437] Elapsed 0m 0s (remain 21m 39s) Loss: 0.0001(0.0001) Grad: 1913.5590  LR: 0.00000691  
Epoch: [4][100/2437] Elapsed 0m 30s (remain 11m 49s) Loss: 0.0001(0.0017) Grad: 1946.6342  LR: 0.00000667  
Epoch: [4][200/2437] Elapsed 1m 0s (remain 11m 15s) Loss: 0.0007(0.0016) Grad: 4671.5303  LR: 0.00000642  
Epoch: [4][300/2437] Elapsed 1m 30s (remain 10m 44s) Loss: 0.0000(0.0016) Grad: 155.9813  LR: 0.00000618  
Epoch: [4][400/2437] Elapsed 2m 0s (remain 10m 13s) Loss: 0.0000(0.0016) Grad: 213.6659  LR: 0.00000595  
Epoch: [4][500/2437] Elapsed 2m 30s (remain 9m 43s) Loss: 0.0004(0.0016) Grad: 2896.0559  LR: 0.00000571  
Epoch: [4][600/2437] Elapsed 3m 1s (remain 9m 13s) Loss: 0.0021(0.0015) Grad: 12437.7627  LR: 0.00000548  
Epoch: [4][700/2437] Elapsed 3m 31s (remain 8m 43s) Loss: 0.0046(0.0015) Grad: 23237.3359  LR: 0.00000525  
Epoch: [4][800/2437] Elapsed 4m 1s (remain 8m 12s) Loss: 0.0005(0.0016) Grad: 5690.9385  LR: 0.00000503  
Epoch: [4][900/2437] Elapsed 4m 31s (re

Epoch 4 - avg_train_loss: 0.0016  avg_val_loss: 0.0168  time: 737s
Epoch 4 - Score: 0.9122


Epoch: [5][0/2437] Elapsed 0m 0s (remain 20m 43s) Loss: 0.0023(0.0023) Grad: 7279.9302  LR: 0.00000191  
Epoch: [5][100/2437] Elapsed 0m 30s (remain 11m 49s) Loss: 0.0018(0.0013) Grad: 9495.9854  LR: 0.00000176  
Epoch: [5][200/2437] Elapsed 1m 0s (remain 11m 15s) Loss: 0.0001(0.0012) Grad: 837.8331  LR: 0.00000162  
Epoch: [5][300/2437] Elapsed 1m 30s (remain 10m 44s) Loss: 0.0003(0.0012) Grad: 5021.1455  LR: 0.00000148  
Epoch: [5][400/2437] Elapsed 2m 0s (remain 10m 13s) Loss: 0.0000(0.0013) Grad: 165.2116  LR: 0.00000135  
Epoch: [5][500/2437] Elapsed 2m 30s (remain 9m 43s) Loss: 0.0001(0.0012) Grad: 1770.3824  LR: 0.00000122  
Epoch: [5][600/2437] Elapsed 3m 1s (remain 9m 13s) Loss: 0.0088(0.0013) Grad: 7234.2349  LR: 0.00000110  
Epoch: [5][700/2437] Elapsed 3m 31s (remain 8m 42s) Loss: 0.0005(0.0013) Grad: 2301.1064  LR: 0.00000099  
Epoch: [5][800/2437] Elapsed 4m 1s (remain 8m 12s) Loss: 0.0011(0.0012) Grad: 7771.1289  LR: 0.00000088  
Epoch: [5][900/2437] Elapsed 4m 31s (rema

Epoch 5 - avg_train_loss: 0.0013  avg_val_loss: 0.0176  time: 737s
Epoch 5 - Score: 0.9121
Score: 0.9128
Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/2422] Elapsed 0m 0s (remain 26m 0s) Loss: 0.7451(0.7451) Grad: inf  LR: 0.00002000  
Epoch: [1][100/2422] Elapsed 0m 30s (remain 11m 45s) Loss: 0.0028(0.0756) Grad: 1103.7051  LR: 0.00002000  
Epoch: [1][200/2422] Elapsed 1m 0s (remain 11m 11s) Loss: 0.0027(0.0424) Grad: 627.6338  LR: 0.00001999  
Epoch: [1][300/2422] Elapsed 1m 30s (remain 10m 40s) Loss: 0.0253(0.0305) Grad: 1570.9480  LR: 0.00001997  
Epoch: [1][400/2422] Elapsed 2m 1s (remain 10m 9s) Loss: 0.0072(0.0242) Grad: 1287.7050  LR: 0.00001995  
Epoch: [1][500/2422] Elapsed 2m 31s (remain 9m 39s) Loss: 0.0070(0.0204) Grad: 1296.6935  LR: 0.00001992  
Epoch: [1][600/2422] Elapsed 3m 1s (remain 9m 8s) Loss: 0.0048(0.0178) Grad: 693.5203  LR: 0.00001988  
Epoch: [1][700/2422] Elapsed 3m 31s (remain 8m 38s) Loss: 0.0037(0.0158) Grad: 533.6204  LR: 0.00001984  
Epoch: [1][800/2422] Elapsed 4m 1s (remain 8m 8s) Loss: 0.0083(0.0144) Grad: 1128.2563  LR: 0.00001978  
Epoch: [1][900/2422] Elapsed 4m 31s (remain 7m 38s) 

Epoch 1 - avg_train_loss: 0.0071  avg_val_loss: 0.0127  time: 735s
Epoch 1 - Score: 0.8860
Epoch 1 - Save Best Score: 0.8860 Model


Epoch: [2][0/2422] Elapsed 0m 0s (remain 21m 56s) Loss: 0.0075(0.0075) Grad: 13828.6484  LR: 0.00001809  
Epoch: [2][100/2422] Elapsed 0m 30s (remain 11m 44s) Loss: 0.0030(0.0028) Grad: 5904.0005  LR: 0.00001793  
Epoch: [2][200/2422] Elapsed 1m 0s (remain 11m 11s) Loss: 0.0009(0.0023) Grad: 6548.0610  LR: 0.00001777  
Epoch: [2][300/2422] Elapsed 1m 30s (remain 10m 40s) Loss: 0.0046(0.0022) Grad: 6834.3701  LR: 0.00001761  
Epoch: [2][400/2422] Elapsed 2m 0s (remain 10m 9s) Loss: 0.0010(0.0022) Grad: 10295.3584  LR: 0.00001744  
Epoch: [2][500/2422] Elapsed 2m 31s (remain 9m 39s) Loss: 0.0008(0.0023) Grad: 3334.5447  LR: 0.00001726  
Epoch: [2][600/2422] Elapsed 3m 1s (remain 9m 8s) Loss: 0.0021(0.0022) Grad: 5288.4546  LR: 0.00001708  
Epoch: [2][700/2422] Elapsed 3m 31s (remain 8m 38s) Loss: 0.0008(0.0023) Grad: 3557.2351  LR: 0.00001689  
Epoch: [2][800/2422] Elapsed 4m 1s (remain 8m 8s) Loss: 0.0000(0.0022) Grad: 196.0190  LR: 0.00001670  
Epoch: [2][900/2422] Elapsed 4m 31s (rema

Epoch 2 - avg_train_loss: 0.0021  avg_val_loss: 0.0151  time: 735s
Epoch 2 - Score: 0.8892
Epoch 2 - Save Best Score: 0.8892 Model


Epoch: [3][0/2422] Elapsed 0m 0s (remain 21m 8s) Loss: 0.0024(0.0024) Grad: 6495.4810  LR: 0.00001309  
Epoch: [3][100/2422] Elapsed 0m 30s (remain 11m 44s) Loss: 0.0009(0.0015) Grad: 5037.5947  LR: 0.00001284  
Epoch: [3][200/2422] Elapsed 1m 0s (remain 11m 10s) Loss: 0.0035(0.0017) Grad: 10187.5693  LR: 0.00001259  
Epoch: [3][300/2422] Elapsed 1m 30s (remain 10m 39s) Loss: 0.0001(0.0018) Grad: 733.6993  LR: 0.00001234  
Epoch: [3][400/2422] Elapsed 2m 0s (remain 10m 9s) Loss: 0.0008(0.0018) Grad: 10732.1025  LR: 0.00001209  
Epoch: [3][500/2422] Elapsed 2m 30s (remain 9m 38s) Loss: 0.0028(0.0018) Grad: 7266.8149  LR: 0.00001183  
Epoch: [3][600/2422] Elapsed 3m 0s (remain 9m 8s) Loss: 0.0010(0.0019) Grad: 4483.4653  LR: 0.00001158  
Epoch: [3][700/2422] Elapsed 3m 31s (remain 8m 38s) Loss: 0.0000(0.0019) Grad: 38.9454  LR: 0.00001132  
Epoch: [3][800/2422] Elapsed 4m 1s (remain 8m 8s) Loss: 0.0029(0.0019) Grad: 12640.9648  LR: 0.00001106  
Epoch: [3][900/2422] Elapsed 4m 31s (remain

Epoch 3 - avg_train_loss: 0.0019  avg_val_loss: 0.0178  time: 734s
Epoch 3 - Score: 0.8856


Epoch: [4][0/2422] Elapsed 0m 0s (remain 21m 5s) Loss: 0.0000(0.0000) Grad: 223.7136  LR: 0.00000691  
Epoch: [4][100/2422] Elapsed 0m 30s (remain 11m 43s) Loss: 0.0063(0.0015) Grad: 20466.3145  LR: 0.00000666  
Epoch: [4][200/2422] Elapsed 1m 0s (remain 11m 10s) Loss: 0.0013(0.0016) Grad: 13402.0400  LR: 0.00000642  
Epoch: [4][300/2422] Elapsed 1m 30s (remain 10m 39s) Loss: 0.0004(0.0016) Grad: 5574.3643  LR: 0.00000618  
Epoch: [4][400/2422] Elapsed 2m 0s (remain 10m 9s) Loss: 0.0000(0.0015) Grad: 33.2391  LR: 0.00000594  
Epoch: [4][500/2422] Elapsed 2m 30s (remain 9m 38s) Loss: 0.0017(0.0015) Grad: 6292.2329  LR: 0.00000570  
Epoch: [4][600/2422] Elapsed 3m 0s (remain 9m 8s) Loss: 0.0023(0.0015) Grad: 7195.0273  LR: 0.00000547  
Epoch: [4][700/2422] Elapsed 3m 30s (remain 8m 38s) Loss: 0.0004(0.0015) Grad: 4349.5068  LR: 0.00000524  
Epoch: [4][800/2422] Elapsed 4m 0s (remain 8m 7s) Loss: 0.0035(0.0015) Grad: 9537.0342  LR: 0.00000502  
Epoch: [4][900/2422] Elapsed 4m 30s (remain 

Epoch 4 - avg_train_loss: 0.0015  avg_val_loss: 0.0182  time: 734s
Epoch 4 - Score: 0.8887


Epoch: [5][0/2422] Elapsed 0m 0s (remain 21m 16s) Loss: 0.0001(0.0001) Grad: 1020.7054  LR: 0.00000191  
Epoch: [5][100/2422] Elapsed 0m 30s (remain 11m 44s) Loss: 0.0050(0.0008) Grad: 18045.8984  LR: 0.00000176  
Epoch: [5][200/2422] Elapsed 1m 0s (remain 11m 11s) Loss: 0.0072(0.0009) Grad: 11677.6670  LR: 0.00000162  
Epoch: [5][300/2422] Elapsed 1m 30s (remain 10m 39s) Loss: 0.0016(0.0011) Grad: 7938.7051  LR: 0.00000148  
Epoch: [5][400/2422] Elapsed 2m 0s (remain 10m 9s) Loss: 0.0000(0.0012) Grad: 17.1642  LR: 0.00000134  
Epoch: [5][500/2422] Elapsed 2m 30s (remain 9m 38s) Loss: 0.0015(0.0012) Grad: 17042.2891  LR: 0.00000122  
Epoch: [5][600/2422] Elapsed 3m 1s (remain 9m 8s) Loss: 0.0000(0.0012) Grad: 30.0903  LR: 0.00000110  
Epoch: [5][700/2422] Elapsed 3m 31s (remain 8m 38s) Loss: 0.0012(0.0012) Grad: 15173.3818  LR: 0.00000098  
Epoch: [5][800/2422] Elapsed 4m 1s (remain 8m 8s) Loss: 0.0005(0.0012) Grad: 5265.5933  LR: 0.00000087  
Epoch: [5][900/2422] Elapsed 4m 31s (remai

Epoch 5 - avg_train_loss: 0.0012  avg_val_loss: 0.0187  time: 734s
Epoch 5 - Score: 0.8876
Score: 0.8892
Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/2447] Elapsed 0m 0s (remain 21m 12s) Loss: 0.8401(0.8401) Grad: inf  LR: 0.00002000  
Epoch: [1][100/2447] Elapsed 0m 30s (remain 11m 49s) Loss: 0.0172(0.0807) Grad: 2406.8347  LR: 0.00002000  
Epoch: [1][200/2447] Elapsed 1m 0s (remain 11m 17s) Loss: 0.0054(0.0450) Grad: 1385.9911  LR: 0.00001999  
Epoch: [1][300/2447] Elapsed 1m 30s (remain 10m 46s) Loss: 0.0064(0.0325) Grad: 915.4549  LR: 0.00001997  
Epoch: [1][400/2447] Elapsed 2m 0s (remain 10m 16s) Loss: 0.0025(0.0259) Grad: 340.9361  LR: 0.00001995  
Epoch: [1][500/2447] Elapsed 2m 30s (remain 9m 45s) Loss: 0.0015(0.0217) Grad: 486.7793  LR: 0.00001992  
Epoch: [1][600/2447] Elapsed 3m 0s (remain 9m 15s) Loss: 0.0015(0.0187) Grad: 239.1274  LR: 0.00001988  
Epoch: [1][700/2447] Elapsed 3m 30s (remain 8m 45s) Loss: 0.0143(0.0166) Grad: 2219.3738  LR: 0.00001984  
Epoch: [1][800/2447] Elapsed 4m 0s (remain 8m 15s) Loss: 0.0008(0.0151) Grad: 356.2041  LR: 0.00001979  
Epoch: [1][900/2447] Elapsed 4m 30s (remain 7m 44s

Epoch 1 - avg_train_loss: 0.0074  avg_val_loss: 0.0139  time: 738s
Epoch 1 - Score: 0.9051
Epoch 1 - Save Best Score: 0.9051 Model


EVAL: [9/10] Elapsed 0m 1s (remain 0m 0s) Loss: 0.0475(0.0139) 
Epoch: [2][0/2447] Elapsed 0m 0s (remain 22m 3s) Loss: 0.0004(0.0004) Grad: 2038.6448  LR: 0.00001809  
Epoch: [2][100/2447] Elapsed 0m 30s (remain 11m 52s) Loss: 0.0009(0.0022) Grad: 4669.9268  LR: 0.00001794  
Epoch: [2][200/2447] Elapsed 1m 0s (remain 11m 19s) Loss: 0.0007(0.0023) Grad: 2127.8743  LR: 0.00001778  
Epoch: [2][300/2447] Elapsed 1m 30s (remain 10m 47s) Loss: 0.0001(0.0021) Grad: 1056.6274  LR: 0.00001761  
Epoch: [2][400/2447] Elapsed 2m 0s (remain 10m 16s) Loss: 0.0011(0.0020) Grad: 5071.3701  LR: 0.00001744  
Epoch: [2][500/2447] Elapsed 2m 30s (remain 9m 46s) Loss: 0.0006(0.0021) Grad: 5274.2964  LR: 0.00001727  
Epoch: [2][600/2447] Elapsed 3m 1s (remain 9m 15s) Loss: 0.0028(0.0022) Grad: 18888.5918  LR: 0.00001709  
Epoch: [2][700/2447] Elapsed 3m 31s (remain 8m 45s) Loss: 0.0005(0.0021) Grad: 1885.2006  LR: 0.00001691  
Epoch: [2][800/2447] Elapsed 4m 1s (remain 8m 15s) Loss: 0.0022(0.0021) Grad: 596