In [1]:
# This is AWP + exp67 seed=2022, maxlen=512, PL + decreasing max len + last 8 hidden layers
# Psuedo labels for this code is here - https://www.kaggle.com/code/rashmibanthia/fb3-psuedo-labels-1-from-exp47b/notebook?scriptVersionId=107695932 
# these were generated from exp47.

# This has 5 fold split of fb1 dataset - https://www.kaggle.com/code/rashmibanthia/fb3-fb1-merged

colab=False
vastai=False
local = True

if local: 
  data_dir =  "/home/rashmi/Documents/kaggle/feedback3/"
  
if colab:
  from google.colab import drive
  drive.mount('/content/gdrive')
  data_dir = 'gdrive/MyDrive/kaggle/2022/feedback3'

  !mkdir -p feedback1
  !cp gdrive/MyDrive/kaggle/2022/feedback_prize/data/train.zip feedback1
  !unzip -oqq gdrive/MyDrive/kaggle/2022/feedback_prize/data/train.zip -d feedback1/

  FEEDBACK1_TRAIN_CSV = 'gdrive/MyDrive/kaggle/2022/feedback_prize/data/train.csv'
  FEEDBACK1_TRAIN_DIR = 'feedback1/train/'

  !pip install wandb > /dev/null
  !pip install tokenizers > /dev/null
  !pip install transformers > /dev/null
  !pip install sentencepiece > /dev/null
  !pip install tez > /dev/null
  !pip install datasets > /dev/null
    
if vastai:
    data_dir = '/workspace/feedback3/'

In [2]:
# ====================================================
# Directory settings
# ====================================================
import os
EXP_NAME = 'exp117a_PL_9'
OUTPUT_DIR = f'{data_dir}/src/models_' + EXP_NAME + "/"
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [3]:
import os, gc, pickle, math, time, random, copy, json
from glob import glob
import numpy as np
import pandas as pd
from sklearn import metrics
from tqdm import tqdm
import multiprocessing
from joblib import Parallel, delayed

from sklearn.model_selection import GroupKFold, StratifiedGroupKFold
from sklearn.metrics import log_loss

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from datasets import Dataset

# Mixed precision in Pytorch
from torch.cuda.amp import autocast, GradScaler

# For SWA
from torch.optim.swa_utils import AveragedModel, SWALR

from transformers import AutoConfig, AutoTokenizer, AutoModel
from transformers import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup, AdamW

import warnings
warnings.filterwarnings('ignore')

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

In [4]:
import transformers
transformers.__version__

'4.23.1'

In [5]:
class Config:
    # General settings
    competition_name = 'FeedbackPrize3'
    seed = 2022 #42
    debug = False
    train = True
    n_fold = 5
    print_freq = 100
    wandb = True
    val_strategy = "batch"
    val_steps = 250 #300
    # For model
    # model='gdrive/MyDrive/kaggle/2022/feedback2/pretrain/models_pretrain_debertav3large-expA/deberta-v3-large'
    # tokenizer_path='gdrive/MyDrive/kaggle/2022/feedback2/pretrain/models_pretrain_debertav3large-expA/deberta-v3-large/tokenizer'
    model = "microsoft/deberta-v3-large"
    tokenizer_path = "microsoft/deberta-v3-large"
    tokenizer = AutoTokenizer.from_pretrained(model)
    config = AutoConfig.from_pretrained(model)
    config.output_hidden_states = True
    config.hidden_dropout_prob = 0.
    config.attention_probs_dropout_prob = 0.
    
    scheduler='cosine'
    trn_fold = [0,1,2,3,4]
    max_len = 512
    batch_size = 4
    num_workers = os.cpu_count()
    # For training
    apex = True
    gradient_checkpointing = True
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    epochs = 4
    gradient_accumulation_steps = 1.
    max_grad_norm = 1000
    label_smoothing = 0.03
    num_labels=6
    # Optimizer
    lr = 1e-5
    weight_decay = 1e-2
    encoder_lr = 1e-5
    decoder_lr = 1e-5
    min_lr = 1e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    # Scheduler
    scheduler_type = 'cosine'    # 'linear', 'cosine'
    if scheduler_type == 'cosine':
        num_cycles = 0.5
    num_warmup_steps = 100
    batch_scheduler = True
    gpu_optimize_config_adam = False
    TRAIN_FOLDS = f'{data_dir}/input/train_folds.csv'
    # For AWP
    use_awp = True
    if use_awp:
        start_awp_epoch = 1 
        awp_score_check = 0.49
        adv_lr = 2e-5
        adv_eps = 1e-3 #1e-2
#         adv_step = 1
    else:
        start_awp_epoch = epochs + 1

CFG = Config()
CFG.tokenizer.add_special_tokens({'additional_special_tokens': ['[PARAGRAPH]']})
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
true_cols = ['cohesion', 'syntax', 'vocabulary','phraseology', 'grammar', 'conventions']
pred_cols = ['pred_cohesion', 'pred_syntax', 'pred_vocabulary','pred_phraseology', 'pred_grammar', 'pred_conventions']

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:
    
    import wandb

    try:
        # from kaggle_secrets import UserSecretsClient
        # user_secrets = UserSecretsClient()
        # secret_value_0 = user_secrets.get_secret("wandb_api")
        # wandb.login(key=secret_value_0)
        wandb.login(key='yourkeyhere')
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project=CFG.competition_name, 
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type=EXP_NAME, #"train",
                     anonymous=anony)

[34m[1mwandb[0m: Currently logged in as: [33mrashmi[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/rashmi/.netrc


In [7]:
"""# Random seed"""

def seed_everything(seed, use_cuda = True):
    np.random.seed(seed) # cpu vars
    torch.manual_seed(seed) # cpu  vars
    random.seed(seed) # Python
    os.environ['PYTHONHASHSEED'] = str(seed) # Python hash building
    if use_cuda:
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed) # gpu vars
        torch.backends.cudnn.deterministic = True  #needed
        torch.backends.cudnn.benchmark = False


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

seed_everything(CFG.seed)


In [8]:
all_pldata = pd.read_csv('/home/rashmi/Documents/kaggle/feedback3/input/fb3_fb1_merged/all_pldata.csv')
all_pldata.columns = ['text_id','full_text','token_length','kfold']
all_pldata

Unnamed: 0,text_id,full_text,token_length,kfold
0,423A1CA112E2,Phones Modern humans today are always on their...,442,0
1,A8445CABFECE,Phones & Driving Drivers should not be able to...,242,0
2,6B4F7A0165B9,Cell Phone Operation While Driving The ability...,364,0
3,E05C7F5C1156,People are debating whether if drivers should ...,671,0
4,50B3435E475B,Texting and driving Over half of drivers in to...,420,0
...,...,...,...,...
15137,0814426B27DF,Most people ask more than one person for advic...,512,4
15138,8F4B595CF9E7,Do you ever want more opinions and options whe...,639,4
15139,6B5809C83978,Has anyone ever gave you advice? Was the advic...,411,4
15140,AFEC37C2D43F,There has been at least one point in everyone'...,647,4


In [9]:
df_folds= pd.read_csv(CFG.TRAIN_FOLDS)
# df_folds['full_text'] = df_folds.full_text.apply(lambda x : resolve_encodings_and_normalize(x))
print(df_folds.shape)
df_folds.head(2)

(3911, 9)


Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,kfold
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,1
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,0


In [10]:
df_folds['full_text'] = df_folds.full_text.apply(lambda x: x.replace("\n\n"," [PARAGRAPH] "))

In [11]:
df_folds.kfold.value_counts(), df_folds[df_folds.kfold!=0].shape

(1    783
 0    782
 4    782
 3    782
 2    782
 Name: kfold, dtype: int64,
 (3129, 9))

In [12]:
def _prepare_training_data_helper(df, is_train):
      # print(df['full_text'])
      tok = CFG.tokenizer.encode(df['full_text'].values[0], add_special_tokens=True, max_length=CFG.max_len)
      label = df[true_cols].values[0]

      return {'input_ids': tok, 'label': label, 'essay_id': df['text_id'].values[0] }



def prepare_training_data(df, tokenizer, num_jobs, is_train):
    
    results = Parallel()(
        delayed(_prepare_training_data_helper)(gdf, is_train) for gn, gdf in tqdm(df.groupby('text_id'))
    )
    
    return results


In [13]:
folds = df_folds.copy()
fold=0
train_df = folds[folds["kfold"] != fold].reset_index(drop=True)#.head(500)
valid_df = folds[folds["kfold"] == fold].reset_index(drop=True)
NUM_JOBS = os.cpu_count()
training_samples_ = prepare_training_data(train_df, CFG.tokenizer, num_jobs=NUM_JOBS, is_train=True)

  0%|          | 0/3129 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 3129/3129 [00:02<00:00, 1302.85it/s]


In [14]:
class FeedbackDataset:
    def __init__(self, samples):
        self.samples = samples
        self.tokenizer = CFG.tokenizer

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        ids = self.samples[idx]["input_ids"]
        label = self.samples[idx]["label"]
        essay_id = self.samples[idx]["essay_id"]
        mask = [1] * len(ids)
        # print(len(ids),len(mask),len(label))
        return {
            "ids": ids,
            "mask": mask,
            "essay_id":essay_id,
            "targets": label,

        }



class Collate:
    def __init__(self, cfg):
        self.tokenizer = cfg.tokenizer
        self.cfg = cfg

    def __call__(self, batch):
        output = dict()
        output["ids"] = [sample["ids"] for sample in batch]
        output["mask"] = [sample["mask"] for sample in batch]
        output["essay_id"] = [sample["essay_id"] for sample in batch]
        output["targets"] = [sample["targets"] for sample in batch]
            
        # calculate max token length of this batch
        batch_max = max([len(ids) for ids in output["ids"]])
    
        # add padding
        if self.tokenizer.padding_side == "right":
            output["ids"] = [s + (batch_max - len(s)) * [self.tokenizer.pad_token_id] for s in output["ids"]]
            output["mask"] = [s + (batch_max - len(s)) * [0] for s in output["mask"]]
        else:
            output["ids"] = [(batch_max - len(s)) * [self.tokenizer.pad_token_id] + s for s in output["ids"]]
            output["mask"] = [(batch_max - len(s)) * [0] + s for s in output["mask"]]
        
        # convert to tensors
        output["ids"] = torch.tensor(output["ids"], dtype=torch.long)
        output["mask"] = torch.tensor(output["mask"], dtype=torch.long)
        output["targets"] = torch.tensor(output["targets"], dtype=torch.float)
       
        
        # print(output['ids'].shape, output['mask'].shape, output['targets'].shape)
        
        return output

In [15]:
train_dataset = FeedbackDataset(training_samples_)
collate_fn = Collate(CFG)
train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,  collate_fn=collate_fn, 
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
for step, inputs in enumerate(train_loader):
  # print(step)
  break

In [16]:
class FeedBackModel(nn.Module):
    def __init__(self, model_name):
        super().__init__()

        hidden_dropout_prob: float = 0.0
        self.model_config = AutoConfig.from_pretrained(model_name)

        self.model_config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout_prob": hidden_dropout_prob,
                "add_pooling_layer": False,
                "num_labels": CFG.num_labels,
                 "attention_probs_dropout_prob":0.0 
            }
        )

        self.model = AutoModel.from_pretrained(model_name, config=self.model_config)
        self.model.resize_token_embeddings(len(CFG.tokenizer))
        self.output = nn.Linear(self.model_config.hidden_size, CFG.num_labels)


    def forward(self, ids, mask, targets=None):
        transformer_out = self.model(input_ids=ids, attention_mask=mask)

        # Get CLS token for last 6 hidden layers
        h1 = transformer_out[1][-1][:,0,:].reshape((-1, 1, self.model_config.hidden_size)) #last 1 hidden state # CLS token (4,1,1024)
        h2 = transformer_out[1][-2][:,0,:].reshape((-1, 1, self.model_config.hidden_size)) #2nd last hidden state
        h3 = transformer_out[1][-3][:,0,:].reshape((-1, 1, self.model_config.hidden_size))
        h4 = transformer_out[1][-4][:,0,:].reshape((-1, 1, self.model_config.hidden_size))
        h5 = transformer_out[1][-5][:,0,:].reshape((-1, 1, self.model_config.hidden_size))
        h6 = transformer_out[1][-6][:,0,:].reshape((-1, 1, self.model_config.hidden_size))
        h7 = transformer_out[1][-7][:,0,:].reshape((-1, 1, self.model_config.hidden_size))
        h8 = transformer_out[1][-8][:,0,:].reshape((-1, 1, self.model_config.hidden_size))

        all_h = torch.cat([ h1, h2, h3, h4, h5, h6,h7,h8], 1)
        # Average CLS token for 6 hidden layers
        all_h_mean = torch.mean(all_h,1).reshape((-1, 1, self.model_config.hidden_size)) # torch.Size([bs, 1, hidden size])

        # Average CLS token for last hidden state + CLS token for 6 hidden layers 
        sequence_output = torch.mean(torch.cat([transformer_out.last_hidden_state[:,0,:].reshape((-1, 1, self.model_config.hidden_size)),
                                                all_h_mean],1),1)

        logits = self.output(sequence_output)
        return logits


In [17]:
# m = FeedBackModel(CFG.model)
# m(inputs['ids'], inputs['mask'])

In [18]:
class Loss_Fn(nn.Module):
    def __init__(self):
        super().__init__()
        self.lfn = nn.MSELoss()
    
    def forward(self, outputs, targets):
        loss = self.lfn(outputs, targets)
        return loss
        

In [19]:
from datasets.features.features import config

class AWP:
    def __init__(
        self,
        model,
        optimizer,
        adv_param = 'weight',
        adv_lr = 1,
        adv_eps = 0.2,
        start_step = 0,
        adv_step = 1,
        scaler = None
    ):
        self.model = model
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.start_step = start_step
        self.adv_step = adv_step
        self.backup = {}
        self.backup_eps = {}
        self.scaler = scaler

    def attack_backward(self, batch, epoch):
        
        criterion = Loss_Fn()
        
        if (self.adv_lr == 0) or (epoch < self.start_step):
            return None

        self._save()
        for i in range(self.adv_step):
            self._attack_step() 
            with autocast(enabled = CFG.apex):
                input_ids = batch['ids'].to(CFG.device)
                attention_mask = batch['mask'].to(CFG.device)
                # token_type_ids = batch['token_type_ids'].to(cfg.device)
                labels = batch['targets'].to(CFG.device)
                tr_logits = self.model(input_ids, attention_mask,  labels)
                
                loss = criterion(tr_logits, labels)
                adv_loss = loss
                if CFG.gradient_accumulation_steps > 1:
                    adv_loss = loss / CFG.gradient_accumulation_steps
                    
#                 adv_loss = adv_loss.mean()
            self.optimizer.zero_grad()
            self.scaler.scale(adv_loss).backward()
            
        self._restore()

    def _attack_step(self):
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                norm1 = torch.norm(param.grad)
                norm2 = torch.norm(param.data.detach())
                if norm1 != 0 and not torch.isnan(norm1):
                    r_at = self.adv_lr * param.grad / (norm1 + e) * (norm2 + e)
                    param.data.add_(r_at)
                    param.data = torch.min(
                        torch.max(param.data, self.backup_eps[name][0]), self.backup_eps[name][1]
                    )
                    
    def _save(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.data.clone()
                    grad_eps = self.adv_eps * param.abs().detach()
                    self.backup_eps[name] = (
                        self.backup[name] - grad_eps,
                        self.backup[name] + grad_eps,
                    )

    def _restore(self,):
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}
        self.backup_eps = {}

In [20]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

def get_score(outputs,targets): #outputs=preds, targets=groundtruth
    mcrmse = []
    for i in range(CFG.num_labels):
        mcrmse.append(
            metrics.mean_squared_error(
                targets[:, i],
                outputs[:, i],
                squared=False,
            ),
        )
    mcrmse = np.mean(mcrmse)
    return mcrmse # {"mcrmse": torch.tensor(mcrmse, device=device)}

In [21]:
def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, valid_loader=None, valid_idx=None, best_score=np.inf):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    # best_score = np.inf


    if CFG.use_awp:
        # Initialize AWP
        log_awp=True
        awp = AWP(model, optimizer, adv_lr = CFG.adv_lr, adv_eps = CFG.adv_eps, start_step = CFG.start_awp_epoch, scaler = scaler)


    for step, inputs in enumerate(train_loader):
        model.train()
        gc.collect()
        # for k, v in inputs.items():
        #     inputs[k] = v.to(device)
        inputs['ids'] = inputs['ids'].to(device)
        inputs['mask'] = inputs['mask'].to(device)

        labels = inputs['targets'].to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs['ids'],inputs['mask'])
            # print(y_preds.shape, labels.shape, labels.view(-1, 1).shape)
            loss = criterion(y_preds, labels) #.view(-1, 1))
            
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        
        
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()

        if CFG.use_awp and best_score <= CFG.awp_score_check:
            if best_score <= CFG.awp_score_check and log_awp:
                LOGGER.info(' Start AWP '.center(50, '-'))
                log_awp=False
            if (step + 1) % CFG.gradient_accumulation_steps == 0:
                awp.attack_backward(inputs, epoch)
        

        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
        
        if CFG.val_strategy=="batch": 
            if step % CFG.val_steps == 0 or step == (len(train_loader)-1): # or if last step 
                  # Validate here
                  avg_val_loss, predictions, output_map = valid_fn(valid_loader, model, criterion, device, valid_idx)
                  
                  valid_df = df_folds[df_folds.text_id.isin(valid_idx)].copy()
                  for i,c in enumerate(true_cols):
                      valid_df.loc[:, f'pred_{c}'] = valid_df['text_id'].apply(lambda x: output_map[x][i])
                
                  valid_labels = valid_df[true_cols].values
                  valid_preds = valid_df[pred_cols].values

                  score = get_score(valid_preds,valid_labels) 
                  save_preds = valid_df[pred_cols].values

                  if  score < best_score:
                      best_score = score
                      LOGGER.info(f'Epoch {epoch+1} - Step {step}/{len(train_loader)} - Save Best Score: {best_score:.4f} Model')
                      torch.save({'model': model.state_dict(),
                                  'predictions': save_preds},
                                  OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")
                      

    return losses.avg, best_score



def valid_fn(valid_loader, model, criterion, device, valid_idx):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, inputs in enumerate(valid_loader):
        inputs['ids'] = inputs['ids'].to(device)
        inputs['mask'] = inputs['mask'].to(device)

        labels = inputs['targets'].to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs['ids'],inputs['mask'])
            loss = criterion(y_preds, labels) #.view(-1, 1))
            
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        
        losses.update(loss.item(), batch_size)
        
        preds.append(y_preds.detach().to('cpu').numpy())

        end = time.time()
        if step == (len(valid_loader)-1): #print only at the end 
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    
    preds = np.vstack(preds) 

    output_map = {}
    for x, y in zip(valid_idx, preds):
        output_map[x] = y.tolist()
    
    return losses.avg, preds, output_map


In [22]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
     
    seed_everything(seed=CFG.seed)

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_df = folds[folds["kfold"] != fold].reset_index(drop=True)#.head(500)
    valid_df = folds[folds["kfold"] == fold].reset_index(drop=True)

    pl_data_fold = pd.read_csv(f'/home/rashmi/Documents/kaggle/feedback3/input/fb3-psuedo-labels-1/exp_PL1_from_exp47_fold{fold}.csv')
    pl_data_fold = pl_data_fold.merge(all_pldata[['text_id','kfold']],on='text_id',how='left')
    pl_data_fold = pl_data_fold[train_df.columns]
    train_df = pd.concat([train_df,pl_data_fold[pl_data_fold.kfold==fold]])

    NUM_JOBS = os.cpu_count()
    # training_samples_ = prepare_training_data(train_df, CFG.tokenizer, num_jobs=NUM_JOBS, is_train=True)
    # valid_samples_ = prepare_training_data(valid_df, CFG.tokenizer, num_jobs=NUM_JOBS, is_train=True)

    # training_samples, valid_samples = training_samples_, valid_samples_

    # valid_idx = []
    # for x in valid_samples:
    #     valid_idx.append(x['essay_id'])  
    
    # print(len(valid_idx), valid_df.shape)

    # train_dataset = FeedbackDataset(training_samples)
    # valid_dataset = FeedbackDataset(valid_samples)

    # collate_fn = Collate(CFG)

    # train_loader = DataLoader(train_dataset,
    #                           batch_size=CFG.batch_size,
    #                           shuffle=True,  collate_fn=collate_fn, 
    #                           num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    # valid_loader = DataLoader(valid_dataset,
    #                           batch_size=CFG.batch_size,
    #                           shuffle=False, collate_fn=collate_fn, 
    #                           num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
    # ====================================================
    # model & optimizer
    # ====================================================
    model = FeedBackModel(CFG.model) 
    torch.save(model.model_config, OUTPUT_DIR+'config.pth')
    model.to(device)

    if CFG.wandb: # this logs all gradients
        wandb.watch(model, log='all')
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
              'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
              'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
              'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    
    if CFG.gpu_optimize_config_adam:
        optimizer = bnb.optim.AdamW(optimizer_parameters, lr=CFG.encoder_lr, optim_bits=8)
    else:
        optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_df) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    # criterion = nn.BCEWithLogitsLoss(reduction="mean")
    # criterion = nn.CrossEntropyLoss()
    criterion = Loss_Fn() #MSECorrLoss()
    
    best_score = np.inf

    for epoch in range(CFG.epochs):
        if epoch==0:
            CFG.max_len = 768
        if epoch==1:
            CFG.max_len = 512
        if epoch==2:
            CFG.max_len = 512 
        else:
            CFG.max_len = 470
            
        training_samples_ = prepare_training_data(train_df, CFG.tokenizer, num_jobs=NUM_JOBS, is_train=True)
        valid_samples_ = prepare_training_data(valid_df, CFG.tokenizer, num_jobs=NUM_JOBS, is_train=True)

        training_samples, valid_samples = training_samples_, valid_samples_

        valid_idx = []
        for x in valid_samples:
            valid_idx.append(x['essay_id'])  

        print(len(valid_idx), valid_df.shape)

        train_dataset = FeedbackDataset(training_samples)
        valid_dataset = FeedbackDataset(valid_samples)

        collate_fn = Collate(CFG)

        train_loader = DataLoader(train_dataset,
                                  batch_size=CFG.batch_size,
                                  shuffle=True,  collate_fn=collate_fn, 
                                  num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
        valid_loader = DataLoader(valid_dataset,
                                  batch_size=CFG.batch_size,
                                  shuffle=False, collate_fn=collate_fn, 
                                  num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
        
        start_time = time.time()

        # train and validate
        avg_loss, best_score = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, valid_loader, valid_idx, best_score)


    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    
    valid_df['pred_cohesion'] = predictions[:,0]
    valid_df['pred_syntax'] = predictions[:,1]
    valid_df['pred_vocabulary'] = predictions[:,2]
  
    valid_df['pred_phraseology'] = predictions[:,3]
    valid_df['pred_grammar'] = predictions[:,4]
    valid_df['pred_conventions'] = predictions[:,5]
    

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_df

In [23]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df[true_cols].values
        preds = oof_df[pred_cols].values
        score = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(df_folds, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                _oof_df.to_csv(OUTPUT_DIR+f'oof_df{fold}.csv',index=False)
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        oof_df.to_csv(OUTPUT_DIR+'oof_df.csv',index=False)
        
    if CFG.wandb:
        wandb.finish()

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 615

782 (782, 9)





Epoch: [1][0/1539] Elapsed 0m 1s (remain 43m 4s) Loss: 8.8496(8.8496) Grad: inf  LR: 0.00000010  


Epoch 1 - Step 0/1539 - Save Best Score: 2.9045 Model


EVAL: [195/196] Elapsed 0m 23s (remain 0m 0s) Loss: 10.1507(8.5389) 
Epoch: [1][100/1539] Elapsed 1m 5s (remain 15m 26s) Loss: 0.3086(7.9381) Grad: 57168.4961  LR: 0.00001000  
Epoch: [1][200/1539] Elapsed 1m 42s (remain 11m 24s) Loss: 0.1747(4.1039) Grad: 18277.2246  LR: 0.00000999  


Epoch 1 - Step 250/1539 - Save Best Score: 0.4611 Model


EVAL: [195/196] Elapsed 0m 23s (remain 0m 0s) Loss: 0.2408(0.2132) 


------------------- Start AWP --------------------


Epoch: [1][300/1539] Elapsed 2m 45s (remain 11m 21s) Loss: 0.0678(2.7882) Grad: 13427.2432  LR: 0.00000997  
Epoch: [1][400/1539] Elapsed 3m 23s (remain 9m 37s) Loss: 0.2282(2.1275) Grad: 37235.1992  LR: 0.00000994  
Epoch: [1][500/1539] Elapsed 4m 1s (remain 8m 19s) Loss: 0.3853(1.7335) Grad: 42104.2148  LR: 0.00000989  
EVAL: [195/196] Elapsed 0m 24s (remain 0m 0s) Loss: 0.1537(0.2214) 
Epoch: [1][600/1539] Elapsed 5m 3s (remain 7m 54s) Loss: 0.2136(1.4672) Grad: 14389.0195  LR: 0.00000983  
Epoch: [1][700/1539] Elapsed 5m 41s (remain 6m 48s) Loss: 0.0457(1.2772) Grad: 10852.8164  LR: 0.00000976  
EVAL: [195/196] Elapsed 0m 23s (remain 0m 0s) Loss: 0.3294(0.2155) 
Epoch: [1][800/1539] Elapsed 6m 42s (remain 6m 10s) Loss: 0.0835(1.1322) Grad: 11563.7900  LR: 0.00000967  
Epoch: [1][900/1539] Elapsed 7m 19s (remain 5m 11s) Loss: 0.1053(1.0211) Grad: 15626.3145  LR: 0.00000957  
Epoch: [1][1000/1539] Elapsed 7m 57s (remain 4m 16s) Loss: 0.1740(0.9302) Grad: 17356.0176  LR: 0.00000946  


Epoch 1 - Step 1000/1539 - Save Best Score: 0.4527 Model


EVAL: [195/196] Elapsed 0m 23s (remain 0m 0s) Loss: 0.2769(0.2055) 
Epoch: [1][1100/1539] Elapsed 9m 1s (remain 3m 35s) Loss: 0.1519(0.8558) Grad: 23906.2793  LR: 0.00000934  
Epoch: [1][1200/1539] Elapsed 9m 38s (remain 2m 42s) Loss: 0.1297(0.7948) Grad: 29812.9180  LR: 0.00000921  


Epoch 1 - Step 1250/1539 - Save Best Score: 0.4511 Model


EVAL: [195/196] Elapsed 0m 23s (remain 0m 0s) Loss: 0.2071(0.2038) 
Epoch: [1][1300/1539] Elapsed 10m 42s (remain 1m 57s) Loss: 0.0747(0.7420) Grad: 19107.8828  LR: 0.00000906  
Epoch: [1][1400/1539] Elapsed 11m 21s (remain 1m 7s) Loss: 0.0718(0.6980) Grad: 6552.0317  LR: 0.00000890  
Epoch: [1][1500/1539] Elapsed 11m 59s (remain 0m 18s) Loss: 0.0630(0.6598) Grad: 8248.6934  LR: 0.00000874  
EVAL: [195/196] Elapsed 0m 23s (remain 0m 0s) Loss: 0.3329(0.2216) 
Epoch: [1][1538/1539] Elapsed 12m 37s (remain 0m 0s) Loss: 0.1513(0.6463) Grad: 14301.7979  LR: 0.00000867  


Epoch 1 - Step 1538/1539 - Save Best Score: 0.4486 Model


EVAL: [195/196] Elapsed 0m 24s (remain 0m 0s) Loss: 0.2339(0.2016) 


100%|██████████| 6158/6158 [00:05<00:00, 1159.80it/s]
100%|██████████| 782/782 [00:00<00:00, 1295.80it/s]


782 (782, 9)


------------------- Start AWP --------------------


Epoch: [2][0/1539] Elapsed 0m 1s (remain 34m 47s) Loss: 0.0841(0.0841) Grad: 180910.6250  LR: 0.00000867  


Epoch 2 - Step 0/1539 - Save Best Score: 0.4472 Model


EVAL: [195/196] Elapsed 0m 23s (remain 0m 0s) Loss: 0.2192(0.2004) 
Epoch: [2][100/1539] Elapsed 1m 30s (remain 21m 32s) Loss: 0.1033(0.1011) Grad: 104010.2031  LR: 0.00000849  
Epoch: [2][200/1539] Elapsed 2m 31s (remain 16m 46s) Loss: 0.1577(0.0988) Grad: 230135.3906  LR: 0.00000830  


Epoch 2 - Step 250/1539 - Save Best Score: 0.4444 Model


EVAL: [195/196] Elapsed 0m 22s (remain 0m 0s) Loss: 0.2490(0.1980) 
Epoch: [2][300/1539] Elapsed 3m 55s (remain 16m 8s) Loss: 0.0576(0.0932) Grad: 114803.9219  LR: 0.00000810  
Epoch: [2][400/1539] Elapsed 4m 55s (remain 13m 59s) Loss: 0.0933(0.0915) Grad: 123093.8203  LR: 0.00000789  
Epoch: [2][500/1539] Elapsed 5m 54s (remain 12m 14s) Loss: 0.1062(0.0906) Grad: 217553.6562  LR: 0.00000768  


Epoch 2 - Step 500/1539 - Save Best Score: 0.4433 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.2343(0.1969) 
Epoch: [2][600/1539] Elapsed 7m 18s (remain 11m 24s) Loss: 0.0553(0.0896) Grad: 104256.1953  LR: 0.00000745  
Epoch: [2][700/1539] Elapsed 8m 18s (remain 9m 55s) Loss: 0.0286(0.0890) Grad: 142210.8438  LR: 0.00000722  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.2684(0.2010) 
Epoch: [2][800/1539] Elapsed 9m 38s (remain 8m 53s) Loss: 0.0669(0.0890) Grad: 172894.3281  LR: 0.00000699  
Epoch: [2][900/1539] Elapsed 10m 37s (remain 7m 31s) Loss: 0.0158(0.0886) Grad: 40591.3789  LR: 0.00000675  
Epoch: [2][1000/1539] Elapsed 11m 36s (remain 6m 14s) Loss: 0.1208(0.0886) Grad: 161750.7656  LR: 0.00000650  


Epoch 2 - Step 1000/1539 - Save Best Score: 0.4432 Model


EVAL: [195/196] Elapsed 0m 22s (remain 0m 0s) Loss: 0.2372(0.1969) 
Epoch: [2][1100/1539] Elapsed 13m 0s (remain 5m 10s) Loss: 0.0297(0.0883) Grad: 64693.7070  LR: 0.00000625  
Epoch: [2][1200/1539] Elapsed 13m 59s (remain 3m 56s) Loss: 0.0225(0.0886) Grad: 72936.6172  LR: 0.00000600  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.2287(0.1978) 
Epoch: [2][1300/1539] Elapsed 15m 20s (remain 2m 48s) Loss: 0.0913(0.0888) Grad: 172778.8438  LR: 0.00000575  
Epoch: [2][1400/1539] Elapsed 16m 20s (remain 1m 36s) Loss: 0.0702(0.0884) Grad: 127212.8750  LR: 0.00000549  
Epoch: [2][1500/1539] Elapsed 17m 19s (remain 0m 26s) Loss: 0.0787(0.0885) Grad: 170000.8438  LR: 0.00000523  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.2904(0.1997) 
Epoch: [2][1538/1539] Elapsed 18m 4s (remain 0m 0s) Loss: 0.0298(0.0886) Grad: 87676.4609  LR: 0.00000513  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.2687(0.1977) 


100%|██████████| 6158/6158 [00:05<00:00, 1203.83it/s]
100%|██████████| 782/782 [00:00<00:00, 1302.97it/s]


782 (782, 9)


------------------- Start AWP --------------------


Epoch: [3][0/1539] Elapsed 0m 1s (remain 34m 52s) Loss: 0.0280(0.0280) Grad: 154193.5625  LR: 0.00000513  
EVAL: [195/196] Elapsed 0m 24s (remain 0m 0s) Loss: 0.2714(0.1979) 
Epoch: [3][100/1539] Elapsed 1m 28s (remain 21m 1s) Loss: 0.0735(0.0797) Grad: 134058.5469  LR: 0.00000487  
Epoch: [3][200/1539] Elapsed 2m 30s (remain 16m 39s) Loss: 0.0832(0.0820) Grad: 120968.8125  LR: 0.00000461  
EVAL: [195/196] Elapsed 0m 24s (remain 0m 0s) Loss: 0.2903(0.1989) 
Epoch: [3][300/1539] Elapsed 3m 56s (remain 16m 11s) Loss: 0.0920(0.0849) Grad: 182717.3594  LR: 0.00000435  
Epoch: [3][400/1539] Elapsed 4m 58s (remain 14m 7s) Loss: 0.0171(0.0863) Grad: 67799.5859  LR: 0.00000410  
Epoch: [3][500/1539] Elapsed 6m 2s (remain 12m 30s) Loss: 0.0309(0.0838) Grad: 88767.2266  LR: 0.00000384  
EVAL: [195/196] Elapsed 0m 24s (remain 0m 0s) Loss: 0.2877(0.1980) 
Epoch: [3][600/1539] Elapsed 7m 29s (remain 11m 41s) Loss: 0.1096(0.0840) Grad: 141939.7656  LR: 0.00000359  
Epoch: [3][700/1539] Elapsed 8m 31

Epoch 3 - Step 1250/1539 - Save Best Score: 0.4429 Model


EVAL: [195/196] Elapsed 0m 24s (remain 0m 0s) Loss: 0.2686(0.1967) 
Epoch: [3][1300/1539] Elapsed 16m 4s (remain 2m 56s) Loss: 0.1125(0.0817) Grad: 135844.1406  LR: 0.00000198  
Epoch: [3][1400/1539] Elapsed 17m 7s (remain 1m 41s) Loss: 0.2327(0.0825) Grad: 133709.1562  LR: 0.00000178  
Epoch: [3][1500/1539] Elapsed 18m 9s (remain 0m 27s) Loss: 0.1201(0.0825) Grad: 121151.6562  LR: 0.00000158  


Epoch 3 - Step 1500/1539 - Save Best Score: 0.4423 Model


EVAL: [195/196] Elapsed 0m 26s (remain 0m 0s) Loss: 0.2629(0.1962) 
Epoch: [3][1538/1539] Elapsed 19m 1s (remain 0m 0s) Loss: 0.0215(0.0822) Grad: 81386.6172  LR: 0.00000151  


Epoch 3 - Step 1538/1539 - Save Best Score: 0.4423 Model


EVAL: [195/196] Elapsed 0m 24s (remain 0m 0s) Loss: 0.2717(0.1961) 


100%|██████████| 6158/6158 [00:05<00:00, 1188.00it/s]
100%|██████████| 782/782 [00:00<00:00, 1287.96it/s]

782 (782, 9)



------------------- Start AWP --------------------


Epoch: [4][0/1539] Elapsed 0m 1s (remain 33m 12s) Loss: 0.1543(0.1543) Grad: inf  LR: 0.00000151  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.2700(0.1963) 
Epoch: [4][100/1539] Elapsed 1m 23s (remain 19m 49s) Loss: 0.0749(0.0787) Grad: 118536.9609  LR: 0.00000133  
Epoch: [4][200/1539] Elapsed 2m 23s (remain 15m 54s) Loss: 0.0516(0.0771) Grad: 72452.5312  LR: 0.00000116  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.2682(0.1970) 
Epoch: [4][300/1539] Elapsed 3m 46s (remain 15m 32s) Loss: 0.1660(0.0742) Grad: 185340.5938  LR: 0.00000100  
Epoch: [4][400/1539] Elapsed 4m 46s (remain 13m 34s) Loss: 0.0930(0.0759) Grad: 125548.1797  LR: 0.00000085  
Epoch: [4][500/1539] Elapsed 5m 47s (remain 11m 59s) Loss: 0.0340(0.0766) Grad: 83057.1484  LR: 0.00000071  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.3020(0.1998) 
Epoch: [4][600/1539] Elapsed 7m 9s (remain 11m 10s) Loss: 0.0805(0.0768) Grad: 148445.0000  LR: 0.00000058  
Epoch: [4][700/1539] Elapsed 8m 8s (rema

Score: 0.4423
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|█

783 (783, 9)





Epoch: [1][0/1539] Elapsed 0m 1s (remain 26m 49s) Loss: 10.6121(10.6121) Grad: inf  LR: 0.00000010  


Epoch 1 - Step 0/1539 - Save Best Score: 2.9460 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 9.0841(8.7851) 
Epoch: [1][100/1539] Elapsed 1m 0s (remain 14m 25s) Loss: 0.6579(8.1450) Grad: 48127.2930  LR: 0.00001000  
Epoch: [1][200/1539] Elapsed 1m 37s (remain 10m 45s) Loss: 0.0601(4.2382) Grad: 9421.9043  LR: 0.00000999  


Epoch 1 - Step 250/1539 - Save Best Score: 0.5038 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.2153(0.2578) 
Epoch: [1][300/1539] Elapsed 2m 36s (remain 10m 45s) Loss: 0.1441(2.8816) Grad: 9255.7275  LR: 0.00000997  
Epoch: [1][400/1539] Elapsed 3m 12s (remain 9m 6s) Loss: 0.1353(2.2041) Grad: 13474.5361  LR: 0.00000994  
Epoch: [1][500/1539] Elapsed 3m 48s (remain 7m 52s) Loss: 0.0439(1.7935) Grad: 6340.6787  LR: 0.00000989  


Epoch 1 - Step 500/1539 - Save Best Score: 0.4677 Model


EVAL: [195/196] Elapsed 0m 22s (remain 0m 0s) Loss: 0.2354(0.2193) 


------------------- Start AWP --------------------


Epoch: [1][600/1539] Elapsed 4m 48s (remain 7m 30s) Loss: 0.0286(1.5171) Grad: 5560.8086  LR: 0.00000983  
Epoch: [1][700/1539] Elapsed 5m 24s (remain 6m 27s) Loss: 0.1669(1.3194) Grad: 11775.3760  LR: 0.00000976  


Epoch 1 - Step 750/1539 - Save Best Score: 0.4658 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.2461(0.2175) 
Epoch: [1][800/1539] Elapsed 6m 23s (remain 5m 53s) Loss: 0.0427(1.1722) Grad: 5549.7832  LR: 0.00000967  
Epoch: [1][900/1539] Elapsed 6m 59s (remain 4m 56s) Loss: 0.7284(1.0572) Grad: 22687.5625  LR: 0.00000957  
Epoch: [1][1000/1539] Elapsed 7m 35s (remain 4m 4s) Loss: 0.1215(0.9652) Grad: 9848.5889  LR: 0.00000946  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.2153(0.2215) 
Epoch: [1][1100/1539] Elapsed 8m 33s (remain 3m 24s) Loss: 0.1488(0.8877) Grad: 7247.4688  LR: 0.00000934  
Epoch: [1][1200/1539] Elapsed 9m 8s (remain 2m 34s) Loss: 0.0875(0.8239) Grad: 4520.3887  LR: 0.00000921  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.2568(0.2261) 
Epoch: [1][1300/1539] Elapsed 10m 6s (remain 1m 50s) Loss: 0.0224(0.7723) Grad: 5179.2344  LR: 0.00000906  
Epoch: [1][1400/1539] Elapsed 10m 41s (remain 1m 3s) Loss: 0.1232(0.7249) Grad: 8524.2070  LR: 0.00000890  
Epoch: [1][1500/1539] Elapsed 11m 17s (rema

Epoch 1 - Step 1500/1539 - Save Best Score: 0.4575 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.2341(0.2098) 
Epoch: [1][1538/1539] Elapsed 11m 55s (remain 0m 0s) Loss: 0.1614(0.6716) Grad: 5879.0947  LR: 0.00000867  
EVAL: [195/196] Elapsed 0m 22s (remain 0m 0s) Loss: 0.2250(0.2100) 


100%|██████████| 6157/6157 [00:05<00:00, 1227.13it/s]
100%|██████████| 783/783 [00:00<00:00, 1312.67it/s]


783 (783, 9)


------------------- Start AWP --------------------


Epoch: [2][0/1539] Elapsed 0m 1s (remain 34m 44s) Loss: 0.0285(0.0285) Grad: 227089.0625  LR: 0.00000867  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.2300(0.2110) 
Epoch: [2][100/1539] Elapsed 1m 23s (remain 19m 50s) Loss: 0.0782(0.0910) Grad: 125796.3828  LR: 0.00000849  
Epoch: [2][200/1539] Elapsed 2m 23s (remain 15m 56s) Loss: 0.0949(0.0923) Grad: 146102.9531  LR: 0.00000830  


Epoch 2 - Step 250/1539 - Save Best Score: 0.4502 Model


EVAL: [195/196] Elapsed 0m 22s (remain 0m 0s) Loss: 0.2515(0.2032) 
Epoch: [2][300/1539] Elapsed 3m 48s (remain 15m 40s) Loss: 0.0052(0.0895) Grad: 35120.5938  LR: 0.00000810  
Epoch: [2][400/1539] Elapsed 4m 49s (remain 13m 40s) Loss: 0.1181(0.0897) Grad: 193459.1406  LR: 0.00000789  
Epoch: [2][500/1539] Elapsed 5m 49s (remain 12m 3s) Loss: 0.0753(0.0909) Grad: 140054.3750  LR: 0.00000768  


Epoch 2 - Step 500/1539 - Save Best Score: 0.4497 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.2384(0.2027) 
Epoch: [2][600/1539] Elapsed 7m 12s (remain 11m 15s) Loss: 0.2399(0.0900) Grad: 120523.7891  LR: 0.00000745  
Epoch: [2][700/1539] Elapsed 8m 13s (remain 9m 50s) Loss: 0.0620(0.0895) Grad: 123571.9844  LR: 0.00000722  


Epoch 2 - Step 750/1539 - Save Best Score: 0.4495 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.2450(0.2025) 
Epoch: [2][800/1539] Elapsed 9m 38s (remain 8m 52s) Loss: 0.0399(0.0934) Grad: 102663.0859  LR: 0.00000699  
Epoch: [2][900/1539] Elapsed 10m 38s (remain 7m 31s) Loss: 0.0440(0.0918) Grad: 87042.3125  LR: 0.00000675  
Epoch: [2][1000/1539] Elapsed 11m 38s (remain 6m 15s) Loss: 0.0737(0.0921) Grad: 80668.3594  LR: 0.00000650  
EVAL: [195/196] Elapsed 0m 22s (remain 0m 0s) Loss: 0.2282(0.2081) 
Epoch: [2][1100/1539] Elapsed 13m 1s (remain 5m 10s) Loss: 0.1183(0.0915) Grad: 88270.9453  LR: 0.00000625  
Epoch: [2][1200/1539] Elapsed 14m 1s (remain 3m 56s) Loss: 0.0497(0.0911) Grad: 70273.0625  LR: 0.00000600  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.2709(0.2040) 
Epoch: [2][1300/1539] Elapsed 15m 22s (remain 2m 48s) Loss: 0.0830(0.0909) Grad: 76964.3438  LR: 0.00000575  
Epoch: [2][1400/1539] Elapsed 16m 24s (remain 1m 36s) Loss: 0.0578(0.0911) Grad: 43238.1758  LR: 0.00000549  
Epoch: [2][1500/1539] Elapsed 

100%|██████████| 6157/6157 [00:05<00:00, 1198.37it/s]
100%|██████████| 783/783 [00:00<00:00, 1301.88it/s]


783 (783, 9)


------------------- Start AWP --------------------


Epoch: [3][0/1539] Elapsed 0m 1s (remain 35m 38s) Loss: 0.1949(0.1949) Grad: 344085.3438  LR: 0.00000513  


Epoch 3 - Step 0/1539 - Save Best Score: 0.4494 Model


EVAL: [195/196] Elapsed 0m 24s (remain 0m 0s) Loss: 0.2291(0.2024) 
Epoch: [3][100/1539] Elapsed 1m 31s (remain 21m 43s) Loss: 0.2329(0.0832) Grad: 134323.5938  LR: 0.00000487  
Epoch: [3][200/1539] Elapsed 2m 33s (remain 17m 2s) Loss: 0.0521(0.0864) Grad: 51454.2773  LR: 0.00000461  
EVAL: [195/196] Elapsed 0m 24s (remain 0m 0s) Loss: 0.2253(0.2029) 
Epoch: [3][300/1539] Elapsed 3m 59s (remain 16m 26s) Loss: 0.0499(0.0855) Grad: 63939.5312  LR: 0.00000435  
Epoch: [3][400/1539] Elapsed 5m 1s (remain 14m 16s) Loss: 0.0062(0.0843) Grad: 22739.7266  LR: 0.00000410  
Epoch: [3][500/1539] Elapsed 6m 4s (remain 12m 35s) Loss: 0.0522(0.0843) Grad: 39790.3906  LR: 0.00000384  
EVAL: [195/196] Elapsed 0m 24s (remain 0m 0s) Loss: 0.2313(0.2030) 
Epoch: [3][600/1539] Elapsed 7m 31s (remain 11m 44s) Loss: 0.1905(0.0831) Grad: 128200.6406  LR: 0.00000359  
Epoch: [3][700/1539] Elapsed 8m 33s (remain 10m 13s) Loss: 0.1037(0.0842) Grad: 76223.9609  LR: 0.00000334  
EVAL: [195/196] Elapsed 0m 25s (re

100%|██████████| 6157/6157 [00:05<00:00, 1200.12it/s]
100%|██████████| 783/783 [00:00<00:00, 1312.34it/s]


783 (783, 9)


------------------- Start AWP --------------------


Epoch: [4][0/1539] Elapsed 0m 1s (remain 34m 37s) Loss: 0.0056(0.0056) Grad: 65503.0391  LR: 0.00000151  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.2510(0.2034) 
Epoch: [4][100/1539] Elapsed 1m 21s (remain 19m 26s) Loss: 0.2063(0.0755) Grad: 188119.9219  LR: 0.00000133  
Epoch: [4][200/1539] Elapsed 2m 20s (remain 15m 35s) Loss: 0.0542(0.0722) Grad: 118136.6250  LR: 0.00000116  


Epoch 4 - Step 250/1539 - Save Best Score: 0.4493 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.2393(0.2023) 
Epoch: [4][300/1539] Elapsed 3m 45s (remain 15m 26s) Loss: 0.0057(0.0711) Grad: 26966.2363  LR: 0.00000100  
Epoch: [4][400/1539] Elapsed 4m 45s (remain 13m 30s) Loss: 0.1390(0.0725) Grad: 60284.1562  LR: 0.00000085  
Epoch: [4][500/1539] Elapsed 5m 46s (remain 11m 56s) Loss: 0.0066(0.0727) Grad: 29242.6465  LR: 0.00000071  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.2476(0.2031) 
Epoch: [4][600/1539] Elapsed 7m 8s (remain 11m 9s) Loss: 0.0142(0.0777) Grad: 28957.0723  LR: 0.00000058  
Epoch: [4][700/1539] Elapsed 8m 8s (remain 9m 43s) Loss: 0.1203(0.0780) Grad: 75589.8359  LR: 0.00000047  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.2435(0.2028) 
Epoch: [4][800/1539] Elapsed 9m 28s (remain 8m 43s) Loss: 0.1640(0.0771) Grad: 63749.5547  LR: 0.00000036  
Epoch: [4][900/1539] Elapsed 10m 27s (remain 7m 24s) Loss: 0.0638(0.0765) Grad: 57095.7578  LR: 0.00000027  
Epoch: [4][1000/1539] Elapsed 11m 27s 

Score: 0.4493
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|█

782 (782, 9)





Epoch: [1][0/1539] Elapsed 0m 1s (remain 26m 40s) Loss: 9.8026(9.8026) Grad: inf  LR: 0.00000010  


Epoch 1 - Step 0/1539 - Save Best Score: 2.9848 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 9.8770(9.0190) 
Epoch: [1][100/1539] Elapsed 1m 0s (remain 14m 16s) Loss: 0.2419(7.9128) Grad: 51256.1055  LR: 0.00001000  
Epoch: [1][200/1539] Elapsed 1m 35s (remain 10m 37s) Loss: 0.2205(4.0740) Grad: 49634.8711  LR: 0.00000999  


Epoch 1 - Step 250/1539 - Save Best Score: 0.4882 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1058(0.2389) 


------------------- Start AWP --------------------


Epoch: [1][300/1539] Elapsed 2m 34s (remain 10m 37s) Loss: 0.0491(2.7744) Grad: 10576.3838  LR: 0.00000997  
Epoch: [1][400/1539] Elapsed 3m 10s (remain 9m 0s) Loss: 0.1772(2.1219) Grad: 23042.4746  LR: 0.00000994  
Epoch: [1][500/1539] Elapsed 3m 46s (remain 7m 48s) Loss: 0.0571(1.7211) Grad: 12117.0586  LR: 0.00000989  
EVAL: [195/196] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0834(0.2463) 
Epoch: [1][600/1539] Elapsed 4m 46s (remain 7m 26s) Loss: 0.1833(1.4553) Grad: 38396.9492  LR: 0.00000983  
Epoch: [1][700/1539] Elapsed 5m 22s (remain 6m 25s) Loss: 0.0489(1.2652) Grad: 13993.6924  LR: 0.00000976  


Epoch 1 - Step 750/1539 - Save Best Score: 0.4704 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1092(0.2217) 
Epoch: [1][800/1539] Elapsed 6m 23s (remain 5m 53s) Loss: 0.0946(1.1228) Grad: 21492.4668  LR: 0.00000967  
Epoch: [1][900/1539] Elapsed 7m 0s (remain 4m 57s) Loss: 0.1401(1.0106) Grad: 16304.7119  LR: 0.00000957  
Epoch: [1][1000/1539] Elapsed 7m 37s (remain 4m 5s) Loss: 0.0388(0.9205) Grad: 11764.2773  LR: 0.00000946  


Epoch 1 - Step 1000/1539 - Save Best Score: 0.4621 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.0769(0.2142) 
Epoch: [1][1100/1539] Elapsed 8m 38s (remain 3m 26s) Loss: 0.2621(0.8499) Grad: 18473.2207  LR: 0.00000934  
Epoch: [1][1200/1539] Elapsed 9m 15s (remain 2m 36s) Loss: 0.0803(0.7894) Grad: 13791.4736  LR: 0.00000921  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1437(0.2255) 
Epoch: [1][1300/1539] Elapsed 10m 14s (remain 1m 52s) Loss: 0.0445(0.7375) Grad: 6583.9272  LR: 0.00000906  
Epoch: [1][1400/1539] Elapsed 10m 51s (remain 1m 4s) Loss: 0.1386(0.6936) Grad: 16260.3818  LR: 0.00000890  
Epoch: [1][1500/1539] Elapsed 11m 27s (remain 0m 17s) Loss: 0.0477(0.6552) Grad: 14773.9209  LR: 0.00000874  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.0891(0.2204) 
Epoch: [1][1538/1539] Elapsed 12m 3s (remain 0m 0s) Loss: 0.2017(0.6420) Grad: 17029.4707  LR: 0.00000867  
EVAL: [195/196] Elapsed 0m 22s (remain 0m 0s) Loss: 0.1031(0.2168) 


100%|██████████| 6157/6157 [00:04<00:00, 1239.04it/s]
100%|██████████| 782/782 [00:00<00:00, 1315.97it/s]


782 (782, 9)


------------------- Start AWP --------------------


Epoch: [2][0/1539] Elapsed 0m 1s (remain 34m 26s) Loss: 0.0870(0.0870) Grad: 322227.4688  LR: 0.00000867  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1158(0.2205) 
Epoch: [2][100/1539] Elapsed 1m 22s (remain 19m 34s) Loss: 0.0549(0.0930) Grad: 137054.7188  LR: 0.00000849  
Epoch: [2][200/1539] Elapsed 2m 22s (remain 15m 47s) Loss: 0.0659(0.0943) Grad: 82910.8516  LR: 0.00000830  


Epoch 2 - Step 250/1539 - Save Best Score: 0.4574 Model


EVAL: [195/196] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0719(0.2098) 
Epoch: [2][300/1539] Elapsed 3m 47s (remain 15m 34s) Loss: 0.0461(0.0909) Grad: 94110.4609  LR: 0.00000810  
Epoch: [2][400/1539] Elapsed 4m 46s (remain 13m 33s) Loss: 0.1121(0.0888) Grad: 200661.1250  LR: 0.00000789  
Epoch: [2][500/1539] Elapsed 5m 47s (remain 11m 59s) Loss: 0.0652(0.0867) Grad: 70831.1641  LR: 0.00000768  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.0626(0.2116) 
Epoch: [2][600/1539] Elapsed 7m 9s (remain 11m 10s) Loss: 0.0071(0.0846) Grad: 89333.2656  LR: 0.00000745  
Epoch: [2][700/1539] Elapsed 8m 10s (remain 9m 46s) Loss: 0.0838(0.0850) Grad: 160045.6094  LR: 0.00000722  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.0687(0.2105) 
Epoch: [2][800/1539] Elapsed 9m 32s (remain 8m 47s) Loss: 0.0908(0.0850) Grad: 122208.8750  LR: 0.00000699  
Epoch: [2][900/1539] Elapsed 10m 32s (remain 7m 28s) Loss: 0.0480(0.0849) Grad: 126129.5078  LR: 0.00000675  
Epoch: [2][1000/1539] Elapsed 11

Epoch 2 - Step 1000/1539 - Save Best Score: 0.4572 Model


EVAL: [195/196] Elapsed 0m 22s (remain 0m 0s) Loss: 0.0727(0.2096) 
Epoch: [2][1100/1539] Elapsed 12m 58s (remain 5m 9s) Loss: 0.0824(0.0870) Grad: 156053.0781  LR: 0.00000625  
Epoch: [2][1200/1539] Elapsed 13m 57s (remain 3m 55s) Loss: 0.0472(0.0868) Grad: 86658.1641  LR: 0.00000600  


Epoch 2 - Step 1250/1539 - Save Best Score: 0.4567 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.0856(0.2091) 
Epoch: [2][1300/1539] Elapsed 15m 20s (remain 2m 48s) Loss: 0.0586(0.0860) Grad: 104437.6875  LR: 0.00000575  
Epoch: [2][1400/1539] Elapsed 16m 20s (remain 1m 36s) Loss: 0.1084(0.0859) Grad: 176641.7656  LR: 0.00000549  
Epoch: [2][1500/1539] Elapsed 17m 20s (remain 0m 26s) Loss: 0.0030(0.0868) Grad: 40579.3594  LR: 0.00000523  


Epoch 2 - Step 1500/1539 - Save Best Score: 0.4563 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.0850(0.2088) 
Epoch: [2][1538/1539] Elapsed 18m 6s (remain 0m 0s) Loss: 0.0990(0.0865) Grad: 166984.5312  LR: 0.00000513  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.0698(0.2093) 


100%|██████████| 6157/6157 [00:05<00:00, 1211.32it/s]
100%|██████████| 782/782 [00:00<00:00, 1312.05it/s]


782 (782, 9)


------------------- Start AWP --------------------


Epoch: [3][0/1539] Elapsed 0m 1s (remain 35m 36s) Loss: 0.0560(0.0560) Grad: 235967.8906  LR: 0.00000513  


Epoch 3 - Step 0/1539 - Save Best Score: 0.4557 Model


EVAL: [195/196] Elapsed 0m 24s (remain 0m 0s) Loss: 0.0881(0.2083) 
Epoch: [3][100/1539] Elapsed 1m 30s (remain 21m 33s) Loss: 0.0525(0.0747) Grad: 81518.3281  LR: 0.00000487  
Epoch: [3][200/1539] Elapsed 2m 32s (remain 16m 58s) Loss: 0.0546(0.0769) Grad: 80855.2891  LR: 0.00000461  
EVAL: [195/196] Elapsed 0m 24s (remain 0m 0s) Loss: 0.1156(0.2087) 
Epoch: [3][300/1539] Elapsed 3m 59s (remain 16m 26s) Loss: 0.1028(0.0786) Grad: 185080.2188  LR: 0.00000435  
Epoch: [3][400/1539] Elapsed 5m 2s (remain 14m 19s) Loss: 0.0585(0.0789) Grad: 99592.3203  LR: 0.00000410  
Epoch: [3][500/1539] Elapsed 6m 6s (remain 12m 39s) Loss: 0.0155(0.0784) Grad: 68789.8984  LR: 0.00000384  
EVAL: [195/196] Elapsed 0m 24s (remain 0m 0s) Loss: 0.1017(0.2086) 
Epoch: [3][600/1539] Elapsed 7m 34s (remain 11m 49s) Loss: 0.0669(0.0786) Grad: 125810.1641  LR: 0.00000359  
Epoch: [3][700/1539] Elapsed 8m 37s (remain 10m 18s) Loss: 0.0149(0.0790) Grad: 51966.7773  LR: 0.00000334  
EVAL: [195/196] Elapsed 0m 25s (r

100%|██████████| 6157/6157 [00:05<00:00, 1133.53it/s]
100%|██████████| 782/782 [00:00<00:00, 1214.51it/s]

782 (782, 9)



------------------- Start AWP --------------------


Epoch: [4][0/1539] Elapsed 0m 1s (remain 35m 5s) Loss: 0.1137(0.1137) Grad: 248702.5469  LR: 0.00000151  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.0685(0.2103) 
Epoch: [4][100/1539] Elapsed 1m 22s (remain 19m 37s) Loss: 0.0884(0.0789) Grad: 124308.3594  LR: 0.00000133  
Epoch: [4][200/1539] Elapsed 2m 22s (remain 15m 49s) Loss: 0.0517(0.0745) Grad: 150350.2031  LR: 0.00000116  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.0851(0.2106) 
Epoch: [4][300/1539] Elapsed 3m 46s (remain 15m 29s) Loss: 0.0026(0.0721) Grad: 27309.8789  LR: 0.00000100  
Epoch: [4][400/1539] Elapsed 4m 46s (remain 13m 34s) Loss: 0.0660(0.0730) Grad: 100740.0391  LR: 0.00000085  
Epoch: [4][500/1539] Elapsed 5m 47s (remain 12m 0s) Loss: 0.1451(0.0741) Grad: 173338.2656  LR: 0.00000071  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.0780(0.2101) 
Epoch: [4][600/1539] Elapsed 7m 11s (remain 11m 13s) Loss: 0.0542(0.0734) Grad: 90397.2500  LR: 0.00000058  
Epoch: [4][700/1539] Elapsed 8m 1

Score: 0.4557
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|█

782 (782, 9)





Epoch: [1][0/1539] Elapsed 0m 1s (remain 26m 3s) Loss: 12.8599(12.8599) Grad: inf  LR: 0.00000010  


Epoch 1 - Step 0/1539 - Save Best Score: 2.9656 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 12.6761(8.9016) 
Epoch: [1][100/1539] Elapsed 1m 1s (remain 14m 31s) Loss: 0.5079(8.1187) Grad: 290934.6250  LR: 0.00001000  
Epoch: [1][200/1539] Elapsed 1m 38s (remain 10m 54s) Loss: 0.1490(4.1751) Grad: 59722.8008  LR: 0.00000999  


Epoch 1 - Step 250/1539 - Save Best Score: 0.5579 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.0817(0.3118) 
Epoch: [1][300/1539] Elapsed 2m 38s (remain 10m 50s) Loss: 0.1213(2.8440) Grad: 36555.4844  LR: 0.00000997  
Epoch: [1][400/1539] Elapsed 3m 14s (remain 9m 12s) Loss: 0.3310(2.1723) Grad: 127410.0703  LR: 0.00000994  
Epoch: [1][500/1539] Elapsed 3m 50s (remain 7m 58s) Loss: 0.2761(1.7688) Grad: 77649.0234  LR: 0.00000989  


Epoch 1 - Step 500/1539 - Save Best Score: 0.5039 Model


EVAL: [195/196] Elapsed 0m 22s (remain 0m 0s) Loss: 0.2019(0.2549) 
Epoch: [1][600/1539] Elapsed 4m 52s (remain 7m 36s) Loss: 0.1396(1.4967) Grad: 33833.2969  LR: 0.00000983  
Epoch: [1][700/1539] Elapsed 5m 28s (remain 6m 32s) Loss: 0.1486(1.3022) Grad: 61238.3906  LR: 0.00000976  


Epoch 1 - Step 750/1539 - Save Best Score: 0.4643 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1842(0.2163) 


------------------- Start AWP --------------------


Epoch: [1][800/1539] Elapsed 6m 28s (remain 5m 58s) Loss: 0.1686(1.1561) Grad: 43811.5586  LR: 0.00000967  
Epoch: [1][900/1539] Elapsed 7m 4s (remain 5m 0s) Loss: 0.1182(1.0411) Grad: 28054.0527  LR: 0.00000957  
Epoch: [1][1000/1539] Elapsed 7m 41s (remain 4m 7s) Loss: 0.1245(0.9493) Grad: 45420.9805  LR: 0.00000946  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1901(0.2239) 
Epoch: [1][1100/1539] Elapsed 8m 40s (remain 3m 26s) Loss: 0.1834(0.8760) Grad: 71590.0312  LR: 0.00000934  
Epoch: [1][1200/1539] Elapsed 9m 16s (remain 2m 36s) Loss: 0.0937(0.8131) Grad: 31313.5918  LR: 0.00000921  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1260(0.2211) 
Epoch: [1][1300/1539] Elapsed 10m 14s (remain 1m 52s) Loss: 0.0478(0.7590) Grad: 17585.3340  LR: 0.00000906  
Epoch: [1][1400/1539] Elapsed 10m 51s (remain 1m 4s) Loss: 0.0445(0.7138) Grad: 20662.6973  LR: 0.00000890  
Epoch: [1][1500/1539] Elapsed 11m 27s (remain 0m 17s) Loss: 0.1181(0.6740) Grad: 45424.8555  LR: 0.000008

Epoch 1 - Step 1538/1539 - Save Best Score: 0.4569 Model


EVAL: [195/196] Elapsed 0m 22s (remain 0m 0s) Loss: 0.2003(0.2093) 


100%|██████████| 6157/6157 [00:05<00:00, 1229.09it/s]
100%|██████████| 782/782 [00:00<00:00, 1293.97it/s]


782 (782, 9)


------------------- Start AWP --------------------


Epoch: [2][0/1539] Elapsed 0m 1s (remain 32m 23s) Loss: 0.2010(0.2010) Grad: 456180.5938  LR: 0.00000867  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.2133(0.2120) 
Epoch: [2][100/1539] Elapsed 1m 22s (remain 19m 35s) Loss: 0.1115(0.1091) Grad: 107808.3516  LR: 0.00000849  
Epoch: [2][200/1539] Elapsed 2m 22s (remain 15m 51s) Loss: 0.1197(0.0996) Grad: 130188.2188  LR: 0.00000830  


Epoch 2 - Step 250/1539 - Save Best Score: 0.4496 Model


EVAL: [195/196] Elapsed 0m 22s (remain 0m 0s) Loss: 0.1677(0.2028) 
Epoch: [2][300/1539] Elapsed 3m 48s (remain 15m 38s) Loss: 0.0359(0.1036) Grad: 60606.0859  LR: 0.00000810  
Epoch: [2][400/1539] Elapsed 4m 48s (remain 13m 39s) Loss: 0.0688(0.1018) Grad: 103291.5156  LR: 0.00000789  
Epoch: [2][500/1539] Elapsed 5m 49s (remain 12m 3s) Loss: 0.1879(0.0989) Grad: 99368.5234  LR: 0.00000768  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1714(0.2087) 
Epoch: [2][600/1539] Elapsed 7m 10s (remain 11m 12s) Loss: 0.0836(0.0985) Grad: 90697.5234  LR: 0.00000745  
Epoch: [2][700/1539] Elapsed 8m 13s (remain 9m 49s) Loss: 0.2895(0.0990) Grad: 62507.7734  LR: 0.00000722  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1828(0.2069) 
Epoch: [2][800/1539] Elapsed 9m 35s (remain 8m 50s) Loss: 0.0646(0.0994) Grad: 25209.9316  LR: 0.00000699  
Epoch: [2][900/1539] Elapsed 10m 35s (remain 7m 30s) Loss: 0.0837(0.0996) Grad: 34850.2500  LR: 0.00000675  
Epoch: [2][1000/1539] Elapsed 11m 3

100%|██████████| 6157/6157 [00:05<00:00, 1187.01it/s]
100%|██████████| 782/782 [00:00<00:00, 1280.53it/s]


782 (782, 9)


------------------- Start AWP --------------------


Epoch: [3][0/1539] Elapsed 0m 1s (remain 35m 8s) Loss: 0.1022(0.1022) Grad: 294377.8438  LR: 0.00000513  
EVAL: [195/196] Elapsed 0m 24s (remain 0m 0s) Loss: 0.1305(0.2070) 
Epoch: [3][100/1539] Elapsed 1m 30s (remain 21m 27s) Loss: 0.1121(0.0892) Grad: 188759.7656  LR: 0.00000487  
Epoch: [3][200/1539] Elapsed 2m 32s (remain 16m 55s) Loss: 0.0694(0.0840) Grad: 114350.1484  LR: 0.00000461  
EVAL: [195/196] Elapsed 0m 24s (remain 0m 0s) Loss: 0.1457(0.2032) 
Epoch: [3][300/1539] Elapsed 4m 0s (remain 16m 28s) Loss: 0.1966(0.0793) Grad: 229651.8125  LR: 0.00000435  
Epoch: [3][400/1539] Elapsed 5m 2s (remain 14m 17s) Loss: 0.0891(0.0795) Grad: 115649.6016  LR: 0.00000410  
Epoch: [3][500/1539] Elapsed 6m 6s (remain 12m 38s) Loss: 0.0871(0.0818) Grad: 96877.8125  LR: 0.00000384  
EVAL: [195/196] Elapsed 0m 24s (remain 0m 0s) Loss: 0.1330(0.2030) 
Epoch: [3][600/1539] Elapsed 7m 33s (remain 11m 48s) Loss: 0.1172(0.0810) Grad: 101203.4219  LR: 0.00000359  
Epoch: [3][700/1539] Elapsed 8m 36

100%|██████████| 6157/6157 [00:05<00:00, 1185.24it/s]
100%|██████████| 782/782 [00:00<00:00, 1290.43it/s]


782 (782, 9)


------------------- Start AWP --------------------


Epoch: [4][0/1539] Elapsed 0m 1s (remain 35m 40s) Loss: 0.1275(0.1275) Grad: 246645.7188  LR: 0.00000151  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1629(0.2036) 
Epoch: [4][100/1539] Elapsed 1m 23s (remain 19m 48s) Loss: 0.0810(0.0748) Grad: 226945.6719  LR: 0.00000133  
Epoch: [4][200/1539] Elapsed 2m 23s (remain 15m 58s) Loss: 0.1075(0.0656) Grad: 243416.8750  LR: 0.00000116  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1510(0.2027) 
Epoch: [4][300/1539] Elapsed 3m 47s (remain 15m 34s) Loss: 0.0333(0.0669) Grad: 83671.4922  LR: 0.00000100  
Epoch: [4][400/1539] Elapsed 4m 48s (remain 13m 37s) Loss: 0.0551(0.0678) Grad: 121773.0234  LR: 0.00000085  
Epoch: [4][500/1539] Elapsed 5m 48s (remain 12m 2s) Loss: 0.0560(0.0690) Grad: 93942.7812  LR: 0.00000071  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1474(0.2028) 
Epoch: [4][600/1539] Elapsed 7m 11s (remain 11m 13s) Loss: 0.0759(0.0702) Grad: 150651.7500  LR: 0.00000058  
Epoch: [4][700/1539] Elapsed 8m 

Score: 0.4496
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|█

782 (782, 9)





Epoch: [1][0/1539] Elapsed 0m 1s (remain 26m 41s) Loss: 10.9333(10.9333) Grad: inf  LR: 0.00000010  


Epoch 1 - Step 0/1539 - Save Best Score: 2.9818 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 9.0209(9.0020) 
Epoch: [1][100/1539] Elapsed 1m 1s (remain 14m 31s) Loss: 0.3195(8.1398) Grad: 46228.6523  LR: 0.00001000  
Epoch: [1][200/1539] Elapsed 1m 38s (remain 10m 53s) Loss: 0.1523(4.1826) Grad: 36210.8750  LR: 0.00000999  


Epoch 1 - Step 250/1539 - Save Best Score: 0.4892 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1888(0.2403) 


------------------- Start AWP --------------------


Epoch: [1][300/1539] Elapsed 2m 37s (remain 10m 48s) Loss: 0.1601(2.8501) Grad: 39358.2031  LR: 0.00000997  
Epoch: [1][400/1539] Elapsed 3m 13s (remain 9m 9s) Loss: 0.2930(2.1772) Grad: 32382.6250  LR: 0.00000994  
Epoch: [1][500/1539] Elapsed 3m 49s (remain 7m 54s) Loss: 0.0769(1.7710) Grad: 13112.9453  LR: 0.00000989  


Epoch 1 - Step 500/1539 - Save Best Score: 0.4687 Model


EVAL: [195/196] Elapsed 0m 22s (remain 0m 0s) Loss: 0.1746(0.2203) 
Epoch: [1][600/1539] Elapsed 4m 50s (remain 7m 32s) Loss: 0.1177(1.4982) Grad: 15370.6738  LR: 0.00000983  
Epoch: [1][700/1539] Elapsed 5m 26s (remain 6m 30s) Loss: 0.0352(1.3014) Grad: 9315.8232  LR: 0.00000976  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1912(0.2227) 
Epoch: [1][800/1539] Elapsed 6m 24s (remain 5m 54s) Loss: 0.2212(1.1548) Grad: 20841.5820  LR: 0.00000967  
Epoch: [1][900/1539] Elapsed 7m 0s (remain 4m 57s) Loss: 0.0893(1.0396) Grad: 16884.2871  LR: 0.00000957  
Epoch: [1][1000/1539] Elapsed 7m 36s (remain 4m 5s) Loss: 0.0972(0.9470) Grad: 15633.8623  LR: 0.00000946  


Epoch 1 - Step 1000/1539 - Save Best Score: 0.4619 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.2094(0.2138) 
Epoch: [1][1100/1539] Elapsed 8m 37s (remain 3m 26s) Loss: 0.1019(0.8728) Grad: 21465.0195  LR: 0.00000934  
Epoch: [1][1200/1539] Elapsed 9m 14s (remain 2m 36s) Loss: 0.2161(0.8103) Grad: 15186.3125  LR: 0.00000921  


Epoch 1 - Step 1250/1539 - Save Best Score: 0.4541 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1272(0.2068) 
Epoch: [1][1300/1539] Elapsed 10m 14s (remain 1m 52s) Loss: 0.1583(0.7578) Grad: 24000.1777  LR: 0.00000906  
Epoch: [1][1400/1539] Elapsed 10m 50s (remain 1m 4s) Loss: 0.1356(0.7114) Grad: 15447.2686  LR: 0.00000890  
Epoch: [1][1500/1539] Elapsed 11m 28s (remain 0m 17s) Loss: 0.0557(0.6722) Grad: 10375.4434  LR: 0.00000874  


Epoch 1 - Step 1500/1539 - Save Best Score: 0.4475 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1592(0.2007) 
Epoch: [1][1538/1539] Elapsed 12m 5s (remain 0m 0s) Loss: 0.0848(0.6584) Grad: 16190.4111  LR: 0.00000867  
EVAL: [195/196] Elapsed 0m 22s (remain 0m 0s) Loss: 0.1826(0.2078) 


100%|██████████| 6157/6157 [00:04<00:00, 1254.06it/s]
100%|██████████| 782/782 [00:00<00:00, 1308.88it/s]


782 (782, 9)


------------------- Start AWP --------------------


Epoch: [2][0/1539] Elapsed 0m 1s (remain 34m 59s) Loss: 0.0755(0.0755) Grad: 272091.4688  LR: 0.00000867  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1811(0.2077) 
Epoch: [2][100/1539] Elapsed 1m 22s (remain 19m 33s) Loss: 0.0676(0.0974) Grad: 70843.9219  LR: 0.00000849  
Epoch: [2][200/1539] Elapsed 2m 21s (remain 15m 40s) Loss: 0.0345(0.0929) Grad: 72870.9531  LR: 0.00000830  


Epoch 2 - Step 250/1539 - Save Best Score: 0.4467 Model


EVAL: [195/196] Elapsed 0m 22s (remain 0m 0s) Loss: 0.1315(0.2000) 
Epoch: [2][300/1539] Elapsed 3m 45s (remain 15m 25s) Loss: 0.0929(0.0922) Grad: 121632.7656  LR: 0.00000810  
Epoch: [2][400/1539] Elapsed 4m 44s (remain 13m 26s) Loss: 0.0923(0.0916) Grad: 140961.7188  LR: 0.00000789  
Epoch: [2][500/1539] Elapsed 5m 43s (remain 11m 52s) Loss: 0.0771(0.0912) Grad: 113142.6172  LR: 0.00000768  


Epoch 2 - Step 500/1539 - Save Best Score: 0.4451 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1413(0.1985) 
Epoch: [2][600/1539] Elapsed 7m 8s (remain 11m 8s) Loss: 0.1670(0.0906) Grad: 105371.7812  LR: 0.00000745  
Epoch: [2][700/1539] Elapsed 8m 9s (remain 9m 45s) Loss: 0.0239(0.0888) Grad: 84871.6172  LR: 0.00000722  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1272(0.2005) 
Epoch: [2][800/1539] Elapsed 9m 30s (remain 8m 46s) Loss: 0.0831(0.0890) Grad: 142478.4062  LR: 0.00000699  
Epoch: [2][900/1539] Elapsed 10m 30s (remain 7m 26s) Loss: 0.1235(0.0886) Grad: 132466.5312  LR: 0.00000675  
Epoch: [2][1000/1539] Elapsed 11m 29s (remain 6m 10s) Loss: 0.2011(0.0895) Grad: 112298.4453  LR: 0.00000650  
EVAL: [195/196] Elapsed 0m 22s (remain 0m 0s) Loss: 0.1355(0.1986) 
Epoch: [2][1100/1539] Elapsed 12m 53s (remain 5m 7s) Loss: 0.0586(0.0897) Grad: 83806.8125  LR: 0.00000625  
Epoch: [2][1200/1539] Elapsed 13m 53s (remain 3m 54s) Loss: 0.0507(0.0896) Grad: 80747.8828  LR: 0.00000600  


Epoch 2 - Step 1250/1539 - Save Best Score: 0.4445 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1554(0.1979) 
Epoch: [2][1300/1539] Elapsed 15m 18s (remain 2m 48s) Loss: 0.0914(0.0892) Grad: 150266.0938  LR: 0.00000575  
Epoch: [2][1400/1539] Elapsed 16m 20s (remain 1m 36s) Loss: 0.0812(0.0892) Grad: 109347.0781  LR: 0.00000549  
Epoch: [2][1500/1539] Elapsed 17m 19s (remain 0m 26s) Loss: 0.0757(0.0892) Grad: 113348.1641  LR: 0.00000523  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1253(0.1987) 
Epoch: [2][1538/1539] Elapsed 18m 3s (remain 0m 0s) Loss: 0.0450(0.0892) Grad: 76725.2188  LR: 0.00000513  


Epoch 2 - Step 1538/1539 - Save Best Score: 0.4430 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1372(0.1966) 


100%|██████████| 6157/6157 [00:05<00:00, 1139.75it/s]
100%|██████████| 782/782 [00:00<00:00, 1198.80it/s]

782 (782, 9)



------------------- Start AWP --------------------


Epoch: [3][0/1539] Elapsed 0m 1s (remain 36m 6s) Loss: 0.0331(0.0331) Grad: 188270.7344  LR: 0.00000513  


Epoch 3 - Step 0/1539 - Save Best Score: 0.4429 Model


EVAL: [195/196] Elapsed 0m 24s (remain 0m 0s) Loss: 0.1340(0.1966) 
Epoch: [3][100/1539] Elapsed 1m 31s (remain 21m 37s) Loss: 0.1133(0.0798) Grad: 273234.8750  LR: 0.00000487  
Epoch: [3][200/1539] Elapsed 2m 33s (remain 17m 3s) Loss: 0.0292(0.0800) Grad: 201894.4688  LR: 0.00000461  
EVAL: [195/196] Elapsed 0m 24s (remain 0m 0s) Loss: 0.1435(0.1968) 
Epoch: [3][300/1539] Elapsed 4m 0s (remain 16m 27s) Loss: 0.0383(0.0807) Grad: 119773.1016  LR: 0.00000435  
Epoch: [3][400/1539] Elapsed 5m 3s (remain 14m 20s) Loss: 0.0609(0.0803) Grad: 127953.4922  LR: 0.00000410  
Epoch: [3][500/1539] Elapsed 6m 7s (remain 12m 41s) Loss: 0.0796(0.0831) Grad: 129478.7812  LR: 0.00000384  


Epoch 3 - Step 500/1539 - Save Best Score: 0.4429 Model


EVAL: [195/196] Elapsed 0m 24s (remain 0m 0s) Loss: 0.1376(0.1965) 
Epoch: [3][600/1539] Elapsed 7m 37s (remain 11m 53s) Loss: 0.0907(0.0834) Grad: 152783.3594  LR: 0.00000359  
Epoch: [3][700/1539] Elapsed 8m 39s (remain 10m 21s) Loss: 0.0966(0.0837) Grad: 46897.5547  LR: 0.00000334  
EVAL: [195/196] Elapsed 0m 25s (remain 0m 0s) Loss: 0.1234(0.2001) 
Epoch: [3][800/1539] Elapsed 10m 8s (remain 9m 20s) Loss: 0.0778(0.0829) Grad: 43258.3906  LR: 0.00000310  
Epoch: [3][900/1539] Elapsed 11m 10s (remain 7m 54s) Loss: 0.0956(0.0828) Grad: 77340.7031  LR: 0.00000287  
Epoch: [3][1000/1539] Elapsed 12m 13s (remain 6m 34s) Loss: 0.0866(0.0832) Grad: 68579.3594  LR: 0.00000263  
EVAL: [195/196] Elapsed 0m 24s (remain 0m 0s) Loss: 0.1451(0.1967) 
Epoch: [3][1100/1539] Elapsed 13m 41s (remain 5m 26s) Loss: 0.0060(0.0829) Grad: 49390.4766  LR: 0.00000241  
Epoch: [3][1200/1539] Elapsed 14m 46s (remain 4m 9s) Loss: 0.0394(0.0830) Grad: 53969.0039  LR: 0.00000219  
EVAL: [195/196] Elapsed 0m 24s 

100%|██████████| 6157/6157 [00:05<00:00, 1145.19it/s]
100%|██████████| 782/782 [00:00<00:00, 1202.11it/s]

782 (782, 9)



------------------- Start AWP --------------------


Epoch: [4][0/1539] Elapsed 0m 1s (remain 35m 29s) Loss: 0.0967(0.0967) Grad: 232378.1406  LR: 0.00000151  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1398(0.1976) 
Epoch: [4][100/1539] Elapsed 1m 24s (remain 19m 59s) Loss: 0.1519(0.0851) Grad: 356525.2812  LR: 0.00000133  
Epoch: [4][200/1539] Elapsed 2m 24s (remain 15m 59s) Loss: 0.0027(0.0765) Grad: 107570.4766  LR: 0.00000116  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1409(0.1966) 
Epoch: [4][300/1539] Elapsed 3m 46s (remain 15m 33s) Loss: 0.0386(0.0747) Grad: 182123.1094  LR: 0.00000100  
Epoch: [4][400/1539] Elapsed 4m 46s (remain 13m 34s) Loss: 0.2092(0.0739) Grad: 371399.0000  LR: 0.00000085  
Epoch: [4][500/1539] Elapsed 5m 46s (remain 11m 58s) Loss: 0.1808(0.0755) Grad: 299562.7812  LR: 0.00000071  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1406(0.1974) 
Epoch: [4][600/1539] Elapsed 7m 10s (remain 11m 11s) Loss: 0.0549(0.0754) Grad: 220431.7344  LR: 0.00000058  
Epoch: [4][700/1539] Elapsed 

Epoch 4 - Step 750/1539 - Save Best Score: 0.4429 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1449(0.1965) 
Epoch: [4][800/1539] Elapsed 9m 34s (remain 8m 49s) Loss: 0.0042(0.0745) Grad: 128234.0312  LR: 0.00000036  
Epoch: [4][900/1539] Elapsed 10m 35s (remain 7m 30s) Loss: 0.2112(0.0747) Grad: 414776.8750  LR: 0.00000027  
Epoch: [4][1000/1539] Elapsed 11m 36s (remain 6m 14s) Loss: 0.0832(0.0749) Grad: 123096.0781  LR: 0.00000019  


Epoch 4 - Step 1000/1539 - Save Best Score: 0.4427 Model


EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1443(0.1964) 
Epoch: [4][1100/1539] Elapsed 13m 1s (remain 5m 10s) Loss: 0.0543(0.0748) Grad: 102284.5312  LR: 0.00000013  
Epoch: [4][1200/1539] Elapsed 14m 2s (remain 3m 57s) Loss: 0.0021(0.0756) Grad: 35553.8945  LR: 0.00000008  
EVAL: [195/196] Elapsed 0m 22s (remain 0m 0s) Loss: 0.1415(0.1965) 
Epoch: [4][1300/1539] Elapsed 15m 25s (remain 2m 49s) Loss: 0.1578(0.0758) Grad: 165586.1875  LR: 0.00000004  
Epoch: [4][1400/1539] Elapsed 16m 26s (remain 1m 37s) Loss: 0.0181(0.0766) Grad: 32381.0156  LR: 0.00000001  
Epoch: [4][1500/1539] Elapsed 17m 26s (remain 0m 26s) Loss: 0.0340(0.0765) Grad: 47045.3398  LR: 0.00000000  
EVAL: [195/196] Elapsed 0m 21s (remain 0m 0s) Loss: 0.1412(0.1966) 
Epoch: [4][1538/1539] Elapsed 18m 10s (remain 0m 0s) Loss: 0.0953(0.0764) Grad: 58932.4844  LR: 0.00000000  
EVAL: [195/196] Elapsed 0m 22s (remain 0m 0s) Loss: 0.1412(0.1966) 


Score: 0.4427
Score: 0.4480


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
[fold0] loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[fold0] lr,▅██████▇▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
[fold1] loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[fold1] lr,▅██████▇▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
[fold2] loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[fold2] lr,▅██████▇▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
[fold3] loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[fold3] lr,▅██████▇▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
[fold4] loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[fold4] lr,▅██████▇▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁

0,1
[fold0] loss,0.06943
[fold0] lr,0.0
[fold1] loss,0.07988
[fold1] lr,0.0
[fold2] loss,0.01709
[fold2] lr,0.0
[fold3] loss,0.0839
[fold3] lr,0.0
[fold4] loss,0.09529
[fold4] lr,0.0


In [24]:
CFG.tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')

('/home/rashmi/Documents/kaggle/feedback3//src/models_exp117a_PL_9/tokenizer/tokenizer_config.json',
 '/home/rashmi/Documents/kaggle/feedback3//src/models_exp117a_PL_9/tokenizer/special_tokens_map.json',
 '/home/rashmi/Documents/kaggle/feedback3//src/models_exp117a_PL_9/tokenizer/spm.model',
 '/home/rashmi/Documents/kaggle/feedback3//src/models_exp117a_PL_9/tokenizer/added_tokens.json',
 '/home/rashmi/Documents/kaggle/feedback3//src/models_exp117a_PL_9/tokenizer/tokenizer.json')

In [25]:
!nvidia-smi

Thu Nov 17 03:55:01 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.56.06    Driver Version: 520.56.06    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  Off |
| 31%   47C    P2    72W / 450W |  10847MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [26]:
if colab:
    from google.colab import runtime
    runtime.unassign()