## Setup

In [1]:
!pip install wandb
!pip install transformers[deepspeed]

[0m

In [2]:
from datasets import Dataset,DatasetDict
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
from transformers import AutoModelForSequenceClassification,AutoTokenizer, AutoConfig
from transformers import TrainingArguments, Trainer
import torch
import os
import random
from transformers import EvalPrediction, TrainerCallback
import torch
from sklearn.metrics import mean_squared_error
import gc
import transformers
import wandb

"""
%%bash
cat <<'EOT' > ds_config_zero3.json
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
EOT
"""

In [3]:
# Authenticate with Weights & Biases up front; the training runs below report to it.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33msimveit[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Constants

In [4]:
# W&B project name for this notebook; presumably read by the Trainer's wandb integration - verify.
%env WANDB_PROJECT=FP3

env: WANDB_PROJECT=FP3


In [5]:
# DeepSpeed requires a distributed environment even when only one process is used.
# This emulates a launcher in the notebook
#os.environ["MASTER_ADDR"] = "localhost"
#os.environ["MASTER_PORT"] = "9994"  # modify if RuntimeError: Address already in use
#os.environ["RANK"] = "0"
#os.environ["LOCAL_RANK"] = "0"
#os.environ["WORLD_SIZE"] = "1"
#os.environ["TOKENIZERS_PARALLELISM"] = "1"

In [6]:
# Root directory of the project on the notebook host; data paths are built from it.
BASE_PATH = "/notebooks"

In [7]:
# Directory under BASE_PATH that holds the cross-validation data.
CV_PATH = os.path.join(BASE_PATH, 'data/train/cv')

In [8]:
def seed_everything(seed=1234):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices) and
    configures cuDNN for deterministic behaviour.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point; the hash seed of the
    # running process is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (safe without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels from run to run, so switch it off.
    torch.backends.cudnn.benchmark = False

# Notebook-wide seed, applied once to all random number generators.
SEED = 28
seed_everything(seed=SEED)
# Maximum token length handed to the tokenizer; longer texts are truncated.
MAX_LENGTH = 256

In [9]:
# Target columns, in the order used for the label matrix and the model outputs.
LABELS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
# Lookup tables between a target name and its position in LABELS.
id2label = dict(enumerate(LABELS))
label2id = {name: position for position, name in id2label.items()}

## Functions

In [10]:
# Metric
def mcrmse(y_trues, y_preds):
    """Mean column-wise root mean squared error (MCRMSE).

    The RMSE is computed separately for each target column and the per-column
    values are averaged with equal weight.

    Args:
        y_trues: array-like of shape (n_samples, n_targets) with reference values.
        y_preds: array-like of the same shape with predictions. Anything
            ``np.asarray`` can convert is accepted (lists, arrays, CPU tensors).

    Returns:
        The MCRMSE as a NumPy float64 scalar.

    Raises:
        ValueError: if the inputs are not two-dimensional or their shapes differ.
    """
    y_trues = np.asarray(y_trues, dtype=np.float64)
    y_preds = np.asarray(y_preds, dtype=np.float64)
    if y_trues.ndim != 2 or y_trues.shape != y_preds.shape:
        raise ValueError(
            "mcrmse expects two 2-D inputs of identical shape, got "
            f"{y_trues.shape} and {y_preds.shape}"
        )
    # Computed with NumPy directly: `mean_squared_error(..., squared=False)` is
    # deprecated since scikit-learn 1.4 and rejected by later releases.
    column_rmse = np.sqrt(np.mean((y_trues - y_preds) ** 2, axis=0))
    return np.mean(column_rmse)

    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Build the metrics dictionary for a set of multi-target predictions.

    `predictions` is converted with ``torch.Tensor(...)`` and scored against
    `labels` using `mcrmse`. `threshold` is accepted but not used here.

    Returns:
        A dict with the single key ``'mcrmse'``.
    """
    score = mcrmse(y_trues=labels, y_preds=torch.Tensor(predictions))
    return {'mcrmse': score}

def compute_metrics(p: EvalPrediction):
    """Metric hook for the Trainer: score the predictions in `p` against its labels.

    When `p.predictions` is a tuple only its first element is scored.
    """
    raw_predictions = p.predictions
    if isinstance(raw_predictions, tuple):
        raw_predictions = raw_predictions[0]
    return multi_label_metrics(predictions=raw_predictions, labels=p.label_ids)

In [11]:
from torch import nn

class CustomTrainer(Trainer):
    """`Trainer` variant that optimises a Smooth L1 loss between logits and labels."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the mean Smooth L1 loss for one batch.

        Args:
            model: the model being trained or evaluated.
            inputs: batch dict; must contain a ``"labels"`` entry.
            return_outputs: when true, return ``(loss, outputs)`` instead of the loss.
            num_items_in_batch: accepted because newer `Trainer` releases pass
                it to `compute_loss`; unused, since the loss is a plain batch mean.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when `return_outputs` is true.
        """
        # Labels are read, not popped, so the model still receives them and may
        # compute its own built-in loss; that value is ignored here.
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        loss_fct = nn.SmoothL1Loss(reduction='mean')
        # squeeze() drops size-1 dimensions from both tensors before comparing.
        loss = loss_fct(logits.squeeze(), labels.squeeze())
        return (loss, outputs) if return_outputs else loss

In [12]:
def train_model(model_dir, out_dir, df_train, df_val,
                hyperparams=None,
                save_model = False):
    """Fine-tune a sequence-classification model on one train/validation split.

    Args:
        model_dir: model name or path handed to the `from_pretrained` loaders.
        out_dir: directory for Trainer checkpoints; also used as the W&B run
            name. The final model is additionally written to ``"trainer" + out_dir``.
        df_train: training DataFrame with ``full_text``, ``text_id`` and one
            column per entry of ``LABELS``.
        df_val: validation DataFrame with the same columns.
        hyperparams: optional dict overriding any of ``bs``, ``lr``, ``ep``,
            ``hidden_dropout_prob`` and ``attention_probs_dropout_prob``.
            Missing keys fall back to the defaults below.
        save_model: when false, the references to the model and trainer are
            dropped and garbage-collected before returning. Files on disk are
            written in both cases.

    Returns:
        The lowest ``eval_mcrmse`` logged during the run.
    """
    default_hyperparams = {'bs': 4, 'lr': 9e-6, 'ep': 1,
                           'hidden_dropout_prob': 0.0,
                           'attention_probs_dropout_prob': 0.0}
    # Merge instead of replacing so a partial dict no longer raises KeyError.
    hyperparams = {**default_hyperparams, **(hyperparams or {})}

    transformers.logging.set_verbosity_error()

    # CREATE DATASETS
    dataset_train = Dataset.from_pandas(df_train)
    dataset_test = Dataset.from_pandas(df_val)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    # eval_mcrmse values collected by the SaveLogs callback during training
    scores = []

    def preprocess_data(examples):
        """Tokenise a batch of texts and attach the float label matrix."""
        text = examples["full_text"]
        encoding = tokenizer(text, truncation=True, padding=True, max_length=MAX_LENGTH)
        # label matrix of shape (batch_size, num_labels), columns ordered as LABELS
        labels_matrix = np.zeros((len(text), len(LABELS)))
        for idx, label in enumerate(LABELS):
            labels_matrix[:, idx] = examples[label]
        encoding["labels"] = labels_matrix.tolist()
        return encoding

    raw_columns = LABELS + ["full_text", "text_id"]
    dataset_train_encoded = dataset_train.map(preprocess_data, batched=True, remove_columns=raw_columns)
    dataset_test_encoded = dataset_test.map(preprocess_data, batched=True, remove_columns=raw_columns)

    # MODEL
    config = AutoConfig.from_pretrained(model_dir,
                                        num_labels=len(LABELS),
                                        id2label=id2label,
                                        label2id=label2id,
                                        hidden_dropout_prob=hyperparams['hidden_dropout_prob'],
                                        attention_probs_dropout_prob=hyperparams['attention_probs_dropout_prob']
                                        )
    model = AutoModelForSequenceClassification.from_pretrained(model_dir,
                                                               config=config)

    # ARGUMENTS
    # NOTE(review): no save_steps is set, so checkpoints are written less often
    # than evaluations run. The checkpoint restored by load_best_model_at_end
    # may therefore not be the evaluation that produced the returned minimum.
    args = TrainingArguments(
            output_dir=out_dir,
            evaluation_strategy = "steps",
            warmup_ratio = 0.1,
            learning_rate = hyperparams['lr'],
            eval_steps = 100,
            num_train_epochs=hyperparams['ep'],
            lr_scheduler_type='cosine',
            load_best_model_at_end=True,
            per_device_train_batch_size=hyperparams['bs'],
            per_device_eval_batch_size=hyperparams['bs'],
            save_total_limit = 1,
            metric_for_best_model="mcrmse",
            greater_is_better = False,
            report_to="wandb",  # enable logging to W&B
            run_name=out_dir,  # name of the W&B run (optional)
            #deepspeed="ds_config_zero3.json" #deepspeed
     )

    class SaveLogs(TrainerCallback):
        """Collects every logged ``eval_mcrmse`` value into the enclosing `scores` list."""

        def on_log(self, args, state, control, logs=None, **kwargs):
            # Explicit key check instead of a bare except, so real errors surface.
            if state.is_local_process_zero and logs and "eval_mcrmse" in logs:
                scores.append(logs["eval_mcrmse"])

    # TRAINER
    trainer = CustomTrainer(model,
                            args,
                            train_dataset=dataset_train_encoded,
                            eval_dataset=dataset_test_encoded,
                            tokenizer=tokenizer,
                            compute_metrics=compute_metrics,
                            callbacks=[SaveLogs()]
    )
    try:
        trainer.train()
        if not scores:
            # Training was too short for a periodic evaluation; evaluate once so
            # a score is always available.
            final_metrics = trainer.evaluate()
            if not scores:
                scores.append(final_metrics["eval_mcrmse"])
        trainer.save_model("trainer" + out_dir)
    finally:
        # Close the W&B run even when training fails, so the next run starts clean.
        wandb.finish()

    best_score = min(scores)
    if not save_model:
        # The trainer holds a reference to the model, so both must be dropped
        # before the garbage collector can reclaim it.
        del trainer
        del model
        gc.collect()
    return best_score

In [13]:
#train_model("microsoft/deberta-v3-small", "out", pd.read_csv("data/train/cv/train_fold_0.csv"), pd.read_csv("data/train/cv/val_fold_0.csv"))

In [14]:
def train_cv_v2(model_dir, out_dir, fold_dir, hyperparams=None,
                kfolds=(0, 1, 2, 3, 4, 5), continue_training = True):
    """Train one model per cross-validation fold and aggregate the scores.

    For each fold ``k`` the files ``train_fold_k.csv`` and ``val_fold_k.csv``
    are read from `fold_dir` and passed to `train_model`, which writes its
    outputs to ``out_dir/model_fold_k``.

    Args:
        model_dir: model name or path every fold starts from.
        out_dir: parent directory for the per-fold outputs.
        fold_dir: directory containing the fold CSV files.
        hyperparams: optional dict overriding any of ``bs``, ``lr``, ``ep``,
            ``hidden_dropout_prob`` and ``attention_probs_dropout_prob``.
        kfolds: iterable of fold indices to run.
        continue_training: NOTE(review): accepted for interface compatibility
            but currently has no effect. The path previously derived from it
            was never used, so every fold starts from `model_dir`.

    Returns:
        ``(scores, cv_score)``: the per-fold best scores and their mean
        (NaN when `kfolds` is empty).
    """
    # Include the dropout keys train_model reads; the previous default dict
    # omitted them, so a call with default hyperparameters raised KeyError.
    default_hyperparams = {'bs': 4, 'lr': 9e-6, 'ep': 1,
                           'hidden_dropout_prob': 0.0,
                           'attention_probs_dropout_prob': 0.0}
    hyperparams = {**default_hyperparams, **(hyperparams or {})}

    scores = []
    for fold in kfolds:
        train_df = pd.read_csv(os.path.join(fold_dir, f'train_fold_{fold}.csv'))
        val_df = pd.read_csv(os.path.join(fold_dir, f'val_fold_{fold}.csv'))

        # Kept as a plain '/' join so directory and W&B run names stay unchanged.
        model_out_dir = f'{out_dir}/model_fold_{fold}'

        best_score = train_model(
            model_dir=model_dir,
            out_dir=model_out_dir,
            df_train=train_df,
            df_val=val_df,
            save_model=False,
            hyperparams=hyperparams,
          )
        scores.append(best_score)
    cv_score = np.mean(scores) if scores else float('nan')
    return scores, cv_score

In [15]:
# Cross-validation run with DeBERTa-v3-large.
model_dir = "microsoft/deberta-v3-large"
# Directory holding the train_fold_N.csv / val_fold_N.csv files.
fold_dir = BASE_PATH + "/data/train/cv"
# Relative output directory; one model_fold_N sub-directory is created per fold.
out_dir = "DEBERTA-LARGE"
hyperparams = {'bs': 4, 'lr': 2e-5, 'ep': 3, 'hidden_dropout_prob': 0.0, 'attention_probs_dropout_prob': 0.0}
# `scores` holds the best eval MCRMSE of each fold, `cv_score` their mean.
scores, cv_score = train_cv_v2(model_dir, out_dir, fold_dir, hyperparams)



  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

***** Running training *****
  Num examples = 3259
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 2445
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.1370846927165985, 'eval_mcrmse': 0.5267567038536072, 'eval_runtime': 21.8138, 'eval_samples_per_second': 29.889, 'eval_steps_per_second': 7.472, 'epoch': 0.12}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.14703688025474548, 'eval_mcrmse': 0.5461340546607971, 'eval_runtime': 23.1959, 'eval_samples_per_second': 28.108, 'eval_steps_per_second': 7.027, 'epoch': 0.25}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11995860934257507, 'eval_mcrmse': 0.4900945723056793, 'eval_runtime': 22.0053, 'eval_samples_per_second': 29.629, 'eval_steps_per_second': 7.407, 'epoch': 0.37}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.12254511564970016, 'eval_mcrmse': 0.4968127906322479, 'eval_runtime': 22.1519, 'eval_samples_per_second': 29.433, 'eval_steps_per_second': 7.358, 'epoch': 0.49}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'loss': 0.4481, 'learning_rate': 1.9344306953445632e-05, 'epoch': 0.61}


Saving model checkpoint to DEBERTA-LARGE/model_fold_0/checkpoint-500
Configuration saved in DEBERTA-LARGE/model_fold_0/checkpoint-500/config.json


{'eval_loss': 0.12896405160427094, 'eval_mcrmse': 0.5089833736419678, 'eval_runtime': 21.8275, 'eval_samples_per_second': 29.871, 'eval_steps_per_second': 7.468, 'epoch': 0.61}


Model weights saved in DEBERTA-LARGE/model_fold_0/checkpoint-500/pytorch_model.bin
tokenizer config file saved in DEBERTA-LARGE/model_fold_0/checkpoint-500/tokenizer_config.json
Special tokens file saved in DEBERTA-LARGE/model_fold_0/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10960336774587631, 'eval_mcrmse': 0.4688187837600708, 'eval_runtime': 22.0111, 'eval_samples_per_second': 29.621, 'eval_steps_per_second': 7.405, 'epoch': 0.74}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.13614210486412048, 'eval_mcrmse': 0.5225774049758911, 'eval_runtime': 21.9451, 'eval_samples_per_second': 29.711, 'eval_steps_per_second': 7.428, 'epoch': 0.86}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11864342540502548, 'eval_mcrmse': 0.4886339008808136, 'eval_runtime': 21.8583, 'eval_samples_per_second': 29.829, 'eval_steps_per_second': 7.457, 'epoch': 0.98}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10803566873073578, 'eval_mcrmse': 0.46565768122673035, 'eval_runtime': 23.6404, 'eval_samples_per_second': 27.58, 'eval_steps_per_second': 6.895, 'epoch': 1.1}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'loss': 0.1172, 'learning_rate': 1.4729702107813438e-05, 'epoch': 1.23}


Saving model checkpoint to DEBERTA-LARGE/model_fold_0/checkpoint-1000
Configuration saved in DEBERTA-LARGE/model_fold_0/checkpoint-1000/config.json


{'eval_loss': 0.10916637629270554, 'eval_mcrmse': 0.4678000509738922, 'eval_runtime': 22.2053, 'eval_samples_per_second': 29.362, 'eval_steps_per_second': 7.341, 'epoch': 1.23}


Model weights saved in DEBERTA-LARGE/model_fold_0/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in DEBERTA-LARGE/model_fold_0/checkpoint-1000/tokenizer_config.json
Special tokens file saved in DEBERTA-LARGE/model_fold_0/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [DEBERTA-LARGE/model_fold_0/checkpoint-500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10675899684429169, 'eval_mcrmse': 0.46168509125709534, 'eval_runtime': 21.7841, 'eval_samples_per_second': 29.93, 'eval_steps_per_second': 7.483, 'epoch': 1.35}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10358770936727524, 'eval_mcrmse': 0.4556765854358673, 'eval_runtime': 21.8822, 'eval_samples_per_second': 29.796, 'eval_steps_per_second': 7.449, 'epoch': 1.47}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.12939292192459106, 'eval_mcrmse': 0.5099655985832214, 'eval_runtime': 21.7825, 'eval_samples_per_second': 29.932, 'eval_steps_per_second': 7.483, 'epoch': 1.6}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10609663277864456, 'eval_mcrmse': 0.4620300829410553, 'eval_runtime': 22.788, 'eval_samples_per_second': 28.612, 'eval_steps_per_second': 7.153, 'epoch': 1.72}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'loss': 0.0943, 'learning_rate': 7.804633756159258e-06, 'epoch': 1.84}


Saving model checkpoint to DEBERTA-LARGE/model_fold_0/checkpoint-1500
Configuration saved in DEBERTA-LARGE/model_fold_0/checkpoint-1500/config.json


{'eval_loss': 0.10232391953468323, 'eval_mcrmse': 0.45301952958106995, 'eval_runtime': 21.8415, 'eval_samples_per_second': 29.851, 'eval_steps_per_second': 7.463, 'epoch': 1.84}


Model weights saved in DEBERTA-LARGE/model_fold_0/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in DEBERTA-LARGE/model_fold_0/checkpoint-1500/tokenizer_config.json
Special tokens file saved in DEBERTA-LARGE/model_fold_0/checkpoint-1500/special_tokens_map.json
Deleting older checkpoint [DEBERTA-LARGE/model_fold_0/checkpoint-1000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.1070546805858612, 'eval_mcrmse': 0.463383287191391, 'eval_runtime': 22.0389, 'eval_samples_per_second': 29.584, 'eval_steps_per_second': 7.396, 'epoch': 1.96}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10183504968881607, 'eval_mcrmse': 0.45204198360443115, 'eval_runtime': 21.9126, 'eval_samples_per_second': 29.755, 'eval_steps_per_second': 7.439, 'epoch': 2.09}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.1037071943283081, 'eval_mcrmse': 0.4558808505535126, 'eval_runtime': 22.0668, 'eval_samples_per_second': 29.547, 'eval_steps_per_second': 7.387, 'epoch': 2.21}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10273992270231247, 'eval_mcrmse': 0.4542276859283447, 'eval_runtime': 23.0441, 'eval_samples_per_second': 28.294, 'eval_steps_per_second': 7.073, 'epoch': 2.33}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10203609615564346, 'eval_mcrmse': 0.45249471068382263, 'eval_runtime': 21.9167, 'eval_samples_per_second': 29.749, 'eval_steps_per_second': 7.437, 'epoch': 2.58}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10130654275417328, 'eval_mcrmse': 0.4507924020290375, 'eval_runtime': 21.8828, 'eval_samples_per_second': 29.795, 'eval_steps_per_second': 7.449, 'epoch': 2.7}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.1013089194893837, 'eval_mcrmse': 0.4509223699569702, 'eval_runtime': 21.8862, 'eval_samples_per_second': 29.791, 'eval_steps_per_second': 7.448, 'epoch': 2.82}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.1012047529220581, 'eval_mcrmse': 0.45068123936653137, 'eval_runtime': 21.9046, 'eval_samples_per_second': 29.765, 'eval_steps_per_second': 7.441, 'epoch': 2.94}




Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from DEBERTA-LARGE/model_fold_0/checkpoint-2000 (score: 0.4529070556163788).
Saving model checkpoint to trainerDEBERTA-LARGE/model_fold_0
Configuration saved in trainerDEBERTA-LARGE/model_fold_0/config.json


{'train_runtime': 1750.6883, 'train_samples_per_second': 5.585, 'train_steps_per_second': 1.397, 'train_loss': 0.1579831963911622, 'epoch': 3.0}


Model weights saved in trainerDEBERTA-LARGE/model_fold_0/pytorch_model.bin
tokenizer config file saved in trainerDEBERTA-LARGE/model_fold_0/tokenizer_config.json
Special tokens file saved in trainerDEBERTA-LARGE/model_fold_0/special_tokens_map.json


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,▆█▄▄▅▂▆▄▂▂▂▁▅▂▁▂▁▁▁▁▁▁▁▁
eval/mcrmse,▇█▄▄▅▂▆▄▂▂▂▁▅▂▁▂▁▁▁▁▁▁▁▁
eval/runtime,▁▆▂▂▁▂▂▁█▃▁▁▁▅▁▂▁▂▆▃▂▁▁▁
eval/samples_per_second,█▃▇▇█▇▇█▁▆███▄█▇▇▇▃▆▇███
eval/steps_per_second,█▃▇▇█▇▇█▁▆███▄█▇▇▇▃▆▇███
train/epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇███
train/learning_rate,█▆▃▁
train/loss,█▂▂▁
train/total_flos,▁

0,1
eval/loss,0.1012
eval/mcrmse,0.45068
eval/runtime,21.9046
eval/samples_per_second,29.765
eval/steps_per_second,7.441
train/epoch,3.0
train/global_step,2445.0
train/learning_rate,0.0
train/loss,0.0641
train/total_flos,4555838797046784.0




  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

***** Running training *****
  Num examples = 3259
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 2445
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.21462015807628632, 'eval_mcrmse': 0.6707934737205505, 'eval_runtime': 21.861, 'eval_samples_per_second': 29.825, 'eval_steps_per_second': 7.456, 'epoch': 0.12}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.2151668220758438, 'eval_mcrmse': 0.6648253798484802, 'eval_runtime': 21.9917, 'eval_samples_per_second': 29.648, 'eval_steps_per_second': 7.412, 'epoch': 0.25}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.13617165386676788, 'eval_mcrmse': 0.5233010649681091, 'eval_runtime': 21.8459, 'eval_samples_per_second': 29.845, 'eval_steps_per_second': 7.461, 'epoch': 0.37}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.2130146324634552, 'eval_mcrmse': 0.6634412407875061, 'eval_runtime': 21.8702, 'eval_samples_per_second': 29.812, 'eval_steps_per_second': 7.453, 'epoch': 0.49}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'loss': 0.4459, 'learning_rate': 1.9344306953445632e-05, 'epoch': 0.61}


Saving model checkpoint to DEBERTA-LARGE/model_fold_1/checkpoint-500
Configuration saved in DEBERTA-LARGE/model_fold_1/checkpoint-500/config.json


{'eval_loss': 0.129531130194664, 'eval_mcrmse': 0.5111370086669922, 'eval_runtime': 22.2245, 'eval_samples_per_second': 29.337, 'eval_steps_per_second': 7.334, 'epoch': 0.61}


Model weights saved in DEBERTA-LARGE/model_fold_1/checkpoint-500/pytorch_model.bin
tokenizer config file saved in DEBERTA-LARGE/model_fold_1/checkpoint-500/tokenizer_config.json
Special tokens file saved in DEBERTA-LARGE/model_fold_1/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.13192009925842285, 'eval_mcrmse': 0.5146897435188293, 'eval_runtime': 21.7785, 'eval_samples_per_second': 29.938, 'eval_steps_per_second': 7.484, 'epoch': 0.74}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.12027673423290253, 'eval_mcrmse': 0.49194014072418213, 'eval_runtime': 23.055, 'eval_samples_per_second': 28.28, 'eval_steps_per_second': 7.07, 'epoch': 0.86}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11686734855175018, 'eval_mcrmse': 0.4846620559692383, 'eval_runtime': 21.9723, 'eval_samples_per_second': 29.674, 'eval_steps_per_second': 7.418, 'epoch': 0.98}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.12264508008956909, 'eval_mcrmse': 0.4972743093967438, 'eval_runtime': 25.063, 'eval_samples_per_second': 26.014, 'eval_steps_per_second': 6.504, 'epoch': 1.1}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'loss': 0.1208, 'learning_rate': 1.4729702107813438e-05, 'epoch': 1.23}


Saving model checkpoint to DEBERTA-LARGE/model_fold_1/checkpoint-1000
Configuration saved in DEBERTA-LARGE/model_fold_1/checkpoint-1000/config.json


{'eval_loss': 0.12253057956695557, 'eval_mcrmse': 0.49697670340538025, 'eval_runtime': 21.8436, 'eval_samples_per_second': 29.849, 'eval_steps_per_second': 7.462, 'epoch': 1.23}


Model weights saved in DEBERTA-LARGE/model_fold_1/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in DEBERTA-LARGE/model_fold_1/checkpoint-1000/tokenizer_config.json
Special tokens file saved in DEBERTA-LARGE/model_fold_1/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [DEBERTA-LARGE/model_fold_1/checkpoint-500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.13875974714756012, 'eval_mcrmse': 0.5301674008369446, 'eval_runtime': 21.8552, 'eval_samples_per_second': 29.833, 'eval_steps_per_second': 7.458, 'epoch': 1.35}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11768734455108643, 'eval_mcrmse': 0.48629891872406006, 'eval_runtime': 21.8735, 'eval_samples_per_second': 29.808, 'eval_steps_per_second': 7.452, 'epoch': 1.47}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11329328268766403, 'eval_mcrmse': 0.47708141803741455, 'eval_runtime': 21.8378, 'eval_samples_per_second': 29.856, 'eval_steps_per_second': 7.464, 'epoch': 1.6}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11218245327472687, 'eval_mcrmse': 0.47440361976623535, 'eval_runtime': 22.8716, 'eval_samples_per_second': 28.507, 'eval_steps_per_second': 7.127, 'epoch': 1.72}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'loss': 0.0932, 'learning_rate': 7.804633756159258e-06, 'epoch': 1.84}


Saving model checkpoint to DEBERTA-LARGE/model_fold_1/checkpoint-1500
Configuration saved in DEBERTA-LARGE/model_fold_1/checkpoint-1500/config.json


{'eval_loss': 0.10979566723108292, 'eval_mcrmse': 0.4691852629184723, 'eval_runtime': 21.9042, 'eval_samples_per_second': 29.766, 'eval_steps_per_second': 7.441, 'epoch': 1.84}


Model weights saved in DEBERTA-LARGE/model_fold_1/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in DEBERTA-LARGE/model_fold_1/checkpoint-1500/tokenizer_config.json
Special tokens file saved in DEBERTA-LARGE/model_fold_1/checkpoint-1500/special_tokens_map.json
Deleting older checkpoint [DEBERTA-LARGE/model_fold_1/checkpoint-1000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11076731979846954, 'eval_mcrmse': 0.4711887538433075, 'eval_runtime': 21.8234, 'eval_samples_per_second': 29.876, 'eval_steps_per_second': 7.469, 'epoch': 1.96}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10822010785341263, 'eval_mcrmse': 0.46566176414489746, 'eval_runtime': 21.8774, 'eval_samples_per_second': 29.802, 'eval_steps_per_second': 7.451, 'epoch': 2.09}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10994858294725418, 'eval_mcrmse': 0.46925392746925354, 'eval_runtime': 21.7836, 'eval_samples_per_second': 29.931, 'eval_steps_per_second': 7.483, 'epoch': 2.21}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10840417444705963, 'eval_mcrmse': 0.4661999046802521, 'eval_runtime': 21.8212, 'eval_samples_per_second': 29.879, 'eval_steps_per_second': 7.47, 'epoch': 2.33}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'loss': 0.0695, 'learning_rate': 1.9520036835178667e-06, 'epoch': 2.45}


Saving model checkpoint to DEBERTA-LARGE/model_fold_1/checkpoint-2000
Configuration saved in DEBERTA-LARGE/model_fold_1/checkpoint-2000/config.json


{'eval_loss': 0.10807471722364426, 'eval_mcrmse': 0.46537473797798157, 'eval_runtime': 21.8787, 'eval_samples_per_second': 29.801, 'eval_steps_per_second': 7.45, 'epoch': 2.45}


Model weights saved in DEBERTA-LARGE/model_fold_1/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in DEBERTA-LARGE/model_fold_1/checkpoint-2000/tokenizer_config.json
Special tokens file saved in DEBERTA-LARGE/model_fold_1/checkpoint-2000/special_tokens_map.json
Deleting older checkpoint [DEBERTA-LARGE/model_fold_1/checkpoint-1500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10811436921358109, 'eval_mcrmse': 0.4654439389705658, 'eval_runtime': 21.857, 'eval_samples_per_second': 29.83, 'eval_steps_per_second': 7.458, 'epoch': 2.58}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10737935453653336, 'eval_mcrmse': 0.4638478457927704, 'eval_runtime': 22.0295, 'eval_samples_per_second': 29.597, 'eval_steps_per_second': 7.399, 'epoch': 2.7}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10740187764167786, 'eval_mcrmse': 0.46390894055366516, 'eval_runtime': 21.9859, 'eval_samples_per_second': 29.655, 'eval_steps_per_second': 7.414, 'epoch': 2.82}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10741549730300903, 'eval_mcrmse': 0.4639393389225006, 'eval_runtime': 21.7659, 'eval_samples_per_second': 29.955, 'eval_steps_per_second': 7.489, 'epoch': 2.94}




Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from DEBERTA-LARGE/model_fold_1/checkpoint-2000 (score: 0.46537473797798157).
Saving model checkpoint to trainerDEBERTA-LARGE/model_fold_1
Configuration saved in trainerDEBERTA-LARGE/model_fold_1/config.json


{'train_runtime': 1773.6774, 'train_samples_per_second': 5.512, 'train_steps_per_second': 1.378, 'train_loss': 0.15984971918211394, 'epoch': 3.0}


Model weights saved in trainerDEBERTA-LARGE/model_fold_1/pytorch_model.bin
tokenizer config file saved in trainerDEBERTA-LARGE/model_fold_1/tokenizer_config.json
Special tokens file saved in trainerDEBERTA-LARGE/model_fold_1/special_tokens_map.json


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,██▃█▂▃▂▂▂▂▃▂▁▁▁▁▁▁▁▁▁▁▁▁
eval/mcrmse,██▃█▃▃▂▂▂▂▃▂▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▁▁▁▁▂▁▄▁█▁▁▁▁▃▁▁▁▁▁▁▁▂▁▁
eval/samples_per_second,█▇██▇█▅█▁████▅███████▇▇█
eval/steps_per_second,█▇██▇█▅▇▁████▅███████▇▇█
train/epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇███
train/learning_rate,█▆▃▁
train/loss,█▂▁▁
train/total_flos,▁

0,1
eval/loss,0.10742
eval/mcrmse,0.46394
eval/runtime,21.7659
eval/samples_per_second,29.955
eval/steps_per_second,7.489
train/epoch,3.0
train/global_step,2445.0
train/learning_rate,0.0
train/loss,0.0695
train/total_flos,4555838797046784.0




  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

***** Running training *****
  Num examples = 3259
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 2445
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.19463416934013367, 'eval_mcrmse': 0.6292238235473633, 'eval_runtime': 21.808, 'eval_samples_per_second': 29.897, 'eval_steps_per_second': 7.474, 'epoch': 0.12}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.1630294919013977, 'eval_mcrmse': 0.5752027630805969, 'eval_runtime': 21.8223, 'eval_samples_per_second': 29.878, 'eval_steps_per_second': 7.469, 'epoch': 0.25}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.16391751170158386, 'eval_mcrmse': 0.577516496181488, 'eval_runtime': 21.8591, 'eval_samples_per_second': 29.827, 'eval_steps_per_second': 7.457, 'epoch': 0.37}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.13241831958293915, 'eval_mcrmse': 0.5164709687232971, 'eval_runtime': 21.8633, 'eval_samples_per_second': 29.822, 'eval_steps_per_second': 7.455, 'epoch': 0.49}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'loss': 0.4379, 'learning_rate': 1.9344306953445632e-05, 'epoch': 0.61}


Saving model checkpoint to DEBERTA-LARGE/model_fold_2/checkpoint-500
Configuration saved in DEBERTA-LARGE/model_fold_2/checkpoint-500/config.json


{'eval_loss': 0.1317400485277176, 'eval_mcrmse': 0.5137775540351868, 'eval_runtime': 21.877, 'eval_samples_per_second': 29.803, 'eval_steps_per_second': 7.451, 'epoch': 0.61}


Model weights saved in DEBERTA-LARGE/model_fold_2/checkpoint-500/pytorch_model.bin
tokenizer config file saved in DEBERTA-LARGE/model_fold_2/checkpoint-500/tokenizer_config.json
Special tokens file saved in DEBERTA-LARGE/model_fold_2/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.1326916664838791, 'eval_mcrmse': 0.5152541995048523, 'eval_runtime': 21.8238, 'eval_samples_per_second': 29.876, 'eval_steps_per_second': 7.469, 'epoch': 0.74}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.12667304277420044, 'eval_mcrmse': 0.5051281452178955, 'eval_runtime': 22.4087, 'eval_samples_per_second': 29.096, 'eval_steps_per_second': 7.274, 'epoch': 0.86}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.12194082140922546, 'eval_mcrmse': 0.49487996101379395, 'eval_runtime': 21.9822, 'eval_samples_per_second': 29.66, 'eval_steps_per_second': 7.415, 'epoch': 0.98}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.1256069839000702, 'eval_mcrmse': 0.5032078623771667, 'eval_runtime': 21.8176, 'eval_samples_per_second': 29.884, 'eval_steps_per_second': 7.471, 'epoch': 1.1}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'loss': 0.118, 'learning_rate': 1.4729702107813438e-05, 'epoch': 1.23}


Saving model checkpoint to DEBERTA-LARGE/model_fold_2/checkpoint-1000
Configuration saved in DEBERTA-LARGE/model_fold_2/checkpoint-1000/config.json


{'eval_loss': 0.11577456444501877, 'eval_mcrmse': 0.482128381729126, 'eval_runtime': 21.8901, 'eval_samples_per_second': 29.785, 'eval_steps_per_second': 7.446, 'epoch': 1.23}


Model weights saved in DEBERTA-LARGE/model_fold_2/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in DEBERTA-LARGE/model_fold_2/checkpoint-1000/tokenizer_config.json
Special tokens file saved in DEBERTA-LARGE/model_fold_2/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [DEBERTA-LARGE/model_fold_2/checkpoint-500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11685344576835632, 'eval_mcrmse': 0.4841444790363312, 'eval_runtime': 21.8206, 'eval_samples_per_second': 29.88, 'eval_steps_per_second': 7.47, 'epoch': 1.35}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11725005507469177, 'eval_mcrmse': 0.4859500825405121, 'eval_runtime': 21.9493, 'eval_samples_per_second': 29.705, 'eval_steps_per_second': 7.426, 'epoch': 1.47}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11332755535840988, 'eval_mcrmse': 0.4770815074443817, 'eval_runtime': 21.8937, 'eval_samples_per_second': 29.78, 'eval_steps_per_second': 7.445, 'epoch': 1.6}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.1378432959318161, 'eval_mcrmse': 0.5267776250839233, 'eval_runtime': 23.0283, 'eval_samples_per_second': 28.313, 'eval_steps_per_second': 7.078, 'epoch': 1.72}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'loss': 0.0955, 'learning_rate': 7.804633756159258e-06, 'epoch': 1.84}


Saving model checkpoint to DEBERTA-LARGE/model_fold_2/checkpoint-1500
Configuration saved in DEBERTA-LARGE/model_fold_2/checkpoint-1500/config.json


{'eval_loss': 0.1147896945476532, 'eval_mcrmse': 0.4803767204284668, 'eval_runtime': 21.9931, 'eval_samples_per_second': 29.646, 'eval_steps_per_second': 7.411, 'epoch': 1.84}


Model weights saved in DEBERTA-LARGE/model_fold_2/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in DEBERTA-LARGE/model_fold_2/checkpoint-1500/tokenizer_config.json
Special tokens file saved in DEBERTA-LARGE/model_fold_2/checkpoint-1500/special_tokens_map.json
Deleting older checkpoint [DEBERTA-LARGE/model_fold_2/checkpoint-1000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11031550168991089, 'eval_mcrmse': 0.470780611038208, 'eval_runtime': 22.0603, 'eval_samples_per_second': 29.555, 'eval_steps_per_second': 7.389, 'epoch': 1.96}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10971738398075104, 'eval_mcrmse': 0.4694833755493164, 'eval_runtime': 21.9281, 'eval_samples_per_second': 29.734, 'eval_steps_per_second': 7.433, 'epoch': 2.09}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11548476666212082, 'eval_mcrmse': 0.48154914379119873, 'eval_runtime': 21.8568, 'eval_samples_per_second': 29.831, 'eval_steps_per_second': 7.458, 'epoch': 2.21}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11203035712242126, 'eval_mcrmse': 0.4742553234100342, 'eval_runtime': 21.9631, 'eval_samples_per_second': 29.686, 'eval_steps_per_second': 7.422, 'epoch': 2.33}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'loss': 0.0676, 'learning_rate': 1.9520036835178667e-06, 'epoch': 2.45}


Saving model checkpoint to DEBERTA-LARGE/model_fold_2/checkpoint-2000
Configuration saved in DEBERTA-LARGE/model_fold_2/checkpoint-2000/config.json


{'eval_loss': 0.11256564408540726, 'eval_mcrmse': 0.475892037153244, 'eval_runtime': 21.8408, 'eval_samples_per_second': 29.852, 'eval_steps_per_second': 7.463, 'epoch': 2.45}


Model weights saved in DEBERTA-LARGE/model_fold_2/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in DEBERTA-LARGE/model_fold_2/checkpoint-2000/tokenizer_config.json
Special tokens file saved in DEBERTA-LARGE/model_fold_2/checkpoint-2000/special_tokens_map.json
Deleting older checkpoint [DEBERTA-LARGE/model_fold_2/checkpoint-1500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10964980721473694, 'eval_mcrmse': 0.46915867924690247, 'eval_runtime': 21.822, 'eval_samples_per_second': 29.878, 'eval_steps_per_second': 7.47, 'epoch': 2.58}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10997775942087173, 'eval_mcrmse': 0.4699234664440155, 'eval_runtime': 22.0726, 'eval_samples_per_second': 29.539, 'eval_steps_per_second': 7.385, 'epoch': 2.7}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11005282402038574, 'eval_mcrmse': 0.4701957404613495, 'eval_runtime': 21.9796, 'eval_samples_per_second': 29.664, 'eval_steps_per_second': 7.416, 'epoch': 2.82}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11001697927713394, 'eval_mcrmse': 0.47012266516685486, 'eval_runtime': 22.8385, 'eval_samples_per_second': 28.548, 'eval_steps_per_second': 7.137, 'epoch': 2.94}




Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from DEBERTA-LARGE/model_fold_2/checkpoint-2000 (score: 0.475892037153244).
Saving model checkpoint to trainerDEBERTA-LARGE/model_fold_2
Configuration saved in trainerDEBERTA-LARGE/model_fold_2/config.json


{'train_runtime': 1764.0384, 'train_samples_per_second': 5.542, 'train_steps_per_second': 1.386, 'train_loss': 0.15742389702358128, 'epoch': 3.0}


Model weights saved in trainerDEBERTA-LARGE/model_fold_2/pytorch_model.bin
tokenizer config file saved in trainerDEBERTA-LARGE/model_fold_2/tokenizer_config.json
Special tokens file saved in trainerDEBERTA-LARGE/model_fold_2/special_tokens_map.json


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,█▅▅▃▃▃▂▂▂▂▂▂▁▃▁▁▁▁▁▁▁▁▁▁
eval/mcrmse,█▆▆▃▃▃▃▂▂▂▂▂▁▄▁▁▁▂▁▁▁▁▁▁
eval/runtime,▁▁▁▁▁▁▄▂▁▁▁▂▁█▂▂▂▁▂▁▁▃▂▇
eval/samples_per_second,██████▄▇███▇▇▁▇▆▇█▇██▆▇▂
eval/steps_per_second,██████▄▇███▇▇▁▇▆▇█▇██▆▇▂
train/epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇███
train/learning_rate,█▆▃▁
train/loss,█▂▂▁
train/total_flos,▁

0,1
eval/loss,0.11002
eval/mcrmse,0.47012
eval/runtime,22.8385
eval/samples_per_second,28.548
eval/steps_per_second,7.137
train/epoch,3.0
train/global_step,2445.0
train/learning_rate,0.0
train/loss,0.0676
train/total_flos,4555838797046784.0




  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

***** Running training *****
  Num examples = 3259
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 2445
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.17355100810527802, 'eval_mcrmse': 0.597622811794281, 'eval_runtime': 21.9913, 'eval_samples_per_second': 29.648, 'eval_steps_per_second': 7.412, 'epoch': 0.12}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.2946293354034424, 'eval_mcrmse': 0.8326303362846375, 'eval_runtime': 22.3308, 'eval_samples_per_second': 29.197, 'eval_steps_per_second': 7.299, 'epoch': 0.25}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.1289903223514557, 'eval_mcrmse': 0.5078224539756775, 'eval_runtime': 23.4144, 'eval_samples_per_second': 27.846, 'eval_steps_per_second': 6.962, 'epoch': 0.37}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.1190231442451477, 'eval_mcrmse': 0.4892277717590332, 'eval_runtime': 21.8716, 'eval_samples_per_second': 29.81, 'eval_steps_per_second': 7.453, 'epoch': 0.49}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'loss': 0.4584, 'learning_rate': 1.9344306953445632e-05, 'epoch': 0.61}


Saving model checkpoint to DEBERTA-LARGE/model_fold_3/checkpoint-500
Configuration saved in DEBERTA-LARGE/model_fold_3/checkpoint-500/config.json


{'eval_loss': 0.1450168937444687, 'eval_mcrmse': 0.5403605699539185, 'eval_runtime': 21.8165, 'eval_samples_per_second': 29.886, 'eval_steps_per_second': 7.471, 'epoch': 0.61}


Model weights saved in DEBERTA-LARGE/model_fold_3/checkpoint-500/pytorch_model.bin
tokenizer config file saved in DEBERTA-LARGE/model_fold_3/checkpoint-500/tokenizer_config.json
Special tokens file saved in DEBERTA-LARGE/model_fold_3/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11942071467638016, 'eval_mcrmse': 0.48967185616493225, 'eval_runtime': 21.7773, 'eval_samples_per_second': 29.939, 'eval_steps_per_second': 7.485, 'epoch': 0.74}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.1294674426317215, 'eval_mcrmse': 0.512749969959259, 'eval_runtime': 21.9454, 'eval_samples_per_second': 29.71, 'eval_steps_per_second': 7.428, 'epoch': 0.86}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11501263082027435, 'eval_mcrmse': 0.48060718178749084, 'eval_runtime': 22.7231, 'eval_samples_per_second': 28.693, 'eval_steps_per_second': 7.173, 'epoch': 0.98}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.1225045770406723, 'eval_mcrmse': 0.49544429779052734, 'eval_runtime': 21.8827, 'eval_samples_per_second': 29.795, 'eval_steps_per_second': 7.449, 'epoch': 1.1}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'loss': 0.1194, 'learning_rate': 1.4729702107813438e-05, 'epoch': 1.23}


Saving model checkpoint to DEBERTA-LARGE/model_fold_3/checkpoint-1000
Configuration saved in DEBERTA-LARGE/model_fold_3/checkpoint-1000/config.json


{'eval_loss': 0.11953000724315643, 'eval_mcrmse': 0.4909560978412628, 'eval_runtime': 23.1709, 'eval_samples_per_second': 28.139, 'eval_steps_per_second': 7.035, 'epoch': 1.23}


Model weights saved in DEBERTA-LARGE/model_fold_3/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in DEBERTA-LARGE/model_fold_3/checkpoint-1000/tokenizer_config.json
Special tokens file saved in DEBERTA-LARGE/model_fold_3/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [DEBERTA-LARGE/model_fold_3/checkpoint-500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10827372223138809, 'eval_mcrmse': 0.4662870466709137, 'eval_runtime': 21.8945, 'eval_samples_per_second': 29.779, 'eval_steps_per_second': 7.445, 'epoch': 1.35}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10547565668821335, 'eval_mcrmse': 0.46012595295906067, 'eval_runtime': 21.8269, 'eval_samples_per_second': 29.871, 'eval_steps_per_second': 7.468, 'epoch': 1.47}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11970259994268417, 'eval_mcrmse': 0.4909043312072754, 'eval_runtime': 22.1249, 'eval_samples_per_second': 29.469, 'eval_steps_per_second': 7.367, 'epoch': 1.6}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10829336941242218, 'eval_mcrmse': 0.46654054522514343, 'eval_runtime': 21.7866, 'eval_samples_per_second': 29.927, 'eval_steps_per_second': 7.482, 'epoch': 1.72}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'loss': 0.0945, 'learning_rate': 7.804633756159258e-06, 'epoch': 1.84}


Saving model checkpoint to DEBERTA-LARGE/model_fold_3/checkpoint-1500
Configuration saved in DEBERTA-LARGE/model_fold_3/checkpoint-1500/config.json


{'eval_loss': 0.10898231714963913, 'eval_mcrmse': 0.46781793236732483, 'eval_runtime': 23.1378, 'eval_samples_per_second': 28.179, 'eval_steps_per_second': 7.045, 'epoch': 1.84}


Model weights saved in DEBERTA-LARGE/model_fold_3/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in DEBERTA-LARGE/model_fold_3/checkpoint-1500/tokenizer_config.json
Special tokens file saved in DEBERTA-LARGE/model_fold_3/checkpoint-1500/special_tokens_map.json
Deleting older checkpoint [DEBERTA-LARGE/model_fold_3/checkpoint-1000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10875146090984344, 'eval_mcrmse': 0.4671614468097687, 'eval_runtime': 21.7897, 'eval_samples_per_second': 29.922, 'eval_steps_per_second': 7.481, 'epoch': 1.96}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10704611241817474, 'eval_mcrmse': 0.4636197090148926, 'eval_runtime': 22.2855, 'eval_samples_per_second': 29.257, 'eval_steps_per_second': 7.314, 'epoch': 2.09}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10733963549137115, 'eval_mcrmse': 0.4642641246318817, 'eval_runtime': 21.9482, 'eval_samples_per_second': 29.706, 'eval_steps_per_second': 7.427, 'epoch': 2.21}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10864907503128052, 'eval_mcrmse': 0.46713629364967346, 'eval_runtime': 22.0941, 'eval_samples_per_second': 29.51, 'eval_steps_per_second': 7.378, 'epoch': 2.33}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'loss': 0.0677, 'learning_rate': 1.9520036835178667e-06, 'epoch': 2.45}


Saving model checkpoint to DEBERTA-LARGE/model_fold_3/checkpoint-2000
Configuration saved in DEBERTA-LARGE/model_fold_3/checkpoint-2000/config.json


{'eval_loss': 0.10612836480140686, 'eval_mcrmse': 0.4614690840244293, 'eval_runtime': 22.0047, 'eval_samples_per_second': 29.63, 'eval_steps_per_second': 7.408, 'epoch': 2.45}


Model weights saved in DEBERTA-LARGE/model_fold_3/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in DEBERTA-LARGE/model_fold_3/checkpoint-2000/tokenizer_config.json
Special tokens file saved in DEBERTA-LARGE/model_fold_3/checkpoint-2000/special_tokens_map.json
Deleting older checkpoint [DEBERTA-LARGE/model_fold_3/checkpoint-1500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10540474206209183, 'eval_mcrmse': 0.4597926437854767, 'eval_runtime': 21.9947, 'eval_samples_per_second': 29.644, 'eval_steps_per_second': 7.411, 'epoch': 2.58}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10536673665046692, 'eval_mcrmse': 0.4597257673740387, 'eval_runtime': 22.0216, 'eval_samples_per_second': 29.607, 'eval_steps_per_second': 7.402, 'epoch': 2.7}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10538946092128754, 'eval_mcrmse': 0.45976564288139343, 'eval_runtime': 21.8146, 'eval_samples_per_second': 29.888, 'eval_steps_per_second': 7.472, 'epoch': 2.82}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.10536704212427139, 'eval_mcrmse': 0.4597109258174896, 'eval_runtime': 21.8493, 'eval_samples_per_second': 29.841, 'eval_steps_per_second': 7.46, 'epoch': 2.94}




Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from DEBERTA-LARGE/model_fold_3/checkpoint-2000 (score: 0.4614690840244293).
Saving model checkpoint to trainerDEBERTA-LARGE/model_fold_3
Configuration saved in trainerDEBERTA-LARGE/model_fold_3/config.json


{'train_runtime': 1767.1689, 'train_samples_per_second': 5.533, 'train_steps_per_second': 1.384, 'train_loss': 0.16159235090322047, 'epoch': 3.0}


Model weights saved in trainerDEBERTA-LARGE/model_fold_3/pytorch_model.bin
tokenizer config file saved in trainerDEBERTA-LARGE/model_fold_3/tokenizer_config.json
Special tokens file saved in trainerDEBERTA-LARGE/model_fold_3/special_tokens_map.json


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,▄█▂▂▂▂▂▁▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁
eval/mcrmse,▄█▂▂▃▂▂▁▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▂▃█▁▁▁▂▅▁▇▂▁▂▁▇▁▃▂▂▂▂▂▁▁
eval/samples_per_second,▇▆▁███▇▄█▂▇█▆█▂█▆▇▇▇▇▇██
eval/steps_per_second,▇▆▁███▇▄█▂▇█▆█▂█▆▇▇▇▇▇██
train/epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇███
train/learning_rate,█▆▃▁
train/loss,█▂▁▁
train/total_flos,▁

0,1
eval/loss,0.10537
eval/mcrmse,0.45971
eval/runtime,21.8493
eval/samples_per_second,29.841
eval/steps_per_second,7.46
train/epoch,3.0
train/global_step,2445.0
train/learning_rate,0.0
train/loss,0.0677
train/total_flos,4555838797046784.0




  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

***** Running training *****
  Num examples = 3260
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 2445
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


***** Running Evaluation *****
  Num examples = 651
  Batch size = 4


{'eval_loss': 0.25637558102607727, 'eval_mcrmse': 0.738994836807251, 'eval_runtime': 21.8454, 'eval_samples_per_second': 29.8, 'eval_steps_per_second': 7.462, 'epoch': 0.12}


***** Running Evaluation *****
  Num examples = 651
  Batch size = 4


{'eval_loss': 0.20440465211868286, 'eval_mcrmse': 0.6469113230705261, 'eval_runtime': 21.9173, 'eval_samples_per_second': 29.703, 'eval_steps_per_second': 7.437, 'epoch': 0.25}


***** Running Evaluation *****
  Num examples = 651
  Batch size = 4


{'eval_loss': 0.15186165273189545, 'eval_mcrmse': 0.5524607300758362, 'eval_runtime': 21.7831, 'eval_samples_per_second': 29.886, 'eval_steps_per_second': 7.483, 'epoch': 0.37}


***** Running Evaluation *****
  Num examples = 651
  Batch size = 4


{'eval_loss': 0.14916324615478516, 'eval_mcrmse': 0.5509610772132874, 'eval_runtime': 21.7787, 'eval_samples_per_second': 29.892, 'eval_steps_per_second': 7.484, 'epoch': 0.49}


***** Running Evaluation *****
  Num examples = 651
  Batch size = 4


{'loss': 0.4482, 'learning_rate': 1.9344306953445632e-05, 'epoch': 0.61}


Saving model checkpoint to DEBERTA-LARGE/model_fold_4/checkpoint-500
Configuration saved in DEBERTA-LARGE/model_fold_4/checkpoint-500/config.json


{'eval_loss': 0.11898963898420334, 'eval_mcrmse': 0.48939600586891174, 'eval_runtime': 27.7081, 'eval_samples_per_second': 23.495, 'eval_steps_per_second': 5.883, 'epoch': 0.61}


Model weights saved in DEBERTA-LARGE/model_fold_4/checkpoint-500/pytorch_model.bin
tokenizer config file saved in DEBERTA-LARGE/model_fold_4/checkpoint-500/tokenizer_config.json
Special tokens file saved in DEBERTA-LARGE/model_fold_4/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 651
  Batch size = 4


{'eval_loss': 0.13274994492530823, 'eval_mcrmse': 0.517382800579071, 'eval_runtime': 21.7231, 'eval_samples_per_second': 29.968, 'eval_steps_per_second': 7.504, 'epoch': 0.74}


***** Running Evaluation *****
  Num examples = 651
  Batch size = 4


{'eval_loss': 0.12069390714168549, 'eval_mcrmse': 0.4929755926132202, 'eval_runtime': 21.8748, 'eval_samples_per_second': 29.76, 'eval_steps_per_second': 7.452, 'epoch': 0.86}


***** Running Evaluation *****
  Num examples = 651
  Batch size = 4


{'eval_loss': 0.12626004219055176, 'eval_mcrmse': 0.5051617622375488, 'eval_runtime': 21.8583, 'eval_samples_per_second': 29.783, 'eval_steps_per_second': 7.457, 'epoch': 0.98}


***** Running Evaluation *****
  Num examples = 651
  Batch size = 4


{'eval_loss': 0.1075499951839447, 'eval_mcrmse': 0.4651283323764801, 'eval_runtime': 21.8291, 'eval_samples_per_second': 29.823, 'eval_steps_per_second': 7.467, 'epoch': 1.1}


***** Running Evaluation *****
  Num examples = 651
  Batch size = 4


{'loss': 0.1206, 'learning_rate': 1.4729702107813438e-05, 'epoch': 1.23}


Saving model checkpoint to DEBERTA-LARGE/model_fold_4/checkpoint-1000
Configuration saved in DEBERTA-LARGE/model_fold_4/checkpoint-1000/config.json


{'eval_loss': 0.11658301949501038, 'eval_mcrmse': 0.4840054214000702, 'eval_runtime': 24.124, 'eval_samples_per_second': 26.986, 'eval_steps_per_second': 6.757, 'epoch': 1.23}


Model weights saved in DEBERTA-LARGE/model_fold_4/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in DEBERTA-LARGE/model_fold_4/checkpoint-1000/tokenizer_config.json
Special tokens file saved in DEBERTA-LARGE/model_fold_4/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [DEBERTA-LARGE/model_fold_4/checkpoint-500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 651
  Batch size = 4


{'eval_loss': 0.1200837641954422, 'eval_mcrmse': 0.4916214644908905, 'eval_runtime': 21.7576, 'eval_samples_per_second': 29.921, 'eval_steps_per_second': 7.492, 'epoch': 1.35}


***** Running Evaluation *****
  Num examples = 651
  Batch size = 4


{'eval_loss': 0.11270428448915482, 'eval_mcrmse': 0.476224422454834, 'eval_runtime': 21.8446, 'eval_samples_per_second': 29.801, 'eval_steps_per_second': 7.462, 'epoch': 1.47}


***** Running Evaluation *****
  Num examples = 651
  Batch size = 4


{'eval_loss': 0.10679031163454056, 'eval_mcrmse': 0.4631342887878418, 'eval_runtime': 21.9741, 'eval_samples_per_second': 29.626, 'eval_steps_per_second': 7.418, 'epoch': 1.6}


***** Running Evaluation *****
  Num examples = 651
  Batch size = 4


{'eval_loss': 0.10796169936656952, 'eval_mcrmse': 0.466154545545578, 'eval_runtime': 21.8153, 'eval_samples_per_second': 29.841, 'eval_steps_per_second': 7.472, 'epoch': 1.72}


***** Running Evaluation *****
  Num examples = 651
  Batch size = 4


{'loss': 0.0952, 'learning_rate': 7.804633756159258e-06, 'epoch': 1.84}


Saving model checkpoint to DEBERTA-LARGE/model_fold_4/checkpoint-1500
Configuration saved in DEBERTA-LARGE/model_fold_4/checkpoint-1500/config.json


{'eval_loss': 0.11097747832536697, 'eval_mcrmse': 0.4722318947315216, 'eval_runtime': 21.7868, 'eval_samples_per_second': 29.88, 'eval_steps_per_second': 7.482, 'epoch': 1.84}


Model weights saved in DEBERTA-LARGE/model_fold_4/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in DEBERTA-LARGE/model_fold_4/checkpoint-1500/tokenizer_config.json
Special tokens file saved in DEBERTA-LARGE/model_fold_4/checkpoint-1500/special_tokens_map.json
Deleting older checkpoint [DEBERTA-LARGE/model_fold_4/checkpoint-1000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 651
  Batch size = 4


{'eval_loss': 0.10865263640880585, 'eval_mcrmse': 0.4668833315372467, 'eval_runtime': 22.0988, 'eval_samples_per_second': 29.459, 'eval_steps_per_second': 7.376, 'epoch': 1.96}


***** Running Evaluation *****
  Num examples = 651
  Batch size = 4


{'eval_loss': 0.10557871311903, 'eval_mcrmse': 0.4604159891605377, 'eval_runtime': 23.1789, 'eval_samples_per_second': 28.086, 'eval_steps_per_second': 7.032, 'epoch': 2.09}


***** Running Evaluation *****
  Num examples = 651
  Batch size = 4


{'eval_loss': 0.10486762225627899, 'eval_mcrmse': 0.45880410075187683, 'eval_runtime': 22.0924, 'eval_samples_per_second': 29.467, 'eval_steps_per_second': 7.378, 'epoch': 2.21}


***** Running Evaluation *****
  Num examples = 651
  Batch size = 4


{'eval_loss': 0.10549431294202805, 'eval_mcrmse': 0.46033430099487305, 'eval_runtime': 22.0213, 'eval_samples_per_second': 29.562, 'eval_steps_per_second': 7.402, 'epoch': 2.33}


***** Running Evaluation *****
  Num examples = 651
  Batch size = 4


{'loss': 0.0676, 'learning_rate': 1.9520036835178667e-06, 'epoch': 2.45}


Saving model checkpoint to DEBERTA-LARGE/model_fold_4/checkpoint-2000
Configuration saved in DEBERTA-LARGE/model_fold_4/checkpoint-2000/config.json


{'eval_loss': 0.10484741628170013, 'eval_mcrmse': 0.4589599072933197, 'eval_runtime': 21.8538, 'eval_samples_per_second': 29.789, 'eval_steps_per_second': 7.459, 'epoch': 2.45}


Model weights saved in DEBERTA-LARGE/model_fold_4/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in DEBERTA-LARGE/model_fold_4/checkpoint-2000/tokenizer_config.json
Special tokens file saved in DEBERTA-LARGE/model_fold_4/checkpoint-2000/special_tokens_map.json
Deleting older checkpoint [DEBERTA-LARGE/model_fold_4/checkpoint-1500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 651
  Batch size = 4


{'eval_loss': 0.10455658286809921, 'eval_mcrmse': 0.45833835005760193, 'eval_runtime': 21.9481, 'eval_samples_per_second': 29.661, 'eval_steps_per_second': 7.427, 'epoch': 2.58}


***** Running Evaluation *****
  Num examples = 651
  Batch size = 4


{'eval_loss': 0.10460924357175827, 'eval_mcrmse': 0.458395391702652, 'eval_runtime': 22.9211, 'eval_samples_per_second': 28.402, 'eval_steps_per_second': 7.111, 'epoch': 2.7}


***** Running Evaluation *****
  Num examples = 651
  Batch size = 4


{'eval_loss': 0.10425802320241928, 'eval_mcrmse': 0.4576106071472168, 'eval_runtime': 22.165, 'eval_samples_per_second': 29.371, 'eval_steps_per_second': 7.354, 'epoch': 2.82}


***** Running Evaluation *****
  Num examples = 651
  Batch size = 4


{'eval_loss': 0.10417678952217102, 'eval_mcrmse': 0.4574316740036011, 'eval_runtime': 21.8389, 'eval_samples_per_second': 29.809, 'eval_steps_per_second': 7.464, 'epoch': 2.94}




Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from DEBERTA-LARGE/model_fold_4/checkpoint-2000 (score: 0.4589599072933197).
Saving model checkpoint to trainerDEBERTA-LARGE/model_fold_4
Configuration saved in trainerDEBERTA-LARGE/model_fold_4/config.json


{'train_runtime': 1775.5151, 'train_samples_per_second': 5.508, 'train_steps_per_second': 1.377, 'train_loss': 0.15999494057247732, 'epoch': 3.0}


Model weights saved in trainerDEBERTA-LARGE/model_fold_4/pytorch_model.bin
tokenizer config file saved in trainerDEBERTA-LARGE/model_fold_4/tokenizer_config.json
Special tokens file saved in trainerDEBERTA-LARGE/model_fold_4/special_tokens_map.json


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,█▆▃▃▂▂▂▂▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/mcrmse,█▆▃▃▂▂▂▂▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▁▁▁▁█▁▁▁▁▄▁▁▁▁▁▁▃▁▁▁▁▂▂▁
eval/samples_per_second,████▁████▅█████▇▆▇███▆▇█
eval/steps_per_second,████▁████▅█████▇▆▇███▆▇█
train/epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇███
train/learning_rate,█▆▃▁
train/loss,█▂▂▁
train/total_flos,▁

0,1
eval/loss,0.10418
eval/mcrmse,0.45743
eval/runtime,21.8389
eval/samples_per_second,29.809
eval/steps_per_second,7.464
train/epoch,3.0
train/global_step,2445.0
train/learning_rate,0.0
train/loss,0.0676
train/total_flos,4557236722421760.0




  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

***** Running training *****
  Num examples = 3259
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 2445
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.1798526495695114, 'eval_mcrmse': 0.6080383658409119, 'eval_runtime': 23.7709, 'eval_samples_per_second': 27.428, 'eval_steps_per_second': 6.857, 'epoch': 0.12}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.2216256707906723, 'eval_mcrmse': 0.6690865159034729, 'eval_runtime': 21.9426, 'eval_samples_per_second': 29.714, 'eval_steps_per_second': 7.428, 'epoch': 0.25}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.13361552357673645, 'eval_mcrmse': 0.5195302367210388, 'eval_runtime': 22.7663, 'eval_samples_per_second': 28.639, 'eval_steps_per_second': 7.16, 'epoch': 0.37}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.12345369160175323, 'eval_mcrmse': 0.49946311116218567, 'eval_runtime': 21.881, 'eval_samples_per_second': 29.798, 'eval_steps_per_second': 7.449, 'epoch': 0.49}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'loss': 0.4473, 'learning_rate': 1.9344306953445632e-05, 'epoch': 0.61}


Saving model checkpoint to DEBERTA-LARGE/model_fold_5/checkpoint-500
Configuration saved in DEBERTA-LARGE/model_fold_5/checkpoint-500/config.json


{'eval_loss': 0.15058931708335876, 'eval_mcrmse': 0.5515605807304382, 'eval_runtime': 23.4326, 'eval_samples_per_second': 27.824, 'eval_steps_per_second': 6.956, 'epoch': 0.61}


Model weights saved in DEBERTA-LARGE/model_fold_5/checkpoint-500/pytorch_model.bin
tokenizer config file saved in DEBERTA-LARGE/model_fold_5/checkpoint-500/tokenizer_config.json
Special tokens file saved in DEBERTA-LARGE/model_fold_5/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.1271062195301056, 'eval_mcrmse': 0.5065149664878845, 'eval_runtime': 21.7863, 'eval_samples_per_second': 29.927, 'eval_steps_per_second': 7.482, 'epoch': 0.74}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.13798366487026215, 'eval_mcrmse': 0.5297662615776062, 'eval_runtime': 21.8769, 'eval_samples_per_second': 29.803, 'eval_steps_per_second': 7.451, 'epoch': 0.86}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.12199853360652924, 'eval_mcrmse': 0.4962909519672394, 'eval_runtime': 21.8253, 'eval_samples_per_second': 29.874, 'eval_steps_per_second': 7.468, 'epoch': 0.98}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11790776252746582, 'eval_mcrmse': 0.4872227907180786, 'eval_runtime': 23.2406, 'eval_samples_per_second': 28.054, 'eval_steps_per_second': 7.014, 'epoch': 1.1}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'loss': 0.1146, 'learning_rate': 1.4729702107813438e-05, 'epoch': 1.23}


Saving model checkpoint to DEBERTA-LARGE/model_fold_5/checkpoint-1000
Configuration saved in DEBERTA-LARGE/model_fold_5/checkpoint-1000/config.json


{'eval_loss': 0.11980653554201126, 'eval_mcrmse': 0.4916222095489502, 'eval_runtime': 21.9101, 'eval_samples_per_second': 29.758, 'eval_steps_per_second': 7.439, 'epoch': 1.23}


Model weights saved in DEBERTA-LARGE/model_fold_5/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in DEBERTA-LARGE/model_fold_5/checkpoint-1000/tokenizer_config.json
Special tokens file saved in DEBERTA-LARGE/model_fold_5/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [DEBERTA-LARGE/model_fold_5/checkpoint-500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.12172272056341171, 'eval_mcrmse': 0.4951744079589844, 'eval_runtime': 21.809, 'eval_samples_per_second': 29.896, 'eval_steps_per_second': 7.474, 'epoch': 1.35}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11977870762348175, 'eval_mcrmse': 0.4914788007736206, 'eval_runtime': 22.082, 'eval_samples_per_second': 29.526, 'eval_steps_per_second': 7.382, 'epoch': 1.47}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.12291842699050903, 'eval_mcrmse': 0.4982386827468872, 'eval_runtime': 21.8613, 'eval_samples_per_second': 29.824, 'eval_steps_per_second': 7.456, 'epoch': 1.6}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.12130042165517807, 'eval_mcrmse': 0.4948798418045044, 'eval_runtime': 21.9977, 'eval_samples_per_second': 29.64, 'eval_steps_per_second': 7.41, 'epoch': 1.72}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'loss': 0.0951, 'learning_rate': 7.804633756159258e-06, 'epoch': 1.84}


Saving model checkpoint to DEBERTA-LARGE/model_fold_5/checkpoint-1500
Configuration saved in DEBERTA-LARGE/model_fold_5/checkpoint-1500/config.json


{'eval_loss': 0.12334058433771133, 'eval_mcrmse': 0.4986139237880707, 'eval_runtime': 22.0426, 'eval_samples_per_second': 29.579, 'eval_steps_per_second': 7.395, 'epoch': 1.84}


Model weights saved in DEBERTA-LARGE/model_fold_5/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in DEBERTA-LARGE/model_fold_5/checkpoint-1500/tokenizer_config.json
Special tokens file saved in DEBERTA-LARGE/model_fold_5/checkpoint-1500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.12401095032691956, 'eval_mcrmse': 0.4999326765537262, 'eval_runtime': 22.8639, 'eval_samples_per_second': 28.517, 'eval_steps_per_second': 7.129, 'epoch': 1.96}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11435551196336746, 'eval_mcrmse': 0.47974538803100586, 'eval_runtime': 22.2855, 'eval_samples_per_second': 29.257, 'eval_steps_per_second': 7.314, 'epoch': 2.09}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11712316423654556, 'eval_mcrmse': 0.48600277304649353, 'eval_runtime': 21.9812, 'eval_samples_per_second': 29.662, 'eval_steps_per_second': 7.415, 'epoch': 2.21}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11336223781108856, 'eval_mcrmse': 0.47774598002433777, 'eval_runtime': 21.8012, 'eval_samples_per_second': 29.907, 'eval_steps_per_second': 7.477, 'epoch': 2.33}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'loss': 0.0671, 'learning_rate': 1.9520036835178667e-06, 'epoch': 2.45}


Saving model checkpoint to DEBERTA-LARGE/model_fold_5/checkpoint-2000
Configuration saved in DEBERTA-LARGE/model_fold_5/checkpoint-2000/config.json


{'eval_loss': 0.11338387429714203, 'eval_mcrmse': 0.47765398025512695, 'eval_runtime': 22.3298, 'eval_samples_per_second': 29.199, 'eval_steps_per_second': 7.3, 'epoch': 2.45}


Model weights saved in DEBERTA-LARGE/model_fold_5/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in DEBERTA-LARGE/model_fold_5/checkpoint-2000/tokenizer_config.json
Special tokens file saved in DEBERTA-LARGE/model_fold_5/checkpoint-2000/special_tokens_map.json
Deleting older checkpoint [DEBERTA-LARGE/model_fold_5/checkpoint-1000] due to args.save_total_limit
Deleting older checkpoint [DEBERTA-LARGE/model_fold_5/checkpoint-1500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11291727423667908, 'eval_mcrmse': 0.4767405092716217, 'eval_runtime': 21.9016, 'eval_samples_per_second': 29.77, 'eval_steps_per_second': 7.442, 'epoch': 2.58}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11202742159366608, 'eval_mcrmse': 0.4748063385486603, 'eval_runtime': 23.1985, 'eval_samples_per_second': 28.105, 'eval_steps_per_second': 7.026, 'epoch': 2.7}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11251739412546158, 'eval_mcrmse': 0.475888729095459, 'eval_runtime': 21.8107, 'eval_samples_per_second': 29.894, 'eval_steps_per_second': 7.473, 'epoch': 2.82}


***** Running Evaluation *****
  Num examples = 652
  Batch size = 4


{'eval_loss': 0.11227554827928543, 'eval_mcrmse': 0.4753783047199249, 'eval_runtime': 22.0779, 'eval_samples_per_second': 29.532, 'eval_steps_per_second': 7.383, 'epoch': 2.94}




Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from DEBERTA-LARGE/model_fold_5/checkpoint-2000 (score: 0.47765398025512695).
Saving model checkpoint to trainerDEBERTA-LARGE/model_fold_5
Configuration saved in trainerDEBERTA-LARGE/model_fold_5/config.json


{'train_runtime': 1769.0949, 'train_samples_per_second': 5.527, 'train_steps_per_second': 1.382, 'train_loss': 0.15810979831438124, 'epoch': 3.0}


Model weights saved in trainerDEBERTA-LARGE/model_fold_5/pytorch_model.bin
tokenizer config file saved in trainerDEBERTA-LARGE/model_fold_5/tokenizer_config.json
Special tokens file saved in trainerDEBERTA-LARGE/model_fold_5/special_tokens_map.json


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,▅█▂▂▃▂▃▂▁▁▂▁▂▂▂▂▁▁▁▁▁▁▁▁
eval/mcrmse,▆█▃▂▄▂▃▂▁▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁
eval/runtime,█▂▄▁▇▁▁▁▆▁▁▂▁▂▂▅▃▂▁▃▁▆▁▂
eval/samples_per_second,▁▇▄█▂███▃██▇█▇▇▄▆▇█▆█▃█▇
eval/steps_per_second,▁▇▄█▂███▃██▇█▇▇▄▆▇█▆█▃█▇
train/epoch,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇███
train/learning_rate,█▆▃▁
train/loss,█▂▂▁
train/total_flos,▁

0,1
eval/loss,0.11228
eval/mcrmse,0.47538
eval/runtime,22.0779
eval/samples_per_second,29.532
eval/steps_per_second,7.383
train/epoch,3.0
train/global_step,2445.0
train/learning_rate,0.0
train/loss,0.0671
train/total_flos,4555838797046784.0


In [16]:
# Final report: print the collected `scores` followed by the aggregate
# `cv_score`. Both names are produced by earlier cells of the notebook.
# Each label is passed to print() as a separate positional argument, so the
# default separator adds one more space after the label's trailing space.
for _label, _value in (("scores = ", scores), ("cv_score = ", cv_score)):
    print(_label, _value)

scores =  [0.45068123936653137, 0.4638478457927704, 0.46915867924690247, 0.4597109258174896, 0.4574316740036011, 0.4748063385486603]
cv_score =  0.46260611712932587
