In [1]:

import random

import evaluate
import numpy as np
import torch
from datasets import load_from_disk
from transformers import (AutoTokenizer, TrainingArguments, DataCollatorWithPadding,
                          Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback)
from sklearn.model_selection import ParameterGrid
from rebert.model import ReBertModel
import gc

seed = 42
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

2024-01-05 16:51:10.017531: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-05 16:51:10.017561: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-05 16:51:10.018617: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-05 16:51:10.023663: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


<torch._C.Generator at 0x7fd037194970>

In [2]:
device = "cuda"
id2label = {0: "Entailment", 1: "Neutral", 2: "Contradiction"}
label2id = {v: k for k, v in id2label.items()}
ds = load_from_disk("./data/mnli_roberta")
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 392702
    })
    test_matched: Dataset({
        features: ['label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 9847
    })
    eval: Dataset({
        features: ['label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 19647
    })
})

In [3]:
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    return {"accuracy": acc}

In [4]:
BATCH_EVAL = 128
LAMBDA = 0.01
OUTPUT = "rebert_rope_mnli"
TB_DIR = "rebert_rope_mnli_tb"

hyper_params = {
    "batch": [16, 32, 64],
    "lr": [1e-3, 5e-4, 1e-4, 2e-5]
}

tokenizer = AutoTokenizer.from_pretrained("./rebert-base")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
eval_result = []

for params in ParameterGrid(hyper_params):
    model = AutoModelForSequenceClassification.from_pretrained(
        "./rebert-base", num_labels=len(id2label), id2label=id2label, label2id=label2id
    ).to(device)

    max_steps = 2000 * 32 // params["batch"]
    training_args = TrainingArguments(
        output_dir=OUTPUT,
        learning_rate=params["lr"],
        per_device_train_batch_size=params["batch"],
        per_device_eval_batch_size=BATCH_EVAL,
        bf16=True,
        gradient_checkpointing=True,
        max_steps=max_steps,
        weight_decay=LAMBDA,
        lr_scheduler_type="constant",
        evaluation_strategy="no",
        save_strategy="no",
        seed=seed
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds["train"],
        eval_dataset=ds["eval"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    results = trainer.train()
    eval_perf = trainer.evaluate()
    eval_result.append((params, eval_perf))
    
    gc.collect()
    torch.cuda.empty_cache()

Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,1.153
1000,1.1238
1500,1.118
2000,1.1157
2500,1.11
3000,1.1051
3500,1.105
4000,1.1036


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameters: {'batch': 16, 'freeze_embeddings': True, 'lr': 0.001}
Eval: {'eval_loss': 1.0996252298355103, 'eval_accuracy': 0.3182165216063521, 'eval_runtime': 16.4935, 'eval_samples_per_second': 1191.195, 'eval_steps_per_second': 9.337, 'epoch': 0.16}




Step,Training Loss
500,1.1402
1000,1.1164
1500,1.1088
2000,1.1089
2500,1.1057
3000,1.1042
3500,1.1043
4000,1.1027


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameters: {'batch': 16, 'freeze_embeddings': True, 'lr': 0.0005}
Eval: {'eval_loss': 1.1001811027526855, 'eval_accuracy': 0.3533363872346923, 'eval_runtime': 16.4735, 'eval_samples_per_second': 1192.642, 'eval_steps_per_second': 9.348, 'epoch': 0.16}




Step,Training Loss
500,1.1197
1000,1.1151
1500,1.1087
2000,1.1089
2500,1.1077
3000,1.1057
3500,1.1063
4000,1.1061


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameters: {'batch': 16, 'freeze_embeddings': True, 'lr': 0.0001}
Eval: {'eval_loss': 1.0953338146209717, 'eval_accuracy': 0.3533363872346923, 'eval_runtime': 17.64, 'eval_samples_per_second': 1113.776, 'eval_steps_per_second': 8.73, 'epoch': 0.16}




Step,Training Loss
500,1.0834
1000,1.0192
1500,0.9774
2000,0.9537
2500,0.9512
3000,0.9224
3500,0.9277
4000,0.9223


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameters: {'batch': 16, 'freeze_embeddings': True, 'lr': 2e-05}
Eval: {'eval_loss': 0.8950769901275635, 'eval_accuracy': 0.5938311192548481, 'eval_runtime': 16.6185, 'eval_samples_per_second': 1182.233, 'eval_steps_per_second': 9.267, 'epoch': 0.16}




Step,Training Loss
500,1.1543
1000,1.1258
1500,1.1195
2000,1.1177
2500,1.1131
3000,1.1051
3500,1.1046
4000,1.1033


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameters: {'batch': 16, 'freeze_embeddings': False, 'lr': 0.001}
Eval: {'eval_loss': 1.0988366603851318, 'eval_accuracy': 0.32844709115895554, 'eval_runtime': 16.7338, 'eval_samples_per_second': 1174.093, 'eval_steps_per_second': 9.203, 'epoch': 0.16}




Step,Training Loss
500,1.1402
1000,1.1164
1500,1.1088
2000,1.1089
2500,1.1057
3000,1.1042
3500,1.1043
4000,1.1027


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameters: {'batch': 16, 'freeze_embeddings': False, 'lr': 0.0005}
Eval: {'eval_loss': 1.1001811027526855, 'eval_accuracy': 0.3533363872346923, 'eval_runtime': 17.2379, 'eval_samples_per_second': 1139.757, 'eval_steps_per_second': 8.934, 'epoch': 0.16}




Step,Training Loss
500,1.1197
1000,1.1151
1500,1.1087
2000,1.1089
2500,1.1077
3000,1.1057
3500,1.1063
4000,1.1061


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameters: {'batch': 16, 'freeze_embeddings': False, 'lr': 0.0001}
Eval: {'eval_loss': 1.0953338146209717, 'eval_accuracy': 0.3533363872346923, 'eval_runtime': 16.8929, 'eval_samples_per_second': 1163.036, 'eval_steps_per_second': 9.116, 'epoch': 0.16}




Step,Training Loss
500,1.0834
1000,1.0192
1500,0.9774
2000,0.9537
2500,0.9512
3000,0.9224
3500,0.9277
4000,0.9223


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameters: {'batch': 16, 'freeze_embeddings': False, 'lr': 2e-05}
Eval: {'eval_loss': 0.8950769901275635, 'eval_accuracy': 0.5938311192548481, 'eval_runtime': 16.6523, 'eval_samples_per_second': 1179.84, 'eval_steps_per_second': 9.248, 'epoch': 0.16}




Step,Training Loss
500,1.1395
1000,1.1133
1500,1.1068
2000,1.1055


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameters: {'batch': 32, 'freeze_embeddings': True, 'lr': 0.001}
Eval: {'eval_loss': 1.099082350730896, 'eval_accuracy': 0.3533363872346923, 'eval_runtime': 16.6219, 'eval_samples_per_second': 1181.992, 'eval_steps_per_second': 9.265, 'epoch': 0.16}




Step,Training Loss
500,1.1294
1000,1.1087
1500,1.104
2000,1.1023


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameters: {'batch': 32, 'freeze_embeddings': True, 'lr': 0.0005}
Eval: {'eval_loss': 1.0991730690002441, 'eval_accuracy': 0.3533363872346923, 'eval_runtime': 16.6529, 'eval_samples_per_second': 1179.794, 'eval_steps_per_second': 9.248, 'epoch': 0.16}




Step,Training Loss
500,1.1137
1000,1.1078
1500,1.1054
2000,1.1049


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameters: {'batch': 32, 'freeze_embeddings': True, 'lr': 0.0001}
Eval: {'eval_loss': 1.1004061698913574, 'eval_accuracy': 0.3533363872346923, 'eval_runtime': 15.8943, 'eval_samples_per_second': 1236.104, 'eval_steps_per_second': 9.689, 'epoch': 0.16}




Step,Training Loss
500,1.0566
1000,0.959
1500,0.9325
2000,0.9168


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameters: {'batch': 32, 'freeze_embeddings': True, 'lr': 2e-05}
Eval: {'eval_loss': 0.8783003687858582, 'eval_accuracy': 0.6039089937395022, 'eval_runtime': 15.9369, 'eval_samples_per_second': 1232.799, 'eval_steps_per_second': 9.663, 'epoch': 0.16}




Step,Training Loss
500,1.1395
1000,1.1133
1500,1.1068
2000,1.1055


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameters: {'batch': 32, 'freeze_embeddings': False, 'lr': 0.001}
Eval: {'eval_loss': 1.099082350730896, 'eval_accuracy': 0.3533363872346923, 'eval_runtime': 15.8742, 'eval_samples_per_second': 1237.671, 'eval_steps_per_second': 9.701, 'epoch': 0.16}




Step,Training Loss
500,1.1294
1000,1.1087
1500,1.104
2000,1.1023


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameters: {'batch': 32, 'freeze_embeddings': False, 'lr': 0.0005}
Eval: {'eval_loss': 1.0991730690002441, 'eval_accuracy': 0.3533363872346923, 'eval_runtime': 17.5457, 'eval_samples_per_second': 1119.76, 'eval_steps_per_second': 8.777, 'epoch': 0.16}




Step,Training Loss
500,1.1137
1000,1.1078
1500,1.1054
2000,1.1049


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameters: {'batch': 32, 'freeze_embeddings': False, 'lr': 0.0001}
Eval: {'eval_loss': 1.1004061698913574, 'eval_accuracy': 0.3533363872346923, 'eval_runtime': 16.6166, 'eval_samples_per_second': 1182.373, 'eval_steps_per_second': 9.268, 'epoch': 0.16}




Step,Training Loss
500,1.0566
1000,0.959
1500,0.9325
2000,0.9168


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameters: {'batch': 32, 'freeze_embeddings': False, 'lr': 2e-05}
Eval: {'eval_loss': 0.8783003687858582, 'eval_accuracy': 0.6039089937395022, 'eval_runtime': 16.6518, 'eval_samples_per_second': 1179.875, 'eval_steps_per_second': 9.248, 'epoch': 0.16}




Step,Training Loss
500,1.1274
1000,1.1053


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameters: {'batch': 64, 'freeze_embeddings': True, 'lr': 0.001}
Eval: {'eval_loss': 1.0988378524780273, 'eval_accuracy': 0.3533363872346923, 'eval_runtime': 16.5978, 'eval_samples_per_second': 1183.713, 'eval_steps_per_second': 9.278, 'epoch': 0.16}




Step,Training Loss
500,1.1194
1000,1.1032


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameters: {'batch': 64, 'freeze_embeddings': True, 'lr': 0.0005}
Eval: {'eval_loss': 1.0973502397537231, 'eval_accuracy': 0.3533363872346923, 'eval_runtime': 16.5351, 'eval_samples_per_second': 1188.201, 'eval_steps_per_second': 9.314, 'epoch': 0.16}




Step,Training Loss
500,1.1076
1000,1.0782


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameters: {'batch': 64, 'freeze_embeddings': True, 'lr': 0.0001}
Eval: {'eval_loss': 1.0554519891738892, 'eval_accuracy': 0.4482618211431771, 'eval_runtime': 16.6213, 'eval_samples_per_second': 1182.034, 'eval_steps_per_second': 9.265, 'epoch': 0.16}




Step,Training Loss
500,1.0183
1000,0.933


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameters: {'batch': 64, 'freeze_embeddings': True, 'lr': 2e-05}
Eval: {'eval_loss': 0.886547863483429, 'eval_accuracy': 0.5980556828014455, 'eval_runtime': 16.6818, 'eval_samples_per_second': 1177.749, 'eval_steps_per_second': 9.232, 'epoch': 0.16}




Step,Training Loss
500,1.1274
1000,1.1053


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameters: {'batch': 64, 'freeze_embeddings': False, 'lr': 0.001}
Eval: {'eval_loss': 1.0988378524780273, 'eval_accuracy': 0.3533363872346923, 'eval_runtime': 16.544, 'eval_samples_per_second': 1187.563, 'eval_steps_per_second': 9.309, 'epoch': 0.16}




Step,Training Loss
500,1.1194
1000,1.1032


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameters: {'batch': 64, 'freeze_embeddings': False, 'lr': 0.0005}
Eval: {'eval_loss': 1.0973502397537231, 'eval_accuracy': 0.3533363872346923, 'eval_runtime': 16.5399, 'eval_samples_per_second': 1187.858, 'eval_steps_per_second': 9.311, 'epoch': 0.16}




Step,Training Loss
500,1.1076
1000,1.0782


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameters: {'batch': 64, 'freeze_embeddings': False, 'lr': 0.0001}
Eval: {'eval_loss': 1.0554519891738892, 'eval_accuracy': 0.4482618211431771, 'eval_runtime': 16.6328, 'eval_samples_per_second': 1181.218, 'eval_steps_per_second': 9.259, 'epoch': 0.16}




Step,Training Loss
500,1.0183
1000,0.933


Parameters: {'batch': 64, 'freeze_embeddings': False, 'lr': 2e-05}
Eval: {'eval_loss': 0.886547863483429, 'eval_accuracy': 0.5980556828014455, 'eval_runtime': 16.6838, 'eval_samples_per_second': 1177.606, 'eval_steps_per_second': 9.23, 'epoch': 0.16}


In [5]:
best_params = sorted(eval_result, key=lambda x: x[1]["eval_loss"])
best_params[:5]

[({'batch': 32, 'freeze_embeddings': True, 'lr': 2e-05},
  {'eval_loss': 0.8783003687858582,
   'eval_accuracy': 0.6039089937395022,
   'eval_runtime': 15.9369,
   'eval_samples_per_second': 1232.799,
   'eval_steps_per_second': 9.663,
   'epoch': 0.16}),
 ({'batch': 32, 'freeze_embeddings': False, 'lr': 2e-05},
  {'eval_loss': 0.8783003687858582,
   'eval_accuracy': 0.6039089937395022,
   'eval_runtime': 16.6518,
   'eval_samples_per_second': 1179.875,
   'eval_steps_per_second': 9.248,
   'epoch': 0.16}),
 ({'batch': 64, 'freeze_embeddings': True, 'lr': 2e-05},
  {'eval_loss': 0.886547863483429,
   'eval_accuracy': 0.5980556828014455,
   'eval_runtime': 16.6818,
   'eval_samples_per_second': 1177.749,
   'eval_steps_per_second': 9.232,
   'epoch': 0.16}),
 ({'batch': 64, 'freeze_embeddings': False, 'lr': 2e-05},
  {'eval_loss': 0.886547863483429,
   'eval_accuracy': 0.5980556828014455,
   'eval_runtime': 16.6838,
   'eval_samples_per_second': 1177.606,
   'eval_steps_per_second': 9.2

In [None]:
BATCH_TRAIN = best_params[0][0]["batch"]
BATCH_EVAL = 128
LEARNING_RATE = best_params[0][0]["lr"]
EPOCHS = 10
SAVE_STEPS = 2000
LOG_STEPS = 2000
LAMBDA = 0.01
SAVE_LIMITS = 10
WARMUP = 0.06
OUTPUT = "rebert_scratch_mnli"
TB_DIR = "rebert_scratch_mnli_tb"

tokenizer = AutoTokenizer.from_pretrained("./rebert-base")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(
        "./rebert-base", num_labels=len(id2label), id2label=id2label, label2id=label2id
    ).to(device)

training_args = TrainingArguments(
    output_dir=OUTPUT,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_TRAIN,
    per_device_eval_batch_size=BATCH_EVAL,
    bf16=True,
    gradient_checkpointing=True,
    num_train_epochs=EPOCHS,
    weight_decay=LAMBDA,
    lr_scheduler_type="linear",
    warmup_ratio=WARMUP,
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_steps=LOG_STEPS,
    save_steps=SAVE_STEPS,
    logging_dir=TB_DIR,
    save_total_limit=SAVE_LIMITS,
    load_best_model_at_end=True,
    seed=seed
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["eval"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

results = trainer.train(resume_from_checkpoint=False)

Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
2000,1.058,0.930387,0.565023
4000,0.9265,0.875809,0.606759
6000,0.8839,0.838052,0.633023
8000,0.8648,0.82148,0.634295
10000,0.8424,0.828421,0.64646
12000,0.8292,0.798516,0.65328
14000,0.8023,0.764684,0.66799
16000,0.7929,0.763899,0.671349
18000,0.7822,0.753154,0.675421
20000,0.7724,0.752355,0.677152


