In [1]:

import random

import evaluate
import numpy as np
import torch
from datasets import load_from_disk
from transformers import (AutoTokenizer, TrainingArguments, DataCollatorWithPadding,
                          Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback)
from sklearn.model_selection import ParameterGrid
from rebert.model import ReBertModel
import gc

seed = 42
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

2024-01-06 11:18:45.751984: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-06 11:18:45.752029: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-06 11:18:45.752857: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-06 11:18:45.756987: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


<torch._C.Generator at 0x7f8edc2b0a50>

In [2]:
device = "cuda"
id2label = {0: "Entailment", 1: "Neutral", 2: "Contradiction"}
label2id = {v: k for k, v in id2label.items()}
ds = load_from_disk("./data/mnli_roberta")
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 392702
    })
    test_matched: Dataset({
        features: ['label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 9847
    })
    eval: Dataset({
        features: ['label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 9815
    })
})

In [3]:
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    return {"accuracy": acc}

In [4]:
BATCH_EVAL = 128
LAMBDA = 0.01
OUTPUT = "rebert_mlm_mnli"

hyper_params = {
    "batch": [16, 32],
    "lr": [1e-4, 5e-5, 2e-5, 1e-5]
}

tokenizer = AutoTokenizer.from_pretrained("./rebert-base")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
eval_result = []

for params in ParameterGrid(hyper_params):
    model = AutoModelForSequenceClassification.from_pretrained(
        "./rebert-base", num_labels=len(id2label), id2label=id2label, label2id=label2id
    ).to(device)

    max_steps = 2000 * 32 // params["batch"]
    training_args = TrainingArguments(
        output_dir=OUTPUT,
        learning_rate=params["lr"],
        per_device_train_batch_size=params["batch"],
        per_device_eval_batch_size=BATCH_EVAL,
        bf16=True,
        gradient_checkpointing=True,
        max_steps=max_steps,
        weight_decay=LAMBDA,
        lr_scheduler_type="constant",
        evaluation_strategy="no",
        save_strategy="no",
        seed=seed
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds["train"],
        eval_dataset=ds["eval"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    results = trainer.train()
    eval_perf = trainer.evaluate()
    eval_result.append((params, eval_perf))
    
    gc.collect()
    torch.cuda.empty_cache()

Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,1.1217
1000,1.1124
1500,1.1062
2000,1.1076
2500,1.1049
3000,1.105
3500,1.1052
4000,1.1048


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,1.1056
1000,1.0344
1500,0.9922
2000,0.9787
2500,0.9693
3000,0.9408
3500,0.9506
4000,0.9518


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,1.0864
1000,0.9967
1500,0.9309
2000,0.913
2500,0.9002
3000,0.8665
3500,0.872
4000,0.8715


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,1.0834
1000,1.0283
1500,0.9528
2000,0.92
2500,0.9097
3000,0.8781
3500,0.8794
4000,0.8817


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,1.1134
1000,1.1057
1500,1.1038
2000,1.1032


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,1.0404
1000,0.9454
1500,0.9094
2000,0.8941


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,1.0487
1000,0.9263
1500,0.8838
2000,0.8649


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,1.0731
1000,0.977
1500,0.928
2000,0.9035


In [5]:
best_params = sorted(eval_result, key=lambda x: x[1]["eval_loss"])
best_params[:3]

[({'batch': 32, 'lr': 2e-05},
  {'eval_loss': 0.8303027153015137,
   'eval_accuracy': 0.6297503820682628,
   'eval_runtime': 7.2676,
   'eval_samples_per_second': 1350.506,
   'eval_steps_per_second': 10.595,
   'epoch': 0.16}),
 ({'batch': 16, 'lr': 2e-05},
  {'eval_loss': 0.833569347858429,
   'eval_accuracy': 0.6312786551197147,
   'eval_runtime': 7.2674,
   'eval_samples_per_second': 1350.552,
   'eval_steps_per_second': 10.595,
   'epoch': 0.16}),
 ({'batch': 16, 'lr': 1e-05},
  {'eval_loss': 0.8480516672134399,
   'eval_accuracy': 0.6217014773306164,
   'eval_runtime': 7.3653,
   'eval_samples_per_second': 1332.601,
   'eval_steps_per_second': 10.454,
   'epoch': 0.16})]

In [7]:
BATCH_TRAIN = best_params[0][0]["batch"]
BATCH_EVAL = 128
LEARNING_RATE = best_params[0][0]["lr"]
EPOCHS = 10
SAVE_STEPS = 2000
LOG_STEPS = 2000
LAMBDA = 0.01
SAVE_LIMITS = 10
WARMUP = 0.06
OUTPUT = "rebert_mlm_mnli"
TB_DIR = "rebert_mlm_mnli_tb"

tokenizer = AutoTokenizer.from_pretrained("./rebert-base")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(
        "./rebert-base", num_labels=len(id2label), id2label=id2label, label2id=label2id
    ).to(device)

training_args = TrainingArguments(
    output_dir=OUTPUT,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_TRAIN,
    per_device_eval_batch_size=BATCH_EVAL,
    bf16=True,
    gradient_checkpointing=True,
    num_train_epochs=EPOCHS,
    weight_decay=LAMBDA,
    lr_scheduler_type="linear",
    warmup_ratio=WARMUP,
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_steps=LOG_STEPS,
    save_steps=SAVE_STEPS,
    logging_dir=TB_DIR,
    save_total_limit=SAVE_LIMITS,
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    seed=seed
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["eval"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

results = trainer.train(resume_from_checkpoint=False)

Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
2000,1.0796,1.035268,0.473357
4000,0.9228,0.878902,0.611207
6000,0.8478,0.812051,0.639429
8000,0.8182,0.794534,0.650025
10000,0.7911,0.782334,0.657056
12000,0.7743,0.756043,0.674478
14000,0.7417,0.743746,0.685583
16000,0.7333,0.71949,0.694753
18000,0.72,0.707528,0.700051
20000,0.7116,0.711221,0.701477


