In [1]:

import random

import evaluate
import numpy as np
import torch
from datasets import load_from_disk
from transformers import (AutoTokenizer, TrainingArguments, DataCollatorWithPadding,
                          Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback)
from sklearn.model_selection import ParameterGrid
from rebert.model import ReBertModel
import gc

seed = 93
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

2024-01-07 23:40:07.913485: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-07 23:40:07.913520: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-07 23:40:07.914469: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-07 23:40:07.919708: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


<torch._C.Generator at 0x7f722d980c30>

In [2]:
device = "cuda"
id2label = {0: "Entailment", 1: "Neutral", 2: "Contradiction"}
label2id = {v: k for k, v in id2label.items()}
ds = load_from_disk("./data/mnli")
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 392702
    })
    test_matched: Dataset({
        features: ['label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 9847
    })
    eval: Dataset({
        features: ['label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 9815
    })
})

In [3]:
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    return {"accuracy": acc}

In [None]:
BATCH_EVAL = 128
LAMBDA = 0.01
OUTPUT = "rebert_mlm_mnli"

hyper_params = {
    "batch": [64, 32, 16],
    "lr": [1e-4, 5e-5, 2e-5, 1e-5]
}

tokenizer = AutoTokenizer.from_pretrained("./rebert-base")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
eval_result = []

for params in ParameterGrid(hyper_params):
    model = AutoModelForSequenceClassification.from_pretrained(
        "./rebert-base", num_labels=len(id2label), id2label=id2label, label2id=label2id
    ).to(device)

    max_steps = 2000 * 32 // params["batch"]
    training_args = TrainingArguments(
        output_dir=OUTPUT,
        learning_rate=params["lr"],
        per_device_train_batch_size=params["batch"],
        per_device_eval_batch_size=BATCH_EVAL,
        bf16=True,
        gradient_checkpointing=True,
        max_steps=max_steps,
        weight_decay=LAMBDA,
        lr_scheduler_type="constant",
        evaluation_strategy="no",
        save_strategy="no",
        seed=seed
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds["train"],
        eval_dataset=ds["eval"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    results = trainer.train()
    eval_perf = trainer.evaluate()
    eval_result.append((params, eval_perf))

    gc.collect()
    torch.cuda.empty_cache()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['classifier.weight', 'classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.8974
1000,0.7894


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['classifier.weight', 'classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.8637
1000,0.7689


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['classifier.weight', 'classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.8976
1000,0.7943


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['classifier.weight', 'classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.9374
1000,0.8277


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['classifier.weight', 'classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.9402
1000,0.84
1500,0.8157
2000,0.8052


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['classifier.weight', 'classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.9127
1000,0.8012
1500,0.7665
2000,0.7447


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['classifier.weight', 'classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.9326
1000,0.8187
1500,0.7919
2000,0.7725


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['classifier.weight', 'classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.9689
1000,0.8492
1500,0.8173
2000,0.799


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['classifier.weight', 'classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,1.1307


In [None]:
best_params = sorted(eval_result, key=lambda x: x[1]["eval_loss"])
best_params[:3]

In [None]:
BATCH_TRAIN = best_params[0][0]["batch"]
BATCH_EVAL = 128
LEARNING_RATE = best_params[0][0]["lr"]
EPOCHS = 10
SAVE_STEPS = 2000
LOG_STEPS = 2000
LAMBDA = 0.01
SAVE_LIMITS = 10
WARMUP = 0
OUTPUT = "rebert_mlm_mnli"
TB_DIR = "rebert_mlm_mnli_tb/minipile-1"

tokenizer = AutoTokenizer.from_pretrained("./rebert-base")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(
    "./rebert-base", num_labels=len(id2label), id2label=id2label, label2id=label2id
).to(device)

training_args = TrainingArguments(
    output_dir=OUTPUT,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_TRAIN,
    per_device_eval_batch_size=BATCH_EVAL,
    bf16=True,
    gradient_checkpointing=True,
    num_train_epochs=EPOCHS,
    weight_decay=LAMBDA,
    lr_scheduler_type="linear",
    warmup_ratio=WARMUP,
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_steps=LOG_STEPS,
    save_steps=SAVE_STEPS,
    logging_dir=TB_DIR,
    save_total_limit=SAVE_LIMITS,
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    seed=seed
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["eval"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

results = trainer.train(resume_from_checkpoint=False)
model.save_pretrained("rebert_mlm_mnli_best")
tokenizer.save_pretrained("rebert_mlm_mnli_best")
trainer.evaluate(ds["test_matched"])