In [1]:

import random

import evaluate
import numpy as np
import torch
from datasets import load_from_disk
from transformers import (AutoTokenizer, TrainingArguments, DataCollatorWithPadding,
                          Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback)
from sklearn.model_selection import ParameterGrid
from rebert.model import ReBertModel
import gc

# Fix every RNG this notebook uses (Python, NumPy, PyTorch) to one shared seed
# so that a Restart & Run All repeats the same random draws.
seed = 93
random.seed(seed)
np.random.seed(seed)
# Trailing semicolon: manual_seed returns the Generator object, and without it
# the cell would echo that object's repr as its output.
torch.manual_seed(seed);

2024-01-08 10:46:36.832077: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-08 10:46:36.832111: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-08 10:46:36.833033: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-08 10:46:36.838002: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


<torch._C.Generator at 0x7f0e83584c50>

In [2]:
# Class-id -> label-name table handed to the model config further down.
id2label = {0: "Entailment", 1: "Neutral", 2: "Contradiction"}
# Inverse table (label name -> class id), derived from id2label so the two
# mappings cannot drift apart.
label2id = dict(zip(id2label.values(), id2label.keys()))

# Device string the models are moved to with `.to(device)`.
device = "cuda"

# Load the dataset stored under ./data/mnli and display its splits/columns.
ds = load_from_disk("./data/mnli")
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 392702
    })
    test_matched: Dataset({
        features: ['label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 9847
    })
    eval: Dataset({
        features: ['label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 9815
    })
})

In [3]:
# Accuracy metric object from the `evaluate` library, created once and reused
# by compute_metrics on every evaluation pass.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Convert an evaluation result into an accuracy score.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        dict: ``{"accuracy": value}``, where ``value`` is the ``"accuracy"``
        entry of the result returned by ``accuracy.compute``.
    """
    scores, gold = eval_pred
    # The index of the largest score along the last axis is the predicted class.
    predicted = np.argmax(scores, axis=-1)
    outcome = accuracy.compute(predictions=predicted, references=gold)
    return {"accuracy": outcome["accuracy"]}

In [4]:
# --- Hyper-parameter search: short constant-LR runs over batch size x learning rate ---
BATCH_EVAL = 128
LAMBDA = 0.01  # passed to TrainingArguments as weight_decay
OUTPUT = "rebert_mlm_mnli"

# Training budget per trial, expressed as "SEARCH_REF_STEPS optimiser steps at
# batch size SEARCH_REF_BATCH". The step count is scaled inversely with the
# trial's batch size so every trial consumes the same number of examples.
SEARCH_REF_STEPS = 2000
SEARCH_REF_BATCH = 32

hyper_params = {
    "batch": [64, 32, 16],
    "lr": [1e-4, 5e-5, 2e-5, 1e-5]
}

tokenizer = AutoTokenizer.from_pretrained("./rebert-base")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# List of (params, eval_metrics) pairs, one per trial. Re-created here so that
# re-running the cell does not append to results from an earlier run.
eval_result = []

for params in ParameterGrid(hyper_params):
    # Re-seed before building the model. The classifier and pooler weights are
    # newly initialised on every load (see the warning in the cell output);
    # without this each trial would draw them from wherever the previous trial
    # left the RNG, so trials would not start from the same initial head.
    torch.manual_seed(seed)
    model = AutoModelForSequenceClassification.from_pretrained(
        "./rebert-base", num_labels=len(id2label), id2label=id2label, label2id=label2id
    ).to(device)

    max_steps = SEARCH_REF_STEPS * SEARCH_REF_BATCH // params["batch"]
    training_args = TrainingArguments(
        output_dir=OUTPUT,
        learning_rate=params["lr"],
        per_device_train_batch_size=params["batch"],
        per_device_eval_batch_size=BATCH_EVAL,
        bf16=True,
        gradient_checkpointing=True,
        max_steps=max_steps,
        weight_decay=LAMBDA,
        lr_scheduler_type="constant",
        # No intermediate evaluation or checkpoints during the search; each
        # trial is scored once, after training, by trainer.evaluate() below.
        evaluation_strategy="no",
        save_strategy="no",
        seed=seed
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds["train"],
        eval_dataset=ds["eval"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()
    eval_perf = trainer.evaluate()
    eval_result.append((params, eval_perf))

    # Drop the references first: while `trainer` and `model` are still bound,
    # neither the garbage collector nor empty_cache() can release the memory
    # they hold, so the next trial would be loaded on top of this one.
    del trainer, model
    gc.collect()
    torch.cuda.empty_cache()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['classifier.weight', 'pooler.dense.weight', 'classifier.bias', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.861
1000,0.7778


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['classifier.weight', 'pooler.dense.weight', 'classifier.bias', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.8534
1000,0.7592


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['classifier.weight', 'pooler.dense.weight', 'classifier.bias', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.8844
1000,0.786


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['classifier.weight', 'pooler.dense.weight', 'classifier.bias', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.9228
1000,0.8193


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['classifier.weight', 'pooler.dense.weight', 'classifier.bias', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.9273
1000,0.8278
1500,0.8094
2000,0.7838


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['classifier.weight', 'pooler.dense.weight', 'classifier.bias', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.9075
1000,0.8007
1500,0.773
2000,0.7436


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['classifier.weight', 'pooler.dense.weight', 'classifier.bias', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.9232
1000,0.8119
1500,0.7867
2000,0.7641


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['classifier.weight', 'pooler.dense.weight', 'classifier.bias', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.9585
1000,0.8382
1500,0.8137
2000,0.795


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['classifier.weight', 'pooler.dense.weight', 'classifier.bias', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.9954
1000,0.9094
1500,0.884
2000,0.8732
2500,0.8644
3000,0.852
3500,0.8254
4000,0.8443


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['classifier.weight', 'pooler.dense.weight', 'classifier.bias', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.9521
1000,0.855
1500,0.8128
2000,0.8095
2500,0.7888
3000,0.7729
3500,0.7448
4000,0.778


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['classifier.weight', 'pooler.dense.weight', 'classifier.bias', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.9644
1000,0.848
1500,0.803
2000,0.796
2500,0.7784
3000,0.7609
3500,0.7297
4000,0.7576


Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['classifier.weight', 'pooler.dense.weight', 'classifier.bias', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.9927
1000,0.8727
1500,0.8318
2000,0.8236
2500,0.8043
3000,0.7928
3500,0.7676
4000,0.7899


In [5]:
# Rank the search trials by validation loss (lowest first) and show the top three.
# Each entry is a (params, eval_metrics) pair as appended by the search loop.
best_params = list(eval_result)
best_params.sort(key=lambda trial: trial[1]["eval_loss"])
best_params[:3]

[({'batch': 32, 'lr': 5e-05},
  {'eval_loss': 0.6976487040519714,
   'eval_accuracy': 0.7053489556800815,
   'eval_runtime': 8.0863,
   'eval_samples_per_second': 1213.776,
   'eval_steps_per_second': 9.522,
   'epoch': 0.16}),
 ({'batch': 64, 'lr': 5e-05},
  {'eval_loss': 0.7134973406791687,
   'eval_accuracy': 0.6925114620478859,
   'eval_runtime': 8.0809,
   'eval_samples_per_second': 1214.593,
   'eval_steps_per_second': 9.529,
   'epoch': 0.16}),
 ({'batch': 16, 'lr': 2e-05},
  {'eval_loss': 0.7210526466369629,
   'eval_accuracy': 0.6912888436067244,
   'eval_runtime': 8.0737,
   'eval_samples_per_second': 1215.675,
   'eval_steps_per_second': 9.537,
   'epoch': 0.16})]

In [6]:
# --- Final fine-tuning run using the top-ranked configuration from the search ---
BATCH_TRAIN = best_params[0][0]["batch"]
BATCH_EVAL = 128
LEARNING_RATE = best_params[0][0]["lr"]
EPOCHS = 10
SAVE_STEPS = 2000
LOG_STEPS = 2000
LAMBDA = 0.01  # passed to TrainingArguments as weight_decay
SAVE_LIMITS = 10
WARMUP = 0
OUTPUT = "rebert_mlm_mnli"
TB_DIR = "rebert_mlm_mnli_tb/minipile-1"
BEST_DIR = "rebert_mlm_mnli_best"  # where the final model and tokenizer are written

tokenizer = AutoTokenizer.from_pretrained("./rebert-base")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Re-seed before building the model so the newly initialised classifier and
# pooler weights do not depend on how much random state earlier cells consumed.
torch.manual_seed(seed)
model = AutoModelForSequenceClassification.from_pretrained(
    "./rebert-base", num_labels=len(id2label), id2label=id2label, label2id=label2id
).to(device)

training_args = TrainingArguments(
    output_dir=OUTPUT,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_TRAIN,
    per_device_eval_batch_size=BATCH_EVAL,
    bf16=True,
    gradient_checkpointing=True,
    num_train_epochs=EPOCHS,
    weight_decay=LAMBDA,
    lr_scheduler_type="linear",
    warmup_ratio=WARMUP,
    evaluation_strategy="steps",
    # Evaluate on the same cadence as checkpointing, stated explicitly rather
    # than inherited from logging_steps: load_best_model_at_end compares
    # checkpoints by their evaluation metric, so saves must line up with
    # evaluations even if LOG_STEPS is changed later.
    eval_steps=SAVE_STEPS,
    save_strategy="steps",
    logging_steps=LOG_STEPS,
    save_steps=SAVE_STEPS,
    logging_dir=TB_DIR,
    save_total_limit=SAVE_LIMITS,
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    seed=seed
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["eval"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Always start from scratch rather than resuming a checkpoint left in OUTPUT.
results = trainer.train(resume_from_checkpoint=False)
model.save_pretrained(BEST_DIR)
tokenizer.save_pretrained(BEST_DIR)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['classifier.weight', 'pooler.dense.weight', 'classifier.bias', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
2000,0.8054,0.697024,0.706266
4000,0.6945,0.659272,0.729598
6000,0.6585,0.639561,0.733877
8000,0.635,0.621359,0.750076
10000,0.6142,0.608545,0.752012
12000,0.6049,0.573612,0.768008
14000,0.522,0.615149,0.765257
16000,0.514,0.5886,0.769027
18000,0.5095,0.610907,0.767499
20000,0.5121,0.611583,0.770046




('rebert_mlm_mnli_best/tokenizer_config.json',
 'rebert_mlm_mnli_best/special_tokens_map.json',
 'rebert_mlm_mnli_best/tokenizer.json')

In [7]:
# Score the trained model on the split the Trainer was given as eval_dataset
# and display the resulting metrics dict.
final_metrics = trainer.evaluate()
final_metrics

{'eval_loss': 1.4578484296798706,
 'eval_accuracy': 0.7888945491594498,
 'eval_runtime': 8.3634,
 'eval_samples_per_second': 1173.56,
 'eval_steps_per_second': 9.207,
 'epoch': 10.0}