In [1]:

import random

import evaluate
import numpy as np
import torch
from datasets import load_from_disk
from transformers import (AutoTokenizer, TrainingArguments, DataCollatorWithPadding,
                          Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback)
from rebert.model import ReBertModel

# Single source of truth for every RNG seed used in this notebook; the same
# value is later handed to TrainingArguments(seed=seed).
seed = 42
random.seed(seed)        # Python's built-in RNG
np.random.seed(seed)     # NumPy's legacy global RNG
torch.manual_seed(seed)  # PyTorch default generator

<torch._C.Generator at 0x7fbcec15c9d0>

In [2]:
# Prefer the GPU when one is visible; fall back to CPU so that loading and
# inspecting the model does not crash on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Human-readable names for the three output classes.
# NOTE(review): assumed to match the integer labels stored in the dataset - confirm.
id2label = {0: "Entailment", 1: "Neutral", 2: "Contradiction"}
label2id = {name: idx for idx, name in id2label.items()}

# Load the pretrained encoder from ./rebert-base and attach a classification
# head with one output per entry in id2label.
model = AutoModelForSequenceClassification.from_pretrained(
    "./rebert-base", num_labels=len(id2label), id2label=id2label, label2id=label2id
).to(device)

model

Some weights of ReBertForSequenceClassification were not initialized from the model checkpoint at ./rebert-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.pool_proj.bias', 'pooler.pool_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ReBertForSequenceClassification(
  (rebert): ReBertModel(
    (embedding): ReBertEmbedding(
      (word_embedding): Embedding(32002, 768, padding_idx=32000)
      (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ReBertEncoder(
      (rope): ROPEEmbedding()
      (encoder_layers): ModuleList(
        (0-11): 12 x ReBertEncoderLayer(
          (attention): ReBertMultiHeadAttention(
            (self_attention): ReBertSelfAttention(
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (k_proj): Linear(in_features=768, out_features=256, bias=True)
              (v_proj): Linear(in_features=768, out_features=256, bias=True)
              (attn_dropout): Dropout(p=0.1, inplace=False)
              (rope): ROPEEmbedding()
            )
            (o_proj): Linear(in_features=768, out_features=768, bias=True)
            (output_dropout): Dropout(p=0.1, inplace=False)
 

In [3]:
# Full fine-tuning: mark every weight of the model as trainable.
for weight in model.parameters():
    weight.requires_grad_(True)

In [4]:
# Load the dataset previously saved to disk; the trailing bare expression
# displays its splits and columns as the cell output.
ds = load_from_disk("./data/mnli_mistral")
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 392702
    })
    test_matched: Dataset({
        features: ['label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 9847
    })
    eval: Dataset({
        features: ['label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 19647
    })
})

In [5]:
# Load the accuracy metric once at module level; compute_metrics below reuses it.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with classification accuracy.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``. The
            predicted class for each row is the argmax of the logits over
            the last axis.

    Returns:
        A dict with the single key ``"accuracy"``, holding the value
        reported by the module-level ``accuracy`` metric.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    outcome = accuracy.compute(predictions=predicted, references=gold)
    return {"accuracy": outcome["accuracy"]}

In [6]:
# --- Hyperparameters consumed by TrainingArguments below ---
BATCH_TRAIN = 32  # passed as per_device_train_batch_size
BATCH_EVAL = 128  # passed as per_device_eval_batch_size
LEARNING_RATE = 2e-5
EPOCHS = 10  # passed as num_train_epochs
SAVE_STEPS = 2000  # checkpoint interval, in training steps
LOG_STEPS = 2000  # logging interval, in training steps
LAMBDA = 0.01  # weight-decay coefficient (passed as weight_decay)
SAVE_LIMITS = 10  # passed as save_total_limit
WARMUP = 0.06  # passed as warmup_ratio
OUTPUT = "rebert_scratch_mnli"  # passed as output_dir
TB_DIR = "rebert_scratch_mnli_tb"  # passed as logging_dir

# The tokenizer comes from the same directory as the model checkpoint; the
# collator uses it to pad the examples of each batch.
tokenizer = AutoTokenizer.from_pretrained("./rebert-base")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training configuration. Three settings are spelled out on purpose:
#   * eval_steps is tied to SAVE_STEPS, because load_best_model_at_end needs
#     every saved checkpoint to coincide with an evaluation.
#   * metric_for_best_model / greater_is_better make best-checkpoint selection
#     and early stopping follow the accuracy returned by compute_metrics
#     instead of the default eval loss.
#   * logging_nan_inf_filter=False logs non-finite training losses as they
#     are instead of filtering them out of the running average.
# NOTE(review): the recorded run shows training loss 0.0, no validation loss and
# a constant accuracy, which suggests the loss is non-finite from the start -
# investigate (bf16 overflow, padding / attention mask handling) before a long run.
training_args = TrainingArguments(
    output_dir=OUTPUT,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_TRAIN,
    per_device_eval_batch_size=BATCH_EVAL,
    bf16=True,
    gradient_checkpointing=True,
    num_train_epochs=EPOCHS,
    weight_decay=LAMBDA,
    lr_scheduler_type="linear",
    warmup_ratio=WARMUP,
    evaluation_strategy="steps",
    eval_steps=SAVE_STEPS,
    save_strategy="steps",
    logging_steps=LOG_STEPS,
    save_steps=SAVE_STEPS,
    logging_dir=TB_DIR,
    save_total_limit=SAVE_LIMITS,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    logging_nan_inf_filter=False,
    seed=seed
)

# Early stopping with a patience of three evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    # what to train and how
    args=training_args,
    model=model,
    # data pipeline
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=ds["train"],
    eval_dataset=ds["eval"],
    # evaluation and stopping criteria
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Start a fresh run; do not resume from an existing checkpoint.
results = trainer.train(resume_from_checkpoint=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
2000,0.0,,0.353336
4000,0.0,,0.353336
6000,0.0,,0.353336
8000,0.0,,0.353336


