In [1]:
import os
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

In [2]:
# Read the Hugging Face access token from the environment (never hardcode it).
HF_TOKEN = os.getenv("HF_TOKEN")
# Sanity check that does not print the secret itself. `os.getenv` returns None
# when the variable is unset, so guard before calling `.startswith`; the cell
# then displays False instead of failing with an AttributeError.
HF_TOKEN is not None and HF_TOKEN.startswith("hf_")

True

## Dataset Preprocessing

In [3]:
# Load the MRPC subset of the GLUE benchmark. The data is cached locally under
# ~/.cache/huggingface/datasets/glue/mrpc.
raw_datasets = load_dataset(path="glue", name="mrpc")

# Checkpoint name shared by the tokenizer below and the model loaded later.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the `sentence1`/`sentence2` pair(s) of `example` together.

    Used with `Dataset.map(..., batched=True)` below, so `example` is
    presumably a batch whose columns are lists of strings. Truncation is
    enabled; no padding is applied here.

    Returns whatever the module-level `tokenizer` returns for the pair.
    """
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)

# Tokenize every split; `batched=True` passes batches of examples to the function.
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)

# Collator built from the same tokenizer; it is handed to the Trainer later on.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Using the latest cached version of the dataset since glue couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'mrpc' at /Users/thomas/.cache/huggingface/datasets/glue/mrpc/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c (last modified on Sun Mar  2 20:51:21 2025).


## Training

In [4]:
from transformers import TrainingArguments

# Minimal configuration: only the output directory is specified.
# NOTE: `training_args` is reassigned in the cell that builds the Trainer, so
# this instance is not the one actually used for training.
training_args = TrainingArguments(output_dir="mrpc-output")

In [5]:
# Disabled, kept for reference only: the originally intended metric function
# based on the `evaluate` library. `evaluate.load("glue", "mrpc")` raises a
# FileNotFoundError in this environment, so a scikit-learn based
# `compute_metrics` is defined in a later cell instead.
# If this is ever re-enabled, numpy must be imported as `np` first.

# import evaluate

# def compute_metrics(eval_preds):
#     eval_metric_module = evaluate.load("glue", "mrpc")
#     logits, labels = eval_preds
#     probas = np.argmax(logits, axis=-1)
#     return eval_metric_module.compute(predictions=probas, references=labels)

`evaluate.load("glue", "mrpc")` keeps raising a `FileNotFoundError`. I couldn't resolve it within a reasonable time, so I work around it by computing accuracy and F1 directly with scikit-learn instead:

In [6]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_preds):
    """Compute accuracy and F1 for a `(logits, labels)` pair.

    Args:
        eval_preds: Something that unpacks into two items: the model's
            per-class scores and the reference labels.

    Returns:
        A dict with the keys "accuracy" and "f1".
    """
    class_scores, gold_labels = eval_preds
    # The predicted class is the index of the highest score along the last axis.
    predicted_labels = np.argmax(class_scores, axis=-1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(gold_labels, predicted_labels)
    metrics["f1"] = f1_score(gold_labels, predicted_labels)
    return metrics

In [7]:
from transformers import AutoModelForSequenceClassification
from transformers import Trainer

# Run evaluation at the end of every epoch so that validation loss and the
# `compute_metrics` scores are reported alongside the training loss.
training_args = TrainingArguments("trainer_output", eval_strategy="epoch")

# Two output classes. The checkpoint ships without a classification head, so
# `classifier.weight` / `classifier.bias` are newly initialized (see the
# warning this cell prints) and must be trained before predictions are useful.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    # `tokenizer=` is deprecated on Trainer and triggers a FutureWarning;
    # `processing_class=` is the replacement argument.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [8]:
# Baseline: score the validation split before any fine-tuning, i.e. while the
# classification head is still newly initialized.
predictions = trainer.predict(test_dataset=tokenized_datasets["validation"])
compute_metrics(eval_preds=(predictions.predictions, predictions.label_ids))

{'accuracy': 0.6838235294117647, 'f1': 0.8122270742358079}

In [9]:
# Fine-tune the model. Because the TrainingArguments were created with
# `eval_strategy="epoch"`, validation loss, accuracy and F1 are reported
# after each epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy,F1
1,No log,0.441116,0.0014,0.806373,0.870279
2,0.511100,0.68372,0.0014,0.818627,0.879479
3,0.271800,0.821456,0.0014,0.830882,0.884808


TrainOutput(global_step=1377, training_loss=0.3232925764096329, metrics={'train_runtime': 664.0402, 'train_samples_per_second': 16.571, 'train_steps_per_second': 2.074, 'total_flos': 405114969714960.0, 'train_loss': 0.3232925764096329, 'epoch': 3.0})