In [1]:
!pip install -U transformers datasets accelerate

Collecting datasets
  Downloading datasets-4.4.2-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading datasets-4.4.2-py3-none-any.whl (512 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, datasets
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 18.1.0
    Uninstalling pyarrow-18.1.0:
      Successfully uninstalled pyarrow-18.1.0
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
Successfully installed datasets-4.4.2 pya

In [2]:
import transformers
print(transformers.__version__)


4.57.3


In [3]:
import torch
print(torch.cuda.is_available())

True


In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification , pipeline  # type: ignore
from transformers import Trainer, TrainingArguments # type: ignore
from sklearn.metrics import accuracy_score, f1_score # type: ignore
from datasets import load_dataset # type: ignore
import numpy as np # type: ignore

dataset = load_dataset("imdb")
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_fn(batch):
    return tokenizer(
            batch["text"],
            padding="max_length",
            truncation=True,
            max_length=128
        )

tokenized_dataset = dataset.map(
        tokenize_fn,
        batched=True,
        remove_columns=["text"]
    )
tokenized_dataset.set_format("torch")

model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2
    )
eval_dataset = tokenized_dataset["test"]

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
            "accuracy": accuracy_score(labels, predictions),
            "f1": f1_score(labels, predictions)
        }

training_args = TrainingArguments(
    output_dir = "./results",
    eval_strategy = "epoch",
    logging_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = 2e-5,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 2,
    load_best_model_at_end = True,
    metric_for_best_model = "accuracy",
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="tensorboard",
    seed = 42,
    fp16=True
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset["train"],
    eval_dataset = eval_dataset,
    compute_metrics = compute_metrics
    )

trainer.train()

eval_results = trainer.evaluate()
print(eval_results)

model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3584,0.313322,0.88564,0.883282
2,0.2257,0.415222,0.89112,0.89189


{'eval_loss': 0.4152224361896515, 'eval_accuracy': 0.89112, 'eval_f1': 0.8918897450154897, 'eval_runtime': 53.0211, 'eval_samples_per_second': 471.511, 'eval_steps_per_second': 58.939, 'epoch': 2.0}


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')