# 0. Install libraries

In [1]:
!pip install transformers==4.55.4
!pip install datasets
!pip install evaluate

Collecting transformers==4.55.4
  Downloading transformers-4.55.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers==4.55.4)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Downloading transformers-4.55.4-py3-none-any.whl (11.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m96.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.34.4-py3-none-any.whl (561 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m561.5/561.5 kB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface-hub, transformers
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.33.1
    Uninstalling huggingface-hub-0.33.1:
      Successfully uninstalled huggingface-hub-0.33.1
  A

# 1. Import libs

In [2]:
import os
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

2025-08-26 12:19:05.199996: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756210745.548557      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756210745.652063      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# 2. Load datasets (AG News) + Load tokenizer and Model

In [3]:
dataset = load_dataset("ag_news")

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [4]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 3. Tokenize

In [5]:
def preprocess(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(preprocess, batched=True)

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

# 4. Train/Test Split

In [6]:
train_dataset = tokenized_dataset["train"]
test_dataset = tokenized_dataset["test"]

# 5. Load metrics

In [7]:
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy.compute(predictions=predictions, references=labels)
    f1_score = f1.compute(predictions=predictions, references=labels, average="weighted")
    return {"accuracy": acc["accuracy"], "f1": f1_score["f1"]}

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

# 6. Data Collator

In [8]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 7. TrainingArguments

In [9]:
training_args = TrainingArguments(
    output_dir="./models/distilbert_agnews",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs_clf",
    logging_steps=100,
    save_strategy="epoch",       # lưu checkpoint mỗi epoch
    save_total_limit=1,          # chỉ giữ 1 checkpoint
    report_to="none",            # tắt wandb
)


# 8. Trainer

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


# 9. Train and Eval

In [11]:
# Train
trainer.train()
trainer.save_model("./models/distilbert_agnews")
tokenizer.save_pretrained("./models/distilbert_agnews")

# Evaluate 
results = trainer.evaluate()
print("Evaluation results:", results)



Step,Training Loss
100,0.6783
200,0.3501
300,0.3106
400,0.2674
500,0.2576
600,0.2417
700,0.227
800,0.2351
900,0.222
1000,0.2033




Evaluation results: {'eval_loss': 0.4552611708641052, 'eval_accuracy': 0.9411842105263157, 'eval_f1': 0.9412016081827891, 'eval_runtime': 18.6404, 'eval_samples_per_second': 407.716, 'eval_steps_per_second': 12.768, 'epoch': 10.0}
