In [None]:
# ======================================
# 📦 1. Setup and Imports
# ======================================
import os
import time
import torch
import numpy as np
from datasets import load_from_disk
from transformers import GPT2TokenizerFast, GPT2ForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

Mounted at /content/drive


In [None]:
# ============================
# Block 2 — Imports and Config
# ============================
import os
import numpy as np
import torch
from datasets import load_dataset, concatenate_datasets, DatasetDict
from transformers import (
    GPT2TokenizerFast,
    GPT2ForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    set_seed
)
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support, confusion_matrix
import pickle

# Hyperparameters — every value a reader might want to tune lives in this cell.
MODEL_NAME = "gpt2"   # checkpoint id used for both the tokenizer and the model
BATCH_SIZE = 16       # per-device batch size for training and evaluation
EPOCHS = 5            # You can increase to 10 later
LR = 5e-5             # learning rate passed to TrainingArguments
MAX_LENGTH = 256      # max tokens per example; longer texts are truncated
SEED = 42             # reused for set_seed, shuffling/splitting and TrainingArguments

# Project directories. Later cells read DATA_DIR (pickled splits), OUTPUT_DIR
# (Trainer checkpoints) and LOG_DIR (Trainer logging_dir), so they have to be
# defined here for the notebook to survive Restart & Run All.
# DATA_DIR matches the path printed when the splits were saved.
# NOTE(review): the "checkpoints" / "logs" sub-folder names are assumptions —
# confirm against where earlier runs actually wrote their artifacts.
PROJECT_DIR = "/content/drive/MyDrive/gpt2_agnews_project"
DATA_DIR = os.path.join(PROJECT_DIR, "data")
OUTPUT_DIR = os.path.join(PROJECT_DIR, "checkpoints")
LOG_DIR = os.path.join(PROJECT_DIR, "logs")

# Fix the random seed up front (transformers helper) and pick the compute device.
set_seed(SEED)
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Running on:", device)


Running on: cuda


In [None]:
# ============================
# Block 3 — Load and Split Dataset
# ============================
from datasets import Dataset

# Load the AG News dataset. It is kept under its own name so that `dataset`
# below always refers to the re-split 80/10/10 version, even when this cell
# is re-run.
raw_dataset = load_dataset("ag_news")

# Merge train + test for fresh splitting
full_dataset = concatenate_datasets([raw_dataset["train"], raw_dataset["test"]])
full_dataset = full_dataset.shuffle(seed=SEED)

# Create train (80%), validation (10%), test (10%):
# hold out 20% first, then cut that hold-out in half.
train_testvalid = full_dataset.train_test_split(test_size=0.2, seed=SEED)
valid_test = train_testvalid["test"].train_test_split(test_size=0.5, seed=SEED)

dataset = DatasetDict({
    "train": train_testvalid["train"],
    "validation": valid_test["train"],
    "test": valid_test["test"]
})

# Sanity check: the three splits must partition the merged dataset exactly.
assert sum(len(part) for part in dataset.values()) == len(full_dataset), \
    "train/validation/test sizes do not add up to the merged dataset"

print(dataset)
print("Train size:", len(dataset["train"]))
print("Validation size:", len(dataset["validation"]))
print("Test size:", len(dataset["test"]))

# Save splits for future use. The target folder is created first because
# open(..., "wb") does not create missing parent directories.
# NOTE(review): pickling a `datasets.Dataset` may store references to local
# Arrow cache files rather than the rows themselves — confirm these .pkl files
# load in a fresh runtime; `DatasetDict.save_to_disk` is the portable option.
os.makedirs(DATA_DIR, exist_ok=True)
for split in ["train", "validation", "test"]:
    path = os.path.join(DATA_DIR, f"{split}_data.pkl")
    with open(path, "wb") as f:
        pickle.dump(dataset[split], f)
    print(f"Saved {split} split to {path}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 102080
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 12760
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 12760
    })
})
Train size: 102080
Validation size: 12760
Test size: 12760
Saved train split to /content/drive/MyDrive/gpt2_agnews_project/data/train_data.pkl
Saved validation split to /content/drive/MyDrive/gpt2_agnews_project/data/validation_data.pkl
Saved test split to /content/drive/MyDrive/gpt2_agnews_project/data/test_data.pkl


In [None]:
# ============================
# Block 4 — Tokenizer and Model
# ============================
# Tokenizer: load the pretrained vocabulary; when it defines no pad token,
# reuse the end-of-sequence token so batches can be padded.
tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Model: pretrained GPT-2 with a sequence-classification head for `num_labels`
# classes, moved to the device chosen in the config cell.
num_labels = 4
model = GPT2ForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
)
model.to(device)

# Keep the model's padding id in sync with the tokenizer's.
model.config.pad_token_id = tokenizer.pad_token_id

def preprocess_fn(examples):
    """Tokenize the "text" field of a batch of examples.

    Texts are truncated to MAX_LENGTH tokens and left unpadded; padding is
    not applied at this stage.

    Args:
        examples: Mapping with a "text" entry holding the raw strings.

    Returns:
        Whatever the module-level `tokenizer` returns for those texts.
    """
    tokenize_options = {
        "truncation": True,
        "padding": False,
        "max_length": MAX_LENGTH,
    }
    texts = examples["text"]
    return tokenizer(texts, **tokenize_options)

# Tokenize every split in batches; sequences stay unpadded here.
tokenized_dataset = dataset.map(
    preprocess_fn,
    batched=True,
)

# Padding happens per batch in the collator, up to the longest sequence
# in that batch.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="longest",
)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/102080 [00:00<?, ? examples/s]

Map:   0%|          | 0/12760 [00:00<?, ? examples/s]

Map:   0%|          | 0/12760 [00:00<?, ? examples/s]

In [None]:
# ============================
# Block 5 — Metrics Function
# ============================
def compute_metrics(p):
    """Compute classification metrics from model predictions.

    Args:
        p: Object exposing `predictions` (per-class scores, or a tuple whose
            first element holds them) and `label_ids` (integer class labels).
            Presumably the `EvalPrediction` that `Trainer` passes in — confirm.

    Returns:
        dict with "accuracy", "precision", "recall" and "f1". Precision,
        recall and F1 are support-weighted averages over the classes; with
        that weighting recall is numerically equal to accuracy (the two
        columns are identical in the recorded training log).
    """
    scores = p.predictions
    # Some models hand back extra outputs next to the class scores; in that
    # case only the first element is used for the argmax.
    if isinstance(scores, tuple):
        scores = scores[0]

    preds = np.argmax(scores, axis=-1)
    labels = p.label_ids

    acc = accuracy_score(labels, preds)
    # zero_division=0 scores a never-predicted class as 0 without emitting a
    # warning on every evaluation.
    prec, rec, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )
    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1}


In [None]:
# ============================
# Block 6 — Training Arguments
# ============================
# Training configuration. Evaluation, logging and checkpointing all happen
# once per epoch, so every checkpoint has matching validation metrics.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    eval_strategy="epoch",
    logging_strategy="epoch",
    report_to=["tensorboard"],
    save_strategy="epoch",
    learning_rate=LR,
    weight_decay=0.01,
    warmup_steps=500,
    logging_dir=LOG_DIR,
    save_total_limit=5,
    # Restore the best checkpoint (by validation F1) when training ends.
    # In the recorded run validation loss rises after epoch 2 and F1 peaks at
    # epoch 3, so the last-epoch weights are not the best ones. This requires
    # save_strategy and eval_strategy to match, which they do ("epoch").
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    # Mixed precision only when a CUDA GPU is available.
    fp16=torch.cuda.is_available(),
    seed=SEED,
)


In [None]:
# ============================
# Block 7 — Initialize Trainer
# ============================
# Wire model, data, collator and metrics into the Trainer. It trains on the
# tokenized train split and evaluates on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # `tokenizer=` is deprecated in recent transformers releases in favour of
    # `processing_class=`; the old keyword triggered the warning recorded
    # under this cell.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [None]:
# ============================
# Block 8 — Train
# ============================
# Run fine-tuning with the Trainer configured above; the returned TrainOutput
# is displayed as the cell's result.
# In the run recorded in this notebook, validation loss is lowest at epoch 2
# (0.1773) and rises afterwards while training loss keeps falling, and
# accuracy/F1 peak at epoch 3 — which suggests the later epochs overfit.
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2887,0.18278,0.940674,0.940485,0.940674,0.940567
2,0.162,0.177315,0.946552,0.946379,0.946552,0.946417
3,0.1185,0.19055,0.948981,0.948953,0.948981,0.948944
4,0.0807,0.242351,0.947806,0.947806,0.947806,0.947774
5,0.0527,0.284349,0.948433,0.94843,0.948433,0.94843


TrainOutput(global_step=31900, training_loss=0.14050207155998973, metrics={'train_runtime': 4809.4652, 'train_samples_per_second': 106.124, 'train_steps_per_second': 6.633, 'total_flos': 2.4644256113885184e+16, 'train_loss': 0.14050207155998973, 'epoch': 5.0})