**INSTALL REQUIREMENTS**

In [1]:
!pip install transformers datasets torch accelerate evaluate scikit-learn

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


**MOUNT DRIVE & IMPORT LIBRARIES**

In [2]:
# Mount Google Drive at /content/drive; the dataset, tokenizer and output paths used in later cells live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

import torch
import numpy as np
from datasets import load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Select the compute device: CUDA when a GPU is visible to torch, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Using device: {}".format(device))

# Report the name and total memory (in GB) of GPU 0 when running on CUDA.
if device.type == 'cuda':
    print("GPU: {}".format(torch.cuda.get_device_name(0)))
    print("Memory Available: {:.2f} GB".format(torch.cuda.get_device_properties(0).total_memory / 1e9))

Mounted at /content/drive
Using device: cuda
GPU: Tesla T4
Memory Available: 15.83 GB


**LOAD TOKENIZED DATASETS**

In [3]:
# Stage banner.
print("\n" + "=" * 50, "LOADING TOKENIZED DATASETS FROM GOOGLE DRIVE", "=" * 50, sep="\n")

# Project folder on the mounted Drive; every artifact path below is built from it.
drive_path = "/content/drive/MyDrive/AG_News_Project"

# Read the three previously saved splits back from disk.
tokenized_train = load_from_disk(drive_path + "/tokenized_train")
tokenized_val = load_from_disk(drive_path + "/tokenized_val")
tokenized_test = load_from_disk(drive_path + "/tokenized_test")

# Report the size of each split.
print(
    f"✓ Loaded tokenized train: {len(tokenized_train)} samples",
    f"✓ Loaded tokenized validation: {len(tokenized_val)} samples",
    f"✓ Loaded tokenized test: {len(tokenized_test)} samples",
    sep="\n",
)

# Restore the tokenizer saved in the same project folder.
tokenizer = AutoTokenizer.from_pretrained(drive_path + "/tokenizer")
print("✓ Loaded tokenizer")


LOADING TOKENIZED DATASETS FROM GOOGLE DRIVE
✓ Loaded tokenized train: 108000 samples
✓ Loaded tokenized validation: 12000 samples
✓ Loaded tokenized test: 7600 samples
✓ Loaded tokenizer


**LOAD DISTILBERT MODEL**

In [4]:
# Stage banner.
print("\n" + "=" * 50, "LOADING DISTILBERT MODEL", "=" * 50, sep="\n")

# Checkpoint to fine-tune and the number of output classes for its classification head.
model_name = "distilbert-base-uncased"
num_labels = 4  # AG News has 4 categories

print("\nLoading " + model_name + " for sequence classification...")
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Place the model on the selected device (.to() hands back the same module).
model = model.to(device)

# Summarise what was loaded and where it lives.
print(
    "✓ Model loaded successfully",
    f"Number of parameters: {model.num_parameters():,}",
    f"Model on device: {next(model.parameters()).device}",
    sep="\n",
)


LOADING DISTILBERT MODEL

Loading distilbert-base-uncased for sequence classification...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ Model loaded successfully
Number of parameters: 66,956,548
Model on device: cuda:0


**DEFINE EVALUATION METRICS**

In [5]:
print("\n" + "="*50)
print("SETTING UP EVALUATION METRICS")
print("="*50)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision, recall and F1 for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds
            per-class scores with the class dimension last; if it is a tuple,
            its first element is used as the scores. ``labels`` holds the
            true class ids.

    Returns:
        dict: ``{'accuracy', 'precision', 'recall', 'f1'}``; precision, recall
        and F1 are averaged over classes weighted by class support.
    """
    predictions, labels = eval_pred

    # Some models return several outputs; by convention the first one carries the logits.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # The class dimension is the last axis, so argmax over it gives the predicted class id.
    predicted_classes = np.argmax(predictions, axis=-1)

    accuracy = accuracy_score(labels, predicted_classes)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels,
        predicted_classes,
        average='weighted',  # weighted average for multi-class
        # A class that is never predicted scores 0 (same value as the default)
        # without emitting UndefinedMetricWarning on every evaluation.
        zero_division=0
    )

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

print("✓ Metrics function defined: accuracy, precision, recall, F1-score")


SETTING UP EVALUATION METRICS
✓ Metrics function defined: accuracy, precision, recall, F1-score


**TRAINING CONFIGURATION**

In [6]:
# Stage banner.
print("\n" + "=" * 50, "CONFIGURING TRAINING ARGUMENTS", "=" * 50, sep="\n")

training_args = TrainingArguments(
    # --- where artifacts go ---
    output_dir=drive_path + "/results",     # checkpoints are written here
    logging_dir=drive_path + "/logs",       # log directory
    report_to="none",                       # no wandb/tensorboard reporting
    push_to_hub=False,                      # keep everything off the HuggingFace Hub

    # --- optimisation ---
    learning_rate=2e-5,                     # BERT authors recommend 2e-5 to 5e-5
    weight_decay=0.01,                      # regularisation strength
    warmup_steps=500,                       # learning-rate warmup length
    num_train_epochs=3,                     # 2-4 epochs recommended for BERT
    per_device_train_batch_size=16,         # lower this if GPU memory runs out
    per_device_eval_batch_size=32,          # evaluation can use a larger batch

    # --- evaluation, checkpointing, logging cadence ---
    eval_strategy="epoch",                  # evaluate once per epoch
    save_strategy="epoch",                  # checkpoint once per epoch
    logging_steps=100,                      # log every 100 steps
    save_total_limit=2,                     # cap the number of checkpoints kept on disk
    load_best_model_at_end=True,            # restore the best checkpoint after training
    metric_for_best_model="accuracy",       # "best" is judged by accuracy
)

# Echo the key hyperparameters back from the constructed object.
print("✓ Training arguments configured:")
print("\n".join(
    f"  - {label}: {getattr(training_args, attribute)}"
    for label, attribute in (
        ("Learning rate", "learning_rate"),
        ("Batch size (train)", "per_device_train_batch_size"),
        ("Batch size (eval)", "per_device_eval_batch_size"),
        ("Epochs", "num_train_epochs"),
        ("Warmup steps", "warmup_steps"),
        ("Weight decay", "weight_decay"),
    )
))


CONFIGURING TRAINING ARGUMENTS
✓ Training arguments configured:
  - Learning rate: 2e-05
  - Batch size (train): 16
  - Batch size (eval): 32
  - Epochs: 3
  - Warmup steps: 500
  - Weight decay: 0.01


**CREATE TRAINER**

In [7]:
print("\n" + "="*50)
print("CREATING TRAINER")
print("="*50)

# Wire the model, training configuration, train/validation splits and metric
# function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    # `tokenizer=` is deprecated on Trainer (it raises a FutureWarning and is
    # rejected by newer releases); `processing_class` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

print("✓ Trainer created successfully")


CREATING TRAINER
✓ Trainer created successfully


  trainer = Trainer(


**TRAINING START!!!**

In [8]:
# Stage banner plus a heads-up about the expected duration.
print(
    "\n" + "=" * 50,
    "STARTING TRAINING",
    "=" * 50,
    "This will take some time (approximately 15-30 minutes depending on GPU)...",
    "You can monitor progress below:\n",
    sep="\n",
)

# Run fine-tuning; the returned object carries the run's summary metrics.
train_result = trainer.train()

# Closing banner followed by runtime and throughput taken from the training metrics.
print("\n" + "=" * 50, "TRAINING COMPLETED!", "=" * 50, sep="\n")
print("Training time: {:.2f} seconds".format(train_result.metrics['train_runtime']))
print("Training samples/second: {:.2f}".format(train_result.metrics['train_samples_per_second']))


STARTING TRAINING
This will take some time (approximately 15-30 minutes depending on GPU)...
You can monitor progress below:



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1986,0.196233,0.938917,0.938997,0.938917,0.938868
2,0.145,0.185507,0.945583,0.94583,0.945583,0.945619
3,0.1056,0.219925,0.946333,0.946471,0.946333,0.946327



TRAINING COMPLETED!
Training time: 3546.93 seconds
Training samples/second: 91.35


**SAVE THE MODEL**

In [9]:
# Stage banner.
print("\n" + "=" * 50, "SAVING TRAINED MODEL", "=" * 50, sep="\n")

# Write the model and the tokenizer side by side into one folder on Drive.
model_save_path = drive_path + "/distilbert_ag_news_final"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

print("✓ Model saved to: " + model_save_path)
print("✓ Tokenizer saved to: " + model_save_path)

# Final wrap-up banner.
print(
    "\n" + "=" * 50,
    "MODEL TRAINING COMPLETE!",
    "Ready for evaluation on test set.",
    "=" * 50,
    sep="\n",
)


SAVING TRAINED MODEL
✓ Model saved to: /content/drive/MyDrive/AG_News_Project/distilbert_ag_news_final
✓ Tokenizer saved to: /content/drive/MyDrive/AG_News_Project/distilbert_ag_news_final

MODEL TRAINING COMPLETE!
Ready for evaluation on test set.
