**INSTALL REQUIRED PACKAGES**

In [None]:
!pip install transformers datasets torch accelerate evaluate scikit-learn

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


**MOUNT GOOGLE DRIVE**

In [12]:
from google.colab import drive

# Attach Google Drive at /content/drive so the notebook can save files under it
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**IMPORT LIBRARIES**

In [4]:
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


**LOAD THE DATA**

In [6]:
# Fetch the AG News dataset ("sh0416/ag_news") via HuggingFace `datasets`
print("Loading AG News dataset...")
dataset = load_dataset("sh0416/ag_news")

# Print the overall DatasetDict layout
print("\nDataset structure:")
print(dataset)

# Report the column names of the train split and the size of each split
print(
    f"\nColumns: {dataset['train'].column_names}",
    f"\nTraining samples: {len(dataset['train'])}",
    f"Test samples: {len(dataset['test'])}",
    sep="\n",
)

Loading AG News dataset...

Dataset structure:
DatasetDict({
    train: Dataset({
        features: ['label', 'title', 'description'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['label', 'title', 'description'],
        num_rows: 7600
    })
})

Columns: ['label', 'title', 'description']

Training samples: 120000
Test samples: 7600


**DATA EXPLORATION AND PREPROCESSING**

In [7]:
# Preview the first three training rows (description truncated to 150 characters)
print("\nSample data:")
for i in range(3):
    row = dataset['train'][i]  # fetch the row once and reuse it for all three fields
    print(f"\nExample {i+1}:")
    print(f"Title: {row['title']}")
    print(f"Description: {row['description'][:150]}...")
    print(f"Label: {row['label']}")

# Show which class each raw label id stands for
print("\nLabel mapping:")
# Note: sh0416/ag_news uses labels 1-4, not 0-3
label_names = dict(enumerate(["World", "Sports", "Business", "Sci/Tech"], start=1))
print(label_names)

# Convert labels from 1-4 to 0-3 (standard for PyTorch/transformers)
def adjust_labels(example):
    """Decrement the example's ``label`` by one in place and return the example.

    Intended for use with ``dataset.map`` to turn 1-based labels into 0-based ones.
    """
    example['label'] -= 1
    return example

# Shift labels only while they are still 1-based. Without this guard, re-running
# the cell would subtract 1 a second time and push the labels to -1..2.
# Only the train split is inspected; both splits are always mapped together here.
if min(dataset['train'].unique('label')) >= 1:
    dataset = dataset.map(adjust_labels)
    print("\nAdjusted labels to 0-3 range")
else:
    print("\nLabels are already in the 0-3 range; skipping adjustment")

# Combine title and description into one text field
def combine_text(example):
    """Store ``title`` and ``description`` joined by ". " under ``text``.

    The example is modified in place and returned, as ``dataset.map`` expects.
    """
    headline = example['title']
    body = example['description']
    example['text'] = ". ".join((headline, body))
    return example

# Add the combined 'text' column to every split of the DatasetDict
dataset = dataset.map(function=combine_text)
print("Combined title and description into 'text' field")


Sample data:

Example 1:
Title: Wall St. Bears Claw Back Into the Black (Reuters)
Description: Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again....
Label: 3

Example 2:
Title: Carlyle Looks Toward Commercial Aerospace (Reuters)
Description: Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense indu...
Label: 3

Example 3:
Title: Oil and Economy Cloud Stocks' Outlook (Reuters)
Description: Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during...
Label: 3

Label mapping:
{1: 'World', 2: 'Sports', 3: 'Business', 4: 'Sci/Tech'}


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]


Adjusted labels to 0-3 range


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Combined title and description into 'text' field


**SPLIT DATA**

In [10]:
# Hold out 10% of the original training split for validation; the fixed seed
# keeps the split the same from run to run
train_test_split = dataset['train'].train_test_split(test_size=0.1, seed=42)
train_dataset, val_dataset = train_test_split['train'], train_test_split['test']
test_dataset = dataset['test']  # the provided test split is used unchanged

# Report the size of each split
print("\nFinal splits:")
print(
    f"Training: {len(train_dataset)} samples",
    f"Validation: {len(val_dataset)} samples",
    f"Test: {len(test_dataset)} samples",
    sep="\n",
)

# Show one training sample with the combined text field.
# class_names is indexed by the zero-based label (0-3) produced earlier in the notebook.
class_names = ['World', 'Sports', 'Business', 'Sci/Tech']
sample = train_dataset[0]  # fetch the row once instead of re-indexing per field
print("\nSample with combined text:")
print(f"Text: {sample['text'][:200]}...")
# The closing parenthesis after the class name was missing from the original format string
print(f"Label: {sample['label']} ({class_names[sample['label']]})")


Final splits:
Training: 108000 samples
Validation: 12000 samples
Test: 7600 samples

Sample with combined text:
Text: Despair and Anger in Small Russian Town After Siege.  BESLAN, Russia (Reuters) - The killing of more than 320  children, parents and teachers during the bloody end to a  53-hour school siege left bare...
Label: 0 (World


**SAVE THE DATA**

In [13]:
# ========== SAVE PROCESSED DATA TO GOOGLE DRIVE ==========
import os

print("\n" + "="*50, "SAVING PROCESSED DATASETS TO GOOGLE DRIVE...", "="*50, sep="\n")

# Project folder on the mounted Drive; created if it does not exist yet
drive_path = "/content/drive/MyDrive/AG_News_Project"
os.makedirs(drive_path, exist_ok=True)

# Write each split to its own sub-folder under the project folder
for split, folder in (
    (train_dataset, "ag_news_train"),
    (val_dataset, "ag_news_val"),
    (test_dataset, "ag_news_test"),
):
    split.save_to_disk(f"{drive_path}/{folder}")

# Confirm where each split was written
print(
    f"\n✓ Saved train dataset to: {drive_path}/ag_news_train",
    f"✓ Saved validation dataset to: {drive_path}/ag_news_val",
    f"✓ Saved test dataset to: {drive_path}/ag_news_test",
    "\nData preparation complete! Files saved to Google Drive.",
    sep="\n",
)


SAVING PROCESSED DATASETS TO GOOGLE DRIVE...


Saving the dataset (0/1 shards):   0%|          | 0/108000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7600 [00:00<?, ? examples/s]


✓ Saved train dataset to: /content/drive/MyDrive/AG_News_Project/ag_news_train
✓ Saved validation dataset to: /content/drive/MyDrive/AG_News_Project/ag_news_val
✓ Saved test dataset to: /content/drive/MyDrive/AG_News_Project/ag_news_test

Data preparation complete! Files saved to Google Drive.
