# Notebook 2: News Classification

This notebook implements:
- Loading pretrained multilingual models (XLM-RoBERTa)
- Fine-tuning for Nepali news classification
- Training on 13+ news categories
- Model evaluation and saving
- Prediction on new articles

In [2]:
%pip install numpy pandas matplotlib seaborn scikit-learn torch transformers

Collecting numpy
  Downloading numpy-2.4.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting pandas
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.8-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (52 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (11 kB)
Collecting torch
  Downloading torch-2.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting transformers
  Downloading transformers-4.57.6-py3-none-any.whl.metadata (43 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.3-cp312-cp312-manylinux_2_27_x86_64.manylinu

In [3]:
# Install Hugging Face `datasets` (its `Dataset` class is imported in the imports cell).
# NOTE(review): `evaluate` is not imported by any visible cell - confirm it is still needed.
%pip install datasets evaluate

Collecting datasets
  Downloading datasets-4.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-23.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Collecting multiprocess<0.70.19 (from datasets)
  Downloading multiprocess-0.70.18-py312-none-any.whl.metadata (7.5 kB)
Collecting fsspec<=2025.10.0,>=2023.1.0 (from fsspec[http]<=2025.10.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.10.0-py3-none-any.whl.metadata (10 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.10.0,>=2023.1.0->datasets)
  Downloading aiohttp-3.13.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_

In [1]:
# Import required libraries
import json
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Transformers and PyTorch
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import Dataset

# Select the compute device: CUDA when torch reports it as available, CPU otherwise.
_cuda_available = torch.cuda.is_available()
if _cuda_available:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# The GPU name is only reported when CUDA is actually usable.
if _cuda_available:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

print("\n‚úì Libraries imported successfully")

Using device: cpu

‚úì Libraries imported successfully


## 1. Configuration

In [7]:
# Project layout: every location is derived from BASE_DIR.
BASE_DIR = Path(r'/workspaces/sem_practise')
DATA_DIR = BASE_DIR / 'data'
MODEL_DIR = BASE_DIR / 'models' / 'news_classifier'
RESULTS_DIR = BASE_DIR / 'results'

# Output folders are created up front (no error if they already exist).
for _output_dir in (MODEL_DIR, RESULTS_DIR):
    _output_dir.mkdir(parents=True, exist_ok=True)

# Model / training hyperparameters
MODEL_NAME = "FacebookAI/xlm-roberta-base"  # Pretrained multilingual model
MAX_LENGTH = 512
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3

# Echo the active configuration, one setting per line.
print("\n".join([
    f"Model: {MODEL_NAME}",
    f"Max sequence length: {MAX_LENGTH}",
    f"Batch size: {BATCH_SIZE}",
    f"Learning rate: {LEARNING_RATE}",
    f"Number of epochs: {NUM_EPOCHS}",
]))

Model: FacebookAI/xlm-roberta-base
Max sequence length: 512
Batch size: 8
Learning rate: 2e-05
Number of epochs: 3


## 2. Load Data

In [8]:
# Load the preprocessed train/test splits (UTF-8 encoded JSON files under DATA_DIR).
train_path = DATA_DIR / 'train_data.json'
test_path = DATA_DIR / 'test_data.json'

train_data = json.loads(train_path.read_text(encoding='utf-8'))
test_data = json.loads(test_path.read_text(encoding='utf-8'))

print(f"‚úì Loaded {len(train_data)} training samples")
print(f"‚úì Loaded {len(test_data)} test samples")

# DataFrames are easier to inspect and to extend with a numeric label column later on.
train_df, test_df = (pd.DataFrame(records) for records in (train_data, test_data))

print("\nSample data:")
train_df.head()

‚úì Loaded 1431 training samples
‚úì Loaded 358 test samples

Sample data:


Unnamed: 0,text,category
0,"‡•©‡•¶ ‡§™‡•Å‡§∏, ‡§ï‡§æ‡§†‡§Æ‡§æ‡§°‡•å‡§Ç ‡•§ ‡§¶‡•ã‡§∞‡•ç‡§¶‡•Ä ‡§ñ‡•ã‡§≤‡§æ ‡§ú‡§≤‡§µ‡§ø‡§¶‡•ç‡§Æ‡•Å‡§§ ‡§ï‡§Æ‡•ç‡§™‡§®...",bank
1,"‡§ï‡•á ‡§§‡§™‡§æ‡§à‡§Ç‡§≤‡§æ‡§à ‡§•‡§æ‡§π‡§æ ‡§õ ? ‡§π‡§æ‡§•, ‡§ñ‡•Å‡§ü‡•ç‡§ü‡§æ‡§ï‡§æ ‡§®‡§ô‡§π‡§∞‡•Å‡§ï‡•ã ‡§∞‡§Ç‡§ó...",health
2,‡§™‡•ç‡§∞‡§ß‡§æ‡§®‡§Æ‡§®‡•ç‡§§‡•ç‡§∞‡•Ä ‡§ï‡•á‡§™‡•Ä ‡§∂‡§∞‡•ç‡§Æ‡§æ ‡§ì‡§≤‡•Ä ‡§•‡•ç‡§∞‡•Ä ‡§®‡•á‡§∏‡§®‡•ç‡§∏ ‡§ï‡§™‡§ï‡•ã ...,sports
3,"‡§∞‡§æ‡§ú‡§µ‡§ø‡§∞‡§æ‡§ú, ‡•®‡•™ ‡§ö‡•à‡§§ ‡•§ ‡§∏‡§™‡•ç‡§§‡§∞‡•Ä‡§ï‡•ã ‡§Æ‡§≤‡•á‡§ï‡§™‡•Å‡§∞ ‡§ó‡§æ‡§â‡§Å ‡§µ‡§ø‡§ï‡§æ‡§∏...",politic
4,‡§∏‡•á‡§û‡•ç‡§ö‡•Å‡§∞‡•Ä‡§Æ‡§æ ‡§ó‡§§ ‡§Ö‡§∏‡•ã‡§ú ‡•ß ‡§ó‡§§‡•á‡§¶‡•á‡§ñ‡§ø ‡§™‡•ç‡§∞‡§Æ‡•Ç‡§ñ ‡§ï‡§æ‡§∞‡•ç‡§Ø‡§ï‡§æ‡§∞‡•Ä ...,bank


In [9]:
# Create label mappings
# Class ids follow the sorted order of the category names found in the training split.
categories = sorted(train_df['category'].unique())
label2id = {label: idx for idx, label in enumerate(categories)}
id2label = {idx: label for label, idx in label2id.items()}

print(f"Number of categories: {len(categories)}")
print("\nCategory mappings:")
for label, idx in label2id.items():
    print(f"  {idx}: {label}")

# Fail fast if the test split contains a category the training split never saw:
# `.map(label2id)` would otherwise silently produce NaN labels for those rows.
unseen_categories = sorted(
    str(category)
    for category in test_df.loc[~test_df['category'].isin(categories), 'category'].unique()
)
if unseen_categories:
    raise ValueError(
        f"Test set contains categories not present in the training set: {unseen_categories}"
    )

# Add numeric labels
train_df['label'] = train_df['category'].map(label2id)
test_df['label'] = test_df['category'].map(label2id)

# Save label mappings (JSON object keys are always strings, so the integer ids
# of `id2label` are written out as strings)
with open(MODEL_DIR / 'label_mappings.json', 'w', encoding='utf-8') as f:
    json.dump({'label2id': label2id, 'id2label': id2label}, f, ensure_ascii=False, indent=2)

print(f"\n‚úì Label mappings saved to {MODEL_DIR / 'label_mappings.json'}")

Number of categories: 13

Category mappings:
  0: Agriculture
  1: automobiles
  2: bank
  3: business
  4: economy
  5: education
  6: entertainment
  7: health
  8: politic
  9: sports
  10: technology
  11: tourism
  12: world

‚úì Label mappings saved to /workspaces/sem_practise/models/news_classifier/label_mappings.json


## 3. Load Pretrained Model and Tokenizer

In [10]:
# Tokenizer matching the pretrained checkpoint
print(f"Loading tokenizer: {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("‚úì Tokenizer loaded")

# Pretrained encoder with a sequence-classification head sized to the category
# list; both label mappings are handed to the model configuration.
print(f"\nLoading pretrained model: {MODEL_NAME}...")
classifier_options = dict(
    num_labels=len(categories),
    id2label=id2label,
    label2id=label2id,
)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **classifier_options)
model = model.to(device)
print("‚úì Model loaded and moved to device")

# Report the size of the loaded model
parameter_count = model.num_parameters()
print(f"\nModel parameters: {parameter_count:,}")

Loading tokenizer: FacebookAI/xlm-roberta-base...
‚úì Tokenizer loaded

Loading pretrained model: FacebookAI/xlm-roberta-base...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úì Model loaded and moved to device

Model parameters: 278,053,645


## 4. Tokenize Data

In [None]:
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Args:
        examples: Mapping with a 'text' entry holding the raw text(s) to
            tokenize (this function is applied with `Dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output for the batch, truncated to at most MAX_LENGTH
        tokens per sequence and left unpadded.
    """
    # Padding is intentionally not applied here: the DataCollatorWithPadding
    # passed to the Trainer pads each batch to its own longest sequence, which
    # avoids pushing every sample through the model at the full MAX_LENGTH.
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=MAX_LENGTH
    )

# Wrap the model inputs (raw text + integer label) as HuggingFace Datasets
model_columns = ['text', 'label']
train_dataset, test_dataset = (
    Dataset.from_pandas(frame[model_columns]) for frame in (train_df, test_df)
)

# Tokenize each split in batches
print("Tokenizing training data...")
train_dataset = train_dataset.map(tokenize_function, batched=True)

print("Tokenizing test data...")
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Show the resulting dataset summaries
print(
    "\n‚úì Tokenization complete",
    f"Train dataset: {train_dataset}",
    f"Test dataset: {test_dataset}",
    sep="\n",
)

## 5. Training Configuration

In [None]:
# Define training arguments
# Evaluation and checkpointing both run once per epoch so that
# `load_best_model_at_end` can pick the checkpoint with the best accuracy.
training_args = TrainingArguments(
    output_dir=str(MODEL_DIR / 'checkpoints'),
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    # `eval_strategy` is the current name of this option; the former
    # `evaluation_strategy` keyword is rejected by recent transformers releases
    # (this notebook's install cell pulled 4.57.6).
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    logging_dir=str(MODEL_DIR / 'logs'),
    logging_steps=10,
    warmup_steps=100,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
)

print("Training configuration:")
print(f"  Output directory: {training_args.output_dir}")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  Mixed precision (FP16): {training_args.fp16}")

In [None]:
# Metric callback handed to the Trainer
def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max predictions.

    Args:
        eval_pred: Pair unpacked as (predictions, labels). The predictions are
            reduced with an arg-max over axis 1 - presumably per-class scores
            of shape (n_samples, n_classes); TODO confirm.

    Returns:
        dict with a single 'accuracy' entry.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return {'accuracy': accuracy_score(references, predicted_ids)}

# Collator that pads the samples of each batch using the tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

print("‚úì Metrics and data collator configured")

## 6. Fine-tune Model

In [None]:
# Initialize Trainer
# NOTE(review): the test split is also used as the per-epoch evaluation set, and
# `load_best_model_at_end` selects the checkpoint on it, so the accuracy reported
# on that same split is optimistic. Consider carving out a validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `processing_class` is the current name of the deprecated `tokenizer`
    # argument in recent transformers releases.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("‚úì Trainer initialized")
print("\nStarting fine-tuning...\n")

# Train the model
train_result = trainer.train()

print("\n‚úì Training complete!")
print(f"Training loss: {train_result.training_loss:.4f}")

## 7. Evaluate Model

In [None]:
# Run the Trainer's evaluation loop (its eval dataset is the test split)
print("Evaluating model on test set...")
eval_results = trainer.evaluate()

# Print every returned metric with four decimals
print("\nEvaluation Results:")
for metric_name in eval_results:
    print(f"  {metric_name}: {eval_results[metric_name]:.4f}")

In [None]:
# Get predictions
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)
true_labels = test_df['label'].values

# Always report on the full set of class ids. Without an explicit `labels`
# argument scikit-learn infers the classes from the data, and `target_names`
# no longer lines up (or raises) when a class is absent from both the test
# labels and the predictions.
label_ids = list(range(len(categories)))
report_options = dict(
    labels=label_ids,
    target_names=categories,
    zero_division=0,  # classes that were never predicted score 0
)

# Classification report
print("\nDetailed Classification Report:")
print("="*80)
report = classification_report(
    true_labels,
    pred_labels,
    digits=4,
    **report_options
)
print(report)

# Save classification report
report_dict = classification_report(
    true_labels,
    pred_labels,
    output_dict=True,
    **report_options
)

with open(RESULTS_DIR / 'classification_report.json', 'w', encoding='utf-8') as f:
    json.dump(report_dict, f, ensure_ascii=False, indent=2)

print(f"\n‚úì Report saved to {RESULTS_DIR / 'classification_report.json'}")

In [None]:
# Confusion Matrix
# Passing every class id explicitly keeps the matrix at
# len(categories) x len(categories), so rows/columns always line up with the
# tick labels even when a class never appears in the test labels or predictions.
cm = confusion_matrix(true_labels, pred_labels, labels=list(range(len(categories))))

plt.figure(figsize=(14, 12))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=categories,
    yticklabels=categories,
    cbar_kws={'label': 'Count'}
)
plt.title('Confusion Matrix - News Classification', fontsize=16, fontweight='bold', pad=20)
plt.xlabel('Predicted Category', fontsize=12)
plt.ylabel('True Category', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
# Save before `plt.show()` so the figure is written while it is still current
plt.savefig(RESULTS_DIR / 'confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"‚úì Confusion matrix saved to {RESULTS_DIR / 'confusion_matrix.png'}")

## 8. Save Fine-tuned Model

In [None]:
# Persist the fine-tuned model and its tokenizer side by side
final_model_dir = MODEL_DIR / 'final_model'

print("Saving fine-tuned model...")
trainer.save_model(str(final_model_dir))
tokenizer.save_pretrained(str(final_model_dir))

print(f"‚úì Model saved to {final_model_dir}")

# Summary of the run: data sizes, headline results and the hyperparameters used
metrics = dict(
    model_name=MODEL_NAME,
    num_categories=len(categories),
    train_samples=len(train_df),
    test_samples=len(test_df),
    accuracy=float(eval_results['eval_accuracy']),
    training_loss=float(train_result.training_loss),
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
)

metrics_path = MODEL_DIR / 'training_metrics.json'
metrics_path.write_text(
    json.dumps(metrics, ensure_ascii=False, indent=2),
    encoding='utf-8',
)

print(f"‚úì Metrics saved to {metrics_path}")

## 9. Test Predictions on Sample Articles

In [None]:
def predict_category(text, model, tokenizer, id2label, device, max_length=None):
    """
    Predict category for a given text

    Args:
        text: The text to classify. The result is read with `.item()`, so a
            single text (batch of one) is expected.
        model: Callable classification model; it is invoked as
            `model(**inputs)` and must return an object with a `.logits` tensor.
        tokenizer: Callable tokenizer whose output supports `.to(device)`.
        id2label: Mapping from class id to label name. Keys may be integers
            (as built in this notebook) or their string form (as produced by
            a JSON round-trip of the mapping).
        device: Device the tokenized inputs are moved to.
        max_length: Maximum sequence length used for truncation. Defaults to
            the notebook-level MAX_LENGTH when omitted.

    Returns:
        Tuple (label, confidence): the predicted label name and the softmax
        probability of the predicted class.

    Raises:
        KeyError: If the predicted class id is missing from `id2label` under
            both its integer and its string key.
    """
    if max_length is None:
        max_length = MAX_LENGTH

    # Tokenize
    inputs = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
        padding=True
    ).to(device)

    # Predict (eval mode, no gradient tracking)
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(probabilities, dim=-1).item()
        confidence = probabilities[0][predicted_class].item()

    # The in-memory mapping uses int keys; one reloaded from JSON uses str keys.
    # Accept both instead of assuming the string form.
    if predicted_class in id2label:
        label = id2label[predicted_class]
    else:
        label = id2label[str(predicted_class)]

    return label, confidence

# Test on random samples
print("Testing predictions on random samples:\n")
print("="*80)

# A seeded generator makes the displayed samples identical on every
# Restart & Run All; the sample size is capped so that sampling without
# replacement cannot fail on a test set with fewer than 5 rows.
sample_rng = np.random.default_rng(42)
num_samples = min(5, len(test_df))
sample_indices = sample_rng.choice(len(test_df), num_samples, replace=False)

for idx in sample_indices:
    text = test_df.iloc[idx]['text']
    true_category = test_df.iloc[idx]['category']

    predicted_category, confidence = predict_category(text, model, tokenizer, id2label, device)

    print(f"\nText (first 150 chars): {text[:150]}...")
    print(f"True Category: {true_category}")
    print(f"Predicted Category: {predicted_category}")
    print(f"Confidence: {confidence:.4f}")
    print(f"Correct: {'‚úì' if true_category == predicted_category else '‚úó'}")
    print("-"*80)

## 10. Summary

In [None]:
# Final recap of the run. Everything printed here comes from objects created in
# earlier cells (the configuration constants, the data frames and the `metrics`
# dict built when the model was saved), so this cell only works after those ran.
print("="*80)
print("NEWS CLASSIFICATION SUMMARY")
print("="*80)
# Model and dataset sizes
print(f"\nü§ñ Model: {MODEL_NAME} (Pretrained)")
print(f"üìä Categories: {len(categories)}")
print(f"üìù Training samples: {len(train_df)}")
print(f"üß™ Test samples: {len(test_df)}")
# Headline results taken from the saved `metrics` dict
print(f"\nüìà Performance:")
print(f"  ‚Ä¢ Accuracy: {metrics['accuracy']:.4f}")
print(f"  ‚Ä¢ Training loss: {metrics['training_loss']:.4f}")
# Locations of the artifacts written by the earlier cells
print(f"\nüíæ Saved Files:")
print(f"  ‚Ä¢ Model: {MODEL_DIR / 'final_model'}")
print(f"  ‚Ä¢ Label mappings: {MODEL_DIR / 'label_mappings.json'}")
print(f"  ‚Ä¢ Training metrics: {MODEL_DIR / 'training_metrics.json'}")
print(f"  ‚Ä¢ Classification report: {RESULTS_DIR / 'classification_report.json'}")
print(f"  ‚Ä¢ Confusion matrix: {RESULTS_DIR / 'confusion_matrix.png'}")
print("\n‚úÖ News classification completed successfully!")
print("="*80)