# **ALBERT**

In [None]:
# Install the libraries this notebook imports. transformers, torch and
# scikit-learn are pinned to exact versions; the remaining packages are not.
%pip install transformers==4.36.0 torch==2.9.0 scikit-learn==1.7.2 pandas tqdm sentencepiece psutil

## Upload Dataset


In [None]:
from google.colab import files

# Upload merged_fakenewsnet.csv through the Colab file-upload helper
uploaded = files.upload()

# The CSV is read into a DataFrame in the "Data Cleaning and Validation" cell.

## Data Cleaning and Validation
- Removes rows with missing values in the `clean_title` or `label` columns
- Ensures all labels are properly formatted as integers
- Displays before/after statistics to verify the cleaning process
- Loads the dataset and removes any rows with missing or invalid labels
- Converts labels to numeric format and validates data integrity

In [None]:
# Load the CSV, drop invalid rows, and convert the labels to integers
import os
from pathlib import Path
from transformers import AlbertTokenizer
import torch
from sklearn.model_selection import train_test_split
import pandas as pd

# Resolve the project root from the working directory. Only the final path
# component is compared: a substring test on the whole path would also match
# unrelated directories such as ".../notebooks_backup/project".
notebook_dir = Path.cwd()
if notebook_dir.name == 'notebooks':
    # If running from notebooks folder, go up one level
    project_root = notebook_dir.parent
else:
    # If running from project root
    project_root = notebook_dir

data_path = project_root / "data" / "merged_fakenewsnet.csv"
output_path = project_root / "data" / "merged_fakenewsnet_numeric.csv"

# Fall back to a copy sitting directly in the working directory (for example a
# file that was uploaded rather than placed under data/).
if not data_path.exists():
    local_copy = notebook_dir / "merged_fakenewsnet.csv"
    if local_copy.exists():
        data_path = local_copy
        output_path = notebook_dir / "merged_fakenewsnet_numeric.csv"

print(f"Loading from: {data_path}")
df = pd.read_csv(data_path)

print(f"Original shape: {df.shape}")
print(f"Original labels:\n{df['label'].value_counts()}\n")

# Drop rows with a missing title or label. A NaN title would otherwise reach
# the tokenizer as a float and make tokenization fail.
df = df.dropna(subset=['clean_title', 'label'])

# Remove any rows where label is not 'fake' or 'real' (e.g., header rows).
# .copy() detaches the result from the original frame so the column assignment
# below does not raise SettingWithCopyWarning.
df = df[df['label'].isin(['fake', 'real'])].copy()

# Convert string labels to numeric
df['label'] = df['label'].map({'fake': 0, 'real': 1})

# Check conversion
print("After cleaning and conversion:")
print(f"Shape: {df.shape}")
print(f"Label distribution:\n{df['label'].value_counts()}\n")

# Save new version
df.to_csv(output_path, index=False)

print(f"Conversion complete! File saved as {output_path}")

## Tokenization & Data Splitting
**ALBERT Tokenizer Initialization**
- Loads the pre-trained `albert-base-v2` tokenizer from Hugging Face

**Dataset Splitting (80/10/10)**
- **Training set**: 80% of data for model training
- **Validation set**: 10% for hyperparameter tuning and monitoring
- **Test set**: 10% for final model evaluation
- Uses stratified splitting to maintain class balance across all sets

**Tokenization**
- Converts text to token IDs with padding and truncation
- Sets maximum sequence length to 128 tokens
- Returns PyTorch tensors ready for model input

**Output**: Three tokenized datasets ready for training, validation, and testing.

In [None]:
# Load the pre-trained ALBERT tokenizer
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')


# Stratified 80/10/10 split: carve off 20% first, then halve that hold-out
# into validation and test.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['clean_title'].tolist(),
    df['label'].tolist(),
    stratify=df['label'],
    test_size=0.2,
    random_state=42,
)

val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    stratify=temp_labels,
    test_size=0.5,
    random_state=42,
)

print(" Dataset split successful!")
print(f"Train: {len(train_texts)}, Validation: {len(val_texts)}, Test: {len(test_texts)}\n")

# Tokenization Function
def encode_data(texts):
    """Tokenize ``texts`` with the notebook-level ``tokenizer``.

    Every sequence is truncated and padded to exactly 128 tokens, and the
    result is returned as PyTorch tensors.
    """
    options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 128,
        'return_tensors': 'pt',
    }
    return tokenizer(texts, **options)

# Tokenize each split in train -> validation -> test order
train_encodings, val_encodings, test_encodings = (
    encode_data(split_texts)
    for split_texts in (train_texts, val_texts, test_texts)
)

print(" Tokenization complete!")
print(f"Train samples: {len(train_texts)}, Validation: {len(val_texts)}, Test: {len(test_texts)}")

## Create Custom Dataset and DataLoaders
- Defines `FakeNewsDataset` class that wraps tokenized encodings and labels
- Implements required methods (`__getitem__`, `__len__`) for PyTorch compatibility
- Formats data as dictionaries with input tensors and labels

- Creates three dataset instances for training, validation, and testing

**DataLoaders**
- Wraps datasets in DataLoader objects for automatic batching
- **Batch size**: 16 samples per batch
- **Training**: Shuffled for better generalization
- **Validation/Test**: Not shuffled to maintain consistency

**Output**: DataLoaders that efficiently feed batches of data to the model during training.

In [None]:
from torch.utils.data import Dataset, DataLoader

# Define a Custom Dataset Class
class FakeNewsDataset(Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable collection and
    ``labels`` is an indexable collection of the same length. Indexing returns
    a dict holding the ``idx``-th entry of every encoding field plus a
    ``'labels'`` tensor.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


# Wrap each split's encodings and labels in a FakeNewsDataset
train_dataset = FakeNewsDataset(train_encodings, train_labels)
val_dataset = FakeNewsDataset(val_encodings, val_labels)
test_dataset = FakeNewsDataset(test_encodings, test_labels)

# Batch the datasets; only the training data is shuffled
loader_batch_size = 16
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=loader_batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=loader_batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=loader_batch_size)

print("DataLoaders created successfully!")
print(f"Train batches: {len(train_loader)}, Validation: {len(val_loader)}, Test: {len(test_loader)}")

## Model Training with ALBERT
Complete training pipeline for fine-tuning ALBERT on fake news detection:

- Automatically detects and uses GPU if available, otherwise falls back to CPU

- Loads pre-trained `albert-base-v2` model
- Configures for binary classification (2 labels: fake/real)

**Optimizer & Learning Rate Scheduler**
- **Optimizer**: AdamW with learning rate of 2e-5 (standard for transformer fine-tuning)
- **Scheduler**: Linear learning rate decay over training steps
- **Epochs**: 3 (typical for fine-tuning pre-trained models)

**Training Loop**
For each epoch:
- **Training Phase**: 
  - Forward pass through model
  - Compute loss
  - Backward propagation
  - Update weights
  - Track average training loss
  
- **Validation Phase**:
  - Evaluate model on validation set without gradient updates
  - Monitor validation loss to detect overfitting

**Model Saving**
- Saves the fine-tuned model to `albert_fakenewsnet` directory
- Can be reloaded later for inference or further training

**Output**: A fine-tuned ALBERT model specialized for fake news detection.

In [None]:
from transformers import AlbertForSequenceClassification, get_scheduler
from torch.optim import AdamW
from tqdm import tqdm
import torch

# Pick the GPU when one is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Pre-trained ALBERT with a 2-label classification head, moved to the device
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)
model.to(device)

# AdamW with a linear learning-rate decay over all training steps (no warm-up)
num_epochs = 3
num_training_steps = num_epochs * len(train_loader)
optimizer = AdamW(model.parameters(), lr=2e-5)

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)

# Fine-tuning: each epoch runs one pass over the training data followed by
# one pass over the validation data.
def _to_device(raw_batch):
    """Return a copy of the batch dict with every tensor moved to ``device``."""
    return {name: tensor.to(device) for name, tensor in raw_batch.items()}


progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    print(f"\n Epoch {epoch + 1}/{num_epochs}")

    # -------- TRAINING PHASE --------
    total_loss = 0
    for batch in train_loader:
        loss = model(**_to_device(batch)).loss
        total_loss += loss.item()

        # Backpropagate, then advance optimizer and scheduler once per batch
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    avg_loss = total_loss / len(train_loader)
    print(f" Average training loss: {avg_loss:.4f}")

    # -------- VALIDATION PHASE --------
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            val_loss += model(**_to_device(batch)).loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f" Validation loss after epoch {epoch + 1}: {avg_val_loss:.4f}")
    model.train()  # back to training mode for the next epoch

# Persist the fine-tuned weights and config
model.save_pretrained("albert_fakenewsnet")
print("\n Model saved as 'albert_fakenewsnet'")

In [None]:
# List the contents of the directory the fine-tuned model was saved to
%ls albert_fakenewsnet

In [None]:
# Save the tokenizer into the same directory as the fine-tuned model
tokenizer.save_pretrained("albert_fakenewsnet")

In [None]:
# List the directory again to confirm the tokenizer files were added
%ls albert_fakenewsnet

In [None]:
# Downloading the model
import shutil

from google.colab import files

# `%zip` is not an IPython magic, so the archive is built with the standard
# library instead. This writes albert_fakenewsnet.zip in the working directory
# with the albert_fakenewsnet/ folder as the top-level entry.
shutil.make_archive(
    "albert_fakenewsnet",
    "zip",
    root_dir=".",
    base_dir="albert_fakenewsnet",
)
files.download("albert_fakenewsnet.zip")

## Model Performance Evaluation
Evaluates the fine-tuned ALBERT model on the test set:
- **Accuracy**: Overall correctness of predictions
- **Precision**: Proportion of samples predicted as positive (`real`, label 1) that are actually positive
- **Recall**: Proportion of actual positive samples that are correctly identified
- **F1 Score**: Harmonic mean of precision and recall
- **AUC-ROC**: Area under the receiver operating characteristic curve

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
import torch
from tqdm import tqdm

# Switch to evaluation mode before running inference
model.eval()

predictions, true_labels, probs = [], [], []

# Run the test set through the model without tracking gradients
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        logits = model(**batch).logits
        pred_probs = torch.softmax(logits, dim=-1)  # class probabilities
        preds = torch.argmax(pred_probs, dim=-1)

        true_labels.extend(batch['labels'].cpu().numpy())
        predictions.extend(preds.cpu().numpy())
        probs.extend(pred_probs[:, 1].cpu().numpy())  # probability of "real" (label=1)

# Threshold metrics use the hard predictions; AUC uses the label-1 probability
accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)
auc = roc_auc_score(true_labels, probs)

# Summary
print(f"\n Model Evaluation Results:")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")
print(f"AUC:       {auc:.4f}\n")

print("Detailed Classification Report:\n")
print(classification_report(true_labels, predictions, target_names=['Fake', 'Real']))

## Operational Efficiency Metrics
Evaluates computational efficiency for real-time deployment:
- **Inference Latency**: Average time to process samples
- **Memory Usage**: GPU/CPU memory consumption during inference
- **Model Size**: Total disk space of saved model files

In [None]:
import time
import os
import psutil
import numpy as np

# -------- INFERENCE LATENCY --------
# CUDA kernels execute asynchronously with respect to the Python thread, so the
# device is synchronised right before and right after each forward pass.
# Without that, the clock only captures the time needed to launch the kernels.
on_gpu = device.type == "cuda"

model.eval()
latencies = []   # forward-pass time per batch, in milliseconds
num_samples = 0  # samples actually timed (the final batch may be smaller)

if on_gpu:
    # Restart peak tracking so the memory figure below reflects this
    # inference pass rather than earlier training.
    torch.cuda.reset_peak_memory_stats(device)

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Measuring Latency"):
        batch = {k: v.to(device) for k, v in batch.items()}
        if on_gpu:
            torch.cuda.synchronize(device)
        start_time = time.perf_counter()
        outputs = model(**batch)
        if on_gpu:
            torch.cuda.synchronize(device)
        latencies.append((time.perf_counter() - start_time) * 1000)  # ms
        num_samples += batch['labels'].size(0)

avg_latency = np.mean(latencies) if latencies else float("nan")
# Divide total time by the real sample count instead of assuming that every
# batch holds exactly 16 samples.
per_sample_latency = sum(latencies) / num_samples if num_samples else float("nan")
print(f"\nInference Latency:")
print(f"   Average: {avg_latency:.2f} ms/batch ({per_sample_latency:.2f} ms/sample)")

# -------- MEMORY USAGE --------
if on_gpu:
    # Peak allocation during the loop above, not just what is still held now.
    memory_mb = torch.cuda.max_memory_allocated(device) / (1024 ** 2)
    print(f"\nGPU Memory (peak): {memory_mb:.2f} MB")
else:
    # Resident set size of the current process.
    memory_mb = psutil.Process(os.getpid()).memory_info().rss / (1024 ** 2)
    print(f"\nCPU Memory: {memory_mb:.2f} MB")

# -------- MODEL SIZE --------
# Add up the size of every file found under the saved-model directory.
model_dir = "albert_fakenewsnet"
total_size = 0
for dirpath, _subdirs, filenames in os.walk(model_dir):
    for filename in filenames:
        total_size += os.path.getsize(os.path.join(dirpath, filename))
size_mb = total_size / (1024 ** 2)
print(f"\nModel Size: {size_mb:.2f} MB")