In [None]:
# IMPORT
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix, balanced_accuracy_score

from tqdm.notebook import tqdm
from torch.utils.data import Dataset

import wandb  # Weights & Biases for experiment tracking and visualization
import matplotlib.pyplot as plt  
import seaborn as sns
from transformers import AutoModel, AutoTokenizer

In [None]:
# Log in to Weights & Biases for experiment tracking.
# The API key is intentionally NOT written into the notebook: wandb.login()
# resolves it from the WANDB_API_KEY environment variable or the local netrc
# file, and otherwise prompts for it interactively.
wandb.login()

# Load the raw dataset
df = pd.read_csv('../data/evons.csv')

# Missing titles / descriptions become empty strings so they can be tokenized later
df['title'] = df['title'].fillna('')
df['description'] = df['description'].fillna('')

# An article is labelled viral when its Facebook engagement count is strictly
# above the 95th percentile of the whole dataset
engagement_threshold = df['fb_engagements'].quantile(0.95)
df['is_viral'] = (df['fb_engagements'] > engagement_threshold).astype(int)

# Keep only the columns used by the rest of the notebook
df = df[['title', 'description','fb_engagements', 'is_viral']]

X_text = df[['title', 'description']].values  # Features: column 0 = title, column 1 = description
y = df['is_viral'].values  # Labels: is_viral (binary classification)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Compute device shared by the notebook: GPU when CUDA is available, CPU otherwise
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# PyTorch module that turns raw strings into embeddings with a pre-trained encoder
class TextEmbedder(nn.Module):
    """
    Wrap a tokenizer and an encoder model so that a list of strings can be
    embedded with a single call.

    Parameters
    ----------
    bert_model : nn.Module
        Encoder whose output exposes `last_hidden_state`; it is moved to `device`.
    tokenizer : callable
        Called as `tokenizer(x, padding=True, truncation=True, return_tensors="pt")`;
        its result must support `.to(device)` and `**` unpacking.
    """

    def __init__(self, bert_model, tokenizer):
        super().__init__()
        self.tokenizer = tokenizer
        self.bert_model = bert_model.to(device)

    def forward(self, x):
        """Return the hidden state of the first token of every input sequence."""
        # Tokenize the batch (padded / truncated), then move the tensors to the device
        encoded = self.tokenizer(x, padding=True, truncation=True, return_tensors="pt")
        encoded = encoded.to(device)
        # Run the encoder and keep only position 0 of each sequence
        hidden_states = self.bert_model(**encoded).last_hidden_state
        return hidden_states[:, 0, :]

In [None]:
# Function to compute text embeddings using the TextEmbedder module
def compute_embeddings(texts, text_embedder, batch_size=64, title_desc=None):
    """
    Return one embedding row per text, loading a cached tensor when possible.

    Parameters
    ----------
    texts : sequence
        Items to embed; each one is converted with `str()` before embedding.
    text_embedder : callable
        Object with an `eval()` method that maps a list of strings to a tensor.
    batch_size : int
        Number of texts passed to `text_embedder` per call.
    title_desc : {'title', 'desc', None}
        Selects the cache file to try first ('../data/title_embeddings.pt' or
        '../data/desc_embeddings.pt'). With None (or any other value) no cache
        is consulted and the embeddings are always computed.

    Returns
    -------
    torch.Tensor
        The cached tensor, or the batch outputs concatenated along dim 0 (on CPU).
    """
    cache_files = {
        'title': '../data/title_embeddings.pt',
        'desc': '../data/desc_embeddings.pt',
    }
    cache_path = cache_files.get(title_desc)

    if cache_path is not None:
        try:
            # If precomputed embeddings are available, load them from disk
            return torch.load(cache_path)
        except Exception as exc:
            # Best-effort cache: any load failure falls back to recomputation.
            # `Exception` (not a bare except) keeps Ctrl-C / SystemExit working.
            print(f"Could not load cached embeddings from {cache_path} ({exc!r}); computing them instead.")

    # No usable cache: compute the embeddings batch by batch
    all_embeddings = []
    text_embedder.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # Disable gradient computation for efficiency
        for i in tqdm(range(0, len(texts), batch_size)):
            batch_texts = [str(t) for t in texts[i:i+batch_size]]
            # Move each batch to CPU right away so results do not pile up on the device
            emb = text_embedder(batch_texts).detach().cpu()
            all_embeddings.append(emb)

    # Concatenate all batch embeddings into a single tensor
    return torch.cat(all_embeddings, dim=0)

In [None]:
# Load the 'FacebookAI/roberta-base' tokenizer and encoder from Hugging Face
# Transformers and wrap them in a TextEmbedder placed on the compute device
tokenizer = AutoTokenizer.from_pretrained('FacebookAI/roberta-base')
bert_model = AutoModel.from_pretrained('FacebookAI/roberta-base')
text_embedder = TextEmbedder(bert_model, tokenizer).to(device)

# Embed the title column (0) and then the description column (1) of X_text.
# compute_embeddings is given the matching cache tag for each column.
title_embeddings, desc_embeddings = (
    compute_embeddings(X_text[:, column], text_embedder, title_desc=cache_tag)
    for column, cache_tag in ((0, 'title'), (1, 'desc'))
)

print(f"Shape embeddings titles: {title_embeddings.shape}")
print(f"Shape embeddings descriptions: {desc_embeddings.shape}")

In [None]:
# Dataset that serves already-computed title/description embeddings with their labels
class PrecomputedDataset(Dataset):
    """
    Index-aligned view over title embeddings, description embeddings and labels.

    Each item is `((title_embedding, desc_embedding), label)`, where the label
    is taken from a `torch.long` tensor built from `labels`.
    """

    def __init__(self, title_embeddings, desc_embeddings, labels):
        # The embeddings are stored as given; only the labels are converted
        self.title_embeddings, self.desc_embeddings = title_embeddings, desc_embeddings
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        # The dataset size is the number of labels
        return len(self.labels)

    def __getitem__(self, idx):
        # Build the (features, target) pair for position `idx`
        features = (self.title_embeddings[idx], self.desc_embeddings[idx])
        target = self.labels[idx]
        return features, target

In [None]:
# Define a PyTorch module for the classification model
class ClassifierModel(nn.Module):
    """
    Small MLP head that classifies a (title embedding, description embedding) pair.

    Parameters
    ----------
    dropout_p : float
        Dropout probability applied after the activation.
    hidden : int
        Width of the hidden layer.
    embed_dim : int
        Size of ONE input embedding; the first linear layer takes `2 * embed_dim`
        features. Defaults to 768, the width previously hard-coded.
    """

    def __init__(self, dropout_p=0.3, hidden=512, embed_dim=768):
        super().__init__()

        # Define the architecture of the classification model
        self.net = nn.Sequential(
            nn.Linear(embed_dim * 2, hidden),  # Fully connected layer to reduce dimensionality
            nn.LayerNorm(hidden),  # Layer normalization for stable training
            nn.GELU(),  # Activation function (Gaussian Error Linear Unit)
            nn.Dropout(dropout_p),  # Dropout for regularization
            nn.Linear(hidden, 1)  # Output layer: one logit for binary classification
        )

    def forward(self, data):
        """
        Parameters
        ----------
        data : pair of tensors
            `(title_embedding, desc_embedding)`, concatenated along dim 1.

        Returns
        -------
        torch.Tensor
            Raw logits with a trailing dimension of size 1 (no sigmoid applied).
        """
        title_embedding, desc_embedding = data  # Unpack title and description embeddings

        # Move the input to wherever this model's weights live, instead of relying
        # on a notebook-global `device` that may not match the model's placement.
        target_device = self.net[0].weight.device
        h_txt = torch.cat([title_embedding, desc_embedding], dim=1).to(target_device)

        # Pass the concatenated embeddings through the network
        return self.net(h_txt)

In [None]:
# Function to plot a confusion matrix as a heatmap
def plot_confusion_matrix_image(y_true, y_pred, labels):
    """
    Draw the confusion matrix of `y_pred` against `y_true` and return the figure.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted class labels.
    labels : list
        Class labels; used both to order the matrix rows/columns and as tick labels.

    Returns
    -------
    matplotlib figure
        The caller is responsible for closing it (e.g. `plt.close(fig)`).
    """
    # Passing `labels` pins the matrix to len(labels) x len(labels) even when a class
    # is missing from both inputs, so it always lines up with the tick labels.
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    fig, ax = plt.subplots(figsize=(8, 6))
    # Draw on the axes created above (not the implicit current axes) and show the
    # cell counts as integers.
    sns.heatmap(cm, annot=True, fmt="d", cmap="BuGn", xticklabels=labels, yticklabels=labels, cbar=True, ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title("Confusion Matrix")
    return fig

# Function to evaluate the model on the test set and log metrics to Weights & Biases
def test_model_wandb(model, test_loader):
    model.eval()  # Set the model to evaluation mode
    criterion = nn.BCEWithLogitsLoss()  # Binary cross-entropy loss with logits
    total_loss = 0
    all_probs, all_targets = [], []

    for x_batch, y_batch in test_loader:
        y_batch = y_batch.to(device).float().unsqueeze(1)  # Move labels to the device and reshape
        y_logits = model(x_batch)  # Get model predictions (logits)
        loss = criterion(y_logits, y_batch)  # Compute the loss
        total_loss += loss.item()  # Accumulate the total loss
        probs = torch.sigmoid(y_logits).detach().cpu().squeeze(1)  # Apply sigmoid to get probabilities
        all_probs.append(probs)  # Collect probabilities
        all_targets.append(y_batch.detach().cpu().squeeze(1).long())  # Collect true labels

    all_probs = torch.cat(all_probs)  # Concatenate all probabilities
    all_targets = torch.cat(all_targets)  # Concatenate all true labels
    all_preds = (all_probs > 0.5).long()  # Convert probabilities to binary predictions

    fig = plot_confusion_matrix_image(all_targets.numpy(), all_preds.numpy(), [0, 1])  # create confusion matrix

    # Compute evaluation metrics
    metrics = {
        "test_loss": total_loss / len(test_loader),
        "test_accuracy": accuracy_score(all_targets, all_preds),
        "test_balanced_accuracy": balanced_accuracy_score(all_targets, all_preds),
        "test_f1": f1_score(all_targets, all_preds),
        "test_precision": precision_score(all_targets, all_preds),
        "test_recall": recall_score(all_targets, all_preds),
        "test_roc_auc": roc_auc_score(all_targets, all_probs),
        "test_confusion_matrix": wandb.Image(fig),
    }

    wandb.log(metrics)  # Log metrics to Weights & Biases
    plt.close(fig)  # Close the confusion matrix plot
    return metrics

from transformers import get_linear_schedule_with_warmup

def train_single_fold(config, fold, train_indices, val_indices, title_embeddings, desc_embeddings, y, project_name):
    """
    Train and validate the classifier on one cross-validation fold inside its own W&B run.

    Parameters
    ----------
    config : dict
        Must provide 'learning_rate', 'weight_decay', 'dropout' and 'num_epochs'.
    fold : int
        Fold identifier; used for the run name (`fold_<fold>`) and stored in the result.
    train_indices, val_indices : array-like
        Row selectors applied to `title_embeddings`, `desc_embeddings` and `y`.
    title_embeddings, desc_embeddings : indexable
        Precomputed embeddings, indexed with the fold indices.
    y : indexable
        Binary labels, indexed with the fold indices.
    project_name : str
        Weights & Biases project that receives the run.

    Returns
    -------
    dict
        Validation metrics of the epoch with the highest validation F1 (the first
        epoch wins ties), plus 'best_epoch' (0-based) and 'fold'.

    Raises
    ------
    ValueError
        If `config['num_epochs']` is smaller than 1 (no epoch could be selected).
    """
    if config['num_epochs'] < 1:
        raise ValueError("config['num_epochs'] must be at least 1")

    # Name this training run according to the fold
    run_name = f"fold_{fold}"

    # Start a new Weights & Biases run for experiment tracking
    with wandb.init(config=config, project=project_name, name=run_name, save_code=True):

        # Create training dataset from precomputed embeddings
        train_dataset = PrecomputedDataset(
            title_embeddings[train_indices],
            desc_embeddings[train_indices],
            y[train_indices]
        )

        # Create validation dataset from precomputed embeddings
        val_dataset = PrecomputedDataset(
            title_embeddings[val_indices],
            desc_embeddings[val_indices],
            y[val_indices]
        )

        # Wrap datasets in PyTorch DataLoader for batching
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
        val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)

        # Log the fold number in Weights & Biases
        wandb.log({"fold": fold})

        # Compute class imbalance ratio (negatives per positive) to weight the positive class
        num_pos = (y[train_indices] == 1).sum()
        num_neg = (y[train_indices] == 0).sum()
        pos_weight = torch.tensor([num_neg / max(num_pos, 1)], dtype=torch.float).to(device)

        # Initialize model, optimizer, and loss function
        model = ClassifierModel(dropout_p=config['dropout']).to(device)
        optimizer = optim.AdamW(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])
        criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)  # Weighted loss for imbalanced classes

        # Linear learning-rate decay over all optimizer steps (no warmup steps)
        total_steps = len(train_loader) * config['num_epochs']
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

        print(f"Training fold {fold}...")

        # Track the best validation F1 score. `best_metrics` starts as None so the
        # first epoch is always recorded, even when its F1 is exactly 0.0; otherwise
        # a fold that never predicts a positive would finish without any result.
        best_f1 = 0.0
        best_epoch = 0
        best_metrics = None

        # Training loop across epochs
        for epoch in range(config['num_epochs']):
            model.train()  # Put model in training mode
            total_loss = 0
            all_probs, all_targets = [], []

            # Iterate through training batches
            for x_batch, y_batch in tqdm(train_loader, desc=f"Fold {fold}, Epoch {epoch+1}"):
                optimizer.zero_grad()  # Reset gradients

                # Prepare labels and run forward pass
                y_batch_float = y_batch.to(device).float().unsqueeze(1)
                y_logits = model(x_batch)

                # Compute weighted loss
                loss = criterion(y_logits, y_batch_float)
                total_loss += loss.item()

                # Backpropagation and optimizer update; the scheduler advances once per batch
                loss.backward()
                optimizer.step()
                scheduler.step()

                # Collect predictions and true labels for metrics
                probs = torch.sigmoid(y_logits).detach().cpu().squeeze(1)
                all_probs.append(probs)
                all_targets.append(y_batch.detach().cpu().long())

            # Concatenate predictions and labels across all batches
            all_probs = torch.cat(all_probs)
            all_targets = torch.cat(all_targets)
            all_preds = (all_probs > 0.5).long()  # Convert probs to binary predictions

            # Plot and log confusion matrix for training set
            fig = plot_confusion_matrix_image(all_targets.numpy(), all_preds.numpy(), [0, 1])

            # Compute and log training metrics (collected in train mode, i.e. with dropout active)
            train_metrics = {
                "epoch": epoch,
                "train_loss": total_loss / len(train_loader),
                "train_accuracy": accuracy_score(all_targets, all_preds),
                "train_balanced_accuracy": balanced_accuracy_score(all_targets, all_preds),
                "train_f1": f1_score(all_targets, all_preds, zero_division=0),
                "train_precision": precision_score(all_targets, all_preds, zero_division=0),
                "train_recall": recall_score(all_targets, all_preds, zero_division=0),
                "train_roc_auc": roc_auc_score(all_targets, all_probs),
                "train_confusion_matrix": wandb.Image(fig),
                "learning_rate": optimizer.param_groups[0]['lr']
            }

            wandb.log(train_metrics)
            plt.close(fig)

            # Evaluate on validation set and log metrics
            val_metrics = test_model_wandb(model, val_loader)

            # Update "best" metrics on the first epoch and whenever F1 improves
            current_f1 = val_metrics["test_f1"]
            if best_metrics is None or current_f1 > best_f1:
                best_f1 = current_f1
                best_epoch = epoch
                best_metrics = val_metrics.copy()
                best_metrics["best_epoch"] = best_epoch

                # Prefix the validation keys with "best_" for logging. The epoch is added
                # afterwards so it is logged as "best_epoch" rather than "best_best_epoch".
                best_log_metrics = {f"best_{key}": value for key, value in val_metrics.items() if key != "test_confusion_matrix"}
                best_log_metrics["best_epoch"] = best_epoch
                wandb.log(best_log_metrics)

                print(f"New best validation f1: {best_f1:.4f} at epoch {epoch + 1}")

        # Attach fold info to final best metrics
        best_metrics["fold"] = fold
        print(f"Fold {fold} completed. Best f1: {best_f1:.4f} at epoch {best_epoch + 1}")

        return best_metrics


In [None]:
def run_cross_validation_sweep(config=None):
    """
    Train the classifier with 10-fold stratified cross-validation and summarise the results.

    The folds are drawn from the notebook-level `title_embeddings` and `y`; each
    fold is trained by `train_single_fold`, which also receives `desc_embeddings`.

    Args:
        config (dict, optional): Hyperparameters forwarded to every fold:
            - learning_rate: Learning rate for the optimizer.
            - weight_decay: Weight decay for the optimizer.
            - dropout: Dropout probability for the model.
            - num_epochs: Number of training epochs.
            When omitted, a built-in default set is used.

    Returns:
        list: One dictionary of best metrics per fold, in fold order.
    """
    # Fall back to the default hyperparameters when the caller gave none
    if config is None:
        config = dict(learning_rate=1e-4, weight_decay=0.01, dropout=0.1, num_epochs=50)

    # Weights & Biases project that collects the per-fold runs
    project_name = 'PROJECT_NAME'

    # Stratified, shuffled 10-fold splitter with a fixed seed
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    # Train fold by fold (numbered from 1) and keep each fold's best metrics
    all_best_metrics = []
    for fold_number, (train_indices, val_indices) in enumerate(skf.split(title_embeddings, y), start=1):
        print(f"\n=== FOLD {fold_number}/10 ===")
        all_best_metrics.append(
            train_single_fold(
                config, fold_number, train_indices, val_indices,
                title_embeddings, desc_embeddings, y, project_name
            )
        )

    # Mean and standard deviation of every summary metric across folds
    print(f"\n=== CROSS-VALIDATION SUMMARY (BEST METRICS) ===")
    for metric_name in ('test_accuracy', 'test_balanced_accuracy', 'test_f1',
                        'test_precision', 'test_recall', 'test_roc_auc'):
        per_fold = [fold_metrics[metric_name] for fold_metrics in all_best_metrics]
        print(f"Best {metric_name}: {np.mean(per_fold):.4f} ± {np.std(per_fold):.4f}")

    # Which epoch was selected in each fold (displayed 1-based)
    print(f"\nBest epochs for each fold:")
    for position, fold_metrics in enumerate(all_best_metrics, start=1):
        print(f"Fold {position}: Epoch {fold_metrics['best_epoch'] + 1} (Balanced Accuracy: {fold_metrics['test_balanced_accuracy']:.4f})")

    return all_best_metrics

In [None]:
# Hyperparameters for this experiment
config = dict(
    learning_rate=1e-4,
    weight_decay=0.01,
    dropout=0.1,
    num_epochs=50,
)

# Launch cross-validation and keep the per-fold best metrics
all_fold_results = run_cross_validation_sweep(config=config)

# Baselines

In [None]:
# BASELINE MODELS
# These traditional ML models act as reference points to contextualize
# the performance of the neural model trained on the same BERT embeddings.
# - DummyClassifier: Simple heuristics (most frequent / stratified) to establish
#   the minimum reasonable performance (chance or majority class).
# - LogisticRegression: Strong linear baseline that often performs well with
#   high-dimensional dense embeddings.
# - RandomForestClassifier: Non-linear ensemble baseline capturing feature
#   interactions without heavy tuning.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

In [None]:
def prepare_baseline_features(df_full, train_indices, val_indices, title_embeddings, desc_embeddings):
    """
    Build the (train, validation) feature matrices for the baseline models by
    concatenating each sample's title embedding with its description embedding.

    Parameters
    ----------
    df_full : pd.DataFrame
        Accepted for interface compatibility; it is not read by this function.
    train_indices, val_indices : array-like
        Positional row selectors for the current split (converted with `np.asarray`).
    title_embeddings, desc_embeddings : torch.Tensor
        Precomputed embeddings with one row per sample, in the same row order.

    Returns
    -------
    train_features : np.ndarray
        Selected training rows, title columns followed by description columns.
    val_features : np.ndarray
        Same layout for the validation rows.
    """
    def _rows_as_numpy(embeddings, indices):
        # detach() + cpu() make the conversion valid for tensors that track
        # gradients or live on an accelerator; a bare .numpy() raises for both.
        return embeddings[np.asarray(indices)].detach().cpu().numpy()

    train_title_emb = _rows_as_numpy(title_embeddings, train_indices)
    val_title_emb = _rows_as_numpy(title_embeddings, val_indices)

    train_desc_emb = _rows_as_numpy(desc_embeddings, train_indices)
    val_desc_emb = _rows_as_numpy(desc_embeddings, val_indices)

    # Horizontal stack -> concatenate along the feature dimension
    train_features = np.hstack([train_title_emb, train_desc_emb])
    val_features = np.hstack([val_title_emb, val_desc_emb])

    return train_features, val_features

In [None]:
def evaluate_baseline_model(model, X_train, y_train, X_val, y_val, model_name):
    """
    Fit a baseline model and compute standard classification metrics.

    Steps
    -----
    1. Fit the model on (X_train, y_train).
    2. Predict class labels on the validation set.
    3. If available, obtain predicted probabilities (for ROC AUC).
    4. Compute a consistent metric set for later aggregation.

    Returns
    -------
    dict
        Keys: 'model', 'accuracy', 'balanced_accuracy', 'f1', 'precision',
        'recall', 'roc_auc'. 'roc_auc' is None when it could not be computed.

    Notes
    -----
    - zero_division=0 is passed to F1, precision and recall alike, so an undefined
      score (e.g. a naive baseline that never predicts the positive class) is
      reported as 0 without a warning.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    # Probability scores (if supported) for ROC AUC; else None.
    # Deliberately best-effort: any failure here only disables the ROC AUC metric.
    try:
        y_proba = model.predict_proba(X_val)[:, 1]
        roc_auc = roc_auc_score(y_val, y_proba)
    except Exception:
        roc_auc = None

    metrics = {
        'model': model_name,
        'accuracy': accuracy_score(y_val, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_val, y_pred),
        'f1': f1_score(y_val, y_pred, zero_division=0),
        'precision': precision_score(y_val, y_pred, zero_division=0),
        'recall': recall_score(y_val, y_pred, zero_division=0),
        'roc_auc': roc_auc
    }

    return metrics

In [None]:
def run_baseline_cross_validation(df_full, y, title_embeddings, desc_embeddings, kfold = True):
    """
    Run either 10-fold stratified cross-validation or a single train/holdout split
    for the suite of baseline models, using the precomputed embeddings as features.

    Parameters
    ----------
    df_full : pd.DataFrame
        Full dataset. It is passed to the fold splitter and to
        `prepare_baseline_features`; its index labels are never used for row selection.
    y : array-like [N]
        Binary labels in the same row order as the embeddings.
    title_embeddings, desc_embeddings : torch.Tensor
        Precomputed embeddings, one row per sample.
    kfold : bool
        If True -> 10-fold StratifiedKFold; else single 80/20 stratified split.

    Returns
    -------
    results : dict(str -> list[dict])
        For each model name, a list of per-fold (or single split) metric dicts.
        Per-fold dicts additionally carry a 1-based 'fold' entry.

    Design Choices
    --------------
    - Stratified splitting preserves label proportions (important for class imbalance).
    - All splits are expressed as POSITIONAL indices, so labels and embeddings stay
      aligned even when `df_full` does not have a default 0..N-1 index.
    - Fixed seeds everywhere (including the stratified dummy) keep runs reproducible.
    """
    y = np.asarray(y)
    n_splits = 10

    # Define baseline estimators
    models = {
        'DummyClassifier (Most Frequent)': DummyClassifier(strategy='most_frequent'),
        # Seeded: this strategy draws random predictions
        'DummyClassifier (Stratified)': DummyClassifier(strategy='stratified', random_state=42),
        'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced'),
        'Random Forest': RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            class_weight='balanced',
            random_state=42
        )
    }

    # Store list of per-fold metrics for each model
    results = {model_name: [] for model_name in models}

    def _evaluate_split(train_indices, val_indices, fold=None):
        """Fit and score every baseline on one split, appending the metrics to `results`."""
        X_train, X_val = prepare_baseline_features(df_full, train_indices, val_indices,
                                                   title_embeddings, desc_embeddings)
        y_train, y_val = y[train_indices], y[val_indices]

        indent = "  " if fold is not None else ""
        print(f"{indent}Features shape: {X_train.shape[1]} (768 title + 768 desc)")

        for model_name, model in models.items():
            metrics = evaluate_baseline_model(model, X_train, y_train, X_val, y_val, model_name)
            if fold is not None:
                metrics['fold'] = fold
            results[model_name].append(metrics)
            print(f"Model {model_name} done")

    if kfold:
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
        print("=== BASELINE MODELS CROSS-VALIDATION ===")
        print("Using BERT embeddings (same as the main model)")

        for fold, (train_indices, val_indices) in enumerate(skf.split(df_full, y), start=1):
            print(f"\nProcessing Fold {fold}/{n_splits}...")
            _evaluate_split(train_indices, val_indices, fold=fold)
    else:
        print("=== BASELINE MODELS TRAINING ===")
        print("Using BERT embeddings (same as the main model)")

        # Single stratified split (deterministic for reproducibility). Row POSITIONS
        # are split rather than the dataframe, because the dataframe's index labels
        # are not guaranteed to be valid positions into `y` and the embeddings.
        train_indices, val_indices = train_test_split(
            np.arange(len(y)), test_size=0.2, random_state=42, stratify=y
        )
        _evaluate_split(train_indices, val_indices)

    return results

In [None]:
def print_baseline_results(baseline_results):
    """
    Print a mean ± std summary of every metric for every baseline model.

    For each model the metric values are collected across its result dicts;
    entries that are None are ignored, and a metric with no remaining values
    is shown as "N/A".

    baseline_results structure:
        { model_name: [ { 'accuracy': ..., 'balanced_accuracy': ..., ... }, ... ] }
    """
    banner = "=" * 80

    # Header
    print("\n" + banner)
    print("BASELINE MODELS RESULTS SUMMARY")
    print(banner)

    for model_name, fold_results in baseline_results.items():
        # Model title, underlined with one dash more than its length
        print(f"\n{model_name}:")
        print("-" * (len(model_name) + 1))

        for metric_name in ('accuracy', 'balanced_accuracy', 'f1', 'precision', 'recall', 'roc_auc'):
            # Keep only the folds where this metric was actually computed
            observed = [entry[metric_name] for entry in fold_results
                        if entry[metric_name] is not None]

            if not observed:
                print(f"  {metric_name.replace('_', ' ').title()}: N/A")
                continue

            mean_val, std_val = np.mean(observed), np.std(observed)
            print(f"  {metric_name.replace('_', ' ').title()}: {mean_val:.4f} ± {std_val:.4f}")

    # Footer
    print("\n" + banner)

In [None]:
# Evaluate the baseline models with 10-fold CV on the same precomputed embeddings
print("Starting evaluation of baseline models...")
baseline_results = run_baseline_cross_validation(
    df_full=df,
    y=y,
    title_embeddings=title_embeddings,
    desc_embeddings=desc_embeddings,
    kfold=True,
)

# Print the mean ± std of every metric per model for quick comparison
print_baseline_results(baseline_results=baseline_results)