In [1]:
# IMPORT
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix, balanced_accuracy_score

from tqdm.notebook import tqdm
from torch.utils.data import Dataset

import wandb  # Weights & Biases for experiment tracking and visualization
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoModel, AutoTokenizer

In [None]:
# Log in to Weights & Biases for experiment tracking.
# No key is passed on purpose: wandb.login() resolves the credential from the
# WANDB_API_KEY environment variable or the local netrc file (and prompts
# interactively otherwise), so the secret never has to be stored in the notebook.
wandb.login()

# Preprocessing Data
df = pd.read_csv('../data/evons.csv')

# Fill missing values in the 'title' and 'description' columns with empty strings
df['title'] = df['title'].fillna('')
df['description'] = df['description'].fillna('')
# Encode the media source as integer category codes
# (pandas assigns the code -1 to missing values)
df['media_source'] = df['media_source'].astype('category').cat.codes

# An article is labelled viral when its Facebook engagement is strictly above
# the 95th percentile computed over the whole dataset
engagement_threshold = df['fb_engagements'].quantile(0.95)
df['is_viral'] = (df['fb_engagements'] > engagement_threshold).astype(int)


# Select relevant columns for the task
df = df[['title', 'description', 'media_source','fb_engagements', 'is_viral']]

# Features: title, description and encoded media source (object array, one row per article)
X_text = df[['title','description','media_source']].values
y = df['is_viral'].values  # Labels: is_viral (binary classification)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
# Pick the compute device once: CUDA when a GPU is visible, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


class TextEmbedder(nn.Module):
    """Pair a pre-trained encoder with its tokenizer to turn raw strings into one vector each."""

    def __init__(self, bert_model, tokenizer):
        super().__init__()
        self.tokenizer = tokenizer
        # The encoder is moved to the notebook-wide device as soon as it is wrapped
        self.bert_model = bert_model.to(device)

    def forward(self, x):
        """Tokenize ``x`` and return the hidden state at sequence position 0 for every input."""
        # Pad/truncate the batch, get PyTorch tensors, and place them on the same device as the encoder
        encoded = self.tokenizer(x, padding=True, truncation=True, return_tensors="pt")
        encoded = encoded.to(device)
        # Run the encoder and keep only the first-token representation of each sequence
        hidden_states = self.bert_model(**encoded).last_hidden_state
        first_token = hidden_states[:, 0, :]
        return first_token

In [None]:
_lazy_text_embedder = None  # cache


def _load_cached_embeddings(title_desc, expected_rows):
    """Return the cached embedding tensor for `title_desc`, or None when no usable cache exists."""
    if title_desc == 'title':
        cache_path = '../data/title_embeddings.pt'
    elif title_desc == 'desc':
        cache_path = '../data/desc_embeddings.pt'
    else:
        return None

    try:
        # map_location='cpu' keeps the cached path consistent with the computed
        # path (which returns CPU tensors) and lets a cache written on a GPU
        # machine load on a CPU-only one.
        # NOTE(review): torch.load unpickles the file; only load caches you created yourself.
        cached = torch.load(cache_path, map_location='cpu')
    except FileNotFoundError:
        # No cache yet: the expected situation on a first run
        return None
    except Exception as exc:
        # Still best-effort, but no longer silent: say why the cache was ignored
        print(f"Could not load cached embeddings from {cache_path} ({exc!r}); recomputing.")
        return None

    # Embeddings are matched to rows by position, so a cache with a different
    # number of rows than the requested texts must not be used.
    if not isinstance(cached, torch.Tensor) or cached.dim() < 1 or cached.shape[0] != expected_rows:
        print(f"Cached embeddings in {cache_path} do not match the {expected_rows} input texts; recomputing.")
        return None

    return cached


def compute_embeddings(texts, batch_size=64, title_desc=None):
    """
    Return one embedding row per entry of `texts`.

    Args:
        texts: Sequence of items to embed; each item is converted with `str()`.
        batch_size: Number of texts sent through the encoder at once.
        title_desc: 'title' or 'desc' to try the matching precomputed file under
            '../data/' first; any other value skips the cache lookup.

    Returns:
        torch.Tensor: A CPU tensor with one row per text. It comes from the
        cache file when that file loads and has exactly `len(texts)` rows;
        otherwise it is computed with the lazily created 'FacebookAI/roberta-base'
        embedder.
    """
    global _lazy_text_embedder

    # Attempt to load precomputed embeddings first (no model download if present)
    cached = _load_cached_embeddings(title_desc, len(texts))
    if cached is not None:
        return cached

    # Lazy initialization of model ONLY now
    if _lazy_text_embedder is None:
        tokenizer = AutoTokenizer.from_pretrained('FacebookAI/roberta-base')
        bert_model = AutoModel.from_pretrained('FacebookAI/roberta-base')
        _lazy_text_embedder = TextEmbedder(bert_model, tokenizer).to(device)

    text_embedder = _lazy_text_embedder
    all_embeddings = []

    # Inference only: eval mode and no gradient tracking
    text_embedder.eval()
    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size)):
            batch_texts = [str(t) for t in texts[i:i+batch_size]]
            # Move every batch to the CPU right away so the full result never sits on the GPU
            emb = text_embedder(batch_texts).detach().cpu()
            all_embeddings.append(emb)

    if not all_embeddings:
        # torch.cat rejects an empty list; return an empty (0, hidden_size) matrix instead
        return torch.empty((0, text_embedder.bert_model.config.hidden_size))

    return torch.cat(all_embeddings, dim=0)

In [None]:
# Compute embeddings for titles and descriptions in the dataset
# If precomputed embeddings are available, they will be loaded
# Otherwise, embeddings will be computed on-the-fly
title_embeddings = compute_embeddings(X_text[:, 0], title_desc='title')
desc_embeddings = compute_embeddings(X_text[:, 1], title_desc='desc')
sources = X_text[:, 2]

# Embeddings are matched to labels purely by row position, so a mismatch in row
# counts (e.g. from a stale cache file) must stop the run here instead of
# silently training on misaligned features.
assert title_embeddings.shape[0] == desc_embeddings.shape[0] == len(y), (
    f"Row mismatch: titles={title_embeddings.shape[0]}, "
    f"descriptions={desc_embeddings.shape[0]}, labels={len(y)}"
)

print(f"Shape embeddings titles: {title_embeddings.shape}")
print(f"Shape embeddings descriptions: {desc_embeddings.shape}")

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Shape embeddings titles: torch.Size([92969, 768])
Shape embeddings descriptions: torch.Size([92969, 768])


In [24]:
class PrecomputedDataset(Dataset):
    """Map-style dataset serving precomputed text embeddings together with source ids and labels."""

    def __init__(self, title_embeddings, desc_embeddings, sources, labels):
        # The embedding containers are stored exactly as they were passed in
        self.title_embeddings = title_embeddings
        self.desc_embeddings = desc_embeddings
        # Source ids are cast to integers first, then stored as an int64 tensor
        source_ids = sources.astype(int)
        self.sources = torch.tensor(source_ids, dtype=torch.long)
        # Labels are stored as an int64 tensor as well
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        # The label tensor defines how many samples exist
        return len(self.labels)

    def __getitem__(self, idx):
        # One sample is ((title embedding, description embedding, source id), label)
        features = (
            self.title_embeddings[idx],
            self.desc_embeddings[idx],
            self.sources[idx],
        )
        target = self.labels[idx]
        return features, target

In [25]:
class ClassifierModel(nn.Module):
    """
    Binary classifier over a title embedding, a description embedding and a source id.

    Each text embedding is projected to `hidden_dim`, the source id is looked up
    in a learned embedding table of the same width, the three vectors are
    concatenated, passed through dropout, and mapped to a single logit.

    Args:
        dropout_p: Dropout probability applied to the concatenated features.
        num_sources: Number of distinct source ids the embedding table accepts.
        embedding_dim: Width of the incoming title/description embeddings.
        hidden_dim: Width of each of the three projected feature vectors.
    """

    def __init__(self, dropout_p=0.3, num_sources=11, embedding_dim=768, hidden_dim=192):
        super().__init__()

        # Linear layer to reduce dimensionality of title embeddings (embedding_dim → hidden_dim)
        self.reduce_title = nn.Linear(embedding_dim, hidden_dim)

        # Linear layer to reduce dimensionality of description embeddings (embedding_dim → hidden_dim)
        self.reduce_desc = nn.Linear(embedding_dim, hidden_dim)

        # Embedding layer mapping each categorical "source" ID to a hidden_dim vector
        self.source_embedding = nn.Embedding(num_sources, hidden_dim)

        # Final linear layer for binary classification (input: 3 concatenated hidden_dim vectors, output: 1 logit)
        self.linear_1 = nn.Linear(hidden_dim * 3, 1)

        # Activation function
        self.gelu = nn.GELU()

        # Dropout layer to prevent overfitting
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, data):
        """
        Compute raw logits (before sigmoid) for a batch.

        Args:
            data: Tuple `(title_embedding, desc_embedding, source)`; the two
                embeddings are fed to the linear projections and `source`
                holds integer ids for the embedding table.

        Returns:
            torch.Tensor: One logit per sample, shape `(batch, 1)`.
        """
        # Unpack inputs (title embeddings, description embeddings, and categorical source IDs)
        title_embedding, desc_embedding, source = data

        # Follow the device the parameters actually live on instead of relying on a global
        target_device = self.linear_1.weight.device

        # Reduce and activate title embedding
        embed_title = self.gelu(self.reduce_title(title_embedding.to(target_device)))

        # Reduce and activate description embedding
        embed_desc = self.gelu(self.reduce_desc(desc_embedding.to(target_device)))

        # Embed categorical source ID and apply activation
        embed_source = self.gelu(self.source_embedding(source.to(target_device)))

        # Concatenate reduced title, description, and source embeddings into a single feature vector
        x = torch.cat([embed_title, embed_desc, embed_source], dim=1)

        # Regularize the joint representation; this is a no-op in eval mode.
        # Without this call the configured dropout probability had no effect.
        x = self.dropout(x)

        # Apply final classification layer → raw logits (before sigmoid)
        logits = self.linear_1(x)

        return logits

In [26]:
# Function to plot a confusion matrix as a heatmap
def plot_confusion_matrix_image(y_true, y_pred, labels):
    """
    Draw the confusion matrix of `y_pred` against `y_true` as an annotated heatmap.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels.
        labels: Class labels, in display order. They fix both the rows/columns
            of the matrix and the tick labels, so the two always agree even
            when a class is absent from the data.

    Returns:
        matplotlib.figure.Figure: The figure holding the heatmap. The caller is
        responsible for closing it.
    """
    # Passing `labels` pins the matrix to len(labels) x len(labels); without it
    # the matrix shrinks when a class never occurs and no longer matches the ticks.
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    fig, ax = plt.subplots(figsize=(8, 6))
    # Cells hold integer counts, hence the integer format; draw on our own axes explicitly
    sns.heatmap(cm, annot=True, fmt="d", cmap="BuGn", xticklabels=labels, yticklabels=labels, cbar=True, ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title("Confusion Matrix")
    return fig

# Function to evaluate the model on the test set and log metrics to Weights & Biases
def test_model_wandb(model, test_loader):
    """
    Evaluate `model` on every batch of `test_loader` and log the results to wandb.

    Args:
        model: Module returning one logit per sample, shape `(batch, 1)`.
        test_loader: Iterable of `(x_batch, y_batch)` pairs with binary labels.

    Returns:
        dict: Metrics keyed "test_loss", "test_accuracy", "test_balanced_accuracy",
        "test_f1", "test_precision", "test_recall", "test_roc_auc" and
        "test_confusion_matrix" (a `wandb.Image`). The same dict is passed to
        `wandb.log`.

    Note:
        The model is left in evaluation mode; callers that keep training must
        call `model.train()` again.
    """
    model.eval()  # Set the model to evaluation mode
    criterion = nn.BCEWithLogitsLoss()  # Binary cross-entropy loss with logits (unweighted)
    total_loss = 0
    all_probs, all_targets = [], []

    # Evaluation needs no gradients: skipping autograd avoids building a graph per batch
    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            y_logits = model(x_batch)  # Get model predictions (logits)
            # Put the labels on the same device as the logits and reshape to (batch, 1)
            y_batch = y_batch.to(y_logits.device).float().unsqueeze(1)
            loss = criterion(y_logits, y_batch)  # Compute the loss
            total_loss += loss.item()  # Accumulate the total loss
            probs = torch.sigmoid(y_logits).detach().cpu().squeeze(1)  # Apply sigmoid to get probabilities
            all_probs.append(probs)  # Collect probabilities
            all_targets.append(y_batch.detach().cpu().squeeze(1).long())  # Collect true labels

    all_probs = torch.cat(all_probs)  # Concatenate all probabilities
    all_targets = torch.cat(all_targets)  # Concatenate all true labels
    all_preds = (all_probs > 0.5).long()  # Convert probabilities to binary predictions

    fig = plot_confusion_matrix_image(all_targets.numpy(), all_preds.numpy(), [0, 1])  # create confusion matrix

    # Compute evaluation metrics (the loss is averaged over batches, not samples)
    metrics = {
        "test_loss": total_loss / len(test_loader),
        "test_accuracy": accuracy_score(all_targets, all_preds),
        "test_balanced_accuracy": balanced_accuracy_score(all_targets, all_preds),
        "test_f1": f1_score(all_targets, all_preds),
        "test_precision": precision_score(all_targets, all_preds),
        "test_recall": recall_score(all_targets, all_preds),
        "test_roc_auc": roc_auc_score(all_targets, all_probs),
        "test_confusion_matrix": wandb.Image(fig),
    }

    wandb.log(metrics)  # Log metrics to Weights & Biases
    plt.close(fig)  # Close the confusion matrix plot
    return metrics

from transformers import get_linear_schedule_with_warmup

def train_single_fold(config, fold, train_indices, val_indices, title_embeddings, desc_embeddings,sources, y, project_name):
    """
    Train a fresh ClassifierModel on one cross-validation fold inside its own wandb run.

    Args:
        config: Dict with 'learning_rate', 'weight_decay', 'dropout' and 'num_epochs'.
        fold: Fold number, used for the run name and attached to the result.
        train_indices: Row indices of the training split.
        val_indices: Row indices of the validation split.
        title_embeddings: Title embeddings, indexable by the index arrays.
        desc_embeddings: Description embeddings, indexable by the index arrays.
        sources: Array of source ids, indexable by the index arrays.
        y: Array of binary labels, indexable by the index arrays.
        project_name: wandb project the run is logged to.

    Returns:
        dict: Validation metrics (keys "test_*") of the epoch with the highest
        validation F1, plus "best_epoch" (0-based) and "fold".

    Raises:
        ValueError: If `config['num_epochs']` is smaller than 1, because no
            epoch would produce metrics to return.
    """
    if config['num_epochs'] < 1:
        raise ValueError("config['num_epochs'] must be at least 1")

    # Name this training run according to the fold
    run_name = f"fold_{fold}"

    # Start a new Weights & Biases run for experiment tracking
    with wandb.init(config=config, project=project_name, name=run_name, save_code=True):

        # Create training dataset from precomputed embeddings
        train_dataset = PrecomputedDataset(
            title_embeddings[train_indices],
            desc_embeddings[train_indices],
            sources[train_indices],
            y[train_indices]
        )

        # Create validation dataset from precomputed embeddings
        val_dataset = PrecomputedDataset(
            title_embeddings[val_indices],
            desc_embeddings[val_indices],
            sources[val_indices],
            y[val_indices]
        )

        # Wrap datasets in PyTorch DataLoader for batching
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
        val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)

        # Log the fold number in Weights & Biases
        wandb.log({"fold": fold})

        # Compute class imbalance ratio to adjust loss function:
        # positives are up-weighted by (#negatives / #positives); max(..., 1) guards a fold without positives
        num_pos = (y[train_indices] == 1).sum()
        num_neg = (y[train_indices] == 0).sum()
        pos_weight = torch.tensor([num_neg / max(num_pos, 1)], dtype=torch.float).to(device)

        # Initialize model, optimizer, and loss function
        model = ClassifierModel(dropout_p=config['dropout']).to(device)
        optimizer = optim.AdamW(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])
        criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)  # Weighted loss for imbalanced classes

        # Create a learning rate scheduler with linear decay (no warmup steps), stepped once per batch
        total_steps = len(train_loader) * config['num_epochs']
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

        print(f"Training fold {fold}...")

        # Track the best validation F1 score.
        # best_metrics starts as None so the first epoch is always recorded, even
        # when its F1 is 0.0; previously a fold whose F1 never rose above 0.0
        # left best_metrics unassigned and crashed after the loop.
        best_f1 = 0.0
        best_epoch = 0
        best_metrics = None

        # Training loop across epochs
        for epoch in range(config['num_epochs']):
            model.train()  # Put model in training mode (validation below switches it to eval)
            total_loss = 0
            all_probs, all_targets = [], []

            # Iterate through training batches
            for x_batch, y_batch in tqdm(train_loader, desc=f"Fold {fold}, Epoch {epoch+1}"):
                optimizer.zero_grad()  # Reset gradients

                # Prepare labels and run forward pass
                y_batch_float = y_batch.to(device).float().unsqueeze(1)
                y_logits = model(x_batch)

                # Compute weighted loss
                loss = criterion(y_logits, y_batch_float)
                total_loss += loss.item()

                # Backpropagation and optimizer update
                loss.backward()
                optimizer.step()
                scheduler.step()

                # Collect predictions and true labels for metrics
                probs = torch.sigmoid(y_logits).detach().cpu().squeeze(1)
                all_probs.append(probs)
                all_targets.append(y_batch.detach().cpu().long())

            # Concatenate predictions and labels across all batches
            all_probs = torch.cat(all_probs)
            all_targets = torch.cat(all_targets)
            all_preds = (all_probs > 0.5).long()  # Convert probs to binary predictions

            # Plot and log confusion matrix for training set
            fig = plot_confusion_matrix_image(all_targets.numpy(), all_preds.numpy(), [0, 1])

            # Compute and log training metrics
            train_metrics = {
                "epoch": epoch,
                "train_loss": total_loss / len(train_loader),
                "train_accuracy": accuracy_score(all_targets, all_preds),
                "train_balanced_accuracy": balanced_accuracy_score(all_targets, all_preds),
                "train_f1": f1_score(all_targets, all_preds),
                "train_precision": precision_score(all_targets, all_preds),
                "train_recall": recall_score(all_targets, all_preds),
                "train_roc_auc": roc_auc_score(all_targets, all_probs),
                "train_confusion_matrix": wandb.Image(fig),
                "learning_rate": optimizer.param_groups[0]['lr']
            }

            wandb.log(train_metrics)
            plt.close(fig)

            # Evaluate on validation set and log metrics (keys are prefixed "test_")
            val_metrics = test_model_wandb(model, val_loader)

            # Update "best" model if F1 improved (or if nothing has been recorded yet)
            current_f1 = val_metrics["test_f1"]
            if best_metrics is None or current_f1 > best_f1:
                best_f1 = current_f1
                best_epoch = epoch
                best_metrics = val_metrics.copy()
                best_metrics["best_epoch"] = best_epoch

                # Prefix keys with "best_" for logging
                best_log_metrics = {f"best_{key}": value for key, value in best_metrics.items() if key != "test_confusion_matrix"}
                wandb.log(best_log_metrics)

                print(f"New best validation f1: {best_f1:.4f} at epoch {epoch + 1}")

        # Attach fold info to final best metrics
        best_metrics["fold"] = fold
        print(f"Fold {fold} completed. Best f1: {best_f1:.4f} at epoch {best_epoch + 1}")

        return best_metrics


In [None]:
def run_cross_validation_sweep(config=None):
    """
    Run stratified 10-fold cross-validation and summarise the best metrics of every fold.

    The embeddings, source ids and labels are read from the notebook-level
    variables `title_embeddings`, `desc_embeddings`, `sources` and `y`.

    Args:
        config (dict, optional): Hyperparameters for each fold:
            - learning_rate: Learning rate for the optimizer.
            - weight_decay: Weight decay for the optimizer.
            - dropout: Dropout probability for the model.
            - num_epochs: Number of training epochs.
            A built-in default is used when this is None.

    Returns:
        list: One dictionary of best metrics per fold, in fold order.
    """
    # Fall back to the default hyperparameters when the caller supplies none
    if config is None:
        config = dict(learning_rate=1e-4, weight_decay=0.01, dropout=0.1, num_epochs=50)

    # wandb project that receives one run per fold
    project_name = 'PROJECT_NAME'

    # Shuffled, seeded, label-stratified splitter with 10 folds
    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    # Train fold after fold, keeping each fold's best metrics (folds are numbered from 1)
    all_best_metrics = []
    for fold_number, (train_indices, val_indices) in enumerate(splitter.split(title_embeddings, y), start=1):
        print(f"\n=== FOLD {fold_number}/10 ===")
        all_best_metrics.append(
            train_single_fold(
                config, fold_number, train_indices, val_indices,
                title_embeddings, desc_embeddings, sources, y, project_name
            )
        )

    # Mean and standard deviation of every reported metric across the folds
    print(f"\n=== CROSS-VALIDATION SUMMARY (BEST METRICS) ===")
    summary_metrics = ('test_accuracy', 'test_balanced_accuracy', 'test_f1', 'test_precision', 'test_recall', 'test_roc_auc')
    for metric_name in summary_metrics:
        per_fold = [result[metric_name] for result in all_best_metrics]
        print(f"Best {metric_name}: {np.mean(per_fold):.4f} ± {np.std(per_fold):.4f}")

    # Which epoch won in each fold (stored 0-based, shown 1-based)
    print(f"\nBest epochs for each fold:")
    for position, fold_metrics in enumerate(all_best_metrics, start=1):
        shown_epoch = fold_metrics['best_epoch'] + 1
        balanced = fold_metrics['test_balanced_accuracy']
        print(f"Fold {position}: Epoch {shown_epoch} (Balanced Accuracy: {balanced:.4f})")

    return all_best_metrics

In [None]:
# Hyperparameters shared by every fold of this experiment
config = dict(
    learning_rate=1e-4,
    weight_decay=0.01,
    dropout=0.1,
    num_epochs=50,
)

# Launch cross-validation; each fold is tracked as its own wandb run
all_fold_results = run_cross_validation_sweep(config)