# Clickbait Detector with Transformers

In [8]:
import sys
import argparse
import random
import pandas as pd
from pathlib import Path

import numpy as np
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    confusion_matrix,
    precision_recall_fscore_support,
    roc_auc_score,
    average_precision_score,
)

import torch
from torch.utils.data import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed,
)

In [9]:
# Project root inside the runtime. It is placed at the front of sys.path so
# that the project-local import following this block can be resolved.
REPO_ROOT = Path('/content')
_repo_root_entry = str(REPO_ROOT)
if _repo_root_entry not in sys.path:
    # Prepend only when absent, so re-running the cell adds no duplicate entry.
    sys.path.insert(0, _repo_root_entry)

from utility.dataLoader import load_texts_labels as load_texts_labels_unified

def get_texts_labels_for(dataset: str):
    """Load the data for *dataset* through the shared project loader.

    Thin pass-through to ``load_texts_labels_unified``: its return value is
    handed back unchanged. Callers in this notebook unpack the result as
    ``texts, labels``.
    """
    loaded = load_texts_labels_unified(dataset)
    return loaded

In [10]:
def evaluate(gold, pred, tag: str, y_score=None):
    """Print classification metrics for binary predictions.

    Always prints accuracy, precision/recall for the positive class (label 1)
    and the confusion matrix over labels ``[0, 1]``. When ``y_score`` is given,
    additionally prints per-class, macro and micro precision/recall/F1 plus
    ROC-AUC and PR-AUC computed from ``y_score``.

    Args:
        gold: Ground-truth labels.
        pred: Predicted labels.
        tag: Text placed inside the ``[...]`` prefix of every printed line.
        y_score: Optional scores for the positive class; ``None`` skips the
            extended metrics.

    Returns:
        None. All results are written to stdout.
    """
    accuracy = accuracy_score(gold, pred)
    precision = precision_score(gold, pred, pos_label=1, zero_division=0)
    recall = recall_score(gold, pred, pos_label=1, zero_division=0)
    matrix = confusion_matrix(gold, pred, labels=[0, 1])
    print(f"[{tag}] Acc={accuracy:.3f}  Prec(pos=1)={precision:.3f}  Rec(pos=1)={recall:.3f}")
    print(f"[{tag}] Confusion Matrix:\n{matrix}")

    # Without scores there is nothing more to report.
    if y_score is None:
        return

    def _prf(**options):
        # Shared call shape for the three precision/recall/F1 variants below.
        return precision_recall_fscore_support(
            gold, pred, zero_division=0, **options
        )

    try:
        p_c, r_c, f1_c, supp_c = _prf(labels=[0, 1])
        p_macro, r_macro, f1_macro, _ = _prf(average="macro")
        p_micro, r_micro, f1_micro, _ = _prf(average="micro")

        # Each AUC is best-effort: a failure is reported as "N/A" instead of
        # discarding the other extended metrics.
        try:
            roc_auc = roc_auc_score(gold, y_score)
        except Exception:
            roc_auc = None
        try:
            pr_auc = average_precision_score(gold, y_score)
        except Exception:
            pr_auc = None

        roc_text = "N/A" if roc_auc is None else f"{roc_auc:.3f}"
        pr_text = "N/A" if pr_auc is None else f"{pr_auc:.3f}"

        print(f"[{tag}] Per-class:")
        print(
            f"  class 0: prec={p_c[0]:.3f} rec={r_c[0]:.3f} "
            f"f1={f1_c[0]:.3f} support={supp_c[0]}"
        )
        print(
            f"  class 1: prec={p_c[1]:.3f} rec={r_c[1]:.3f} "
            f"f1={f1_c[1]:.3f} support={supp_c[1]}"
        )
        print(
            f"[{tag}] Macro: prec={p_macro:.3f} "
            f"rec={r_macro:.3f} f1={f1_macro:.3f}"
        )
        print(
            f"[{tag}] Micro: prec={p_micro:.3f} "
            f"rec={r_micro:.3f} f1={f1_micro:.3f}"
        )
        print(f"[{tag}] ROC-AUC: {roc_text}")
        print(f"[{tag}] PR-AUC:  {pr_text}")
    except Exception as e:
        # Extended metrics never abort the caller; the failure is only logged.
        print(f"[{tag}] Extended metrics error: {e}")

In [11]:
class ClickbaitDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    ``encodings`` is a mapping whose values are indexed per example in
    ``__getitem__``; ``labels`` is copied into a list on construction.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = list(labels)

    def __len__(self):
        # The number of examples is taken from the labels, not the encodings.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the label as a long tensor
        # under the "labels" key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample


def tokenize_texts(tokenizer, texts, max_length: int):
    """Run *tokenizer* over *texts* with truncation and fixed-length padding.

    ``texts`` is copied into a list before the call. The tokenizer is invoked
    with ``truncation=True``, ``padding="max_length"`` and the given
    ``max_length``; its return value is passed back unchanged.
    """
    batch = list(texts)
    options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
    }
    return tokenizer(batch, **options)

In [12]:
def train_and_evaluate_transformer(
    dataset: str,
    X_texts,
    y,
    model_name: str = "bert-base-uncased",
    max_length: int = 64,
    batch_size: int = 16,
    num_epochs: int = 3,
    learning_rate: float = 2e-5,
    weight_decay: float = 0.01,
    seed: int = 42,
    output_dir: str = "transformer_clickbait",
):
    """
    Fine-tune a transformer classifier on one dataset and print test metrics.

    Transformer analogue of train_and_evaluate_naive_bayes:

    - Deterministic shuffle with seed
    - 80/20 train/test split
    - Validation split taken from the tail of the train slice: 10% when the
      slice has more than 10 examples, otherwise one example (none when the
      slice holds a single example)
    - Fine-tune a two-label sequence-classification model
    - Evaluate on held-out test set with the same metrics as evaluate()

    Args:
        dataset: Name used in log prefixes and in the run's output directory.
        X_texts: Iterable of input texts; ``None`` is treated as empty.
        y: Iterable of labels paired with ``X_texts`` by position. Items
            beyond the shorter of the two inputs are ignored. Labels equal to
            ``1`` are counted as positives.
        model_name: Checkpoint name passed to the tokenizer and model loaders.
        max_length: Token length every example is truncated/padded to.
        batch_size: Per-device batch size for training and evaluation.
        num_epochs: Number of training epochs.
        learning_rate: Learning rate passed to ``TrainingArguments``.
        weight_decay: Weight decay passed to ``TrainingArguments``.
        seed: Seed for ``set_seed`` and for the shuffle.
        output_dir: Parent directory of this run's output directory.

    Returns:
        None. Progress and metrics are printed; the run is skipped (with a
        message) when fewer than two examples are available.
    """
    # Materialise the inputs first: truth-testing a NumPy array or a pandas
    # Series is ambiguous, and a generator could only be consumed once.
    texts = [] if X_texts is None else list(X_texts)
    labels = [] if y is None else list(y)
    if not texts or not labels:
        print(f"[{dataset}] No data loaded; skipping.")
        return

    # zip() stops at the shorter input, so surplus texts or labels are dropped.
    docs = list(zip(texts, labels))
    n = len(docs)
    if n < 2:
        # With a single example the 80/20 split would leave the test set empty.
        print(
            f"[{dataset}] Need at least 2 examples for a train/test split "
            f"(got {n}); skipping."
        )
        return

    set_seed(seed)

    # Deterministic shuffle (same spirit as naiveBayes.py)
    random.Random(seed).shuffle(docs)
    shuffled_texts, shuffled_labels = zip(*docs)

    # 80/20 split for train+val / test
    k = max(1, int(0.8 * n))
    train_texts_full = shuffled_texts[:k]
    train_labels_full = shuffled_labels[:k]
    test_texts = shuffled_texts[k:]
    test_labels = shuffled_labels[k:]

    # Log test distribution (to mirror naiveBayes.py)
    pos_test = sum(1 for v in test_labels if v == 1)
    neg_test = len(test_labels) - pos_test
    print(
        f"[{dataset}] Test split distribution: "
        f"pos={pos_test} neg={neg_test} (n={len(test_labels)})"
    )

    # Create a small validation set from the training slice (e.g., 10% of train)
    if len(train_texts_full) > 10:
        val_size = max(1, int(0.1 * len(train_texts_full)))
    else:
        val_size = 1 if len(train_texts_full) > 1 else 0

    if val_size > 0:
        train_texts = train_texts_full[:-val_size]
        train_labels = train_labels_full[:-val_size]
        val_texts = train_texts_full[-val_size:]
        val_labels = train_labels_full[-val_size:]
    else:
        train_texts = train_texts_full
        train_labels = train_labels_full
        val_texts, val_labels = [], []

    # Tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
    )

    # Tokenization
    train_encodings = tokenize_texts(tokenizer, train_texts, max_length=max_length)
    if val_texts:
        val_encodings = tokenize_texts(tokenizer, val_texts, max_length=max_length)
    else:
        val_encodings = None
    test_encodings = tokenize_texts(tokenizer, test_texts, max_length=max_length)

    # Datasets
    train_dataset = ClickbaitDataset(train_encodings, train_labels)
    eval_dataset = (
        ClickbaitDataset(val_encodings, val_labels) if val_encodings is not None else None
    )
    test_dataset = ClickbaitDataset(test_encodings, test_labels)
    has_validation = eval_dataset is not None

    # Compute metrics for Trainer (simple accuracy + macro F1)
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        acc = accuracy_score(labels, preds)
        # Macro F1 over (0,1)
        _, _, f1_macro, _ = precision_recall_fscore_support(
            labels, preds, average="macro", zero_division=0
        )
        return {"accuracy": acc, "f1_macro": f1_macro}

    # Training arguments. The directory name uses the space-free dataset name
    # so the run path never contains spaces.
    tag_safe = dataset.replace(" ", "_")
    run_output_dir = f"{output_dir}/{tag_safe}_{model_name.replace('/', '_')}"

    training_args = TrainingArguments(
        output_dir=run_output_dir,
        eval_strategy="epoch" if has_validation else "no",
        save_strategy="epoch" if has_validation else "no",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=weight_decay,
        load_best_model_at_end=has_validation,
        metric_for_best_model="f1_macro",
        logging_dir=f"{run_output_dir}/logs",
        logging_steps=50,
        report_to="none",  # Disable Weights & Biases logging
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics if has_validation else None,
    )

    # Train
    print(f"[{dataset}] Starting transformer fine-tuning on {model_name}")
    trainer.train()

    # Evaluate on validation (if exists)
    if has_validation:
        val_metrics = trainer.evaluate()
        print(f"[{dataset}] Validation metrics: {val_metrics}")

    # Final evaluation on test set (with full metrics)
    print(f"[{dataset}] Evaluating on held-out test set...")
    predictions = trainer.predict(test_dataset)
    logits = predictions.predictions
    test_labels_arr = np.array(test_labels, dtype=int)
    preds = np.argmax(logits, axis=-1)

    # Convert logits to probabilities for class 1
    # logits shape: (N, 2) for binary classification
    probs = torch.softmax(torch.tensor(logits), dim=-1).numpy()
    y_score = probs[:, 1]

    # evaluate() wraps the tag in "[...]" itself, so the outermost brackets
    # are left off here; a trailing "]" would be printed twice.
    tag = (
        f"{dataset}][Transformer][model={model_name}]"
        f"[max_len={max_length}][epochs={num_epochs}"
    )
    evaluate(test_labels_arr, preds, tag, y_score=y_score)


# -----------------------------------------------------------------------------
# Driver (mirrors run_for_dataset / main in naiveBayes.py)
# -----------------------------------------------------------------------------

def run_for_dataset(
    dataset: str,
    X_texts_arg=None,
    y_arg=None,
    model_name: str = "bert-base-uncased",
    max_length: int = 64,
    batch_size: int = 16,
    num_epochs: int = 3,
    learning_rate: float = 2e-5,
    weight_decay: float = 0.01,
    seed: int = 42,
    output_dir: str = "transformer_clickbait",
):
    """Print a banner for *dataset*, obtain its data, and run fine-tuning.

    When both ``X_texts_arg`` and ``y_arg`` are supplied they are used as the
    texts and labels. If either one is ``None``, both are ignored and the data
    is loaded with ``get_texts_labels_for(dataset)`` instead. All remaining
    arguments are forwarded unchanged to ``train_and_evaluate_transformer``.
    """
    print(f"\n=== {dataset.capitalize()} (Transformer) ===")

    use_loader = X_texts_arg is None or y_arg is None
    if use_loader:
        texts, labels = get_texts_labels_for(dataset)
    else:
        texts, labels = X_texts_arg, y_arg

    training_options = {
        "model_name": model_name,
        "max_length": max_length,
        "batch_size": batch_size,
        "num_epochs": num_epochs,
        "learning_rate": learning_rate,
        "weight_decay": weight_decay,
        "seed": seed,
        "output_dir": output_dir,
    }
    train_and_evaluate_transformer(
        dataset=dataset,
        X_texts=texts,
        y=labels,
        **training_options,
    )

In [14]:
import pandas as pd

# Read the Kaggle clickbait CSV into a DataFrame.
kaggle_clickbait_df = pd.read_csv('/content/data/kaggle_clickbait.csv')

# Texts come from the 'headline' column and labels from the 'clickbait'
# column (assumed column names), each converted to a plain Python list.
X_texts_kaggle, y_kaggle = (
    kaggle_clickbait_df[column].tolist() for column in ('headline', 'clickbait')
)

# Hyperparameters for the BERT-base run on the Kaggle data.
kaggle_bert_config = dict(
    model_name="bert-base-uncased",
    max_length=128,
    batch_size=32,
    num_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    seed=42,
    output_dir="transformer_clickbait_results",
)

run_for_dataset(
    dataset="kaggle_clickbait",
    X_texts_arg=X_texts_kaggle,
    y_arg=y_kaggle,
    **kaggle_bert_config,
)


=== Kaggle_clickbait (Transformer) ===
[kaggle_clickbait] Test split distribution: pos=3160 neg=3240 (n=6400)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[kaggle_clickbait] Starting transformer fine-tuning on bert-base-uncased


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.0526,0.040929,0.987891,0.987885
2,0.0128,0.042081,0.989844,0.98984
3,0.0006,0.051593,0.991406,0.991403


[kaggle_clickbait] Validation metrics: {'eval_loss': 0.05159271880984306, 'eval_accuracy': 0.99140625, 'eval_f1_macro': 0.991402702775315, 'eval_runtime': 16.8025, 'eval_samples_per_second': 152.358, 'eval_steps_per_second': 4.761, 'epoch': 3.0}
[kaggle_clickbait] Evaluating on held-out test set...
[kaggle_clickbait][Transformer][model=bert-base-uncased][max_len=128][epochs=3]] Acc=0.990  Prec(pos=1)=0.989  Rec(pos=1)=0.990
[kaggle_clickbait][Transformer][model=bert-base-uncased][max_len=128][epochs=3]] Confusion Matrix:
[[3205   35]
 [  31 3129]]
[kaggle_clickbait][Transformer][model=bert-base-uncased][max_len=128][epochs=3]] Per-class:
  class 0: prec=0.990 rec=0.989 f1=0.990 support=3240
  class 1: prec=0.989 rec=0.990 f1=0.990 support=3160
[kaggle_clickbait][Transformer][model=bert-base-uncased][max_len=128][epochs=3]] Macro: prec=0.990 rec=0.990 f1=0.990
[kaggle_clickbait][Transformer][model=bert-base-uncased][max_len=128][epochs=3]] Micro: prec=0.990 rec=0.990 f1=0.990
[kaggle_cl

In [19]:
import pandas as pd

# Read the Kaggle clickbait CSV into a DataFrame.
kaggle_clickbait_df = pd.read_csv('/content/data/kaggle_clickbait.csv')

# Texts come from the 'headline' column and labels from the 'clickbait'
# column (assumed column names), each converted to a plain Python list.
X_texts_kaggle, y_kaggle = (
    kaggle_clickbait_df[column].tolist() for column in ('headline', 'clickbait')
)

# Lighter configuration than the BERT-base run: DistilBERT instead of
# "bert-base-uncased", max_length 64 instead of 128, batch size 64 instead
# of 32, and 2 epochs instead of 3.
kaggle_distilbert_config = dict(
    model_name="distilbert-base-uncased",
    max_length=64,
    batch_size=64,
    num_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    seed=42,
    output_dir="transformer_clickbait_results",
)

run_for_dataset(
    dataset="kaggle_clickbait",
    X_texts_arg=X_texts_kaggle,
    y_arg=y_kaggle,
    **kaggle_distilbert_config,
)


=== Kaggle_clickbait (Transformer) ===
[kaggle_clickbait] Test split distribution: pos=3160 neg=3240 (n=6400)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[kaggle_clickbait] Starting transformer fine-tuning on distilbert-base-uncased


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.039,0.043636,0.986328,0.986326
2,0.0253,0.039983,0.991016,0.991011


[kaggle_clickbait] Validation metrics: {'eval_loss': 0.0399833619594574, 'eval_accuracy': 0.991015625, 'eval_f1_macro': 0.9910114760917934, 'eval_runtime': 4.0752, 'eval_samples_per_second': 628.195, 'eval_steps_per_second': 9.816, 'epoch': 2.0}
[kaggle_clickbait] Evaluating on held-out test set...
[kaggle_clickbait][Transformer][model=distilbert-base-uncased][max_len=64][epochs=2]] Acc=0.988  Prec(pos=1)=0.991  Rec(pos=1)=0.986
[kaggle_clickbait][Transformer][model=distilbert-base-uncased][max_len=64][epochs=2]] Confusion Matrix:
[[3211   29]
 [  45 3115]]
[kaggle_clickbait][Transformer][model=distilbert-base-uncased][max_len=64][epochs=2]] Per-class:
  class 0: prec=0.986 rec=0.991 f1=0.989 support=3240
  class 1: prec=0.991 rec=0.986 f1=0.988 support=3160
[kaggle_clickbait][Transformer][model=distilbert-base-uncased][max_len=64][epochs=2]] Macro: prec=0.988 rec=0.988 f1=0.988
[kaggle_clickbait][Transformer][model=distilbert-base-uncased][max_len=64][epochs=2]] Micro: prec=0.988 rec=

In [34]:
import pandas as pd

# Read train2.csv from the news clickbait dataset folder.
train2_df = pd.read_csv('/content/data/news_clickbait_dataset/train2.csv')

# 'title' holds the headline text; 'label' holds the category, which is
# assumed to play the role of the clickbait label.
X_texts_train2, y_train2 = (
    train2_df[column].tolist() for column in ('title', 'label')
)

# Quick sanity check: sizes plus the first five entries of each list.
print(
    f"Loaded {len(X_texts_train2)} headlines and {len(y_train2)} labels from train2.csv",
    f"First 5 headlines: {X_texts_train2[:5]}",
    f"First 5 labels: {y_train2[:5]}",
    sep="\n",
)

Loaded 21029 headlines and 21029 labels from train2.csv
First 5 headlines: ['China and Economic Reform: Xi Jinping’s Track Record ', 'Trade to Be a Big Topic in Theresa May’s U.S. Visit', 'The Top Beaches In The World, According To National Geographic', 'Sheriff’s Report Provides New Details on Tamir Rice’s Death, but Leaves Questions ', "Surgeon claiming he will transplant volunteer's HEAD to another body says he needs America's help to do it"]
First 5 labels: ['news', 'news', 'clickbait', 'clickbait', 'news']


In [36]:
# String categories -> integer class ids (1 is the positive/clickbait class).
# A category missing from this table raises KeyError during the conversion.
label_mapping = {'news': 0, 'clickbait': 1}
y_train2_numerical = [label_mapping[category] for category in y_train2]

# DistilBERT configuration for the train2 run.
train2_run_config = dict(
    model_name="distilbert-base-uncased",
    max_length=64,
    batch_size=64,
    num_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    seed=42,
    output_dir="transformer_clickbait_results",
)

run_for_dataset(
    dataset="news_clickbait_train2",
    X_texts_arg=X_texts_train2,
    y_arg=y_train2_numerical,  # numeric labels, not the raw strings
    **train2_run_config,
)


=== News_clickbait_train2 (Transformer) ===
[news_clickbait_train2] Test split distribution: pos=839 neg=3367 (n=4206)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[news_clickbait_train2] Starting transformer fine-tuning on distilbert-base-uncased


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.4085,0.402454,0.830559,0.67224
2,0.3775,0.402198,0.839477,0.707426


[news_clickbait_train2] Validation metrics: {'eval_loss': 0.40219777822494507, 'eval_accuracy': 0.8394768133174791, 'eval_f1_macro': 0.7074262477580555, 'eval_runtime': 2.6519, 'eval_samples_per_second': 634.268, 'eval_steps_per_second': 10.181, 'epoch': 2.0}
[news_clickbait_train2] Evaluating on held-out test set...
[news_clickbait_train2][Transformer][model=distilbert-base-uncased][max_len=64][epochs=2]] Acc=0.831  Prec(pos=1)=0.623  Rec(pos=1)=0.392
[news_clickbait_train2][Transformer][model=distilbert-base-uncased][max_len=64][epochs=2]] Confusion Matrix:
[[3168  199]
 [ 510  329]]
[news_clickbait_train2][Transformer][model=distilbert-base-uncased][max_len=64][epochs=2]] Per-class:
  class 0: prec=0.861 rec=0.941 f1=0.899 support=3367
  class 1: prec=0.623 rec=0.392 f1=0.481 support=839
[news_clickbait_train2][Transformer][model=distilbert-base-uncased][max_len=64][epochs=2]] Macro: prec=0.742 rec=0.667 f1=0.690
[news_clickbait_train2][Transformer][model=distilbert-base-uncased][ma

In [51]:
# Load the Webis data through the project loader and preview it.
X_texts_webis, y_webis = get_texts_labels_for('webis')
print(
    f"Loaded {len(X_texts_webis)} texts and {len(y_webis)} labels from Webis dataset.",
    f"First 5 texts: {X_texts_webis[:5]}",
    f"First 5 labels: {y_webis[:5]}",
    sep="\n",
)

Loaded 19484 texts and 19484 labels from Webis dataset.
First 5 texts: ['UK’s response to modern slavery leaving victims destitute while abusers go free', 'this is good', 'The "forgotten" Trump roast: Relive his brutal 2004 thrashing at the New York Friars Club', 'Meet the happiest #dog in the world!', "Tokyo's subway is shut down amid fears over an imminent North Korean missile attack on Japan"]
First 5 labels: [0, 1, 0, 1, 0]


In [52]:
# DistilBERT configuration for the Webis run. The texts and labels are passed
# in explicitly, so the dataset name here only labels the logs and output.
webis_run_config = dict(
    model_name="distilbert-base-uncased",
    max_length=64,
    batch_size=64,
    num_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    seed=42,
    output_dir="transformer_clickbait_results",
)

run_for_dataset(
    dataset="webis-data",
    X_texts_arg=X_texts_webis,
    y_arg=y_webis,
    **webis_run_config,
)


=== Webis-data (Transformer) ===
[webis-data] Test split distribution: pos=920 neg=2977 (n=3897)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[webis-data] Starting transformer fine-tuning on distilbert-base-uncased


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.354,0.334099,0.84724,0.787516
2,0.3136,0.329098,0.849807,0.797114


[webis-data] Validation metrics: {'eval_loss': 0.3290978670120239, 'eval_accuracy': 0.8498074454428755, 'eval_f1_macro': 0.7971137407842719, 'eval_runtime': 2.489, 'eval_samples_per_second': 625.945, 'eval_steps_per_second': 10.044, 'epoch': 2.0}
[webis-data] Evaluating on held-out test set...
[webis-data][Transformer][model=distilbert-base-uncased][max_len=64][epochs=2]] Acc=0.853  Prec(pos=1)=0.702  Rec(pos=1)=0.654
[webis-data][Transformer][model=distilbert-base-uncased][max_len=64][epochs=2]] Confusion Matrix:
[[2722  255]
 [ 318  602]]
[webis-data][Transformer][model=distilbert-base-uncased][max_len=64][epochs=2]] Per-class:
  class 0: prec=0.895 rec=0.914 f1=0.905 support=2977
  class 1: prec=0.702 rec=0.654 f1=0.678 support=920
[webis-data][Transformer][model=distilbert-base-uncased][max_len=64][epochs=2]] Macro: prec=0.799 rec=0.784 f1=0.791
[webis-data][Transformer][model=distilbert-base-uncased][max_len=64][epochs=2]] Micro: prec=0.853 rec=0.853 f1=0.853
[webis-data][Transfor