In [1]:
!pip install transformers datasets evaluate torch --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os, random, numpy as np, pandas as pd, torch, hashlib
from sklearn.model_selection import train_test_split
from datasets import Dataset, Value
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from transformers import EarlyStoppingCallback
from torch.nn.functional import softmax
from sklearn.metrics import f1_score, classification_report, confusion_matrix, roc_auc_score, average_precision_score


In [3]:
# Reproducibility setup: fix global seeds and enforce deterministic behavior
# across Python, NumPy, and PyTorch (CPU & GPU).
# Note: disabling cuDNN benchmark and enabling deterministic mode
# may slightly reduce training speed but guarantees reproducible runs.

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE

'cuda'

In [4]:
# Loading + basic cleaning
df = pd.read_csv('fake_news_full_data.csv', index_col=0)
df = df.drop(columns=["date"], errors="ignore")                     # drop date (risk of leakage)
df = df.drop(columns=[c for c in df.columns if c.lower().startswith("unnamed")], errors="ignore")

# prepare text
df["title"] = df["title"].fillna("").astype(str)
df["text"]  = df["text"].fillna("").astype(str)
df["text_all"] = df["title"] + " [SEP] " + df["text"]

print("Before drop_duplicates:", len(df))

df_clean = df.drop_duplicates(subset="text_all").reset_index(drop=True)
print("After  drop_duplicates:", len(df_clean))


Before drop_duplicates: 44680
After  drop_duplicates: 39100


In [5]:
# Split 80/10/10
X = df_clean["text_all"].values
y = df_clean["is_fake"].astype(int).values

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.20, random_state=SEED, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=SEED, stratify=y_temp
)

print(f"Sizes → train={len(X_train)}, val={len(X_val)}, test={len(X_test)}")

# Quick check: ensure there are no exact duplicates across splits
print("train∩val :", len(set(X_train) & set(X_val)))
print("train∩test:", len(set(X_train) & set(X_test)))
print("val∩test  :", len(set(X_val)   & set(X_test)))

Sizes → train=31280, val=3910, test=3910
train∩val : 0
train∩test: 0
val∩test  : 0


In [6]:
# Tokenizer + HF Datasets

model_path = "google-bert/bert-base-uncased"
max_len = 128

tokenizer = AutoTokenizer.from_pretrained(model_path)

train_ds = Dataset.from_dict({"text": X_train, "labels": y_train})
val_ds   = Dataset.from_dict({"text": X_val,   "labels": y_val})
test_ds  = Dataset.from_dict({"text": X_test,  "labels": y_test})

# ensure int64 type for labels
train_ds = train_ds.cast_column("labels", Value("int64"))
val_ds   = val_ds.cast_column("labels",   Value("int64"))
test_ds  = test_ds.cast_column("labels",  Value("int64"))

# Define text preprocessing
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=max_len)

train_tok = train_ds.map(preprocess_function, batched=True, remove_columns=["text"])
val_tok   = val_ds.map(preprocess_function,   batched=True, remove_columns=["text"])
test_tok  = test_ds.map(preprocess_function,  batched=True, remove_columns=["text"])

train_tok.set_format(type="torch")
val_tok.set_format(type="torch")
test_tok.set_format(type="torch")

train_tok[:2]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Casting the dataset:   0%|          | 0/31280 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3910 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3910 [00:00<?, ? examples/s]

Map:   0%|          | 0/31280 [00:00<?, ? examples/s]

Map:   0%|          | 0/3910 [00:00<?, ? examples/s]

Map:   0%|          | 0/3910 [00:00<?, ? examples/s]

{'labels': tensor([0, 0]),
 'input_ids': tensor([[  101, 27604,  1011,  2091,  8398,  2515,  2025,  2812,  7822,  2013,
           3343,  1024, 14895,   102,  1006, 26665,  1007,  1011,  1037,  4872,
           2008,  1057,  1012,  1055,  1012,  3951,  2392,  1011,  5479,  6221,
           8398,  2097, 11092,  1037,  2062,  4883,  3049,  2806,  2515,  2025,
           4742,  1037,  7822,  2013,  4563,  6043,  2107,  2004,  2010, 16393,
           2000,  3857,  1037,  2813,  2006,  1996,  4916,  3675,  1010,  2010,
           2327, 11747,  2056,  2006,  4465,  1012,  3026,  8398, 14895,  2703,
          24951, 13028,  7219,  6538,  3951,  4018,  6945,  8096,  1521,  1055,
          19238,  2008,  1996,  2613,  3776,  9587, 24848,  2018,  9828,  2055,
           2010,  6043,  2006,  7521,  2000,  1523,  7966, 19739,  6894,  3468,
           7206,  1012,  1524,  1996, 10722,  4757,  2571,  2058,  8398,  1521,
           1055,  2806,  1998,  9415, 11677,  9857,  1521,  1055,  3951,  1998,


In [7]:
# Create data collator: dynamically pads sequences in a batch to the same length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [8]:
# Define label mappings and load pre-trained model for binary classification
id2label = {0: "REAL", 1: "FAKE"}
label2id = {"REAL": 0, "FAKE": 1}

model = AutoModelForSequenceClassification.from_pretrained(
    model_path, num_labels=2, id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Metrics
def compute_metrics(eval_pred):
    """
    Compute F1 (macro, weighted) and accuracy
    from model predictions and true labels.
    """
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)
    return {
        "f1_macro":    f1_score(labels, preds, average="macro"),
        "f1_weighted": f1_score(labels, preds, average="weighted"),
        "accuracy":    (preds == labels).mean(),
    }


In [10]:
# Trainer setup

lr = 2e-5          # learning rate
batch_size = 8     # batch size
num_epochs = 4     # number of training epochs

args = TrainingArguments(
    output_dir="./bert_out",                # output directory
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size*4,
    num_train_epochs=num_epochs,
    logging_strategy="epoch",               # log each epoch
    eval_strategy="epoch",                  # evaluate each epoch
    save_strategy="epoch",                  # save each epoch
    load_best_model_at_end=True,            # restore best model
    metric_for_best_model="f1_macro",       # use F1-macro for checkpointing
    greater_is_better=True,
    warmup_ratio=0.1,                       # warm-up phase
    weight_decay=0.01,                      # regularization
    fp16=True,                              # mixed precision
    save_total_limit=2,                     # keep last 2 checkpoints
    max_grad_norm=1.0,                      # gradient clipping
    report_to="none",                       # no external logging
    seed=SEED,                              # reproducibility
)

trainer = Trainer(
    model=model,                 # BERT model
    args=args,                   # training config
    train_dataset=train_tok,     # training set
    eval_dataset=val_tok,        # validation set
    tokenizer=tokenizer,         # tokenizer
    data_collator=data_collator, # batch padding
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)] # early stopping
)

# Train and evaluate
trainer.train()
val_metrics = trainer.evaluate()
val_metrics


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Weighted,Accuracy
1,0.0382,0.006112,0.999227,0.999233,0.999233
2,0.0024,0.005377,0.999485,0.999489,0.999488
3,0.0023,0.004372,0.999485,0.999489,0.999488


{'eval_loss': 0.005377208814024925,
 'eval_f1_macro': 0.9994849103770403,
 'eval_f1_weighted': 0.9994885130159035,
 'eval_accuracy': 0.9994884910485934,
 'eval_runtime': 6.924,
 'eval_samples_per_second': 564.701,
 'eval_steps_per_second': 17.764,
 'epoch': 3.0}

In [11]:
# Evaluate model on test set: predictions, metrics, and confusion matrix
pred_logits_test = trainer.predict(test_tok).predictions
y_pred_test = pred_logits_test.argmax(1)
proba_test = softmax(torch.tensor(pred_logits_test), dim=1).numpy()[:,1]

print("TEST | F1-macro:", round(f1_score(y_test, y_pred_test, average="macro"), 6))
print(classification_report(y_test, y_pred_test, digits=3, target_names=["real(0)","fake(1)"]))

cm_test = pd.DataFrame(confusion_matrix(y_test, y_pred_test),
                       index=["real(0)","fake(1)"],
                       columns=["pred_real(0)","pred_fake(1)"])
cm_test


TEST | F1-macro: 0.999485
              precision    recall  f1-score   support

     real(0)      1.000     0.999     1.000      2120
     fake(1)      0.999     1.000     0.999      1790

    accuracy                          0.999      3910
   macro avg      0.999     1.000     0.999      3910
weighted avg      0.999     0.999     0.999      3910



Unnamed: 0,pred_real(0),pred_fake(1)
real(0),2118,2
fake(1),0,1790


In [12]:
def norm(s):
    """
    Normalize text by lowercasing and collapsing extra spaces.

    Args:
        s (str): Input string.
    Returns:
        str: Normalized string.
    """
    return " ".join(str(s).lower().split())

# Build train/val/test DataFrames
train_df = pd.DataFrame({"text": X_train, "y": y_train})
val_df   = pd.DataFrame({"text": X_val,   "y": y_val})
test_df  = pd.DataFrame({"text": X_test,  "y": y_test})

# Hash normalized texts (SHA-1) for duplicate detection
for d in (train_df, val_df, test_df):
    d["h"] = d["text"].map(lambda t: hashlib.sha1(norm(t).encode("utf-8")).hexdigest())

# Check for exact duplicates across splits
print("Exact dupes train∩val:", len(set(train_df.h) & set(val_df.h)))
print("Exact dupes train∩test:", len(set(train_df.h) & set(test_df.h)))
print("Exact dupes val∩test:", len(set(val_df.h) & set(test_df.h)))


Exact dupes train∩val: 1
Exact dupes train∩test: 0
Exact dupes val∩test: 0


In [13]:
# Ensure reports directory exists
os.makedirs("../reports", exist_ok=True)

def log_row(name, y_true, y_pred, proba):
    """
    Save evaluation metrics and confusion matrix counts
    to ../reports/metrics.csv. Updates existing entry if model name exists.
    """
    cm = confusion_matrix(y_true, y_pred)
    row = pd.DataFrame([{
        "model": name,
        "n": int(len(y_true)),
        "f1_macro": float(round(f1_score(y_true, y_pred, average="macro"), 6)),
        "f1_weighted": float(round(f1_score(y_true, y_pred, average="weighted"), 6)),
        "roc_auc": float(round(roc_auc_score(y_true, proba), 6)),
        "pr_auc": float(round(average_precision_score(y_true, proba), 6)),
        "tn": int(cm[0,0]), "fp": int(cm[0,1]),
        "fn": int(cm[1,0]), "tp": int(cm[1,1]),
    }])

    path = "../reports/metrics.csv"
    # Load existing metrics if file exists, otherwise create new
    met = pd.read_csv(path) if os.path.exists(path) else pd.DataFrame(columns=row.columns)
    # Update entry if model with the same name already exists
    met = met[met.model != name]
    # Append new row and save
    met = pd.concat([met, row], ignore_index=True)
    met.to_csv(path, index=False)
    print("Saved →", path)


# Log validation metrics
from torch.nn.functional import softmax
val_logits = trainer.predict(val_tok).predictions
val_pred = val_logits.argmax(1)
val_proba = softmax(torch.tensor(val_logits), dim=1).numpy()[:,1]
log_row("bert_base_uncased_len128_val", y_val, val_pred, val_proba)

# Log test metrics
log_row("bert_base_uncased_len128_test", y_test, y_pred_test, proba_test)


Saved → ../reports/metrics.csv
Saved → ../reports/metrics.csv


  met = pd.concat([met, row], ignore_index=True)
