In [49]:
!pip install -U transformers datasets evaluate scikit-learn torch pandas



In [50]:
!pip uninstall -y wandb

Found existing installation: wandb 0.21.4
Uninstalling wandb-0.21.4:
  Successfully uninstalled wandb-0.21.4


In [51]:
import pandas as pd
import numpy as np
from glob import glob
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
import torch
from torch.utils.data import Dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    __version__ as transformers_version
)
import os  # For file checks
os.environ["WANDB_DISABLED"] = "true"

In [52]:
print("Transformers version:", transformers_version)
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())


Transformers version: 4.56.2
PyTorch version: 2.8.0+cu126
CUDA available: False


In [53]:
# Define full paths for Colab (adjust if files are elsewhere)
base_path = "/content/"
file_paths = {
    "PolitiFact_real_news_content.csv": os.path.join(base_path, "PolitiFact_real_news_content.csv"),
    "PolitiFact_fake_news_content.csv": os.path.join(base_path, "PolitiFact_fake_news_content.csv"),
    "BuzzFeed_real_news_content.csv": os.path.join(base_path, "BuzzFeed_real_news_content.csv"),
    "BuzzFeed_fake_news_content.csv": os.path.join(base_path, "BuzzFeed_fake_news_content.csv"),
}

In [54]:
# Check if files exist
missing_files = [f for f, p in file_paths.items() if not os.path.exists(p)]
if missing_files:
    raise FileNotFoundError(f"Missing files in /content/: {missing_files}. Please upload them via Colab's file uploader.")


In [55]:
# Map files to class labels (0=real, 1=fake)
file_map = {
    "PolitiFact_real_news_content.csv": 0,
    "PolitiFact_fake_news_content.csv": 1,
    "BuzzFeed_real_news_content.csv": 0,
    "BuzzFeed_fake_news_content.csv": 1,
}

In [56]:
# Process each file: Add class label and save as updated_
for fname, label in file_map.items():
    full_path = file_paths[fname]
    df = pd.read_csv(full_path)

    # Keep only title + text (drop others if exist)
    if "title" in df.columns and "text" in df.columns:
        df = df[["title", "text"]]
    else:
        raise ValueError(f"Expected 'title' and 'text' columns in {fname}")

    # Add class column
    df["class"] = label

    # Build new filename
    new_name = "updated_" + fname
    new_path = os.path.join(base_path, new_name)

    # Save
    df.to_csv(new_path, index=False)
    print(f"✅ Saved {new_name} with class={label} (rows={len(df)})")

✅ Saved updated_PolitiFact_real_news_content.csv with class=0 (rows=120)
✅ Saved updated_PolitiFact_fake_news_content.csv with class=1 (rows=120)
✅ Saved updated_BuzzFeed_real_news_content.csv with class=0 (rows=91)
✅ Saved updated_BuzzFeed_fake_news_content.csv with class=1 (rows=91)


In [57]:
# 1) Load all updated files
updated_files = glob(os.path.join(base_path, "updated_*.csv"))
print("Found updated files:", [os.path.basename(f) for f in updated_files])


Found updated files: ['updated_PolitiFact_fake_news_content.csv', 'updated_BuzzFeed_real_news_content.csv', 'updated_BuzzFeed_fake_news_content.csv', 'updated_PolitiFact_real_news_content.csv']


In [58]:
# 2) Concatenate into one dataframe
dfs = [pd.read_csv(f) for f in updated_files]
merged = pd.concat(dfs, ignore_index=True)

In [59]:
# 3) Drop duplicates and clean
merged = merged.drop_duplicates(subset=["title", "text"]).reset_index(drop=True)
merged["title"] = merged["title"].astype(str).str.strip()
merged["text"] = merged["text"].astype(str).str.strip()

In [60]:
# 4) Save final merged dataset
final_merged_path = os.path.join(base_path, "final_fake_news_dataset.csv")
merged.to_csv(final_merged_path, index=False)
print("✅ Final dataset created: final_fake_news_dataset.csv")
print("Shape:", merged.shape)
print("Class distribution:\n", merged["class"].value_counts())
print(merged.head())
# Clean the merged dataset
df = pd.read_csv(final_merged_path)
print("Before cleaning:", df.shape)

✅ Final dataset created: final_fake_news_dataset.csv
Shape: (289, 3)
Class distribution:
 class
1    200
0     89
Name: count, dtype: int64
                                               title  \
0  Trump Just Insulted Millions Who Lost Everythi...   
1  Famous dog killed in spot she waited a year fo...   
2  House oversight panel votes Clinton IT chief i...   
3  America Just Tragically Lost A Country Music I...   
4          Monuments to the Battle for the New South   

                                                text  class  
0  16.8k SHARES SHARE THIS STORY\n\nHillary Clint...      1  
1  Famous dog killed in spot she waited a year fo...      1  
2  Story highlights The House Oversight panel vot...      1  
3  We are absolutely heartbroken to hear about th...      1  
4  Nine years ago, a driver lost control of his p...      1  
Before cleaning: (289, 3)


In [61]:
# 1) Drop completely empty columns
df = df.dropna(axis=1, how="all")
# 2) Drop rows where title or text is missing/empty
df = df.dropna(subset=["title", "text"])  # Remove NaN
df = df[(df["title"].str.strip() != "") & (df["text"].str.strip() != "")]  # Remove empty strings


In [62]:
# 3) Reset index
df = df.reset_index(drop=True)


In [63]:
# 4) Save cleaned dataset
clean_path = os.path.join(base_path, "final_fake_news_dataset_clean.csv")
df.to_csv(clean_path, index=False)
print("After cleaning:", df.shape)
print("Class distribution:\n", df["class"].value_counts())
print(df.head())

After cleaning: (289, 3)
Class distribution:
 class
1    200
0     89
Name: count, dtype: int64
                                               title  \
0  Trump Just Insulted Millions Who Lost Everythi...   
1  Famous dog killed in spot she waited a year fo...   
2  House oversight panel votes Clinton IT chief i...   
3  America Just Tragically Lost A Country Music I...   
4          Monuments to the Battle for the New South   

                                                text  class  
0  16.8k SHARES SHARE THIS STORY\n\nHillary Clint...      1  
1  Famous dog killed in spot she waited a year fo...      1  
2  Story highlights The House Oversight panel vot...      1  
3  We are absolutely heartbroken to hear about th...      1  
4  Nine years ago, a driver lost control of his p...      1  


In [64]:
# 1) Load cleaned dataset
df = pd.read_csv(clean_path)
print("Dataset shape:", df.shape)
print("Class distribution:\n", df["class"].value_counts())
print(df.head())

Dataset shape: (289, 3)
Class distribution:
 class
1    200
0     89
Name: count, dtype: int64
                                               title  \
0  Trump Just Insulted Millions Who Lost Everythi...   
1  Famous dog killed in spot she waited a year fo...   
2  House oversight panel votes Clinton IT chief i...   
3  America Just Tragically Lost A Country Music I...   
4          Monuments to the Battle for the New South   

                                                text  class  
0  16.8k SHARES SHARE THIS STORY\n\nHillary Clint...      1  
1  Famous dog killed in spot she waited a year fo...      1  
2  Story highlights The House Oversight panel vot...      1  
3  We are absolutely heartbroken to hear about th...      1  
4  Nine years ago, a driver lost control of his p...      1  


In [65]:
# 2) Concatenate title + text into one column (input for model)
df["input_text"] = df["title"].astype(str) + " " + df["text"].astype(str)
# 3) Split into train & validation (ensure labels are integers)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["input_text"],
    df["class"],
    test_size=0.2,
    stratify=df["class"],  # Keeps class balance
    random_state=42
)

In [66]:
# Convert labels to lists of integers
train_labels = train_labels.tolist()
val_labels = val_labels.tolist()

print("Training samples:", len(train_texts))
print("Validation samples:", len(val_texts))
print("Sample labels (train):", train_labels[:5])

Training samples: 231
Validation samples: 58
Sample labels (train): [1, 1, 0, 1, 1]


In [67]:
# 1) Load tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


In [68]:
# 2) Tokenize training and validation sets (THIS CREATES train_encodings, etc.)
train_encodings = tokenizer(
    list(train_texts),
    truncation=True,
    padding=True,
    max_length=256,  # Reduced for efficiency; increase to 512 if needed
    return_tensors=None  # Returns lists (converted to tensors in Dataset)
)
val_encodings = tokenizer(
    list(val_texts),
    truncation=True,
    padding=True,
    max_length=256,
    return_tensors=None
)
print("✅ Tokenization complete!")
print("Encodings keys:", train_encodings.keys())
print("Example input IDs:", train_encodings["input_ids"][0][:20])
print("Attention mask:", train_encodings["attention_mask"][0][:20])

✅ Tokenization complete!
Encodings keys: KeysView({'input_ids': [[101, 8112, 1521, 1055, 4613, 1999, 2047, 2259, 28215, 4841, 2043, 2002, 2758, 2023, 2055, 4491, 999, 8112, 1521, 1055, 4613, 1999, 2047, 2259, 28215, 4841, 2043, 2002, 2758, 2023, 2055, 4491, 999, 1045, 2052, 2360, 2256, 2343, 2003, 11809, 1010, 2021, 2428, 1010, 2023, 3632, 3458, 11809, 1998, 2157, 2046, 1996, 8391, 1997, 24416, 1012, 2002, 2038, 2589, 2070, 3492, 4167, 2757, 2477, 1999, 1996, 2627, 1998, 2038, 2081, 2070, 11757, 16021, 6132, 13043, 1998, 10041, 2594, 8635, 2021, 2023, 1029, 2023, 2003, 2074, 4309, 12419, 2000, 1996, 23961, 2232, 2373, 1012, 1045, 1521, 1049, 16986, 2008, 2065, 2017, 1521, 2128, 3752, 2023, 2157, 2085, 1010, 2017, 2562, 2039, 2007, 1996, 2739, 1010, 2030, 2012, 2560, 2031, 1037, 11765, 2000, 3582, 4911, 2824, 1012, 2008, 2108, 1996, 2553, 1010, 2017, 2525, 2113, 2055, 1996, 3232, 1997, 20109, 2008, 3047, 1999, 2047, 2259, 1998, 2047, 3933, 2058, 1996, 5353, 1010, 2004, 2092, 2004, 1996,

In [69]:
# 1) Wrap tokenized data into a PyTorch Dataset
class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = list(labels)  # Ensure list of ints
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx]).long()
        return item
# Now instantiate (no more NameError!)
train_dataset = NewsDataset(train_encodings, train_labels)
val_dataset = NewsDataset(val_encodings, val_labels)
print(f"Dataset lengths: Train={len(train_dataset)}, Val={len(val_dataset)}")

Dataset lengths: Train=231, Val=58


In [70]:
# 2) Load DistilBERT for binary classification (0=real, 1=fake)
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [71]:
# 3) Metrics: accuracy + F1
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    p, r, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted", zero_division=0)
    macro_f1 = f1_score(labels, preds, average="macro", zero_division=0)
    return {
        "accuracy": acc,
        "weighted_f1": f1,
        "macro_f1": macro_f1,
        "precision_weighted": p,
        "recall_weighted": r
    }

In [72]:
# 4) Training configuration
batch_size = 16  # Adjust based on GPU memory (8 for low memory)
# training_args = TrainingArguments(
#     output_dir="./distilbert-fake-news",
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     logging_strategy="steps",
#     logging_steps=50,
#     learning_rate=5e-5,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
#     num_train_epochs=3,  # Start with 3; monitor loss
#     weight_decay=0.01,
#     warmup_ratio=0.1,
#     load_best_model_at_end=True,
#     metric_for_best_model="weighted_f1",
#     greater_is_better=True,
#     seed=42,
#     fp16=torch.cuda.is_available(),  # Enable mixed precision for faster training on GPU
# )


training_args = TrainingArguments(
    output_dir="./distilbert-fake-news",
    do_eval=True,                    # enable eval
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_steps=50
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [73]:
# 5) Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [74]:
# 6) Train
print("Starting training...")
train_result = trainer.train()
print("Training complete!")


Starting training...




Step,Training Loss


Training complete!


In [75]:
# 7) Evaluate
eval_result = trainer.evaluate()
print("Evaluation results:", eval_result)



Evaluation results: {'eval_loss': 0.6702495217323303, 'eval_accuracy': 0.603448275862069, 'eval_weighted_f1': 0.5401743084501706, 'eval_macro_f1': 0.4136263736263736, 'eval_precision_weighted': 0.5041050903119868, 'eval_recall_weighted': 0.603448275862069, 'eval_runtime': 22.1532, 'eval_samples_per_second': 2.618, 'eval_steps_per_second': 0.181, 'epoch': 3.0}


In [76]:
# Optional: Save the final model
trainer.save_model("./final-distilbert-fake-news-model")
print("Model saved to ./final-distilbert-fake-news-model")

Model saved to ./final-distilbert-fake-news-model
