In [2]:
# Install the notebook's dependencies. `%pip` (rather than `!pip`) installs into
# the environment of the running kernel instead of whatever `pip` the shell finds.
# NOTE(review): versions are unpinned; pin them once a known-good set is recorded.
%pip install -q torch transformers pandas scikit-learn numpy tqdm




In [3]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split

In [4]:
# Read the fake-news and true-news articles straight from the zipped CSV files.
df_fake = pd.read_csv("/content/Fake.csv.zip")
df_true = pd.read_csv("/content/True.csv.zip")

print("Fake rows:", len(df_fake))
print("True rows:", len(df_true))

# Show each preview as its own table. Ending the cell with a bare
# `a, b` tuple instead produces a single plain-text repr with wrapped columns.
display(df_fake.head(), df_true.head())

Fake rows: 23481
True rows: 21417


(                                               title  \
 0   Donald Trump Sends Out Embarrassing New Year’...   
 1   Drunk Bragging Trump Staffer Started Russian ...   
 2   Sheriff David Clarke Becomes An Internet Joke...   
 3   Trump Is So Obsessed He Even Has Obama’s Name...   
 4   Pope Francis Just Called Out Donald Trump Dur...   
 
                                                 text subject  \
 0  Donald Trump just couldn t wish all Americans ...    News   
 1  House Intelligence Committee Chairman Devin Nu...    News   
 2  On Friday, it was revealed that former Milwauk...    News   
 3  On Christmas day, Donald Trump announced that ...    News   
 4  Pope Francis used his annual Christmas Day mes...    News   
 
                 date  
 0  December 31, 2017  
 1  December 31, 2017  
 2  December 30, 2017  
 3  December 29, 2017  
 4  December 25, 2017  ,
                                                title  \
 0  As U.S. budget fight looms, Republicans flip t...   
 1  U

In [5]:
# Label convention: 0 = fake, 1 = true.
# `assign` returns labelled copies, so the frames that were loaded (and already
# displayed) by the earlier cell are not modified behind the reader's back.
df = (
    pd.concat(
        [df_fake.assign(label=0), df_true.assign(label=1)],
        ignore_index=True,
    )
    # Shuffle so the two classes are interleaved; the fixed seed keeps the
    # resulting row order the same on every run.
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("Total rows:", len(df))
df.head()

Total rows: 44898


Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",0
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",1
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",1
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",0
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",1


In [6]:
# Column names that may hold the article body, in order of preference.
possible_text_cols = ["text", "content"]

# Pick the first candidate that is present in the frame (None if neither is).
text_col = next(
    (candidate for candidate in possible_text_cols if candidate in df.columns),
    None,
)
if text_col is None:
    raise ValueError("No text column found! Send me df.columns output.")

# Make sure a title column exists so the selection below cannot fail on it.
if "title" not in df.columns:
    df["title"] = ""

# Keep only the three modelling columns and expose the body column as "text".
df = df[["title", text_col, "label"]].rename(columns={text_col: "text"})

df.head()

Unnamed: 0,title,text,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",0
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,1
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,1
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",0
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",1


In [7]:
# Write the title/text/label frame to train.csv, the file that train.py reads.
# index=False keeps the row index out of the CSV.
df.to_csv("train.csv", index=False)
print("Saved train.csv")

Saved train.csv


In [8]:
%%writefile dataset.py
import torch
from torch.utils.data import Dataset

class FakeNewsDataset(Dataset):
    """Dataset that tokenizes one article (title plus body) per item.

    Args:
        df: Frame with ``title``, ``text`` and ``label`` columns. Missing
            titles or bodies are replaced by empty strings.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')``;
            its result must expose ``.items()`` yielding tensors.
        max_length: Length passed to the tokenizer for truncation/padding.
    """

    def __init__(self, df, tokenizer, max_length=256):
        titles = df['title'].fillna('')
        bodies = df['text'].fillna('')
        # Title and body are joined with a single space to form the model input.
        self.texts = (titles + ' ' + bodies).tolist()
        self.labels = df['label'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of articles."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize article ``idx`` and return its tensors plus a ``labels`` entry."""
        text, label = self.texts[idx], self.labels[idx]

        encoded = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Drop the leading axis (when it has size one) from every encoded tensor.
        sample = {}
        for key, tensor in encoded.items():
            sample[key] = tensor.squeeze(0)
        sample['labels'] = torch.tensor(label, dtype=torch.long)
        return sample


Writing dataset.py


In [9]:

%%writefile model.py
import torch
import torch.nn as nn
from transformers import AutoModel

class RoBERTaFakeNewsClassifier(nn.Module):
    """Pretrained encoder followed by dropout and a two-logit linear head.

    Args:
        model_name: Checkpoint name handed to ``AutoModel.from_pretrained``.
        dropout: Dropout probability applied before the linear head.
    """

    def __init__(self, model_name="roberta-base", dropout=0.3):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # The head's input width follows the encoder's hidden size; it emits 2 logits.
        self.classifier = nn.Linear(self.roberta.config.hidden_size, 2)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and head on a batch.

        Returns:
            A dict with ``"logits"`` and ``"loss"``. ``"loss"`` is the
            cross-entropy of the logits against ``labels``, or ``None`` when
            ``labels`` is not given.
        """
        encoder_out = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # The hidden state at sequence position 0 is used to represent the whole input.
        first_token = encoder_out.last_hidden_state[:, 0, :]
        logits = self.classifier(self.dropout(first_token))

        if labels is None:
            return {"loss": None, "logits": logits}
        return {"loss": nn.CrossEntropyLoss()(logits, labels), "logits": logits}


Writing model.py


In [10]:

%%writefile eval.py
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def evaluate(model, dataloader, device):
    """Score ``model`` on every batch of ``dataloader``.

    The model is switched to eval mode and run without gradient tracking.
    Predictions are the argmax over the ``"logits"`` entry of the model output.

    Args:
        model: Callable module accepting the batch entries as keyword
            arguments and returning a mapping with a ``"logits"`` tensor.
        dataloader: Iterable of dicts of tensors; each must contain ``"labels"``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall`` and ``f1`` computed
        over all batches.
    """
    model.eval()
    gold, predicted = [], []

    with torch.no_grad():
        for batch in dataloader:
            # Labels are converted to NumPy before the batch is moved to `device`.
            batch_labels = batch["labels"].numpy()
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}
            logits = model(**on_device)["logits"]
            batch_preds = torch.argmax(logits, dim=1).cpu().numpy()

            gold.extend(batch_labels)
            predicted.extend(batch_preds)

    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {metric: scorer(gold, predicted) for metric, scorer in scorers}


Writing eval.py


In [11]:

%%writefile train.py
import os
import json
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split

from dataset import FakeNewsDataset
from model import RoBERTaFakeNewsClassifier
from eval import evaluate

def train(
    data_path="train.csv",
    model_name="roberta-base",
    epochs=2,
    batch_size=16,
    lr=2e-5,
    seed=42,
    output_dir="outputs",
):
    """Fine-tune the classifier and keep the checkpoint with the best validation F1.

    Called with no arguments, this uses the same data file, checkpoint name,
    epoch count, batch size, learning rate and output directory that were
    previously hard-coded.

    Args:
        data_path: CSV with ``title``, ``text`` and ``label`` columns.
        model_name: Pretrained checkpoint used for BOTH the tokenizer and the
            encoder, so the two cannot drift apart.
        epochs: Number of passes over the training split.
        batch_size: Training batch size; validation uses twice this value.
        lr: AdamW learning rate.
        seed: Seed for torch's RNG and for the train/validation split.
        output_dir: Directory receiving ``best_model.pt`` and ``metrics.json``.

    Returns:
        None. The best checkpoint and its metrics are written to ``output_dir``.
    """
    # Seed torch so head initialisation, dropout and shuffle order repeat across runs.
    torch.manual_seed(seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

    df = pd.read_csv(data_path)

    # Hold out 10% for validation, stratified on the label column.
    train_df, val_df = train_test_split(
        df, test_size=0.1, stratify=df["label"], random_state=seed
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_dataset = FakeNewsDataset(train_df, tokenizer)
    val_dataset = FakeNewsDataset(val_df, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=2 * batch_size)

    model = RoBERTaFakeNewsClassifier(model_name=model_name)
    model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    total_steps = len(train_loader) * epochs

    # Linear schedule whose first 10% of steps are warm-up.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps
    )

    # Prepare the output location once instead of on every improvement.
    os.makedirs(output_dir, exist_ok=True)
    model_path = os.path.join(output_dir, "best_model.pt")
    metrics_path = os.path.join(output_dir, "metrics.json")

    # None (rather than 0) so the first evaluated epoch is always saved, even
    # when its F1 is exactly 0; otherwise no checkpoint would ever be written.
    best_f1 = None

    for epoch in range(epochs):
        print(f"\n---- Epoch {epoch+1}/{epochs} ----")
        model.train()  # evaluate() leaves the model in eval mode
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            loss = model(**batch)["loss"]
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

        metrics = evaluate(model, val_loader, device)
        print(metrics)

        if best_f1 is None or metrics["f1"] > best_f1:
            best_f1 = metrics["f1"]
            torch.save(model.state_dict(), model_path)
            with open(metrics_path, "w") as f:
                json.dump(metrics, f, indent=2)

    print("Training complete. Best F1:", best_f1)

# Start training only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    train()


Writing train.py


In [12]:
# Run the train.py script (written by the %%writefile cell) as a shell command.
!python train.py


Using device: cuda
tokenizer_config.json: 100% 25.0/25.0 [00:00<00:00, 163kB/s]
config.json: 100% 481/481 [00:00<00:00, 3.40MB/s]
vocab.json: 100% 899k/899k [00:00<00:00, 3.63MB/s]
merges.txt: 100% 456k/456k [00:00<00:00, 2.72MB/s]
tokenizer.json: 100% 1.36M/1.36M [00:00<00:00, 4.03MB/s]
2025-12-06 19:38:12.786267: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765049892.819117    2919 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765049892.829185    2919 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1765049892.853418    2919 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the s