In [1]:
!pip install transformers torch pandas numpy scikit-learn

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
# AdamW is imported from torch.optim below; the transformers.optimization version is deprecated and is not used here.
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import numpy as np
from torch.optim import AdamW




In [2]:
# Load the two source files; both paths are relative to the working directory.
# `true` is later labelled 1 and `fake` is later labelled 0.
true = pd.read_csv('True.csv')
fake = pd.read_csv('Fake.csv')

In [None]:
# Binary target used everywhere downstream: 0 = fake article, 1 = real article.
fake['label'] = 0
true['label'] = 1

In [4]:
# Stack the real rows on top of the fake rows (row-wise concatenation).
df = pd.concat([true, fake], axis=0)

In [5]:
# NOTE(review): verbatim repeat of the concatenation in the previous cell; it
# rebuilds the same frame, so this cell can be deleted.
df = pd.concat([true, fake])

In [6]:
# Keep only the article text and its label. Rebinding `df` (instead of dropping
# with inplace=True) avoids mutating the concatenated frame object in place.
df = df.drop(columns=["title", "subject", "date"])
df.head(5)

Unnamed: 0,text,label
0,WASHINGTON (Reuters) - The head of a conservat...,1
1,WASHINGTON (Reuters) - Transgender people will...,1
2,WASHINGTON (Reuters) - The special counsel inv...,1
3,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,1


In [7]:
# Shuffle every row with a fixed seed (the concat placed all real rows before
# the fake ones), then renumber the index from 0.
df = (
    df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df.head(5)

Unnamed: 0,text,label
0,"Donald Trump s White House is in chaos, and th...",0
1,Now that Donald Trump is the presumptive GOP n...,0
2,Mike Pence is a huge homophobe. He supports ex...,0
3,SAN FRANCISCO (Reuters) - California Attorney ...,1
4,Twisted reasoning is all that comes from Pelos...,0


In [None]:

import os
import warnings
# NOTE: this silences every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# `df` was loaded, labelled, trimmed and shuffled in the cells above.

print(f"Dataset size: {len(df)}")
print(f"Class distribution:\n{df['label'].value_counts()}")

# Split data 70 / 15 / 15 into train / validation / test, stratified on the label.
# random_state pins both splits so a rerun reproduces the same partitions
# (same seed as the shuffle above).
train_df, temp_df = train_test_split(df, test_size=0.3, stratify=df['label'], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['label'], random_state=42)

# Custom Dataset Class
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one article per item.

    Args:
        texts: Indexable collection of article texts; ``texts[idx]`` must
            correspond to ``labels[idx]``.
        labels: Indexable collection of integer class labels.
        tokenizer: Callable tokenizer, invoked as ``tokenizer(text, ...)``.
            It must return a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of examples (one per entry in ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded example at ``idx`` as a dict of tensors.

        Keys: ``'input_ids'``, ``'attention_mask'`` (both flattened to 1-D)
        and ``'label'`` (scalar ``torch.long`` tensor).
        """
        # str() tolerates entries that are not already strings.
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # __call__, which is also the form predict_news uses.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # flatten() removes the leading batch axis so the DataLoader can
            # stack items into (batch, max_len) tensors.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Sequence length and batch size shared by all three data loaders.
MAX_LEN = 256
BATCH_SIZE = 16

# Initialize the tokenizer for the same 'bert-base-uncased' checkpoint the model is loaded from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create data loaders
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=True, num_workers=2):
    """Wrap a dataframe split in a NewsDataset and return a DataLoader over it.

    Args:
        df: Frame with a ``text`` column and a ``label`` column.
        tokenizer: Tokenizer handed to NewsDataset.
        max_len: Length each sequence is padded or truncated to.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles the examples. Defaults to True
            (the previously hard-coded value); pass False for evaluation splits.
        num_workers: Number of loader worker processes. Defaults to 2, the
            previously hard-coded value.

    Returns:
        A DataLoader whose batches are dicts with ``input_ids``,
        ``attention_mask`` and ``label`` entries.
    """
    dataset = NewsDataset(
        # .values hands over plain arrays, so items are looked up by position
        # rather than by the frame's index labels.
        texts=df.text.values,
        labels=df.label.values,
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)

# Only the training split needs a new order each epoch; validation and test
# are read in a fixed order.
train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(val_df, tokenizer, MAX_LEN, BATCH_SIZE, shuffle=False)
test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE, shuffle=False)

# Initialize BERT model: pretrained encoder plus a 2-class head (0 = fake, 1 = real).
# Attention maps and hidden states are not returned, only loss/logits.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=False,
    output_attentions=False,
    num_labels=2,
)

# Training setup
epochs = 3

# Prefer the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")
model = model.to(device)

# torch.optim.AdamW takes no `correct_bias` argument, so only the learning rate is set.
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training function
def train_epoch(model, data_loader, optimizer, device):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments; its output must expose ``.loss``.
        data_loader: Iterable of batches, each a dict with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        ``np.mean`` of the per-batch loss values.
    """
    model.train()
    batch_losses = []

    for batch in data_loader:
        optimizer.zero_grad()

        # Move the three batch tensors onto the target device.
        ids, mask, targets = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'label')
        )

        # Passing labels makes the model compute the loss itself.
        result = model(input_ids=ids, attention_mask=mask, labels=targets)
        batch_loss = result.loss
        batch_losses.append(batch_loss.item())

        batch_loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

    return np.mean(batch_losses)

# Evaluation function
def eval_model(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader`` with gradients disabled.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``; its
            output must expose ``.logits``.
        data_loader: Iterable of batches, each a dict with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(report, accuracy)``: the results of ``classification_report``
        and ``accuracy_score``, each called with the true labels first and the
        predictions second.
    """
    model.eval()
    y_true, y_pred = [], []

    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)

            logits = model(input_ids=ids, attention_mask=mask).logits

            # The index of the largest logit in each row is the predicted class.
            batch_preds = torch.max(logits, dim=1).indices

            y_pred += batch_preds.cpu().tolist()
            y_true += targets.cpu().tolist()

    report = classification_report(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    return report, accuracy

# Training loop: `epochs` full passes over the training loader, with validation
# metrics printed after every pass.
for epoch in range(epochs):
    print(f'\nEpoch {epoch + 1}/{epochs}')
    # train_epoch returns the mean batch loss for this pass.
    train_loss = train_epoch(model, train_data_loader, optimizer, device)
    print(f'Train loss: {train_loss:.4f}')

    # eval_model returns (classification report, accuracy).
    val_report, val_acc = eval_model(model, val_data_loader, device)
    print(f'Validation accuracy: {val_acc:.4f}')
    print(f'Validation classification report:\n{val_report}')

# Final evaluation on the held-out test set, run once after the last epoch.
test_report, test_acc = eval_model(model, test_data_loader, device)
print(f'\nTest accuracy: {test_acc:.4f}')
print(f'Test classification report:\n{test_report}')

# Save the fine-tuned model and its tokenizer into the same directory;
# predict_news loads both from this directory name by default.
output_dir = 'bert_fake_news_detector'
# exist_ok=True lets the cell be rerun without failing on an existing directory.
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"\nModel saved to {output_dir}")

# Create a function for inference
def predict_news(text, model_dir='bert_fake_news_detector', max_len=256):
    """Classify one piece of text with the fine-tuned model stored in ``model_dir``.

    The model and tokenizer are reloaded from ``model_dir`` on every call.

    Args:
        text: Text to classify.
        model_dir: Directory containing the saved model and tokenizer.
        max_len: Length the tokenized input is padded or truncated to.

    Returns:
        dict with three entries:
            'real': probability of class 1 (real),
            'fake': probability of class 0 (fake),
            'prediction': 'Fake' when the fake probability exceeds 0.5,
                otherwise 'Real'.
    """
    # Load model and tokenizer
    model = BertForSequenceClassification.from_pretrained(model_dir)
    tokenizer = BertTokenizer.from_pretrained(model_dir)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Preprocess text
    inputs = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors="pt"
    ).to(device)

    # Predict
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        probs = probs.cpu().numpy().flatten()

    # The training labels are 0 = fake and 1 = real, so index 0 holds the fake
    # probability and index 1 the real probability.
    fake_prob, real_prob = probs[0], probs[1]

    return {
        'real': real_prob,
        'fake': fake_prob,
        'prediction': 'Fake' if fake_prob > 0.5 else 'Real'
    }

# Smoke-test predict_news on one made-up headline; the call uses the default
# model_dir, i.e. the directory the model was saved to above.
sample_text = "Scientists confirm that eating chocolate daily improves longevity by 50%"
result = predict_news(sample_text)
print(f"\nSample prediction:")
print(f"Text: {sample_text}")
print(f"Prediction: {result['prediction']} (Real: {result['real']:.4f}, Fake: {result['fake']:.4f})")

Dataset size: 44898
Class distribution:
label
0    23481
1    21417
Name: count, dtype: int64


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cpu

Epoch 1/3


In [None]:
# Check for overlapping articles between splits (leakage check): identical
# article text appearing in more than one split.
# set() collapses repeats inside a split, so each count below is the number of
# distinct texts shared by the two splits.
train_texts = set(train_df['text'])
val_texts = set(val_df['text'])
test_texts = set(test_df['text'])

print(f"Train-Val overlap: {len(train_texts & val_texts)} articles")
print(f"Train-Test overlap: {len(train_texts & test_texts)} articles")
print(f"Val-Test overlap: {len(val_texts & test_texts)} articles")