# Fake News Detection — BERT + CNN + Metadata (Hybrid Ensemble)

This Google Colab notebook trains a **BERT + metadata** model and a **CNN** text model, then builds a **hybrid ensemble**. Dataset: **clmentbisaillon/fake-and-real-news-dataset** (Fake.csv + True.csv).

In [None]:
import zipfile, os
from IPython.display import display
import pandas as pd

print('📦 Upload your dataset ZIP file (e.g., archive.zip) in the left sidebar Files section.')
zip_path = '/content/archive.zip'
extract_path = '/content/fake_news_dataset'

# Define the CSV paths unconditionally. Later cells test them with os.path.exists();
# if they were only assigned after a successful extraction, a missing upload would
# surface as a NameError instead of the intended "dataset not found" error.
fake_path = os.path.join(extract_path, 'Fake.csv')
true_path = os.path.join(extract_path, 'True.csv')

if os.path.exists(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print('✅ Dataset extracted successfully!')
else:
    print('⚠️ Please upload archive.zip first.')

# Check extracted files
if os.path.exists(extract_path):
    print('📂 Extracted files:', os.listdir(extract_path))

    # Quick data preview (best effort: a failure here is reported, not raised)
    try:
        # nrows=5 parses only the rows that are shown instead of loading each full CSV.
        fake_df = pd.read_csv(fake_path, nrows=5)
        true_df = pd.read_csv(true_path, nrows=5)
        print('\n📰 Preview of Fake.csv:')
        display(fake_df)
        print('\n📰 Preview of True.csv:')
        display(true_df)
    except Exception as e:
        print('⚠️ Error reading CSV files:', e)
else:
    print('⚠️ Dataset folder not found after extraction.')


In [None]:
import os
print('🔍 Checking dataset availability before training...')
# Fail fast: stop here, rather than deep inside preprocessing, if either CSV is absent.
if not (os.path.exists(fake_path) and os.path.exists(true_path)):
    raise FileNotFoundError('❌ Dataset files not found! Please ensure archive.zip was uploaded and extracted correctly.')
print('✅ Both Fake.csv and True.csv found! Ready for preprocessing and training.')


In [None]:
# Install required packages (run once)
# First command: transformers plus the PyTorch stack, upgraded to the newest release (--upgrade).
# Second command: TensorFlow/Keras, scikit-learn and helper libraries, plus the kaggle CLI
# that the dataset download cell invokes. -q keeps pip's output short.
# NOTE(review): no versions are pinned, so the installed releases can differ between runs.
!pip install -q transformers torch torchvision torchaudio --upgrade
!pip install -q tensorflow keras scikit-learn tqdm nltk gensim kaggle
print('Packages installed')

## Kaggle dataset

Upload your `kaggle.json` (API token) when prompted, so the notebook can download the dataset automatically.

Steps:
1. Go to https://www.kaggle.com/ -> Account -> Create API token -> download `kaggle.json`.
2. Upload the file in the next cell.

In [None]:
from google.colab import files
print('Please upload your kaggle.json (Kaggle API token). If you already uploaded, ignore this step.')
# Opens the Colab upload widget; every uploaded file name is echoed back for confirmation.
uploaded = files.upload()  # choose kaggle.json here
for uploaded_name in uploaded:
    print('Uploaded file:', uploaded_name)

In [None]:
# Move kaggle.json to ~/.kaggle and set permissions
import os, shutil
if not os.path.exists('kaggle.json'):
    # Nothing to move: the token may already be in place from a previous run.
    print('kaggle.json not found; if you already placed it manually, that is fine.')
else:
    os.makedirs('/root/.kaggle', exist_ok=True)
    shutil.move('kaggle.json', '/root/.kaggle/kaggle.json')
    # 0o600: readable and writable by the owner only.
    os.chmod('/root/.kaggle/kaggle.json', 0o600)
    print('kaggle.json moved to /root/.kaggle/kaggle.json')

In [None]:
# Download dataset from Kaggle (clmentbisaillon/fake-and-real-news-dataset)
# Uses the kaggle CLI installed by the pip cell.
# NOTE(review): presumably requires the API token placed at /root/.kaggle/kaggle.json — confirm.
# unzip flags: -o overwrite existing files without prompting, -q quiet, -d dataset = target folder.
!kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset -q
!unzip -o -q fake-and-real-news-dataset.zip -d dataset
print('Dataset downloaded and extracted to ./dataset')

In [None]:
# Imports and NLTK download
import pandas as pd, numpy as np, torch, re, os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import matplotlib.pyplot as plt
import nltk
# Download the NLTK stop-word corpus; stopwords.words('english') is called later
# when the text is cleaned for the CNN model.
nltk.download('stopwords')
from nltk.corpus import stopwords
print('Imports ready')

In [None]:
# Load Fake.csv and True.csv and combine into a single dataframe.
# Two earlier cells can provide the CSVs: the Kaggle download extracts into ./dataset,
# while the manual archive.zip upload extracts into /content/fake_news_dataset.
# Use the first location that actually holds both files (./dataset keeps priority).
data_dir = None
for candidate_dir in ('dataset', '/content/fake_news_dataset'):
    if all(os.path.exists(os.path.join(candidate_dir, name)) for name in ('Fake.csv', 'True.csv')):
        data_dir = candidate_dir
        break
if data_dir is None:
    raise FileNotFoundError(
        "Fake.csv/True.csv not found in 'dataset' or '/content/fake_news_dataset'. "
        'Run the Kaggle download cell or upload archive.zip first.'
    )

fake_df = pd.read_csv(os.path.join(data_dir, 'Fake.csv'))
true_df = pd.read_csv(os.path.join(data_dir, 'True.csv'))

# String labels; they are mapped to integers in the preprocessing cell.
fake_df['label'] = 'FAKE'
true_df['label'] = 'REAL'

# Concatenate, then shuffle all rows with a fixed seed so the row order is reproducible.
df = pd.concat([fake_df, true_df], ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)
print('Total samples:', len(df))
df.head()

In [None]:
# Preprocess: build the `content` column (title + single space + body) that both models consume.
# Missing titles/bodies are treated as empty strings before concatenation.
df['content'] = df['title'].fillna('') + ' ' + df['text'].fillna('')

# Remove rows without content, collapse exact duplicates, and renumber the index from 0.
df = df.dropna(subset=['content'])
df = df.drop_duplicates(subset=['content'])
df = df.reset_index(drop=True)

# Outlet keywords, checked in order; the first pattern that matches wins.
# \b anchors make each keyword match only as a whole word, so e.g. "anything"
# (which contains "nyt") or "foxconn" (which contains "fox") are not mistaken
# for an outlet name.
_SOURCE_PATTERNS = (
    (re.compile(r'\b(?:nytimes|nyt)\b'), 'nytimes.com'),
    (re.compile(r'\bbbc\b'), 'bbc.com'),
    (re.compile(r'\bcnn\b'), 'cnn.com'),
    (re.compile(r'\bfox(?:news)?\b'), 'foxnews.com'),
)

def pseudo_source_from_title(title):
    """Guess a news outlet from keywords in a headline.

    Args:
        title: The headline. Anything that is not a ``str`` (e.g. ``None`` or NaN)
            is treated as having no recognizable source.

    Returns:
        One of 'nytimes.com', 'bbc.com', 'cnn.com', 'foxnews.com', or 'unknown'
        when no outlet keyword appears as a whole word in the lower-cased title.
    """
    if not isinstance(title, str):
        return 'unknown'
    lowered = title.lower()
    for pattern, source in _SOURCE_PATTERNS:
        if pattern.search(lowered):
            return source
    return 'unknown'

# Tag each row with its guessed outlet, then translate the outlet into a numeric score.
df['source'] = df['title'].map(pseudo_source_from_title)
credibility_scores = {
    'bbc.com': 0.95,
    'nytimes.com': 0.9,
    'cnn.com': 0.85,
    'foxnews.com': 0.6,
    'unknown': 0.5,
}
# Any source missing from the table falls back to 0.5, the same value as 'unknown'.
df['credibility'] = df['source'].map(credibility_scores).fillna(0.5)

# Integer target used by both models: REAL -> 0, FAKE -> 1.
df['label_enc'] = df['label'].map({'REAL': 0, 'FAKE': 1})

print('Prepared dataframe with credibility metadata')
df[['title','source','credibility','label']].head()

In [None]:
# BERT tokenizer encodings (using transformers)
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# A single call splits texts, labels and credibility scores together, so the three
# train arrays (and the three test arrays) stay row-aligned. 80/20 split, fixed seed.
(
    train_texts, test_texts,
    y_train, y_test,
    train_meta, test_meta,
) = train_test_split(
    df['content'].values,
    df['label_enc'].values,
    df['credibility'].values,
    test_size=0.2,
    random_state=42,
)

def _encode_for_bert(texts):
    """Tokenize the given texts with truncation and padding enabled, capped at 128 tokens."""
    return tokenizer(list(texts), truncation=True, padding=True, max_length=128)

train_encodings = _encode_for_bert(train_texts)
test_encodings = _encode_for_bert(test_texts)

from torch.utils.data import Dataset, DataLoader
class NewsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with a label and a metadata value.

    Args:
        encodings: Mapping from field name to an indexable collection with one entry per sample.
        labels: Indexable collection of labels; each is converted with ``int()``.
        metadata: Indexable collection of numeric values, one per sample.
    """

    def __init__(self, encodings, labels, metadata):
        self.encodings = encodings
        self.labels = labels
        self.metadata = metadata

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for sample ``idx``: every encoding field, 'labels', 'metadata'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(int(self.labels[idx]))
        sample['metadata'] = torch.tensor(self.metadata[idx], dtype=torch.float)
        return sample

# Wrap each split's encodings, labels and credibility scores for use with a DataLoader.
train_dataset = NewsDataset(encodings=train_encodings, labels=y_train, metadata=train_meta)
test_dataset = NewsDataset(encodings=test_encodings, labels=y_test, metadata=test_meta)
print('Datasets ready', len(train_dataset), len(test_dataset))

In [None]:
# BERT model with metadata head
import torch.nn as nn
from transformers import BertModel

class BertWithMetadata(nn.Module):
    """BERT encoder whose pooled output is concatenated with one scalar metadata feature.

    The 768-dim pooled vector plus the metadata value (769 features) goes through
    dropout, a 128-unit ReLU layer, and a final 2-way linear layer.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        self.fc1 = nn.Linear(768 + 1, 128)
        self.fc2 = nn.Linear(128, 2)

    def forward(self, input_ids, attention_mask, metadata):
        """Return unnormalized 2-class logits.

        Args:
            input_ids: Token ids passed to BERT.
            attention_mask: Attention mask passed to BERT.
            metadata: 1-D tensor with one value per sample; a trailing dimension is
                added here so it can be concatenated with the pooled output.
        """
        pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        meta_column = metadata.unsqueeze(1)
        features = torch.cat([pooled, meta_column], dim=1)
        hidden = torch.relu(self.fc1(self.dropout(features)))
        return self.fc2(hidden)

# Use the GPU when one is available; otherwise everything runs on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model_bert = BertWithMetadata()
model_bert = model_bert.to(device)
print('BERT model ready on', device)

In [None]:
# Training loop for BERT (small demo: 2 epochs)
# The optimizer comes from torch.optim: transformers' own AdamW was deprecated in favour
# of the PyTorch implementation and is absent from recent transformers releases, which the
# unpinned `--upgrade` install pulls in. eps and weight_decay are set explicitly to the
# values the transformers implementation used by default, so the update rule is unchanged.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model_bert.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
model_bert.train()  # enable dropout for training

for epoch in range(2):
    total_loss = 0.0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}'):
        optimizer.zero_grad()
        # Move the batch to the same device as the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        metadata = batch['metadata'].to(device)
        labels = batch['labels'].to(device)
        outputs = model_bert(input_ids, attention_mask, metadata)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean of the per-batch losses for this epoch.
    print(f'Epoch {epoch+1} average loss: {total_loss/len(train_loader):.4f}')

torch.save(model_bert.state_dict(), 'bert_metadata_model.pt')
print('Saved bert_metadata_model.pt')

In [None]:
# CNN model (text-only) using Keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

# English stop words from NLTK, stored as a set for the membership test in clean_text_simple.
stop = set(stopwords.words('english'))
def clean_text_simple(text, stopword_set=None):
    """Normalize text for the CNN tokenizer: letters only, lower-case, stop words removed.

    Every run of non-letter characters (digits, punctuation, newlines, tabs, ...) is
    replaced by a single space rather than deleted, so words that were separated only
    by such characters (e.g. "well-known" or "end.\\nNext") are not fused together.

    Args:
        text: Value to clean; it is converted with ``str()`` first.
        stopword_set: Optional collection of lower-case words to drop. Defaults to the
            module-level ``stop`` set.

    Returns:
        The remaining lower-case words joined by single spaces.
    """
    if stopword_set is None:
        stopword_set = stop
    letters_only = re.sub(r'[^a-zA-Z]+', ' ', str(text)).lower()
    return ' '.join(word for word in letters_only.split() if word not in stopword_set)

df['clean_text'] = df['content'].apply(clean_text_simple)

MAX_WORDS = 5000   # vocabulary size kept by the Keras tokenizer
MAX_LEN = 200      # every sequence is padded/truncated to this many tokens
y_all = np.array(df['label_enc'])

# Split row positions first. With the same number of rows, test_size and random_state as
# the BERT split, train_test_split produces the same partition, so X_test_cnn / y_test_cnn
# stay row-aligned with the BERT test set used by the ensemble.
train_idx, test_idx = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42)

# Fit the vocabulary on training rows only, so word frequencies from the test split
# do not influence which words the model gets to see.
tokenizer_cnn = Tokenizer(num_words=MAX_WORDS)
tokenizer_cnn.fit_on_texts(df['clean_text'].iloc[train_idx])

X = tokenizer_cnn.texts_to_sequences(df['clean_text'])
X = pad_sequences(X, maxlen=MAX_LEN)

X_train_cnn, X_test_cnn = X[train_idx], X[test_idx]
y_train_cnn, y_test_cnn = y_all[train_idx], y_all[test_idx]

# Imported here because it is only needed for the explicit input specification below.
from tensorflow.keras import Input

# 1-D CNN text classifier: embedding -> convolution -> global max pooling -> dense head.
# The input shape is declared with an explicit Input layer; the Embedding `input_length`
# argument is deprecated and ignored by Keras 3.
model_cnn = Sequential([
    Input(shape=(MAX_LEN,), dtype='int32'),
    Embedding(MAX_WORDS, 100),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')   # single sigmoid unit: probability of label 1 (FAKE)
])

model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# validation_split=0.2 holds out part of the training data for per-epoch validation.
history = model_cnn.fit(X_train_cnn, y_train_cnn, epochs=5, batch_size=64, validation_split=0.2)
model_cnn.save('cnn_fake_news_model.h5')
print('Saved cnn_fake_news_model.h5')

In [None]:
# Evaluate BERT on test set and CNN, then ensemble
# Weights of the two models in the averaged probability.
BERT_WEIGHT = 0.7
CNN_WEIGHT = 0.3

model_bert.eval()  # disable dropout for inference
bert_probs = []
with torch.no_grad():
    for batch in DataLoader(test_dataset, batch_size=8):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        metadata = batch['metadata'].to(device)
        outputs = model_bert(input_ids, attention_mask, metadata)
        # Column 1 of the softmax is the probability of label 1 (FAKE).
        probs = torch.softmax(outputs, dim=1)[:,1]
        bert_probs.extend(probs.cpu().numpy())

cnn_probs = model_cnn.predict(X_test_cnn).flatten()

# Compare only the rows for which both models produced a prediction.
n = min(len(bert_probs), len(cnn_probs))
bert_probs = np.array(bert_probs[:n])
cnn_probs = np.array(cnn_probs[:n])
labels_for_eval = np.asarray(y_test_cnn[:n])

# The two models were split by separate train_test_split calls. Averaging their
# probabilities row by row is only meaningful if both test sets hold the same rows in
# the same order, so verify that their labels agree instead of silently mixing rows.
if not np.array_equal(np.asarray(y_test[:n]), labels_for_eval):
    raise ValueError(
        'BERT and CNN test splits are not aligned: their label arrays differ. '
        'Re-run both split cells on the same dataframe with the same test_size and random_state.'
    )

final_probs = BERT_WEIGHT*bert_probs + CNN_WEIGHT*cnn_probs
final_preds = (final_probs > 0.5).astype(int)

print('BERT (sample) Accuracy:', accuracy_score(labels_for_eval, (bert_probs>0.5).astype(int)))
print('CNN (sample) Accuracy:', accuracy_score(labels_for_eval, (cnn_probs>0.5).astype(int)))
print('Ensemble Accuracy:', accuracy_score(labels_for_eval, final_preds))
print('\nClassification report (Ensemble):\n', classification_report(labels_for_eval, final_preds))