<a href="https://colab.research.google.com/github/subha19012005/Ai-Fake-News-Detection/blob/main/Final_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# 0️⃣ Install required packages
#!pip install datasets scikit-learn pandas requests

# 1️⃣ Load LIAR2
from datasets import load_dataset
import pandas as pd

dataset_liar2 = load_dataset("chengxuphd/liar2")
liar_texts = dataset_liar2['train']['statement']
liar_labels = dataset_liar2['train']['label']

def binary_label(label):
    return 1 if label == 1 else 0

liar_labels_binary = [binary_label(l) for l in liar_labels]

df_liar2 = pd.DataFrame({
    'content': liar_texts,
    'label': liar_labels_binary
})
print("LIAR2 sample:")
print(df_liar2.head())

# 2️⃣ Load FakeNewsNet CSVs
def load_fakenews_csv(file_path):
    df = pd.read_csv(file_path)
    if 'text' in df.columns and 'title' in df.columns:
        df['content'] = df['title'].fillna('') + " " + df['text'].fillna('')
    elif 'title' in df.columns and 'news_url' in df.columns:
        df['content'] = df['title'].fillna('')
    else:
        df['content'] = df['content'].fillna('')
    df['label'] = 0 if 'fake' in file_path.lower() else 1
    return df[['content', 'label']]

files = [
    "BuzzFeed_fake_news_content.csv",
    "BuzzFeed_real_news_content.csv",
    "PolitiFact_fake_news_content.csv",
    "PolitiFact_real_news_content.csv",
    "gossipcop_fake.csv",
    "gossipcop_real.csv",
    "politifact_fake.csv",
    "politifact_real.csv"
]

dfs = [load_fakenews_csv(f) for f in files]
df_fakenewsnet = pd.concat(dfs, ignore_index=True)
print("FakeNewsNet sample:")
print(df_fakenewsnet.head())

# 3️⃣ Combine LIAR2 + FakeNewsNet
df_combined = pd.concat([df_liar2, df_fakenewsnet], ignore_index=True)
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)
print("Total combined records:", len(df_combined))

# 4️⃣ Load sources_names mapping
df_sources = pd.read_csv("sources_names.csv")  # columns: 'sources', 'label'
print(df_sources.head())

# 5️⃣ Fetch live news from newsdata.io
import requests
from urllib.parse import urlparse

api_key = "pub_410dc8b7c5c7484fb4621c94955c1172"
url = f"https://newsdata.io/api/1/news?apikey={api_key}&language=en"
response = requests.get(url)
data = response.json()

articles = []
for news in data['results'][:20]:
    title = news.get('title', '')
    description = news.get('description', '')
    link = news.get('link', '')
    content = title + " " + description
    articles.append({"content": content, "link": link})

df_live = pd.DataFrame(articles)

# 6️⃣ Weak-label live news using sources_names
df_live['domain'] = df_live['link'].apply(lambda x: urlparse(x).netloc)
df_live = df_live.merge(df_sources, left_on='domain', right_on='source', how='left')
df_live['label'] = df_live['label'].fillna(0).astype(int)  # default 0 if source not found
df_live = df_live[['content','label']]
print("Live news sample:")
print(df_live.head())

# 7️⃣ Combine everything into final dataset
df_final = pd.concat([df_combined, df_live], ignore_index=True)
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)
print("Total records in final dataset:", len(df_final))

# 8️⃣ Train TF-IDF + Logistic Regression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

X = df_final['content']
y = df_final['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

clf = LogisticRegression(max_iter=1000, class_weight='balanced')
clf.fit(X_train_tfidf, y_train)

preds = clf.predict(X_test_tfidf)
print("Final Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))

# 9️⃣ Predict new headlines
new_articles = ["Breaking: Scientists discovered water on Mars!"]
for article in new_articles:
    pred = clf.predict(vectorizer.transform([article]))[0]
    print(article)
    print("Prediction:", "Real" if pred==1 else "Fake")
    print("-"*50)
    import joblib

# Save TF-IDF vectorizer
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

# Save trained Logistic Regression model
joblib.dump(clf, "logreg_model.pkl")

print("Model and vectorizer saved!")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


LIAR2 sample:
                                             content  label
0  90 percent of Americans "support universal bac...      0
1  Last year was one of the deadliest years ever ...      1
2  Bernie Sanders's plan is "to raise your taxes ...      0
3  Voter ID is supported by an overwhelming major...      0
4  Says Barack Obama "robbed Medicare (of) $716 b...      0
FakeNewsNet sample:
                                             content  label
0  Proof The Mainstream Media Is Manipulating The...      0
1  Charity: Clinton Foundation Distributed “Water...      0
2  A Hillary Clinton Administration May be Entire...      0
3  Trump’s Latest Campaign Promise May Be His Mos...      0
4  Website is Down For Maintenance Website is Dow...      0
Total combined records: 41987
            source  label
0          bbc.com      1
1      reuters.com      1
2          cnn.com      1
3  theguardian.com      1
4      nytimes.com      1
Live news sample:
                                          

In [3]:
from google.colab import files

files.download("tfidf_vectorizer.pkl")
files.download("logreg_model.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#using hugging face

In [None]:
# Uninstall old torch + transformers
!pip uninstall -y torch torchvision torchaudio transformers

# Install compatible versions
!pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0
!pip install transformers==4.38.0
!pip install accelerate datasets scikit-learn


Found existing installation: torch 2.8.0+cu126
Uninstalling torch-2.8.0+cu126:
  Successfully uninstalled torch-2.8.0+cu126
Found existing installation: torchvision 0.23.0+cu126
Uninstalling torchvision-0.23.0+cu126:
  Successfully uninstalled torchvision-0.23.0+cu126
Found existing installation: torchaudio 2.8.0+cu126
Uninstalling torchaudio-2.8.0+cu126:
  Successfully uninstalled torchaudio-2.8.0+cu126
Found existing installation: transformers 4.56.1
Uninstalling transformers-4.56.1:
  Successfully uninstalled transformers-4.56.1
Collecting torch==2.2.0
  Downloading torch-2.2.0-cp312-cp312-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchvision==0.17.0
  Downloading torchvision-0.17.0-cp312-cp312-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecting torchaudio==2.2.0
  Downloading torchaudio-2.2.0-cp312-cp312-manylinux1_x86_64.whl.metadata (6.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86

In [1]:
!pip install transformers==4.38.0


Collecting transformers==4.38.0
  Downloading transformers-4.38.0-py3-none-any.whl.metadata (131 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/131.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.1/131.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers==4.38.0)
  Downloading tokenizers-0.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.38.0-py3-none-any.whl (8.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers

In [2]:
from datasets import load_dataset
import pandas as pd

dataset_liar2 = load_dataset("chengxuphd/liar2")
liar_texts = dataset_liar2['train']['statement']
liar_labels = dataset_liar2['train']['label']

def binary_label(label):
    return 1 if label == 1 else 0

liar_labels_binary = [binary_label(l) for l in liar_labels]

df_liar2 = pd.DataFrame({
    'content': liar_texts,
    'label': liar_labels_binary
})
print("LIAR2 sample:")
print(df_liar2.head())

# 2️⃣ Load FakeNewsNet CSVs
def load_fakenews_csv(file_path):
    df = pd.read_csv(file_path)
    if 'text' in df.columns and 'title' in df.columns:
        df['content'] = df['title'].fillna('') + " " + df['text'].fillna('')
    elif 'title' in df.columns and 'news_url' in df.columns:
        df['content'] = df['title'].fillna('')
    else:
        df['content'] = df['content'].fillna('')
    df['label'] = 0 if 'fake' in file_path.lower() else 1
    return df[['content', 'label']]

files = [
    "BuzzFeed_fake_news_content.csv",
    "BuzzFeed_real_news_content.csv",
    "PolitiFact_fake_news_content.csv",
    "PolitiFact_real_news_content.csv",
    "gossipcop_fake.csv",
    "gossipcop_real.csv",
    "politifact_fake.csv",
    "politifact_real.csv"
]

dfs = [load_fakenews_csv(f) for f in files]
df_fakenewsnet = pd.concat(dfs, ignore_index=True)
print("FakeNewsNet sample:")
print(df_fakenewsnet.head())

# 3️⃣ Combine LIAR2 + FakeNewsNet
df_combined = pd.concat([df_liar2, df_fakenewsnet], ignore_index=True)
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)
print("Total combined records:", len(df_combined))

# 4️⃣ Load sources_names mapping
df_sources = pd.read_csv("sources_names.csv")  # columns: 'sources', 'label'
print(df_sources.head())

# 5️⃣ Fetch live news from newsdata.io
import requests
from urllib.parse import urlparse

api_key = "pub_410dc8b7c5c7484fb4621c94955c1172"
url = f"https://newsdata.io/api/1/news?apikey={api_key}&language=en"
response = requests.get(url)
data = response.json()

articles = []
for news in data['results'][:20]:
    title = news.get('title', '')
    description = news.get('description', '')
    link = news.get('link', '')
    content = title + " " + description
    articles.append({"content": content, "link": link})

df_live = pd.DataFrame(articles)

# 6️⃣ Weak-label live news using sources_names
df_live['domain'] = df_live['link'].apply(lambda x: urlparse(x).netloc)
df_live = df_live.merge(df_sources, left_on='domain', right_on='source', how='left')
df_live['label'] = df_live['label'].fillna(0).astype(int)  # default 0 if source not found
df_live = df_live[['content','label']]
print("Live news sample:")
print(df_live.head())

# 7️⃣ Combine everything into final dataset
df_final = pd.concat([df_combined, df_live], ignore_index=True)
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)
print("Total records in final dataset:", len(df_final))

#model
from datasets import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Convert to HF dataset
dataset = Dataset.from_pandas(df_final)

# Tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["content"], padding="max_length", truncation=True, max_length=256)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Split train/test
train_test = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test["train"]
test_dataset = train_test["test"]

# Load model
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Training setup
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,   # Colab GPU can handle 16
    per_device_eval_batch_size=16,
    num_train_epochs=2,  # try 2–3
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train
trainer.train()

# Evaluate
results = trainer.evaluate()
print("Evaluation:", results)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


LIAR2 sample:
                                             content  label
0  90 percent of Americans "support universal bac...      0
1  Last year was one of the deadliest years ever ...      1
2  Bernie Sanders's plan is "to raise your taxes ...      0
3  Voter ID is supported by an overwhelming major...      0
4  Says Barack Obama "robbed Medicare (of) $716 b...      0
FakeNewsNet sample:
                                             content  label
0  Proof The Mainstream Media Is Manipulating The...      0
1  Charity: Clinton Foundation Distributed “Water...      0
2  A Hillary Clinton Administration May be Entire...      0
3  Trump’s Latest Campaign Promise May Be His Mos...      0
4  Website is Down For Maintenance Website is Dow...      0
Total combined records: 41987
            source  label
0          bbc.com      1
1      reuters.com      1
2          cnn.com      1
3  theguardian.com      1
4      nytimes.com      1
Live news sample:
                                          


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.12/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.12/dist-package

0it [00:00, ?it/s]

RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
cannot import name 'EncoderDecoderCache' from 'transformers' (/usr/local/lib/python3.12/dist-packages/transformers/__init__.py)

In [None]:
#Using LIghtBGM

In [6]:
# 0️⃣ Install required packages
!pip install datasets scikit-learn pandas requests lightgbm

# 1️⃣ Imports
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report
import requests
from urllib.parse import urlparse

# 2️⃣ Load LIAR2 dataset
dataset_liar2 = load_dataset("chengxuphd/liar2")
liar_texts = dataset_liar2['train']['statement']
liar_labels = dataset_liar2['train']['label']

def binary_label(label):
    return 1 if label == 1 else 0

liar_labels_binary = [binary_label(l) for l in liar_labels]

df_liar2 = pd.DataFrame({
    'content': liar_texts,
    'label': liar_labels_binary
})
print("LIAR2 sample:")
print(df_liar2.head())

# 3️⃣ Load FakeNewsNet CSVs
def load_fakenews_csv(file_path):
    df = pd.read_csv(file_path)
    if 'text' in df.columns and 'title' in df.columns:
        df['content'] = df['title'].fillna('') + " " + df['text'].fillna('')
    elif 'title' in df.columns and 'news_url' in df.columns:
        df['content'] = df['title'].fillna('')
    else:
        df['content'] = df['content'].fillna('')
    df['label'] = 0 if 'fake' in file_path.lower() else 1
    return df[['content', 'label']]

files = [
    "BuzzFeed_fake_news_content.csv",
    "BuzzFeed_real_news_content.csv",
    "PolitiFact_fake_news_content.csv",
    "PolitiFact_real_news_content.csv",
    "gossipcop_fake.csv",
    "gossipcop_real.csv",
    "politifact_fake.csv",
    "politifact_real.csv"
]

dfs = [load_fakenews_csv(f) for f in files]
df_fakenewsnet = pd.concat(dfs, ignore_index=True)
print("FakeNewsNet sample:")
print(df_fakenewsnet.head())

# 4️⃣ Combine LIAR2 + FakeNewsNet
df_combined = pd.concat([df_liar2, df_fakenewsnet], ignore_index=True)
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)
print("Total combined records:", len(df_combined))

# 5️⃣ Load sources_names mapping (trusted sources)
df_sources = pd.read_csv("sources_names.csv")  # columns: 'sources', 'label'
print(df_sources.head())

# 6️⃣ Fetch live news from newsdata.io
api_key = "pub_410dc8b7c5c7484fb4621c94955c1172"  # your API key here
url = f"https://newsdata.io/api/1/news?apikey={api_key}&language=en"
response = requests.get(url)
data = response.json()

articles = []
for news in data['results'][:20]:
    title = news.get('title', '')
    description = news.get('description', '')
    link = news.get('link', '')
    content = title + " " + description
    articles.append({"content": content, "link": link})

df_live = pd.DataFrame(articles)

# 7️⃣ Weak-label live news using sources_names
df_live['domain'] = df_live['link'].apply(lambda x: urlparse(x).netloc)
df_live = df_live.merge(df_sources, left_on='domain', right_on='source', how='left')
df_live['label'] = df_live['label'].fillna(0).astype(int)  # default 0 if source not found
df_live = df_live[['content','label']]
print("Live news sample:")
print(df_live.head())

# 8️⃣ Combine everything into final dataset
df_final = pd.concat([df_combined, df_live], ignore_index=True)
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)
print("Total records in final dataset:", len(df_final))

# 9️⃣ Split train-test
X = df_final['content']
y = df_final['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 1️⃣0️⃣ TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 1️⃣1️⃣ Train LightGBM
clf = LGBMClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    class_weight='balanced',
    random_state=42
)
clf.fit(X_train_tfidf, y_train)

# 1️⃣2️⃣ Evaluate
preds = clf.predict(X_test_tfidf)
print("Final Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))

# 1️⃣3️⃣ Predict new headlines
new_articles = ["Breaking: Scientists discovered water on Mars!"]
for article in new_articles:
    pred = clf.predict(vectorizer.transform([article]))[0]
    print(article)
    print("Prediction:", "Real ✅" if pred==1 else "Fake ❌")
    print("-"*50)


LIAR2 sample:
                                             content  label
0  90 percent of Americans "support universal bac...      0
1  Last year was one of the deadliest years ever ...      1
2  Bernie Sanders's plan is "to raise your taxes ...      0
3  Voter ID is supported by an overwhelming major...      0
4  Says Barack Obama "robbed Medicare (of) $716 b...      0
FakeNewsNet sample:
                                             content  label
0  Proof The Mainstream Media Is Manipulating The...      0
1  Charity: Clinton Foundation Distributed “Water...      0
2  A Hillary Clinton Administration May be Entire...      0
3  Trump’s Latest Campaign Promise May Be His Mos...      0
4  Website is Down For Maintenance Website is Dow...      0
Total combined records: 41987
            source  label
0          bbc.com      1
1      reuters.com      1
2          cnn.com      1
3  theguardian.com      1
4      nytimes.com      1
Live news sample:
                                          



In [None]:
#XGBoost

In [7]:
# 0️⃣ Install XGBoost if not installed
!pip install xgboost --quiet

from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
#from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report
import requests
from urllib.parse import urlparse
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
dataset_liar2 = load_dataset("chengxuphd/liar2")
liar_texts = dataset_liar2['train']['statement']
liar_labels = dataset_liar2['train']['label']

def binary_label(label):
    return 1 if label == 1 else 0

liar_labels_binary = [binary_label(l) for l in liar_labels]

df_liar2 = pd.DataFrame({
    'content': liar_texts,
    'label': liar_labels_binary
})
print("LIAR2 sample:")
print(df_liar2.head())

# 3️⃣ Load FakeNewsNet CSVs
def load_fakenews_csv(file_path):
    df = pd.read_csv(file_path)
    if 'text' in df.columns and 'title' in df.columns:
        df['content'] = df['title'].fillna('') + " " + df['text'].fillna('')
    elif 'title' in df.columns and 'news_url' in df.columns:
        df['content'] = df['title'].fillna('')
    else:
        df['content'] = df['content'].fillna('')
    df['label'] = 0 if 'fake' in file_path.lower() else 1
    return df[['content', 'label']]

files = [
    "BuzzFeed_fake_news_content.csv",
    "BuzzFeed_real_news_content.csv",
    "PolitiFact_fake_news_content.csv",
    "PolitiFact_real_news_content.csv",
    "gossipcop_fake.csv",
    "gossipcop_real.csv",
    "politifact_fake.csv",
    "politifact_real.csv"
]

dfs = [load_fakenews_csv(f) for f in files]
df_fakenewsnet = pd.concat(dfs, ignore_index=True)
print("FakeNewsNet sample:")
print(df_fakenewsnet.head())

# 4️⃣ Combine LIAR2 + FakeNewsNet
df_combined = pd.concat([df_liar2, df_fakenewsnet], ignore_index=True)
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)
print("Total combined records:", len(df_combined))

# 5️⃣ Load sources_names mapping (trusted sources)
df_sources = pd.read_csv("sources_names.csv")  # columns: 'sources', 'label'
print(df_sources.head())

# 6️⃣ Fetch live news from newsdata.io
api_key = "pub_410dc8b7c5c7484fb4621c94955c1172"  # your API key here
url = f"https://newsdata.io/api/1/news?apikey={api_key}&language=en"
response = requests.get(url)
data = response.json()

articles = []
for news in data['results'][:20]:
    title = news.get('title', '')
    description = news.get('description', '')
    link = news.get('link', '')
    content = title + " " + description
    articles.append({"content": content, "link": link})

df_live = pd.DataFrame(articles)

# 7️⃣ Weak-label live news using sources_names
df_live['domain'] = df_live['link'].apply(lambda x: urlparse(x).netloc)
df_live = df_live.merge(df_sources, left_on='domain', right_on='source', how='left')
df_live['label'] = df_live['label'].fillna(0).astype(int)  # default 0 if source not found
df_live = df_live[['content','label']]
print("Live news sample:")
print(df_live.head())

# 8️⃣ Combine everything into final dataset
df_final = pd.concat([df_combined, df_live], ignore_index=True)
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)
print("Total records in final dataset:", len(df_final))

# 9️⃣ Split train-test
X = df_final['content']
y = df_final['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 1️⃣ Import


# 2️⃣ TF-IDF features (same as before)
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 3️⃣ XGBoost Classifier
clf = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    use_label_encoder=False,
    eval_metric='logloss',   # suppress warnings
    random_state=42
)
clf.fit(X_train_tfidf, y_train)

# 4️⃣ Evaluate
preds = clf.predict(X_test_tfidf)
print("Final Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))

# 5️⃣ Predict new headlines
new_articles = ["Breaking: Scientists discovered water on Mars!"]
for article in new_articles:
    pred = clf.predict(vectorizer.transform([article]))[0]
    print(article)
    print("Prediction:", "Real ✅" if pred==1 else "Fake ❌")
    print("-"*50)


LIAR2 sample:
                                             content  label
0  90 percent of Americans "support universal bac...      0
1  Last year was one of the deadliest years ever ...      1
2  Bernie Sanders's plan is "to raise your taxes ...      0
3  Voter ID is supported by an overwhelming major...      0
4  Says Barack Obama "robbed Medicare (of) $716 b...      0
FakeNewsNet sample:
                                             content  label
0  Proof The Mainstream Media Is Manipulating The...      0
1  Charity: Clinton Foundation Distributed “Water...      0
2  A Hillary Clinton Administration May be Entire...      0
3  Trump’s Latest Campaign Promise May Be His Mos...      0
4  Website is Down For Maintenance Website is Dow...      0
Total combined records: 41987
            source  label
0          bbc.com      1
1      reuters.com      1
2          cnn.com      1
3  theguardian.com      1
4      nytimes.com      1
Live news sample:
                                          

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Final Accuracy: 0.7479761904761905
              precision    recall  f1-score   support

           0       0.74      0.67      0.70      3758
           1       0.75      0.81      0.78      4642

    accuracy                           0.75      8400
   macro avg       0.75      0.74      0.74      8400
weighted avg       0.75      0.75      0.75      8400

Breaking: Scientists discovered water on Mars!
Prediction: Fake ❌
--------------------------------------------------


In [None]:
#using huggingface

In [1]:
# 0️⃣ Install required packages
#!pip install datasets transformers evaluate pandas scikit-learn requests --quiet

# 1️⃣ Imports
from datasets import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from urllib.parse import urlparse
import requests
import evaluate

# 2️⃣ Load LIAR2
from datasets import load_dataset
dataset_liar2 = load_dataset("chengxuphd/liar2")
liar_texts = dataset_liar2['train']['statement']
liar_labels = dataset_liar2['train']['label']

def binary_label(label):
    return 1 if label == 1 else 0

liar_labels_binary = [binary_label(l) for l in liar_labels]

df_liar2 = pd.DataFrame({
    'content': liar_texts,
    'label': liar_labels_binary
})
print("LIAR2 sample:")
print(df_liar2.head())

# 3️⃣ Load FakeNewsNet CSVs
def load_fakenews_csv(file_path):
    df = pd.read_csv(file_path)
    if 'text' in df.columns and 'title' in df.columns:
        df['content'] = df['title'].fillna('') + " " + df['text'].fillna('')
    elif 'title' in df.columns and 'news_url' in df.columns:
        df['content'] = df['title'].fillna('')
    else:
        df['content'] = df['content'].fillna('')
    df['label'] = 0 if 'fake' in file_path.lower() else 1
    return df[['content', 'label']]

files = [
    "BuzzFeed_fake_news_content.csv",
    "BuzzFeed_real_news_content.csv",
    "PolitiFact_fake_news_content.csv",
    "PolitiFact_real_news_content.csv",
    "gossipcop_fake.csv",
    "gossipcop_real.csv",
    "politifact_fake.csv",
    "politifact_real.csv"
]

dfs = [load_fakenews_csv(f) for f in files]
df_fakenewsnet = pd.concat(dfs, ignore_index=True)
print("FakeNewsNet sample:")
print(df_fakenewsnet.head())

# 4️⃣ Combine LIAR2 + FakeNewsNet
df_combined = pd.concat([df_liar2, df_fakenewsnet], ignore_index=True)
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)
print("Total combined records:", len(df_combined))

# 5️⃣ Load sources_names mapping for weak labeling live news
df_sources = pd.read_csv("sources_names.csv")  # columns: 'sources', 'label'
print(df_sources.head())

# 6️⃣ Fetch live news from newsdata.io
api_key = "pub_410dc8b7c5c7484fb4621c94955c1172"  # add your API key
url = f"https://newsdata.io/api/1/news?apikey={api_key}&language=en"
response = requests.get(url)
data = response.json()

articles = []
for news in data['results'][:20]:  # top 20 articles
    title = news.get('title', '')
    description = news.get('description', '')
    link = news.get('link', '')
    content = title + " " + description
    articles.append({"content": content, "link": link})

df_live = pd.DataFrame(articles)

# 7️⃣ Weak-label live news
df_live['domain'] = df_live['link'].apply(lambda x: urlparse(x).netloc)
df_live = df_live.merge(df_sources, left_on='domain', right_on='source', how='left')
df_live['label'] = df_live['label'].fillna(0).astype(int)  # default fake if not in trusted sources
df_live = df_live[['content','label']]
print("Live news sample:")
print(df_live.head())

# 8️⃣ Combine all datasets
df_final = pd.concat([df_combined, df_live], ignore_index=True)
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)
print("Total records in final dataset:", len(df_final))

# 9️⃣ Split into train/test
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_final['content'], df_final['label'], test_size=0.2, random_state=42
)

train_dataset = Dataset.from_dict({"text": train_texts.tolist(), "label": train_labels.tolist()})
test_dataset = Dataset.from_dict({"text": test_texts.tolist(), "label": test_labels.tolist()})

# 1️⃣0️⃣ Tokenize
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# 1️⃣1️⃣ Load model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# 1️⃣2️⃣ Metrics
accuracy = evaluate.load('accuracy')
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

# 1️⃣3️⃣ Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    report_to='none'
)

# 1️⃣4️⃣ Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# 1️⃣5️⃣ Train
trainer.train()

# 1️⃣6️⃣ Evaluate
trainer.evaluate()

# 1️⃣7️⃣ Predict new headlines
new_articles = ["Breaking: Scientists discovered water on Mars!"]
tokens = tokenizer(new_articles, padding=True, truncation=True, return_tensors="pt")
outputs = model(**tokens)
pred = np.argmax(outputs.logits.detach().numpy(), axis=1)[0]
print("Prediction:", "Real ✅" if pred==1 else "Fake ❌")



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.12/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.12/dist-package

ImportError: cannot import name 'PreTrainedModel' from 'transformers' (/usr/local/lib/python3.12/dist-packages/transformers/__init__.py)

In [7]:
!pip install --upgrade --force-reinstall transformers datasets evaluate


Collecting transformers
  Using cached transformers-4.56.2-py3-none-any.whl.metadata (40 kB)
Collecting datasets
  Using cached datasets-4.1.1-py3-none-any.whl.metadata (18 kB)
Collecting evaluate
  Using cached evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting filelock (from transformers)
  Using cached filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Using cached huggingface_hub-0.35.0-py3-none-any.whl.metadata (14 kB)
Collecting numpy>=1.17 (from transformers)
  Using cached numpy-2.3.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting packaging>=20.0 (from transformers)
  Using cached packaging-25.0-py3-none-any.whl.metadata (3.3 kB)
Collecting pyyaml>=5.1 (from transformers)
  Using cached PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2025.9.18-cp312-cp312-m

In [1]:
import transformers
print(transformers.__version__)  # Should be >= 4.41.0



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.12/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.12/dist-package

4.56.2


/bin/bash: line 1: 2: No such file or directory


2.3.3
