In [1]:
import joblib
from sklearn.metrics import accuracy_score
import os
import pandas as pd

from datasets import load_dataset
import os
import glob
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:


def load_datasets(basepath):
    """
    Load and clean individual fake news datasets from subfolders in basepath.
    Returns a dict of pandas DataFrames keyed by dataset name.
    """
    datasets = {}

    # 1. Aadya Singh_Fakenews: evaluation.csv, test (1).csv, train (2).csv
    folder = os.path.join(basepath, "Aadya Singh  _fake-and_real_news")
    files = ["evaluation.csv", "test (1).csv", "train (2).csv"]
    dfs = [pd.read_csv(os.path.join(folder, f), sep=';') for f in files]
    datasets['Aadya_Singh'] = pd.concat(dfs, ignore_index=True)

    # 2. Bhavik Jikadara - Fake News Detection
    folder = os.path.join(basepath, "Bhavik Jikadara - Fake News Detection")
    fake = pd.read_csv(os.path.join(folder, "fake.csv"))
    true = pd.read_csv(os.path.join(folder, "true.csv"))
    # Assign labels
    fake['label'] = 0   # fake → 0
    true['label'] = 1   # real → 1
    datasets['Bhavik_Jikadara'] = pd.concat([fake, true], ignore_index=True)

    # 3. clmentbisaillon_Fakenews
    folder = os.path.join(basepath, "clmentbisaillon_Fakenews")
    real = pd.read_csv(os.path.join(folder, "True.csv"))
    fake = pd.read_csv(os.path.join(folder, "Fake.csv"))
    # Drop placeholder empty texts
    if 'text' in real.columns:
        real = real[real['text'] != "[empty]"]
    if 'text' in fake.columns:
        fake = fake[fake['text'] != "[empty]"]
    # Assign labels
    real['label'] = 1   # real → 1
    fake['label'] = 0   # fake → 0
    df3 = pd.concat([real, fake], ignore_index=True)
    datasets['clmentbisaillon'] = df3

    # 4. ErfanMoosaviMonazzah - fake-news-detection-dataset-English
    folder = os.path.join(basepath, "ErfanMoosaviMonazzah - fake-news-detection-dataset-English")
    parts = ["test.tsv", "train.tsv", "validation.tsv"]
    dfs = [pd.read_csv(os.path.join(folder, p), sep='\t') for p in parts]
    datasets['ErfanMoosaviMonazzah'] = pd.concat(dfs, ignore_index=True)

    # 5. GonzaloA - fake_news
    folder = os.path.join(basepath, "GonzaloA - fake_news")
    parts = ["test.csv", "train.csv", "evaluation.csv"]
    dfs = [pd.read_csv(os.path.join(folder, p), sep=';') for p in parts]
    datasets['GonzaloA'] = pd.concat(dfs, ignore_index=True)

    # 6. Hassan Amin - fake_or_real_news.csv
    df6 = pd.read_csv(
        os.path.join(basepath, "Hassan Amin-fake_or_real_news.csv/fake_or_real_news.csv")
    )

    # The column in this file is likely named 'label' or 'type'; adjust as needed:
    # Map the textual labels to integers
    df6['label'] = df6['label'].str.upper().map({
        'FAKE': 0,
        'REAL': 1
    })

    datasets['Hassan_Amin'] = df6

    # 7. Meg Risdal_fake_only
    folder = os.path.join(basepath, "Meg Risdal_fake_only")
    df7 = pd.read_csv(os.path.join(folder, "fake.csv"))
    # Drop null titles and non‑English entries
    if 'titel' in df7.columns:
        df7 = df7.dropna(subset=['titel'])
        df7 = df7.rename(columns={'titel': 'title'})
    if 'language' in df7.columns:
        df7 = df7[df7['language'].str.lower() == 'english']
    # Assign label: fake-only → 0
    df7['label'] = 0
    df7 = df7.dropna(subset=['title', 'text'])
    df7 = df7[(df7['title'].str.strip() != '') & (df7['text'].str.strip() != '')]
    datasets['Meg_Risdal'] = df7

    # 8. Ruchi Bhatia_news_articles.csv
    df8 = pd.read_csv(os.path.join(basepath, "Ruchi Bhatia_news_articles.csv/news_articles.csv"))
    # Clean entries
    df8 = df8[~df8['title'].str.lower().isin(['no title', 'newsticker'])]
    df8 = df8[df8['text'].notna()]
    if 'language' in df8.columns:
        df8 = df8[df8['language'].str.lower() == 'english']
    df8['label'] = df8['label'].str.upper().map({
        'Fake': 0,
        'Real': 1
    })
    datasets['Ruchi_Bhatia'] = df8

    # 9. Saurabh Shahane - Fake_News_Classification
    datasets['Saurabh_Shahane'] = pd.read_csv(
        os.path.join(basepath, "Saurabh Shahane - Fake_News_Classification", "WELFake_Dataset.csv")
    )

    # 10. andyP/fake_news_en_opensources
    #df10 = pd.read_csv(
    #    os.path.join(basepath, "bigFakeNews", "opensources_fake_news_cleaned.csv")
    #)
    
    # Keep only 'reliable' and 'fake' types
    #df10 = df10[df10['type'].isin(['reliable', 'fake'])]
    
    # Add 'label' column: 1 for reliable, 0 for fake
    #df10['label'] = df10['type'].map({'reliable': 1, 'fake': 0})
    
    # Rename 'content' column to 'text'
    #df10.rename(columns={'content': 'text'}, inplace=True)
    
    # Store in datasets dictionary
    #datasets['andyP_opensources'] = df10


    return datasets

In [3]:
# Load model trained on C+D+E
model_path = "./models/logreg_B_C_D.joblib"
model = joblib.load(model_path)

In [4]:
# Load all datasets fully
base_path = '../../data'
datasets = load_datasets(base_path)

In [5]:
# Evaluate on each dataset individually
print("Evaluating model logreg_C_D_E.joblib on individual datasets:\n")
for name, df in datasets.items():
    X = df['title'] + '\n' + df['text']
    y = df['label']
    y_pred = model.predict(X)
    acc = accuracy_score(y, y_pred)
    print(f"Dataset {name}: accuracy = {acc:.8f}")

Evaluating model logreg_C_D_E.joblib on individual datasets:

Dataset Aadya_Singh: accuracy = 0.99250992
Dataset Bhavik_Jikadara: accuracy = 0.99973273
Dataset clmentbisaillon: accuracy = 0.99973273
Dataset ErfanMoosaviMonazzah: accuracy = 0.99959338
Dataset GonzaloA: accuracy = 0.99250992
Dataset Hassan_Amin: accuracy = 0.57663773
Dataset Meg_Risdal: accuracy = 0.87362780


TypeError: Labels in y_true and y_pred should be of the same type. Got y_true=['Fake' 'Real'] and y_pred=[0 1]. Make sure that the predictions provided by the classifier coincides with the true labels.