# Compare Datasets

In [2]:
import pandas as pd
from pathlib import Path

from typing import Dict
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingRandomSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.stats import uniform, randint
import joblib

In [3]:
# Dataset B: WELFake by Saurabh Shahane
def load_B(path: str = "../../data/Saurabh Shahane - Fake_News_Classification/WELFake_Dataset.csv") -> pd.DataFrame:
    """
    Loads the WELFake CSV and returns its title, text and label columns.

    Capitalised 'Title' / 'Text' / 'Label' headers are renamed to lower case,
    and the integer label is inverted (0 becomes 1, 1 becomes 0).
    NOTE(review): the inversion presumably aligns this dataset with the
    0 = fake / 1 = real convention used by load_E — confirm against the data card.
    """
    lowercase_names = {'Title': 'title', 'Text': 'text', 'Label': 'label'}
    wanted_columns = ['title', 'text', 'label']

    frame = (
        pd.read_csv(path)
        .rename(columns=lowercase_names)
        # Flip labels: 0 → 1, 1 → 0
        .assign(label=lambda raw: 1 - raw['label'].astype(int))
    )
    return frame[wanted_columns]


# Dataset C: Fake News by GonzaloA
def load_C(
    train_path: str = "../../data/GonzaloA - fake_news/train_without_reuters.csv",
    val_path: str   = "../../data/GonzaloA - fake_news/evaluation_without_reuters.csv",
    test_path: str  = "../../data/GonzaloA - fake_news/test_without_reuters.csv"
) -> pd.DataFrame:
    """
    Loads the three semicolon-separated split files and stacks them into one frame.

    The splits are concatenated in train, validation, test order with a fresh
    index. A 'value' column is renamed to 'label', and a 'content' column is
    renamed to 'text' when no 'text' column exists. The label is cast to int
    and only title, text and label are returned.
    """
    split_frames = [
        pd.read_csv(csv_path, sep=';')
        for csv_path in (train_path, val_path, test_path)
    ]
    merged = pd.concat(split_frames, ignore_index=True)

    # Collect the header fixes that apply, then rename once.
    header_fixes = {}
    if 'value' in merged.columns:
        header_fixes['value'] = 'label'
    if 'content' in merged.columns and 'text' not in merged.columns:
        header_fixes['content'] = 'text'
    merged = merged.rename(columns=header_fixes)

    merged['label'] = merged['label'].astype(int)
    return merged[['title', 'text', 'label']]

# Dataset D: fake-news-detection-dataset-English by ErfanMoosaviMonazzah
def load_D(
    train_path: str = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/train.tsv",
    val_path: str   = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/validation.tsv",
    test_path: str  = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/test.tsv"
) -> pd.DataFrame:
    """
    Loads the three tab-separated split files and stacks them into one frame.

    Every column is read as a string; the splits are concatenated in train,
    validation, test order with a fresh index, after which the label is cast
    to int. Only title, text and label are returned.
    """
    split_frames = [
        pd.read_csv(tsv_path, sep='\t', dtype=str)
        for tsv_path in (train_path, val_path, test_path)
    ]
    combined = pd.concat(split_frames, ignore_index=True)
    combined = combined.assign(label=combined['label'].astype(int))
    return combined[['title', 'text', 'label']]

# Dataset E: Fake News Detection by Bhavik Jikadara
def load_E(
    fake_path: str = "../../data/Bhavik Jikadara - Fake News Detection/fake.csv",
    real_path: str = "../../data/Bhavik Jikadara - Fake News Detection/true.csv"
) -> pd.DataFrame:
    """
    Loads the fake and real article files and merges them into one labelled frame.

    Both files are read with every column as a string. Rows from the fake file
    get label 0 and rows from the real file get label 1; fake rows come first
    and the index is renumbered. Only title, text and label are returned.
    """
    fake_articles = pd.read_csv(fake_path, dtype=str).assign(label='0')
    real_articles = pd.read_csv(real_path, dtype=str).assign(label='1')

    combined = pd.concat([fake_articles, real_articles], ignore_index=True)
    # Labels were attached as strings; convert once after stacking.
    combined['label'] = combined['label'].astype(int)
    return combined[['title', 'text', 'label']]

In [4]:
def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Keeps only the rows that have both a title and a text.

    A row is discarded when either field is missing (NaN/None) or is the
    empty string. The surviving rows keep their original index labels and
    the input frame is left untouched.
    """
    present = df.dropna(subset=['title', 'text'])
    has_title = present['title'] != ''
    has_text = present['text'] != ''
    return present[has_title & has_text]


In [5]:
def load_all_full() -> Dict[str, pd.DataFrame]:
    """
    Loads every dataset (B, C, D and E) in full using each loader's default paths.

    Returns a dict keyed by the dataset letter, in B, C, D, E order.
    """
    loaders = (
        ('B', load_B),
        ('C', load_C),
        ('D', load_D),
        ('E', load_E),
    )
    return {key: loader() for key, loader in loaders}

In [6]:
    # Load the raw frames once; the cleaned copies live in `datasets`.
    raw_dfs = load_all_full()

    # Clean every frame first ...
    datasets = {}
    for key, raw_frame in raw_dfs.items():
        datasets[key] = clean_dataframe(raw_frame)

    # ... then report the shape that remains after cleaning.
    for name, df in datasets.items():
        print(f"Dataset {name}:", df.shape)

Dataset B: (71537, 3)
Dataset C: (40587, 3)
Dataset D: (44267, 3)
Dataset E: (44898, 3)


In [7]:
import itertools
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib
from datetime import datetime

In [8]:
# Keys of the datasets that take part in the train/test combinations.
all_sets = ['B', 'C', 'D', 'E']

# Collects one record per evaluated combination.
results = []

# NOTE(review): this vectorizer is not referenced by the training loop in the
# next cell, which constructs its own TfidfVectorizer with different settings —
# confirm whether it is still needed.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=20000,
)

In [9]:
# Train one TF-IDF + logistic-regression model for every non-empty proper
# subset of `all_sets`, validate it on a held-out 20% stratified split of the
# training data, and test it on all datasets that were left out of training.
RANDOM_STATE = 42
MODEL_DIR = Path("models")
MODEL_DIR.mkdir(parents=True, exist_ok=True)


def _join_title_text(df):
    """Return one document per row: the title, a newline, then the body text."""
    return df['title'] + '\n' + df['text']


def _build_pipeline():
    """Create the unfitted TF-IDF -> logistic-regression pipeline."""
    # NOTE(review): the numeric settings look like the result of an earlier
    # hyperparameter search; their origin is not shown in this notebook.
    return Pipeline([
        ('tfidf', TfidfVectorizer(
            max_features=13954,
            ngram_range=(1, 2),
            stop_words=None,
            max_df=0.8891916614020643,
            min_df=3
        )),
        ('clf', LogisticRegression(
            C=8.725368061523762,
            class_weight='balanced',
            penalty='elasticnet',
            l1_ratio=0.6760522571867544,
            solver='saga',
            max_iter=1000,
            # saga shuffles the data, so pin the seed to make fits repeatable.
            random_state=RANDOM_STATE
        ))
    ])


# Start from an empty list so that re-running this cell does not append a
# second copy of every row to `results`.
results = []

# r stops at len(all_sets) - 1 so at least one dataset is always left for testing.
for r in range(1, len(all_sets)):
    for combo in itertools.combinations(all_sets, r):
        model_name = '_'.join(combo)
        model_path = MODEL_DIR / f"logreg_{model_name}.joblib"

        df_trainval = pd.concat([datasets[d] for d in combo], ignore_index=True)
        test_sets = [d for d in all_sets if d not in combo]
        df_test = pd.concat([datasets[d] for d in test_sets], ignore_index=True)

        X = _join_title_text(df_trainval)
        y = df_trainval['label']

        # The fixed seed reproduces the same split on every run, so a model
        # loaded from disk is validated on rows it was not fitted on (as long
        # as the underlying data has not changed).
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
        )

        print(f"\n=== Training model on datasets: {combo} | Testing on: {test_sets} ===")
        start_time = datetime.now()

        if model_path.exists():
            # A cached model is reused as-is: delete the file to force a
            # retrain after changing the data or the hyperparameters.
            # joblib.load unpickles the file, which can execute arbitrary
            # code — only load model files created by this notebook.
            pipeline = joblib.load(model_path)
            print(f"Loaded model from {model_path}")
        else:
            pipeline = _build_pipeline()
            pipeline.fit(X_train, y_train)
            joblib.dump(pipeline, model_path)
            print(f"Trained and saved model to {model_path}")

        end_time = datetime.now()
        print(f"→ Duration: {str(end_time - start_time)}")

        val_preds = pipeline.predict(X_val)
        test_preds = pipeline.predict(_join_title_text(df_test))

        val_acc = accuracy_score(y_val, val_preds)
        test_acc = accuracy_score(df_test['label'], test_preds)

        print(f"→ Validation Accuracy: {val_acc:.4f}")
        print(f"→ Test Accuracy: {test_acc:.4f}")

        results.append({
            'train_on': combo,
            'test_on': test_sets,
            'val_accuracy': val_acc,
            'test_accuracy': test_acc
        })


=== Training model on datasets: ('B',) | Testing on: ['C', 'D', 'E'] ===
Loaded model from models\logreg_B.joblib
→ Duration: 0:00:00.102152
→ Validation Accuracy: 0.9770

=== Training model on datasets: ('C',) | Testing on: ['B', 'D', 'E'] ===
Loaded model from models\logreg_C.joblib
→ Duration: 0:00:00.104307
→ Validation Accuracy: 0.9925

=== Training model on datasets: ('D',) | Testing on: ['B', 'C', 'E'] ===
Loaded model from models\logreg_D.joblib
→ Duration: 0:00:00.107388
→ Validation Accuracy: 0.9967

=== Training model on datasets: ('E',) | Testing on: ['B', 'C', 'D'] ===
Loaded model from models\logreg_E.joblib
→ Duration: 0:00:00.098202
→ Validation Accuracy: 0.9988

=== Training model on datasets: ('B', 'C') | Testing on: ['D', 'E'] ===
Loaded model from models\logreg_B_C.joblib
→ Duration: 0:00:00.101757
→ Validation Accuracy: 0.9805

=== Training model on datasets: ('B', 'D') | Testing on: ['C', 'E'] ===
Loaded model from models\logreg_B_D.joblib
→ Duration: 0:00:00.098

In [10]:
# Build the summary table in one call from the collected records
# (one row per evaluated combination), then display it.
results_df = pd.DataFrame(data=results)
results_df


Unnamed: 0,train_on,test_on,val_accuracy,test_accuracy
0,"(B,)","[C, D, E]",0.977006,0.977773
1,"(C,)","[B, D, E]",0.992486,0.926454
2,"(D,)","[B, C, E]",0.996725,0.921693
3,"(E,)","[B, C, D]",0.998775,0.915878
4,"(B, C)","[D, E]",0.980513,0.996378
5,"(B, D)","[C, E]",0.985881,0.988817
6,"(B, E)","[C, D]",0.987031,0.981733
7,"(C, D)","[B, E]",0.996111,0.901679
8,"(C, E)","[B, D]",0.996374,0.899762
9,"(D, E)","[B, C]",0.999103,0.891682
