# Compare Datasets

In [2]:
import pandas as pd
from pathlib import Path

from typing import Dict
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingRandomSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.stats import uniform, randint
import joblib

  from scipy.sparse import csr_matrix, issparse

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\mager\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\mager\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\mager\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start(

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



ImportError: numpy.core.multiarray failed to import

In [None]:
def normalize_lengths_random(
    dfs: Dict[str, pd.DataFrame],
    random_state: int = 42
) -> Dict[str, pd.DataFrame]:
    """
    Downsample every DataFrame in ``dfs`` to the row count of the smallest one.

    Each frame is sampled with ``DataFrame.sample`` using the same
    ``random_state`` and then re-indexed from 0, so every returned frame has
    exactly the same number of rows. The input frames are not modified.

    Args:
        dfs: dict mapping dataset names to DataFrames.
        random_state: seed passed to every ``DataFrame.sample`` call.

    Returns:
        dict with the same keys (in the same order) mapping to the sampled
        DataFrames. An empty ``dfs`` yields an empty dict.
    """
    if not dfs:
        # min() over an empty sequence raises ValueError; with no datasets
        # there is nothing to balance, so return an empty mapping instead.
        return {}

    min_len = min(len(df) for df in dfs.values())
    print(f"Sampling all datasets to {min_len} rows each (smallest dataset size)")
    return {
        name: df.sample(n=min_len, random_state=random_state)
                .reset_index(drop=True)
        for name, df in dfs.items()
    }

In [None]:
# Dataset B: WELFake by Saurabh Shahane
def load_B(path: str = "../../data/Saurabh Shahane - Fake_News_Classification/WELFake_Dataset.csv") -> pd.DataFrame:
    """
    Read dataset B from a single CSV file and return its title/text/label columns.

    Capitalised column names ('Title', 'Text', 'Label') are mapped to lower
    case, and the binary label is inverted (0 becomes 1, 1 becomes 0).
    NOTE(review): the inversion presumably aligns this dataset's label
    polarity with the other loaders — confirm against the dataset card.

    Args:
        path: location of the CSV file.

    Returns:
        DataFrame with exactly the columns 'title', 'text', 'label'.
    """
    column_aliases = {'Title': 'title', 'Text': 'text', 'Label': 'label'}
    frame = pd.read_csv(path).rename(columns=column_aliases)
    inverted = 1 - frame['label'].astype(int)
    return frame.assign(label=inverted)[['title', 'text', 'label']]


# Dataset C: Fake News by GonzaloA
def load_C(
    train_path: str = "../../data/GonzaloA - fake_news/train_without_reuters.csv",
    val_path: str   = "../../data/GonzaloA - fake_news/evaluation_without_reuters.csv",
    test_path: str  = "../../data/GonzaloA - fake_news/test_without_reuters.csv"
) -> pd.DataFrame:
    """
    Read the three semicolon-separated split files of dataset C and stack them.

    Column names are harmonised after concatenation: 'value' becomes 'label',
    and 'content' becomes 'text' when no 'text' column is present. The label
    column is cast to int.

    Args:
        train_path: path to the training split.
        val_path: path to the evaluation split.
        test_path: path to the test split.

    Returns:
        DataFrame with exactly the columns 'title', 'text', 'label' and a
        fresh 0..n-1 index covering all three splits.
    """
    split_frames = [
        pd.read_csv(csv_path, sep=';')
        for csv_path in (train_path, val_path, test_path)
    ]
    merged = pd.concat(split_frames, ignore_index=True)

    # Collect the applicable renames first; the two conditions are independent.
    renames = {}
    if 'value' in merged.columns:
        renames['value'] = 'label'
    if 'content' in merged.columns and 'text' not in merged.columns:
        renames['content'] = 'text'
    merged = merged.rename(columns=renames)

    merged = merged.assign(label=merged['label'].astype(int))
    return merged[['title', 'text', 'label']]

# Dataset D: fake-news-detection-dataset-English by ErfanMoosaviMonazzah
def load_D(
    train_path: str = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/train.tsv",
    val_path: str   = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/validation.tsv",
    test_path: str  = "../../data/ErfanMoosaviMonazzah - fake-news-detection-dataset-English/test.tsv"
) -> pd.DataFrame:
    """
    Read the three tab-separated split files of dataset D and stack them.

    Every column is read as a string; only 'label' is converted to int
    afterwards.

    Args:
        train_path: path to the training split.
        val_path: path to the validation split.
        test_path: path to the test split.

    Returns:
        DataFrame with exactly the columns 'title', 'text', 'label' and a
        fresh 0..n-1 index covering all three splits.
    """
    split_frames = [
        pd.read_csv(tsv_path, sep='\t', dtype=str)
        for tsv_path in (train_path, val_path, test_path)
    ]
    stacked = pd.concat(split_frames, ignore_index=True)
    stacked = stacked.assign(label=stacked['label'].astype(int))
    return stacked[['title', 'text', 'label']]

# Dataset E: Fake News Detection by Bhavik Jikadara
def load_E(
    fake_path: str = "../../data/Bhavik Jikadara - Fake News Detection/fake.csv",
    real_path: str = "../../data/Bhavik Jikadara - Fake News Detection/true.csv"
) -> pd.DataFrame:
    """
    Read dataset E from its two CSV files and attach a label per source file.

    Rows from ``fake_path`` are labelled 0 and rows from ``real_path`` are
    labelled 1. All columns are read as strings; the label is cast to int
    after the two parts are concatenated (fake rows first).

    Args:
        fake_path: CSV containing the fake articles.
        real_path: CSV containing the real articles.

    Returns:
        DataFrame with exactly the columns 'title', 'text', 'label' and a
        fresh 0..n-1 index.
    """
    tagged_parts = [
        pd.read_csv(csv_path, dtype=str).assign(label=tag)
        for csv_path, tag in ((fake_path, '0'), (real_path, '1'))
    ]
    merged = pd.concat(tagged_parts, ignore_index=True)
    merged = merged.assign(label=merged['label'].astype(int))
    return merged[['title', 'text', 'label']]

In [None]:
def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Removes rows where 'title' or 'text' is missing, empty, or whitespace-only.

    Whitespace-only strings (e.g. " " or "\\n") are treated as empty because
    they carry no tokens for the downstream text vectorizer; a plain
    ``!= ''`` comparison would let them through.

    Args:
        df: frame containing at least the columns 'title' and 'text'.

    Returns:
        A new DataFrame with the offending rows dropped. The surviving rows
        keep their original index labels; ``df`` itself is not modified.
    """
    present = df.dropna(subset=['title', 'text'])
    # astype(str) keeps the .str accessor usable even if a column was parsed
    # as a non-string dtype.
    has_title = present['title'].astype(str).str.strip().ne('')
    has_text = present['text'].astype(str).str.strip().ne('')
    return present[has_title & has_text]


In [None]:
def load_all_normalized(random_state: int = 42) -> Dict[str, pd.DataFrame]:
    """
    Load datasets B, C, D and E, clean each one, and balance their sizes.

    Every loader is called with its default paths. Each frame is passed
    through ``clean_dataframe`` before ``normalize_lengths_random`` samples
    all of them down to a common row count.

    Args:
        random_state: seed forwarded to ``normalize_lengths_random``.

    Returns:
        dict keyed 'B', 'C', 'D', 'E' mapping to the cleaned, equally sized
        DataFrames.
    """
    loaders = {
        'B': load_B,
        'C': load_C,
        'D': load_D,
        'E': load_E,
    }

    # Cleaning happens before length normalization so the common size is
    # computed from usable rows only.
    cleaned_dfs = {}
    for key, loader in loaders.items():
        cleaned_dfs[key] = clean_dataframe(loader())

    return normalize_lengths_random(cleaned_dfs, random_state=random_state)

In [None]:
    # Load, clean and size-balance all datasets once, then print each frame's
    # shape as a sanity check. `datasets` is read by the training loop below.
    # NOTE(review): this cell is indented; it presumably relies on IPython
    # tolerating leading indentation — dedent before exporting to a .py script.
    datasets = load_all_normalized()
    for name, df in datasets.items():
        print(f"Dataset {name}:", df.shape)

In [None]:
import itertools
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib
from datetime import datetime

In [None]:
# Dataset keys to combine; the training loop indexes `datasets` with these.
all_sets = ['B', 'C', 'D', 'E']
# Collects one metrics dict per trained model (appended to by the training loop).
results = []

# NOTE(review): this vectorizer is not referenced by any cell visible here —
# the training loop builds its own TfidfVectorizer with different settings.
# Confirm it is unused and remove it.
vectorizer = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 2),
    stop_words='english'
)

In [None]:
# Cross-dataset generalisation experiment: for every proper, non-empty subset
# of `all_sets`, train (or load a cached) model on that subset and evaluate it
# on an in-distribution validation split and on the held-out datasets.
Path("models").mkdir(parents=True, exist_ok=True)


def _title_and_text(frame):
    """Return one string per row: the title, a newline, then the article text."""
    return frame['title'] + '\n' + frame['text']


def _new_pipeline():
    """Build an unfitted TF-IDF + elastic-net logistic-regression pipeline.

    NOTE(review): the hyperparameter values are hard-coded; they presumably
    come from an earlier search that is not part of this notebook — confirm.
    """
    return Pipeline([
        ('tfidf', TfidfVectorizer(
            max_features=13954,
            ngram_range=(1, 2),
            stop_words=None,
            max_df=0.8891916614020643,
            min_df=3
        )),
        ('clf', LogisticRegression(
            C=8.725368061523762,
            class_weight='balanced',
            penalty='elasticnet',
            l1_ratio=0.6760522571867544,
            solver='saga',
            max_iter=1000,
            # saga shuffles the data; seed it so repeated fits are reproducible.
            random_state=42
        ))
    ])


# Rebind to a fresh list so re-running this cell does not append duplicate
# rows to results left over from a previous execution.
results = []

# r stops at len(all_sets) - 1 so at least one dataset is always held out.
for r in range(1, len(all_sets)):
    for combo in itertools.combinations(all_sets, r):
        model_name = '_'.join(combo)
        model_path = Path(f"models/logreg_{model_name}.joblib")

        test_sets = [d for d in all_sets if d not in combo]
        df_trainval = pd.concat([datasets[d] for d in combo], ignore_index=True)
        df_test = pd.concat([datasets[d] for d in test_sets], ignore_index=True)

        X = _title_and_text(df_trainval)
        y = df_trainval['label']

        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        print(f"\n=== Training model on datasets: {combo} | Testing on: {test_sets} ===")
        start_time = datetime.now()

        if model_path.exists():
            # joblib.load unpickles arbitrary objects: only load files that
            # this notebook wrote itself. The cache key is the dataset combo
            # only, so delete models/ after changing the data sampling or the
            # hyperparameters, otherwise a stale model is evaluated.
            pipeline = joblib.load(model_path)
            print(f"Loaded model from {model_path}")
        else:
            # Build the estimator only when it is actually going to be fitted.
            pipeline = _new_pipeline()
            pipeline.fit(X_train, y_train)
            joblib.dump(pipeline, model_path)
            print(f"Trained and saved model to {model_path}")

        end_time = datetime.now()
        print(f"→ Duration: {str(end_time - start_time)}")

        val_preds = pipeline.predict(X_val)
        test_preds = pipeline.predict(_title_and_text(df_test))

        val_acc = accuracy_score(y_val, val_preds)
        test_acc = accuracy_score(df_test['label'], test_preds)

        print(f"→ Validation Accuracy: {val_acc:.4f}")
        print(f"→ Test Accuracy: {test_acc:.4f}")

        results.append({
            'train_on': combo,
            'test_on': test_sets,
            'val_accuracy': val_acc,
            'test_accuracy': test_acc
        })

In [None]:
# One row per trained model: the training combo, the held-out datasets, and
# the validation / test accuracies collected by the training loop.
results_df = pd.DataFrame(results)
results_df
