In [None]:
import os
import joblib
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

# Location of the large CSV; edit this to match where the file lives locally.
BIG_DATA_CSV = "./bigFakeNews/opensources_fake_news_cleaned.csv"

# Smaller datasets, keyed by a short name (each already small enough to load
# in memory). Further datasets can be added as extra keyword arguments.
DATASETS = dict(
    Meg_Risdal="./Meg Risdal_fake_only/fake.csv",
)

# Number of rows read per chunk from the large dataset.
CHUNKSIZE = 200_000

In [None]:
# Load the previously saved classifier from disk; `model` is what the chunked
# evaluation further down calls `.predict(...)` on.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code — only load model files that come from a trusted source.
# NOTE(review): the file name suggests a logistic-regression model — confirm.
model_path = "./models/logreg_C_D_E.joblib"
model = joblib.load(model_path)
print(f"Loaded model from {model_path}\n")

In [None]:
# Stream the large CSV in chunks of CHUNKSIZE rows, score each chunk with the
# loaded `model`, and accumulate an overall accuracy without ever holding the
# whole file in memory.
print("Evaluating large dataset in chunks:\n")

total_rows = 0
total_correct = 0

# Map the CSV's 'type' values onto integer labels; rows whose type is not a key
# of this dict are dropped before scoring.
# NOTE(review): assumes the model was trained with 1 = reliable and 0 = fake —
# confirm against the training notebook, otherwise the accuracy is meaningless.
type_to_label = {"reliable": 1, "fake": 0}

# Reading both columns as strings keeps dtypes identical across chunks, so the
# `.str` accessor and the string concatenation below cannot trip over a chunk
# whose values happened to parse as numbers. The `with` block makes sure the
# underlying file handle is released even if prediction fails midway.
with pd.read_csv(
    BIG_DATA_CSV,
    usecols=["type", "content"],  # only load what we need
    dtype={"type": str, "content": str},
    chunksize=CHUNKSIZE,
) as chunk_reader:
    for chunk_idx, chunk in enumerate(chunk_reader):
        # Keep only rows with a known type and non-blank text
        chunk = chunk[chunk["type"].isin(type_to_label)]
        chunk = chunk.rename(columns={"content": "text"})
        chunk = chunk.dropna(subset=["text"])
        chunk = chunk[chunk["text"].str.strip() != ""]

        n = len(chunk)
        if n == 0:
            # Nothing left to score: calling predict on zero samples is likely
            # to raise (most scikit-learn estimators reject empty input), and
            # the per-chunk accuracy below would divide by zero.
            print(f"  • Chunk {chunk_idx+1}: no usable rows, skipped "
                  f"(cumulative: {total_rows} rows)")
            continue

        # Only 'type' and 'content' are loaded, so there is never a title;
        # use an empty one so the model input keeps the "title\ntext" layout.
        chunk = chunk.assign(
            title="",
            label=chunk["type"].map(type_to_label),
        )

        # Predict
        X_chunk = chunk["title"] + "\n" + chunk["text"]
        y_chunk = chunk["label"].values
        y_pred_chunk = model.predict(X_chunk)

        # Accumulate (plain ints keep the running totals as Python integers)
        correct = int(np.sum(y_pred_chunk == y_chunk))
        total_correct += correct
        total_rows += n

        # Progress message
        print(f"  • Chunk {chunk_idx+1}: processed {n} rows, "
              f"cumulative: {total_rows} rows, "
              f"chunk accuracy = {correct/n:.4f}")

# Final accuracy (undefined when no row survived the filtering above)
if total_rows == 0:
    final_acc = float("nan")
    print(f"\nNo usable rows found in '{os.path.basename(BIG_DATA_CSV)}'; "
          f"accuracy is undefined.")
else:
    final_acc = total_correct / total_rows
    print(f"\nOverall accuracy on '{os.path.basename(BIG_DATA_CSV)}': {final_acc:.8f}")