# In [1]:
import os
import joblib
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

# --- Evaluation settings ---------------------------------------------------
# Location of the large CSV to score; edit this to match your local layout.
BIG_DATA_CSV = "../../data/bigFakeNews/dataFiltered.csv"
# Number of rows to pull from the big CSV per chunk.
CHUNKSIZE = 200_000

# --- Model -----------------------------------------------------------------
# Deserialize the previously saved estimator from disk.
model_path = "./models/logreg_B.joblib"
model = joblib.load(model_path)

print("Evaluating large dataset in chunks:\n")

def _compose_documents(frame):
    """Return one "title\\ntext" document per row of *frame*.

    Missing titles or texts are replaced by an empty string first: string
    concatenation with NaN yields NaN, and a NaN document is not a valid
    input for text models (scikit-learn vectorizers reject it outright).
    """
    title = frame["title"].fillna("").astype(str)
    text = frame["text"].fillna("").astype(str)
    return title + "\n" + text


# Running totals over all chunks, kept as plain Python ints.
total_rows = 0
total_correct = 0

# Stream the CSV chunk by chunk so the whole file never has to fit in memory.
# Using the reader as a context manager releases the file handle even if
# prediction fails part-way through.
with pd.read_csv(
    BIG_DATA_CSV,
    usecols=["title", "text", "label"],  # only load what we need
    chunksize=CHUNKSIZE,
) as reader:
    for chunk_idx, chunk in enumerate(reader):
        # A header-only file yields a single empty chunk; skipping it avoids
        # predicting on no rows and dividing by zero below.
        if chunk.empty:
            continue

        # Predict
        X_chunk = _compose_documents(chunk)
        y_chunk = chunk["label"].values
        y_pred_chunk = model.predict(X_chunk)

        # Accumulate
        correct = int(np.sum(y_pred_chunk == y_chunk))
        n = len(y_chunk)
        total_correct += correct
        total_rows += n

        # Progress message
        print(f"  • Chunk {chunk_idx+1}: processed {n} rows, "
              f"cumulative: {total_rows} rows, "
              f"chunk accuracy = {correct/n:.4f}")

# Final accuracy (undefined when the file contained no data rows)
if total_rows:
    final_acc = total_correct / total_rows
    print(f"\nOverall accuracy on '{os.path.basename(BIG_DATA_CSV)}': {final_acc:.8f}")
else:
    final_acc = float("nan")
    print(f"\nNo rows found in '{os.path.basename(BIG_DATA_CSV)}'; accuracy is undefined.")