In [1]:
import os
import joblib
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

# Number of CSV rows read per chunk when streaming the big dataset.
CHUNKSIZE = 200_000

# Location of the large CSV, relative to this notebook; change it to match
# wherever the file lives on your machine.
BIG_DATA_CSV = "../../data/bigFakeNews/dataFiltered.csv"

In [2]:
# Serialized estimator to evaluate, relative to the notebook's working directory.
model_path = "./models/logreg_B_C_D.joblib"

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model artifacts that come from a trusted source.
model = joblib.load(model_path)

# Echo the path so the saved output records which artifact was evaluated.
print(f"Loaded model from {model_path}\n")

Loaded model from ./models/logreg_B_C_D.joblib



In [7]:
# Evaluate the loaded model on the large CSV by streaming it in chunks of
# CHUNKSIZE rows, so the whole file never has to fit in memory at once.
# Produces `total_rows`, `total_correct` and `final_acc` (overall accuracy).
print("Evaluating large dataset in chunks:\n")

# Running totals across all chunks; the overall accuracy is derived from them.
total_rows = 0
total_correct = 0

chunk_reader = pd.read_csv(
    BIG_DATA_CSV,
    usecols=["title", "text", "label"],  # only load what we need
    chunksize=CHUNKSIZE,
)

for chunk_idx, chunk in enumerate(chunk_reader):
    n = len(chunk)
    if n == 0:
        # Nothing to score (and correct/n below would divide by zero).
        continue

    # Build the model input as "<title>\n<text>". An empty CSV field is read
    # as NaN, and NaN + str stays NaN, so a missing title or text would hand
    # a non-string to the model. Replace missing values with "" first.
    # NOTE(review): assumes the model accepts raw strings (e.g. a pipeline
    # with a text vectorizer) -- confirm against the training notebook.
    X_chunk = chunk["title"].fillna("") + "\n" + chunk["text"].fillna("")
    y_chunk = chunk["label"].values
    y_pred_chunk = model.predict(X_chunk)

    # Accumulate; cast to a plain int so the totals stay Python integers
    # rather than NumPy scalars.
    correct = int(np.sum(y_pred_chunk == y_chunk))
    total_correct += correct
    total_rows += n

    # Progress message
    print(f"  • Chunk {chunk_idx+1}: processed {n} rows, "
          f"cumulative: {total_rows} rows, "
          f"chunk accuracy = {correct/n:.4f}")

# Final accuracy over every row seen; NaN (instead of a ZeroDivisionError)
# when the CSV contained no data rows.
final_acc = total_correct / total_rows if total_rows else float("nan")
print(f"\nOverall accuracy on '{os.path.basename(BIG_DATA_CSV)}': {final_acc:.8f}")

Evaluating large dataset in chunks:

  • Chunk 1: processed 200000 rows, cumulative: 200000 rows, chunk accuracy = 0.8201
  • Chunk 2: processed 200000 rows, cumulative: 400000 rows, chunk accuracy = 0.7367
  • Chunk 3: processed 200000 rows, cumulative: 600000 rows, chunk accuracy = 0.7941
  • Chunk 4: processed 200000 rows, cumulative: 800000 rows, chunk accuracy = 0.5949
  • Chunk 5: processed 200000 rows, cumulative: 1000000 rows, chunk accuracy = 0.7966
  • Chunk 6: processed 200000 rows, cumulative: 1200000 rows, chunk accuracy = 0.6428
  • Chunk 7: processed 200000 rows, cumulative: 1400000 rows, chunk accuracy = 0.7651
  • Chunk 8: processed 200000 rows, cumulative: 1600000 rows, chunk accuracy = 0.6289
  • Chunk 9: processed 200000 rows, cumulative: 1800000 rows, chunk accuracy = 0.7548
  • Chunk 10: processed 200000 rows, cumulative: 2000000 rows, chunk accuracy = 0.7375
  • Chunk 11: processed 200000 rows, cumulative: 2200000 rows, chunk accuracy = 0.7368
  • Chunk 12: proce