In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer
from xgboost import XGBClassifier

# Load data
data = pd.read_csv("week_11_pima.csv")

# Feature columns (all except 'outcome')
X_full = data.drop(columns=['outcome'])
y_full = data['outcome']

# Dataset sizes to test
dataset_sizes = [100, 1000, 10000, 100000, 1000000, 10000000]

# For reproducibility
random_state = 42

print("Dataset full size:", len(data))
print()

for size in dataset_sizes:
    if size > len(data):
        print(f"Requested size {size} greater than dataset size {len(data)}. Skipping.")
        continue

    # Sample dataset without replacement if size < dataset size
    if size < len(data):
        sampled_data = data.sample(n=size, random_state=random_state)
        X = sampled_data.drop(columns=['outcome'])
        y = sampled_data['outcome']
    else:
        X = X_full
        y = y_full

    print(f"Dataset size: {size}")

    # Initialize XGBClassifier with default parameters
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=random_state)

    # Setup 5-fold stratified cross-validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

    # Measure time for fitting inside cross-validation (approximate)
    start_time = time.time()

    # Use accuracy as scoring metric
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=1)

    end_time = time.time()
    elapsed = end_time - start_time

    print(f"  Mean CV Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")
    print(f"  Time taken for 5-fold CV fitting: {elapsed:.2f} seconds")
    print()

print("Done.")

Dataset full size: 10000000

Dataset size: 100


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



  Mean CV Accuracy: 0.8600 (+/- 0.0735)
  Time taken for 5-fold CV fitting: 0.34 seconds

Dataset size: 1000


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



  Mean CV Accuracy: 0.9470 (+/- 0.0051)
  Time taken for 5-fold CV fitting: 0.27 seconds

Dataset size: 10000


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



  Mean CV Accuracy: 0.9764 (+/- 0.0037)
  Time taken for 5-fold CV fitting: 0.81 seconds

Dataset size: 100000


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



  Mean CV Accuracy: 0.9873 (+/- 0.0007)
  Time taken for 5-fold CV fitting: 4.05 seconds

Dataset size: 1000000


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



  Mean CV Accuracy: 0.9919 (+/- 0.0002)
  Time taken for 5-fold CV fitting: 43.83 seconds

Dataset size: 10000000


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



  Mean CV Accuracy: 0.9932 (+/- 0.0001)
  Time taken for 5-fold CV fitting: 419.97 seconds

Done.
