In [10]:
import pandas as pd
import numpy as np
import time
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

# Load data
df = pd.read_csv('generated_pima_data1.csv')

X = df.drop(columns=['outcome'])  # Features
y = df['outcome']                 # Target

sizes = [100, 1000, 10000, 100000, 1000000, 10000000]
results = []

for size in sizes:
    print(f"Training on dataset size: {size}")

    # Always sample together to avoid NaNs
    if size > len(df):
        sampled_df = df.sample(n=size, replace=True, random_state=42)
    else:
        sampled_df = df.sample(n=size, random_state=42)

    X_sample = sampled_df.drop(columns=['outcome'])
    y_sample = sampled_df['outcome']

    # Drop rows with NaN values in y_sample
    X_sample = X_sample[y_sample.notna()]
    y_sample = y_sample[y_sample.notna()]

    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', verbosity=0)

    start = time.time()
    scores = cross_val_score(model, X_sample, y_sample, cv=5, scoring='accuracy')
    end = time.time()

    results.append({
        'Method used': 'XGBoost (scikit-learn, 5-fold CV)',
        'Dataset size': size,
        'Testing-set predictive performance (Accuracy)': round(np.mean(scores), 4),
        'Time taken for the model to be fit (seconds)': round(end - start, 2)
    })

#   results table
results_df = pd.DataFrame(results)
print("\nSummary Results:\n")
print(results_df)

# Save   CSV
results_df.to_csv('xgb_simple_results.csv', index=False)
print("\nResults saved to xgb_simple_results.csv")

Training on dataset size: 100
Training on dataset size: 1000
Training on dataset size: 10000
Training on dataset size: 100000
Training on dataset size: 1000000
Training on dataset size: 10000000

Summary Results:

                         Method used  Dataset size  \
0  XGBoost (scikit-learn, 5-fold CV)           100   
1  XGBoost (scikit-learn, 5-fold CV)          1000   
2  XGBoost (scikit-learn, 5-fold CV)         10000   
3  XGBoost (scikit-learn, 5-fold CV)        100000   
4  XGBoost (scikit-learn, 5-fold CV)       1000000   
5  XGBoost (scikit-learn, 5-fold CV)      10000000   

   Testing-set predictive performance (Accuracy)  \
0                                         0.9100   
1                                         0.9390   
2                                         0.9766   
3                                         0.9985   
4                                         1.0000   
5                                         1.0000   

   Time taken for the model to be fit (sec