In [3]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, make_scorer

In [4]:
data = pd.read_csv("dfdata.csv")
data.head()

Unnamed: 0,pregnant,glucose,pressure,triceps,insulin,mass,pedigree,age,outcome
0,10,133,66,49,130,45.4,0.615,25,1
1,3,106,88,35,75,53.2,0.254,24,0
2,3,109,80,23,135,40.9,1.159,38,0
3,1,100,58,25,180,25.6,0.731,38,0
4,3,87,74,27,126,30.0,0.591,31,0


In [7]:
# Features and target
X_full = data.drop(columns=['outcome'])
y_full = data['outcome']

dataset_sizes = [100, 1000, 10000, 100000, 1000000, 10000000]

results = []

for size in dataset_sizes:
    print(f"\nProcessing dataset size: {size}")

    if size <= len(data):
        # Sample without replacement
        sample_df = data.sample(n=size, random_state=42)
    else:
        # Sample with replacement to reach large sizes
        repeat_times = size // len(data)
        remainder = size % len(data)
        sample_df = pd.concat([data] * repeat_times + [data.sample(n=remainder, random_state=42)], ignore_index=True)

    X = sample_df.drop(columns=['outcome'])
    y = sample_df['outcome']

    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

    scoring = {'accuracy': make_scorer(accuracy_score)}

    cv_results = cross_validate(model, X, y, cv=5, scoring=scoring, return_train_score=False)

    mean_accuracy = cv_results['test_accuracy'].mean()
    mean_fit_time = cv_results['fit_time'].mean()

    results.append((size, mean_accuracy, mean_fit_time))

    print(f"Testing-set predictive performance (accuracy): {mean_accuracy:.4f}")
    print(f"Time taken for the model to fit (average per fold): {mean_fit_time:.4f} seconds")

print("\nSummary:")
for size, acc, fit_time in results:
    print(f"Size: {size}, Accuracy: {acc:.4f}, Fit time avg/fold (s): {fit_time:.4f}")


Processing dataset size: 100


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Testing-set predictive performance (accuracy): 0.9800
Time taken for the model to fit (average per fold): 0.0322 seconds

Processing dataset size: 1000


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Testing-set predictive performance (accuracy): 0.9470
Time taken for the model to fit (average per fold): 0.0868 seconds

Processing dataset size: 10000


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Testing-set predictive performance (accuracy): 0.9776
Time taken for the model to fit (average per fold): 0.2227 seconds

Processing dataset size: 100000


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Testing-set predictive performance (accuracy): 0.9871
Time taken for the model to fit (average per fold): 0.7634 seconds

Processing dataset size: 1000000


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Testing-set predictive performance (accuracy): 0.9919
Time taken for the model to fit (average per fold): 7.8348 seconds

Processing dataset size: 10000000


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Testing-set predictive performance (accuracy): 0.9957
Time taken for the model to fit (average per fold): 79.6787 seconds

Summary:
Size: 100, Accuracy: 0.9800, Fit time avg/fold (s): 0.0322
Size: 1000, Accuracy: 0.9470, Fit time avg/fold (s): 0.0868
Size: 10000, Accuracy: 0.9776, Fit time avg/fold (s): 0.2227
Size: 100000, Accuracy: 0.9871, Fit time avg/fold (s): 0.7634
Size: 1000000, Accuracy: 0.9919, Fit time avg/fold (s): 7.8348
Size: 10000000, Accuracy: 0.9957, Fit time avg/fold (s): 79.6787
