In [4]:
pip install pyreadr



In [5]:
# -----------------------------------
# Setup
# -----------------------------------
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# -----------------------------------
# Load and Prepare Data
# -----------------------------------
# Load Pima Indians Diabetes Dataset
pima = fetch_openml(name='diabetes', version=1, as_frame=True)
df = pima.frame
df.columns = ['pregnant', 'glucose', 'pressure', 'triceps', 'insulin', 'mass', 'pedigree', 'age', 'diabetes']
df = df.dropna()

# Create outcome column
df['outcome'] = (df['diabetes'] == 'pos').astype(int)
df = df.drop(columns=['diabetes'])

# -----------------------------------
# Function to Generate Synthetic Data
# -----------------------------------
def generate_synthetic_data(size):
    np.random.seed(123)
    sampled_df = df.sample(n=size, replace=True).reset_index(drop=True)
    return sampled_df

# -----------------------------------
# Function to Train XGBoost Model (direct)
# -----------------------------------
def run_xgboost(size):
    print(f"\n[Python XGBoost] Running for dataset size: {size}")

    # Generate synthetic data
    data = generate_synthetic_data(size)
    X = data.drop(columns=['outcome'])
    y = data['outcome']

    # Split into train/test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

    # Train XGBoost model
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=2, verbosity=0)
    start = time.time()
    model.fit(X_train, y_train)
    end = time.time()

    # Predict and Evaluate
    acc = model.score(X_test, y_test)

    print(f"Testing-set Accuracy: {round(acc * 100, 2)}%")
    print(f"Time Taken (seconds): {round(end - start, 2)}")
    print("-------------------------------------------")

# -----------------------------------
# Main Execution
# -----------------------------------
sizes = [100, 1000, 10000, 100000, 1000000]  # Up to 1 million

for size in sizes:
    run_xgboost(size)


[Python XGBoost] Running for dataset size: 100
Testing-set Accuracy: 100.0%
Time Taken (seconds): 0.05
-------------------------------------------

[Python XGBoost] Running for dataset size: 1000
Testing-set Accuracy: 100.0%
Time Taken (seconds): 0.02
-------------------------------------------

[Python XGBoost] Running for dataset size: 10000
Testing-set Accuracy: 100.0%
Time Taken (seconds): 0.07
-------------------------------------------

[Python XGBoost] Running for dataset size: 100000
Testing-set Accuracy: 100.0%
Time Taken (seconds): 0.27
-------------------------------------------

[Python XGBoost] Running for dataset size: 1000000
Testing-set Accuracy: 100.0%
Time Taken (seconds): 2.82
-------------------------------------------
