In [2]:
pip install pandas numpy xgboost scikit-learn



In [3]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml


# Step 1: Load and Prepare the Data

In [4]:
# Load Pima Indians Diabetes dataset (similar to mlbench version)
def load_pima_dataset():
    """Fetch the OpenML 'diabetes' dataset (version 1) as a cleaned DataFrame.

    The nine columns of the fetched frame are renamed positionally to
    pregnant, glucose, pressure, triceps, insulin, mass, pedigree, age and
    diabetes. Rows containing missing values are dropped, and the 'diabetes'
    label column is replaced by an integer 'outcome' column produced by
    ``LabelEncoder`` (per the original author's note: pos = 1, neg = 0).

    Returns:
        pandas.DataFrame: the eight feature columns followed by 'outcome'.
    """
    column_names = [
        'pregnant', 'glucose', 'pressure', 'triceps', 'insulin',
        'mass', 'pedigree', 'age', 'diabetes',
    ]

    # Fetch from OpenML, apply the readable column names, drop incomplete rows.
    frame = fetch_openml(name='diabetes', version=1, as_frame=True).frame
    frame = frame.set_axis(column_names, axis=1).dropna()

    # Encode the outcome, then swap the label column for its encoded form.
    encoded = LabelEncoder().fit_transform(frame['diabetes'])
    return frame.drop(columns=['diabetes']).assign(outcome=encoded)

# Step 2: Generate Synthetic Data (bootstrap resampling with replacement)

In [5]:
def generate_dataset(df, size, random_state=123):
    """Draw a bootstrap sample of ``size`` rows from ``df``.

    Rows are sampled with replacement, so the result contains repeated rows
    whenever ``size`` exceeds ``len(df)`` (and usually before that). Be aware
    that identical rows can then land in both the training and the test fold
    of a later cross-validation, which inflates the measured accuracy.

    Args:
        df: source DataFrame to resample; it is not modified.
        size: number of rows to draw.
        random_state: seed for the sampler. Defaults to 123, the value that
            was previously hard-coded, so existing calls give the same rows.

    Returns:
        pandas.DataFrame: the sampled rows with a fresh 0..size-1 index.
    """
    sampled_df = df.sample(n=size, replace=True, random_state=random_state)
    return sampled_df.reset_index(drop=True)

# Step 3: XGBoost Model with 5-fold CV

In [6]:
def run_xgboost_cv(X, y):
    """Run 5-fold cross-validation of an XGBoost classifier and time it.

    Args:
        X: feature matrix passed to ``cross_val_score``.
        y: target labels passed to ``cross_val_score``.

    Returns:
        tuple: ``(avg_accuracy, time_taken)`` where ``avg_accuracy`` is the
        mean of the per-fold accuracy scores and ``time_taken`` is the elapsed
        time of the whole cross-validation call, in seconds.
    """
    # NOTE(review): use_label_encoder is reportedly deprecated in recent
    # XGBoost releases — confirm against the installed version.
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', verbosity=0)

    # perf_counter is monotonic and high-resolution, so the measured duration
    # cannot be distorted by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    time_taken = time.perf_counter() - start

    avg_accuracy = np.mean(scores)
    return avg_accuracy, time_taken

# Step 4: Main Process

In [7]:
if __name__ == "__main__":
    df = load_pima_dataset()

    # Sample sizes to benchmark — kept small to stay fast; larger sizes are
    # possible if the machine allows.
    dataset_sizes = [100, 1000, 10000]
    results = []

    for size in dataset_sizes:
        print(f"\nRunning for dataset size: {size}")
        df_sampled = generate_dataset(df, size)
        y = df_sampled['outcome']
        X = df_sampled.drop(columns=['outcome'])

        # Fields shared by the success record and the failure record.
        record = {
            'Method Used': 'Python XGBoost 5-fold CV',
            'Dataset Size': size,
        }
        try:
            accuracy, duration = run_xgboost_cv(X, y)
            record.update({
                'Testing-set Predictive Performance (%)': round(accuracy * 100, 2),
                'Time Taken to Fit (seconds)': round(duration, 2),
            })
        except Exception as e:
            # Best effort: keep going with the next size and store the error
            # text in place of the timing so the failure shows up in the table.
            record.update({
                'Testing-set Predictive Performance (%)': 'Error',
                'Time Taken to Fit (seconds)': str(e),
            })
        results.append(record)



Running for dataset size: 100

Running for dataset size: 1000

Running for dataset size: 10000


In [13]:
# Turn the per-size result records into a table and print it under a heading.
results_df = pd.DataFrame(data=results)
print("\nFinal Results:", results_df, sep="\n")


Final Results:
                Method Used  Dataset Size  \
0  Python XGBoost 5-fold CV           100   
1  Python XGBoost 5-fold CV          1000   
2  Python XGBoost 5-fold CV         10000   

   Testing-set Predictive Performance (%)  Time Taken to Fit (seconds)  
0                                    80.0                         2.73  
1                                    90.6                         0.37  
2                                   100.0                         0.78  

✅ Results saved to 'xgboost_results_python.csv'


In [14]:
# Save results
# Write the results table as CSV, relative to the current working directory;
# index=False leaves the DataFrame's row index out of the file.
results_df.to_csv('xgboost_results_python.csv', index=False)
print("\n✅ Results saved to 'xgboost_results_python.csv'")


✅ Results saved to 'xgboost_results_python.csv'


In [22]:
# Offer the results CSV as a browser download when running on Google Colab.
# The google.colab package only exists inside Colab; anywhere else the import
# fails, so the download step is skipped with a message instead of crashing
# the cell.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping download of 'xgboost_results_python.csv'.")
else:
    files.download('xgboost_results_python.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# XGBoost on Big Datasets (100,000 and 1,000,000 rows)

In [23]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import fetch_openml


In [24]:
# Step 1: Load Pima Indians Diabetes Dataset
def load_pima():
    """Load the OpenML 'diabetes' dataset (version 1) ready for modelling.

    The fetched frame's nine columns are renamed positionally, rows with
    missing values are removed, and the 'diabetes' label column is replaced
    by an integer 'outcome' column from ``LabelEncoder`` (per the original
    author's note: pos=1, neg=0).

    Returns:
        pandas.DataFrame: eight feature columns followed by 'outcome'.
    """
    bunch = fetch_openml(name='diabetes', version=1, as_frame=True)
    data = bunch.frame
    data.columns = ['pregnant', 'glucose', 'pressure', 'triceps', 'insulin', 'mass', 'pedigree', 'age', 'diabetes']

    complete_rows = data.dropna()
    labels = LabelEncoder().fit_transform(complete_rows['diabetes'])  # pos=1, neg=0

    # Drop the raw label first, then append its encoded form as the last column.
    prepared = complete_rows.drop(columns=['diabetes'])
    prepared['outcome'] = labels
    return prepared

In [25]:
# Step 2: Generate big synthetic dataset
def generate_big_dataset(df, size, random_state=123):
    """Draw ``size`` rows from ``df`` with replacement to build a large dataset.

    Because sampling is done with replacement, the result contains repeated
    rows whenever ``size`` exceeds ``len(df)``. Identical rows may therefore
    appear in both the training and the test fold of a later cross-validation,
    which inflates the measured accuracy.

    Args:
        df: source DataFrame to resample; it is not modified.
        size: number of rows to draw.
        random_state: seed for the sampler. Defaults to 123, the value that
            was previously hard-coded, so existing calls give the same rows.

    Returns:
        pandas.DataFrame: the sampled rows with a fresh 0..size-1 index.
    """
    sampled_df = df.sample(n=size, replace=True, random_state=random_state)
    return sampled_df.reset_index(drop=True)


In [26]:
# Step 3: Fast XGBoost model training
def run_fast_xgboost(X, y, n_estimators=2):
    """Run 5-fold cross-validation of a deliberately small XGBoost model.

    Args:
        X: feature matrix passed to ``cross_val_score``.
        y: target labels passed to ``cross_val_score``.
        n_estimators: number of boosted trees. Defaults to 2, the value that
            was previously hard-coded to keep the big-dataset runs fast.

    Returns:
        tuple: ``(mean_accuracy, elapsed_seconds)`` — the mean of the per-fold
        accuracy scores and the elapsed time of the cross-validation call.
    """
    # NOTE(review): use_label_encoder is reportedly deprecated in recent
    # XGBoost releases — confirm against the installed version.
    model = XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        n_estimators=n_estimators,
        verbosity=0,
    )

    # perf_counter is monotonic and high-resolution, so the measured duration
    # cannot be distorted by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    elapsed = time.perf_counter() - start
    return np.mean(scores), elapsed

In [27]:
# Step 4: Main Execution
if __name__ == "__main__":
    df = load_pima()
    sizes = [100000, 1000000]  # 100k and 1M
    results = []

    for size in sizes:
        print(f"\nRunning for dataset size: {size}")
        df_sampled = generate_big_dataset(df, size)
        y = df_sampled['outcome']
        X = df_sampled.drop(columns=['outcome'])

        # Fields shared by the success record and the failure record.
        record = {
            'Method Used': 'Python XGBoost 5-fold CV',
            'Dataset Size': size,
        }
        try:
            accuracy, duration = run_fast_xgboost(X, y)
            record.update({
                'Testing-set Predictive Performance (%)': round(accuracy * 100, 2),
                'Time Taken to Fit (seconds)': round(duration, 2),
            })
        except Exception as e:
            # Best effort: keep going with the next size and store the error
            # text in place of the timing so the failure shows up in the table.
            record.update({
                'Testing-set Predictive Performance (%)': 'Error',
                'Time Taken to Fit (seconds)': str(e),
            })
        results.append(record)



Running for dataset size: 100000

Running for dataset size: 1000000


In [29]:
# Turn the big-dataset result records into a table and print it under a heading.
results_df = pd.DataFrame(data=results)
print("\nFinal Big Dataset Results:", results_df, sep="\n")

# Save results (CSV without the row index, relative to the working directory).
results_df.to_csv('big_xgboost_results_python.csv', index=False)
print("\n✅ Big dataset Python results saved to 'big_xgboost_results_python.csv'")



Final Big Dataset Results:
                Method Used  Dataset Size  \
0  Python XGBoost 5-fold CV        100000   
1  Python XGBoost 5-fold CV       1000000   

   Testing-set Predictive Performance (%)  Time Taken to Fit (seconds)  
0                                   86.69                         1.60  
1                                   86.59                         6.63  

✅ Big dataset Python results saved to 'big_xgboost_results_python.csv'


In [30]:
# Offer the big-dataset results CSV as a browser download when running on
# Google Colab. The google.colab package only exists inside Colab; anywhere
# else the import fails, so the download step is skipped with a message
# instead of crashing the cell.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping download of 'big_xgboost_results_python.csv'.")
else:
    files.download('big_xgboost_results_python.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Merge the Python and R Results

In [31]:
import pandas as pd

In [34]:
# Step 1: Load the CSV files from /content/ folder
# The four files are read in the listed order: small and big Python results
# first, then the small and big R results.
small_python, big_python, small_r, big_r = map(pd.read_csv, (
    '/content/xgboost_results_python.csv',
    '/content/big_xgboost_results_python.csv',
    '/content/xgboost_results_R.csv',
    '/content/big_xgboost_results_R.csv',
))

In [35]:
# Merge
# Stack the small- and big-dataset result tables into one table per language;
# ignore_index=True renumbers the rows of each combined table from 0.
combined_python = pd.concat([small_python, big_python], ignore_index=True)
combined_r = pd.concat([small_r, big_r], ignore_index=True)


In [36]:
# Merge everything
# pd.concat aligns the two tables by column name: a column present in only one
# of them is kept and filled with NaN for the other table's rows. Report any
# such columns so a header mismatch between the Python and R CSVs is noticed
# instead of silently producing half-empty rows in the merged table.
_unshared_columns = set(combined_python.columns).symmetric_difference(combined_r.columns)
if _unshared_columns:
    print(f"⚠️ Columns not shared by the Python and R results: {sorted(map(str, _unshared_columns))}")
final_results = pd.concat([combined_python, combined_r], ignore_index=True)


In [37]:
# Sort by Dataset Size
# Use a stable sort so rows sharing a Dataset Size (e.g. the Python and the R
# result for the same size) keep their pre-sort relative order; the default
# quicksort gives no such guarantee for ties. Rows whose Dataset Size is NaN
# are placed last (the default na_position).
final_results = final_results.sort_values(by=['Dataset Size'], kind='mergesort')


In [38]:
# Show final results
# NOTE(review): the saved output of this cell shows NaN in 'Method Used' and
# 'Dataset Size' for rows 5-16 — presumably the R CSVs use different column
# headers than the Python ones; confirm before relying on the merged table.
print(final_results)

                 Method Used  Dataset Size  \
0   Python XGBoost 5-fold CV         100.0   
1   Python XGBoost 5-fold CV        1000.0   
2   Python XGBoost 5-fold CV       10000.0   
3   Python XGBoost 5-fold CV      100000.0   
4   Python XGBoost 5-fold CV     1000000.0   
5                        NaN           NaN   
6                        NaN           NaN   
7                        NaN           NaN   
8                        NaN           NaN   
9                        NaN           NaN   
10                       NaN           NaN   
11                       NaN           NaN   
12                       NaN           NaN   
13                       NaN           NaN   
14                       NaN           NaN   
15                       NaN           NaN   
16                       NaN           NaN   

    Testing-set Predictive Performance (%)  Time Taken to Fit (seconds)  \
0                                    80.00                         2.73   
1                    

In [39]:
# Save merged results
# Write the merged Python + R table as CSV into the /content folder;
# index=False leaves the DataFrame's row index out of the file.
final_results.to_csv('/content/final_xgboost_results.csv', index=False)
print("\n✅ Final merged results saved!")


✅ Final merged results saved!


In [40]:
# Write the merged table, then offer it as a browser download on Google Colab.
final_results.to_csv('/content/final_xgboost_results.csv', index=False)  # save merged table

# The google.colab package only exists inside Colab; anywhere else the import
# fails, so the download step is skipped with a message instead of crashing
# the cell (the CSV has already been written above).
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping download of '/content/final_xgboost_results.csv'.")
else:
    files.download('/content/final_xgboost_results.csv')  # download final CSV


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>