In [1]:
#install the necessary libraries
!pip install pyreadr xgboost scikit-learn

Collecting pyreadr
  Downloading pyreadr-0.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading pyreadr-0.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (411 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/411.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m409.6/411.7 kB[0m [31m15.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.7/411.7 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyreadr
Successfully installed pyreadr-0.5.3


In [2]:
#import libraries
import pyreadr
import xgboost as xgb
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

In [3]:
# Defining the function to load all RDS files
def load_rds(filelocation):
    """Read the file at `filelocation` with pyreadr and return the object it holds.

    `pyreadr.read_r` hands back a dict-like container; the data read here is
    filed under the key ``None``, so that single entry is what gets returned.
    """
    return pyreadr.read_r(filelocation)[None]

In [8]:
# Develop a function to implement XGBoost training through 5-Fold CV.

def train_xgboost(df, size_label):
    """Benchmark a default XGBoost classifier with stratified 5-fold CV.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing an 'outcome' column (the target). Every other column
        is used as a predictor.
    size_label : Any
        Label identifying the dataset size; only used in the printed report.

    Returns
    -------
    tuple
        ``(avg_accuracy, time_taken)``: the mean accuracy over the five folds
        and the elapsed seconds spent inside ``cross_val_score``.
    """
    # Separate the predictors from the outcome
    X = df.drop(columns='outcome')
    y = df['outcome']

    # 5-fold stratified cross-validation with a fixed seed so the folds are reproducible
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # `use_label_encoder` is deliberately not passed: the installed XGBoost no
    # longer uses it and emits a "Parameters ... are not used" warning for every
    # fold fit when it is supplied.
    model = xgb.XGBClassifier(eval_metric='logloss')

    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    scores = cross_val_score(model, X, y, cv=skf, scoring='accuracy')
    time_taken = time.perf_counter() - start_time

    avg_accuracy = np.mean(scores)

    print(f"Dataset Size: {size_label}")
    print(f"Average 5-Fold CV Accuracy: {avg_accuracy:.4f}")
    print(f"Time Taken: {time_taken:.2f} seconds\n")

    return avg_accuracy, time_taken

In [9]:
# Dataset sizes to benchmark: powers of ten from 100 rows up to 10,000,000 rows
sizes = [10 ** exponent for exponent in range(2, 8)]

In [10]:
# Benchmark XGBoost on every dataset size, collecting one summary row per run

results = []

for sz in sizes:
    # The files were uploaded manually into the working directory, so a bare name is enough
    filelocation = f"synthetic_data_{sz}.rds"
    df = load_rds(filelocation)

    acc, timing = train_xgboost(df, size_label=sz)

    # Only the stored figures are rounded here; the row keys become the table's column names
    row = {
        'Method Used': 'XGBoost (Python, scikit-learn, 5-fold CV)',
        'Dataset Size': sz,
        'Testing-set Predictive Performance': round(acc, 4),
        'Time Taken (seconds)': round(timing, 2),
    }
    results.append(row)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Dataset Size: 100
Average 5-Fold CV Accuracy: 0.8800
Time Taken: 0.26 seconds



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Dataset Size: 1000
Average 5-Fold CV Accuracy: 0.9490
Time Taken: 0.32 seconds



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Dataset Size: 10000
Average 5-Fold CV Accuracy: 0.9722
Time Taken: 0.81 seconds



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Dataset Size: 100000
Average 5-Fold CV Accuracy: 0.9872
Time Taken: 5.82 seconds



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Dataset Size: 1000000
Average 5-Fold CV Accuracy: 0.9919
Time Taken: 41.72 seconds



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Dataset Size: 10000000
Average 5-Fold CV Accuracy: 0.9931
Time Taken: 443.42 seconds



In [11]:
# Assemble the per-size result rows into one table; the bare name on the
# last line lets the notebook render it with its rich display
results_df = pd.DataFrame(data=results)
results_df

Unnamed: 0,Method Used,Dataset Size,Testing-set Predictive Performance,Time Taken (seconds)
0,"XGBoost (Python, scikit-learn, 5-fold CV)",100,0.88,0.26
1,"XGBoost (Python, scikit-learn, 5-fold CV)",1000,0.949,0.32
2,"XGBoost (Python, scikit-learn, 5-fold CV)",10000,0.9722,0.81
3,"XGBoost (Python, scikit-learn, 5-fold CV)",100000,0.9872,5.82
4,"XGBoost (Python, scikit-learn, 5-fold CV)",1000000,0.9919,41.72
5,"XGBoost (Python, scikit-learn, 5-fold CV)",10000000,0.9931,443.42
