In [9]:
# Import necessary libraries
import numpy as np
import json
import pandas as pd
from joblib import dump
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Read the pickled preprocessing output and unpack the three entries the cells
# below rely on: the feature matrix, the target vector and the class names.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
preprocessed_data = pd.read_pickle("preprocessed_data.pkl")
X_scaled, y_resampled, class_labels = (
    preprocessed_data[field] for field in ("X_scaled", "y_resampled", "class_labels")
)

In [3]:
# Score a fitted classifier on a held-out set
def evaluate_xgb_model(model, X_test, y_test, class_labels):
    """Predict on ``X_test`` and summarise the results against ``y_test``.

    Args:
        model: Fitted estimator; only its ``predict`` method is used.
        X_test: Features handed to ``model.predict``.
        y_test: True labels for ``X_test``.
        class_labels: Display names passed to the report as ``target_names``.

    Returns:
        A ``(accuracy, report)`` tuple: the value of ``accuracy_score`` and the
        output of ``classification_report``.
    """
    y_pred = model.predict(X_test)

    # The report is limited to the labels that occur in y_test; undefined
    # precision/recall values are reported as 0 rather than raising a warning.
    # NOTE(review): target_names is matched positionally to those labels, so
    # this assumes class_labels lines up with np.unique(y_test) - confirm.
    report_options = {
        "target_names": class_labels,
        "labels": np.unique(y_test),
        "zero_division": 0,
    }

    return (
        accuracy_score(y_test, y_pred),
        classification_report(y_test, y_pred, **report_options),
    )

In [4]:
# Draw a fixed-size, class-stratified train/test subsample
def stratified_subsample(X, y, train_size, test_size):
    """Split ``X``/``y`` into stratified train and test subsets of given sizes.

    Args:
        X: Feature collection; must support ``len``.
        y: Labels, also used as the stratification key.
        train_size: Number of samples requested for the training subset.
        test_size: Number of samples requested for the test subset. It is
            raised to the number of distinct labels in ``y`` when smaller.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as produced by
        ``train_test_split`` with ``random_state=42``.

    Raises:
        ValueError: If ``X`` holds fewer than ``train_size + test_size``
            samples (after the test size adjustment).
    """
    # Never hold out fewer samples than there are distinct classes.
    test_size = max(test_size, len(np.unique(y)))

    if train_size + test_size > len(X):
        raise ValueError(
            f"Not enough data to create train/test split: train_size={train_size}, "
            f"test_size={test_size}, total_samples={len(X)}."
        )

    split = train_test_split(
        X,
        y,
        train_size=train_size,
        test_size=test_size,
        stratify=y,
        random_state=42,
    )
    X_train, X_test, y_train, y_test = split
    return X_train, X_test, y_train, y_test

In [None]:
# Total sample sizes to benchmark; each total is divided 80% train / 20% test.
sample_sizes = [
    25, 50, 75, 100,
    250, 500, 750, 1_000,
    2_500, 5_000, 7_500, 10_000,
    20_000, 30_000, 40_000, 50_000,
]
# Every total is a multiple of 5, so the integer divisions below are exact.
train_sizes = [total * 4 // 5 for total in sample_sizes]
test_sizes = [total // 5 for total in sample_sizes]

# Per-run results are appended to this list by the training loop
metrics = []

In [None]:
# Train and evaluate one XGBoost model per sample size.
# Start from an empty list so re-running this cell does not duplicate entries.
metrics = []

# Loop-invariant: the class count is taken from the full label vector.
n_classes = len(np.unique(y_resampled))

for sample_size, train_size, test_size in zip(sample_sizes, train_sizes, test_sizes):
    # Any ValueError raised while splitting, fitting or scoring causes this
    # sample size to be reported and skipped instead of aborting the sweep.
    try:
        print(
            f"Processing Sample Size: {sample_size} (Train: {train_size}, Test: {test_size})"
        )

        # Subsample data
        X_train, X_test, y_train, y_test = stratified_subsample(
            X_scaled, y_resampled, train_size, test_size
        )

        # stratified_subsample raises the test size to the number of classes
        # when the requested value is smaller, so measure what was returned
        # rather than trusting the requested figure.
        actual_test_size = len(y_test)
        if actual_test_size != test_size:
            print(
                f"Note: test size adjusted from {test_size} to {actual_test_size} "
                f"for sample size {sample_size}"
            )

        # Train XGBoost model
        xgb_model = XGBClassifier(
            max_depth=6,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            objective="multi:softmax",
            num_class=n_classes,
            eval_metric="mlogloss",
        )
        xgb_model.fit(X_train, y_train)

        # Evaluate model
        accuracy, report = evaluate_xgb_model(xgb_model, X_test, y_test, class_labels)
        print(f"Sample Size {sample_size}: Accuracy {accuracy:.4f}")
        print("Classification Report:")
        print(report)

        # Store metrics; "test_size" is the number of samples actually scored,
        # and the originally requested value is kept alongside it.
        metrics.append(
            {
                "sample_size": sample_size,
                "train_size": train_size,
                "test_size": actual_test_size,
                "requested_test_size": test_size,
                "accuracy": accuracy,
                "classification_report": report,
            }
        )
    except ValueError as e:
        print(f"Skipping sample size {sample_size}: {e}")

Processing Sample Size: 25 (Train: 20, Test: 5)
Sample Size 25: Accuracy 0.1818
Classification Report:
                                      precision    recall  f1-score   support

                        Hypertension       0.50      1.00      0.67         1
        Cardiovascular Disease (CVD)       0.00      0.00      0.00         1
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00         1
            Stress-related Disorders       0.00      0.00      0.00         1
                             Healthy       0.00      0.00      0.00         1
                            Diabetes       1.00      1.00      1.00         1
                             Anaemia       0.00      0.00      0.00         1
                     Atherosclerosis       0.00      0.00      0.00         1
                          Arrhythmia       0.00      0.00      0.00         1
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00         1
               Autonomic Dysfunction  

In [10]:
# Persist whatever classifier is currently bound to `xgb_model` (the training
# loop rebinds it on every iteration, so this is the most recently built one).
dump(value=xgb_model, filename="xgb_model.joblib")
print("Model saved successfully as 'xgb_model.joblib'")

Model saved successfully as 'xgb_model.joblib'
