In [1]:
import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN

In [2]:
preprocess_file = 'C:/Users/syafi/Desktop/syafiq-project/classification-task/preprocessed_data.pkl'

# Load preprocessed data
with open(preprocess_file, "rb") as f:
    X_scaled, y, scaler = pickle.load(f)

In [3]:
# Number of classes
num_classes = len(np.unique(y))

# Class labels for all 11 classes
class_labels = [
    "Hypertension",
    "Cardiovascular Disease (CVD)",
    "Chronic Fatigue Syndrome (CFS)",
    "Stress-related Disorders",
    "Healthy",
    "Diabetes",
    "Anaemia",
    "Atherosclerosis",
    "Arrhythmia",
    "Respiratory Disease (COPD or Asthma)",
    "Autonomic Dysfunction"
]

# Using ADASYN instead of SMOTE
1. Adapts the sampling process based on the distribution of minority samples
- it creates more synthetic data for regions where minority samples are sparse or harder to learn.
- fewer samples are generated in well-represented regions of the minority class.
2. Reduces the risk of overfitting by focusing on under-represented regions, improving the classifier's generalisation.
3. Prioritises generating synthetic samples where the model struggles, potentially leading to more realistic samples for challenging cases.

In [4]:
# Applying ADASYN for balancing the class distribution
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_scaled, y)

# Display the class distribution after resampling
print("Class Distribution After ADASYN:")
print(pd.Series(y_resampled).value_counts())

Class Distribution After ADASYN:
Disease Classification
4     228888
6     221850
2     220409
1     219974
0     219956
5     219690
9     219643
3     219628
8     219610
10    218289
7     217810
Name: count, dtype: int64


In [5]:
# Function to evaluate XGBoost model
def evaluate_xgb_model(model, X_test, y_test):
    dtest = xgb.DMatrix(X_test)
    predictions = model.predict(dtest)

    # If predictions are probabilities, take the class with highest probability
    if predictions.ndim > 1:  # multi-class case
        predictions = np.argmax(predictions, axis=1)
    else:
        predictions = predictions.astype(int)

    accuracy = accuracy_score(y_test, predictions)
    unique_labels = np.unique(y_test)  # Get unique labels in the test set
    report = classification_report(
        y_test,
        predictions,
        target_names=class_labels,
        labels=unique_labels,
        zero_division=0,
    )

    return accuracy, report

In [6]:
def stratified_subsample(X, y, train_size, test_size):
    """
    Creates a stratified subset of data for training and testing based on sample sizes.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, test_size=test_size, stratify=y, random_state=42
    )
    return X_train, X_test, y_train, y_test

In [7]:
def hyperparameter_tuning(X, y):
    """
    Manually tune hyperparameters for XGBoost.
    """
    # Define hyperparameters to tune
    best_params = {
        "max_depth": 6,  # Example hyperparameters
        "learning_rate": 0.1,
        "n_estimators": 100,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "objective": "multi:softmax",  # Multi-class classification
        "num_class": len(np.unique(y)),  # Number of classes
    }

    # We can iterate over different combinations or grid search manually here
    return best_params

In [8]:
# Container for results
xgb_results = []
best_accuracy = 0
best_model = None
best_sample_size = 0

In [9]:
# Apply hyperparameter tuning on the resampled dataset
best_params = hyperparameter_tuning(X_resampled, y_resampled)
print(f"Best Hyperparameters: {best_params}")

# Iterative training loop
for sample_size in [25, 50, 75, 100, 250, 500, 750, 1000, 2500, 5000, 7500, 10000, 20000, 30000, 40000, 50000]:
    # Ensure sample size fits within available data
    if len(X_resampled) < sample_size:
        print(f"Skipping sample size {sample_size} due to insufficient data.")
        continue

    X_train_subset, X_test_subset, y_train_subset, y_test_subset = stratified_subsample(
        X_resampled, y_resampled, train_size=sample_size, test_size=sample_size // 2
    )

    # Train XGBoost model
    dtrain = xgb.DMatrix(X_train_subset, label=y_train_subset)
    dtest = xgb.DMatrix(X_test_subset, label=y_test_subset)
    xgb_model = xgb.train(best_params, dtrain, num_boost_round=100, verbose_eval=False)

    # Evaluate model
    accuracy, report = evaluate_xgb_model(xgb_model, X_test_subset, y_test_subset)
    print(f"Sample Size {sample_size}: Accuracy {accuracy:.4f}")
    print("Classification Report:")
    print(report)

    # Store results and track the best model
    xgb_results.append({"sample_size": sample_size, "accuracy": accuracy, "classification_report": report})
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = xgb_model
        best_sample_size = sample_size

Best Hyperparameters: {'max_depth': 6, 'learning_rate': 0.1, 'n_estimators': 100, 'subsample': 0.8, 'colsample_bytree': 0.8, 'objective': 'multi:softmax', 'num_class': 11}


Parameters: { "n_estimators" } are not used.



Sample Size 25: Accuracy 0.4167
Classification Report:
                                      precision    recall  f1-score   support

                        Hypertension       0.00      0.00      0.00         1
        Cardiovascular Disease (CVD)       0.00      0.00      0.00         1
      Chronic Fatigue Syndrome (CFS)       0.33      1.00      0.50         1
            Stress-related Disorders       0.33      1.00      0.50         1
                             Healthy       1.00      0.50      0.67         2
                            Diabetes       0.50      1.00      0.67         1
                             Anaemia       1.00      1.00      1.00         1
                     Atherosclerosis       0.00      0.00      0.00         1
                          Arrhythmia       0.00      0.00      0.00         1
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00         1
               Autonomic Dysfunction       0.00      0.00      0.00         1

       

Parameters: { "n_estimators" } are not used.



Sample Size 50: Accuracy 0.7200
Classification Report:
                                      precision    recall  f1-score   support

                        Hypertension       0.67      1.00      0.80         2
        Cardiovascular Disease (CVD)       0.50      0.50      0.50         2
      Chronic Fatigue Syndrome (CFS)       1.00      0.33      0.50         3
            Stress-related Disorders       0.67      1.00      0.80         2
                             Healthy       1.00      0.67      0.80         3
                            Diabetes       0.67      1.00      0.80         2
                             Anaemia       1.00      0.33      0.50         3
                     Atherosclerosis       0.50      1.00      0.67         2
                          Arrhythmia       1.00      1.00      1.00         2
Respiratory Disease (COPD or Asthma)       0.67      1.00      0.80         2
               Autonomic Dysfunction       1.00      0.50      0.67         2

       

Parameters: { "n_estimators" } are not used.



Sample Size 75: Accuracy 0.6486
Classification Report:
                                      precision    recall  f1-score   support

                        Hypertension       1.00      0.67      0.80         3
        Cardiovascular Disease (CVD)       1.00      0.75      0.86         4
      Chronic Fatigue Syndrome (CFS)       0.50      0.75      0.60         4
            Stress-related Disorders       1.00      1.00      1.00         3
                             Healthy       0.60      0.75      0.67         4
                            Diabetes       0.67      0.67      0.67         3
                             Anaemia       0.00      0.00      0.00         4
                     Atherosclerosis       0.50      1.00      0.67         3
                          Arrhythmia       1.00      0.67      0.80         3
Respiratory Disease (COPD or Asthma)       0.50      0.33      0.40         3
               Autonomic Dysfunction       0.50      0.67      0.57         3

       

Parameters: { "n_estimators" } are not used.



Sample Size 100: Accuracy 0.7000
Classification Report:
                                      precision    recall  f1-score   support

                        Hypertension       0.75      0.60      0.67         5
        Cardiovascular Disease (CVD)       0.67      0.80      0.73         5
      Chronic Fatigue Syndrome (CFS)       0.71      1.00      0.83         5
            Stress-related Disorders       1.00      1.00      1.00         4
                             Healthy       0.67      0.40      0.50         5
                            Diabetes       0.60      0.60      0.60         5
                             Anaemia       0.25      0.20      0.22         5
                     Atherosclerosis       1.00      1.00      1.00         4
                          Arrhythmia       1.00      0.50      0.67         4
Respiratory Disease (COPD or Asthma)       0.80      1.00      0.89         4
               Autonomic Dysfunction       0.50      0.75      0.60         4

      

Parameters: { "n_estimators" } are not used.



Sample Size 250: Accuracy 0.8960
Classification Report:
                                      precision    recall  f1-score   support

                        Hypertension       0.69      0.82      0.75        11
        Cardiovascular Disease (CVD)       0.91      0.83      0.87        12
      Chronic Fatigue Syndrome (CFS)       0.92      1.00      0.96        12
            Stress-related Disorders       1.00      1.00      1.00        11
                             Healthy       1.00      1.00      1.00        12
                            Diabetes       1.00      0.64      0.78        11
                             Anaemia       0.92      0.92      0.92        12
                     Atherosclerosis       0.79      1.00      0.88        11
                          Arrhythmia       1.00      0.91      0.95        11
Respiratory Disease (COPD or Asthma)       0.80      0.73      0.76        11
               Autonomic Dysfunction       0.92      1.00      0.96        11

      

Parameters: { "n_estimators" } are not used.



Sample Size 500: Accuracy 0.9520
Classification Report:
                                      precision    recall  f1-score   support

                        Hypertension       1.00      0.87      0.93        23
        Cardiovascular Disease (CVD)       0.96      0.96      0.96        23
      Chronic Fatigue Syndrome (CFS)       0.92      0.96      0.94        23
            Stress-related Disorders       1.00      0.83      0.90        23
                             Healthy       1.00      1.00      1.00        23
                            Diabetes       1.00      1.00      1.00        23
                             Anaemia       1.00      1.00      1.00        23
                     Atherosclerosis       0.92      1.00      0.96        22
                          Arrhythmia       0.95      0.95      0.95        22
Respiratory Disease (COPD or Asthma)       0.81      0.91      0.86        23
               Autonomic Dysfunction       0.96      1.00      0.98        22

      

Parameters: { "n_estimators" } are not used.



Sample Size 750: Accuracy 0.9920
Classification Report:
                                      precision    recall  f1-score   support

                        Hypertension       0.97      1.00      0.99        34
        Cardiovascular Disease (CVD)       1.00      1.00      1.00        34
      Chronic Fatigue Syndrome (CFS)       0.97      1.00      0.99        34
            Stress-related Disorders       1.00      1.00      1.00        34
                             Healthy       1.00      1.00      1.00        35
                            Diabetes       1.00      0.97      0.99        34
                             Anaemia       1.00      1.00      1.00        34
                     Atherosclerosis       1.00      0.97      0.99        34
                          Arrhythmia       1.00      1.00      1.00        34
Respiratory Disease (COPD or Asthma)       0.97      1.00      0.99        34
               Autonomic Dysfunction       1.00      0.97      0.99        34

      

Parameters: { "n_estimators" } are not used.



Sample Size 1000: Accuracy 0.9880
Classification Report:
                                      precision    recall  f1-score   support

                        Hypertension       1.00      0.96      0.98        45
        Cardiovascular Disease (CVD)       1.00      1.00      1.00        46
      Chronic Fatigue Syndrome (CFS)       0.98      1.00      0.99        46
            Stress-related Disorders       0.98      0.98      0.98        45
                             Healthy       0.98      1.00      0.99        47
                            Diabetes       1.00      1.00      1.00        45
                             Anaemia       1.00      1.00      1.00        46
                     Atherosclerosis       0.98      0.98      0.98        45
                          Arrhythmia       1.00      0.98      0.99        45
Respiratory Disease (COPD or Asthma)       0.98      0.98      0.98        45
               Autonomic Dysfunction       0.98      1.00      0.99        45

     

Parameters: { "n_estimators" } are not used.



Sample Size 2500: Accuracy 0.9960
Classification Report:
                                      precision    recall  f1-score   support

                        Hypertension       1.00      1.00      1.00       113
        Cardiovascular Disease (CVD)       1.00      1.00      1.00       114
      Chronic Fatigue Syndrome (CFS)       0.99      0.99      0.99       114
            Stress-related Disorders       1.00      1.00      1.00       113
                             Healthy       0.99      0.99      0.99       118
                            Diabetes       0.99      0.99      0.99       113
                             Anaemia       1.00      1.00      1.00       114
                     Atherosclerosis       1.00      1.00      1.00       112
                          Arrhythmia       0.98      0.98      0.98       113
Respiratory Disease (COPD or Asthma)       1.00      1.00      1.00       113
               Autonomic Dysfunction       1.00      1.00      1.00       113

     

Parameters: { "n_estimators" } are not used.



Sample Size 5000: Accuracy 0.9984
Classification Report:
                                      precision    recall  f1-score   support

                        Hypertension       1.00      1.00      1.00       227
        Cardiovascular Disease (CVD)       1.00      0.99      1.00       227
      Chronic Fatigue Syndrome (CFS)       1.00      1.00      1.00       227
            Stress-related Disorders       1.00      1.00      1.00       226
                             Healthy       1.00      1.00      1.00       236
                            Diabetes       1.00      1.00      1.00       226
                             Anaemia       1.00      1.00      1.00       229
                     Atherosclerosis       1.00      1.00      1.00       225
                          Arrhythmia       1.00      1.00      1.00       226
Respiratory Disease (COPD or Asthma)       1.00      1.00      1.00       226
               Autonomic Dysfunction       0.99      1.00      0.99       225

     

Parameters: { "n_estimators" } are not used.



Sample Size 7500: Accuracy 0.9955
Classification Report:
                                      precision    recall  f1-score   support

                        Hypertension       1.00      1.00      1.00       340
        Cardiovascular Disease (CVD)       1.00      1.00      1.00       340
      Chronic Fatigue Syndrome (CFS)       1.00      0.99      1.00       341
            Stress-related Disorders       1.00      1.00      1.00       339
                             Healthy       0.99      0.99      0.99       354
                            Diabetes       0.99      1.00      1.00       340
                             Anaemia       1.00      1.00      1.00       343
                     Atherosclerosis       0.99      1.00      1.00       337
                          Arrhythmia       0.99      0.98      0.99       339
Respiratory Disease (COPD or Asthma)       1.00      0.99      0.99       340
               Autonomic Dysfunction       1.00      1.00      1.00       337

     

Parameters: { "n_estimators" } are not used.



Sample Size 10000: Accuracy 0.9976
Classification Report:
                                      precision    recall  f1-score   support

                        Hypertension       1.00      1.00      1.00       453
        Cardiovascular Disease (CVD)       1.00      1.00      1.00       453
      Chronic Fatigue Syndrome (CFS)       1.00      1.00      1.00       454
            Stress-related Disorders       1.00      1.00      1.00       453
                             Healthy       1.00      1.00      1.00       472
                            Diabetes       1.00      1.00      1.00       453
                             Anaemia       1.00      1.00      1.00       457
                     Atherosclerosis       1.00      1.00      1.00       449
                          Arrhythmia       0.99      0.99      0.99       453
Respiratory Disease (COPD or Asthma)       1.00      0.99      1.00       453
               Autonomic Dysfunction       1.00      1.00      1.00       450

    

Parameters: { "n_estimators" } are not used.



Sample Size 20000: Accuracy 0.9981
Classification Report:
                                      precision    recall  f1-score   support

                        Hypertension       1.00      1.00      1.00       907
        Cardiovascular Disease (CVD)       1.00      1.00      1.00       907
      Chronic Fatigue Syndrome (CFS)       1.00      1.00      1.00       909
            Stress-related Disorders       1.00      1.00      1.00       905
                             Healthy       1.00      0.99      1.00       944
                            Diabetes       1.00      1.00      1.00       906
                             Anaemia       1.00      1.00      1.00       914
                     Atherosclerosis       1.00      1.00      1.00       898
                          Arrhythmia       0.99      1.00      0.99       905
Respiratory Disease (COPD or Asthma)       1.00      1.00      1.00       905
               Autonomic Dysfunction       1.00      1.00      1.00       900

    

Parameters: { "n_estimators" } are not used.



Sample Size 30000: Accuracy 0.9993
Classification Report:
                                      precision    recall  f1-score   support

                        Hypertension       1.00      1.00      1.00      1360
        Cardiovascular Disease (CVD)       1.00      1.00      1.00      1360
      Chronic Fatigue Syndrome (CFS)       1.00      1.00      1.00      1363
            Stress-related Disorders       1.00      1.00      1.00      1358
                             Healthy       1.00      1.00      1.00      1415
                            Diabetes       1.00      1.00      1.00      1359
                             Anaemia       1.00      1.00      1.00      1372
                     Atherosclerosis       1.00      1.00      1.00      1347
                          Arrhythmia       1.00      1.00      1.00      1358
Respiratory Disease (COPD or Asthma)       1.00      1.00      1.00      1358
               Autonomic Dysfunction       1.00      1.00      1.00      1350

    

Parameters: { "n_estimators" } are not used.



Sample Size 40000: Accuracy 0.9978
Classification Report:
                                      precision    recall  f1-score   support

                        Hypertension       1.00      1.00      1.00      1813
        Cardiovascular Disease (CVD)       1.00      1.00      1.00      1814
      Chronic Fatigue Syndrome (CFS)       1.00      1.00      1.00      1817
            Stress-related Disorders       1.00      1.00      1.00      1811
                             Healthy       1.00      0.99      0.99      1887
                            Diabetes       1.00      1.00      1.00      1811
                             Anaemia       1.00      1.00      1.00      1829
                     Atherosclerosis       1.00      1.00      1.00      1796
                          Arrhythmia       0.99      1.00      0.99      1811
Respiratory Disease (COPD or Asthma)       1.00      1.00      1.00      1811
               Autonomic Dysfunction       1.00      1.00      1.00      1800

    

Parameters: { "n_estimators" } are not used.



Sample Size 50000: Accuracy 0.9981
Classification Report:
                                      precision    recall  f1-score   support

                        Hypertension       1.00      1.00      1.00      2267
        Cardiovascular Disease (CVD)       1.00      1.00      1.00      2267
      Chronic Fatigue Syndrome (CFS)       1.00      1.00      1.00      2272
            Stress-related Disorders       1.00      1.00      1.00      2263
                             Healthy       1.00      0.99      1.00      2359
                            Diabetes       1.00      1.00      1.00      2264
                             Anaemia       1.00      1.00      1.00      2286
                     Atherosclerosis       1.00      1.00      1.00      2245
                          Arrhythmia       0.99      1.00      0.99      2263
Respiratory Disease (COPD or Asthma)       1.00      1.00      1.00      2264
               Autonomic Dysfunction       1.00      1.00      1.00      2250

    

In [10]:
# Save the best model
if best_model is not None:
    best_model_file = f"best_xgb_model_sample_size_{best_sample_size}.pkl"
    with open(best_model_file, "wb") as model_file:
        pickle.dump(best_model, model_file)
    print(f"\nBest model saved as {best_model_file} with accuracy {best_accuracy:.4f}")


Best model saved as best_xgb_model_sample_size_30000.pkl with accuracy 0.9993
