In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
import pickle

In [2]:
# Load preprocessed data
preprocessed_data = pd.read_pickle("C:/Users/syafi/Desktop/syafiq-project/classification-task/model/saved_data/preprocessed_data.pkl")
X_scaled = preprocessed_data["X_scaled"]
y_resampled = preprocessed_data["y_resampled"]
class_labels = preprocessed_data["class_labels"]

In [3]:
sample_sizes = [25, 50, 75, 100, 250, 500, 750, 1000, 2500, 5000, 7500, 10000, 20000, 30000, 40000, 50000]

In [4]:
# Function to create a stratified subsample
def stratified_subsample(X, y, train_size, random_state=42):
    if len(X) < train_size:
        raise ValueError(f"Insufficient data for sample size {train_size}.")
    X_sample, _, y_sample, _ = train_test_split(
        X, y, train_size=train_size, stratify=y, random_state=random_state
    )
    return X_sample, y_sample

In [5]:
# Parameter grid for GridSearchCV
param_grid = {
    "n_estimators": [50, 100, 150],
    "learning_rate": [0.01, 0.1, 1.0],
}

In [6]:
# Step 1: Initialize and run GridSearchCV
grid_search = GridSearchCV(
    estimator=AdaBoostClassifier(random_state=42),
    param_grid=param_grid,
    scoring="accuracy",
    cv=StratifiedKFold(3),
    verbose=2,
    n_jobs=-1,
)

# Create a sample for hyperparameter tuning
X_sample, y_sample = stratified_subsample(X_scaled, y_resampled, train_size=50000)
grid_search.fit(X_sample, y_sample)

# Extract the best parameters
best_params = grid_search.best_params_
print("Best Parameters Found by GridSearchCV:")
print(best_params)

Fitting 3 folds for each of 9 candidates, totalling 27 fits




Best Parameters Found by GridSearchCV:
{'learning_rate': 0.1, 'n_estimators': 50}


In [7]:
# Step 2: Evaluate across sample sizes
results = {}
for sample_size in sample_sizes:
    try:
        X_sampled, y_sampled = stratified_subsample(X_scaled, y_resampled, sample_size)

        # Split data into training and testing
        test_size = max(0.2, len(class_labels) / len(y_sampled))
        stratify = y_sampled if len(y_sampled) >= len(class_labels) else None

        X_train, X_test, y_train, y_test = train_test_split(
            X_sampled, y_sampled, test_size=0.2, stratify=stratify, random_state=42
        )

        # Use the best parameters for training
        ada_model = AdaBoostClassifier(**best_params, random_state=42)
        ada_model.fit(X_train, y_train)

        # Evaluate the model
        y_pred = ada_model.predict(X_test)
        print(f"Classification Report for Sample Size {sample_size}:\n")
        print(classification_report(y_test, y_pred, target_names=class_labels))

        # Save results, e.g., accuracy
        results[sample_size] = accuracy_score(y_test, y_pred)

    except ValueError as e:
        print(f"Skipping sample size {sample_size}: {e}")

Skipping sample size 25: The test_size = 5 should be greater or equal to the number of classes = 11
Skipping sample size 50: The test_size = 10 should be greater or equal to the number of classes = 11


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report for Sample Size 75:

                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00         1
                          Arrhythmia       1.00      1.00      1.00         1
                     Atherosclerosis       0.00      0.00      0.00         2
               Autonomic Dysfunction       0.33      1.00      0.50         1
        Cardiovascular Disease (CVD)       0.50      0.50      0.50         2
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00         2
                            Diabetes       0.33      1.00      0.50         1
                             Healthy       0.00      0.00      0.00         1
                        Hypertension       0.00      0.00      0.00         2
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00         1
            Stress-related Disorders       0.25      1.00      0.40         1

                   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report for Sample Size 100:

                                      precision    recall  f1-score   support

                             Anaemia       0.50      0.50      0.50         2
                          Arrhythmia       0.00      0.00      0.00         1
                     Atherosclerosis       0.00      0.00      0.00         2
               Autonomic Dysfunction       0.00      0.00      0.00         2
        Cardiovascular Disease (CVD)       0.50      0.50      0.50         2
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00         2
                            Diabetes       1.00      0.50      0.67         2
                             Healthy       0.50      1.00      0.67         2
                        Hypertension       0.00      0.00      0.00         2
Respiratory Disease (COPD or Asthma)       1.00      1.00      1.00         1
            Stress-related Disorders       0.33      1.00      0.50         2

                  



Classification Report for Sample Size 250:

                                      precision    recall  f1-score   support

                             Anaemia       0.83      1.00      0.91         5
                          Arrhythmia       0.50      0.25      0.33         4
                     Atherosclerosis       0.83      1.00      0.91         5
               Autonomic Dysfunction       1.00      0.75      0.86         4
        Cardiovascular Disease (CVD)       0.75      0.75      0.75         4
      Chronic Fatigue Syndrome (CFS)       1.00      0.80      0.89         5
                            Diabetes       1.00      1.00      1.00         4
                             Healthy       1.00      1.00      1.00         4
                        Hypertension       0.50      1.00      0.67         5
Respiratory Disease (COPD or Asthma)       1.00      0.80      0.89         5
            Stress-related Disorders       1.00      0.60      0.75         5

                  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report for Sample Size 500:

                                      precision    recall  f1-score   support

                             Anaemia       0.83      0.56      0.67         9
                          Arrhythmia       0.00      0.00      0.00         9
                     Atherosclerosis       0.00      0.00      0.00         9
               Autonomic Dysfunction       1.00      1.00      1.00         9
        Cardiovascular Disease (CVD)       1.00      0.78      0.88         9
      Chronic Fatigue Syndrome (CFS)       0.42      0.56      0.48         9
                            Diabetes       1.00      0.33      0.50         9
                             Healthy       1.00      0.89      0.94         9
                        Hypertension       0.26      0.78      0.39         9
Respiratory Disease (COPD or Asthma)       0.62      1.00      0.77        10
            Stress-related Disorders       0.25      0.33      0.29         9

                  



Classification Report for Sample Size 750:

                                      precision    recall  f1-score   support

                             Anaemia       0.93      1.00      0.97        14
                          Arrhythmia       1.00      0.23      0.38        13
                     Atherosclerosis       0.81      1.00      0.90        13
               Autonomic Dysfunction       0.93      1.00      0.97        14
        Cardiovascular Disease (CVD)       1.00      0.86      0.92        14
      Chronic Fatigue Syndrome (CFS)       0.59      0.71      0.65        14
                            Diabetes       0.86      0.86      0.86        14
                             Healthy       1.00      0.92      0.96        13
                        Hypertension       0.67      0.71      0.69        14
Respiratory Disease (COPD or Asthma)       0.75      0.92      0.83        13
            Stress-related Disorders       0.80      0.86      0.83        14

                  



Classification Report for Sample Size 1000:

                                      precision    recall  f1-score   support

                             Anaemia       1.00      0.83      0.91        18
                          Arrhythmia       0.50      0.06      0.10        18
                     Atherosclerosis       0.73      0.89      0.80        18
               Autonomic Dysfunction       1.00      1.00      1.00        19
        Cardiovascular Disease (CVD)       0.94      0.83      0.88        18
      Chronic Fatigue Syndrome (CFS)       0.79      0.61      0.69        18
                            Diabetes       1.00      0.83      0.91        18
                             Healthy       1.00      1.00      1.00        19
                        Hypertension       0.30      0.67      0.41        18
Respiratory Disease (COPD or Asthma)       0.82      1.00      0.90        18
            Stress-related Disorders       0.69      0.61      0.65        18

                 



Classification Report for Sample Size 2500:

                                      precision    recall  f1-score   support

                             Anaemia       0.96      1.00      0.98        46
                          Arrhythmia       1.00      0.76      0.86        45
                     Atherosclerosis       0.80      0.82      0.81        45
               Autonomic Dysfunction       1.00      1.00      1.00        45
        Cardiovascular Disease (CVD)       1.00      0.96      0.98        46
      Chronic Fatigue Syndrome (CFS)       0.94      0.74      0.83        46
                            Diabetes       1.00      1.00      1.00        45
                             Healthy       1.00      1.00      1.00        45
                        Hypertension       0.62      0.72      0.67        46
Respiratory Disease (COPD or Asthma)       0.85      1.00      0.92        46
            Stress-related Disorders       0.74      0.82      0.78        45

                 



Classification Report for Sample Size 5000:

                                      precision    recall  f1-score   support

                             Anaemia       0.93      0.99      0.96        91
                          Arrhythmia       1.00      0.58      0.74        91
                     Atherosclerosis       0.78      0.81      0.80        91
               Autonomic Dysfunction       0.94      1.00      0.97        91
        Cardiovascular Disease (CVD)       0.88      0.93      0.90        90
      Chronic Fatigue Syndrome (CFS)       0.61      0.66      0.63        91
                            Diabetes       1.00      0.77      0.87        91
                             Healthy       1.00      1.00      1.00        91
                        Hypertension       0.68      0.79      0.73        91
Respiratory Disease (COPD or Asthma)       0.84      0.95      0.89        91
            Stress-related Disorders       0.82      0.85      0.83        91

                 



Classification Report for Sample Size 7500:

                                      precision    recall  f1-score   support

                             Anaemia       0.98      0.93      0.96       136
                          Arrhythmia       0.00      0.00      0.00       136
                     Atherosclerosis       0.80      0.80      0.80       137
               Autonomic Dysfunction       1.00      1.00      1.00       136
        Cardiovascular Disease (CVD)       0.83      0.99      0.91       137
      Chronic Fatigue Syndrome (CFS)       0.72      0.58      0.65       137
                            Diabetes       1.00      0.81      0.89       136
                             Healthy       1.00      1.00      1.00       136
                        Hypertension       0.31      0.77      0.44       137
Respiratory Disease (COPD or Asthma)       0.81      0.99      0.89       136
            Stress-related Disorders       0.79      0.41      0.54       136

                 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report for Sample Size 10000:

                                      precision    recall  f1-score   support

                             Anaemia       0.99      0.96      0.97       182
                          Arrhythmia       1.00      0.69      0.82       181
                     Atherosclerosis       0.79      0.91      0.85       182
               Autonomic Dysfunction       1.00      1.00      1.00       182
        Cardiovascular Disease (CVD)       0.94      1.00      0.97       182
      Chronic Fatigue Syndrome (CFS)       1.00      0.85      0.92       182
                            Diabetes       1.00      0.94      0.97       182
                             Healthy       0.99      1.00      1.00       182
                        Hypertension       0.69      0.92      0.79       182
Respiratory Disease (COPD or Asthma)       0.85      0.94      0.89       181
            Stress-related Disorders       0.98      0.87      0.92       182

                



Classification Report for Sample Size 20000:

                                      precision    recall  f1-score   support

                             Anaemia       0.97      0.96      0.96       364
                          Arrhythmia       1.00      0.71      0.83       363
                     Atherosclerosis       0.79      0.97      0.87       363
               Autonomic Dysfunction       0.95      1.00      0.98       364
        Cardiovascular Disease (CVD)       0.99      0.99      0.99       364
      Chronic Fatigue Syndrome (CFS)       0.91      0.89      0.90       364
                            Diabetes       1.00      0.99      0.99       364
                             Healthy       0.99      0.99      0.99       363
                        Hypertension       0.60      0.83      0.70       364
Respiratory Disease (COPD or Asthma)       0.95      0.79      0.86       363
            Stress-related Disorders       0.96      0.81      0.88       364

                



Classification Report for Sample Size 30000:

                                      precision    recall  f1-score   support

                             Anaemia       0.97      0.98      0.98       546
                          Arrhythmia       1.00      0.70      0.83       545
                     Atherosclerosis       0.79      0.94      0.86       545
               Autonomic Dysfunction       0.98      1.00      0.99       545
        Cardiovascular Disease (CVD)       0.97      0.99      0.98       546
      Chronic Fatigue Syndrome (CFS)       0.96      0.86      0.91       546
                            Diabetes       1.00      0.98      0.99       545
                             Healthy       0.99      1.00      0.99       545
                        Hypertension       0.62      0.85      0.72       546
Respiratory Disease (COPD or Asthma)       0.90      0.83      0.86       546
            Stress-related Disorders       0.94      0.84      0.89       545

                



Classification Report for Sample Size 40000:

                                      precision    recall  f1-score   support

                             Anaemia       0.96      0.94      0.95       728
                          Arrhythmia       1.00      0.69      0.82       727
                     Atherosclerosis       0.77      0.93      0.85       727
               Autonomic Dysfunction       0.99      1.00      0.99       727
        Cardiovascular Disease (CVD)       0.89      1.00      0.94       727
      Chronic Fatigue Syndrome (CFS)       0.96      0.88      0.92       727
                            Diabetes       1.00      0.87      0.93       727
                             Healthy       0.99      1.00      0.99       727
                        Hypertension       0.66      0.82      0.73       727
Respiratory Disease (COPD or Asthma)       0.87      0.92      0.89       728
            Stress-related Disorders       0.96      0.85      0.90       728

                



Classification Report for Sample Size 50000:

                                      precision    recall  f1-score   support

                             Anaemia       0.98      0.92      0.95       909
                          Arrhythmia       1.00      0.71      0.83       909
                     Atherosclerosis       0.75      0.96      0.84       909
               Autonomic Dysfunction       0.97      1.00      0.99       909
        Cardiovascular Disease (CVD)       0.92      1.00      0.96       909
      Chronic Fatigue Syndrome (CFS)       0.93      0.91      0.92       909
                            Diabetes       1.00      0.92      0.96       909
                             Healthy       1.00      1.00      1.00       909
                        Hypertension       0.66      0.82      0.73       909
Respiratory Disease (COPD or Asthma)       0.91      0.87      0.89       910
            Stress-related Disorders       0.95      0.82      0.88       909

                

In [None]:
# Save the AdaBoost model
with open("adaboost_model.pkl", "wb") as f:
    pickle.dump(ada_model, f)

In [None]:
# Save results to a CSV file
results_df = pd.DataFrame(results)
results_df.to_csv("adaboost_results.csv", index=False)
print("\nResults saved to 'adaboost_results.csv'")