In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score
import pickle

In [2]:
# Load the preprocessed data
# NOTE(review): absolute, machine-specific location. It is kept in a single
# named constant so it can be changed in one place when the notebook is run
# on another machine.
PREPROCESSED_DATA_PATH = r"C:\Users\syafi\Desktop\syafiq-project\new classification task\model\saved_data\preprocessed_data.pkl"

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load pickles that were produced by a trusted preprocessing run.
with open(PREPROCESSED_DATA_PATH, "rb") as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

# Fail fast if features and labels are misaligned, instead of failing later
# inside model fitting or scoring.
assert len(X_train) == len(y_train), "X_train / y_train length mismatch"
assert len(X_test) == len(y_test), "X_test / y_test length mismatch"

# Display train and test size
print(f"Train size: {len(X_train)}")
print(f"Test size: {len(X_test)}")

Train size: 1177208
Test size: 294302


In [3]:
# Score a fitted model on a test subset
def evaluate_model(model, X_test_subset, y_test_subset, class_labels):
    """Predict on a test subset and summarise the quality of the predictions.

    The report is restricted to the classes that actually occur in
    ``y_test_subset``; their display names are looked up in ``class_labels``
    by using each class value as an index.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test_subset: Feature rows passed to ``model.predict``.
        y_test_subset: True labels for those rows.
        class_labels: Indexable collection mapping a class value to its name.

    Returns:
        Tuple ``(accuracy, report, predictions, unique_classes,
        dynamic_labels)`` where ``report`` is the dictionary form of the
        classification report.
    """
    predictions = model.predict(X_test_subset)
    accuracy = accuracy_score(y_test_subset, predictions)

    # Only classes present in the true labels are reported.
    unique_classes = np.unique(y_test_subset)
    dynamic_labels = []
    for class_index in unique_classes:
        dynamic_labels.append(class_labels[class_index])

    report_options = {
        "labels": unique_classes,
        "target_names": dynamic_labels,
        "zero_division": 0,
        "output_dict": True,
    }
    report = classification_report(y_test_subset, predictions, **report_options)

    return accuracy, report, predictions, unique_classes, dynamic_labels

In [4]:
# Turn a nested classification report into a single flat record
def flatten_classification_report(report, sample_size, train_size, test_size, accuracy):
    """Flatten a nested report dictionary into one ``{column: value}`` record.

    Every entry of ``report`` whose value is itself a dictionary contributes
    one key per metric, named ``"<label>_<metric>"``. Entries whose value is
    not a dictionary are skipped. The run metadata is then stored under the
    keys ``sample_size``, ``train_size``, ``test_size`` and ``accuracy``.

    Args:
        report: Mapping of label -> metrics mapping (or a non-dict value).
        sample_size: Total sample count of the run.
        train_size: Number of training rows of the run.
        test_size: Number of test rows of the run.
        accuracy: Accuracy of the run.

    Returns:
        A flat dictionary suitable for use as one row of a results table.
    """
    flat_report = {}
    for label, metrics in report.items():
        if not isinstance(metrics, dict):
            continue
        for metric, value in metrics.items():
            flat_report[f"{label}_{metric}"] = value

    flat_report["sample_size"] = sample_size
    flat_report["train_size"] = train_size
    flat_report["test_size"] = test_size
    flat_report["accuracy"] = accuracy
    return flat_report

In [5]:
# Duplicate every test sample a fixed number of times
def expand_test_set(X_test, y_test, repeat_factor):
    """Repeat each test row and its label ``repeat_factor`` times.

    Copies of a sample are placed next to each other, so features and labels
    stay aligned. No new information is added: the result contains only
    duplicates of the rows that were passed in.

    Args:
        X_test: Test features; repeated along the first axis.
        y_test: Test labels; repeated element-wise.
        repeat_factor: Number of copies of each sample in the output.

    Returns:
        Tuple ``(X_test_expanded, y_test_expanded)``.
    """
    expanded_features = np.repeat(X_test, repeat_factor, axis=0)
    expanded_targets = np.repeat(y_test, repeat_factor)
    return expanded_features, expanded_targets

In [6]:
# Experiment grid: every total sample size is split 80/20 into train/test counts.
# The split uses integer arithmetic, so sample sizes should stay multiples of 5
# for train + test to add up to the total.
sample_sizes = [
    25, 50, 75, 100,
    250, 500, 750, 1000,
    2500, 5000, 7500, 10000,
    20000, 30000, 40000, 50000,
]
training_sizes = [size * 4 // 5 for size in sample_sizes]
testing_sizes = [size // 5 for size in sample_sizes]

# Display names of the classes; position in the list is used as the class index
class_labels = [
    "Atherosclerosis",
    "Hypertension",
    "Cardiovascular Disease (CVD)",
    "Chronic Fatigue Syndrome (CFS)",
    "Respiratory Disease (COPD or Asthma)",
    "Stress-related Disorders",
    "Arrhythmias",
    "Healthy",
    "Autonomic Dysfunction",
    "Diabetes",
    "Anaemia",
]

In [7]:
# Container for results
results = []
best_accuracy = 0
best_model = None
best_sample_size = 0

# Number of copies made of each test row before scoring (loop-invariant, so
# it is set once here rather than on every iteration).
# NOTE(review): duplicating rows multiplies each class's support by this
# factor; for a deterministic predict() it should leave accuracy, precision
# and recall unchanged. Confirm the expansion is actually required.
repeat_factor = 11

# Fixed hold-out used ONLY to rank the candidate models. The per-run test
# subsets range from a handful of rows to thousands, so accuracies measured on
# them are not comparable with each other: a model scored on 5-20 rows can
# "win" purely by chance. Ranking every model on the same (largest) test slice
# makes the comparison like-for-like.
# NOTE(review): this still selects on test data; a separate validation split
# would be cleaner if one is available.
selection_size = max(testing_sizes, default=0)
X_selection, y_selection = X_test[:selection_size], y_test[:selection_size]

# Iterate through sample, training, and testing sizes
for sample_size, train_size, test_size in zip(
    sample_sizes, training_sizes, testing_sizes
):
    # Select subset of training and testing data (first N rows of each split)
    X_train_subset, y_train_subset = X_train[:train_size], y_train[:train_size]
    X_test_subset, y_test_subset = X_test[:test_size], y_test[:test_size]

    # Duplicate each test row `repeat_factor` times
    X_test_expanded, y_test_expanded = expand_test_set(
        X_test_subset, y_test_subset, repeat_factor
    )

    # Fit the AdaBoost model
    ada_model = AdaBoostClassifier(n_estimators=50, random_state=42)
    ada_model.fit(X_train_subset, y_train_subset)

    # Evaluate the model on this run's own (expanded) test subset
    accuracy, report, predictions, unique_classes, dynamic_labels = evaluate_model(
        ada_model, X_test_expanded, y_test_expanded, class_labels
    )

    # Print results for the current iteration
    print(f"\nAdaBoost Sample size {sample_size} - Accuracy: {accuracy:.4f}")
    print(
        classification_report(
            y_test_expanded,
            predictions,
            target_names=dynamic_labels,
            labels=unique_classes,
            zero_division=0,
        )
    )

    # Flatten report and store results (per-run metrics are unchanged)
    flat_report = flatten_classification_report(
        report, sample_size, train_size, test_size, accuracy
    )
    results.append(flat_report)

    # Update the best model, ranked on the common hold-out rather than on
    # this run's own test subset.
    selection_accuracy = accuracy_score(y_selection, ada_model.predict(X_selection))
    if selection_accuracy > best_accuracy:
        best_accuracy = selection_accuracy
        best_model = ada_model
        best_sample_size = sample_size
        print(
            f"New best model found for sample size {sample_size} with accuracy {selection_accuracy:.4f}"
        )




AdaBoost Sample size 25 - Accuracy: 0.0000
                                precision    recall  f1-score   support

  Cardiovascular Disease (CVD)       0.00      0.00      0.00      11.0
Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00      11.0
      Stress-related Disorders       0.00      0.00      0.00      11.0
                   Arrhythmias       0.00      0.00      0.00      11.0
                      Diabetes       0.00      0.00      0.00      11.0

                     micro avg       0.00      0.00      0.00      55.0
                     macro avg       0.00      0.00      0.00      55.0
                  weighted avg       0.00      0.00      0.00      55.0


AdaBoost Sample size 50 - Accuracy: 0.2000
                                precision    recall  f1-score   support

               Atherosclerosis       0.12      1.00      0.22        11
                  Hypertension       0.00      0.00      0.00        11
  Cardiovascular Disease (CVD)       1.00   




AdaBoost Sample size 100 - Accuracy: 0.3000
                                      precision    recall  f1-score   support

                     Atherosclerosis       0.00      0.00      0.00        22
                        Hypertension       0.20      1.00      0.33        22
        Cardiovascular Disease (CVD)       0.00      0.00      0.00        11
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00        11
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00        11
            Stress-related Disorders       1.00      1.00      1.00        44
                         Arrhythmias       0.00      0.00      0.00        33
               Autonomic Dysfunction       0.00      0.00      0.00        11
                            Diabetes       0.00      0.00      0.00        11
                             Anaemia       0.00      0.00      0.00        44

                            accuracy                           0.30       220
                 




AdaBoost Sample size 500 - Accuracy: 0.1800
                                      precision    recall  f1-score   support

                     Atherosclerosis       0.00      0.00      0.00       132
                        Hypertension       0.12      1.00      0.21        99
        Cardiovascular Disease (CVD)       0.33      0.25      0.29        44
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00        99
Respiratory Disease (COPD or Asthma)       0.70      0.78      0.74        99
            Stress-related Disorders       0.00      0.00      0.00       121
                         Arrhythmias       0.00      0.00      0.00        99
                             Healthy       0.00      0.00      0.00        77
               Autonomic Dysfunction       0.09      0.11      0.10        99
                            Diabetes       0.00      0.00      0.00        99
                             Anaemia       0.00      0.00      0.00       132

                 




AdaBoost Sample size 1000 - Accuracy: 0.2300
                                      precision    recall  f1-score   support

                     Atherosclerosis       0.00      0.00      0.00       220
                        Hypertension       0.17      1.00      0.29       275
        Cardiovascular Disease (CVD)       0.71      0.56      0.62        99
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00       209
Respiratory Disease (COPD or Asthma)       0.78      0.88      0.82       176
            Stress-related Disorders       0.00      0.00      0.00       220
                         Arrhythmias       0.00      0.00      0.00       253
                             Healthy       0.00      0.00      0.00       143
               Autonomic Dysfunction       0.07      0.12      0.09       176
                            Diabetes       0.00      0.00      0.00       198
                             Anaemia       0.00      0.00      0.00       231

                




AdaBoost Sample size 2500 - Accuracy: 0.2020
                                      precision    recall  f1-score   support

                     Atherosclerosis       0.00      0.00      0.00       605
                        Hypertension       0.15      1.00      0.26       616
        Cardiovascular Disease (CVD)       0.73      0.35      0.47       253
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00       594
Respiratory Disease (COPD or Asthma)       0.65      0.90      0.76       341
            Stress-related Disorders       0.00      0.00      0.00       495
                         Arrhythmias       0.00      0.00      0.00       594
                             Healthy       0.00      0.00      0.00       429
               Autonomic Dysfunction       0.13      0.21      0.16       462
                            Diabetes       0.00      0.00      0.00       528
                             Anaemia       0.00      0.00      0.00       583

                




AdaBoost Sample size 5000 - Accuracy: 0.2140
                                      precision    recall  f1-score   support

                     Atherosclerosis       0.00      0.00      0.00      1166
                        Hypertension       0.15      1.00      0.26      1276
        Cardiovascular Disease (CVD)       0.71      0.39      0.51       418
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00      1155
Respiratory Disease (COPD or Asthma)       0.72      0.91      0.81       704
            Stress-related Disorders       0.00      0.00      0.00      1089
                         Arrhythmias       0.00      0.00      0.00      1133
                             Healthy       0.00      0.00      0.00       990
               Autonomic Dysfunction       0.19      0.30      0.23       902
                            Diabetes       0.00      0.00      0.00      1001
                             Anaemia       0.00      0.00      0.00      1166

                




AdaBoost Sample size 7500 - Accuracy: 0.2113
                                      precision    recall  f1-score   support

                     Atherosclerosis       0.00      0.00      0.00      1694
                        Hypertension       0.15      1.00      0.26      1892
        Cardiovascular Disease (CVD)       0.75      0.41      0.53       561
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00      1672
Respiratory Disease (COPD or Asthma)       0.73      0.92      0.81       946
            Stress-related Disorders       0.00      0.00      0.00      1562
                         Arrhythmias       0.00      0.00      0.00      1815
                             Healthy       0.00      0.00      0.00      1584
               Autonomic Dysfunction       0.20      0.37      0.26      1331
                            Diabetes       0.00      0.00      0.00      1727
                             Anaemia       0.00      0.00      0.00      1716

                




AdaBoost Sample size 10000 - Accuracy: 0.2185
                                      precision    recall  f1-score   support

                     Atherosclerosis       0.00      0.00      0.00      2189
                        Hypertension       0.15      1.00      0.27      2585
        Cardiovascular Disease (CVD)       0.80      0.44      0.57       825
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00      2266
Respiratory Disease (COPD or Asthma)       0.73      0.93      0.82      1309
            Stress-related Disorders       0.00      0.00      0.00      2167
                         Arrhythmias       0.00      0.00      0.00      2365
                             Healthy       0.00      0.00      0.00      1980
               Autonomic Dysfunction       0.20      0.36      0.26      1760
                            Diabetes       0.00      0.00      0.00      2266
                             Anaemia       0.00      0.00      0.00      2288

               




AdaBoost Sample size 20000 - Accuracy: 0.2200
                                      precision    recall  f1-score   support

                     Atherosclerosis       0.00      0.00      0.00      4543
                        Hypertension       0.15      1.00      0.26      4950
        Cardiovascular Disease (CVD)       0.82      0.47      0.60      1529
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00      4532
Respiratory Disease (COPD or Asthma)       0.76      0.94      0.84      2684
            Stress-related Disorders       0.00      0.00      0.00      4224
                         Arrhythmias       0.00      0.00      0.00      4719
                             Healthy       0.00      0.00      0.00      4048
               Autonomic Dysfunction       0.23      0.41      0.29      3586
                            Diabetes       0.00      0.00      0.00      4598
                             Anaemia       0.00      0.00      0.00      4587

               




AdaBoost Sample size 30000 - Accuracy: 0.2205
                                      precision    recall  f1-score   support

                     Atherosclerosis       0.00      0.00      0.00      6369
                        Hypertension       0.15      1.00      0.26      7436
        Cardiovascular Disease (CVD)       0.82      0.47      0.60      2365
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00      6765
Respiratory Disease (COPD or Asthma)       0.75      0.94      0.83      3927
            Stress-related Disorders       0.00      0.00      0.00      6391
                         Arrhythmias       0.00      0.00      0.00      7304
                             Healthy       0.00      0.00      0.00      6479
               Autonomic Dysfunction       0.23      0.43      0.30      5346
                            Diabetes       0.00      0.00      0.00      6787
                             Anaemia       0.00      0.00      0.00      6831

               




AdaBoost Sample size 40000 - Accuracy: 0.2194
                                      precision    recall  f1-score   support

                     Atherosclerosis       0.00      0.00      0.00      8701
                        Hypertension       0.15      1.00      0.25      9691
        Cardiovascular Disease (CVD)       0.82      0.49      0.61      3201
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00      8987
Respiratory Disease (COPD or Asthma)       0.75      0.93      0.83      5126
            Stress-related Disorders       0.00      0.00      0.00      8360
                         Arrhythmias       0.00      0.00      0.00      9526
                             Healthy       0.00      0.00      0.00      8899
               Autonomic Dysfunction       0.25      0.45      0.32      7194
                            Diabetes       0.00      0.00      0.00      9174
                             Anaemia       0.00      0.00      0.00      9141

               




AdaBoost Sample size 50000 - Accuracy: 0.2158
                                      precision    recall  f1-score   support

                     Atherosclerosis       0.00      0.00      0.00     11110
                        Hypertension       0.14      1.00      0.25     11858
        Cardiovascular Disease (CVD)       0.82      0.49      0.62      4180
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00     11143
Respiratory Disease (COPD or Asthma)       0.74      0.93      0.82      6314
            Stress-related Disorders       0.00      0.00      0.00     10714
                         Arrhythmias       0.00      0.00      0.00     11770
                             Healthy       0.00      0.00      0.00     11286
               Autonomic Dysfunction       0.24      0.45      0.31      8866
                            Diabetes       0.00      0.00      0.00     11209
                             Anaemia       0.00      0.00      0.00     11550

               

In [None]:
# Save the best model
if best_model is not None:
    # The file name records which sample size produced the winning model
    best_model_file = f"best_adaboost_model_sample_size_{best_sample_size}.pkl"
    with open(best_model_file, "wb") as model_file:
        pickle.dump(best_model, model_file)
    print(f"\nBest model saved as {best_model_file} with accuracy {best_accuracy:.4f}")
else:
    # Make the "nothing to save" case visible instead of finishing silently
    print("\nNo best model available; nothing was saved.")

In [None]:
# Write the per-run metrics table to disk (one row per experiment run)
results_df = pd.DataFrame(data=results)
results_df.to_csv(path_or_buf="adaboost_results.csv", index=False)
print("\nResults saved to 'adaboost_results.csv'")

In [None]:
# Summarise the full train/test split and how often each label occurs in the test set
print(f"Train size: {len(X_train)}", f"Test size: {len(X_test)}", sep="\n")
print("\nClass Distribution in Test Set:")
print(pd.Series(y_test).value_counts())