In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Load the preprocessed data
# Both pickles live in one directory. It defaults to the original location,
# and can be redirected with the SAVED_DATA_DIR environment variable so the
# notebook runs on another machine without editing the code.
DATA_DIR = os.environ.get(
    "SAVED_DATA_DIR",
    "C:/Users/syafi/Desktop/syafiq-project/new classification task/model/saved_data",
)

# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# artifacts that come from a trusted preprocessing run.
with open(f"{DATA_DIR}/preprocessed_data.pkl", "rb") as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

with open(f"{DATA_DIR}/label_mapping.pkl", "rb") as f:
    label_mapping = pickle.load(f)

# evaluate_model indexes this list with the label values found in y_test.
# NOTE(review): that is only correct if the mapping's key order matches the
# encoded label values - confirm against the preprocessing step.
class_labels = list(label_mapping.keys())

# Display train and test size
print(f"Train size: {len(X_train)}")
print(f"Test size: {len(X_test)}")

Train size: 1344719
Test size: 336180


In [3]:
# Score a fitted classifier on a test subset
def evaluate_model(model, X_test_subset, y_test_subset, class_labels):
    """Predict on a test subset and summarise how well the model did.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test_subset: Feature rows to predict on.
        y_test_subset: True labels for ``X_test_subset``.
        class_labels: Sequence of class names, indexed by the label values
            that occur in ``y_test_subset``.

    Returns:
        Tuple ``(accuracy, report, predictions, unique_classes, dynamic_labels)``:
        the accuracy score, the classification report as a dict, the model's
        predictions, the distinct true labels, and the names of those labels.
    """
    y_pred = model.predict(X_test_subset)

    # The report is restricted to the classes that occur in the true labels,
    # and the display names are looked up for exactly those classes.
    present_classes = np.unique(y_test_subset)
    present_names = []
    for class_id in present_classes:
        present_names.append(class_labels[class_id])

    report_dict = classification_report(
        y_test_subset,
        y_pred,
        labels=present_classes,
        target_names=present_names,
        output_dict=True,
        zero_division=0,
    )
    score = accuracy_score(y_test_subset, y_pred)

    return score, report_dict, y_pred, present_classes, present_names

In [4]:
# Turn a nested classification report into one flat record
def flatten_classification_report(report, sample_size, train_size, test_size, accuracy):
    """Flatten a nested report dict into a single-level dict.

    Every ``report[label][metric]`` value is stored under the key
    ``"<label>_<metric>"``. Top-level entries whose value is not a dict are
    skipped. The run metadata is then stored under the keys ``sample_size``,
    ``train_size``, ``test_size`` and ``accuracy``.

    Args:
        report: Mapping of label -> (mapping of metric -> value), possibly
            mixed with non-dict entries.
        sample_size: Value stored under ``"sample_size"``.
        train_size: Value stored under ``"train_size"``.
        test_size: Value stored under ``"test_size"``.
        accuracy: Value stored under ``"accuracy"``.

    Returns:
        The flattened dict.
    """
    flat_report = {}
    for label, metrics in report.items():
        if not isinstance(metrics, dict):
            # Non-dict entries carry no per-metric breakdown to flatten.
            continue
        for metric, value in metrics.items():
            flat_report[f"{label}_{metric}"] = value

    # Run metadata goes in last, overwriting any identically named key.
    flat_report["sample_size"] = sample_size
    flat_report["train_size"] = train_size
    flat_report["test_size"] = test_size
    flat_report["accuracy"] = accuracy
    return flat_report

In [5]:
# Duplicate every test sample a fixed number of times
def expand_test_set(X_test, y_test, repeat_factor):
    """Repeat each test row and its label ``repeat_factor`` times.

    Features are repeated along axis 0 and labels along their only axis, so
    the copies of a sample stay adjacent and features and labels stay aligned.

    Args:
        X_test: Test features, one sample per row.
        y_test: Test labels, one per sample.
        repeat_factor: Number of copies to make of each sample.

    Returns:
        Tuple ``(X_test_expanded, y_test_expanded)``.
    """
    features_repeated = np.repeat(X_test, repeats=repeat_factor, axis=0)
    labels_repeated = np.repeat(y_test, repeats=repeat_factor)
    return features_repeated, labels_repeated

In [6]:
# Experiment sizes: each run uses a nominal sample size, split into a 4/5
# training share and a 1/5 testing share.
sample_sizes = [
    25, 50, 75, 100,
    250, 500, 750, 1000,
    2500, 5000, 7500, 10000,
    20000, 30000, 40000, 50000,
]
# Every sample size is a multiple of 5, so the integer divisions are exact.
training_sizes = [size * 4 // 5 for size in sample_sizes]
testing_sizes = [size // 5 for size in sample_sizes]

# Accumulators filled in by the experiment loop
results = []
best_accuracy, best_model, best_sample_size = 0, None, 0

In [7]:
# Train and score one AdaBoost model per (sample, train, test) size triple.
# Every test row is duplicated this many times before scoring; the value is
# the same for all runs, so it is set once up front.
repeat_factor = 11

for sample_size, train_size, test_size in zip(sample_sizes, training_sizes, testing_sizes):
    # This run's data is the leading rows of each split.
    X_train_subset = X_train[:train_size]
    y_train_subset = y_train[:train_size]
    X_test_subset = X_test[:test_size]
    y_test_subset = y_test[:test_size]

    # Duplicate the test rows before scoring.
    X_test_expanded, y_test_expanded = expand_test_set(
        X_test_subset, y_test_subset, repeat_factor
    )

    # Fit a fresh, seeded AdaBoost ensemble on the training slice.
    ada_model = AdaBoostClassifier(n_estimators=50, random_state=42)
    ada_model.fit(X_train_subset, y_train_subset)

    # Score it on the expanded test rows.
    accuracy, report, predictions, unique_classes, dynamic_labels = evaluate_model(
        ada_model, X_test_expanded, y_test_expanded, class_labels
    )

    # Show the headline accuracy, then the text form of the report.
    print(f"\nAdaBoost Sample size {sample_size} - Accuracy: {accuracy:.4f}")
    text_report = classification_report(
        y_test_expanded,
        predictions,
        labels=unique_classes,
        target_names=dynamic_labels,
        zero_division=0,
    )
    print(text_report)

    # Keep a flat record of this run for the results table.
    flat_report = flatten_classification_report(
        report, sample_size, train_size, test_size, accuracy
    )
    results.append(flat_report)

    # Only a strictly higher accuracy replaces the current best model.
    # NOTE(review): the winner is picked using accuracy on the test rows, so
    # the reported best accuracy is not an independent estimate.
    if not accuracy > best_accuracy:
        continue
    best_accuracy = accuracy
    best_model = ada_model
    best_sample_size = sample_size
    print(
        f"New best model found for sample size {sample_size} with accuracy {accuracy:.4f}"
    )




AdaBoost Sample size 25 - Accuracy: 0.0000
                                      precision    recall  f1-score   support

               Autonomic Dysfunction       0.00      0.00      0.00      11.0
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00      11.0
                             Healthy       0.00      0.00      0.00      11.0
                        Hypertension       0.00      0.00      0.00      11.0
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00      11.0

                           micro avg       0.00      0.00      0.00      55.0
                           macro avg       0.00      0.00      0.00      55.0
                        weighted avg       0.00      0.00      0.00      55.0






AdaBoost Sample size 50 - Accuracy: 0.2000
                                      precision    recall  f1-score   support

                         Arrhythmias       0.00      0.00      0.00        11
               Autonomic Dysfunction       0.00      0.00      0.00        11
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00        11
                            Diabetes       0.00      0.00      0.00        11
                             Healthy       1.00      0.50      0.67        22
                        Hypertension       0.00      0.00      0.00        11
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00        22
            Stress-related Disorders       0.11      1.00      0.20        11

                            accuracy                           0.20       110
                           macro avg       0.14      0.19      0.11       110
                        weighted avg       0.21      0.20      0.15       110

New best model fo




AdaBoost Sample size 75 - Accuracy: 0.1333
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00        11
                         Arrhythmias       0.07      1.00      0.13        11
               Autonomic Dysfunction       0.00      0.00      0.00        22
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00        11
                            Diabetes       0.00      0.00      0.00        11
                             Healthy       0.00      0.00      0.00        33
                        Hypertension       1.00      1.00      1.00        11
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00        33
            Stress-related Disorders       0.00      0.00      0.00        22

                            accuracy                           0.13       165
                           macro avg       0.12      0.22      0.13       165
                  




AdaBoost Sample size 100 - Accuracy: 0.1000
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00        33
                         Arrhythmias       0.05      1.00      0.10        11
               Autonomic Dysfunction       0.00      0.00      0.00        22
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00        22
                            Diabetes       0.00      0.00      0.00        22
                             Healthy       0.00      0.00      0.00        33
                        Hypertension       1.00      1.00      1.00        11
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00        44
            Stress-related Disorders       0.00      0.00      0.00        22

                            accuracy                           0.10       220
                           macro avg       0.12      0.22      0.12       220
                 




AdaBoost Sample size 250 - Accuracy: 0.2000
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00        33
                         Arrhythmias       0.00      0.00      0.00        33
               Autonomic Dysfunction       0.00      0.00      0.00        88
        Cardiovascular Disease (CVD)       1.00      1.00      1.00        44
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00        66
                            Diabetes       0.00      0.00      0.00        55
                             Healthy       0.00      0.00      0.00        55
                        Hypertension       0.00      0.00      0.00        66
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00        44
            Stress-related Disorders       0.13      1.00      0.23        66

                            accuracy                           0.20       550
                 




AdaBoost Sample size 500 - Accuracy: 0.2300
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00       110
                         Arrhythmias       0.00      0.00      0.00        66
                     Atherosclerosis       1.00      0.33      0.50        33
               Autonomic Dysfunction       0.00      0.00      0.00       143
        Cardiovascular Disease (CVD)       1.00      1.00      1.00        77
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00       121
                            Diabetes       0.00      0.00      0.00        88
                             Healthy       0.00      0.00      0.00        66
                        Hypertension       0.29      0.29      0.29       154
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00       121
            Stress-related Disorders       0.14      1.00      0.25       121

                 




AdaBoost Sample size 750 - Accuracy: 0.2333
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00       187
                         Arrhythmias       0.00      0.00      0.00        99
                     Atherosclerosis       0.83      0.45      0.59       121
               Autonomic Dysfunction       0.00      0.00      0.00       209
        Cardiovascular Disease (CVD)       0.79      0.92      0.85       132
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00       132
                            Diabetes       0.00      0.00      0.00       110
                             Healthy       0.00      0.00      0.00       110
                        Hypertension       0.28      0.28      0.28       198
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00       198
            Stress-related Disorders       0.12      1.00      0.22       154

                 




AdaBoost Sample size 1000 - Accuracy: 0.2700
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00       231
                         Arrhythmias       0.00      0.00      0.00       154
                     Atherosclerosis       0.88      0.47      0.61       165
               Autonomic Dysfunction       0.00      0.00      0.00       264
        Cardiovascular Disease (CVD)       0.80      0.94      0.86       187
      Chronic Fatigue Syndrome (CFS)       0.14      1.00      0.24       220
                            Diabetes       0.00      0.00      0.00       132
                             Healthy       0.00      0.00      0.00       154
                        Hypertension       0.41      0.41      0.41       297
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00       198
            Stress-related Disorders       0.00      0.00      0.00       198

                




AdaBoost Sample size 2500 - Accuracy: 0.2760
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00       517
                         Arrhythmias       0.00      0.00      0.00       462
                     Atherosclerosis       0.88      0.55      0.67       605
               Autonomic Dysfunction       0.00      0.00      0.00       495
        Cardiovascular Disease (CVD)       0.74      0.91      0.82       517
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00       539
                            Diabetes       0.00      0.00      0.00       396
                             Healthy       0.00      0.00      0.00       484
                        Hypertension       0.35      0.47      0.40       583
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00       462
            Stress-related Disorders       0.12      1.00      0.21       440

                




AdaBoost Sample size 5000 - Accuracy: 0.2940
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00       968
                         Arrhythmias       0.00      0.00      0.00       891
                     Atherosclerosis       0.93      0.52      0.66      1089
               Autonomic Dysfunction       0.00      0.00      0.00      1034
        Cardiovascular Disease (CVD)       0.75      0.96      0.84      1122
      Chronic Fatigue Syndrome (CFS)       0.15      1.00      0.25      1089
                            Diabetes       0.00      0.00      0.00       792
                             Healthy       0.00      0.00      0.00      1012
                        Hypertension       0.34      0.45      0.39      1133
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00       869
            Stress-related Disorders       0.00      0.00      0.00      1001

                




AdaBoost Sample size 7500 - Accuracy: 0.2820
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00      1463
                         Arrhythmias       0.00      0.00      0.00      1375
                     Atherosclerosis       0.89      0.51      0.65      1507
               Autonomic Dysfunction       0.00      0.00      0.00      1507
        Cardiovascular Disease (CVD)       0.74      0.94      0.83      1573
      Chronic Fatigue Syndrome (CFS)       0.14      1.00      0.25      1650
                            Diabetes       0.00      0.00      0.00      1254
                             Healthy       0.00      0.00      0.00      1628
                        Hypertension       0.34      0.46      0.39      1639
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00      1408
            Stress-related Disorders       0.00      0.00      0.00      1496

                




AdaBoost Sample size 10000 - Accuracy: 0.2735
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00      1914
                         Arrhythmias       0.00      0.00      0.00      1980
                     Atherosclerosis       0.89      0.52      0.65      1936
               Autonomic Dysfunction       0.00      0.00      0.00      2057
        Cardiovascular Disease (CVD)       0.75      0.94      0.84      2068
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00      2211
                            Diabetes       0.00      0.00      0.00      1606
                             Healthy       0.14      1.00      0.24      2101
                        Hypertension       0.33      0.45      0.38      2145
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00      1892
            Stress-related Disorders       0.00      0.00      0.00      2090

               




AdaBoost Sample size 20000 - Accuracy: 0.2747
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00      3762
                         Arrhythmias       0.00      0.00      0.00      4059
                     Atherosclerosis       0.87      0.58      0.70      3883
               Autonomic Dysfunction       0.00      0.00      0.00      3806
        Cardiovascular Disease (CVD)       0.78      0.92      0.84      4290
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00      4169
                            Diabetes       0.00      0.00      0.00      3641
                             Healthy       0.00      0.00      0.00      4345
                        Hypertension       0.30      0.44      0.36      4268
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00      3773
            Stress-related Disorders       0.13      1.00      0.23      4004

               




AdaBoost Sample size 30000 - Accuracy: 0.2700
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00      5731
                         Arrhythmias       0.00      0.00      0.00      6226
                     Atherosclerosis       0.87      0.59      0.71      5753
               Autonomic Dysfunction       0.00      0.00      0.00      5962
        Cardiovascular Disease (CVD)       0.78      0.92      0.84      6204
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00      6039
                            Diabetes       0.00      0.00      0.00      5698
                             Healthy       0.00      0.00      0.00      6358
                        Hypertension       0.28      0.43      0.34      6171
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00      5797
            Stress-related Disorders       0.13      1.00      0.24      6061

               




AdaBoost Sample size 40000 - Accuracy: 0.2721
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00      8019
                         Arrhythmias       0.00      0.00      0.00      8217
                     Atherosclerosis       0.88      0.63      0.73      7986
               Autonomic Dysfunction       0.00      0.00      0.00      8052
        Cardiovascular Disease (CVD)       0.78      0.92      0.85      8283
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00      8052
                            Diabetes       0.00      0.00      0.00      7711
                             Healthy       0.00      0.00      0.00      8162
                        Hypertension       0.28      0.43      0.34      8063
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00      7568
            Stress-related Disorders       0.13      1.00      0.23      7887

               




AdaBoost Sample size 50000 - Accuracy: 0.2721
                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00      9856
                         Arrhythmias       0.00      0.00      0.00     10296
                     Atherosclerosis       0.88      0.63      0.74     10109
               Autonomic Dysfunction       0.00      0.00      0.00     10329
        Cardiovascular Disease (CVD)       0.78      0.92      0.84     10153
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00      9966
                            Diabetes       0.00      0.00      0.00      9625
                             Healthy       0.00      0.00      0.00     10197
                        Hypertension       0.27      0.42      0.33      9889
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00      9537
            Stress-related Disorders       0.13      1.00      0.24     10043

               

In [8]:
# Persist the best model, if any run produced one
if best_model is not None:
    # The file name records the sample size of the winning run.
    best_model_file = f"best_adaboost_model_sample_size_{best_sample_size}.pkl"
    with open(best_model_file, "wb") as out_fh:
        pickle.dump(best_model, out_fh)
    print(f"\nBest model saved as {best_model_file} with accuracy {best_accuracy:.4f}")


Best model saved as best_adaboost_model_sample_size_5000.pkl with accuracy 0.2940


In [9]:
# Write the per-run records to disk as one table (one row per run)
results_df = pd.DataFrame(data=results)
results_df.to_csv("adaboost_results.csv", index=False)  # no index column in the file
print("\nResults saved to 'adaboost_results.csv'")


Results saved to 'adaboost_results.csv'


In [10]:
# Recap the full split sizes and how the test labels are distributed
print(f"Train size: {len(X_train)}")
print(f"Test size: {len(X_test)}")

# Count how many test rows carry each label value.
test_class_counts = pd.Series(y_test).value_counts()
print("\nClass Distribution in Test Set:")
print(test_class_counts)

Train size: 1344719
Test size: 336180

Class Distribution in Test Set:
Disease Classification
8     30562
3     30562
9     30562
7     30562
10    30562
1     30562
6     30562
2     30562
4     30562
5     30561
0     30561
Name: count, dtype: int64
