In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
import pickle

In [2]:
# Load preprocessed data
preprocessed_data_path = "C:/Users/syafi/Desktop/syafiq-project/classification-task/model/saved_data/preprocessed_data.pkl"
original_features_path = "C:/Users/syafi/Desktop/syafiq-project/classification-task/model/saved_data/original_features.pkl"

preprocessed_data = pd.read_pickle(preprocessed_data_path)
X_scaled = preprocessed_data["X_scaled"]
y_resampled = preprocessed_data["y_resampled"]
class_labels = preprocessed_data["class_labels"]

original_features = pd.read_pickle(original_features_path)

In [3]:
# Sample sizes for training
sample_sizes = [25, 50, 75, 100, 250, 500, 750, 1000, 2500, 5000, 7500, 10000, 20000, 30000, 40000, 50000]

In [4]:
def stratified_subsample(X, y, train_size, random_state=42):
    if len(X) < train_size:
        raise ValueError(f"Insufficient data for sample size {train_size}.")
    X_sample, _, y_sample, _ = train_test_split(
        X, y, train_size=train_size, stratify=y, random_state=random_state
    )
    return X_sample, y_sample

In [5]:
results = {}
best_model = None
best_accuracy = 0
best_sample_size = 0

for sample_size in sample_sizes:
    try:
        # Create a stratified subsample
        X_sample, y_sample = stratified_subsample(
            X_scaled, y_resampled, train_size=sample_size
        )

        # Split data into train and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            X_sample, y_sample, test_size=0.2, stratify=y_sample, random_state=42
        )

        # Initialize and train Naive Bayes model
        nb_model = GaussianNB()
        nb_model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = nb_model.predict(X_test)

        # Evaluate the model
        report = classification_report(
            y_test, y_pred, target_names=class_labels, output_dict=True, zero_division=0
        )
        accuracy = report["accuracy"]
        print(f"Classification Report for Sample Size {sample_size}:\n")
        print(classification_report(y_test, y_pred, target_names=class_labels))

        # Save results for the current sample size
        results[sample_size] = {"accuracy": accuracy, "classification_report": report}

        # Update the best model based on accuracy
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = nb_model
            best_sample_size = sample_size
            print(
                f"New best model found for sample size {sample_size} with accuracy {accuracy:.4f}"
            )

    except ValueError as e:
        print(f"Skipping sample size {sample_size}: {e}")

Skipping sample size 25: The test_size = 5 should be greater or equal to the number of classes = 11
Skipping sample size 50: The test_size = 10 should be greater or equal to the number of classes = 11
Classification Report for Sample Size 75:

                                      precision    recall  f1-score   support

                             Anaemia       0.00      0.00      0.00         1
                          Arrhythmia       0.50      1.00      0.67         1
                     Atherosclerosis       0.50      0.50      0.50         2
               Autonomic Dysfunction       0.00      0.00      0.00         1
        Cardiovascular Disease (CVD)       0.00      0.00      0.00         2
      Chronic Fatigue Syndrome (CFS)       0.67      1.00      0.80         2
                            Diabetes       0.33      1.00      0.50         1
                             Healthy       0.50      1.00      0.67         1
                        Hypertension       0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report for Sample Size 100:

                                      precision    recall  f1-score   support

                             Anaemia       1.00      0.50      0.67         2
                          Arrhythmia       0.33      1.00      0.50         1
                     Atherosclerosis       1.00      0.50      0.67         2
               Autonomic Dysfunction       1.00      1.00      1.00         2
        Cardiovascular Disease (CVD)       1.00      0.50      0.67         2
      Chronic Fatigue Syndrome (CFS)       0.40      1.00      0.57         2
                            Diabetes       0.00      0.00      0.00         2
                             Healthy       0.67      1.00      0.80         2
                        Hypertension       0.00      0.00      0.00         2
Respiratory Disease (COPD or Asthma)       0.50      1.00      0.67         1
            Stress-related Disorders       0.00      0.00      0.00         2

                  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report for Sample Size 250:

                                      precision    recall  f1-score   support

                             Anaemia       1.00      0.60      0.75         5
                          Arrhythmia       0.44      1.00      0.62         4
                     Atherosclerosis       1.00      1.00      1.00         5
               Autonomic Dysfunction       1.00      1.00      1.00         4
        Cardiovascular Disease (CVD)       1.00      0.50      0.67         4
      Chronic Fatigue Syndrome (CFS)       0.62      1.00      0.77         5
                            Diabetes       1.00      1.00      1.00         4
                             Healthy       0.67      1.00      0.80         4
                        Hypertension       1.00      0.20      0.33         5
Respiratory Disease (COPD or Asthma)       0.71      1.00      0.83         5
            Stress-related Disorders       1.00      0.20      0.33         5

                  

In [None]:
# Save the model
with open("nb_model", "wb") as model_file:
    pickle.dump(best_model, model_file)
    print(f"\nModel saved as {model_file} with accuracy {best_accuracy:.4f}")