In [1]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import GaussianNB

In [2]:
# Load preprocessed data
# The directory is a machine-specific absolute location kept in ONE place;
# edit SAVED_DATA_DIR when running this notebook on another machine.
SAVED_DATA_DIR = Path(
    "C:/Users/syafi/Desktop/syafiq-project/new classification task/model/saved_data"
)

# SECURITY: pickle.load can execute arbitrary code while deserializing.
# Only load these files if they were produced by a trusted preprocessing run.
with open(SAVED_DATA_DIR / "preprocessed_data.pkl", "rb") as f:
    # The pickle holds a 4-item sequence unpacked as train/test features and labels
    X_train, X_test, y_train, y_test = pickle.load(f)

with open(SAVED_DATA_DIR / "label_mapping.pkl", "rb") as f:
    label_mapping = pickle.load(f)

# evaluate_model looks names up as class_labels[class_id], so this relies on the
# mapping's key order matching the encoded class ids.
# NOTE(review): presumably keys are class names inserted in id order - confirm.
class_labels = list(label_mapping.keys())

# Display train and test sizes
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

Train size: 1344719, Test size: 336180


In [3]:
# Experiment sizes: each total sample count is split 80% train / 20% test
sample_sizes = [
    25, 50, 75, 100,
    250, 500, 750, 1000,
    2500, 5000, 7500, 10000,
    20000, 30000, 40000, 50000,
]
# Integer arithmetic keeps the split exact (every total is a multiple of 5)
training_sizes = [total * 4 // 5 for total in sample_sizes]
testing_sizes = [total // 5 for total in sample_sizes]

In [4]:
def evaluate_model(model, X_test, y_test, class_labels):
    """
    Score a fitted classifier on a test set.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X_test : test features passed straight to ``model.predict``.
    y_test : true labels for ``X_test``.
    class_labels : sequence of display names, indexed by class id.

    Returns
    -------
    tuple
        ``(accuracy, report, predictions, unique_classes, dynamic_labels)``
        where ``report`` is the dict form of sklearn's classification report,
        ``unique_classes`` are the distinct values found in ``y_test`` and
        ``dynamic_labels`` are their display names.
    """
    y_pred = model.predict(X_test)
    overall_accuracy = accuracy_score(y_test, y_pred)

    # Report only on classes that actually occur in y_test, so the name list
    # always lines up with the label list handed to the report.
    present_classes = np.unique(y_test)
    present_names = [class_labels[class_id] for class_id in present_classes]

    report_dict = classification_report(
        y_test,
        y_pred,
        labels=present_classes,
        target_names=present_names,
        zero_division=0,  # undefined precision/recall is reported as 0
        output_dict=True,
    )
    return overall_accuracy, report_dict, y_pred, present_classes, present_names

In [5]:
def flatten_classification_report(report, sample_size, train_size, test_size, accuracy):
    """
    Collapse a nested classification report into a single-level dict.

    Every ``report[label][metric]`` entry becomes the key ``"<label>_<metric>"``.
    Top-level entries whose value is not a dict (for example a bare
    ``'accuracy'`` float) are skipped. The run metadata ``sample_size``,
    ``train_size``, ``test_size`` and ``accuracy`` are added afterwards.

    Returns
    -------
    dict
        One flat record for this run.
    """
    flat_report = {}
    for label, metrics in report.items():
        if not isinstance(metrics, dict):
            # Scalar summary entry, not a per-label metrics block
            continue
        for metric, value in metrics.items():
            flat_report[f"{label}_{metric}"] = value

    # Run metadata goes last so the metric columns come first
    flat_report["sample_size"] = sample_size
    flat_report["train_size"] = train_size
    flat_report["test_size"] = test_size
    flat_report["accuracy"] = accuracy
    return flat_report

In [6]:
def expand_test_set(X_test, y_test, num_classes, repeat_factor):
    """
    Enlarge a test set by duplicating every sample ``repeat_factor`` times.

    Features are repeated along axis 0 and labels are repeated the same way
    with ``np.repeat``. No new samples are created, so the set of classes
    present is exactly the one already in ``y_test``.

    Parameters
    ----------
    X_test : test features (rows are samples).
    y_test : labels matching ``X_test``.
    num_classes : accepted for interface compatibility; not used here.
    repeat_factor : number of copies of each sample.

    Returns
    -------
    tuple
        ``(X_test_expanded, y_test_expanded)``.

    Raises
    ------
    ValueError
        If either input is empty, or the expanded lengths disagree.
    """
    if not len(X_test) or not len(y_test):
        raise ValueError("Test set is empty. Verify the test data size and inputs.")

    features = np.repeat(X_test, repeats=repeat_factor, axis=0)
    labels = np.repeat(y_test, repeats=repeat_factor)

    # Differing lengths here mean the inputs were misaligned to begin with
    if len(features) == len(labels):
        return features, labels
    raise ValueError("Mismatch in sizes of expanded test features and labels.")

In [7]:
# Accumulators read by the later save/export cells
results = []
best_accuracy = 0
best_model = None
best_sample_size = 0

# Loop invariants: each test sample is duplicated repeat_factor times
repeat_factor = 11
num_classes = len(class_labels)

# Train and score one GaussianNB per (sample, train, test) size triple
for sample_size, train_size, test_size in zip(sample_sizes, training_sizes, testing_sizes):
    # Subsets are the leading rows of the preloaded splits.
    # NOTE(review): no shuffling or stratification happens here, so small slices
    # may not contain every class - confirm the data was shuffled upstream.
    X_train_subset = X_train[:train_size]
    y_train_subset = y_train[:train_size]
    X_test_subset = X_test[:test_size]
    y_test_subset = y_test[:test_size]

    # Debugging: sizes before expansion
    print("Initial Test Set Size:", len(X_test_subset), len(y_test_subset))

    # Duplicating samples multiplies the support counts in the report
    X_test_expanded, y_test_expanded = expand_test_set(
        X_test_subset, y_test_subset, num_classes, repeat_factor
    )

    # Debugging: sizes after expansion
    print("Expanded Test Set Size:", len(X_test_expanded), len(y_test_expanded))

    # A fresh model per size, so runs do not share fitted state
    nb_model = GaussianNB()
    nb_model.fit(X_train_subset, y_train_subset)

    accuracy, report, predictions, unique_classes, dynamic_labels = evaluate_model(
        nb_model, X_test_expanded, y_test_expanded, class_labels
    )

    # Human-readable summary of this run
    print(f"\nNaive Bayes Sample size {sample_size} - Accuracy: {accuracy:.4f}")
    text_report = classification_report(
        y_test_expanded,
        predictions,
        labels=unique_classes,
        target_names=dynamic_labels,
        zero_division=0,
    )
    print(text_report)

    # One flat record per run, later turned into a DataFrame
    flat_report = flatten_classification_report(
        report, sample_size, train_size, test_size, accuracy
    )
    results.append(flat_report)

    # Keep the highest-accuracy model; ties keep the earlier (smaller) run
    if accuracy > best_accuracy:
        best_accuracy, best_model, best_sample_size = accuracy, nb_model, sample_size
        print(
            f"New best model found for sample size {sample_size} with accuracy {accuracy:.4f}"
        )

Initial Test Set Size: 5 5
Expanded Test Set Size: 55 55

Naive Bayes Sample size 25 - Accuracy: 0.0000
                                      precision    recall  f1-score   support

               Autonomic Dysfunction       0.00      0.00      0.00      11.0
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00      11.0
                             Healthy       0.00      0.00      0.00      11.0
                        Hypertension       0.00      0.00      0.00      11.0
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00      11.0

                           micro avg       0.00      0.00      0.00      55.0
                           macro avg       0.00      0.00      0.00      55.0
                        weighted avg       0.00      0.00      0.00      55.0

Initial Test Set Size: 10 10
Expanded Test Set Size: 110 110

Naive Bayes Sample size 50 - Accuracy: 0.1000
                                      precision    recall  f1-score   support

    

In [8]:
# Persist the highest-accuracy model; skipped when no run beat the initial
# best_accuracy of 0 (best_model is still None in that case)
if best_model is not None:
    # The winning sample size is embedded in the file name
    best_model_file = f"best_nb_model_sample_size_{best_sample_size}.pkl"
    with open(best_model_file, "wb") as fh:
        pickle.dump(best_model, fh)
    print(f"\nBest model saved as {best_model_file} with accuracy {best_accuracy:.4f}")


Best model saved as best_nb_model_sample_size_40000.pkl with accuracy 0.9341


In [9]:
# Export the per-run metric records (one row per sample size) for later analysis
results_csv = "naive_bayes_results.csv"
results_df = pd.DataFrame(results)
results_df.to_csv(results_csv, index=False)  # index is just the run order
print(f"\nResults saved to '{results_csv}'")


Results saved to 'naive_bayes_results.csv'


In [10]:
# Summarise the full train/test splits and the label balance of the test split
print(f"Train size: {len(X_train)}", f"Test size: {len(X_test)}", sep="\n")
print("\nClass Distribution in Test Set:")
# Number of test rows per encoded class id
print(y_test.value_counts())

Train size: 1344719
Test size: 336180

Class Distribution in Test Set:
Disease Classification
8     30562
3     30562
9     30562
7     30562
10    30562
1     30562
6     30562
2     30562
4     30562
5     30561
0     30561
Name: count, dtype: int64
