In [1]:
import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf
from keras.layers import Dense, Conv1D, Flatten, Dropout, MaxPooling1D, Input
from keras.models import Sequential
from keras.utils import to_categorical
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Load preprocessed data
# The directory defaults to the original location but can be overridden with the
# SAVED_DATA_DIR environment variable, so the notebook can run on another machine
# without editing this cell. Both pickle files are read from this one directory.
SAVED_DATA_DIR = Path(
    os.environ.get(
        "SAVED_DATA_DIR",
        "C:/Users/syafi/Desktop/syafiq-project/new classification task/model/saved_data",
    )
)

# SECURITY: pickle.load can execute arbitrary code while deserialising.
# Only load files that were produced by a trusted preprocessing run.
with open(SAVED_DATA_DIR / "preprocessed_data.pkl", "rb") as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

with open(SAVED_DATA_DIR / "label_mapping.pkl", "rb") as f:
    label_mapping = pickle.load(f)

# NOTE(review): later cells look class names up with class_labels[class_id], which
# assumes the key order of label_mapping matches the integer class ids — confirm
# against the preprocessing step that wrote label_mapping.pkl.
class_labels = list(label_mapping.keys())

# Display train and test sizes
print(f"Train size: {len(X_train)}")
print(f"Test size: {len(X_test)}")

Train size: 1344719
Test size: 336180


In [3]:
# One output unit per known class name
num_classes = len(class_labels)

# One-hot encode the train and test labels with the same class count so both
# matrices have `num_classes` columns even if a split is missing some class.
y_train_cat, y_test_cat = (
    to_categorical(split_labels, num_classes=num_classes)
    for split_labels in (y_train, y_test)
)

In [4]:
# Define test set function
def expand_test_set(X_test, y_test, repeat_factor):
    """
    Expand the test set by repeating each sample multiple times.

    Every row of ``X_test`` is repeated ``repeat_factor`` times in place
    (row 0 repeated, then row 1 repeated, ...), and the labels are repeated the
    same way so that row ``i`` of the expanded features still corresponds to
    entry ``i`` of the expanded labels.

    Args:
        X_test: Array-like of test samples; repeated along axis 0.
        y_test: Array-like of labels, one entry (or one row) per test sample.
        repeat_factor (int): Number of copies of each sample.
    Returns:
        Tuple ``(X_test_expanded, y_test_expanded)``.
    """
    X_test_expanded = np.repeat(X_test, repeats=repeat_factor, axis=0)

    if np.ndim(y_test) > 1:
        # Multi-column labels (e.g. one-hot rows) must be repeated row-wise.
        # Without an axis, np.repeat flattens the input, which would break the
        # one-row-per-sample alignment with X_test_expanded.
        y_test_expanded = np.repeat(
            np.asarray(y_test), repeats=repeat_factor, axis=0
        )
    else:
        # 1-D labels: same call as before, so existing callers are unaffected.
        y_test_expanded = np.repeat(y_test, repeats=repeat_factor)

    return X_test_expanded, y_test_expanded

In [5]:
# Evaluate the CNN Model
def evaluate_cnn_model(model, X_test_subset, y_test_subset, class_labels):
    """
    Score a trained model on a test subset.

    The report only covers classes that actually occur in ``y_test_subset``;
    their display names are looked up in ``class_labels`` by integer class id.

    Args:
        model: Object exposing ``predict``; its output is reduced with argmax
            over axis 1 to obtain one predicted class id per sample.
        X_test_subset: Test inputs passed straight to ``model.predict``.
        y_test_subset: True integer class ids for the test inputs.
        class_labels: Sequence of class names indexed by class id.
    Returns:
        Tuple ``(accuracy, report, predictions, dynamic_labels)`` where
        ``report`` is the dict form of sklearn's classification report.
    """
    # Per-sample class scores -> index of the highest score
    class_scores = model.predict(X_test_subset)
    predicted_ids = np.argmax(class_scores, axis=1)

    overall_accuracy = accuracy_score(y_test_subset, predicted_ids)

    # Restrict the report to the classes present in the true labels, and keep
    # the id list and the name list in the same order.
    present_ids = list(np.unique(y_test_subset))
    present_names = [class_labels[class_id] for class_id in present_ids]

    report_kwargs = {
        "labels": present_ids,
        "target_names": present_names,
        "zero_division": 0,
        "output_dict": True,
    }
    per_class_report = classification_report(
        y_test_subset, predicted_ids, **report_kwargs
    )

    return overall_accuracy, per_class_report, predicted_ids, present_names

In [6]:
# Build CNN Model
def build_cnn(input_shape, num_classes):
    """
    Assemble and compile the 1D convolutional classifier.

    Architecture: two Conv1D(kernel 3, ReLU) -> MaxPooling1D(2) -> Dropout(0.25)
    stages with 32 and 64 filters, followed by Flatten, Dense(128, ReLU),
    Dropout(0.5) and a softmax output layer.

    Args:
        input_shape (tuple): Shape of one input sample
        num_classes (int): Number of units in the softmax output layer
    Returns:
        Sequential model compiled with Adam, categorical cross-entropy loss
        and the accuracy metric
    """
    stack = [Input(shape=input_shape)]

    # Convolutional feature extractor: same stage twice with growing filter count
    for n_filters in (32, 64):
        stack.append(Conv1D(filters=n_filters, kernel_size=3, activation="relu"))
        stack.append(MaxPooling1D(pool_size=2))
        stack.append(Dropout(0.25))

    # Classification head
    stack.extend(
        [
            Flatten(),
            Dense(128, activation="relu"),
            Dropout(0.5),
            Dense(num_classes, activation="softmax"),
        ]
    )

    cnn = Sequential(stack)
    cnn.compile(
        optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]
    )
    return cnn

In [7]:
# Experiment grid: every total sample size is split 80/20 into train/test counts.
# Integer arithmetic keeps the derived counts exact.
sample_sizes = [25, 50, 75, 100, 250, 500, 750, 1000, 2500, 5000, 7500, 10000, 20000, 30000, 40000, 50000]
testing_sizes = [total // 5 for total in sample_sizes]
training_sizes = [
    total - held_out for total, held_out in zip(sample_sizes, testing_sizes)
]

# Accumulators filled in by the experiment loop
cnn_results = []
best_model = None
best_accuracy = 0
best_sample_size = 0

In [8]:
# Iterate through sample, training, and testing sizes.
# Each pass trains a fresh CNN on the first `train_size` training rows, scores it
# on the first `test_size` test rows (each repeated `repeat_factor` times), records
# the flattened metrics, and keeps the model with the highest accuracy so far.

# Number of copies of every test sample; constant across passes, so set once.
repeat_factor = 11

for sample_size, train_size, test_size in zip(
    sample_sizes, training_sizes, testing_sizes
):
    # Select subset of training and testing data (leading rows, not a random draw)
    X_train_subset, y_train_subset = X_train[:train_size], y_train_cat[:train_size]
    X_test_subset, y_test_subset = X_test[:test_size], y_test[:test_size]

    # Repeat each test sample `repeat_factor` times. The labels stay as integer
    # class ids: evaluation compares them against argmax predictions, so no
    # one-hot copy of the expanded labels is needed.
    X_test_expanded, y_test_expanded = expand_test_set(
        X_test_subset, y_test_subset, repeat_factor
    )

    # Build the CNN model
    input_shape = (X_train_subset.shape[1], 1)  # Assuming 1D input data
    cnn_model = build_cnn(input_shape=input_shape, num_classes=num_classes)

    # Reshape data for CNN input: add a trailing channel axis of size 1
    X_train_subset = X_train_subset.reshape(-1, X_train_subset.shape[1], 1)
    X_test_expanded = X_test_expanded.reshape(-1, X_test_expanded.shape[1], 1)

    # Train the CNN model
    cnn_model.fit(X_train_subset, y_train_subset, epochs=10, batch_size=32, verbose=0)

    # Evaluate the CNN model
    accuracy, report, predictions, dynamic_labels = evaluate_cnn_model(
        cnn_model, X_test_expanded, y_test_expanded, class_labels
    )

    # Print results for the current iteration (text form of the same report;
    # labels/target_names mirror what evaluate_cnn_model used)
    print(f"\nCNN Sample size {sample_size} - Accuracy: {accuracy:.4f}")
    print(
        classification_report(
            y_test_expanded,
            predictions,
            labels=np.unique(y_test_expanded),
            target_names=dynamic_labels,
            zero_division=0,
        )
    )

    # Store results: flatten the nested report into "<row>_<metric>" columns.
    # Non-dict entries of the report are skipped; the overall accuracy is added
    # explicitly below.
    report_flattened = {
        **{
            f"{label}_{metric}": value
            for label, metrics in report.items()
            if isinstance(metrics, dict)
            for metric, value in metrics.items()
        },
        "sample_size": sample_size,
        "train_size": train_size,
        "test_size": test_size,
        "accuracy": accuracy,
    }
    cnn_results.append(report_flattened)

    # Update the best model.
    # NOTE(review): accuracies are measured on test subsets of different sizes,
    # so this comparison is not on a common test set — confirm this is intended.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = cnn_model
        best_sample_size = sample_size
        print(
            f"New best model found for sample size {sample_size} with accuracy {accuracy:.4f}"
        )

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step

CNN Sample size 25 - Accuracy: 0.0000
                                      precision    recall  f1-score   support

               Autonomic Dysfunction       0.00      0.00      0.00      11.0
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00      11.0
                             Healthy       0.00      0.00      0.00      11.0
                        Hypertension       0.00      0.00      0.00      11.0
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00      11.0

                           micro avg       0.00      0.00      0.00      55.0
                           macro avg       0.00      0.00      0.00      55.0
                        weighted avg       0.00      0.00      0.00      55.0

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step

CNN Sample size 50 - Accuracy: 0.1000
                                      precision    recall  f1-score   su

In [9]:
# Save the best model
# best_model stays None when no iteration reached an accuracy above the initial
# best_accuracy of 0; in that case nothing is written and nothing is printed.
# The file name embeds the sample size of the winning run and is relative to the
# current working directory.
# NOTE(review): recent Keras releases treat HDF5 (.h5) as a legacy save format
# and recommend ".keras" — confirm whether downstream loaders require .h5.
if best_model is not None:
    best_model_file = f"best_cnn_model_sample_size_{best_sample_size}.h5"
    best_model.save(best_model_file)
    print(
        f"\nBest CNN model saved as {best_model_file} with accuracy {best_accuracy:.4f}"
    )




Best CNN model saved as best_cnn_model_sample_size_50000.h5 with accuracy 0.9220


In [10]:
# Save results to a CSV file for further analysis
# cnn_results holds one flat dict per experiment iteration. The per-class keys
# depend on which classes appeared in that iteration's test subset, so rows can
# have different key sets (presumably surfacing as empty cells in the CSV — verify).
# The file is written relative to the current working directory, without the index.
cnn_results_df = pd.DataFrame(cnn_results)
cnn_results_df.to_csv("cnn_results.csv", index=False)
print("\nResults saved to 'cnn_results.csv'")


Results saved to 'cnn_results.csv'
