In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, Dropout, MaxPooling1D, Input
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import pickle

In [2]:
# Read the pickled preprocessing outputs and unpack the pieces used for training.
# NOTE(review): both paths are absolute and machine-specific; this cell only runs
# where these files exist at exactly these locations.
preprocessed_path = "C:/Users/syafi/Desktop/syafiq-project/classification-task/model/saved_data/preprocessed_data.pkl"
original_features_path = "C:/Users/syafi/Desktop/syafiq-project/classification-task/model/saved_data/original_features.pkl"

preprocessed_data = pd.read_pickle(preprocessed_path)
X_scaled, y_resampled, class_labels = (
    preprocessed_data[key] for key in ("X_scaled", "y_resampled", "class_labels")
)

original_features = pd.read_pickle(original_features_path)

In [3]:
# Training-subsample sizes to sweep over, in increasing order
sample_sizes = [
    25, 50, 75, 100,
    250, 500, 750, 1_000,
    2_500, 5_000, 7_500, 10_000,
    20_000, 30_000, 40_000, 50_000,
]

In [4]:
# Function to create a stratified subsample
def stratified_subsample(X, y, train_size, random_state=42):
    """Draw a class-stratified subsample of ``train_size`` rows from ``X`` and ``y``.

    Parameters
    ----------
    X : sized collection of feature rows
        Anything ``len()`` and ``train_test_split`` accept.
    y : labels aligned with ``X``
        Also used as the stratification key.
    train_size : int
        Number of rows to draw.
    random_state : int, default 42
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_sample, y_sample)``. When ``train_size == len(X)`` the inputs are
        returned unchanged (same objects, original order).

    Raises
    ------
    ValueError
        If ``train_size`` exceeds ``len(X)``. ``train_test_split`` can also
        raise ``ValueError`` for splits it cannot stratify.
    """
    n_available = len(X)
    if n_available < train_size:
        raise ValueError(f"Insufficient data for sample size {train_size}.")
    if n_available == train_size:
        # train_test_split rejects an integer train_size equal to the number of
        # samples (nothing would remain for the discarded part), so asking for
        # every row simply hands back the full data set.
        return X, y
    X_sample, _, y_sample, _ = train_test_split(
        X, y, train_size=train_size, stratify=y, random_state=random_state
    )
    return X_sample, y_sample

In [5]:
def build_cnn(input_shape, num_classes):
    """Assemble and compile a small 1D convolutional classifier.

    The network has two Conv1D -> MaxPooling1D -> Dropout(0.25) stages
    (32 then 64 filters, kernel size 3), followed by Flatten, a 128-unit
    ReLU dense layer, Dropout(0.5) and a softmax layer with ``num_classes``
    units. It is compiled with the Adam optimizer, categorical
    cross-entropy loss and the accuracy metric.

    Parameters
    ----------
    input_shape : shape passed to the ``Input`` layer.
    num_classes : number of units in the softmax output layer.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    # Layers are created in network order: input, conv stages, then the head.
    layer_stack = [Input(shape=input_shape)]

    # First convolutional stage
    layer_stack += [
        Conv1D(filters=32, kernel_size=3, activation="relu"),
        MaxPooling1D(pool_size=2),
        Dropout(0.25),
    ]

    # Second convolutional stage
    layer_stack += [
        Conv1D(filters=64, kernel_size=3, activation="relu"),
        MaxPooling1D(pool_size=2),
        Dropout(0.25),
    ]

    # Dense classification head
    layer_stack += [
        Flatten(),
        Dense(128, activation="relu"),
        Dropout(0.5),
        Dense(num_classes, activation="softmax"),
    ]

    network = Sequential(layer_stack)
    network.compile(
        loss="categorical_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )
    return network

In [None]:
# Train and evaluate one CNN per sample size, collecting metrics in `results`.
# Any ValueError raised while sampling, splitting, training or evaluating a
# given size is reported and that size is skipped.
EPOCHS = 10
BATCH_SIZE = 32

results = {}

num_classes = len(class_labels)
# Class ids 0..num_classes-1, the same encoding to_categorical requires below.
# Passing them explicitly keeps classification_report aligned with
# `class_labels` even when a class is absent from a small test split;
# without `labels=` sklearn raises ValueError on the target_names mismatch
# and the already-trained run would be discarded by the except below.
report_labels = np.arange(num_classes)

for sample_size in sample_sizes:
    try:
        X_sample, y_sample = stratified_subsample(X_scaled, y_resampled, sample_size)

        # Hold out 20%, or a larger fraction when needed so that the test
        # fraction corresponds to at least `num_classes` rows.
        test_size = max(0.2, num_classes / len(y_sample))
        stratify = y_sample if len(y_sample) >= num_classes else None

        X_train, X_test, y_train, y_test = train_test_split(
            X_sample, y_sample, test_size=test_size, stratify=stratify, random_state=42
        )

        # Reshape data for CNN: add a trailing axis of length 1 so every row
        # has the (n_features, 1) shape passed to build_cnn. np.asarray
        # accepts both DataFrames and plain arrays.
        X_train = np.asarray(X_train)
        X_test = np.asarray(X_test)
        n_features = X_train.shape[1]
        input_shape = (n_features, 1)
        X_train = X_train.reshape(-1, n_features, 1)
        X_test = X_test.reshape(-1, n_features, 1)

        y_train = to_categorical(y_train, num_classes=num_classes)
        y_test_labels = y_test  # Save original labels for evaluation
        y_test = to_categorical(y_test, num_classes=num_classes)

        # Build and train CNN model
        cnn_model = build_cnn(input_shape=input_shape, num_classes=num_classes)
        cnn_model.fit(
            X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=2
        )

        # Evaluate the model on the held-out rows
        y_pred = cnn_model.predict(X_test)
        y_pred_classes = np.argmax(y_pred, axis=1)
        accuracy = accuracy_score(y_test_labels, y_pred_classes)

        # Same arguments for the printed and the stored report so both
        # describe identical numbers (undefined metrics are reported as 1).
        report_kwargs = dict(
            labels=report_labels,
            target_names=class_labels,
            zero_division=1,
        )

        print(f"Classification Report for Sample Size {sample_size}:")
        print(classification_report(y_test_labels, y_pred_classes, **report_kwargs))

        results[sample_size] = {
            "accuracy": accuracy,
            "classification_report": classification_report(
                y_test_labels, y_pred_classes, output_dict=True, **report_kwargs
            ),
        }

    except ValueError as e:
        print(f"Skipping sample size {sample_size}: {e}")

In [None]:
# Persist whatever model is currently bound to `cnn_model` (the one most
# recently built by the training loop; it is assumed, not verified, to be the best).
model_path = "cnn_model.h5"
cnn_model.save(model_path)
print(f"CNN model saved as {model_path}")

In [None]:
# Print one accuracy summary line per evaluated sample size
for sample_size, metrics in results.items():
    accuracy_text = format(metrics["accuracy"], ".4f")
    print(f"Sample Size: {sample_size} - Accuracy: {accuracy_text}")