# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score

# Load the dataset
file_path = r"dataset.csv"  # Update this path to your CSV file location
df = pd.read_csv(file_path)

# Shuffle the dataset.
# The shuffle is seeded: the train/test split below uses a fixed random_state,
# but a fixed seed only yields the same partition when the row order it
# receives is itself reproducible.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Separate features and labels ('frame_name' is dropped from the features)
labels = df['class']
features = df.drop(columns=['frame_name', 'class'])

# Ensure all features are numeric: values that cannot be parsed become NaN
# and are then replaced with 0
features = features.apply(pd.to_numeric, errors='coerce')
features = features.fillna(0)

# Convert the DataFrame to a float32 NumPy array
features_array = features.to_numpy(dtype=np.float32)

# Encode the class labels as integer indices
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# One-hot encode the labels
labels_categorical = to_categorical(labels_encoded)

# Split the dataset into training (70%) and test (30%) sets
X_train_full, X_test, y_train_full, y_test = train_test_split(
    features_array,
    labels_categorical,
    test_size=0.3,
    random_state=42,
)

# Define a function to create the model
def create_model(input_shape, num_classes):
    """Build and compile a fully connected softmax classifier.

    The network stacks three ReLU hidden layers (128, 80 and 64 units) and a
    softmax output layer with one unit per class. It is compiled with the
    Adam optimizer, categorical cross-entropy loss and the accuracy metric,
    so targets are expected to be one-hot encoded.

    Args:
        input_shape: Shape of a single sample without the batch dimension,
            e.g. ``(n_features,)``.
        num_classes: Number of output classes (width of the softmax layer).

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential([
        # Declare the input with an explicit Input object instead of passing
        # ``input_shape`` to the first Dense layer, which current Keras
        # versions warn against.
        tf.keras.Input(shape=input_shape),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(80, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(num_classes, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# K-Fold Cross-Validation
k = 6  # Number of folds
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Per-fold metrics, one entry appended per fold
val_accuracies = []
val_losses = []
test_accuracies = []
test_losses = []

for train_index, val_index in kf.split(X_train_full):
    # A new model is built for every fold. Clear the Keras global state left
    # behind by the previous fold so memory use does not grow with the number
    # of folds.
    tf.keras.backend.clear_session()

    X_train, X_val = X_train_full[train_index], X_train_full[val_index]
    y_train, y_val = y_train_full[train_index], y_train_full[val_index]

    # Early stopping callback: stop after 5 epochs without improvement in
    # val_loss and restore the best weights. It is created per fold so that
    # no stopping state is shared between the fits of different folds.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
    )

    # Create and train the model
    model = create_model((features_array.shape[1],), labels_categorical.shape[1])
    history = model.fit(
        X_train,
        y_train,
        epochs=70,
        batch_size=5,
        validation_data=(X_val, y_val),
        callbacks=[early_stopping],
        shuffle=True,
    )

    # Evaluate the model on validation set
    val_loss, val_acc = model.evaluate(X_val, y_val)
    val_losses.append(val_loss)
    val_accuracies.append(val_acc)

    # Evaluate the model on test set (using the same test set for simplicity)
    test_loss, test_acc = model.evaluate(X_test, y_test)
    test_losses.append(test_loss)
    test_accuracies.append(test_acc)

# Aggregate the per-fold metrics: mean and standard deviation (NumPy's
# default, i.e. ddof=0) for the validation and test results
mean_val_loss, std_val_loss = np.mean(val_losses), np.std(val_losses)
mean_val_acc, std_val_acc = np.mean(val_accuracies), np.std(val_accuracies)

mean_test_loss, std_test_loss = np.mean(test_losses), np.std(test_losses)
mean_test_acc, std_test_acc = np.mean(test_accuracies), np.std(test_accuracies)

# Print final validation and test metrics as "<name>: <mean> ± <std>"
summary_rows = (
    ("Validation Loss", mean_val_loss, std_val_loss),
    ("Validation Accuracy", mean_val_acc, std_val_acc),
    ("Test Loss", mean_test_loss, std_test_loss),
    ("Test Accuracy", mean_test_acc, std_test_acc),
)
for metric_name, metric_mean, metric_std in summary_rows:
    print(f"{metric_name}: {metric_mean:.4f} ± {metric_std:.4f}")

# Persist the model trained in the last fold
model.save("model.h5")

# Learning curves of the last fold: accuracy in the left panel, loss in the
# right panel. Each entry is (subplot position, history key, display name).
curve_panels = (
    (1, 'accuracy', 'Accuracy'),
    (2, 'loss', 'Loss'),
)

plt.figure(figsize=(12, 4))
for panel_position, history_key, display_name in curve_panels:
    plt.subplot(1, 2, panel_position)
    # Training curve first, then the validation curve stored under 'val_<key>'
    plt.plot(history.history[history_key], label=f'Train {display_name}')
    plt.plot(history.history[f'val_{history_key}'], label=f'Validation {display_name}')
    plt.xlabel('Epochs')
    plt.ylabel(display_name)
    plt.legend()
    plt.title(f'Training and Validation {display_name}')

plt.show()

# Number of classes known to the label encoder. Every per-class summary below
# is pinned to this size so that a class missing from the test split or from
# the predictions still gets its own (zero) entry.
num_classes = len(label_encoder.classes_)

# Check class distribution in the test set
# (argmax turns the one-hot rows back into class indices)
test_class_distribution = np.argmax(y_test, axis=1)
print(
    "Test set class distribution:",
    np.bincount(test_class_distribution, minlength=num_classes),
)

# Check class distribution in the predictions of the last trained model
predictions = np.argmax(model.predict(X_test), axis=1)
print(
    "Predicted class distribution:",
    np.bincount(predictions, minlength=num_classes),
)

# Generate confusion matrix. `labels` fixes the matrix to one row/column per
# encoder class, so it always lines up with the tick labels passed below.
conf_matrix = confusion_matrix(
    test_class_distribution,
    predictions,
    labels=np.arange(num_classes),
)
plt.figure(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
)
plt.xlabel('Predicted Class')
plt.ylabel('True Class')
plt.title('Confusion Matrix')
plt.show()
