# ==================================================================
# Mushroom Edibility Classification (Neural Networks)
# ==================================================================
# Goal: Build a reliable classification system to determine whether a mushroom 
# is edible or poisonous based on physical and biological characteristics.
# This supports real-world food safety, foraging risk assessment, and
# biological classification use cases.
# Key Tasks:
#   1. Explore and understand mushroom feature distributions.
#   2. Encode categorical variables using One-Hot Encoding.
#   3. Train a neural network classifier on fully encoded features.
#   4. Apply PCA to reduce dimensionality while retaining ~95% variance.
#   5. Train a second neural network on PCA-transformed features.
#   6. Evaluate both models using confusion matrices, ROC curves,
#      Precision–Recall curves, and full classification metrics.
#   7. Compare performance between the encoded and PCA models.
#   8. Save the best-performing model for future prediction (mushroom_pca_model.h5).
# Tools: Python, pandas, numpy, matplotlib, seaborn, scikit-learn, TensorFlow/Keras
# Dataset: UCI Machine Learning Repository (Mushroom dataset)

# In [None]:
# ---------- IMPORT LIBRARIES ----------
import os
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    classification_report,
    roc_curve,
    auc,
    precision_recall_curve,
    average_precision_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

# In [None]:
# -------------------- REPRODUCIBILITY --------------------
# Fix the TensorFlow and NumPy global seeds so both libraries draw the same
# random numbers on every run.
tf.random.set_seed(42)
np.random.seed(42)

# ---------- PROJECT SETUP ----------
# All artifact folders sit directly under the current working directory.
project_dir = Path.cwd()
fig_dir, models_dir, outputs_dir = (
    project_dir / folder for folder in ("figures", "models", "outputs")
)

# Create whichever folders are missing; existing ones are left untouched.
for artifact_dir in (fig_dir, models_dir, outputs_dir):
    artifact_dir.mkdir(exist_ok=True)

def save_plot(filename, width=8, height=5, dpi=300):
    """
    Resize the active Matplotlib figure and write it into the figures/ folder.

    Parameters
    ----------
    filename : name of the image file created inside ``fig_dir``.
    width, height : figure size in inches, applied before saving.
    dpi : resolution forwarded to ``plt.savefig``.
    """
    active_figure = plt.gcf()
    active_figure.set_size_inches(width, height)

    destination = fig_dir / filename
    plt.savefig(destination, dpi=dpi, bbox_inches="tight")

def save_output(df, filename):
    """
    Write ``df`` to ``outputs_dir / filename`` as CSV (no index column),
    report the location on stdout, and return the resulting path.

    NOTE(review): a function with this same name is defined again further
    down in this file; whichever definition executes last is the one in effect.
    """
    target = outputs_dir / filename
    df.to_csv(target, index=False)
    print("Saved output: " + str(target))
    return target

# -------------------- PLOT STYLE AND SAVE --------------------
# Order kept as in the original: seaborn theme first, base font size second.
sns.set(style="whitegrid")
plt.rcParams["font.size"] = 14

def plot_and_save(x, y, xlabel, ylabel, title, filename):
    """Draw a single line plot, save it under figures/, then display it.

    Generic helper for simple curves (e.g., ROC, PR, accuracy curves).

    Parameters
    ----------
    x, y : sequences plotted against each other.
    xlabel, ylabel, title : axis labels and figure title.
    filename : image file name created inside ``fig_dir`` (saved at 300 dpi).
    """
    fig, ax = plt.subplots(figsize=(6, 5))
    ax.plot(x, y)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(fig_dir / filename, dpi=300, bbox_inches="tight")
    plt.show()

def save_output(df: pd.DataFrame, filename: str):
    """Write ``df`` as an index-free CSV inside outputs/ and return its path."""
    csv_path = outputs_dir / filename
    df.to_csv(csv_path, index=False)
    print("Saved output:", csv_path)
    return csv_path

# In [None]:
# -------------------- LOAD DATA --------------------
# Column labels applied to the header-less CSV, in file order:
# the target first, then the 22 descriptive attributes.
col_names = [
    "class",
    "cap-shape", "cap-surface", "cap-color",
    "bruises?", "odor",
    "gill-attachment", "gill-spacing", "gill-size", "gill-color",
    "stalk-shape", "stalk-root",
    "stalk-surface-above-ring", "stalk-surface-below-ring",
    "stalk-color-above-ring", "stalk-color-below-ring",
    "veil-type", "veil-color",
    "ring-number", "ring-type",
    "spore-print-color", "population", "habitat",
]

data_path = project_dir / "mushroom_classification_data.csv"
mushroom = pd.read_csv(data_path, names=col_names, header=None)

# Quick sanity checks: size, first rows, nulls per column, target balance.
# NOTE(review): isnull() only counts values pandas parsed as NaN; any
# placeholder tokens used for missing data would not show up here — confirm.
missing_per_column = mushroom.isnull().sum()
class_counts = mushroom["class"].value_counts()

print("Dataset shape:", mushroom.shape)
print(mushroom.head(), "\n")
print("Missing values per column:\n", missing_per_column, "\n")
print("Class distribution:\n", class_counts, "\n")

# In [None]:
# -------------------- SPLIT DATA --------------------
# Separate the target column from the predictors.
y = mushroom['class']
X = mushroom.drop(columns=['class'])

# Hold out 20% of the rows for testing, stratified on the target, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# -------------------- PREPROCESSING (ONE-HOT + LABEL ENCODING) --------------------
# Every predictor column goes through one dense one-hot encoder; categories
# unseen during fitting do not raise at transform time (handle_unknown='ignore').
one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
preprocessor = ColumnTransformer(
    transformers=[('cat', one_hot, X.columns)],
    remainder='drop',
)

# Fit on the training split only, then reuse the fitted encoder on the test split.
X_train_enc = preprocessor.fit_transform(X_train)
X_test_enc = preprocessor.transform(X_test)

# Same pattern for the target: learn the label mapping from the training labels.
label_enc = LabelEncoder()
y_train_enc = label_enc.fit_transform(y_train)
y_test_enc = label_enc.transform(y_test)

print(f"Encoded feature dimension: {X_train_enc.shape[1]}")

# In [None]:
# -------------------- MODEL A - NEURAL NETWORK (ENCODED FEATURES) --------------------
# Small fully connected binary classifier: one 16-unit ReLU hidden layer and a
# single sigmoid output unit.
# The input width is declared with an explicit Input object rather than the
# deprecated `input_dim` keyword on the first Dense layer (newer Keras
# releases emit a warning for that keyword).
model = Sequential([
    tf.keras.Input(shape=(X_train_enc.shape[1],)),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# validation_split=0.2 reserves part of the training data so that val_*
# metrics are recorded in `history` after every epoch.
history = model.fit(
    X_train_enc, y_train_enc,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    verbose=0
)

# --- Training curves (Encoded model) ---
# One figure per recorded metric; the x-axis counts epochs starting at 1.
epochs = range(1, len(history.history['accuracy']) + 1)
for history_key, y_axis_label, figure_title, image_name in (
    (
        'accuracy',
        "Accuracy",
        "Training Accuracy — Encoded Model",
        "training_accuracy_encoded.png",
    ),
    (
        'val_accuracy',
        "Validation Accuracy",
        "Validation Accuracy — Encoded Model",
        "val_accuracy_encoded.png",
    ),
):
    plot_and_save(
        epochs,
        history.history[history_key],
        xlabel="Epoch",
        ylabel=y_axis_label,
        title=figure_title,
        filename=image_name,
    )

# -------------------- EVALUATION - ENCODED MODEL --------------------
# Predicted probabilities flattened to 1-D, then hard labels at a 0.5 cutoff.
y_pred_prob_enc = model.predict(X_test_enc).flatten()
y_pred_enc = (y_pred_prob_enc > 0.5).astype(int)

# Confusion matrix, labelled with the original class names.
cm_enc = confusion_matrix(y_test_enc, y_pred_enc)
cm_display_enc = ConfusionMatrixDisplay(cm_enc, display_labels=label_enc.classes_)
cm_display_enc.plot()
plt.title("Confusion Matrix — Encoded Features")
plt.tight_layout()
plt.savefig(fig_dir / "confusion_matrix_encoded.png", dpi=300, bbox_inches="tight")
plt.show()

# Text report, kept in `report_enc` so it can be written to disk later.
report_enc = classification_report(
    y_test_enc,
    y_pred_enc,
    target_names=label_enc.classes_,
)
print("\n=== Classification Report — Encoded Model ===")
print(report_enc)

# ROC curve — Encoded model
# Only the rate arrays are needed; the thresholds array is discarded.
fpr_enc, tpr_enc = roc_curve(y_test_enc, y_pred_prob_enc)[:2]
roc_auc_enc = auc(fpr_enc, tpr_enc)
roc_title_enc = f"ROC Curve — Encoded Model (AUC = {roc_auc_enc:.3f})"
plot_and_save(
    fpr_enc,
    tpr_enc,
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title=roc_title_enc,
    filename="roc_encoded.png",
)

# Precision–Recall curve — Encoded model
# Recall goes on the x-axis and precision on the y-axis.
precision_enc, recall_enc = precision_recall_curve(y_test_enc, y_pred_prob_enc)[:2]
ap_enc = average_precision_score(y_test_enc, y_pred_prob_enc)
pr_title_enc = f"Precision–Recall Curve — Encoded Model (AP = {ap_enc:.3f})"
plot_and_save(
    recall_enc,
    precision_enc,
    xlabel="Recall",
    ylabel="Precision",
    title=pr_title_enc,
    filename="pr_curve_encoded.png",
)

# -------------------- PCA TRANSFORMATION --------------------
# A fractional n_components (0.95) asks PCA for as many components as needed
# to explain that share of the variance. Fitted on the training split only.
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_enc)
X_test_pca = pca.transform(X_test_enc)

n_encoded_features = X_train_enc.shape[1]
n_pca_components = X_train_pca.shape[1]
print(f"Original encoded dimension: {n_encoded_features}")
print(f"PCA-reduced dimension (95% variance): {n_pca_components}")

# In [None]:
# -------------------- MODEL B - NEURAL NETWORK (PCA FEATURES) --------------------
# Same architecture as the encoded-feature model (16-unit ReLU hidden layer,
# single sigmoid output), sized for the PCA-reduced inputs.
# The input width is declared with an explicit Input object rather than the
# deprecated `input_dim` keyword on the first Dense layer (newer Keras
# releases emit a warning for that keyword).
model_pca = Sequential([
    tf.keras.Input(shape=(X_train_pca.shape[1],)),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_pca.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# validation_split=0.2 reserves part of the training data so that val_*
# metrics are recorded in `history_pca` after every epoch.
history_pca = model_pca.fit(
    X_train_pca, y_train_enc,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    verbose=0
)

# --- Training curves (PCA model) ---
# One figure per recorded metric; the x-axis counts epochs starting at 1.
epochs_pca = range(1, len(history_pca.history['accuracy']) + 1)
for history_key, y_axis_label, figure_title, image_name in (
    (
        'accuracy',
        "Accuracy",
        "Training Accuracy — PCA Model",
        "training_accuracy_pca.png",
    ),
    (
        'val_accuracy',
        "Validation Accuracy",
        "Validation Accuracy — PCA Model",
        "val_accuracy_pca.png",
    ),
):
    plot_and_save(
        epochs_pca,
        history_pca.history[history_key],
        xlabel="Epoch",
        ylabel=y_axis_label,
        title=figure_title,
        filename=image_name,
    )

# -------------------- EVALUATION - PCA MODEL --------------------
# Predicted probabilities flattened to 1-D, then hard labels at a 0.5 cutoff.
y_pred_prob_pca = model_pca.predict(X_test_pca).flatten()
y_pred_pca = (y_pred_prob_pca > 0.5).astype(int)

# Confusion matrix, labelled with the original class names.
cm_pca = confusion_matrix(y_test_enc, y_pred_pca)
cm_display_pca = ConfusionMatrixDisplay(cm_pca, display_labels=label_enc.classes_)
cm_display_pca.plot()
plt.title("Confusion Matrix — PCA Features")
plt.tight_layout()
plt.savefig(fig_dir / "confusion_matrix_pca.png", dpi=300, bbox_inches="tight")
plt.show()

# Text report, kept in `report_pca` so it can be written to disk later.
report_pca = classification_report(
    y_test_enc,
    y_pred_pca,
    target_names=label_enc.classes_,
)
print("\n=== Classification Report — PCA Model ===")
print(report_pca)

# ROC curve — PCA model
# Only the rate arrays are needed; the thresholds array is discarded.
fpr_pca, tpr_pca = roc_curve(y_test_enc, y_pred_prob_pca)[:2]
roc_auc_pca = auc(fpr_pca, tpr_pca)
roc_title_pca = f"ROC Curve — PCA Model (AUC = {roc_auc_pca:.3f})"
plot_and_save(
    fpr_pca,
    tpr_pca,
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title=roc_title_pca,
    filename="roc_pca.png",
)

# Precision–Recall curve — PCA model
# Recall goes on the x-axis and precision on the y-axis.
precision_pca, recall_pca = precision_recall_curve(y_test_enc, y_pred_prob_pca)[:2]
ap_pca = average_precision_score(y_test_enc, y_pred_prob_pca)
pr_title_pca = f"Precision–Recall Curve — PCA Model (AP = {ap_pca:.3f})"
plot_and_save(
    recall_pca,
    precision_pca,
    xlabel="Recall",
    ylabel="Precision",
    title=pr_title_pca,
    filename="pr_curve_pca.png",
)

# In [None]:
# -------------------- METRICS COMPARISON SUMMARY --------------------
def _summary_row(model_name, y_pred, roc_auc, avg_precision):
    """Collect the test-set metrics of one model into a summary-table row.

    precision/recall/f1 are computed with scikit-learn's default positive
    label, i.e. the class that the label encoder mapped to 1.
    """
    return {
        "model": model_name,
        "accuracy": accuracy_score(y_test_enc, y_pred),
        "precision": precision_score(y_test_enc, y_pred),
        "recall": recall_score(y_test_enc, y_pred),
        "f1": f1_score(y_test_enc, y_pred),
        "roc_auc": roc_auc,
        "avg_precision": avg_precision,
    }


# One row per model, in the same column order for both.
metrics_summary = pd.DataFrame([
    _summary_row("Encoded NN", y_pred_enc, roc_auc_enc, ap_enc),
    _summary_row("PCA NN", y_pred_pca, roc_auc_pca, ap_pca),
])

print("\n=== Model Comparison Summary ===")
print(metrics_summary.round(4))

# Save metrics summary and classification reports
save_output(metrics_summary.round(4), "metrics_summary_mushroom_nn.csv")

# The report headers contain a non-ASCII dash, so the text files are written
# with an explicit UTF-8 encoding instead of the platform-default one; this
# keeps the files identical across operating systems and locales.
with open(outputs_dir / "classification_report_encoded.txt", "w", encoding="utf-8") as f:
    f.write("=== Classification Report — Encoded Model ===\n")
    f.write(report_enc)

with open(outputs_dir / "classification_report_pca.txt", "w", encoding="utf-8") as f:
    f.write("=== Classification Report — PCA Model ===\n")
    f.write(report_pca)

# In [None]:
# -------------------- SAVE BEST MODEL (PCA) --------------------
# NOTE(review): the PCA model is chosen unconditionally here; nothing compares
# it against the encoded-feature model before calling it "best" — confirm
# this is intended.
best_model = model_pca
model_path = models_dir / "mushroom_pca_model.h5"
best_model.save(model_path)
print(f"\nSaved best model to: {model_path}")

# The saved network consumes PCA-transformed inputs, so the .h5 file alone
# cannot score raw mushroom records. Persist the fitted one-hot preprocessor,
# the PCA projection and the label encoder next to it so the full
# raw-record -> prediction path can be rebuilt later.
# Only unpickle this file from a trusted location.
transformers_path = models_dir / "mushroom_pca_transformers.pkl"
with open(transformers_path, "wb") as transformers_file:
    pickle.dump(
        {
            "preprocessor": preprocessor,
            "pca": pca,
            "label_encoder": label_enc,
        },
        transformers_file,
    )
print(f"Saved fitted transformers to: {transformers_path}")

# ---------- PIPELINE COMPLETE ----------
# Closing checklist of the stages this script ran, printed one bullet per line.
print("Pipeline complete:")
for completed_step in (
    "Project setup (figures/, outputs/, models/ directories)",
    "Data loading & basic validation (shape, missingness, class balance)",
    "Stratified train/test split (class distribution preserved)",
    "Preprocessing: One-Hot Encoding (categorical features) + Label Encoding (target)",
    "Model A training: Neural Network on fully encoded features",
    "Model A evaluation: confusion matrix, classification report, ROC, Precision–Recall",
    "Dimensionality reduction: PCA retaining ~95% variance",
    "Model B training: Neural Network on PCA-transformed features",
    "Model B evaluation: confusion matrix, classification report, ROC, Precision–Recall",
    "Model comparison summary saved to outputs/",
    "Artifacts saved to /figures, /outputs, and /models",
    "Best model saved as mushroom_pca_model.h5",
):
    print(f"- {completed_step}")