In [3]:
# ===== IMPORTS =====
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import lime
from lime import lime_tabular

from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Model

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_selection import SequentialFeatureSelector


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# ==========================================================
# STEP 1. FEATURE EXTRACTION USING VGG16
# ==========================================================
def extract_vgg16_features(dataset_path, image_size=(224, 224), batch_size=32):
    """Extract flattened VGG16 convolutional features for every image in a directory tree.

    Images are read with ``flow_from_directory`` (one sub-folder per class),
    preprocessed with VGG16's ``preprocess_input`` and pushed through the
    ImageNet-pretrained VGG16 convolutional base (``include_top=False``).

    Parameters
    ----------
    dataset_path : str
        Root directory passed to ``flow_from_directory``.
    image_size : tuple of int, default (224, 224)
        (height, width) the images are resized to. The network input shape is
        derived from this value so the generator and the model always agree.
    batch_size : int, default 32
        Batch size used by the generator during prediction.

    Returns
    -------
    features_flat : numpy.ndarray
        One row per image; the convolutional output flattened to 1-D.
    labels : numpy.ndarray
        Integer class index per image (``generator.classes``).
    class_names : list of str
        Class names taken from the keys of ``generator.class_indices``.
    """
    datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
    generator = datagen.flow_from_directory(
        dataset_path,
        target_size=image_size,
        batch_size=batch_size,
        class_mode='categorical',
        # Keep directory order so predictions line up with generator.classes.
        shuffle=False
    )

    # Build the input shape from image_size instead of hard-coding 224x224;
    # otherwise a non-default image_size makes the generator's batches
    # incompatible with the network input.
    input_shape = tuple(image_size) + (3,)
    base_model = VGG16(weights='imagenet', include_top=False, input_shape=input_shape)

    # The convolutional base is already a complete Model, so predict with it directly.
    features = base_model.predict(generator, verbose=1)
    features_flat = features.reshape(features.shape[0], -1)
    labels = generator.classes
    class_names = list(generator.class_indices.keys())

    return features_flat, labels, class_names


# ==========================================================
# STEP 2. FEATURE CORRELATION & HEATMAP
# ==========================================================
def plot_feature_correlation(features, n_features=100):
    """Plot a correlation heatmap of the leading feature columns.

    Parameters
    ----------
    features : numpy.ndarray
        2-D array with one row per sample and one column per feature.
    n_features : int, default 100
        Maximum number of leading columns to include; limited for readability.
        If ``features`` has fewer columns, all of them are plotted.

    Notes
    -----
    Columns with zero variance produce NaN correlations, which appear as
    blank cells in the heatmap.
    """
    # Never claim more features in the title than were actually plotted.
    n_shown = min(n_features, features.shape[1])

    # rowvar=False treats each column as a variable, so no transpose is needed.
    corr_matrix = np.corrcoef(features[:, :n_shown], rowvar=False)

    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, cmap='coolwarm', center=0)
    plt.title(f"Feature Correlation Heatmap (First {n_shown} Features)")
    plt.show()


# ==========================================================
# STEP 3. PCA DIMENSIONALITY REDUCTION
# ==========================================================
def apply_pca(features, variance_ratio):
    """Standardize the features and reduce their dimensionality with PCA.

    Parameters
    ----------
    features : array-like
        Feature matrix handed to ``StandardScaler.fit_transform``.
    variance_ratio : float or int
        Forwarded to ``PCA`` as ``n_components``.

    Returns
    -------
    reduced_features : numpy.ndarray
        The standardized features projected onto the fitted components.
    pca : PCA
        The fitted PCA object.
    """
    standardized = StandardScaler().fit_transform(features)
    pca = PCA(n_components=variance_ratio)
    return pca.fit_transform(standardized), pca


# ==========================================================
# STEP 4. CLASSIFICATION FUNCTION
# ==========================================================
def classify_models(X, y, test_size=0.2, random_state=42):
    """Train a RandomForest and an RBF SVM and score them on a held-out split.

    The data is divided with a stratified train/test split; models are fit on
    the training part only and every reported metric is computed on the test
    part, so the numbers reflect generalization rather than memorization.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Class label per sample.
    test_size : float, default 0.2
        Fraction of the samples held out for evaluation.
    random_state : int, default 42
        Seed used for the split and for both classifiers.

    Returns
    -------
    results_df : pandas.DataFrame
        One row per model with columns ``Model`` and ``Accuracy`` (test accuracy).
    models : dict
        Maps model name to the estimator fitted on the training split.
    """
    # Function-scope import: the notebook's import cell does not bring in
    # train_test_split, and this keeps the function usable on its own.
    from sklearn.model_selection import train_test_split

    # Stratify so every class keeps its proportion in both partitions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    models = {
        "RandomForest": RandomForestClassifier(n_estimators=100, random_state=random_state),
        "SVM": SVC(kernel='rbf', probability=True, random_state=random_state)
    }

    results = []
    for name, model in models.items():
        model.fit(X_train, y_train)
        # Evaluate on samples the model has never seen during fitting.
        preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds)
        results.append({"Model": name, "Accuracy": acc})
        print(f"\n{name} Accuracy: {acc:.4f}")
        print(classification_report(y_test, preds))

    results_df = pd.DataFrame(results)
    return results_df, models


# ==========================================================
# STEP 5. SEQUENTIAL FEATURE SELECTION
# ==========================================================
def sequential_feature_selection(model, X, y, n_features=50):
    """Select a subset of feature columns by forward sequential selection.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``SequentialFeatureSelector``.
    X : array-like
        Feature matrix.
    y : array-like
        Target labels.
    n_features : int, default 50
        Passed as ``n_features_to_select``.

    Returns
    -------
    The result of transforming ``X`` with the fitted selector.
    """
    selector = SequentialFeatureSelector(
        model,
        direction='forward',
        n_features_to_select=n_features,
    )
    return selector.fit(X, y).transform(X)


# ==========================================================
# STEP 6. LIME AND SHAP EXPLAINABILITY
# ==========================================================
def explain_with_lime_shap(model, X, feature_names):
    """Show a LIME explanation for the first sample and a SHAP summary plot.

    Parameters
    ----------
    model : estimator
        Must provide ``predict_proba`` (used by LIME) and be accepted by
        ``shap.Explainer``.
        NOTE(review): presumably the model must already be fitted - confirm at the call site.
    X : numpy.ndarray
        Feature matrix; used as LIME training data, as the SHAP background
        data, and as the set of samples SHAP explains.
    feature_names : list of str
        Names shown for the columns of ``X`` in both outputs.
    """
    # LIME: local explanation of the first row of X only.
    lime_explainer = lime_tabular.LimeTabularExplainer(
        X,
        feature_names=feature_names,
        mode="classification",
    )
    first_row_explanation = lime_explainer.explain_instance(X[0], model.predict_proba)
    first_row_explanation.show_in_notebook(show_table=True)

    # SHAP: explain every row of X and summarize the attributions.
    explanation = shap.Explainer(model, X)(X)
    shap.summary_plot(explanation, X, feature_names=feature_names)


In [None]:
# ======== MAIN PIPELINE ========

dataset_dir = "balanced"
# On-disk cache for the extracted features. Extraction takes a long time, so
# it is done once and reloaded afterwards. Delete this file whenever the
# dataset or the extraction settings change.
feature_cache = "vgg16_features.npz"

# Step 1: Feature extraction (reuse cached features when available)
if os.path.exists(feature_cache):
    with np.load(feature_cache) as cached:
        features = cached["features"]
        labels = cached["labels"]
        class_names = cached["class_names"].tolist()
else:
    features, labels, class_names = extract_vgg16_features(dataset_dir)
    np.savez(
        feature_cache,
        features=features,
        labels=labels,
        class_names=np.array(class_names),
    )

# Step 2: Correlation heatmap
plot_feature_correlation(features)

# Step 3 + 4: PCA dimensionality reduction and classification (99% variance)
reduced_99, pca_99 = apply_pca(features, 0.99)
results_99, models_99 = classify_models(reduced_99, labels)
print("\nPCA 99% Results:\n", results_99)

# Step 3 + 4: PCA dimensionality reduction and classification (95% variance)
reduced_95, pca_95 = apply_pca(features, 0.95)
results_95, models_95 = classify_models(reduced_95, labels)
print("\nPCA 95% Results:\n", results_95)

# Step 5: Sequential Feature Selection
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
selected_features = sequential_feature_selection(rf_model, reduced_95, labels)
results_sfs, models_sfs = classify_models(selected_features, labels)

# Step 6: Explainability (LIME & SHAP)
# Use the RandomForest that classify_models actually fitted. `rf_model` above
# is never fitted: SequentialFeatureSelector works on clones of the estimator
# it is given, so calling predict_proba on `rf_model` would raise.
explain_with_lime_shap(
    models_sfs["RandomForest"],
    selected_features,
    [f"f{i}" for i in range(selected_features.shape[1])],
)


Found 22020 images belonging to 5 classes.


  self._warn_if_super_not_called()


[1m371/689[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m51:34[0m 10s/step

In [None]:
# Write each experiment's results table to its own CSV file for the IEEE report.
output_tables = {
    "results_PCA99.csv": results_99,
    "results_PCA95.csv": results_95,
    "results_SFS.csv": results_sfs,
}
for csv_name, table in output_tables.items():
    table.to_csv(csv_name, index=False)
print("✅ Results saved: results_PCA99.csv, results_PCA95.csv, results_SFS.csv")
