In [2]:
# ==============================================
# Robust SHAP explainability for your Keras + ColumnTransformer pipeline
# Saves: shap_summary.png, shap_bar.png
# ==============================================
import os, csv, pickle, warnings, numpy as np, pandas as pd, matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
import shap
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Locations of the two input datasets and of the working directory.
# OUTPUT_DIR is where this script reads the fitted preprocessor and the
# trained model from, and where it writes the SHAP plots.
# NOTE(review): these are absolute, machine-specific Windows paths — adjust
# them before running on another machine.
DATA_MATHS = r"C:\Users\sagni\Downloads\Dynamic Curriculum Designer\archive (1)\Maths.csv"
DATA_PORT  = r"C:\Users\sagni\Downloads\Dynamic Curriculum Designer\archive (1)\Portuguese.csv"
OUTPUT_DIR = r"C:\Users\sagni\Downloads\Dynamic Curriculum Designer"
# Create the output directory if needed; an existing directory is not an error.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --------- tiny robust loader (CSV or Excel-in-disguise) ----------
def is_zip_or_xlsx(p):
    """Return True when the file at *p* starts with the ZIP signature ``PK``.

    The caller uses this to decide whether a file should be read as an Excel
    workbook instead of as delimited text.

    Args:
        p: Path of the file to probe.

    Returns:
        bool: True if the first two bytes are ``b"PK"``; False if they are
        not, or if the file cannot be opened or read.
    """
    try:
        with open(p, "rb") as f:
            return f.read(2) == b"PK"
    except (OSError, TypeError, ValueError):
        # Best-effort probe: an unreadable or invalid path simply counts as
        # "not a ZIP".  Unlike a bare ``except``, this does not swallow
        # KeyboardInterrupt or SystemExit.
        return False
def robust_read_any(p):
    """Load a tabular file into a DataFrame, guessing its format.

    A file that starts with the ZIP signature is read as an Excel workbook
    through openpyxl.  Anything else is parsed as delimited text by trying a
    fixed list of encodings and separators; the first combination that yields
    more than one column wins.

    Args:
        p: Path of the file to load.

    Returns:
        pandas.DataFrame: The parsed table.

    Raises:
        FileNotFoundError: If *p* does not exist.
        RuntimeError: If no encoding/separator combination produces more than
            one column.  The last parser error, if any, is attached as the
            cause.
    """
    if not os.path.exists(p):
        raise FileNotFoundError(p)
    if is_zip_or_xlsx(p):
        # Imported here so a missing openpyxl fails fast with a clear ImportError.
        import openpyxl  # noqa: F401
        return pd.read_excel(p, engine="openpyxl")

    last_error = None
    for enc in ("utf-8", "utf-8-sig", "cp1252", "latin1"):
        for sep in ("; ", ";", ",", "\t", "|"):
            try:
                df = pd.read_csv(p, encoding=enc, sep=sep, engine="python")
            except Exception as exc:
                # A wrong encoding/separator guess is expected; remember the
                # error and try the next combination.  Catching Exception
                # (not a bare except) lets Ctrl-C still abort the loop.
                last_error = exc
                continue
            # A single column means the separator did not match; keep trying.
            if df.shape[1] > 1:
                return df
    raise RuntimeError(f"Failed to parse {p}") from last_error

# --------- load & prep (same logic as training) ----------
df_m = robust_read_any(DATA_MATHS)
df_p = robust_read_any(DATA_PORT)

# Keep only the columns present in both files (sorted by name) and stack the
# two frames under a fresh index.
common_cols = sorted(set(df_m.columns) & set(df_p.columns))
df = pd.concat([df_m[common_cols], df_p[common_cols]], ignore_index=True)

if "G3" not in df.columns:
    raise ValueError("Column 'G3' missing; check the files.")

# Binary target: 1 when G3 is at least 10, otherwise 0.  Rows without a G3
# value are dropped first; a G3 value that is not numeric is coerced to NaN
# and therefore ends up as 0.
df = df.dropna(subset=["G3"]).copy()
df["target"] = pd.to_numeric(df["G3"], errors="coerce").ge(10).astype(int)

# Remove G1, G2 and G3 (the target is derived from G3) from the frame.
for c in ("G1", "G2", "G3"):
    if c in df.columns:
        df = df.drop(columns=[c])

# --------- load artifacts & align columns exactly as during training ----------
# NOTE: pickle.load can execute arbitrary code; only load a preprocessor.pkl
# that you produced yourself.
with open(os.path.join(OUTPUT_DIR, "preprocessor.pkl"), "rb") as f:
    preproc = pickle.load(f)

# The third item of each fitted transformer entry is its column list.  The
# first entry is taken as the categorical block and the second as the numeric
# block.
cat_cols, num_cols = (list(preproc.transformers_[i][2]) for i in (0, 1))
expected_raw_cols = [*cat_cols, *num_cols]

# Any expected column that the data lacks is added as all-NaN, then the frame
# is put in exactly the expected column order.
for col in expected_raw_cols:
    if col in df.columns:
        continue
    df[col] = np.nan
X_all = df[expected_raw_cols]
y_all = df["target"].astype(int).values

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=0.2,
    random_state=42,
    stratify=y_all,
)

# The preprocessor is already fitted, so it is only applied here, never refit.
X_train_proc, X_test_proc = (preproc.transform(part) for part in (X_train, X_test))

# --------- load model (.keras preferred; .h5 fallback) ----------
# Try the native "model.keras" file in OUTPUT_DIR first.
try:
    model = tf.keras.models.load_model(os.path.join(OUTPUT_DIR, "model.keras"))
except Exception:
    # Any failure above (missing file, load error, ...) falls back to
    # "model.h5"; if that load fails too, its exception propagates.
    model = tf.keras.models.load_model(os.path.join(OUTPUT_DIR, "model.h5"))

# --------- SHAP: unified API + stable masker over training data ----------
# Masker built from the transformed training data, with max_samples capped at 200.
masker = shap.maskers.Independent(
    X_train_proc,
    max_samples=min(200, X_train_proc.shape[0]),
)
# shap.Explainer picks the algorithm itself; the output recorded below this
# cell shows that a PermutationExplainer was used for this model.
explainer = shap.Explainer(model, masker)

# Explain at most the first 500 transformed test rows to keep runtime bounded.
X_for_shap = X_test_proc[: min(500, X_test_proc.shape[0])]

sv = explainer(X_for_shap)

# Reduce the result to a 2-D (n_samples, n_features) array.  If the explainer
# returned an object without a ``values`` attribute, use that object directly.
values = getattr(sv, "values", sv)
if values.ndim == 3:
    # A trailing output axis is present: keep only output 0.
    values = values[..., 0]

# The feature axis must match the data.  Exactly one surplus trailing column
# is dropped; any other mismatch is an error.
if values.shape[1] == X_for_shap.shape[1] + 1:
    values = values[:, :-1]
elif values.shape[1] != X_for_shap.shape[1]:
    raise AssertionError(f"SHAP values feature dim {values.shape[1]} != data dim {X_for_shap.shape[1]}")

# --------- Feature names from the preprocessor ----------
def get_feature_names(preproc):
    """Return the output feature names of a fitted preprocessor.

    ``preproc.get_feature_names_out()`` is used when it works.  If that call
    raises, the names are rebuilt by hand: one ``"<column>=<category>"`` name
    per category of the first transformer, followed by the column names of
    the second transformer unchanged.

    Args:
        preproc: Fitted preprocessor exposing ``transformers_``.  The fallback
            assumes entry 0 is an encoder with ``categories_`` and entry 1
            holds the numeric columns.

    Returns:
        Array of feature names.
    """
    try:
        return preproc.get_feature_names_out()
    except Exception:
        # Manual reconstruction for preprocessors lacking a working
        # get_feature_names_out().
        encoder = preproc.transformers_[0][1]
        categorical = preproc.transformers_[0][2]
        numeric = preproc.transformers_[1][2]
        encoded = [
            f"{column}={level}"
            for column, levels in zip(categorical, encoder.categories_)
            for level in levels
        ]
        return np.array(encoded + list(numeric))

feature_names = np.asarray(get_feature_names(preproc))
# The names must line up one-to-one with the columns handed to SHAP.
# Stripping a transformer prefix such as 'cat__' or 'num__' never changes how
# many names there are, so it cannot repair a count mismatch.  When the counts
# differ, fall back directly to generic names so plotting still works, and say
# so instead of doing it silently.
if feature_names.shape[0] != X_for_shap.shape[1]:
    print(
        f"[WARN] Got {feature_names.shape[0]} feature names for "
        f"{X_for_shap.shape[1]} columns; using generic names instead."
    )
    feature_names = np.array([f"f{i}" for i in range(X_for_shap.shape[1])])

# --------- Beeswarm summary plot ----------
summary_path = os.path.join(OUTPUT_DIR, "shap_summary.png")
# Draw on a fresh figure; show=False keeps the figure open so it can be saved.
plt.figure()
shap.summary_plot(
    values,
    X_for_shap,
    feature_names=feature_names,
    show=False,
)
plt.tight_layout()
plt.savefig(summary_path, dpi=150)
plt.close()
print(f"[INFO] Saved SHAP summary -> {summary_path}")

# --------- Mean |SHAP| bar plot ----------
bar_path = os.path.join(OUTPUT_DIR, "shap_bar.png")
# Same inputs as the beeswarm plot, rendered with plot_type="bar".
plt.figure()
shap.summary_plot(
    values,
    X_for_shap,
    feature_names=feature_names,
    plot_type="bar",
    show=False,
)
plt.tight_layout()
plt.savefig(bar_path, dpi=150)
plt.close()
print(f"[INFO] Saved SHAP bar plot -> {bar_path}")


PermutationExplainer explainer: 211it [00:31,  6.59it/s]                                                                                                   


[INFO] Saved SHAP summary -> C:\Users\sagni\Downloads\Dynamic Curriculum Designer\shap_summary.png
[INFO] Saved SHAP bar plot -> C:\Users\sagni\Downloads\Dynamic Curriculum Designer\shap_bar.png
