In [1]:
# === MindPal: Train + Accuracy Graph + Confusion Matrix Heatmaps ===
# Saves: mindpal_model.h5, mindpal_model.json, mindpal_model.yaml, mindpal_preprocess.pkl
#        accuracy.png, loss.png, confusion_matrix.png, confusion_matrix_norm.png
#        training_history.csv, classification_report.txt, predictions.csv

import os
import json
import yaml
import joblib
import numpy as np
import pandas as pd
from scipy import sparse as sp

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# -----------------------------
# Paths
# -----------------------------
CSV_PATH = r"C:\Users\sagni\Downloads\Mind Pal\archive\data.csv"
OUT_DIR  = r"C:\Users\sagni\Downloads\Mind Pal"
os.makedirs(OUT_DIR, exist_ok=True)

# -----------------------------
# Helper transformers (picklable)
# -----------------------------
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Select a single column as a 2D DataFrame."""
    def __init__(self, column):
        self.column = column
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[[self.column]]

class To1DString(BaseEstimator, TransformerMixin):
    """Convert a 2D array/DataFrame (n,1) to 1D array[str] for text vectorizers."""
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            arr = X.iloc[:, 0].astype(str).values
        else:
            arr = np.asarray(X).astype(str).ravel()
        return arr

class DateTimeExpand(BaseEstimator, TransformerMixin):
    """Expand datetime columns into year/month/day/dow numeric features."""
    def __init__(self, columns):
        self.columns = columns
        self.out_cols = []
    def fit(self, X, y=None):
        self.out_cols = []
        for c in self.columns:
            self.out_cols += [f"{c}_year", f"{c}_month", f"{c}_day", f"{c}_dow"]
        return self
    def transform(self, X):
        outs = []
        for c in self.columns:
            s = pd.to_datetime(X[c], errors="coerce")
            outs.append(pd.DataFrame({
                f"{c}_year":  s.dt.year.fillna(0).astype(int),
                f"{c}_month": s.dt.month.fillna(0).astype(int),
                f"{c}_day":   s.dt.day.fillna(0).astype(int),
                f"{c}_dow":   s.dt.dayofweek.fillna(0).astype(int),
            }))
        return pd.concat(outs, axis=1) if outs else np.empty((len(X), 0))

def to_tf_sparse(spmat):
    spmat = spmat.tocoo()
    idx = np.vstack((spmat.row, spmat.col)).T
    return tf.SparseTensor(indices=idx, values=spmat.data, dense_shape=spmat.shape)

# -----------------------------
# Load data
# -----------------------------
df = pd.read_csv(CSV_PATH)

# Pick target automatically (common names → else last column)
CANDIDATE_TARGETS = ["target", "label", "y", "class", "output"]
target_col = next((c for c in CANDIDATE_TARGETS if c in df.columns), df.columns[-1])

X_df = df.drop(columns=[target_col])
y_raw = df[target_col]

# -----------------------------
# Column typing
# -----------------------------
numeric_cols   = X_df.select_dtypes(include=[np.number]).columns.tolist()
obj_cols       = X_df.select_dtypes(include=["object"]).columns.tolist()
datetime_cols  = X_df.select_dtypes(include=["datetime64[ns]"]).columns.tolist()

# Try parse date-like object columns
for c in list(set(X_df.columns) - set(numeric_cols) - set(datetime_cols)):
    try:
        parsed = pd.to_datetime(X_df[c], errors="raise")
        X_df[c] = parsed
        datetime_cols.append(c)
        if c in obj_cols: obj_cols.remove(c)
    except Exception:
        pass

# Heuristic: long strings => text, short strings => categorical
text_cols, cat_cols = [], []
for c in obj_cols:
    s = X_df[c].astype(str)
    (text_cols if s.str.len().mean() > 40 else cat_cols).append(c)

# -----------------------------
# Preprocessing
# -----------------------------
transformers = []

if numeric_cols:
    num_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    transformers.append(("num", num_pipe, numeric_cols))

if cat_cols:
    cat_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse=True))
    ])
    transformers.append(("cat", cat_pipe, cat_cols))

for c in text_cols:
    txt_pipe = Pipeline([
        ("select",  ColumnSelector(c)),
        ("impute",  SimpleImputer(strategy="constant", fill_value="")),
        ("to1d",    To1DString()),
        ("tfidf",   TfidfVectorizer(max_features=5000))
    ])
    transformers.append((f"text_{c}", txt_pipe, [c]))

if datetime_cols:
    dt_pipe = Pipeline([
        ("expand",  DateTimeExpand(datetime_cols)),
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler",  StandardScaler())
    ])
    transformers.append(("dt", dt_pipe, datetime_cols))

preprocess = ColumnTransformer(transformers=transformers, sparse_threshold=0.3)

# -----------------------------
# Split + encode target
# -----------------------------
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw.astype(str))
n_classes = len(np.unique(y))

X_train_df, X_test_df, y_train, y_test = train_test_split(
    X_df, y, test_size=0.2, random_state=42, stratify=y if n_classes > 1 else None
)

# Fit transform
X_train = preprocess.fit_transform(X_train_df, y_train)
X_test  = preprocess.transform(X_test_df)

# Densify if feature count is modest
densify = hasattr(X_train, "toarray") and (X_train.shape[1] <= 50000)
if densify:
    X_train_ = X_train.toarray()
    X_test_  = X_test.toarray()
    input_dim = X_train_.shape[1]
    use_sparse = False
else:
    input_dim = X_train.shape[1]
    use_sparse = hasattr(X_train, "tocsr")

# -----------------------------
# Build & train model
# -----------------------------
tf.random.set_seed(42)

def build_model(input_dim, n_classes):
    m = Sequential()
    m.add(Dense(128, activation="relu", input_shape=(input_dim,)))
    m.add(Dropout(0.3))
    m.add(Dense(64, activation="relu"))
    if n_classes <= 2:
        m.add(Dense(1, activation="sigmoid"))
        m.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    else:
        m.add(Dense(n_classes, activation="softmax"))
        m.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return m

model = build_model(input_dim=input_dim, n_classes=n_classes)

if use_sparse:
    train_X = tf.sparse.to_dense(to_tf_sparse(X_train))
    test_X  = tf.sparse.to_dense(to_tf_sparse(X_test))
else:
    train_X, test_X = (X_train_ if densify else X_train), (X_test_ if densify else X_test)

history = model.fit(
    train_X, y_train,
    validation_data=(test_X, y_test),
    epochs=10,
    batch_size=32,
    verbose=1
)

# -----------------------------
# Save history CSV
# -----------------------------
hist_df = pd.DataFrame(history.history)
hist_df.index = np.arange(1, len(hist_df) + 1)  # epoch numbering from 1
hist_csv = os.path.join(OUT_DIR, "training_history.csv")
hist_df.to_csv(hist_csv, index_label="epoch")

# -----------------------------
# Plot: Accuracy & Loss graphs (Matplotlib only)
# -----------------------------
def plot_curve(y_series, y_val_series, title, ylabel, out_path):
    plt.figure(figsize=(8, 5))
    plt.plot(hist_df.index, y_series, label="Train")
    if y_val_series is not None:
        plt.plot(hist_df.index, y_val_series, label="Validation")
    plt.title(title)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.legend()
    plt.grid(True, linestyle="--", linewidth=0.5)
    plt.tight_layout()
    plt.savefig(out_path, dpi=150)
    plt.close()

acc_key = "accuracy" if "accuracy" in hist_df.columns else "acc"
val_acc_key = "val_accuracy" if "val_accuracy" in hist_df.columns else ("val_acc" if "val_acc" in hist_df.columns else None)

plot_curve(
    hist_df[acc_key],
    hist_df[val_acc_key] if val_acc_key in hist_df.columns else None,
    "Model Accuracy", "Accuracy",
    os.path.join(OUT_DIR, "accuracy.png")
)

plot_curve(
    hist_df["loss"],
    hist_df["val_loss"] if "val_loss" in hist_df.columns else None,
    "Model Loss", "Loss",
    os.path.join(OUT_DIR, "loss.png")
)

# -----------------------------
# Predictions, Confusion Matrix & Heatmaps
# -----------------------------
probs = model.predict(test_X, verbose=0)

if n_classes <= 2:
    y_pred = (probs.ravel() >= 0.5).astype(int)
    class_names = [str(c) for c in label_encoder.classes_]
else:
    y_pred = np.argmax(probs, axis=1)
    class_names = [str(c) for c in label_encoder.classes_]

# Save predictions CSV
pred_csv = os.path.join(OUT_DIR, "predictions.csv")
pd.DataFrame({
    "true_label": label_encoder.inverse_transform(y_test),
    "pred_label": label_encoder.inverse_transform(y_pred)
}).to_csv(pred_csv, index=False)

# Classification report
report_txt = os.path.join(OUT_DIR, "classification_report.txt")
with open(report_txt, "w", encoding="utf-8") as f:
    f.write(classification_report(y_test, y_pred, target_names=class_names))

# Confusion matrices
cm = confusion_matrix(y_test, y_pred, labels=np.arange(len(class_names)))
cm_norm = cm.astype("float") / (cm.sum(axis=1, keepdims=True) + 1e-12)

def plot_cm(cm_mat, labels, title, out_path, normalize=False):
    plt.figure(figsize=(8, 6))
    im = plt.imshow(cm_mat, interpolation="nearest", aspect="auto")
    plt.title(title)
    plt.colorbar(im, fraction=0.046, pad=0.04)
    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels, rotation=45, ha="right")
    plt.yticks(tick_marks, labels)

    # annotate
    thresh = cm_mat.max() / 2. if cm_mat.size else 0.5
    for i in range(cm_mat.shape[0]):
        for j in range(cm_mat.shape[1]):
            val = cm_mat[i, j]
            txt = f"{val:.2f}" if normalize else f"{int(val)}"
            plt.text(j, i, txt, ha="center", va="center",
                     color="white" if val > thresh else "black", fontsize=9)

    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    plt.tight_layout()
    plt.savefig(out_path, dpi=150)
    plt.close()

plot_cm(cm, class_names, "Confusion Matrix (Counts)", os.path.join(OUT_DIR, "confusion_matrix.png"), normalize=False)
plot_cm(cm_norm, class_names, "Confusion Matrix (Row-Normalized)", os.path.join(OUT_DIR, "confusion_matrix_norm.png"), normalize=True)

# -----------------------------
# Save core artifacts
# -----------------------------
# 1) Keras H5
h5_path = os.path.join(OUT_DIR, "mindpal_model.h5")
model.save(h5_path)

# 2) JSON config
json_path = os.path.join(OUT_DIR, "mindpal_model.json")
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(model.get_config(), f, indent=2)

# 3) YAML config
yaml_path = os.path.join(OUT_DIR, "mindpal_model.yaml")
with open(yaml_path, "w", encoding="utf-8") as f:
    yaml.safe_dump(model.get_config(), f, sort_keys=False)

# 4) Preprocessing bundle
pkl_path = os.path.join(OUT_DIR, "mindpal_preprocess.pkl")
joblib.dump({
    "preprocess": preprocess,
    "label_encoder": label_encoder,
    "target_col": target_col,
    "numeric_cols": numeric_cols,
    "cat_cols": cat_cols,
    "text_cols": text_cols,
    "datetime_cols": datetime_cols
}, pkl_path, compress=3, protocol=4)

print("\n=== Saved Artifacts to:", OUT_DIR, "===")
print(" - accuracy.png")
print(" - loss.png")
print(" - confusion_matrix.png")
print(" - confusion_matrix_norm.png")
print(" - training_history.csv")
print(" - classification_report.txt")
print(" - predictions.csv")
print(" - mindpal_model.h5 / .json / .yaml / mindpal_preprocess.pkl")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.7310 - loss: 0.6284 - val_accuracy: 0.8407 - val_loss: 0.4191
Epoch 2/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8558 - loss: 0.3567 - val_accuracy: 0.8407 - val_loss: 0.3536
Epoch 3/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8582 - loss: 0.2423 - val_accuracy: 0.8508 - val_loss: 0.2784
Epoch 4/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9499 - loss: 0.1377 - val_accuracy: 0.8949 - val_loss: 0.2518
Epoch 5/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9931 - loss: 0.0571 - val_accuracy: 0.8983 - val_loss: 0.2677
Epoch 6/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9979 - loss: 0.0203 - val_accuracy: 0.9017 - val_loss: 0.3118
Epoch 7/10
[1m37/37[0m [32m━━━━━━━━━




=== Saved Artifacts to: C:\Users\sagni\Downloads\Mind Pal ===
 - accuracy.png
 - loss.png
 - confusion_matrix.png
 - confusion_matrix_norm.png
 - training_history.csv
 - classification_report.txt
 - predictions.csv
 - mindpal_model.h5 / .json / .yaml / mindpal_preprocess.pkl
