In [1]:
import os
import json
import yaml
import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import tensorflow as tf
from scipy import sparse as sp

# ==============================
# Paths
# ==============================
# Folder that receives the saved artifacts; created up front when missing.
OUT_DIR = r"C:\Users\sagni\Downloads\Mind Pal"
# Input dataset, read with pandas further down in this script.
CSV_PATH = r"C:\Users\sagni\Downloads\Mind Pal\archive\data.csv"
os.makedirs(OUT_DIR, exist_ok=True)

# ==============================
# Helper Transformers (picklable)
# ==============================
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps exactly one named column.

    The result is obtained by indexing the input with a one-element list of
    labels, so for a DataFrame the output is still two-dimensional, shape (n, 1).
    """

    def __init__(self, column):
        # Label of the column to keep.
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to the configured column."""
        wanted = [self.column]
        return X[wanted]

class To1DString(BaseEstimator, TransformerMixin):
    """Flatten a single-column 2D input into a 1D array of strings.

    Text vectorizers expect a flat sequence of documents, not an (n, 1) table.
    """

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the string-cast values of ``X`` as a 1D array.

        For a DataFrame only its first column is used; any other input is
        converted with ``np.asarray``, cast to ``str`` and flattened.
        """
        if not isinstance(X, pd.DataFrame):
            return np.asarray(X).astype(str).ravel()
        first_column = X.iloc[:, 0]
        return first_column.astype(str).values

class DateTimeExpand(BaseEstimator, TransformerMixin):
    """Replace datetime columns with integer calendar features.

    Every column ``c`` listed in ``columns`` yields four output columns:
    ``c_year``, ``c_month``, ``c_day`` and ``c_dow`` (day of week). Values that
    cannot be parsed as datetimes are coerced to NaT and come out as 0 in all
    four parts.
    """

    def __init__(self, columns):
        self.columns = columns
        self.out_cols = []

    def fit(self, X, y=None):
        """Record the generated column names in ``out_cols``; X is not inspected."""
        suffixes = ("year", "month", "day", "dow")
        self.out_cols = [f"{c}_{tag}" for c in self.columns for tag in suffixes]
        return self

    def transform(self, X):
        """Return the expanded features for ``X``.

        The result is a DataFrame with four columns per configured column, or
        an empty ``(len(X), 0)`` array when no columns are configured.
        """
        frames = []
        for c in self.columns:
            stamps = pd.to_datetime(X[c], errors="coerce").dt
            parts = {
                f"{c}_year": stamps.year,
                f"{c}_month": stamps.month,
                f"{c}_day": stamps.day,
                f"{c}_dow": stamps.dayofweek,
            }
            # NaT rows produce NaN in each part; map them to 0 before the int cast.
            frames.append(pd.DataFrame(
                {name: values.fillna(0).astype(int) for name, values in parts.items()}
            ))
        if frames:
            return pd.concat(frames, axis=1)
        return np.empty((len(X), 0))

def to_tf_sparse(spmat):
    """Convert a SciPy sparse matrix into a ``tf.SparseTensor``.

    Args:
        spmat: any SciPy sparse matrix; it is converted to COO format first.

    Returns:
        A ``tf.SparseTensor`` with the same shape and values, whose indices
        are in canonical row-major order.
    """
    coo = spmat.tocoo()
    # tf.SparseTensor stores indices as int64, so cast explicitly rather than
    # relying on an implicit conversion of SciPy's index arrays.
    idx = np.stack((coo.row, coo.col), axis=1).astype(np.int64)
    tensor = tf.SparseTensor(indices=idx, values=coo.data, dense_shape=coo.shape)
    # COO format does not guarantee any ordering of its entries, while ops such
    # as tf.sparse.to_dense validate that indices are sorted row-major.
    return tf.sparse.reorder(tensor)

# ==============================
# Load
# ==============================
df = pd.read_csv(CSV_PATH)

# --- Choose target column: the first well-known name that is present,
# --- otherwise fall back to the last column of the file.
CANDIDATE_TARGETS = ["target", "label", "y", "class", "output"]
known_hits = [name for name in CANDIDATE_TARGETS if name in df.columns]
target_col = known_hits[0] if known_hits else df.columns[-1]

# Split the frame into the label series and the feature frame.
y_raw = df[target_col]
X_df = df.drop(columns=[target_col])

# ==============================
# Column typing (auto-detect)
# ==============================
numeric_cols = X_df.select_dtypes(include=[np.number]).columns.tolist()
obj_cols     = X_df.select_dtypes(include=["object"]).columns.tolist()
datetime_cols = X_df.select_dtypes(include=["datetime64[ns]"]).columns.tolist()

# Try parsing date-like columns. Candidates are walked in DataFrame column
# order rather than through a set: set iteration order for strings can differ
# between interpreter runs, which would reorder datetime_cols and with it the
# feature layout of the fitted preprocessor.
already_typed = set(numeric_cols) | set(datetime_cols)
date_candidates = [col for col in X_df.columns if col not in already_typed]
for c in date_candidates:
    try:
        parsed = pd.to_datetime(X_df[c], errors="raise")
    except (ValueError, TypeError, OverflowError):
        # Not date-like: leave the column to the text/categorical heuristic.
        continue
    X_df[c] = parsed
    datetime_cols.append(c)
    if c in obj_cols:
        obj_cols.remove(c)

# Heuristic: long strings -> text, short strings -> categorical
# (threshold: mean string length above 40 characters).
text_cols, cat_cols = [], []
for c in obj_cols:
    s = X_df[c].astype(str)
    if s.str.len().mean() > 40:
        text_cols.append(c)
    else:
        cat_cols.append(c)

# ==============================
# Build preprocessors (no lambdas)
# ==============================
# Each entry is (name, transformer, columns) for the ColumnTransformer below;
# a branch is only added when the matching column list is non-empty.
transformers = []

# Numeric: median-impute, then standardize.
if numeric_cols:
    num_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    transformers.append(("num", num_pipe, numeric_cols))

# Categorical: fill with the most frequent value, then one-hot encode.
if cat_cols:
    cat_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # Sparse output is the encoder's default. The old `sparse=` keyword was
        # renamed to `sparse_output` and later removed, so passing it raises a
        # TypeError on current scikit-learn; relying on the default works on
        # every version.
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])
    transformers.append(("cat", cat_pipe, cat_cols))

# Text: one TF-IDF per text column
for c in text_cols:
    txt_pipe = Pipeline([
        ("select",   ColumnSelector(c)),
        ("imputer",  SimpleImputer(strategy="constant", fill_value="")),
        ("to1d",     To1DString()),
        ("tfidf",    TfidfVectorizer(max_features=5000))
    ])
    transformers.append((f"text_{c}", txt_pipe, [c]))

# Datetime expansion: calendar parts, then impute and standardize.
if datetime_cols:
    dt_pipe = Pipeline([
        ("expand",  DateTimeExpand(datetime_cols)),
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler",  StandardScaler())
    ])
    transformers.append(("dt", dt_pipe, datetime_cols))

preprocess = ColumnTransformer(transformers=transformers, sparse_threshold=0.3)

# ==============================
# Train/Val split & Target encoding
# ==============================
# Labels are cast to str first so mixed-type targets encode consistently.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw.astype(str))
n_classes = len(np.unique(y))

# A stratified split raises when any class has fewer than 2 samples, so only
# stratify when every class can be represented on both sides of the split.
can_stratify = n_classes > 1 and np.bincount(y).min() >= 2

X_train_df, X_test_df, y_train, y_test = train_test_split(
    X_df, y, test_size=0.2, random_state=42, stratify=y if can_stratify else None
)

# Fit the preprocessor on the training rows only, then apply it to both splits.
X_train = preprocess.fit_transform(X_train_df, y_train)
X_test  = preprocess.transform(X_test_df)

# Densify if the matrix is sparse and the feature count is modest.
densify = hasattr(X_train, "toarray") and (X_train.shape[1] <= 50000)
if densify:
    X_train_ = X_train.toarray()
    X_test_  = X_test.toarray()
    input_dim = X_train_.shape[1]
    use_sparse = False
else:
    input_dim = X_train.shape[1]
    # Still sparse (too wide to densify here) vs. already a dense array.
    use_sparse = hasattr(X_train, "tocsr")
# ==============================
# Build & train Keras model
# ==============================
# Seed Python's `random`, NumPy and TensorFlow in one call; seeding only
# TensorFlow leaves the other two generators unseeded.
tf.keras.utils.set_random_seed(42)

def build_model(input_dim, n_classes):
    """Build and compile a small feed-forward classifier.

    Architecture: Input -> Dense(128, relu) -> Dropout(0.3) -> Dense(64, relu)
    -> output head.

    Args:
        input_dim: number of input features per sample.
        n_classes: number of distinct target classes. With two or fewer the
            head is a single sigmoid unit trained with binary cross-entropy;
            otherwise it is an ``n_classes``-way softmax trained with sparse
            categorical cross-entropy (integer labels).

    Returns:
        The compiled Keras ``Sequential`` model (Adam optimizer, accuracy metric).
    """
    m = Sequential()
    # Declare the input with an explicit Input layer instead of passing
    # `input_shape=` to the first Dense layer, which Keras warns against.
    m.add(tf.keras.Input(shape=(input_dim,)))
    m.add(Dense(128, activation="relu"))
    m.add(Dropout(0.3))
    m.add(Dense(64, activation="relu"))

    if n_classes <= 2:
        m.add(Dense(1, activation="sigmoid"))
        loss = "binary_crossentropy"
    else:
        m.add(Dense(n_classes, activation="softmax"))
        loss = "sparse_categorical_crossentropy"

    m.compile(optimizer="adam", loss=loss, metrics=["accuracy"])
    return m

model = build_model(input_dim=input_dim, n_classes=n_classes)

# Pick the arrays handed to Keras: still-sparse matrices go through a
# SparseTensor and are densified there; otherwise use the arrays as they are.
if use_sparse:
    train_X = tf.sparse.to_dense(to_tf_sparse(X_train))
    test_X  = tf.sparse.to_dense(to_tf_sparse(X_test))
elif densify:
    train_X, test_X = X_train_, X_test_
else:
    train_X, test_X = X_train, X_test

# The held-out split doubles as the validation set during training.
fit_options = dict(
    validation_data=(test_X, y_test),
    epochs=10,
    batch_size=32,
    verbose=1,
)
history = model.fit(train_X, y_train, **fit_options)

# ==============================
# Eval
# ==============================
probs = model.predict(test_X, verbose=0)
if n_classes <= 2:
    # Single sigmoid output: threshold the probability at 0.5.
    y_pred = (probs.ravel() >= 0.5).astype(int)
else:
    # Softmax output: take the most probable class per row.
    y_pred = np.argmax(probs, axis=1)
acc = accuracy_score(y_test, y_pred)
print(f"[INFO] Validation accuracy: {acc:.4f}")

# ==============================
# Save artifacts
# ==============================
# 1) Full model in H5 format
h5_path = os.path.join(OUT_DIR, "mindpal_model.h5")
model.save(h5_path)
print(f"[SAVE] H5 -> {h5_path}")

# The architecture config is written twice below (JSON and YAML).
architecture = model.get_config()

# 2) Architecture config as JSON
json_path = os.path.join(OUT_DIR, "mindpal_model.json")
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(architecture, f, indent=2)
print(f"[SAVE] JSON -> {json_path}")

# 3) Architecture config as YAML
yaml_path = os.path.join(OUT_DIR, "mindpal_model.yaml")
with open(yaml_path, "w", encoding="utf-8") as f:
    yaml.safe_dump(architecture, f, sort_keys=False)
print(f"[SAVE] YAML -> {yaml_path}")

# 4) Preprocessing bundle (PKL): fitted preprocessor, label encoder and the
#    detected column lists.
pkl_path = os.path.join(OUT_DIR, "mindpal_preprocess.pkl")
bundle = {
    "preprocess": preprocess,
    "label_encoder": label_encoder,
    "target_col": target_col,
    "numeric_cols": numeric_cols,
    "cat_cols": cat_cols,
    "text_cols": text_cols,
    "datetime_cols": datetime_cols,
}
joblib.dump(bundle, pkl_path, compress=3, protocol=4)
print(f"[SAVE] PKL -> {pkl_path}")

# Final run summary.
print("\n=== Done ===")
print(f"Target column: {target_col}")
print(f"Detected -> numeric: {numeric_cols} | categorical: {cat_cols} | text: {text_cols} | datetime: {datetime_cols}")


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.8157 - loss: 0.6055 - val_accuracy: 0.8407 - val_loss: 0.4148
Epoch 2/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8558 - loss: 0.3379 - val_accuracy: 0.8407 - val_loss: 0.3388
Epoch 3/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8587 - loss: 0.2220 - val_accuracy: 0.8610 - val_loss: 0.2738
Epoch 4/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9509 - loss: 0.1317 - val_accuracy: 0.9017 - val_loss: 0.2467
Epoch 5/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9968 - loss: 0.0549 - val_accuracy: 0.9051 - val_loss: 0.2662
Epoch 6/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9972 - loss: 0.0178 - val_accuracy: 0.9017 - val_loss: 0.3086
Epoch 7/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━



[INFO] Validation accuracy: 0.9085
[SAVE] H5 -> C:\Users\sagni\Downloads\Mind Pal\mindpal_model.h5
[SAVE] JSON -> C:\Users\sagni\Downloads\Mind Pal\mindpal_model.json
[SAVE] YAML -> C:\Users\sagni\Downloads\Mind Pal\mindpal_model.yaml
[SAVE] PKL -> C:\Users\sagni\Downloads\Mind Pal\mindpal_preprocess.pkl

=== Done ===
Target column: Answer.t1.work.raw
Detected -> numeric: [] | categorical: [] | text: ['Answer'] | datetime: []
