In [5]:
import os
import json
import yaml
import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import tensorflow as tf

# ==============================
# Paths
# ==============================
CSV_PATH = r"C:\Users\sagni\Downloads\Mind Pal\archive\data.csv"
OUT_DIR  = r"C:\Users\sagni\Downloads\Mind Pal"
os.makedirs(OUT_DIR, exist_ok=True)

# ==============================
# Load
# ==============================
df = pd.read_csv(CSV_PATH)

# --- Choose target column:
CANDIDATE_TARGETS = ["target", "label", "y", "class", "output"]
target_col = next((c for c in CANDIDATE_TARGETS if c in df.columns), df.columns[-1])

X_df = df.drop(columns=[target_col])
y_raw = df[target_col]

# ==============================
# Column typing (auto-detect)
# ==============================
numeric_cols = X_df.select_dtypes(include=[np.number]).columns.tolist()
obj_cols     = X_df.select_dtypes(include=["object"]).columns.tolist()

# Heuristic: treat as "text" if avg string length > 40 chars; else small categorical
text_cols, cat_cols = [], []
for c in obj_cols:
    s = X_df[c].astype(str)
    (text_cols if s.str.len().mean() > 40 else cat_cols).append(c)

# Datetime columns (if already parsed)
datetime_cols = X_df.select_dtypes(include=["datetime64[ns]"]).columns.tolist()

# Try parsing obvious date-like object columns
for c in list(set(X_df.columns) - set(numeric_cols) - set(datetime_cols)):
    try:
        parsed = pd.to_datetime(X_df[c], errors="raise")
        X_df[c] = parsed
        datetime_cols.append(c)
        if c in cat_cols: cat_cols.remove(c)
        if c in text_cols: text_cols.remove(c)
    except Exception:
        pass

class DateTimeExpand(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.copy()
        outs = []
        for c in self.columns:
            s = pd.to_datetime(X[c], errors="coerce")
            outs.append(pd.DataFrame({
                f"{c}_year":  s.dt.year.fillna(0).astype(int),
                f"{c}_month": s.dt.month.fillna(0).astype(int),
                f"{c}_day":   s.dt.day.fillna(0).astype(int),
                f"{c}_dow":   s.dt.dayofweek.fillna(0).astype(int),
            }))
        return pd.concat(outs, axis=1) if outs else np.empty((len(X), 0))

# ==============================
# Build preprocessors
# ==============================
transformers = []

# Numeric
if numeric_cols:
    num_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    transformers.append(("num", num_pipe, numeric_cols))

# Categorical
if cat_cols:
    cat_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse=True))
    ])
    transformers.append(("cat", cat_pipe, cat_cols))

# ---- FIXED TEXT PIPELINE ----
def make_text_pipe(col_name: str) -> Pipeline:
    # 1) select a single-column DataFrame (2D)
    # 2) impute '' (returns 2D NumPy array)
    # 3) ravel to 1D array of strings (no DataFrame indexing here)
    # 4) TF-IDF on 1D strings
    return Pipeline([
        ("select_df", FunctionTransformer(lambda df: df[[col_name]], validate=False)),
        ("imputer",   SimpleImputer(strategy="constant", fill_value="")),
        ("ravel_1d",  FunctionTransformer(lambda x: x.astype(str).ravel(), validate=False)),
        ("tfidf",     TfidfVectorizer(max_features=5000))
    ])

for c in text_cols:
    transformers.append((f"text_{c}", make_text_pipe(c), [c]))

# Datetime
if datetime_cols:
    dt_pipe = Pipeline([
        ("expand",  DateTimeExpand(datetime_cols)),
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler",  StandardScaler())
    ])
    transformers.append(("dt", dt_pipe, datetime_cols))

# If for some reason no transformers were added, fall back to a passthrough to avoid errors
if not transformers:
    preprocess = "passthrough"
else:
    preprocess = ColumnTransformer(transformers=transformers, sparse_threshold=0.3)

# ==============================
# Train/Val split & Target encoding
# ==============================
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw.astype(str))
n_classes = len(np.unique(y))

X_train_df, X_test_df, y_train, y_test = train_test_split(
    X_df, y, test_size=0.2, random_state=42, stratify=y if n_classes > 1 else None
)

# Fit preprocess and transform
if preprocess == "passthrough":
    X_train = X_train_df.values
    X_test  = X_test_df.values
else:
    X_train = preprocess.fit_transform(X_train_df, y_train)
    X_test  = preprocess.transform(X_test_df)

# Densify heuristically if feature space is modest
if hasattr(X_train, "toarray"):
    densify = X_train.shape[1] <= 50000
else:
    densify = False

if densify and hasattr(X_train, "toarray"):
    X_train_ = X_train.toarray()
    X_test_  = X_test.toarray()
    input_dim = X_train_.shape[1]
    use_sparse = False
else:
    input_dim = X_train.shape[1] if preprocess != "passthrough" else X_train.shape[1]
    use_sparse = hasattr(X_train, "tocsr")

# ==============================
# Build & train Keras model
# ==============================
tf.random.set_seed(42)

def build_model(input_dim, n_classes):
    model = Sequential()
    model.add(Dense(128, activation="relu", input_shape=(input_dim,)))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation="relu"))
    if n_classes <= 2:
        model.add(Dense(1, activation="sigmoid"))
        model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    else:
        model.add(Dense(n_classes, activation="softmax"))
        model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return model

model = build_model(input_dim=input_dim, n_classes=n_classes)

if use_sparse:
    def to_tf_sparse(spmat):
        spmat = spmat.tocoo()
        indices = np.vstack((spmat.row, spmat.col)).T
        return tf.SparseTensor(indices=indices, values=spmat.data, dense_shape=spmat.shape)
    X_train_tf = to_tf_sparse(X_train)
    X_test_tf  = to_tf_sparse(X_test)
    train_X = tf.sparse.to_dense(X_train_tf)
    test_X  = tf.sparse.to_dense(X_test_tf)
else:
    train_X, test_X = (X_train_ if densify else X_train), (X_test_ if densify else X_test)

history = model.fit(
    train_X, y_train,
    validation_data=(test_X, y_test),
    epochs=10,
    batch_size=32,
    verbose=1
)

# ==============================
# Eval
# ==============================
probs = model.predict(test_X, verbose=0)
y_pred = (probs.ravel() >= 0.5).astype(int) if n_classes <= 2 else np.argmax(probs, axis=1)
acc = accuracy_score(y_test, y_pred)
print(f"[INFO] Validation accuracy: {acc:.4f}")

# ==============================
# Save artifacts
# ==============================
# 1) H5
h5_path = os.path.join(OUT_DIR, "mindpal_model.h5")
model.save(h5_path)
print(f"[SAVE] H5 -> {h5_path}")

# 2) JSON config
json_path = os.path.join(OUT_DIR, "mindpal_model.json")
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(model.get_config(), f, indent=2)
print(f"[SAVE] JSON -> {json_path}")

# 3) YAML config
yaml_path = os.path.join(OUT_DIR, "mindpal_model.yaml")
with open(yaml_path, "w", encoding="utf-8") as f:
    yaml.safe_dump(model.get_config(), f, sort_keys=False)
print(f"[SAVE] YAML -> {yaml_path}")

# 4) Preprocessing bundle (PKL)
pkl_path = os.path.join(OUT_DIR, "mindpal_preprocess.pkl")
joblib.dump({
    "preprocess": preprocess,
    "label_encoder": label_encoder,
    "target_col": target_col,
    "numeric_cols": numeric_cols,
    "cat_cols": cat_cols,
    "text_cols": text_cols,
    "datetime_cols": datetime_cols
}, pkl_path)
print(f"[SAVE] PKL -> {pkl_path}")

print("\n=== Done ===")
print(f"Target column: {target_col}")
print(f"Detected -> numeric: {numeric_cols} | categorical: {cat_cols} | text: {text_cols} | datetime: {datetime_cols}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - accuracy: 0.7659 - loss: 0.6245 - val_accuracy: 0.8407 - val_loss: 0.4157
Epoch 2/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8558 - loss: 0.3540 - val_accuracy: 0.8407 - val_loss: 0.3509
Epoch 3/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8561 - loss: 0.2448 - val_accuracy: 0.8475 - val_loss: 0.2847
Epoch 4/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9302 - loss: 0.1473 - val_accuracy: 0.8949 - val_loss: 0.2561
Epoch 5/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9867 - loss: 0.0703 - val_accuracy: 0.8983 - val_loss: 0.2560
Epoch 6/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9957 - loss: 0.0251 - val_accuracy: 0.8983 - val_loss: 0.3193
Epoch 7/10
[1m37/37[0m [32m━━━━━━━━━



[INFO] Validation accuracy: 0.9017
[SAVE] H5 -> C:\Users\sagni\Downloads\Mind Pal\mindpal_model.h5
[SAVE] JSON -> C:\Users\sagni\Downloads\Mind Pal\mindpal_model.json
[SAVE] YAML -> C:\Users\sagni\Downloads\Mind Pal\mindpal_model.yaml


PicklingError: Can't pickle <function make_text_pipe.<locals>.<lambda> at 0x00000198D1F04AE0>: it's not found as __main__.make_text_pipe.<locals>.<lambda>