In [1]:
# === NeuroFit: Train model + save H5/YAML/JSON/PKL (version-safe) ===
# Input CSV : C:\Users\sagni\Downloads\Neuro Fit\archive\human_cognitive_performance.csv
# Outputs   : neurofit_model.h5 / .yaml / .json / preprocess.pkl
#             (and training_history.csv, metrics.txt, plots)

import os
import json
import yaml
import joblib
import numpy as np
import pandas as pd
from scipy import sparse as sp

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, classification_report,
    mean_absolute_error, mean_squared_error
)
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# -----------------------------
# Paths
# -----------------------------
CSV_PATH = r"C:\Users\sagni\Downloads\Neuro Fit\archive\human_cognitive_performance.csv"
OUT_DIR  = r"C:\Users\sagni\Downloads\Neuro Fit"
os.makedirs(OUT_DIR, exist_ok=True)

# -----------------------------
# Helper transformers (picklable; no lambdas)
# -----------------------------
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Select a single column as a 2D DataFrame."""
    def __init__(self, column):
        self.column = column
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[[self.column]]

class To1DString(BaseEstimator, TransformerMixin):
    """Convert a 2D array/DataFrame (n,1) to 1D array[str] for text vectorizers."""
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            arr = X.iloc[:, 0].astype(str).values
        else:
            arr = np.asarray(X).astype(str).ravel()
        return arr

class DateTimeExpand(BaseEstimator, TransformerMixin):
    """Expand datetime columns into year/month/day/dow/hour features."""
    def __init__(self, columns):
        self.columns = columns
        self.out_cols = []
    def fit(self, X, y=None):
        self.out_cols = []
        for c in self.columns:
            self.out_cols += [f"{c}_year", f"{c}_month", f"{c}_day", f"{c}_dow", f"{c}_hour"]
        return self
    def transform(self, X):
        outs = []
        for c in self.columns:
            s = pd.to_datetime(X[c], errors="coerce")
            outs.append(pd.DataFrame({
                f"{c}_year":  s.dt.year.fillna(0).astype(int),
                f"{c}_month": s.dt.month.fillna(0).astype(int),
                f"{c}_day":   s.dt.day.fillna(0).astype(int),
                f"{c}_dow":   s.dt.dayofweek.fillna(0).astype(int),
                f"{c}_hour":  s.dt.hour.fillna(0).astype(int),
            }))
        return pd.concat(outs, axis=1) if outs else np.empty((len(X), 0))

# -----------------------------
# Utils
# -----------------------------
def get_ohe_version_safe():
    """Return OneHotEncoder with the right sparse arg for current scikit-learn."""
    try:
        # sklearn >= 1.4
        return OneHotEncoder(handle_unknown="ignore", sparse_output=True)
    except TypeError:
        # sklearn < 1.4
        return OneHotEncoder(handle_unknown="ignore", sparse=True)

def to_dense_if_reasonable(X, threshold_features=50000):
    """Densify sparse matrix only if feature count is modest to avoid RAM blow-ups."""
    if sp.issparse(X):
        return X.toarray() if X.shape[1] <= threshold_features else X
    return X

def plot_curve(xs, ys, ys_val, title, ylabel, out_path):
    plt.figure(figsize=(8, 5))
    plt.plot(xs, ys, label="Train")
    if ys_val is not None:
        plt.plot(xs, ys_val, label="Validation")
    plt.title(title)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.legend()
    plt.grid(True, linestyle="--", linewidth=0.5)
    plt.tight_layout()
    plt.savefig(out_path, dpi=150)
    plt.close()

def detect_target_column(df):
    """Heuristically detect target column."""
    candidates = [
        "target", "label", "class", "y",
        "cognitive_score", "cognition_score", "neuroscore",
        "reaction_time", "memory_score", "attention_score", "outcome"
    ]
    lower_map = {c.lower(): c for c in df.columns}
    for k in candidates:
        if k in lower_map:
            return lower_map[k]
    return df.columns[-1]

def detect_task_type(y_series):
    """
    Decide classification vs regression:
    - If numeric with many unique values -> regression
    - If numeric but few uniques (<= 10) -> classification
    - If non-numeric -> classification
    """
    if pd.api.types.is_numeric_dtype(y_series):
        nunique = y_series.nunique(dropna=True)
        return "classification" if nunique <= 10 else "regression"
    return "classification"

# -----------------------------
# Load data
# -----------------------------
df = pd.read_csv(CSV_PATH)

# Try to coerce obvious datetime-like columns (by name)
for c in df.columns:
    lc = c.lower()
    if any(k in lc for k in ["date", "time", "timestamp", "datetime"]):
        try:
            df[c] = pd.to_datetime(df[c], errors="ignore")
        except Exception:
            pass

target_col = detect_target_column(df)
X_df = df.drop(columns=[target_col])
y_raw = df[target_col]

task_type = detect_task_type(y_raw)
print(f"[INFO] Target: {target_col} | Task: {task_type}")

# -----------------------------
# Column typing (auto-detect)
# -----------------------------
numeric_cols   = X_df.select_dtypes(include=[np.number]).columns.tolist()
obj_cols       = X_df.select_dtypes(include=["object"]).columns.tolist()
datetime_cols  = X_df.select_dtypes(include=["datetime64[ns]"]).columns.tolist()

# Attempt to parse object columns that look like timestamps
for c in list(set(X_df.columns) - set(numeric_cols) - set(datetime_cols)):
    try:
        parsed = pd.to_datetime(X_df[c], errors="raise")
        if parsed.notna().mean() > 0.7:
            X_df[c] = parsed
            datetime_cols.append(c)
            if c in obj_cols:
                obj_cols.remove(c)
    except Exception:
        pass

# Heuristic: long strings => text, short strings => categorical
text_cols, cat_cols = [], []
for c in obj_cols:
    s = X_df[c].astype(str)
    (text_cols if s.str.len().mean() > 40 else cat_cols).append(c)

# -----------------------------
# Build preprocessing (picklable; version-safe OHE; no lambdas)
# -----------------------------
transformers = []

# Numeric
if numeric_cols:
    num_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    transformers.append(("num", num_pipe, numeric_cols))

# Categorical
if cat_cols:
    ohe = get_ohe_version_safe()
    cat_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot",  ohe)
    ])
    transformers.append(("cat", cat_pipe, cat_cols))

# Text (TF-IDF per text column)
for c in text_cols:
    txt_pipe = Pipeline([
        ("select",  ColumnSelector(c)),
        ("impute",  SimpleImputer(strategy="constant", fill_value="")),
        ("to1d",    To1DString()),
        ("tfidf",   TfidfVectorizer(max_features=8000))
    ])
    transformers.append((f"text_{c}", txt_pipe, [c]))

# Datetime expansion
if datetime_cols:
    dt_pipe = Pipeline([
        ("expand",  DateTimeExpand(datetime_cols)),
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler",  StandardScaler())
    ])
    transformers.append(("dt", dt_pipe, datetime_cols))

preprocess = ColumnTransformer(transformers=transformers, sparse_threshold=0.3) if transformers else "passthrough"

# -----------------------------
# Encode/prepare target + split
# -----------------------------
label_encoder = None
if task_type == "classification":
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y_raw.astype(str))
else:
    # regression: ensure float
    y = pd.to_numeric(y_raw, errors="coerce").astype(float)
    # drop rows with nan target
    keep = ~np.isnan(y)
    X_df = X_df.loc[keep].reset_index(drop=True)
    y = y[keep]

X_train_df, X_test_df, y_train, y_test = train_test_split(
    X_df, y, test_size=0.2, random_state=42,
    stratify=y if (task_type == "classification") else None
)

# Fit & transform
if preprocess == "passthrough":
    X_train = X_train_df.values
    X_test  = X_test_df.values
else:
    X_train = preprocess.fit_transform(X_train_df, y_train)
    X_test  = preprocess.transform(X_test_df)

# Densify only if reasonable
X_train = to_dense_if_reasonable(X_train)
X_test  = to_dense_if_reasonable(X_test)
input_dim = X_train.shape[1]

# -----------------------------
# Build & train Keras model
# -----------------------------
tf.random.set_seed(42)

def build_model(input_dim, task_type, n_classes=None):
    m = Sequential()
    m.add(Dense(256, activation="relu", input_shape=(input_dim,)))
    m.add(Dropout(0.3))
    m.add(Dense(128, activation="relu"))
    if task_type == "classification":
        if n_classes is None or n_classes <= 2:
            m.add(Dense(1, activation="sigmoid"))
            m.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
        else:
            m.add(Dense(n_classes, activation="softmax"))
            m.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    else:
        # regression
        m.add(Dense(1, activation="linear"))
        m.compile(optimizer="adam", loss="mse", metrics=["mae"])  # we'll compute RMSE manually
    return m

n_classes = None
if task_type == "classification":
    n_classes = len(np.unique(y_train))

model = build_model(input_dim=input_dim, task_type=task_type, n_classes=n_classes)

history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=12,          # tweak as needed
    batch_size=256,     # adjust depending on RAM/VRAM
    verbose=1
)

# -----------------------------
# Save training history + plots
# -----------------------------
hist_df = pd.DataFrame(history.history)
hist_df.index = np.arange(1, len(hist_df) + 1)
hist_csv = os.path.join(OUT_DIR, "training_history.csv")
hist_df.to_csv(hist_csv, index_label="epoch")

if task_type == "classification":
    acc_key = "accuracy" if "accuracy" in hist_df.columns else ("acc" if "acc" in hist_df.columns else None)
    val_acc_key = "val_accuracy" if "val_accuracy" in hist_df.columns else ("val_acc" if "val_acc" in hist_df.columns else None)
    if acc_key:
        plot_curve(hist_df.index, hist_df[acc_key],
                   hist_df[val_acc_key] if val_acc_key in hist_df.columns else None,
                   "Model Accuracy", "Accuracy",
                   os.path.join(OUT_DIR, "accuracy.png"))
    plot_curve(hist_df.index, hist_df["loss"],
               hist_df["val_loss"] if "val_loss" in hist_df.columns else None,
               "Model Loss", "Loss",
               os.path.join(OUT_DIR, "loss.png"))
else:
    # regression curves: MAE (exists) and computed RMSE (from loss if loss==mse)
    mae_tr = hist_df["mae"] if "mae" in hist_df.columns else None
    mae_va = hist_df["val_mae"] if "val_mae" in hist_df.columns else None
    if mae_tr is not None:
        plot_curve(hist_df.index, mae_tr, mae_va, "Model MAE", "MAE",
                   os.path.join(OUT_DIR, "mae.png"))
    # RMSE from loss (assuming MSE)
    rmse_tr = np.sqrt(hist_df["loss"]) if "loss" in hist_df.columns else None
    rmse_va = np.sqrt(hist_df["val_loss"]) if "val_loss" in hist_df.columns else None
    if rmse_tr is not None:
        plot_curve(hist_df.index, rmse_tr, rmse_va, "Model RMSE", "RMSE",
                   os.path.join(OUT_DIR, "rmse.png"))

# -----------------------------
# Save artifacts
# -----------------------------
# 1) H5
h5_path = os.path.join(OUT_DIR, "neurofit_model.h5")
model.save(h5_path)
print(f"[SAVE] H5 -> {h5_path}")

# 2) JSON config
json_path = os.path.join(OUT_DIR, "neurofit_model.json")
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(model.get_config(), f, indent=2)
print(f"[SAVE] JSON -> {json_path}")

# 3) YAML config
yaml_path = os.path.join(OUT_DIR, "neurofit_model.yaml")
with open(yaml_path, "w", encoding="utf-8") as f:
    yaml.safe_dump(model.get_config(), f, sort_keys=False)
print(f"[SAVE] YAML -> {yaml_path}")

# 4) Preprocess bundle PKL
pkl_path = os.path.join(OUT_DIR, "neurofit_preprocess.pkl")
bundle = {
    "preprocess": preprocess,
    "task_type": task_type,
    "target_col": target_col,
    "numeric_cols": numeric_cols,
    "cat_cols": cat_cols,
    "text_cols": text_cols,
    "datetime_cols": datetime_cols,
}
if task_type == "classification":
    bundle["label_encoder"] = label_encoder
joblib.dump(bundle, pkl_path, compress=3, protocol=4)
print(f"[SAVE] PKL -> {pkl_path}")

# -----------------------------
# Final evaluation + metrics
# -----------------------------
metrics_path = os.path.join(OUT_DIR, "metrics.txt")
with open(metrics_path, "w", encoding="utf-8") as f:
    if task_type == "classification":
        y_prob = model.predict(X_test, verbose=0)
        if n_classes is None or n_classes <= 2 or (y_prob.ndim == 1 or y_prob.shape[1] == 1):
            y_pred = (y_prob.ravel() >= 0.5).astype(int)
        else:
            y_pred = np.argmax(y_prob, axis=1)
        acc = accuracy_score(y_test, y_pred)
        rep = classification_report(y_test, y_pred)
        f.write(f"Validation accuracy: {acc:.4f}\n\n")
        f.write(rep)
        print(f"Validation accuracy: {acc:.4f}")
    else:
        y_pred = model.predict(X_test, verbose=0).ravel()
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        f.write(f"Validation MAE : {mae:.4f}\n")
        f.write(f"Validation RMSE: {rmse:.4f}\n")
        print(f"Validation MAE : {mae:.4f}")
        print(f"Validation RMSE: {rmse:.4f}")

print("\n=== Training complete ===")
print(f"Target column: {target_col}")
print(f"Detected -> numeric: {numeric_cols} | categorical: {cat_cols} | text: {text_cols} | datetime: {datetime_cols}")
print(f"Artifacts saved in: {OUT_DIR}")


[INFO] Target: Cognitive_Score | Task: regression


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/12
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 127ms/step - loss: 1750.8784 - mae: 30.8199 - val_loss: 8.1655 - val_mae: 2.4398
Epoch 2/12
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 123ms/step - loss: 16.4247 - mae: 3.2198 - val_loss: 8.1723 - val_mae: 2.4466
Epoch 3/12
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 126ms/step - loss: 13.1723 - mae: 2.8104 - val_loss: 8.2744 - val_mae: 2.4560
Epoch 4/12
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 125ms/step - loss: 11.8558 - mae: 2.6142 - val_loss: 8.0775 - val_mae: 2.4260
Epoch 5/12
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 126ms/step - loss: 11.5063 - mae: 2.5644 - val_loss: 8.1285 - val_mae: 2.4392
Epoch 6/12
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 123ms/step - loss: 11.1244 - mae: 2.5388 - val_loss: 8.8075 - val_mae: 2.5159
Epoch 7/12
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m



[SAVE] H5 -> C:\Users\sagni\Downloads\Neuro Fit\neurofit_model.h5
[SAVE] JSON -> C:\Users\sagni\Downloads\Neuro Fit\neurofit_model.json
[SAVE] YAML -> C:\Users\sagni\Downloads\Neuro Fit\neurofit_model.yaml
[SAVE] PKL -> C:\Users\sagni\Downloads\Neuro Fit\neurofit_preprocess.pkl
Validation MAE : 2.9323
Validation RMSE: 3.5946

=== Training complete ===
Target column: Cognitive_Score
Detected -> numeric: ['Age', 'Sleep_Duration', 'Stress_Level', 'Caffeine_Intake', 'Memory_Test_Score', 'AI_Predicted_Score'] | categorical: ['User_ID', 'Gender', 'Diet_Type', 'Exercise_Frequency'] | text: [] | datetime: ['Daily_Screen_Time', 'Reaction_Time']
Artifacts saved in: C:\Users\sagni\Downloads\Neuro Fit
