In [1]:
import os
import json
import yaml
import joblib
import numpy as np
import pandas as pd
from scipy import sparse as sp

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# -----------------------------
# Paths
# -----------------------------
CSV1 = r"C:\Users\sagni\Downloads\Mind Breath\archive\Stress_Dataset.csv"
CSV2 = r"C:\Users\sagni\Downloads\Mind Breath\archive\StressLevelDataset.csv"
OUT_DIR = r"C:\Users\sagni\Downloads\Mind Breath"
os.makedirs(OUT_DIR, exist_ok=True)

# -----------------------------
# Helper transformers (picklable; no lambdas)
# -----------------------------
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Select a single column as a 2D DataFrame."""
    def __init__(self, column):
        self.column = column
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[[self.column]]

class To1DString(BaseEstimator, TransformerMixin):
    """Convert a 2D array/DataFrame (n,1) to 1D array[str] for text vectorizers."""
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            arr = X.iloc[:, 0].astype(str).values
        else:
            arr = np.asarray(X).astype(str).ravel()
        return arr

class DateTimeExpand(BaseEstimator, TransformerMixin):
    """Expand datetime columns into year/month/day/dow/hour features."""
    def __init__(self, columns):
        self.columns = columns
        self.out_cols = []
    def fit(self, X, y=None):
        self.out_cols = []
        for c in self.columns:
            self.out_cols += [f"{c}_year", f"{c}_month", f"{c}_day", f"{c}_dow", f"{c}_hour"]
        return self
    def transform(self, X):
        outs = []
        for c in self.columns:
            s = pd.to_datetime(X[c], errors="coerce")
            outs.append(pd.DataFrame({
                f"{c}_year":  s.dt.year.fillna(0).astype(int),
                f"{c}_month": s.dt.month.fillna(0).astype(int),
                f"{c}_day":   s.dt.day.fillna(0).astype(int),
                f"{c}_dow":   s.dt.dayofweek.fillna(0).astype(int),
                f"{c}_hour":  s.dt.hour.fillna(0).astype(int),
            }))
        return pd.concat(outs, axis=1) if outs else np.empty((len(X), 0))

# -----------------------------
# Utils
# -----------------------------
def get_ohe_version_safe():
    """Return OneHotEncoder with the right sparse arg for current scikit-learn."""
    try:
        # sklearn >= 1.4
        return OneHotEncoder(handle_unknown="ignore", sparse_output=True)
    except TypeError:
        # sklearn < 1.4
        return OneHotEncoder(handle_unknown="ignore", sparse=True)

def to_dense_if_reasonable(X, max_feats=50000):
    """Densify sparse matrix only if feature count is modest to avoid RAM blow-ups."""
    if sp.issparse(X):
        return X.toarray() if X.shape[1] <= max_feats else X
    return X

def plot_curve(hist_df, y_series, y_val_series, title, ylabel, out_path):
    plt.figure(figsize=(8, 5))
    plt.plot(hist_df.index, y_series, label="Train")
    if y_val_series is not None:
        plt.plot(hist_df.index, y_val_series, label="Validation")
    plt.title(title)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.legend()
    plt.grid(True, linestyle="--", linewidth=0.5)
    plt.tight_layout()
    plt.savefig(out_path, dpi=150)
    plt.close()

def load_and_align(csv_paths):
    """Load multiple CSVs, align columns by union, and concatenate."""
    dfs = []
    all_cols = set()
    for p in csv_paths:
        d = pd.read_csv(p)
        dfs.append(d)
        all_cols |= set(d.columns)
    # Ensure consistent column order (sorted for stability)
    all_cols = list(sorted(all_cols))
    aligned = []
    for d in dfs:
        missing = [c for c in all_cols if c not in d.columns]
        if missing:
            for m in missing:
                d[m] = np.nan
        aligned.append(d[all_cols])
    return pd.concat(aligned, axis=0, ignore_index=True)

def detect_target_column(df):
    """Heuristically detect a stress label column."""
    candidates = [
        "stress", "stress_level", "stresslevel", "label", "class", "target", "y",
        "stress category", "stress_category", "stress level"
    ]
    lower_map = {c.lower(): c for c in df.columns}
    for key in candidates:
        if key in lower_map:
            return lower_map[key]
    return df.columns[-1]  # fallback

# -----------------------------
# Load & combine data
# -----------------------------
df = load_and_align([CSV1, CSV2])

# Try to coerce obvious datetime-like columns (by name)
for c in df.columns:
    lc = c.lower()
    if any(k in lc for k in ["date", "time", "timestamp", "datetime"]):
        try:
            df[c] = pd.to_datetime(df[c], errors="ignore")
        except Exception:
            pass

target_col = detect_target_column(df)
X_df = df.drop(columns=[target_col])
y_raw = df[target_col]

# -----------------------------
# Column typing (auto-detect)
# -----------------------------
numeric_cols   = X_df.select_dtypes(include=[np.number]).columns.tolist()
obj_cols       = X_df.select_dtypes(include=["object"]).columns.tolist()
datetime_cols  = X_df.select_dtypes(include=["datetime64[ns]"]).columns.tolist()

# Attempt to parse object columns that look like timestamps
for c in list(set(X_df.columns) - set(numeric_cols) - set(datetime_cols)):
    try:
        parsed = pd.to_datetime(X_df[c], errors="raise")
        ok_ratio = parsed.notna().mean()
        if ok_ratio > 0.7:
            X_df[c] = parsed
            datetime_cols.append(c)
            if c in obj_cols:
                obj_cols.remove(c)
    except Exception:
        pass

# Heuristic: long strings => text, short strings => categorical
text_cols, cat_cols = [], []
for c in obj_cols:
    s = X_df[c].astype(str)
    (text_cols if s.str.len().mean() > 40 else cat_cols).append(c)

# -----------------------------
# Build preprocessing (picklable; version-safe OHE; no lambdas)
# -----------------------------
transformers = []

# Numeric
if numeric_cols:
    num_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    transformers.append(("num", num_pipe, numeric_cols))

# Categorical (with version-safe OHE)
if cat_cols:
    ohe = get_ohe_version_safe()
    cat_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot",  ohe)
    ])
    transformers.append(("cat", cat_pipe, cat_cols))

# Text (TF-IDF per text column)
for c in text_cols:
    txt_pipe = Pipeline([
        ("select",  ColumnSelector(c)),
        ("impute",  SimpleImputer(strategy="constant", fill_value="")),
        ("to1d",    To1DString()),
        ("tfidf",   TfidfVectorizer(max_features=8000))
    ])
    transformers.append((f"text_{c}", txt_pipe, [c]))

# Datetime expansion
if datetime_cols:
    dt_pipe = Pipeline([
        ("expand",  DateTimeExpand(datetime_cols)),
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler",  StandardScaler())
    ])
    transformers.append(("dt", dt_pipe, datetime_cols))

preprocess = ColumnTransformer(transformers=transformers, sparse_threshold=0.3) if transformers else "passthrough"

# -----------------------------
# Encode target + split
# -----------------------------
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw.astype(str))
n_classes = len(np.unique(y))

X_train_df, X_test_df, y_train, y_test = train_test_split(
    X_df, y, test_size=0.2, random_state=42, stratify=y if n_classes > 1 else None
)

# Fit & transform
if preprocess == "passthrough":
    X_train = X_train_df.values
    X_test  = X_test_df.values
else:
    X_train = preprocess.fit_transform(X_train_df, y_train)
    X_test  = preprocess.transform(X_test_df)

# Densify only if reasonable
X_train = to_dense_if_reasonable(X_train)
X_test  = to_dense_if_reasonable(X_test)
input_dim = X_train.shape[1]

# -----------------------------
# Build & train Keras model
# -----------------------------
tf.random.set_seed(42)

def build_model(input_dim, n_classes):
    m = Sequential()
    m.add(Dense(256, activation="relu", input_shape=(input_dim,)))
    m.add(Dropout(0.35))
    m.add(Dense(128, activation="relu"))
    m.add(Dropout(0.25))
    if n_classes <= 2:
        m.add(Dense(1, activation="sigmoid"))
        m.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    else:
        m.add(Dense(n_classes, activation="softmax"))
        m.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return m

model = build_model(input_dim=input_dim, n_classes=n_classes)

history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=12,          # adjust if needed
    batch_size=256,     # tune to your RAM/VRAM
    verbose=1
)

# -----------------------------
# Save training curves
# -----------------------------
hist_df = pd.DataFrame(history.history)
hist_df.index = np.arange(1, len(hist_df) + 1)
hist_csv = os.path.join(OUT_DIR, "training_history.csv")
hist_df.to_csv(hist_csv, index_label="epoch")

acc_key = "accuracy" if "accuracy" in hist_df.columns else ("acc" if "acc" in hist_df.columns else None)
val_acc_key = "val_accuracy" if "val_accuracy" in hist_df.columns else ("val_acc" if "val_acc" in hist_df.columns else None)

if acc_key:
    plot_curve(hist_df,
               hist_df[acc_key],
               hist_df[val_acc_key] if val_acc_key in hist_df.columns else None,
               "Model Accuracy", "Accuracy",
               os.path.join(OUT_DIR, "accuracy.png"))

plot_curve(hist_df,
           hist_df["loss"],
           hist_df["val_loss"] if "val_loss" in hist_df.columns else None,
           "Model Loss", "Loss",
           os.path.join(OUT_DIR, "loss.png"))

# -----------------------------
# Save artifacts
# -----------------------------
# 1) Keras H5 (weights+graph)
h5_path = os.path.join(OUT_DIR, "mindbreath_model.h5")
model.save(h5_path)
print(f"[SAVE] H5 -> {h5_path}")

# 2) Model config JSON
json_path = os.path.join(OUT_DIR, "mindbreath_model.json")
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(model.get_config(), f, indent=2)
print(f"[SAVE] JSON -> {json_path}")

# 3) Model config YAML
yaml_path = os.path.join(OUT_DIR, "mindbreath_model.yaml")
with open(yaml_path, "w", encoding="utf-8") as f:
    yaml.safe_dump(model.get_config(), f, sort_keys=False)
print(f"[SAVE] YAML -> {yaml_path}")

# 4) Preprocess bundle PKL (picklable)
pkl_path = os.path.join(OUT_DIR, "mindbreath_preprocess.pkl")
joblib.dump({
    "preprocess": preprocess,
    "label_encoder": label_encoder,
    "target_col": target_col,
    "numeric_cols": numeric_cols,
    "cat_cols": cat_cols,
    "text_cols": text_cols,
    "datetime_cols": datetime_cols
}, pkl_path, compress=3, protocol=4)
print(f"[SAVE] PKL -> {pkl_path}")

# -----------------------------
# Final log
# -----------------------------
y_pred_prob = model.predict(X_test, verbose=0)
if n_classes <= 2:
    y_pred = (y_pred_prob.ravel() >= 0.5).astype(int)
else:
    y_pred = np.argmax(y_pred_prob, axis=1)

acc = accuracy_score(y_test, y_pred)
print("\n=== Training complete ===")
print(f"Target column: {target_col}")
print(f"Detected -> numeric: {numeric_cols} | categorical: {cat_cols} | text: {text_cols} | datetime: {datetime_cols}")
print(f"Validation accuracy: {acc:.4f}")
print(f"Artifacts saved in: {OUT_DIR}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/12
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 84ms/step - accuracy: 0.5426 - loss: 1.1018 - val_accuracy: 0.9023 - val_loss: 0.4550
Epoch 2/12
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.8897 - loss: 0.4379 - val_accuracy: 0.9049 - val_loss: 0.2742
Epoch 3/12
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.9294 - loss: 0.2564 - val_accuracy: 0.9100 - val_loss: 0.2186
Epoch 4/12
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.9345 - loss: 0.1952 - val_accuracy: 0.9100 - val_loss: 0.2024
Epoch 5/12
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.9374 - loss: 0.1733 - val_accuracy: 0.9177 - val_loss: 0.1879
Epoch 6/12
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.9391 - loss: 0.1580 - val_accuracy: 0.9229 - val_loss: 0.1769
Epoch 7/12
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━



[SAVE] H5 -> C:\Users\sagni\Downloads\Mind Breath\mindbreath_model.h5
[SAVE] JSON -> C:\Users\sagni\Downloads\Mind Breath\mindbreath_model.json
[SAVE] YAML -> C:\Users\sagni\Downloads\Mind Breath\mindbreath_model.yaml
[SAVE] PKL -> C:\Users\sagni\Downloads\Mind Breath\mindbreath_preprocess.pkl

=== Training complete ===
Target column: stress_level
Detected -> numeric: ['Academic and extracurricular activities conflicting for you?', 'Age', 'Are you facing any difficulties with your professors or instructors?', 'Are you in competition with your peers, and does it affect you?', 'Do you attend classes regularly?', 'Do you face any sleep problems or difficulties falling asleep?', 'Do you feel overwhelmed with your academic workload?', 'Do you find that your relationship often causes you stress?', 'Do you get irritated easily?', 'Do you have trouble concentrating on your academic tasks?', 'Do you lack confidence in your academic performance?', 'Do you lack confidence in your choice of academ