In [2]:
# === EcoHabit: Train model + save H5/PKL/YAML/JSON (version-safe) ===
# Input CSV : C:\Users\sagni\Downloads\archive\sustainable_fashion_trends_2024.csv
# Outputs   : C:\Users\sagni\Downloads\Eco Habit\mindpal_model.h5
#             C:\Users\sagni\Downloads\Eco Habit\mindpal_preprocess.pkl
#             C:\Users\sagni\Downloads\Eco Habit\mindpal_model.json
#             C:\Users\sagni\Downloads\Eco Habit\mindpal_model.yaml
#             (plus: accuracy.png, loss.png, training_history.csv)

import os
import json
import yaml
import joblib
import numpy as np
import pandas as pd
from scipy import sparse as sp

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# -----------------------------
# Paths (as requested)
# -----------------------------
# The defaults are the originally requested locations. Setting the
# ECOHABIT_CSV_PATH / ECOHABIT_OUT_DIR environment variables overrides them,
# so the script can run on another machine without editing the source.
CSV_PATH = os.environ.get(
    "ECOHABIT_CSV_PATH",
    r"C:\Users\sagni\Downloads\archive\sustainable_fashion_trends_2024.csv",
)
OUT_DIR = os.environ.get(
    "ECOHABIT_OUT_DIR",
    r"C:\Users\sagni\Downloads\Eco Habit",
)
# Create the output folder up front so every later save can assume it exists.
os.makedirs(OUT_DIR, exist_ok=True)

# -----------------------------
# Helper transformers (picklable; no lambdas)
# -----------------------------
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps exactly one named column.

    The column is selected with a one-element list of labels so the result
    stays two-dimensional (a single-column frame rather than a Series).
    """

    def __init__(self, column):
        # Label of the column to keep; stored untouched as a constructor parameter.
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by a one-element list holding ``self.column``."""
        wanted = [self.column]
        return X[wanted]

class To1DString(BaseEstimator, TransformerMixin):
    """Collapse an (n, 1) array or DataFrame into a 1D array of strings.

    Intended to sit directly in front of a text vectorizer, which expects a
    flat sequence of documents rather than a single-column 2D input.
    """

    def fit(self, X, y=None):
        """Stateless: return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return the first DataFrame column, or the raveled array, cast to ``str``."""
        if not isinstance(X, pd.DataFrame):
            # Array-like input: cast every element to str, then flatten.
            as_text = np.asarray(X).astype(str)
            return as_text.ravel()
        first_col = X.iloc[:, 0]
        return first_col.astype(str).values

class DateTimeExpand(BaseEstimator, TransformerMixin):
    """Expand datetime columns into year/month/day/day-of-week numeric features.

    Parameters
    ----------
    columns : list of column labels
        Columns of the input frame to expand. Each one is parsed with
        ``pd.to_datetime(..., errors="coerce")``, so unparseable entries
        become missing values rather than raising.

    Attributes
    ----------
    out_cols : list of str
        Names of the generated features, filled in by ``fit`` in the same
        order ``transform`` emits them.

    Notes
    -----
    Missing or unparseable dates are emitted as NaN (float columns). They used
    to be replaced by 0, which produced a fake "year 0 / month 0 / day 0" that
    the median imputer following this step in the pipeline could never see and
    that the scaler treated as an extreme outlier. Keeping NaN lets that
    downstream imputer handle the gaps.
    """

    def __init__(self, columns):
        self.columns = columns
        self.out_cols = []

    def fit(self, X, y=None):
        """Record the output feature names; nothing is learned from ``X``."""
        self.out_cols = [
            f"{c}_{part}"
            for c in self.columns
            for part in ("year", "month", "day", "dow")
        ]
        return self

    def transform(self, X):
        """Return a frame with four float columns per configured input column.

        When no columns are configured, an empty ``(len(X), 0)`` array is
        returned instead.
        """
        outs = []
        for c in self.columns:
            s = pd.to_datetime(X[c], errors="coerce")
            # astype(float) gives one dtype whether or not NaT is present
            # and keeps missing entries as NaN.
            outs.append(pd.DataFrame({
                f"{c}_year":  s.dt.year.astype(float),
                f"{c}_month": s.dt.month.astype(float),
                f"{c}_day":   s.dt.day.astype(float),
                f"{c}_dow":   s.dt.dayofweek.astype(float),
            }))
        return pd.concat(outs, axis=1) if outs else np.empty((len(X), 0))

# -----------------------------
# Utils
# -----------------------------
def get_ohe_version_safe():
    """Build a sparse-output OneHotEncoder that works across scikit-learn versions.

    The keyword controlling sparse output was renamed: ``sparse_output`` was
    introduced in scikit-learn 1.2 and the older ``sparse`` spelling was
    removed in 1.4. The new name is tried first; a ``TypeError`` from the
    constructor means the installed version only knows the old one.
    """
    shared_kwargs = {"handle_unknown": "ignore"}
    try:
        encoder = OneHotEncoder(sparse_output=True, **shared_kwargs)
    except TypeError:
        # Older scikit-learn: fall back to the pre-rename keyword.
        encoder = OneHotEncoder(sparse=True, **shared_kwargs)
    return encoder

def to_dense_if_reasonable(X, threshold_features=50000):
    """Return a dense copy of a sparse matrix unless it has too many columns.

    Non-sparse inputs are returned unchanged (the same object). A sparse input
    is converted with ``toarray()`` only when its column count is at most
    ``threshold_features``; wider matrices are returned still sparse to avoid
    a large memory allocation.
    """
    if not sp.issparse(X):
        return X
    too_wide = X.shape[1] > threshold_features
    if too_wide:
        return X
    return X.toarray()

# -----------------------------
# Load data
# -----------------------------
df = pd.read_csv(CSV_PATH)

# Target selection: the first well-known target name present in the file wins;
# if none of them is a column, the last column is used as the target.
CANDIDATE_TARGETS = ["target", "label", "y", "class", "output"]
matching_names = [name for name in CANDIDATE_TARGETS if name in df.columns]
target_col = matching_names[0] if matching_names else df.columns[-1]

# Split the frame into the raw label series and the feature frame.
y_raw = df[target_col]
X_df = df.drop(columns=[target_col])

# -----------------------------
# Column typing (auto-detect)
# -----------------------------
numeric_cols   = X_df.select_dtypes(include=[np.number]).columns.tolist()
obj_cols       = X_df.select_dtypes(include=["object"]).columns.tolist()
datetime_cols  = X_df.select_dtypes(include=["datetime64[ns]"]).columns.tolist()

# Try to parse every column that is neither numeric nor already datetime.
# The columns are visited in frame order (not via a set) so the resulting
# datetime_cols order is the same on every run.
already_typed = set(numeric_cols) | set(datetime_cols)
for c in [col for col in X_df.columns if col not in already_typed]:
    try:
        parsed = pd.to_datetime(X_df[c], errors="raise")
    except (ValueError, TypeError, OverflowError):
        # Not a date column. If it is not an object column either (for example
        # a bool column), it would be in none of the lists and the
        # ColumnTransformer would silently drop it; cast it to object so it
        # goes through the same text/categorical handling as string columns.
        if c not in obj_cols:
            X_df[c] = X_df[c].astype(object)
            obj_cols.append(c)
        continue
    X_df[c] = parsed
    datetime_cols.append(c)
    if c in obj_cols:
        obj_cols.remove(c)

# Heuristic: long strings => text, short strings => categorical.
# The mean string length (missing values count as their str() form) decides.
# NOTE(review): identifier-like columns with (nearly) one distinct value per
# row also land in cat_cols and get one-hot encoded; consider excluding them
# if they let the model memorize rows -- TODO confirm against the data.
text_cols, cat_cols = [], []
for c in obj_cols:
    s = X_df[c].astype(str)
    (text_cols if s.str.len().mean() > 40 else cat_cols).append(c)

# -----------------------------
# Build preprocessing (picklable; version-safe OHE; no lambdas)
# -----------------------------
# Each entry is a (name, pipeline, columns) triple for the ColumnTransformer.
transformers = []

# Numeric columns: median-impute the gaps, then standardize.
if numeric_cols:
    num_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    transformers += [("num", num_pipe, numeric_cols)]

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode with whichever sparse keyword the installed scikit-learn accepts.
if cat_cols:
    ohe = get_ohe_version_safe()
    cat_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", ohe),
    ])
    transformers += [("cat", cat_pipe, cat_cols)]

# Text columns: one independent TF-IDF pipeline per column.
for c in text_cols:
    txt_steps = [
        ("select", ColumnSelector(c)),
        ("impute", SimpleImputer(strategy="constant", fill_value="")),
        ("to1d", To1DString()),
        ("tfidf", TfidfVectorizer(max_features=5000)),
    ]
    txt_pipe = Pipeline(steps=txt_steps)
    transformers += [(f"text_{c}", txt_pipe, [c])]

# Datetime columns: expand into calendar parts, then impute and scale them
# like the other numeric features.
if datetime_cols:
    dt_pipe = Pipeline(steps=[
        ("expand", DateTimeExpand(datetime_cols)),
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    transformers += [("dt", dt_pipe, datetime_cols)]

# With no usable feature groups at all, fall back to the "passthrough" marker.
if transformers:
    preprocess = ColumnTransformer(transformers=transformers, sparse_threshold=0.3)
else:
    preprocess = "passthrough"

# -----------------------------
# Encode target + split
# -----------------------------
# Labels are cast to str first, so a missing target value becomes its own
# class instead of breaking the encoder.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw.astype(str))
n_classes = len(np.unique(y))

# A stratified split raises if any class has fewer than 2 samples or if either
# side of the split is smaller than the number of classes. The target is
# picked automatically and may be a high-cardinality column, so check those
# conditions and fall back to a plain random split instead of crashing.
TEST_FRACTION = 0.2
n_test = int(np.ceil(TEST_FRACTION * len(y)))
n_train = len(y) - n_test
can_stratify = (
    n_classes > 1
    and np.bincount(y).min() >= 2
    and min(n_train, n_test) >= n_classes
)
if n_classes > 1 and not can_stratify:
    print("[WARN] Too few samples per class for a stratified split; using a random split.")

X_train_df, X_test_df, y_train, y_test = train_test_split(
    X_df, y,
    test_size=TEST_FRACTION,
    random_state=42,
    stratify=y if can_stratify else None,
)

# Fit the preprocessing on the training split only, then apply it to the test
# split. `preprocess` is the plain string "passthrough" when no transformer
# was configured.
if isinstance(preprocess, str) and preprocess == "passthrough":
    X_train = X_train_df.values
    X_test  = X_test_df.values
else:
    X_train = preprocess.fit_transform(X_train_df, y_train)
    X_test  = preprocess.transform(X_test_df)

# Densify if small enough (keeps sparse otherwise)
X_train = to_dense_if_reasonable(X_train)
X_test  = to_dense_if_reasonable(X_test)
input_dim = X_train.shape[1]

# -----------------------------
# Build & train Keras model
# -----------------------------
# Seed Python's `random`, NumPy and the backend in one call where available;
# tf.random.set_seed alone covers only TensorFlow's own generator. Older
# TensorFlow releases without the helper fall back to the original call.
try:
    tf.keras.utils.set_random_seed(42)
except AttributeError:
    tf.random.set_seed(42)

def build_model(input_dim, n_classes):
    """Build and compile a small dense classifier.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.
    n_classes : int
        Number of distinct target classes. With two or fewer, the model ends
        in a single sigmoid unit trained with binary cross-entropy; otherwise
        it ends in an ``n_classes``-way softmax trained with sparse
        categorical cross-entropy (integer labels).

    Returns
    -------
    A compiled ``Sequential`` model reporting the "accuracy" metric.
    """
    m = Sequential()
    # Declare the input with an explicit Input object: passing `input_shape`
    # to the first Dense layer makes recent Keras versions emit a warning.
    m.add(tf.keras.Input(shape=(input_dim,)))
    m.add(Dense(128, activation="relu"))
    m.add(Dropout(0.3))
    m.add(Dense(64, activation="relu"))
    if n_classes <= 2:
        m.add(Dense(1, activation="sigmoid"))
        m.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    else:
        m.add(Dense(n_classes, activation="softmax"))
        m.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return m

model = build_model(input_dim=input_dim, n_classes=n_classes)

# Train for a fixed number of epochs; the held-out split doubles as the
# per-epoch validation data.
fit_options = {
    "validation_data": (X_test, y_test),
    "epochs": 10,
    "batch_size": 32,
    "verbose": 1,
}
history = model.fit(X_train, y_train, **fit_options)

# -----------------------------
# Quick eval + history plots
# -----------------------------
# One row per epoch, indexed from 1 so the CSV and plots use 1-based epochs.
hist_df = pd.DataFrame(history.history)
epoch_numbers = np.arange(1, len(hist_df) + 1)
hist_df.index = epoch_numbers
hist_csv = os.path.join(OUT_DIR, "training_history.csv")
hist_df.to_csv(hist_csv, index_label="epoch")

def plot_curve(y_series, y_val_series, title, ylabel, out_path):
    """Plot a per-epoch training curve (and optional validation curve) to a PNG.

    Parameters
    ----------
    y_series : sequence of numbers
        Training values, one per epoch.
    y_val_series : sequence of numbers or None
        Validation values with the same length as ``y_series``; skipped when
        ``None``.
    title, ylabel : str
        Figure title and y-axis label.
    out_path : str
        Destination file, written at 150 dpi.

    The x-axis is derived from the length of ``y_series`` (epochs 1..n)
    rather than from a module-level frame, so the function only depends on
    its arguments. The figure is closed even if plotting or saving fails.
    """
    epochs = np.arange(1, len(y_series) + 1)
    fig = plt.figure(figsize=(8, 5))
    try:
        plt.plot(epochs, y_series, label="Train")
        if y_val_series is not None:
            plt.plot(epochs, y_val_series, label="Validation")
        plt.title(title)
        plt.xlabel("Epoch")
        plt.ylabel(ylabel)
        plt.legend()
        plt.grid(True, linestyle="--", linewidth=0.5)
        plt.tight_layout()
        plt.savefig(out_path, dpi=150)
    finally:
        # Always release the figure so a failed save does not leak it.
        plt.close(fig)

# The accuracy metric is recorded under "accuracy" or "acc" (and the
# matching "val_" names); use whichever spelling is present in the history.
acc_key = next((k for k in ("accuracy", "acc") if k in hist_df.columns), None)
val_acc_key = next((k for k in ("val_accuracy", "val_acc") if k in hist_df.columns), None)

if acc_key:
    val_acc_series = hist_df[val_acc_key] if val_acc_key is not None else None
    acc_png = os.path.join(OUT_DIR, "accuracy.png")
    plot_curve(hist_df[acc_key], val_acc_series, "Model Accuracy", "Accuracy", acc_png)

# The loss curve is always plotted; the validation curve is included only when
# a "val_loss" column was recorded.
val_loss_series = hist_df["val_loss"] if "val_loss" in hist_df.columns else None
loss_png = os.path.join(OUT_DIR, "loss.png")
plot_curve(hist_df["loss"], val_loss_series, "Model Loss", "Loss", loss_png)

# -----------------------------
# Save artifacts
# -----------------------------
# 1) Whole Keras model saved to an .h5 file
h5_path = os.path.join(OUT_DIR, "mindpal_model.h5")
model.save(h5_path)
print(f"[SAVE] H5 -> {h5_path}")

# 2) Model config (the dict returned by get_config) as indented JSON
json_path = os.path.join(OUT_DIR, "mindpal_model.json")
with open(json_path, "w", encoding="utf-8") as f:
    json_config = model.get_config()
    json.dump(json_config, f, indent=2)
print(f"[SAVE] JSON -> {json_path}")

# 3) The same config as YAML, keeping the key order of the dict
yaml_path = os.path.join(OUT_DIR, "mindpal_model.yaml")
with open(yaml_path, "w", encoding="utf-8") as f:
    yaml_config = model.get_config()
    yaml.safe_dump(yaml_config, f, sort_keys=False)
print(f"[SAVE] YAML -> {yaml_path}")

# 4) Preprocessing bundle: the preprocessing object, the label encoder and
#    the detected column groups, dumped together with joblib.
pkl_path = os.path.join(OUT_DIR, "mindpal_preprocess.pkl")
bundle = {
    "preprocess": preprocess,
    "label_encoder": label_encoder,
    "target_col": target_col,
    "numeric_cols": numeric_cols,
    "cat_cols": cat_cols,
    "text_cols": text_cols,
    "datetime_cols": datetime_cols,
}
joblib.dump(bundle, pkl_path, compress=3, protocol=4)
print(f"[SAVE] PKL -> {pkl_path}")

# -----------------------------
# Final log
# -----------------------------
# Turn predicted probabilities into class indices: arg-max over the softmax
# outputs for more than two classes, a 0.5 threshold on the single sigmoid
# output otherwise.
y_pred_prob = model.predict(X_test, verbose=0)
if n_classes > 2:
    y_pred = np.argmax(y_pred_prob, axis=1)
else:
    y_pred = (y_pred_prob.ravel() >= 0.5).astype(int)

acc = accuracy_score(y_test, y_pred)

# Summary of the run, printed one line at a time.
summary_lines = [
    "\n=== Training complete ===",
    f"Target column: {target_col}",
    f"Detected -> numeric: {numeric_cols} | categorical: {cat_cols} | text: {text_cols} | datetime: {datetime_cols}",
    f"Validation accuracy: {acc:.4f}",
    f"Artifacts saved in: {OUT_DIR}",
]
for summary_line in summary_lines:
    print(summary_line)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
125/125 ━━━━━━━━━━━━━━━━━━━━ 3s 9ms/step - accuracy: 0.1990 - loss: 1.6123 - val_accuracy: 0.2100 - val_loss: 1.6093
Epoch 2/10
125/125 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - accuracy: 0.3165 - loss: 1.5766 - val_accuracy: 0.2140 - val_loss: 1.6226
Epoch 3/10
125/125 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - accuracy: 0.4528 - loss: 1.4217 - val_accuracy: 0.2260 - val_loss: 1.6774
Epoch 4/10
125/125 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - accuracy: 0.8307 - loss: 0.7572 - val_accuracy: 0.2220 - val_loss: 1.7026
Epoch 5/10
125/125 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - accuracy: 0.9954 - loss: 0.1053 - val_accuracy: 0.2020 - val_loss: 1.6707
Epoch 6/10
125/125 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - accuracy: 1.0000 - loss: 0.0202 - val_accuracy: 0.2080 - val_loss: 1.6720
Epoch 7/10
125/125 



[SAVE] H5 -> C:\Users\sagni\Downloads\Eco Habit\mindpal_model.h5
[SAVE] JSON -> C:\Users\sagni\Downloads\Eco Habit\mindpal_model.json
[SAVE] YAML -> C:\Users\sagni\Downloads\Eco Habit\mindpal_model.yaml
[SAVE] PKL -> C:\Users\sagni\Downloads\Eco Habit\mindpal_preprocess.pkl

=== Training complete ===
Target column: Certifications
Detected -> numeric: ['Year', 'Carbon_Footprint_MT', 'Water_Usage_Liters', 'Waste_Production_KG', 'Product_Lines', 'Average_Price_USD'] | categorical: ['Brand_ID', 'Brand_Name', 'Country', 'Sustainability_Rating', 'Material_Type', 'Eco_Friendly_Manufacturing', 'Recycling_Programs', 'Market_Trend'] | text: [] | datetime: []
Validation accuracy: 0.2140
Artifacts saved in: C:\Users\sagni\Downloads\Eco Habit
