In [1]:
import os
import io
import json
import time
import joblib
import base64
import argparse
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

import yaml
import h5py

# --------------------------
# Helpers
# --------------------------

# Column names that are treated as the label, checked in this priority order.
POSSIBLE_TARGETS = [
    "target", "Target", "TARGET",
    "output", "Output",
    "diagnosis", "Diagnosis",
    "label", "Label",
    "HeartDisease", "heart_disease"
]


def guess_target_column(df: pd.DataFrame, max_classes: int = 10) -> str:
    """Guess which column of *df* holds the classification target.

    Resolution order:

    1. The first name in ``POSSIBLE_TARGETS`` that is present in ``df``.
    2. The right-most column whose number of distinct non-null values lies
       between 2 and ``max_classes`` (inclusive). Constant and all-null
       columns are skipped because they cannot serve as a class label.
    3. The last column, as a final fallback.

    Args:
        df: Frame to inspect.
        max_classes: Largest number of distinct non-null values a column may
            have to be considered label-like.

    Returns:
        The label of the chosen column.

    Raises:
        ValueError: If ``df`` has no columns at all.
    """
    if len(df.columns) == 0:
        raise ValueError("Cannot guess a target column: the DataFrame has no columns.")

    # Prefer known names
    for col in POSSIBLE_TARGETS:
        if col in df.columns:
            return col

    # Heuristic: scan from the last column backwards for a low-cardinality
    # candidate (the last column is therefore tried first).
    for col in df.columns[::-1]:
        if 2 <= df[col].dropna().nunique() <= max_classes:
            return col

    # Fallback: last column anyway
    return df.columns[-1]

def is_binary_series(s: pd.Series) -> bool:
    """Return True when *s* has exactly two distinct non-null values.

    The values themselves are not constrained: ``{0, 1}``, ``{"yes", "no"}``
    and ``{1, 2}`` all count as binary. (The previous expression also carried
    a ``{0, 1}`` subset test, but operator precedence made it a no-op, so the
    result is unchanged.)
    """
    distinct = pd.unique(s.dropna())
    return len(distinct) == 2

def to_jsonable(obj):
    """Return *obj* unchanged if ``json.dumps`` accepts it, else ``str(obj)``.

    Serialization is attempted only as a probe; the encoded text is discarded.
    """
    try:
        json.dumps(obj)
    except Exception:
        # Anything the encoder rejects is represented by its string form.
        return str(obj)
    else:
        return obj

def save_h5_from_pickle(pkl_path: str, h5_path: str, meta: dict):
    """Wrap an existing pickle file in an HDF5 container.

    The raw bytes read from ``pkl_path`` are written to a dataset named
    ``model_pickle`` in a new file at ``h5_path`` (opened in ``"w"`` mode),
    and every entry of ``meta`` is attached to that dataset as an attribute.
    This yields an .h5 artifact without needing TF/Keras.

    Args:
        pkl_path: Path of the pickle file to embed.
        h5_path: Path of the HDF5 file to create.
        meta: Metadata to store as dataset attributes.
    """
    # Values of these types are stored as-is; everything else is JSON-encoded.
    scalar_types = (str, int, float, np.integer, np.floating)

    with open(pkl_path, "rb") as src:
        raw = src.read()

    with h5py.File(h5_path, "w") as container:
        blob = container.create_dataset("model_pickle", data=np.void(raw))
        for key, value in meta.items():
            try:
                encoded = value if isinstance(value, scalar_types) else json.dumps(value)
                blob.attrs[key] = encoded
            except Exception:
                # Last resort: keep at least a string rendering of the value.
                blob.attrs[key] = str(value)

# --------------------------
# Main
# --------------------------

def _encode_target(y_raw):
    """Turn the raw target column into numeric class labels.

    Object-typed targets are mapped to ``0..k-1`` following the sorted order
    of their distinct non-null values. Numeric targets are copied; the label
    sets ``{1, 2}`` and ``{0, 2}`` are remapped to ``{0, 1}``.

    Returns:
        ``(y, label_mapping)`` where ``label_mapping`` is the value-to-int
        dict used for object targets and ``None`` for numeric targets.
    """
    if y_raw.dtype == object:
        # map yes/no or strings to ints if possible
        unique_vals = y_raw.dropna().unique().tolist()
        mapping = {v: i for i, v in enumerate(sorted(unique_vals))}
        return y_raw.map(mapping), mapping

    y = y_raw.copy()
    uniq = set(pd.unique(y.dropna()))
    if uniq == {1, 2}:
        # Shift instead of replace({2: 1}): the latter merged both classes into 1.
        y = y - 1
    elif uniq == {0, 2}:
        y = y.replace({2: 1})
    return y, None


def _binary_auc(pipe, X_test, y_test):
    """Return the AUROC of *pipe* on the hold-out data, or None.

    None is returned when ``y_test`` does not contain exactly two classes,
    when the pipeline exposes neither ``predict_proba`` nor
    ``decision_function``, or when scoring raises; in the last case a warning
    is printed so the fallback to accuracy is visible.
    """
    try:
        if len(np.unique(y_test)) != 2:
            return None
        if hasattr(pipe, "predict_proba"):
            y_prob = pipe.predict_proba(X_test)[:, 1]
        elif hasattr(pipe, "decision_function"):
            # logistic-like interface; scale to [0,1]
            scores = pipe.decision_function(X_test)
            y_prob = (scores - scores.min()) / (scores.max() - scores.min() + 1e-9)
        else:
            return None
        return roc_auc_score(y_test, y_prob)
    except Exception as exc:
        # Best-effort metric: report the failure but keep the run going.
        print(f"[WARN] AUROC unavailable: {exc}")
        return None


def main():
    """Train the candidate classifiers on a CSV and export the best one.

    Steps: parse ``--csv`` / ``--out``, load the CSV, pick the target column
    with ``guess_target_column``, drop empty rows and rows without a target,
    encode the target, build an impute/scale/one-hot preprocessor, fit a
    logistic regression and a random forest on a 75/25 split, pick the model
    with the best hold-out AUROC (accuracy when AUROC is unavailable), and
    write four artifacts to the output directory: the pickled pipeline, an
    HDF5 wrapper around that pickle, a YAML run config and a JSON report.

    Raises:
        ValueError: If the guessed target column is not in the CSV.
        RuntimeError: If no model produced a usable score.
    """
    # Imported here so the names are bound explicitly rather than reached
    # through __import__("sklearn").<submodule>, which only works if some
    # other import has already loaded that submodule.
    import sklearn
    from sklearn.base import clone
    from sklearn.preprocessing import OneHotEncoder

    parser = argparse.ArgumentParser(description="Train CardioTrack models and export artifacts.")
    parser.add_argument(
        "--csv",
        default=r"C:\Users\sagni\Downloads\Cardio Track\archive\heart.csv",
        help="Path to heart.csv"
    )
    parser.add_argument(
        "--out",
        default=r"C:\Users\sagni\Downloads\Cardio Track",
        help="Output directory for artifacts"
    )
    # Use parse_known_args so Jupyter's -f arg doesn't break it
    args, _ = parser.parse_known_args()

    csv_path = args.csv
    out_dir = args.out

    os.makedirs(out_dir, exist_ok=True)

    print("[INFO] Loading CSV:", csv_path)
    df = pd.read_csv(csv_path)

    # Identify target
    target_col = guess_target_column(df)
    if target_col not in df.columns:
        raise ValueError(f"Could not find target column; guessed '{target_col}' but it's not in CSV.")

    print(f"[INFO] Using target column: {target_col}")

    # Basic cleaning: drop rows with all-NA, then rows whose target is missing
    # (a NaN label cannot be used for fitting or scoring).
    df = df.dropna(how="all")
    rows_before = len(df)
    df = df.dropna(subset=[target_col]).reset_index(drop=True)
    rows_missing_target = rows_before - len(df)
    if rows_missing_target:
        print(f"[WARN] Dropped {rows_missing_target} row(s) with missing target values.")

    # Ensure target is numeric 0/1 if possible
    y, label_mapping = _encode_target(df[target_col])

    X = df.drop(columns=[target_col])

    # Split numeric/categorical
    num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
    cat_cols = [c for c in X.columns if c not in num_cols]

    # Preprocess: impute, scale numeric; impute, one-hot categorical
    numeric_tf = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    categorical_tf = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_tf, num_cols),
            ("cat", categorical_tf, cat_cols)
        ],
        remainder="drop"
    )

    # Models
    models = {
        "logreg": LogisticRegression(max_iter=2000, n_jobs=None),
        "rf": RandomForestClassifier(
            n_estimators=400,
            max_depth=None,
            random_state=42,
            n_jobs=-1
        )
    }

    # Train/Val split (stratified whenever there is more than one class)
    stratify = y if y.nunique() > 1 else None
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=stratify
    )

    results = {}
    best_name = None
    best_pipe = None
    best_score = -np.inf

    # Fit & evaluate
    # NOTE: the same hold-out split both selects the model and supplies the
    # reported score, so that score is an optimistic estimate.
    for name, est in models.items():
        # Each pipeline gets its own preprocessor copy so fitting a later
        # candidate never refits the transformer inside an earlier one.
        pipe = Pipeline(steps=[("prep", clone(preprocessor)), ("model", est)])
        print(f"[INFO] Training {name} ...")
        pipe.fit(X_train, y_train)

        y_pred = pipe.predict(X_test)
        acc = accuracy_score(y_test, y_pred)

        # AUROC if possible
        auc = _binary_auc(pipe, X_test, y_test)

        # Choose by AUROC primarily, else by accuracy
        score = auc if auc is not None else acc
        results[name] = {
            "accuracy": acc,
            "roc_auc": auc,
            "chosen_score": score
        }

        if score is not None and score > best_score:
            best_score = score
            best_name = name
            best_pipe = pipe

    if best_name is None:
        raise RuntimeError("No model could be trained successfully.")

    print(f"[INFO] Best model: {best_name} (score={best_score:.4f})")

    # The exported pipeline is the one fitted on the training split only;
    # it is not refitted on the full data set.

    # --------------------------
    # Exports
    # --------------------------
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    pkl_path = os.path.join(out_dir, "cardiotrack_model.pkl")
    h5_path = os.path.join(out_dir, "cardiotrack_model.h5")
    yml_path = os.path.join(out_dir, "cardiotrack_config.yaml")
    jsn_path = os.path.join(out_dir, "cardiotrack_report.json")

    # 1) PKL
    joblib.dump(best_pipe, pkl_path)
    print("[OK] Saved:", pkl_path)

    # 2) YAML (config/run info)
    config_payload = {
        "run": {
            "timestamp": timestamp,
            "csv_path": csv_path,
            "out_dir": out_dir,
            "target": target_col
        },
        "preprocessing": {
            "numeric_columns": num_cols,
            "categorical_columns": cat_cols,
            "impute_numeric": "median",
            "impute_categorical": "most_frequent",
            "scale_numeric": "standard",
            "one_hot": True
        },
        "models_tried": list(models.keys()),
        "chosen_model": best_name
    }
    with open(yml_path, "w", encoding="utf-8") as f:
        yaml.safe_dump(config_payload, f, sort_keys=False, allow_unicode=True)
    print("[OK] Saved:", yml_path)

    # 3) JSON (metrics + schema + info)
    class_counts = pd.Series(y).value_counts(dropna=False).to_dict()
    report = {
        "timestamp": timestamp,
        "dataset_shape": [int(df.shape[0]), int(df.shape[1])],
        "target": target_col,
        "class_balance": {str(k): int(v) for k, v in class_counts.items()},
        "features": list(X.columns),
        "results": results,
        "chosen_model": best_name,
        "sklearn_version": sklearn.__version__,
        "notes": "CardioTrack quick baseline training"
    }
    if label_mapping is not None:
        report["label_mapping"] = {str(k): int(v) for k, v in label_mapping.items()}

    with open(jsn_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)

    print("[OK] Saved:", jsn_path)

    # 4) H5 (embed the pickled model plus some metadata)
    h5_meta = {
        "artifact_type": "sklearn_pickle_container",
        "chosen_model": best_name,
        "timestamp": timestamp,
        "target": target_col
    }
    save_h5_from_pickle(pkl_path, h5_path, h5_meta)
    print("[OK] Saved:", h5_path)

    print("\n=== CardioTrack Artifacts Ready ===")
    print("PKL ->", pkl_path)
    print("H5  ->", h5_path)
    print("YAML->", yml_path)
    print("JSON->", jsn_path)


if __name__ == "__main__":
    main()


[INFO] Loading CSV: C:\Users\sagni\Downloads\Cardio Track\archive\heart.csv
[INFO] Using target column: target
[INFO] Training logreg ...
[INFO] Training rf ...
[INFO] Best model: rf (score=1.0000)
[OK] Saved: C:\Users\sagni\Downloads\Cardio Track\cardiotrack_model.pkl
[OK] Saved: C:\Users\sagni\Downloads\Cardio Track\cardiotrack_config.yaml
[OK] Saved: C:\Users\sagni\Downloads\Cardio Track\cardiotrack_report.json
[OK] Saved: C:\Users\sagni\Downloads\Cardio Track\cardiotrack_model.h5

=== CardioTrack Artifacts Ready ===
PKL -> C:\Users\sagni\Downloads\Cardio Track\cardiotrack_model.pkl
H5  -> C:\Users\sagni\Downloads\Cardio Track\cardiotrack_model.h5
YAML-> C:\Users\sagni\Downloads\Cardio Track\cardiotrack_config.yaml
JSON-> C:\Users\sagni\Downloads\Cardio Track\cardiotrack_report.json
