In [1]:
import os
import json
import argparse
from datetime import datetime

import numpy as np
import pandas as pd
import joblib

from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix


# Fallback locations used when the matching CLI flag (--model, --input,
# --outdir) is omitted. These are absolute paths on one specific Windows
# machine, so pass the flags explicitly when running anywhere else.
DEFAULT_MODEL  = r"C:\Users\sagni\Downloads\Cardio Track\cardiotrack_model.pkl"
DEFAULT_INPUT  = r"C:\Users\sagni\Downloads\Cardio Track\archive\heart.csv"
DEFAULT_OUTDIR = r"C:\Users\sagni\Downloads\Cardio Track"

POSSIBLE_TARGETS = [
    "target", "Target", "TARGET",
    "output", "Output",
    "diagnosis", "Diagnosis",
    "label", "Label",
    "HeartDisease", "heart_disease"
]

def guess_target_column(df: pd.DataFrame) -> str | None:
    for col in POSSIBLE_TARGETS:
        if col in df.columns:
            return col
    # Heuristic: low-cardinality last column
    last = df.columns[-1]
    try:
        if df[last].dropna().nunique() <= 10:
            return last
    except Exception:
        pass
    return None

def safe_float(x):
    """Convert ``x`` to ``float``, yielding ``None`` when conversion fails.

    Any exception raised by ``float(x)`` is treated as "not convertible".
    """
    try:
        converted = float(x)
    except Exception:
        converted = None
    return converted

def _model_classes(pipe):
    """Return the fitted class labels of ``pipe`` as a list, or None if unknown."""
    classes = getattr(pipe, "classes_", None)
    # Fall back to a pipeline step literally named "model" when the wrapper
    # itself does not expose classes_.
    if classes is None and hasattr(pipe, "named_steps") and "model" in pipe.named_steps:
        classes = getattr(pipe.named_steps["model"], "classes_", None)
    if classes is None:
        return None
    try:
        return list(classes)
    except TypeError:
        return None


def _select_binary_proba(probs, classes):
    """Split ``predict_proba`` output into (positive, negative) probabilities.

    For a known two-class model the positive label is ``1`` when present,
    otherwise the larger of the two labels. For multiclass or unknown schemas
    only the per-row maximum probability is returned and the negative column
    is ``None``.
    """
    if classes is not None and len(classes) == 2:
        pos_label = 1 if 1 in classes else max(classes)
        proba_pos = probs[:, classes.index(pos_label)]
        return proba_pos, 1.0 - proba_pos
    return probs.max(axis=1), None


def _align_truth_labels(y_true, y_pred, classes):
    """Map {1, 2} or {0, 2} ground-truth labels onto {0, 1} when the model uses 0/1.

    The remap is applied only if the model's label space (``classes`` when
    known, otherwise the labels observed in ``y_pred``) lies within {0, 1} and
    is not already contained in the truth labels. In every other case the
    truth is returned unchanged.
    """
    try:
        truth_labels = set(pd.unique(y_true.dropna()))
        if classes is not None:
            model_labels = set(classes)
        else:
            model_labels = set(np.unique(y_pred).tolist())
    except TypeError:
        # Unhashable or otherwise incomparable labels: nothing to normalize.
        return y_true.copy()

    if not model_labels <= {0, 1} or model_labels <= truth_labels:
        return y_true.copy()
    if truth_labels == {1, 2}:
        # Shift both labels; replacing only 2 -> 1 would collapse every row to 1.
        return y_true.replace({1: 0, 2: 1})
    if truth_labels == {0, 2}:
        return y_true.replace({2: 1})
    return y_true.copy()


def main() -> None:
    """Run batch prediction with a saved model and write the results to disk.

    Reads the CLI flags (unknown flags are ignored), loads the joblib model
    and the input CSV, predicts every row and writes into ``--outdir``:

    * ``predictions.csv`` - the input rows plus the prediction column and,
      when ``predict_proba`` works, probability column(s);
    * ``prediction_summary.json`` - run metadata;
    * ``prediction_metrics.json`` - accuracy, ROC AUC and confusion matrix,
      only when a target column is present in the input.

    The target column is ``--target`` if given, otherwise it is guessed with
    ``guess_target_column``. Metrics are best effort: a metric that cannot be
    computed is reported with a ``[WARN]`` line and left out of the JSON.
    """
    parser = argparse.ArgumentParser(description="CardioTrack batch prediction")
    parser.add_argument("--model",  default=DEFAULT_MODEL,  help="Path to joblib .pkl model pipeline")
    parser.add_argument("--input",  default=DEFAULT_INPUT,  help="Path to input CSV")
    parser.add_argument("--outdir", default=DEFAULT_OUTDIR, help="Output directory")

    parser.add_argument("--target", default=None, help="Name of target column (optional)")
    parser.add_argument("--id_col", default=None, help="Optional ID column to carry into outputs")

    parser.add_argument("--pred_name", default="predicted", help="Name of the prediction column in output CSV")
    parser.add_argument("--proba1_name", default="proba_1", help="Column name for positive-class probability")
    parser.add_argument("--proba0_name", default="proba_0", help="Column name for negative-class probability")

    # parse_known_args tolerates extra argv entries instead of exiting,
    # e.g. when the script is executed from inside a notebook.
    args, _ = parser.parse_known_args()

    model_path = args.model
    input_path = args.input
    outdir     = args.outdir
    target_col = args.target
    id_col     = args.id_col

    os.makedirs(outdir, exist_ok=True)

    # ---------- Load model ----------
    print("[INFO] Loading model:", model_path)
    # NOTE: joblib.load unpickles arbitrary objects; only load model files
    # that come from a trusted source.
    pipe = joblib.load(model_path)

    # ---------- Load data ----------
    print("[INFO] Loading data:", input_path)
    df = pd.read_csv(input_path).dropna(how="all").reset_index(drop=True)
    n_rows, n_cols = df.shape
    print(f"[INFO] Data shape: {n_rows} x {n_cols}")

    # Detect target if not explicitly provided
    if target_col is None:
        target_col = guess_target_column(df)
        if target_col is not None:
            print(f"[INFO] Inferred target column: {target_col}")
    elif target_col not in df.columns:
        print(f"[WARN] Target column '{target_col}' not found in input; treating input as unlabeled.")

    has_target = bool(target_col) and bool(target_col in df.columns)

    # Separate features and (optional) ground truth
    if has_target:
        y_true = df[target_col].copy()
        X = df.drop(columns=[target_col])
    else:
        y_true = None
        X = df

    # The ID column is not a feature. It does not need to be re-attached
    # later because the output frame is built from df, which still has it.
    if id_col and id_col in X.columns:
        X = X.drop(columns=[id_col])

    # ---------- Predict ----------
    print("[INFO] Running predictions...")
    y_pred = pipe.predict(X)

    classes = _model_classes(pipe)
    proba_1 = None
    proba_0 = None
    if hasattr(pipe, "predict_proba"):
        try:
            probs = pipe.predict_proba(X)
            proba_1, proba_0 = _select_binary_proba(probs, classes)
        except Exception as e:
            print("[WARN] predict_proba not available or failed:", str(e))

    # ---------- Build output ----------
    out_df = df.copy()
    out_df[args.pred_name] = y_pred

    if proba_1 is not None:
        out_df[args.proba1_name] = proba_1
    if proba_0 is not None:
        out_df[args.proba0_name] = proba_0

    # ---------- Save predictions ----------
    pred_csv = os.path.join(outdir, "predictions.csv")
    out_df.to_csv(pred_csv, index=False)
    print("[OK] Saved predictions CSV:", pred_csv)

    # ---------- Summary JSON ----------
    summary = {
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "model_path": model_path,
        "input_path": input_path,
        "n_rows": int(n_rows),
        "n_cols": int(n_cols),
        "target_in_input": has_target,
        "pred_column": args.pred_name,
        "proba_1_column": (args.proba1_name if proba_1 is not None else None),
        "proba_0_column": (args.proba0_name if proba_0 is not None else None)
    }
    summary_json = os.path.join(outdir, "prediction_summary.json")
    with open(summary_json, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)
    print("[OK] Saved summary JSON:", summary_json)

    # ---------- Optional: Metrics if ground truth present ----------
    if has_target:
        # Normalize common binary label quirks for fair metrics
        y_true_proc = _align_truth_labels(y_true, y_pred, classes)

        metrics = {}
        try:
            metrics["accuracy"] = float(accuracy_score(y_true_proc, y_pred))
        except Exception as e:
            print("[WARN] accuracy could not be computed:", str(e))

        # AUROC only when a real positive-class probability exists (proba_0 is
        # set only for a known two-class model); the max-probability fallback
        # is not a positive-class score.
        # NOTE(review): roc_auc_score scores the greater label as positive; if
        # the model's classes are {1, 2}, proba_1 is P(label 1) - verify.
        try:
            if proba_0 is not None and len(np.unique(y_true_proc.dropna())) == 2:
                # ensure probs not constant
                if not np.allclose(np.min(proba_1), np.max(proba_1)):
                    metrics["roc_auc"] = float(roc_auc_score(y_true_proc, proba_1))
        except Exception as e:
            print("[WARN] roc_auc could not be computed:", str(e))

        # Confusion matrix over every label seen in truth or predictions
        try:
            labels_sorted = np.unique(np.concatenate([y_true_proc.values, y_pred]))
            cm = confusion_matrix(y_true_proc, y_pred, labels=labels_sorted).tolist()
            # Convert numpy scalars to plain Python values for json.dump.
            metrics["labels"] = [x.item() if isinstance(x, np.generic) else x
                                 for x in labels_sorted]
            metrics["confusion_matrix"] = cm
        except Exception as e:
            print("[WARN] confusion matrix could not be computed:", str(e))

        metrics_path = os.path.join(outdir, "prediction_metrics.json")
        with open(metrics_path, "w", encoding="utf-8") as f:
            json.dump(metrics, f, indent=2)
        print("[OK] Saved metrics JSON (since target present):", metrics_path)

    # ---------- Preview ----------
    print("\n=== Preview (first 5 rows) ===")
    with pd.option_context("display.max_columns", None):
        print(out_df.head(5))

# Run the batch-prediction CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


[INFO] Loading model: C:\Users\sagni\Downloads\Cardio Track\cardiotrack_model.pkl
[INFO] Loading data: C:\Users\sagni\Downloads\Cardio Track\archive\heart.csv
[INFO] Data shape: 1025 x 14
[INFO] Inferred target column: target
[INFO] Running predictions...
[OK] Saved predictions CSV: C:\Users\sagni\Downloads\Cardio Track\predictions.csv
[OK] Saved summary JSON: C:\Users\sagni\Downloads\Cardio Track\prediction_summary.json
[OK] Saved metrics JSON (since target present): C:\Users\sagni\Downloads\Cardio Track\prediction_metrics.json

=== Preview (first 5 rows) ===
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   