In [2]:
import os, json, datetime as dt
import numpy as np, pandas as pd
import joblib

SNAPSHOT_PATH = r"C:\Users\aayus\Downloads\emr-smart\data\emr_snapshot.csv"
HISTORY_PATH  = r"C:\Users\aayus\Downloads\emr-smart\data\emr_history.csv"
MODEL_DIR     = os.path.join(os.path.dirname(__file__), "models")
OUT_DIR       = os.path.join(os.path.dirname(__file__), "outputs")
os.makedirs(OUT_DIR, exist_ok=True)

# ---------- helpers ----------
def severity_to_class(s, c1=2.5, c2=5.6):
    if s < c1: return "Low"
    if s < c2: return "Moderate"
    return "High"

def rule_alerts(feat):
    alerts, why = [], []
    o2 = feat.get("Oxygen_Saturation")
    if o2 is not None and pd.notna(o2) and o2 < 92:
        alerts.append("Hypoxemia"); why.append(f"O₂ {o2:.0f}%")
    T = feat.get("Temperature")
    if T is not None and pd.notna(T) and T >= 38.3:
        alerts.append("Possible infection/fever"); why.append(f"T {T:.1f}°C")
    sbp = feat.get("Blood_Pressure_Systolic"); dbp = feat.get("Blood_Pressure_Diastolic")
    if all(v is not None and pd.notna(v) for v in [sbp, dbp]) and (sbp >= 180 or dbp >= 120):
        alerts.append("Hypertensive crisis (recheck)"); why.append(f"BP {int(sbp)}/{int(dbp)}")
    glu = feat.get("Blood_Sugar")
    if glu is not None and pd.notna(glu) and glu >= 250:
        alerts.append("Hyperglycemia"); why.append(f"Glucose {int(glu)} mg/dL")
    return alerts, why

def series_delta_slope(row, prefix):
    weeks = [row.get(f"{prefix}_Week{i}", np.nan) for i in range(1,6)]
    arr = np.array(weeks, dtype=float)
    if np.any(np.isnan(arr)): return {"delta": None, "slope": None}
    x = np.arange(1,6, dtype=float)
    slope = float(np.polyfit(x, arr, 1)[0])
    delta = float(arr[-1] - arr[0])
    return {"delta": delta, "slope": slope}

# ---------- load ----------
snap = pd.read_csv(SNAPSHOT_PATH)
hist = pd.read_csv(HISTORY_PATH)

# models (snapshot may not exist if you skipped severity earlier)
snapshot_model = None
cut_c1, cut_c2 = 2.5, 5.6
sp = os.path.join(MODEL_DIR, "snapshot_regressor.joblib")
if os.path.exists(sp):
    snapshot_model = joblib.load(sp)
    cpath = os.path.join(MODEL_DIR, "snapshot_cutpoints.txt")
    if os.path.exists(cpath):
        with open(cpath) as f:
            for line in f:
                if line.startswith("c1="): cut_c1 = float(line.split("=")[1])
                if line.startswith("c2="): cut_c2 = float(line.split("=")[1])

hist_model = joblib.load(os.path.join(MODEL_DIR, "history_classifier.joblib"))

# align patients present in both
if "Patient_ID" not in snap.columns or "Patient_ID" not in hist.columns:
    raise ValueError("Both CSVs must contain Patient_ID")

pids = sorted(set(snap["Patient_ID"]).intersection(set(hist["Patient_ID"])))
if not pids:
    raise ValueError("No overlapping Patient_IDs between snapshot and history")

# choose the same feature set the model expects (model’s ColumnTransformer handles the rest)
def current_feats(row):
    keys = ["Heart_Rate","Blood_Pressure_Systolic","Blood_Pressure_Diastolic",
            "Respiratory_Rate","Temperature","Oxygen_Saturation",
            "Blood_Sugar","Cholesterol_Total","Weight","BMI"]
    return {k: row.get(k, None) for k in keys}

fusions = []
for pid in pids:
    srow = snap.loc[snap["Patient_ID"] == pid].iloc[0]
    hrow = hist.loc[hist["Patient_ID"] == pid].iloc[0]

    # SNAPSHOT predict
    sev_score = None; sev_class = None
    if snapshot_model is not None:
        # build X row dropping IDs/Severity if present; ColumnTransformer will handle numeric+cats
        drop_cols = [c for c in ["Patient_ID","Patient_Name","Severity"] if c in srow.index]
        Xs = pd.DataFrame([srow.drop(labels=drop_cols, errors="ignore").to_dict()])
        sev_score = float(snapshot_model.predict(Xs)[0])
        sev_class = severity_to_class(sev_score, cut_c1, cut_c2)

    # HISTORY predict
    # the classifier pipeline expects the raw row-like frame it was trained on (build_design was inside pipeline at train)
    Xh = pd.DataFrame([hrow.drop(labels=[c for c in ["Patient_ID","Trend_Status"] if c in hrow.index]).to_dict()])
    probs = hist_model.predict_proba(Xh)[0]
    classes = list(hist_model.classes_)
    trend_label = classes[int(np.argmax(probs))]
    trend_probs  = {cls: float(p) for cls, p in zip(classes, probs)}

    # compact trend features (handful)
    tf = {
        "Heart_Rate": series_delta_slope(hrow, "Heart_Rate"),
        "Blood_Pressure_Systolic": series_delta_slope(hrow, "Blood_Pressure_Systolic"),
    }

    cur = current_feats(srow)
    alerts, why = rule_alerts(cur)

    # simple risk score
    risk = 1
    if sev_class == "High" and trend_label == "Worsening":
        risk = 3
    elif (sev_class == "High" and trend_label in ["Stable","Improving"]) or (sev_class == "Moderate" and trend_label == "Worsening"):
        risk = 2

    fusion = {
        "patient_id": str(pid),
        "timestamp": dt.datetime.utcnow().isoformat() + "Z",
        "snapshot": {
            "severity_score": sev_score,
            "severity_class": sev_class,
            "features": cur
        },
        "history": {
            "trend_label": trend_label,
            "probs": trend_probs,
            "trend_features": tf
        },
        "derived": {
            "risk_score": risk,
            "alerts": alerts,
            "rationales": why
        }
    }
    fusions.append(fusion)

# write one JSON per patient + one big ndjson
ndjson_path = os.path.join(OUT_DIR, "fusion.ndjson")
with open(ndjson_path, "w", encoding="utf-8") as f:
    for fu in fusions:
        f.write(json.dumps(fu) + "\n")

print(f"Fusion written: {ndjson_path}  ({len(fusions)} patients)")


NameError: name '__file__' is not defined

In [5]:
from pathlib import Path
import os, json, joblib
import numpy as np
import pandas as pd

# ---- Paths (project-rooted) ----
BASE = Path(r"C:\Users\aayus\Downloads\emr-smart")
DATA_DIR  = BASE / "data"
MODEL_DIR = BASE / "models"
OUT_DIR   = BASE / "outputs"
OUT_DIR.mkdir(parents=True, exist_ok=True)

SNAPSHOT_PATH = DATA_DIR / "emr_snapshot.csv"
HISTORY_PATH  = DATA_DIR / "emr_history.csv"

# ---- Load data ----
snap = pd.read_csv(SNAPSHOT_PATH)
hist = pd.read_csv(HISTORY_PATH)

# ---- Load models ----
hist_model_path = MODEL_DIR / "best_history_classifier.joblib"
if not hist_model_path.exists():
    raise FileNotFoundError(f"Missing model: {hist_model_path}")
hist_pipe = joblib.load(hist_model_path)

snap_pipe = None
cutpoints = (2.5, 5.6)
try:
    snap_model_path = MODEL_DIR / "snapshot_regressor.joblib"
    if snap_model_path.exists():
        snap_pipe = joblib.load(snap_model_path)
        cp_txt = MODEL_DIR / "snapshot_cutpoints.txt"
        if cp_txt.exists():
            lines = cp_txt.read_text(encoding="utf-8").strip().splitlines()
            kv = {}
            for ln in lines:
                if "=" in ln:
                    k, v = ln.split("=", 1)
                    kv[k.strip()] = float(v.strip())
            cutpoints = (kv.get("c1", 2.5), kv.get("c2", 5.6))
except Exception as e:
    print("Snapshot model not loaded:", e)

# ---- Helper for binning severity ----
bins = [-1e9, cutpoints[0], cutpoints[1], 1e9]
labels = ["Low", "Moderate", "High"]

# ---- Build fusion records by patient ----
by_id = {}

# Ensure Patient_ID columns
if "Patient_ID" not in snap.columns:
    snap["Patient_ID"] = np.arange(len(snap))
if "Patient_ID" not in hist.columns:
    hist["Patient_ID"] = np.arange(len(hist))

snap_ids = snap["Patient_ID"].astype(str)
hist_ids = hist["Patient_ID"].astype(str)

# ---- Snapshot predictions (optional) ----
if snap_pipe is not None:
    drop_cols = [c for c in ["Patient_ID", "Patient_Name", "Severity", "_SeveritySynth_"] if c in snap.columns]
    Xs = snap.drop(columns=drop_cols, errors="ignore")
    sev_pred = snap_pipe.predict(Xs)
    sev_lbl  = pd.cut(sev_pred, bins=bins, labels=labels).astype(str)
    for pid, s_num, s_lbl in zip(snap_ids, sev_pred, sev_lbl):
        d = by_id.setdefault(pid, {})
        d["snapshot"] = {
            "severity_numeric": float(s_num),
            "severity_label": s_lbl
        }

# ---- History predictions (required) ----
# Mirror training drops: remove id + any severity week columns if present
severity_cols = [c for c in hist.columns if str(c).startswith("Severity_Week")]
Xh = hist.drop(columns=[c for c in ["Patient_ID", "Trend_Status", *severity_cols] if c in hist.columns], errors="ignore")

trend_pred = hist_pipe.predict(Xh)
try:
    P = hist_pipe.predict_proba(Xh)
    cls = hist_pipe.named_steps["model"].classes_
except Exception:
    P = None
    cls = None

for i, (pid, tr) in enumerate(zip(hist_ids, trend_pred)):
    rec = by_id.setdefault(pid, {})
    rec["history"] = {"trend": str(tr)}
    if P is not None:
        pmap = {str(c): float(P[i, j]) for j, c in enumerate(cls)}
        rec["history"]["proba"] = pmap

# ---- Alerts (simple rules) ----
def mk_alerts(blob: dict):
    alerts = []
    snap_blob = blob.get("snapshot", {})
    hist_blob = blob.get("history", {})
    sev = snap_blob.get("severity_numeric")
    sev_lbl = snap_blob.get("severity_label")
    trend = hist_blob.get("trend")
    probs = hist_blob.get("proba", {})

    # Example heuristics
    if sev is not None and sev >= 7.5:
        alerts.append("High predicted severity; consider closer monitoring.")
    if trend == "Worsening":
        alerts.append("Worsening 5-week trend flagged; review recent vitals and medications.")
    if probs and probs.get("Stable", 0.0) < 0.2:
        alerts.append("Low stability probability; check for emerging issues.")
    return alerts

for pid, blob in by_id.items():
    blob["alerts"] = mk_alerts(blob)

# ---- Write NDJSON ----
out_path = OUT_DIR / "fusion.ndjson"
with out_path.open("w", encoding="utf-8") as f:
    for pid, blob in by_id.items():
        f.write(json.dumps({"patient_id": pid, **blob}, ensure_ascii=False) + "\n")

print("Wrote:", str(out_path))


ValueError: columns are missing: {'Respiratory_Rate_delta', 'Blood_Sugar_cv', 'Weight_abs_slope', 'Blood_Pressure_Systolic_mad', 'Weight_mad', 'Cholesterol_Total_slope', 'BMI_abs_slope', 'Heart_Rate_range', 'BMI_mean', 'Severity_Week5', 'Cholesterol_Total_delta', 'Blood_Pressure_Diastolic_abs_slope', 'Severity_Week4', 'global_mean_abs_step', 'Respiratory_Rate_cv', 'Blood_Pressure_Diastolic_cv', 'Respiratory_Rate_abs_slope', 'Oxygen_Saturation_slope', 'Blood_Pressure_Systolic_delta', 'Temperature_range', 'Respiratory_Rate_mean_abs_step', 'Severity_Week3', 'Blood_Sugar_std', 'Blood_Pressure_Systolic_mean', 'Weight_delta', 'Temperature_stable', 'Temperature_mean_abs_step', 'Blood_Pressure_Systolic_range', 'Blood_Pressure_Diastolic_slope', 'Blood_Pressure_Systolic_slope', 'Blood_Pressure_Diastolic_mean', 'Blood_Pressure_Diastolic_range', 'BMI_slope', 'Heart_Rate_mean_abs_step', 'Oxygen_Saturation_delta', 'Temperature_abs_slope', 'Weight_stable', 'Temperature_std', 'Heart_Rate_cv', 'Weight_mean', 'Cholesterol_Total_mad', 'BMI_std', 'Temperature_mean', 'Blood_Pressure_Diastolic_stable', 'Blood_Pressure_Systolic_mean_abs_step', 'Weight_cv', 'Heart_Rate_std', 'Blood_Pressure_Systolic_std', 'Temperature_delta', 'BMI_stable', 'Cholesterol_Total_std', 'Respiratory_Rate_std', 'Blood_Sugar_delta', 'Oxygen_Saturation_mean', 'Oxygen_Saturation_mean_abs_step', 'Cholesterol_Total_mean_abs_step', 'global_stability_ratio', 'Cholesterol_Total_cv', 'Blood_Pressure_Systolic_abs_slope', 'Temperature_mad', 'Weight_slope', 'Oxygen_Saturation_cv', 'BMI_mean_abs_step', 'BMI_cv', 'Heart_Rate_mean', 'Oxygen_Saturation_std', 'Weight_std', 'Blood_Sugar_slope', 'BMI_range', 'Blood_Pressure_Diastolic_mean_abs_step', 'Oxygen_Saturation_stable', 'Cholesterol_Total_stable', 'Blood_Pressure_Diastolic_std', 'Respiratory_Rate_range', 'Blood_Pressure_Diastolic_delta', 'Blood_Sugar_abs_slope', 'Heart_Rate_slope', 'Blood_Pressure_Systolic_cv', 'Heart_Rate_abs_slope', 'Weight_range', 'Respiratory_Rate_mean', 'Blood_Sugar_stable', 'Severity_Week2', 'Blood_Sugar_mean', 'Oxygen_Saturation_abs_slope', 'Respiratory_Rate_slope', 'Temperature_slope', 'Blood_Sugar_range', 'BMI_delta', 'Cholesterol_Total_abs_slope', 'Blood_Pressure_Diastolic_mad', 'Heart_Rate_delta', 'Respiratory_Rate_mad', 'Respiratory_Rate_stable', 'Temperature_cv', 'Weight_mean_abs_step', 'Blood_Sugar_mean_abs_step', 'Cholesterol_Total_range', 'Oxygen_Saturation_range', 'Severity_Week1', 'Cholesterol_Total_mean', 'Heart_Rate_mad', 'Blood_Sugar_mad', 'Oxygen_Saturation_mad', 'BMI_mad', 'Blood_Pressure_Systolic_stable', 'Heart_Rate_stable'}

In [6]:
# ============================================
# EMR Inference: Snapshot + History → fusion.ndjson
# Paths adapted to your layout shown in screenshots
# ============================================

import os, json, warnings
import numpy as np
import pandas as pd
from pathlib import Path
import joblib

from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

# ---------- Paths ----------
BASE_DIR   = Path(r"C:\Users\aayus\Downloads\emr-smart")
DATA_DIR   = BASE_DIR / "data"
MODEL_DIR  = BASE_DIR / "models"
OUT_DIR    = BASE_DIR / "outputs"
OUT_DIR.mkdir(parents=True, exist_ok=True)

snap_csv = DATA_DIR / "emr_snapshot.csv"
hist_csv = DATA_DIR / "emr_history.csv"

snap_model_path = MODEL_DIR / "snapshot_regressor.joblib"      # may not exist if snapshot target was synthesized
cutpoints_path  = MODEL_DIR / "snapshot_cutpoints.txt"          # has c1,c2 used when binning
hist_model_path = MODEL_DIR / "best_history_classifier.joblib"  # exists per your screenshot

# ---------- Helpers ----------
def prf(y_true, y_pred, title):
    acc = accuracy_score(y_true, y_pred)
    p_w, r_w, f_w, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted", zero_division=0)
    p_m, r_m, f_m, _ = precision_recall_fscore_support(y_true, y_pred, average="macro", zero_division=0)
    print(f"\n=== {title} ===")
    print(f"Accuracy: {acc:.4f}")
    print(f"Weighted -> P:{p_w:.4f} R:{r_w:.4f} F1:{f_w:.4f}")
    print(f"Macro    -> P:{p_m:.4f} R:{r_m:.4f} F1:{f_m:.4f}")
    print("\nPer-class report:")
    print(classification_report(y_true, y_pred, zero_division=0))
    print("Confusion matrix (rows=true, cols=pred):\n", confusion_matrix(y_true, y_pred))

# ---------- Trend features (MUST match training) ----------
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

ALL_BASES = [
    "Blood_Pressure_Systolic","Blood_Pressure_Diastolic","Heart_Rate","Temperature",
    "Respiratory_Rate","Oxygen_Saturation","Blood_Sugar","Cholesterol_Total","Weight","BMI"
]

class TrendFeatureEngineer(BaseEstimator, TransformerMixin):
    # This is the SAME definition we trained with (eps=1.6, full feature list)
    def __init__(self, bases, eps=1.6):
        self.bases = bases
        self.eps = eps
        self.feature_names_ = None

    def fit(self, X, y=None):
        feats = []
        for b in self.bases:
            feats += [
                f"{b}_delta", f"{b}_mean", f"{b}_std", f"{b}_slope",
                f"{b}_stable", f"{b}_range", f"{b}_mean_abs_step",
                f"{b}_cv", f"{b}_mad", f"{b}_abs_slope"
            ]
        feats += ["global_stability_ratio", "global_mean_abs_step"]
        self.feature_names_ = feats
        return self

    def transform(self, X):
        W = np.array([1,2,3,4,5], dtype=float)
        rows = []
        for _, row in X.iterrows():
            feats = []
            stable_hits = 0
            all_steps = []
            for b in self.bases:
                vals = np.array([row[f"{b}_Week{i}"] for i in range(1,6)], dtype=float)
                delta = float(vals[-1] - vals[0])
                meanv = float(np.mean(vals))
                stdv = float(np.std(vals, ddof=0))
                slope = float(np.polyfit(W, vals, 1)[0])
                steps = np.diff(vals)
                max_step = float(np.max(np.abs(steps)))
                stable = 1.0 if max_step < self.eps else 0.0
                rng = float(np.max(vals) - np.min(vals))
                mean_abs_step = float(np.mean(np.abs(steps)))
                cv = float(stdv / (meanv + 1e-6))
                mad = float(np.median(np.abs(vals - np.median(vals))))
                abs_slope = abs(slope)

                feats += [delta, meanv, stdv, slope, stable,
                          rng, mean_abs_step, cv, mad, abs_slope]
                stable_hits += stable
                all_steps.extend(np.abs(steps))

            feats += [
                stable_hits / max(len(self.bases), 1),
                float(np.mean(all_steps)) if all_steps else 0.0
            ]
            rows.append(feats)
        return np.array(rows)

    def get_feature_names_out(self, input_features=None):
        return np.array(self.feature_names_)

def build_history_design(df_raw: pd.DataFrame) -> pd.DataFrame:
    """
    Reconstruct the SAME matrix used at training time:
    - Keep all original Week1..Week5 columns (do NOT drop)
    - Append engineered trend features
    - Keep any categorical columns as-is
    """
    # Do NOT drop Severity_Week* here; training kept them
    cat_cols = [c for c in df_raw.columns if df_raw[c].dtype == "object"]
    num_cols = [c for c in df_raw.columns if c not in cat_cols]

    # Determine which bases exist
    bases_present = [b for b in ALL_BASES if f"{b}_Week1" in df_raw.columns and f"{b}_Week5" in df_raw.columns]
    if not bases_present:
        raise ValueError("No *_Week1..Week5 series found in history input; cannot engineer trends.")

    eng = TrendFeatureEngineer(bases_present, eps=1.6)
    trend = eng.fit_transform(df_raw)
    tcols = eng.get_feature_names_out().tolist()

    # Build new frame with numeric (original) + engineered + cats (concat for speed)
    X_num = df_raw[num_cols].copy()
    X_trend = pd.DataFrame(trend, columns=tcols, index=df_raw.index)
    X_cat = df_raw[cat_cols].copy()
    out = pd.concat([X_num, X_trend, X_cat], axis=1)
    return out

# ---------- Load models ----------
if not hist_model_path.exists():
    raise FileNotFoundError(f"Missing model: {hist_model_path}")
hist_pipe = joblib.load(hist_model_path)

snap_available = snap_model_path.exists()
if snap_available:
    snap_pipe = joblib.load(snap_model_path)
    # load cutpoints if present
    c1, c2 = 2.5, 5.6
    if cutpoints_path.exists():
        try:
            text = Path(cutpoints_path).read_text().strip().splitlines()
            kv = dict(line.split("=") for line in text if "=" in line)
            c1 = float(kv.get("c1", c1))
            c2 = float(kv.get("c2", c2))
        except Exception:
            pass

# ---------- SNAPSHOT inference (optional) ----------
snap_records = {}
if snap_available and snap_csv.exists():
    snap = pd.read_csv(snap_csv)
    # infer using the saved pipeline (it contains preprocessing); we just drop ID/name if present
    Xs = snap.drop(columns=[c for c in ["Patient_ID", "Patient_Name", "Severity", "severity",
                                        "SeverityScore","severity_score","Severity_Label","severity_label",
                                        "SeverityClass","severityClass","sev","label","target"]
                            if c in snap.columns], errors="ignore")
    y_pred_sev = snap_pipe.predict(Xs)
    # Bin to Low/Moderate/High using saved cutpoints
    bins = [-1e9, c1, c2, 1e9]
    labels = ["Low","Moderate","High"]
    sev_cls = pd.cut(y_pred_sev, bins=bins, labels=labels).astype(str)

    # stash by patient_id if present; else row index
    pid_col = "Patient_ID" if "Patient_ID" in snap.columns else None
    for i in range(len(snap)):
        pid = str(snap.iloc[i][pid_col]) if pid_col else f"row_{i}"
        snap_records[pid] = {
            "severity_numeric": float(y_pred_sev[i]),
            "severity_label": sev_cls.iloc[i]
        }
    print(f"[SNAPSHOT] Inferred {len(snap_records)} rows (with classes via cutpoints c1={c1}, c2={c2}).")
else:
    if not snap_csv.exists():
        warnings.warn(f"Snapshot CSV not found at {snap_csv}; skipping snapshot inference.")
    else:
        warnings.warn("Snapshot model not found; skipping snapshot inference.")

# ---------- HISTORY inference (rebuild design!) ----------
if not hist_csv.exists():
    raise FileNotFoundError(f"History CSV not found at: {hist_csv}")

hist = pd.read_csv(hist_csv)

# We only drop obvious IDs/labels; KEEP Severity_Week* columns because training kept them
drop_cols = [c for c in ["Patient_ID", "Trend_Status"] if c in hist.columns]
hist_raw = hist.drop(columns=drop_cols, errors="ignore").copy()

# Build design to match training schema
Xh = build_history_design(hist_raw)

# Predict
trend_pred = hist_pipe.predict(Xh)
try:
    P = hist_pipe.predict_proba(Xh)
    classes = hist_pipe.named_steps.get("model", hist_pipe).classes_
except Exception:
    # some estimators lack predict_proba
    P = None
    classes = np.unique(trend_pred)

# Map probs to dict per row
def row_proba(i):
    if P is None: 
        return None
    return {str(classes[j]): float(P[i, j]) for j in range(len(classes))}

# ---------- Fuse and write NDJSON ----------
fusion_path = OUT_DIR / "fusion.ndjson"
with open(fusion_path, "w", encoding="utf-8") as fout:
    for i in range(len(hist)):
        pid = str(hist.iloc[i]["Patient_ID"]) if "Patient_ID" in hist.columns else f"row_{i}"
        rec = {
            "patient_id": pid,
            "history": {
                "trend": str(trend_pred[i]),
                "proba": row_proba(i)
            },
            "snapshot": snap_records.get(pid, None),
            "alerts": []  # you can add rule-based alerts here if you want
        }
        fout.write(json.dumps(rec, ensure_ascii=False) + "\n")

print("Wrote:", fusion_path)


AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [7]:
# fuse_infer_generate.py  — fixed sev_cls indexing + rebuilds trend features

import os, json, warnings
import numpy as np
import pandas as pd
import joblib

from sklearn.metrics import precision_recall_fscore_support

# ----------------- paths -----------------
ROOT = r"C:\Users\aayus\Downloads\emr-smart"
DATA_DIR   = os.path.join(ROOT, "data")
MODEL_DIR  = os.path.join(ROOT, "models")
OUT_DIR    = os.path.join(ROOT, "outputs")
os.makedirs(OUT_DIR, exist_ok=True)

snap_path = os.path.join(DATA_DIR, "emr_snapshot.csv")
hist_path = os.path.join(DATA_DIR, "emr_history.csv")

snap_model_path = os.path.join(MODEL_DIR, "snapshot_regressor.joblib")
hist_model_path = os.path.join(MODEL_DIR, "best_history_classifier.joblib")
cutpoints_path  = os.path.join(MODEL_DIR, "snapshot_cutpoints.txt")

# ----------------- helper: trend features (must match training) -----------------
from sklearn.base import BaseEstimator, TransformerMixin

ALL_BASES = [
    "Blood_Pressure_Systolic","Blood_Pressure_Diastolic","Heart_Rate","Temperature",
    "Respiratory_Rate","Oxygen_Saturation","Blood_Sugar","Cholesterol_Total","Weight","BMI"
]

class TrendFeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self, bases, eps=1.6):
        self.bases = bases
        self.eps   = eps
        self.feature_names_ = None

    def fit(self, X, y=None):
        feats = []
        for b in self.bases:
            feats += [
                f"{b}_delta", f"{b}_mean", f"{b}_std", f"{b}_slope",
                f"{b}_stable", f"{b}_range", f"{b}_mean_abs_step",
                f"{b}_cv", f"{b}_mad", f"{b}_abs_slope"
            ]
        feats += ["global_stability_ratio", "global_mean_abs_step"]
        self.feature_names_ = feats
        return self

    def transform(self, X):
        W = np.array([1,2,3,4,5], dtype=float)
        rows = []
        for _, row in X.iterrows():
            feats = []
            stable_hits = 0
            all_steps = []
            for b in self.bases:
                vals = np.array([row[f"{b}_Week{i}"] for i in range(1,6)], dtype=float)
                delta = float(vals[-1] - vals[0])
                meanv = float(np.mean(vals))
                stdv  = float(np.std(vals, ddof=0))
                slope = float(np.polyfit(W, vals, 1)[0])
                steps = np.diff(vals)
                max_step = float(np.max(np.abs(steps)))
                stable = 1.0 if max_step < self.eps else 0.0
                rng = float(np.max(vals) - np.min(vals))
                mean_abs_step = float(np.mean(np.abs(steps)))
                cv  = float(stdv / (meanv + 1e-6))
                mad = float(np.median(np.abs(vals - np.median(vals))))
                abs_slope = abs(slope)

                feats += [delta, meanv, stdv, slope, stable,
                          rng, mean_abs_step, cv, mad, abs_slope]
                stable_hits += stable
                all_steps.extend(np.abs(steps))

            feats += [
                stable_hits / max(len(self.bases), 1),
                float(np.mean(all_steps)) if all_steps else 0.0
            ]
            rows.append(feats)
        return np.array(rows)

    def get_feature_names_out(self, input_features=None):
        return np.array(self.feature_names_)

def build_history_design(df):
    # Rebuild exactly like training
    cat_cols = [c for c in df.columns if df[c].dtype == "object"]
    num_cols = [c for c in df.columns if c not in cat_cols]
    bases_present = [b for b in ALL_BASES if f"{b}_Week1" in df.columns and f"{b}_Week5" in df.columns]
    if not bases_present:
        raise ValueError("No *_Week1..Week5 series found in history dataset.")

    eng = TrendFeatureEngineer(bases_present, eps=1.6)
    trend = eng.fit_transform(df)
    tcols = eng.get_feature_names_out().tolist()

    X_num = df[num_cols].to_numpy(dtype=float)
    out = pd.DataFrame(X_num, columns=num_cols, index=df.index)
    # concat all new columns at once (avoids fragmentation warnings)
    trend_df = pd.DataFrame(trend, columns=tcols, index=df.index)
    out = pd.concat([out, trend_df], axis=1)
    # reattach categoricals
    for c in cat_cols: out[c] = df[c].values
    return out

# ----------------- load models -----------------
if not os.path.exists(hist_model_path):
    raise FileNotFoundError(f"Missing model: {hist_model_path}")
hist_pipe = joblib.load(hist_model_path)

snap_pipe = None
if os.path.exists(snap_model_path):
    snap_pipe = joblib.load(snap_model_path)
else:
    warnings.warn(f"Snapshot regressor not found at: {snap_model_path}. Snapshot inference will be skipped.")

# cutpoints for snapshot classes
c1, c2 = 2.5, 5.6
if os.path.exists(cutpoints_path):
    try:
        with open(cutpoints_path, "r", encoding="utf-8") as f:
            txt = f.read()
        # allow lines like "c1=2.5\nc2=5.6"
        for line in txt.splitlines():
            if line.startswith("c1="): c1 = float(line.split("=",1)[1])
            if line.startswith("c2="): c2 = float(line.split("=",1)[1])
    except Exception:
        pass

# ----------------- SNAPSHOT inference -----------------
snap_records = {}
if snap_pipe is not None and os.path.exists(snap_path):
    snap = pd.read_csv(snap_path)
    pid_col = "Patient_ID" if "Patient_ID" in snap.columns else None

    # use same columns as training: we saved a Pipeline(preproc, model) on raw snapshot features
    drop_cols = [c for c in ["Patient_ID","Patient_Name"] if c in snap.columns]
    Xs = snap.drop(columns=drop_cols, errors="ignore")

    # predict
    y_pred_sev = snap_pipe.predict(Xs)

    # class via cutpoints — produce a NumPy array
    sev_cls = pd.cut(y_pred_sev, bins=[-1e9, c1, c2, 1e9],
                     labels=["Low","Moderate","High"]).astype(str).to_numpy()

    # build per-patient dict (FIX: index sev_cls by [i], not .iloc)
    for i in range(len(snap)):
        pid = str(snap.iloc[i][pid_col]) if pid_col else f"row_{i}"
        snap_records[pid] = {
            "severity_numeric": float(y_pred_sev[i]),
            "severity_label":   str(sev_cls[i])
        }
    print(f"[SNAPSHOT] Inferred {len(snap_records)} rows (classes via cutpoints c1={c1}, c2={c2}).")
else:
    print("[SNAPSHOT] Skipped (no model or file).")

# ----------------- HISTORY inference (rebuild features) -----------------
if not os.path.exists(hist_path):
    raise FileNotFoundError(f"History CSV missing: {hist_path}")
hist = pd.read_csv(hist_path)

if "Trend_Status" in hist.columns:
    y_true_hist = hist["Trend_Status"].astype(str)
else:
    y_true_hist = None

pid_col_h = "Patient_ID" if "Patient_ID" in hist.columns else None
severity_cols = [c for c in hist.columns if str(c).startswith("Severity_Week")]
Xh_raw = hist.drop(columns=[c for c in ["Trend_Status", *severity_cols] if c in hist.columns], errors="ignore")

# rebuild design to match training
Xh = build_history_design(Xh_raw)

# predict labels + probabilities
trend_pred = hist_pipe.predict(Xh)
try:
    P = hist_pipe.predict_proba(Xh)
    classes = hist_pipe.named_steps["model"].classes_
except Exception:
    # some sklearn classifiers expose classes_ at top-level
    P = hist_pipe.predict_proba(Xh)
    classes = hist_pipe.classes_

cls_to_idx = {c:i for i,c in enumerate(classes)}
def row_proba(i):
    return {str(c): float(P[i, cls_to_idx[c]]) for c in classes}

hist_records = {}
for i in range(len(hist)):
    pid = str(hist.iloc[i][pid_col_h]) if pid_col_h else f"row_{i}"
    hist_records[pid] = {
        "trend": str(trend_pred[i]),
        "proba": row_proba(i)
    }

print(f"[HISTORY] Inferred {len(hist_records)} rows with probabilities.")

# ----------------- FUSE + SAVE -----------------
fusion_rows = []
matched = 0
for pid, hrec in hist_records.items():
    srec = snap_records.get(pid)
    out = {
        "patient_id": pid,
        "history": hrec,
        "snapshot": srec if srec else None,
        "alerts": []
    }
    # simple example alert logic (you can adjust later)
    if srec and srec["severity_label"] == "High":
        out["alerts"].append("High current severity")
    if hrec["trend"] == "Worsening":
        out["alerts"].append("Worsening trend over 5 weeks")
    fusion_rows.append(out)
    if srec: matched += 1

print(f"[FUSION] Built {len(fusion_rows)} records. Matched snapshot+history for {matched} patients.")

# write NDJSON for LLM step + a CSV for quick scanning
ndjson_path = os.path.join(OUT_DIR, "fusion.ndjson")
with open(ndjson_path, "w", encoding="utf-8") as f:
    for rec in fusion_rows:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

csv_rows = []
for rec in fusion_rows:
    pid = rec["patient_id"]
    sev_num = rec["snapshot"]["severity_numeric"] if rec["snapshot"] else np.nan
    sev_lab = rec["snapshot"]["severity_label"]   if rec["snapshot"] else ""
    tr_lab  = rec["history"]["trend"]
    pr      = rec["history"]["proba"]
    csv_rows.append({
        "patient_id": pid,
        "severity_numeric": sev_num,
        "severity_label": sev_lab,
        "trend": tr_lab,
        "p_improving": pr.get("Improving", np.nan),
        "p_stable":    pr.get("Stable", np.nan),
        "p_worsening": pr.get("Worsening", np.nan),
        "alerts": "; ".join(rec["alerts"])
    })

fusion_csv_path = os.path.join(OUT_DIR, "fusion.csv")
pd.DataFrame(csv_rows).to_csv(fusion_csv_path, index=False)

print("Saved:")
print(" ", ndjson_path)
print(" ", fusion_csv_path)


AttributeError: 'numpy.ndarray' object has no attribute 'to_numpy'

In [11]:
# fuse_infer_generate.py — unified inference & fusion (snapshot + history)
# - Fix: snapshot class binning uses plain indexing (no .values / .iloc)
# - Rebuilds TrendFeatureEngineer features to match training
# - Keeps Severity_Week* cols and aligns to expected preprocessor inputs
# - Writes outputs/fusion.ndjson and outputs/fusion.csv

import os, json, warnings
import numpy as np
import pandas as pd
import joblib

# ----------------- paths -----------------
ROOT = r"C:\Users\aayus\Downloads\emr-smart"
DATA_DIR   = os.path.join(ROOT, "data")
MODEL_DIR  = os.path.join(ROOT, "models")
OUT_DIR    = os.path.join(ROOT, "outputs")
os.makedirs(OUT_DIR, exist_ok=True)

snap_path = os.path.join(DATA_DIR, "emr_snapshot.csv")
hist_path = os.path.join(DATA_DIR, "emr_history.csv")

snap_model_path = os.path.join(MODEL_DIR, "snapshot_regressor.joblib")
hist_model_path = os.path.join(MODEL_DIR, "best_history_classifier.joblib")
cutpoints_path  = os.path.join(MODEL_DIR, "snapshot_cutpoints.txt")

# ----------------- helper: trend features (match training) -----------------
from sklearn.base import BaseEstimator, TransformerMixin

ALL_BASES = [
    "Blood_Pressure_Systolic","Blood_Pressure_Diastolic","Heart_Rate","Temperature",
    "Respiratory_Rate","Oxygen_Saturation","Blood_Sugar","Cholesterol_Total","Weight","BMI"
]

class TrendFeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self, bases, eps=1.6):
        self.bases = bases
        self.eps   = eps
        self.feature_names_ = None

    def fit(self, X, y=None):
        feats = []
        for b in self.bases:
            feats += [
                f"{b}_delta", f"{b}_mean", f"{b}_std", f"{b}_slope",
                f"{b}_stable", f"{b}_range", f"{b}_mean_abs_step",
                f"{b}_cv", f"{b}_mad", f"{b}_abs_slope"
            ]
        feats += ["global_stability_ratio", "global_mean_abs_step"]
        self.feature_names_ = feats
        return self

    def transform(self, X):
        W = np.array([1,2,3,4,5], dtype=float)
        rows = []
        for _, row in X.iterrows():
            feats, stable_hits, all_steps = [], 0, []
            for b in self.bases:
                vals = np.array([row[f"{b}_Week{i}"] for i in range(1,6)], dtype=float)
                delta = float(vals[-1] - vals[0])
                meanv = float(np.mean(vals))
                stdv  = float(np.std(vals, ddof=0))
                slope = float(np.polyfit(W, vals, 1)[0])
                steps = np.diff(vals)
                max_step = float(np.max(np.abs(steps)))
                stable = 1.0 if max_step < self.eps else 0.0
                rng = float(np.max(vals) - np.min(vals))
                mean_abs_step = float(np.mean(np.abs(steps)))
                cv  = float(stdv / (meanv + 1e-6))
                mad = float(np.median(np.abs(vals - np.median(vals))))
                abs_slope = abs(slope)

                feats += [delta, meanv, stdv, slope, stable,
                          rng, mean_abs_step, cv, mad, abs_slope]
                stable_hits += stable
                all_steps.extend(np.abs(steps))

            feats += [
                stable_hits / max(len(self.bases), 1),
                float(np.mean(all_steps)) if all_steps else 0.0
            ]
            rows.append(feats)
        return np.array(rows)

    def get_feature_names_out(self, input_features=None):
        return np.array(self.feature_names_)

def build_history_design(df):
    """
    Rebuilds the training-time design:
    - Keep all numeric columns (including Severity_Week1..5 etc.)
    - Add engineered trend features
    - Re-attach categoricals
    """
    cat_cols = [c for c in df.columns if df[c].dtype == "object"]
    num_cols = [c for c in df.columns if c not in cat_cols]

    bases_present = [b for b in ALL_BASES if f"{b}_Week1" in df.columns and f"{b}_Week5" in df.columns]
    if not bases_present:
        raise ValueError("No *_Week1..Week5 series found in history dataset.")

    eng = TrendFeatureEngineer(bases_present, eps=1.6)
    trend = eng.fit_transform(df)
    tcols = eng.get_feature_names_out().tolist()

    X_num = df[num_cols].to_numpy(dtype=float)
    out = pd.DataFrame(X_num, columns=num_cols, index=df.index)

    # add all trend columns at once (avoid fragmentation warnings)
    trend_df = pd.DataFrame(trend, columns=tcols, index=df.index)
    out = pd.concat([out, trend_df], axis=1)

    # reattach categoricals
    for c in cat_cols:
        out[c] = df[c].values

    return out

# --------------- align-to-preproc helper (robust to training/inference drift) ---------------
def expected_columns_from_preproc(preproc):
    """Get the exact column names the fitted ColumnTransformer expects."""
    exp = []
    for name, trans, cols in preproc.transformers_:
        if name == "remainder":
            continue
        exp.extend(list(cols))
    # preserve order, unique
    return list(dict.fromkeys(exp))

def align_to_expected_columns(X, preproc):
    """Add any missing expected columns as NaN so ColumnTransformer can run."""
    exp_cols = expected_columns_from_preproc(preproc)
    missing = [c for c in exp_cols if c not in X.columns]
    if missing:
        for c in missing:
            X[c] = np.nan
    # keep column order starting with exp_cols, then any extras (remainder="drop" by default)
    # but ColumnTransformer will select by names, so order isn't critical
    return X

# ----------------- load models -----------------
if not os.path.exists(hist_model_path):
    raise FileNotFoundError(f"Missing model: {hist_model_path}")
hist_pipe = joblib.load(hist_model_path)

snap_pipe = None
if os.path.exists(snap_model_path):
    snap_pipe = joblib.load(snap_model_path)
else:
    warnings.warn(f"Snapshot regressor not found at: {snap_model_path}. Snapshot inference will be skipped.")

# cutpoints for snapshot classes
c1, c2 = 2.5, 5.6
if os.path.exists(cutpoints_path):
    try:
        with open(cutpoints_path, "r", encoding="utf-8") as f:
            for line in f:
                if line.startswith("c1="): c1 = float(line.split("=",1)[1])
                if line.startswith("c2="): c2 = float(line.split("=",1)[1])
    except Exception:
        pass

# ----------------- SNAPSHOT inference -----------------
snap_records = {}
if snap_pipe is not None and os.path.exists(snap_path):
    snap = pd.read_csv(snap_path)
    pid_col = "Patient_ID" if "Patient_ID" in snap.columns else None

    # same inputs as training (we trained on raw snapshot features minus IDs)
    drop_cols = [c for c in ["Patient_ID","Patient_Name"] if c in snap.columns]
    Xs = snap.drop(columns=drop_cols, errors="ignore")

    # predict numeric severity
    y_pred_sev = snap_pipe.predict(Xs)

    # class via cutpoints -> keep array-like, no .values / no .iloc
    sev_cls = pd.cut(
        y_pred_sev, bins=[-1e9, c1, c2, 1e9], labels=["Low","Moderate","High"]
    ).astype(str)

    # build per-patient dict
    for i in range(len(snap)):
        pid = str(snap.iloc[i][pid_col]) if pid_col else f"row_{i}"
        snap_records[pid] = {
            "severity_numeric": float(y_pred_sev[i]),
            "severity_label":   str(sev_cls[i])
        }
    print(f"[SNAPSHOT] Inferred {len(snap_records)} rows (classes via cutpoints c1={c1}, c2={c2}).")
else:
    print("[SNAPSHOT] Skipped (no model or file).")

# ----------------- HISTORY inference (rebuild features) -----------------
if not os.path.exists(hist_path):
    raise FileNotFoundError(f"History CSV missing: {hist_path}")
hist = pd.read_csv(hist_path)

# ground-truth (optional)
y_true_hist = hist["Trend_Status"].astype(str) if "Trend_Status" in hist.columns else None

pid_col_h = "Patient_ID" if "Patient_ID" in hist.columns else None

# IMPORTANT: do NOT drop Severity_Week* (the model likely expects them).
# Drop only label/ID columns that were dropped in training.
cols_to_drop = [c for c in ["Trend_Status", "Patient_ID"] if c in hist.columns]
Xh_raw = hist.drop(columns=cols_to_drop, errors="ignore")

# rebuild design (adds engineered trend features)
Xh = build_history_design(Xh_raw)

# align to the fitted preprocessor's expected input columns (robust to drift)
preproc = hist_pipe.named_steps.get("preprocess", None)
if preproc is not None:
    Xh = align_to_expected_columns(Xh, preproc)

# predict labels + probabilities
trend_pred = hist_pipe.predict(Xh)
try:
    P = hist_pipe.predict_proba(Xh)
    classes = hist_pipe.named_steps["model"].classes_
except Exception:
    P = hist_pipe.predict_proba(Xh)
    classes = hist_pipe.classes_

cls_to_idx = {c:i for i,c in enumerate(classes)}
def row_proba(i):
    return {str(c): float(P[i, cls_to_idx[c]]) for c in classes}

hist_records = {}
for i in range(len(hist)):
    pid = str(hist.iloc[i][pid_col_h]) if pid_col_h else f"row_{i}"
    hist_records[pid] = {
        "trend": str(trend_pred[i]),
        "proba": row_proba(i)
    }

print(f"[HISTORY] Inferred {len(hist_records)} rows with probabilities.")

# ----------------- FUSE + SAVE -----------------
fusion_rows = []
matched = 0
for pid, hrec in hist_records.items():
    srec = snap_records.get(pid)
    out = {
        "patient_id": pid,
        "history": hrec,
        "snapshot": srec if srec else None,
        "alerts": []
    }
    # simple alerts (adjust later as needed)
    if srec and srec["severity_label"] == "High":
        out["alerts"].append("High current severity")
    if hrec["trend"] == "Worsening":
        out["alerts"].append("Worsening trend over 5 weeks")
    fusion_rows.append(out)
    if srec: matched += 1

print(f"[FUSION] Built {len(fusion_rows)} records. Matched snapshot+history for {matched} patients.")

# NDJSON for LLM + CSV for quick scan
ndjson_path = os.path.join(OUT_DIR, "fusion.ndjson")
with open(ndjson_path, "w", encoding="utf-8") as f:
    for rec in fusion_rows:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

csv_rows = []
for rec in fusion_rows:
    pid = rec["patient_id"]
    sev_num = rec["snapshot"]["severity_numeric"] if rec["snapshot"] else np.nan
    sev_lab = rec["snapshot"]["severity_label"]   if rec["snapshot"] else ""
    tr_lab  = rec["history"]["trend"]
    pr      = rec["history"]["proba"]
    csv_rows.append({
        "patient_id": pid,
        "severity_numeric": sev_num,
        "severity_label": sev_lab,
        "trend": tr_lab,
        "p_improving": pr.get("Improving", np.nan),
        "p_stable":    pr.get("Stable", np.nan),
        "p_worsening": pr.get("Worsening", np.nan),
        "alerts": "; ".join(rec["alerts"])
    })

fusion_csv_path = os.path.join(OUT_DIR, "fusion.csv")
pd.DataFrame(csv_rows).to_csv(fusion_csv_path, index=False)

print("Saved:")
print(" ", ndjson_path)
print(" ", fusion_csv_path)


[SNAPSHOT] Inferred 1000 rows (classes via cutpoints c1=2.5, c2=5.6).
[HISTORY] Inferred 1000 rows with probabilities.
[FUSION] Built 1000 records. Matched snapshot+history for 1000 patients.
Saved:
  C:\Users\aayus\Downloads\emr-smart\outputs\fusion.ndjson
  C:\Users\aayus\Downloads\emr-smart\outputs\fusion.csv
