In [1]:
# ============================================
# EMR Trend: FAST constrained boost for 'Stable' (no hurt to others)
# - Lightweight trend features (delta/mean/std/slope + stability flag + range + mean_abs_step)
# - One strong model: HistGradientBoostingClassifier (no CV)
# - class-balanced sample_weight
# - Strict alpha tuning (Stable up; Improving/Worsening recalls cannot drop)
# ============================================

import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support
)
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.base import BaseEstimator, TransformerMixin

# Seed passed to train_test_split and to the classifier so runs are repeatable.
RANDOM_SEED = 42
# Machine-specific absolute path of the weekly-history CSV.
HISTORY_PATH = r"C:\Users\aayus\Downloads\emr_history.csv"
# Output directory; nothing in this cell reads it (no artifacts are written).
OUT_DIR = r"C:\Users\aayus\Downloads"

def prf(y_true, y_pred, title):
    """Print a classification summary for one set of predictions.

    Emits, in order: a banner containing ``title``, overall accuracy,
    weighted- and macro-averaged precision/recall/F1, scikit-learn's
    per-class report, and the confusion matrix. Undefined ratios are scored
    as 0 (``zero_division=0``). Nothing is returned.
    """
    accuracy = accuracy_score(y_true, y_pred)
    # [:3] keeps (precision, recall, f1); the support slot is not used here.
    averaged = {
        mode: precision_recall_fscore_support(
            y_true, y_pred, average=mode, zero_division=0
        )[:3]
        for mode in ("weighted", "macro")
    }
    w_prec, w_rec, w_f1 = averaged["weighted"]
    m_prec, m_rec, m_f1 = averaged["macro"]

    print(f"\n=== {title} ===")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Weighted -> P:{w_prec:.4f} R:{w_rec:.4f} F1:{w_f1:.4f}")
    print(f"Macro    -> P:{m_prec:.4f} R:{m_rec:.4f} F1:{m_f1:.4f}")
    print("\nPer-class report:")
    print(classification_report(y_true, y_pred, zero_division=0))
    print("Confusion matrix (rows=true, cols=pred):\n", confusion_matrix(y_true, y_pred))

class TrendFeatureEngineer(BaseEstimator, TransformerMixin):
    """Derive per-vital trend features from five weekly readings.

    For every base name ``b`` in ``bases`` the input frame must contain the
    columns ``b_Week1`` ... ``b_Week5``. Seven features are produced per base
    (delta, mean, std, slope, stable flag, range, mean absolute step),
    followed by two row-level features: the fraction of bases flagged stable
    and the mean absolute week-to-week step over all bases.

    Parameters
    ----------
    bases : list of str
        Column-name prefixes to process.
    eps : float, default=1.8
        A base is flagged stable when its largest absolute week-to-week step
        is strictly below this threshold.
    """

    def __init__(self, bases, eps=1.8):
        self.bases = bases
        self.eps = eps
        self.feature_names_ = None

    def fit(self, X, y=None):
        """Record the output feature names; nothing is learned from ``X``."""
        suffixes = ("delta", "mean", "std", "slope", "stable", "range", "mean_abs_step")
        names = [f"{b}_{s}" for b in self.bases for s in suffixes]
        names += ["global_stability_ratio", "global_mean_abs_step"]
        self.feature_names_ = names
        return self

    def transform(self, X):
        """Return a float array of shape ``(len(X), 7 * len(bases) + 2)``.

        The computation is vectorised over rows. The slope is the ordinary
        least-squares slope of the readings against week numbers 1..5, in
        closed form (equal to a degree-1 ``np.polyfit`` up to rounding).
        NaN readings propagate to NaN features, with the stable flag set to 0.
        An empty ``bases`` list yields two zero columns, and an empty frame
        yields a 2-D array with zero rows.
        """
        n_rows = len(X)
        weeks = np.arange(1, 6, dtype=float)
        centred = weeks - weeks.mean()
        slope_denom = float(centred @ centred)

        feature_cols = []
        stable_count = np.zeros(n_rows)
        abs_steps = []
        for b in self.bases:
            vals = X[[f"{b}_Week{i}" for i in range(1, 6)]].to_numpy(dtype=float)
            steps = np.abs(np.diff(vals, axis=1))
            stable = (steps.max(axis=1) < self.eps).astype(float)
            feature_cols += [
                vals[:, -1] - vals[:, 0],             # delta: last minus first week
                vals.mean(axis=1),
                vals.std(axis=1, ddof=0),
                # centred weeks sum to zero, so no need to centre vals
                vals @ centred / slope_denom,
                stable,
                vals.max(axis=1) - vals.min(axis=1),  # range
                steps.mean(axis=1),                   # mean absolute step
            ]
            stable_count += stable
            abs_steps.append(steps)

        if abs_steps:
            global_step = np.hstack(abs_steps).mean(axis=1)
        else:
            # No bases: avoid the mean of an empty array (NaN + RuntimeWarning).
            global_step = np.zeros(n_rows)
        feature_cols += [stable_count / max(len(self.bases), 1), global_step]
        return np.column_stack(feature_cols)

    def get_feature_names_out(self, input_features=None):
        """Return the names recorded by ``fit`` as a NumPy array."""
        return np.array(self.feature_names_)

# ---------- Load ----------
print("\n[HISTORY] Loading & preparing data...")

hist = pd.read_csv(HISTORY_PATH)
# Fail fast: the label column is required by everything below.
if "Trend_Status" not in hist.columns:
    raise ValueError("Trend_Status not found in emr_history.csv")

# Labels are cast to str.
# NOTE(review): a missing label would become the literal class "nan" here --
# confirm the CSV has no empty Trend_Status cells.
y = hist["Trend_Status"].astype(str)
# The label, the patient id and every Severity_Week* column are removed from
# the features (presumably to avoid leaking outcome information -- TODO confirm).
severity_cols = [c for c in hist.columns if c.startswith("Severity_Week")]
id_cols = ["Trend_Status", "Patient_ID"]
X_raw = hist.drop(columns=id_cols + severity_cols, errors="ignore").copy()

# Vital-sign prefixes for which weekly columns (<base>_Week1..5) may exist.
ALL_BASES = [
    "Blood_Pressure_Systolic", "Blood_Pressure_Diastolic", "Heart_Rate", "Temperature",
    "Respiratory_Rate", "Oxygen_Saturation", "Blood_Sugar", "Cholesterol_Total", "Weight", "BMI"
]
# Only Week1 and Week5 are checked; Week2-4 are assumed to exist alongside them.
bases_present = [b for b in ALL_BASES if f"{b}_Week1" in X_raw.columns and f"{b}_Week5" in X_raw.columns]

# 70/30 split, stratified on the label.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, train_size=0.7, test_size=0.3, random_state=RANDOM_SEED, stratify=y
)

# ---------- Build design ----------
def build_design(df):
    """Assemble the model input frame: raw numerics + trend features + categoricals.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature frame containing the ``*_Week1..5`` columns for every
        base listed in the module-level ``bases_present``.

    Returns
    -------
    out : pandas.DataFrame
        Columns ordered as numeric raw columns (cast to float), engineered
        trend columns, then categorical columns; indexed like ``df``.
    num_cols, cat_cols, tcols : list of str
        Names of the numeric raw, categorical raw and engineered columns.
    """
    # Anything pandas does not consider numeric goes down the categorical
    # path. Comparing the dtype to "object" misses string/category columns,
    # which would then fail in the float cast below.
    cat_cols = [c for c in df.columns if not pd.api.types.is_numeric_dtype(df[c])]
    num_cols = [c for c in df.columns if c not in cat_cols]
    eng = TrendFeatureEngineer(bases_present, eps=1.8)
    trend = eng.fit_transform(df)
    tcols = eng.get_feature_names_out().tolist()
    # A single concat replaces column-by-column insertion into the frame.
    out = pd.concat(
        [
            pd.DataFrame(df[num_cols].to_numpy(dtype=float), columns=num_cols, index=df.index),
            pd.DataFrame(trend, columns=tcols, index=df.index),
            df[cat_cols],
        ],
        axis=1,
    )
    return out, num_cols, cat_cols, tcols

# Train and test frames are engineered independently; the trend features are
# computed per row, so nothing is fitted on the test split here.
X_tr_df, num_cols_tr, cat_cols_tr, trend_cols = build_design(X_train_raw)
X_te_df, _, _, _ = build_design(X_test_raw)

# Everything that is not categorical (raw numerics + trend features).
num_cols_all = [c for c in X_tr_df.columns if c not in cat_cols_tr]

num_pipe = Pipeline([("imputer", SimpleImputer(strategy="median")),
                     ("scaler", StandardScaler())])
cat_pipe = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")),
                     ("ohe", OneHotEncoder(handle_unknown="ignore"))])
# sparse_threshold=0 forces a dense result. The one-hot branch emits sparse
# blocks, and HistGradientBoostingClassifier rejects sparse input, so the
# default threshold could break the fit when categorical columns dominate.
preproc = ColumnTransformer(
    [("num", num_pipe, num_cols_all),
     ("cat", cat_pipe, cat_cols_tr)],
    sparse_threshold=0,
)

# ---------- Class-balanced weights ----------
# weight(c) = n_samples / (n_classes * count(c)): rarer classes weigh more.
classes, counts = np.unique(y_train, return_counts=True)
class_weight = {c: (len(y_train) / (len(classes) * cnt)) for c, cnt in zip(classes, counts)}
sample_weight = np.array([class_weight[yy] for yy in y_train])

# ---------- One strong model (fast) ----------
clf = HistGradientBoostingClassifier(
    learning_rate=0.08,
    max_iter=500,
    max_depth=8,
    min_samples_leaf=8,
    l2_regularization=0.02,
    random_state=RANDOM_SEED
)

pipe = Pipeline([("preprocess", preproc), ("model", clf)])
# The "model__" prefix routes the weights to the classifier step only.
pipe.fit(X_tr_df, y_train, model__sample_weight=sample_weight)

# ---------- Baseline perf ----------
y_base = pipe.predict(X_te_df)
prf(y_test, y_base, "FAST baseline (HGB)")

# ---------- Strict constrained alpha search (no drop on Improving/Worsening recall) ----------
# Post-hoc boost of the "Stable" probability: multiply it by alpha, renormalise,
# re-predict, and keep the alpha with the best Stable F1 among those that do not
# lower Improving/Worsening recall relative to the baseline predictions.
# NOTE(review): alpha is selected on the same test split that is then reported,
# so the FINAL numbers are optimistically biased -- consider a validation split.
try:
    proba = pipe.predict_proba(X_te_df)
    cls = pipe.named_steps["model"].classes_
    # class label -> column index in proba
    idx = {c: i for i, c in enumerate(cls)}
    stable_idx = idx.get("Stable", None)
    impr_idx = idx.get("Improving", None)  # not used below
    wors_idx = idx.get("Worsening", None)  # not used below

    def recalls(y_true, y_pred):
        """Map each label present in ``y_true`` to its recall under ``y_pred``."""
        labs = np.unique(y_true)
        _, rec, _, _ = precision_recall_fscore_support(y_true, y_pred, labels=labs, average=None, zero_division=0)
        return {l: r for l, r in zip(labs, rec)}

    # Baseline recalls act as floors for the constraint check.
    rec0 = recalls(y_test, y_base)
    rI0, rW0 = rec0.get("Improving", 0.0), rec0.get("Worsening", 0.0)

    def reweight(P, alpha):
        """Return a copy of ``P`` with the Stable column scaled by ``alpha`` and rows renormalised.

        If the model has no "Stable" class the copy is returned unchanged.
        """
        Q = P.copy()
        if stable_idx is not None:
            Q[:, stable_idx] *= alpha
            Q = Q / Q.sum(axis=1, keepdims=True)
        return Q

    best = None  # (StableF1, StableRecall, MacroF1, alpha, y_hat)
    for a in [1.0, 1.2, 1.4, 1.6, 1.8, 2.0]:
        Q = reweight(proba, a)
        y_hat = cls[Q.argmax(axis=1)]
        # constraints: reject alpha if either recall falls below its baseline
        # (1e-9 is a float-comparison tolerance)
        rc = recalls(y_test, y_hat)
        if rc.get("Improving", 0.0) + 1e-9 < rI0 or rc.get("Worsening", 0.0) + 1e-9 < rW0:
            continue
        # objective: Stable F1, then Stable recall, then macro F1
        labels = np.unique(y_test)
        _, _, f1s, _ = precision_recall_fscore_support(y_test, y_hat, labels=labels, average=None, zero_division=0)
        f1_map = {l: f for l, f in zip(labels, f1s)}
        f1S = f1_map.get("Stable", 0.0)
        _, _, f1_macro = precision_recall_fscore_support(y_test, y_hat, average="macro", zero_division=0)[:3]
        # Tuples compare left to right. The alphas are distinct, so the
        # comparison is always decided before reaching the y_hat array.
        cand = (f1S, rc.get("Stable", 0.0), f1_macro, a, y_hat)
        if (best is None) or (cand > best):
            best = cand

    if best is not None:
        f1S, rS, f1M, a_star, y_final = best
        prf(y_test, y_final, f"FINAL (alpha={a_star}) — prioritizing Stable w/ constraints")
    else:
        print("No alpha met the strict constraints; keeping baseline.")
except Exception as e:
    # Best-effort step: any failure is reported and the baseline report stands.
    print("Alpha search skipped due to:", e)

print("\nDone. Artifacts are not saved in this fast script for speed.")



[HISTORY] Loading & preparing data...

=== FAST baseline (HGB) ===
Accuracy: 0.7967
Weighted -> P:0.7635 R:0.7967 F1:0.7663
Macro    -> P:0.6912 R:0.6403 F1:0.6338

Per-class report:
              precision    recall  f1-score   support

   Improving       0.82      0.91      0.86       132
      Stable       0.45      0.13      0.20        39
   Worsening       0.80      0.88      0.84       129

    accuracy                           0.80       300
   macro avg       0.69      0.64      0.63       300
weighted avg       0.76      0.80      0.77       300

Confusion matrix (rows=true, cols=pred):
 [[120   4   8]
 [ 14   5  20]
 [ 13   2 114]]

=== FINAL (alpha=2.0) — prioritizing Stable w/ constraints ===
Accuracy: 0.8000
Weighted -> P:0.7719 R:0.8000 F1:0.7722
Macro    -> P:0.7082 R:0.6489 F1:0.6466

Per-class report:
              precision    recall  f1-score   support

   Improving       0.82      0.91      0.86       132
      Stable       0.50      0.15      0.24        39
   W

In [2]:
# ============================================
# EMR Snapshot + History — Fast Training & Report
# - Snapshot: severity regression (GBR) + optional class cutpoints (Low/Moderate/High)
# - History : trend classification (HGB) with constrained alpha tuning for Stable
# - Single run prints both reports
# Paths (Windows):
#   - SNAPSHOT CSV: C:\Users\aayus\Downloads\emr_snapshot.csv
#   - HISTORY  CSV: C:\Users\aayus\Downloads\emr_history.csv
# ============================================

import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support, mean_absolute_error, r2_score
)
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.base import BaseEstimator, TransformerMixin

# Seed passed to both train/test splits and both models.
RANDOM_SEED = 42
# Raw strings keep backslashes literally, so each separator is written once
# (a doubled "\\" inside r"..." yields two backslashes in the path).
SNAPSHOT_PATH = r"C:\Users\aayus\Downloads\emr_snapshot.csv"
HISTORY_PATH  = r"C:\Users\aayus\Downloads\emr_history.csv"

# ---------- common helpers ----------
def prf(y_true, y_pred, title):
    """Print a classification summary for one set of predictions.

    Emits, in order: a banner containing ``title``, overall accuracy,
    weighted- and macro-averaged precision/recall/F1, scikit-learn's
    per-class report, and the confusion matrix. Undefined ratios are scored
    as 0 (``zero_division=0``). Nothing is returned.
    """
    accuracy = accuracy_score(y_true, y_pred)
    # [:3] keeps (precision, recall, f1); the support slot is not used here.
    averaged = {
        mode: precision_recall_fscore_support(
            y_true, y_pred, average=mode, zero_division=0
        )[:3]
        for mode in ("weighted", "macro")
    }
    w_prec, w_rec, w_f1 = averaged["weighted"]
    m_prec, m_rec, m_f1 = averaged["macro"]

    print(f"\n=== {title} ===")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Weighted -> P:{w_prec:.4f} R:{w_rec:.4f} F1:{w_f1:.4f}")
    print(f"Macro    -> P:{m_prec:.4f} R:{m_rec:.4f} F1:{m_f1:.4f}")
    print("\nPer-class report:")
    print(classification_report(y_true, y_pred, zero_division=0))
    print("Confusion matrix (rows=true, cols=pred):\n", confusion_matrix(y_true, y_pred))

# ---------- 1) SNAPSHOT: severity regression ----------
print("[SNAPSHOT] Loading & preparing data…")
snap = pd.read_csv(SNAPSHOT_PATH)
# Fail fast: the regression target must be present under this exact name.
if "Severity" not in snap.columns:
    raise ValueError("Severity column missing in emr_snapshot.csv")

# Features/target (target is cast to float, so it must be numeric)
snap_y = snap["Severity"].astype(float)
# drop obvious IDs & the target
drop_cols = [c for c in ["Patient_ID", "Patient_Name", "Severity"] if c in snap.columns]
snap_X = snap.drop(columns=drop_cols, errors="ignore")

# basic preprocessing: object-dtype columns are one-hot encoded, all others
# are median-imputed and standardised
cat_cols_s = [c for c in snap_X.columns if snap_X[c].dtype == "object"]
num_cols_s = [c for c in snap_X.columns if c not in cat_cols_s]

preproc_snap = ColumnTransformer([
    ("num", Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]), num_cols_s),
    ("cat", Pipeline([("imputer", SimpleImputer(strategy="most_frequent")), ("ohe", OneHotEncoder(handle_unknown="ignore"))]), cat_cols_s)
])

# 70/30 split (not stratified: the target is continuous)
X_tr_s, X_te_s, y_tr_s, y_te_s = train_test_split(snap_X, snap_y, train_size=0.7, test_size=0.3,
                                                  random_state=RANDOM_SEED)

# fast & strong regressor
# (this import repeats the one at the top of the cell)
from sklearn.ensemble import GradientBoostingRegressor
sreg = GradientBoostingRegressor(
    n_estimators=400,
    learning_rate=0.06,
    max_depth=3,
    min_samples_leaf=5,
    subsample=0.9,
    random_state=RANDOM_SEED
)

pipe_s = Pipeline([("preprocess", preproc_snap), ("model", sreg)])
pipe_s.fit(X_tr_s, y_tr_s)

# evaluate regression on the held-out 30%
pred_s = pipe_s.predict(X_te_s)
mae  = mean_absolute_error(y_te_s, pred_s)
rmse = np.sqrt(((pred_s - y_te_s)**2).mean())
r2   = r2_score(y_te_s, pred_s)
print("\n=== SNAPSHOT (Severity regression) ===")
print(f"Train size: {len(X_tr_s)} | Test size: {len(X_te_s)}")
print(f"MAE : {mae:.3f}\nRMSE: {rmse:.3f}\nR^2 : {r2:.3f}")

# optional: map to classes with fixed cutpoints for quick triage
# The same bins are applied to true and predicted scores, so the class report
# below measures agreement after discretisation.
# NOTE(review): the cutpoints 2.5 / 5.6 are hard-coded; their origin is not shown here.
c1, c2 = 2.5, 5.6
snap_cls_true = pd.cut(y_te_s, bins=[-1e9,c1,c2,1e9], labels=["Low","Moderate","High"]).astype(str)
snap_cls_pred = pd.cut(pred_s, bins=[-1e9,c1,c2,1e9], labels=["Low","Moderate","High"]).astype(str)
prf(snap_cls_true, snap_cls_pred, title=f"SNAPSHOT classes via GBR cutpoints (c1={c1}, c2={c2})")

# ---------- 2) HISTORY: trend classification ----------
print("\n[HISTORY] Loading & preparing data…")
hist = pd.read_csv(HISTORY_PATH)
# Fail fast: the label column is required by everything below.
if "Trend_Status" not in hist.columns:
    raise ValueError("Trend_Status not found in emr_history.csv")

# NOTE(review): astype(str) would turn a missing label into the class "nan".
y = hist["Trend_Status"].astype(str)
# The label, the patient id and every Severity_Week* column are removed from
# the features (presumably to avoid leaking outcome information -- TODO confirm).
severity_cols = [c for c in hist.columns if c.startswith("Severity_Week")]
id_cols = ["Trend_Status", "Patient_ID"]
X_raw = hist.drop(columns=id_cols + severity_cols, errors="ignore").copy()

# Vital-sign prefixes for which weekly columns (<base>_Week1..5) may exist.
ALL_BASES = [
    "Blood_Pressure_Systolic","Blood_Pressure_Diastolic","Heart_Rate","Temperature",
    "Respiratory_Rate","Oxygen_Saturation","Blood_Sugar","Cholesterol_Total","Weight","BMI"
]
# Only Week1 and Week5 are checked; Week2-4 are assumed to exist alongside them.
bases_present = [b for b in ALL_BASES if f"{b}_Week1" in X_raw.columns and f"{b}_Week5" in X_raw.columns]

class TrendFeatureEngineer(BaseEstimator, TransformerMixin):
    """Derive per-vital trend features from five weekly readings.

    For every base name ``b`` in ``bases`` the input frame must contain the
    columns ``b_Week1`` ... ``b_Week5``. Seven features are produced per base
    (delta, mean, std, slope, stable flag, range, mean absolute step),
    followed by two row-level features: the fraction of bases flagged stable
    and the mean absolute week-to-week step over all bases.

    Parameters
    ----------
    bases : list of str
        Column-name prefixes to process.
    eps : float, default=1.6
        A base is flagged stable when its largest absolute week-to-week step
        is strictly below this threshold.
    """

    def __init__(self, bases, eps=1.6):
        self.bases = bases
        self.eps = eps
        self.feature_names_ = None

    def fit(self, X, y=None):
        """Record the output feature names; nothing is learned from ``X``."""
        suffixes = ("delta", "mean", "std", "slope", "stable", "range", "mean_abs_step")
        names = [f"{b}_{s}" for b in self.bases for s in suffixes]
        names += ["global_stability_ratio", "global_mean_abs_step"]
        self.feature_names_ = names
        return self

    def transform(self, X):
        """Return a float array of shape ``(len(X), 7 * len(bases) + 2)``.

        The computation is vectorised over rows instead of looping with
        ``iterrows``. The slope is the ordinary least-squares slope of the
        readings against week numbers 1..5, in closed form (equal to a
        degree-1 ``np.polyfit`` up to rounding). NaN readings propagate to
        NaN features, with the stable flag set to 0. An empty frame yields a
        2-D array with zero rows.
        """
        n_rows = len(X)
        weeks = np.arange(1, 6, dtype=float)
        centred = weeks - weeks.mean()
        slope_denom = float(centred @ centred)

        feature_cols = []
        stable_count = np.zeros(n_rows)
        abs_steps = []
        for b in self.bases:
            vals = X[[f"{b}_Week{i}" for i in range(1, 6)]].to_numpy(dtype=float)
            steps = np.abs(np.diff(vals, axis=1))
            stable = (steps.max(axis=1) < self.eps).astype(float)
            feature_cols += [
                vals[:, -1] - vals[:, 0],             # delta: last minus first week
                vals.mean(axis=1),
                vals.std(axis=1, ddof=0),
                # centred weeks sum to zero, so no need to centre vals
                vals @ centred / slope_denom,
                stable,
                vals.max(axis=1) - vals.min(axis=1),  # range
                steps.mean(axis=1),                   # mean absolute step
            ]
            stable_count += stable
            abs_steps.append(steps)

        if abs_steps:
            global_step = np.hstack(abs_steps).mean(axis=1)
        else:
            # No bases: report 0.0, as the row-wise version did.
            global_step = np.zeros(n_rows)
        feature_cols += [stable_count / max(len(self.bases), 1), global_step]
        return np.column_stack(feature_cols)

    def get_feature_names_out(self, input_features=None):
        """Return the names recorded by ``fit`` as a NumPy array."""
        return np.array(self.feature_names_)

# 70/30 split of the history data, stratified on the trend label.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, train_size=0.7, test_size=0.3, random_state=RANDOM_SEED, stratify=y
)

# build design
def build_design(df):
    """Assemble the model input frame: raw numerics + trend features + categoricals.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature frame containing the ``*_Week1..5`` columns for every
        base listed in the module-level ``bases_present``.

    Returns
    -------
    out : pandas.DataFrame
        Columns ordered as numeric raw columns (cast to float), engineered
        trend columns, then categorical columns; indexed like ``df``.
    num_cols, cat_cols, tcols : list of str
        Names of the numeric raw, categorical raw and engineered columns.
    """
    # Anything pandas does not consider numeric goes down the categorical
    # path. Comparing the dtype to "object" misses string/category columns,
    # which would then fail in the float cast below.
    cat_cols = [c for c in df.columns if not pd.api.types.is_numeric_dtype(df[c])]
    num_cols = [c for c in df.columns if c not in cat_cols]
    eng = TrendFeatureEngineer(bases_present, eps=1.6)
    trend = eng.fit_transform(df)
    tcols = eng.get_feature_names_out().tolist()
    # A single concat replaces column-by-column insertion into the frame.
    out = pd.concat(
        [
            pd.DataFrame(df[num_cols].to_numpy(dtype=float), columns=num_cols, index=df.index),
            pd.DataFrame(trend, columns=tcols, index=df.index),
            df[cat_cols],
        ],
        axis=1,
    )
    return out, num_cols, cat_cols, tcols

# Train and test frames are engineered independently; the trend features are
# computed per row, so nothing is fitted on the test split here.
X_tr_df, num_cols_tr, cat_cols_tr, trend_cols = build_design(X_train_raw)
X_te_df, _, _, _ = build_design(X_test_raw)
# Everything that is not categorical (raw numerics + trend features).
num_cols_all = [c for c in X_tr_df.columns if c not in cat_cols_tr]

num_pipe = Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())])
cat_pipe = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")), ("ohe", OneHotEncoder(handle_unknown="ignore"))])
# sparse_threshold=0 forces a dense result. The one-hot branch emits sparse
# blocks, and HistGradientBoostingClassifier rejects sparse input, so the
# default threshold could break the fit when categorical columns dominate.
preproc = ColumnTransformer(
    [("num", num_pipe, num_cols_all), ("cat", cat_pipe, cat_cols_tr)],
    sparse_threshold=0,
)

# class-balanced sample weights (mild Stable boost)
# base weight(c) = n_samples / (n_classes * count(c)); "Stable" gets an extra 1.8x.
classes, counts = np.unique(y_train, return_counts=True)
base_w = {c: (len(y_train) / (len(classes) * cnt)) for c, cnt in zip(classes, counts)}
class_weight = {c: (base_w[c] * (1.8 if c == "Stable" else 1.0)) for c in classes}
sample_weight = np.array([class_weight[yy] for yy in y_train])

hgb = HistGradientBoostingClassifier(
    learning_rate=0.10, max_iter=500, max_depth=8, min_samples_leaf=8,
    l2_regularization=0.02, random_state=RANDOM_SEED
)
pipe_h = Pipeline([("preprocess", preproc), ("model", hgb)])
# The "model__" prefix routes the weights to the classifier step only.
pipe_h.fit(X_tr_df, y_train, model__sample_weight=sample_weight)

y_base = pipe_h.predict(X_te_df)
prf(y_test, y_base, "HISTORY baseline (HGB, weighted)")

# constrained alpha to improve Stable w/out hurting others
proba = pipe_h.predict_proba(X_te_df)
cls = pipe_h.named_steps["model"].classes_
# class label -> column index in proba
idx = {c: i for i, c in enumerate(cls)}
stable_idx = idx.get("Stable", None)
impr_idx = idx.get("Improving", None)
wors_idx = idx.get("Worsening", None)

def recalls(y_true, y_pred):
    """Map each label that occurs in ``y_true`` to its recall under ``y_pred``."""
    present = np.unique(y_true)
    scores = precision_recall_fscore_support(
        y_true, y_pred, labels=present, average=None, zero_division=0
    )
    # scores is (precision, recall, fscore, support); position 1 is recall.
    return dict(zip(present, scores[1]))

# Baseline recalls act as floors for the constraint check in the loop below.
rec0 = recalls(y_test, y_base)
rI0, rW0 = rec0.get("Improving",0.0), rec0.get("Worsening",0.0)

# Candidate multipliers for the Stable probability (1.0 leaves proba unchanged).
ALPHAS=[1.0,1.2,1.4,1.6,1.8,2.0]

def reweight(P, a):
    """Return a copy of ``P`` with the Stable column scaled by ``a`` and rows renormalised.

    Reads the module-level ``stable_idx``; if it is None the copy is returned unchanged.
    """
    Q=P.copy()
    if stable_idx is not None:
        Q[:, stable_idx]*=a
        Q = Q/ Q.sum(axis=1, keepdims=True)
    return Q

# NOTE(review): alpha is selected on the same test split that is then reported,
# so the final numbers are optimistically biased -- consider a validation split.
best=None
for a in ALPHAS:
    Q=reweight(proba,a)
    y_hat=cls[Q.argmax(axis=1)]
    rc=recalls(y_test,y_hat)
    # Reject alpha if Improving or Worsening recall falls below its baseline
    # (1e-9 is a float-comparison tolerance).
    if rc.get("Improving",0.0)+1e-9 < rI0 or rc.get("Worsening",0.0)+1e-9 < rW0:
        continue
    labels=np.unique(y_test)
    _, _, f1s, _ = precision_recall_fscore_support(y_test, y_hat, labels=labels, average=None, zero_division=0)
    f1_map={l:f for l,f in zip(labels,f1s)}
    f1S=f1_map.get("Stable",0.0)
    _, _, f1_macro = precision_recall_fscore_support(y_test, y_hat, average="macro", zero_division=0)[:3]
    # Ranking key, compared left to right: Stable F1, Stable recall, macro F1,
    # alpha. The alphas are distinct, so the y_hat array is never compared.
    cand=(f1S, rc.get("Stable",0.0), f1_macro, a, y_hat)
    if (best is None) or (cand>best): best=cand

if best is not None:
    f1S, rS, f1M, a_star, y_final = best
    prf(y_test, y_final, f"HISTORY final (alpha={a_star})")
else:
    print("No alpha met constraints; showing baseline only.")

print("\nDone.")


[SNAPSHOT] Loading & preparing data…


ValueError: Severity column missing in emr_snapshot.csv

In [3]:
# ============================================
# EMR Snapshot + History — Fast Training & Report
# - Snapshot: severity regression (GBR) + optional class cutpoints (Low/Moderate/High)
# - History : trend classification (HGB) with constrained alpha tuning for Stable
# ============================================

import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support, mean_absolute_error, r2_score
)
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.base import BaseEstimator, TransformerMixin

# Seed passed to both train/test splits and both models.
RANDOM_SEED = 42
# Machine-specific absolute paths of the two input CSVs.
SNAPSHOT_PATH = r"C:\Users\aayus\Downloads\emr_snapshot.csv"
HISTORY_PATH  = r"C:\Users\aayus\Downloads\emr_history.csv"

# ---------- common helpers ----------
def prf(y_true, y_pred, title):
    """Print a classification summary for one set of predictions.

    Emits, in order: a banner containing ``title``, overall accuracy,
    weighted- and macro-averaged precision/recall/F1, scikit-learn's
    per-class report, and the confusion matrix. Undefined ratios are scored
    as 0 (``zero_division=0``). Nothing is returned.
    """
    accuracy = accuracy_score(y_true, y_pred)
    # [:3] keeps (precision, recall, f1); the support slot is not used here.
    averaged = {
        mode: precision_recall_fscore_support(
            y_true, y_pred, average=mode, zero_division=0
        )[:3]
        for mode in ("weighted", "macro")
    }
    w_prec, w_rec, w_f1 = averaged["weighted"]
    m_prec, m_rec, m_f1 = averaged["macro"]

    print(f"\n=== {title} ===")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Weighted -> P:{w_prec:.4f} R:{w_rec:.4f} F1:{w_f1:.4f}")
    print(f"Macro    -> P:{m_prec:.4f} R:{m_rec:.4f} F1:{m_f1:.4f}")
    print("\nPer-class report:")
    print(classification_report(y_true, y_pred, zero_division=0))
    print("Confusion matrix (rows=true, cols=pred):\n", confusion_matrix(y_true, y_pred))

# ---------- 1) SNAPSHOT: severity regression ----------
print("[SNAPSHOT] Loading & preparing data…")
snap = pd.read_csv(SNAPSHOT_PATH)
# Fail fast: the regression target must be present under this exact name.
if "Severity" not in snap.columns:
    raise ValueError("Severity column missing in emr_snapshot.csv")

# Features/target (target is cast to float, so it must be numeric)
snap_y = snap["Severity"].astype(float)
# drop obvious IDs & the target
drop_cols = [c for c in ["Patient_ID", "Patient_Name", "Severity"] if c in snap.columns]
snap_X = snap.drop(columns=drop_cols, errors="ignore")

# basic preprocessing: object-dtype columns are one-hot encoded, all others
# are median-imputed and standardised
cat_cols_s = [c for c in snap_X.columns if snap_X[c].dtype == "object"]
num_cols_s = [c for c in snap_X.columns if c not in cat_cols_s]

preproc_snap = ColumnTransformer([
    ("num", Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]), num_cols_s),
    ("cat", Pipeline([("imputer", SimpleImputer(strategy="most_frequent")), ("ohe", OneHotEncoder(handle_unknown="ignore"))]), cat_cols_s)
])

# 70/30 split (not stratified: the target is continuous)
X_tr_s, X_te_s, y_tr_s, y_te_s = train_test_split(snap_X, snap_y, train_size=0.7, test_size=0.3,
                                                  random_state=RANDOM_SEED)

# fast & strong regressor
sreg = GradientBoostingRegressor(
    n_estimators=400,
    learning_rate=0.06,
    max_depth=3,
    min_samples_leaf=5,
    subsample=0.9,
    random_state=RANDOM_SEED
)

pipe_s = Pipeline([("preprocess", preproc_snap), ("model", sreg)])
pipe_s.fit(X_tr_s, y_tr_s)

# evaluate regression on the held-out 30%
pred_s = pipe_s.predict(X_te_s)
mae  = mean_absolute_error(y_te_s, pred_s)
rmse = np.sqrt(((pred_s - y_te_s)**2).mean())
r2   = r2_score(y_te_s, pred_s)
print("\n=== SNAPSHOT (Severity regression) ===")
print(f"Train size: {len(X_tr_s)} | Test size: {len(X_te_s)}")
print(f"MAE : {mae:.3f}\nRMSE: {rmse:.3f}\nR^2 : {r2:.3f}")

# optional: map to classes with fixed cutpoints for quick triage
# The same bins are applied to true and predicted scores, so the class report
# below measures agreement after discretisation.
# NOTE(review): the cutpoints 2.5 / 5.6 are hard-coded; their origin is not shown here.
c1, c2 = 2.5, 5.6
snap_cls_true = pd.cut(y_te_s, bins=[-1e9,c1,c2,1e9], labels=["Low","Moderate","High"]).astype(str)
snap_cls_pred = pd.cut(pred_s, bins=[-1e9,c1,c2,1e9], labels=["Low","Moderate","High"]).astype(str)
prf(snap_cls_true, snap_cls_pred, title=f"SNAPSHOT classes via GBR cutpoints (c1={c1}, c2={c2})")

# ---------- 2) HISTORY: trend classification ----------
print("\n[HISTORY] Loading & preparing data…")
hist = pd.read_csv(HISTORY_PATH)
# Fail fast: the label column is required by everything below.
if "Trend_Status" not in hist.columns:
    raise ValueError("Trend_Status not found in emr_history.csv")

# NOTE(review): astype(str) would turn a missing label into the class "nan".
y = hist["Trend_Status"].astype(str)
# The label, the patient id and every Severity_Week* column are removed from
# the features (presumably to avoid leaking outcome information -- TODO confirm).
severity_cols = [c for c in hist.columns if c.startswith("Severity_Week")]
id_cols = ["Trend_Status", "Patient_ID"]
X_raw = hist.drop(columns=id_cols + severity_cols, errors="ignore").copy()

# Vital-sign prefixes for which weekly columns (<base>_Week1..5) may exist.
ALL_BASES = [
    "Blood_Pressure_Systolic","Blood_Pressure_Diastolic","Heart_Rate","Temperature",
    "Respiratory_Rate","Oxygen_Saturation","Blood_Sugar","Cholesterol_Total","Weight","BMI"
]
# Only Week1 and Week5 are checked; Week2-4 are assumed to exist alongside them.
bases_present = [b for b in ALL_BASES if f"{b}_Week1" in X_raw.columns and f"{b}_Week5" in X_raw.columns]

class TrendFeatureEngineer(BaseEstimator, TransformerMixin):
    """Derive per-vital trend features from five weekly readings.

    For every base name ``b`` in ``bases`` the input frame must contain the
    columns ``b_Week1`` ... ``b_Week5``. Seven features are produced per base
    (delta, mean, std, slope, stable flag, range, mean absolute step),
    followed by two row-level features: the fraction of bases flagged stable
    and the mean absolute week-to-week step over all bases.

    Parameters
    ----------
    bases : list of str
        Column-name prefixes to process.
    eps : float, default=1.6
        A base is flagged stable when its largest absolute week-to-week step
        is strictly below this threshold.
    """

    def __init__(self, bases, eps=1.6):
        self.bases = bases
        self.eps = eps
        self.feature_names_ = None

    def fit(self, X, y=None):
        """Record the output feature names; nothing is learned from ``X``."""
        suffixes = ("delta", "mean", "std", "slope", "stable", "range", "mean_abs_step")
        names = [f"{b}_{s}" for b in self.bases for s in suffixes]
        names += ["global_stability_ratio", "global_mean_abs_step"]
        self.feature_names_ = names
        return self

    def transform(self, X):
        """Return a float array of shape ``(len(X), 7 * len(bases) + 2)``.

        The computation is vectorised over rows instead of looping with
        ``iterrows``. The slope is the ordinary least-squares slope of the
        readings against week numbers 1..5, in closed form (equal to a
        degree-1 ``np.polyfit`` up to rounding). NaN readings propagate to
        NaN features, with the stable flag set to 0. An empty frame yields a
        2-D array with zero rows.
        """
        n_rows = len(X)
        weeks = np.arange(1, 6, dtype=float)
        centred = weeks - weeks.mean()
        slope_denom = float(centred @ centred)

        feature_cols = []
        stable_count = np.zeros(n_rows)
        abs_steps = []
        for b in self.bases:
            vals = X[[f"{b}_Week{i}" for i in range(1, 6)]].to_numpy(dtype=float)
            steps = np.abs(np.diff(vals, axis=1))
            stable = (steps.max(axis=1) < self.eps).astype(float)
            feature_cols += [
                vals[:, -1] - vals[:, 0],             # delta: last minus first week
                vals.mean(axis=1),
                vals.std(axis=1, ddof=0),
                # centred weeks sum to zero, so no need to centre vals
                vals @ centred / slope_denom,
                stable,
                vals.max(axis=1) - vals.min(axis=1),  # range
                steps.mean(axis=1),                   # mean absolute step
            ]
            stable_count += stable
            abs_steps.append(steps)

        if abs_steps:
            global_step = np.hstack(abs_steps).mean(axis=1)
        else:
            # No bases: report 0.0, as the row-wise version did.
            global_step = np.zeros(n_rows)
        feature_cols += [stable_count / max(len(self.bases), 1), global_step]
        return np.column_stack(feature_cols)

    def get_feature_names_out(self, input_features=None):
        """Return the names recorded by ``fit`` as a NumPy array."""
        return np.array(self.feature_names_)

# 70/30 split of the history data, stratified on the trend label.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, train_size=0.7, test_size=0.3, random_state=RANDOM_SEED, stratify=y
)

# build design
def build_design(df):
    """Assemble the model input frame: raw numerics + trend features + categoricals.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature frame containing the ``*_Week1..5`` columns for every
        base listed in the module-level ``bases_present``.

    Returns
    -------
    out : pandas.DataFrame
        Columns ordered as numeric raw columns (cast to float), engineered
        trend columns, then categorical columns; indexed like ``df``.
    num_cols, cat_cols, tcols : list of str
        Names of the numeric raw, categorical raw and engineered columns.
    """
    # Anything pandas does not consider numeric goes down the categorical
    # path. Comparing the dtype to "object" misses string/category columns,
    # which would then fail in the float cast below.
    cat_cols = [c for c in df.columns if not pd.api.types.is_numeric_dtype(df[c])]
    num_cols = [c for c in df.columns if c not in cat_cols]
    eng = TrendFeatureEngineer(bases_present, eps=1.6)
    trend = eng.fit_transform(df)
    tcols = eng.get_feature_names_out().tolist()
    # A single concat replaces column-by-column insertion into the frame.
    out = pd.concat(
        [
            pd.DataFrame(df[num_cols].to_numpy(dtype=float), columns=num_cols, index=df.index),
            pd.DataFrame(trend, columns=tcols, index=df.index),
            df[cat_cols],
        ],
        axis=1,
    )
    return out, num_cols, cat_cols, tcols

# Train and test frames are engineered independently; the trend features are
# computed per row, so nothing is fitted on the test split here.
X_tr_df, num_cols_tr, cat_cols_tr, trend_cols = build_design(X_train_raw)
X_te_df, _, _, _ = build_design(X_test_raw)
# Everything that is not categorical (raw numerics + trend features).
num_cols_all = [c for c in X_tr_df.columns if c not in cat_cols_tr]

num_pipe = Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())])
cat_pipe = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")), ("ohe", OneHotEncoder(handle_unknown="ignore"))])
# sparse_threshold=0 forces a dense result. The one-hot branch emits sparse
# blocks, and HistGradientBoostingClassifier rejects sparse input, so the
# default threshold could break the fit when categorical columns dominate.
preproc = ColumnTransformer(
    [("num", num_pipe, num_cols_all), ("cat", cat_pipe, cat_cols_tr)],
    sparse_threshold=0,
)

# class-balanced sample weights (mild Stable boost)
# base weight(c) = n_samples / (n_classes * count(c)); "Stable" gets an extra 1.8x.
classes, counts = np.unique(y_train, return_counts=True)
base_w = {c: (len(y_train) / (len(classes) * cnt)) for c, cnt in zip(classes, counts)}
class_weight = {c: (base_w[c] * (1.8 if c == "Stable" else 1.0)) for c in classes}
sample_weight = np.array([class_weight[yy] for yy in y_train])

hgb = HistGradientBoostingClassifier(
    learning_rate=0.10, max_iter=500, max_depth=8, min_samples_leaf=8,
    l2_regularization=0.02, random_state=RANDOM_SEED
)
pipe_h = Pipeline([("preprocess", preproc), ("model", hgb)])
# The "model__" prefix routes the weights to the classifier step only.
pipe_h.fit(X_tr_df, y_train, model__sample_weight=sample_weight)

y_base = pipe_h.predict(X_te_df)
prf(y_test, y_base, "HISTORY baseline (HGB, weighted)")

# constrained alpha to improve Stable w/out hurting others
proba = pipe_h.predict_proba(X_te_df)
cls = pipe_h.named_steps["model"].classes_
# class label -> column index in proba
idx = {c: i for i, c in enumerate(cls)}
stable_idx = idx.get("Stable", None)
impr_idx = idx.get("Improving", None)
wors_idx = idx.get("Worsening", None)

def recalls(y_true, y_pred):
    """Map each label that occurs in ``y_true`` to its recall under ``y_pred``."""
    present = np.unique(y_true)
    scores = precision_recall_fscore_support(
        y_true, y_pred, labels=present, average=None, zero_division=0
    )
    # scores is (precision, recall, fscore, support); position 1 is recall.
    return dict(zip(present, scores[1]))

# Baseline recalls act as floors for the constraint check in the loop below.
rec0 = recalls(y_test, y_base)
rI0, rW0 = rec0.get("Improving",0.0), rec0.get("Worsening",0.0)

# Candidate multipliers for the Stable probability (1.0 leaves proba unchanged).
ALPHAS=[1.0,1.2,1.4,1.6,1.8,2.0]

def reweight(P, a):
    """Return a copy of ``P`` with the Stable column scaled by ``a`` and rows renormalised.

    Reads the module-level ``stable_idx``; if it is None the copy is returned unchanged.
    """
    Q=P.copy()
    if stable_idx is not None:
        Q[:, stable_idx]*=a
        Q = Q/ Q.sum(axis=1, keepdims=True)
    return Q

# NOTE(review): alpha is selected on the same test split that is then reported,
# so the final numbers are optimistically biased -- consider a validation split.
best=None
for a in ALPHAS:
    Q=reweight(proba,a)
    y_hat=cls[Q.argmax(axis=1)]
    rc=recalls(y_test,y_hat)
    # Reject alpha if Improving or Worsening recall falls below its baseline
    # (1e-9 is a float-comparison tolerance).
    if rc.get("Improving",0.0)+1e-9 < rI0 or rc.get("Worsening",0.0)+1e-9 < rW0:
        continue
    labels=np.unique(y_test)
    _, _, f1s, _ = precision_recall_fscore_support(y_test, y_hat, labels=labels, average=None, zero_division=0)
    f1_map={l:f for l,f in zip(labels,f1s)}
    f1S=f1_map.get("Stable",0.0)
    _, _, f1_macro = precision_recall_fscore_support(y_test, y_hat, average="macro", zero_division=0)[:3]
    # Ranking key, compared left to right: Stable F1, Stable recall, macro F1,
    # alpha. The alphas are distinct, so the y_hat array is never compared.
    cand=(f1S, rc.get("Stable",0.0), f1_macro, a, y_hat)
    if (best is None) or (cand>best): best=cand

if best is not None:
    f1S, rS, f1M, a_star, y_final = best
    prf(y_test, y_final, f"HISTORY final (alpha={a_star})")
else:
    print("No alpha met constraints; showing baseline only.")

print("\nDone.")


[SNAPSHOT] Loading & preparing data…


ValueError: Severity column missing in emr_snapshot.csv

In [4]:
# ============================================
# EMR Snapshot + History — Resilient Fast Trainer
# - Snapshot: severity regression (auto-detect target or synthesize)
# - History : trend classification with constrained alpha on "Stable"
# ============================================

import os
import numpy as np
import pandas as pd
import warnings

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support, mean_absolute_error, r2_score
)
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.base import BaseEstimator, TransformerMixin

# Random seed constant for this cell.
RANDOM_SEED = 42
# Machine-specific absolute paths of the two input CSVs.
SNAPSHOT_PATH = r"C:\Users\aayus\Downloads\emr_snapshot.csv"
HISTORY_PATH  = r"C:\Users\aayus\Downloads\emr_history.csv"

# ----------------- helpers -----------------
def prf(y_true, y_pred, title):
    """Print a classification summary for one set of predictions.

    Emits, in order: a banner containing ``title``, overall accuracy,
    weighted- and macro-averaged precision/recall/F1, scikit-learn's
    per-class report, and the confusion matrix. Undefined ratios are scored
    as 0 (``zero_division=0``). Nothing is returned.
    """
    accuracy = accuracy_score(y_true, y_pred)
    # [:3] keeps (precision, recall, f1); the support slot is not used here.
    averaged = {
        mode: precision_recall_fscore_support(
            y_true, y_pred, average=mode, zero_division=0
        )[:3]
        for mode in ("weighted", "macro")
    }
    w_prec, w_rec, w_f1 = averaged["weighted"]
    m_prec, m_rec, m_f1 = averaged["macro"]

    print(f"\n=== {title} ===")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Weighted -> P:{w_prec:.4f} R:{w_rec:.4f} F1:{w_f1:.4f}")
    print(f"Macro    -> P:{m_prec:.4f} R:{m_rec:.4f} F1:{m_f1:.4f}")
    print("\nPer-class report:")
    print(classification_report(y_true, y_pred, zero_division=0))
    print("Confusion matrix (rows=true, cols=pred):\n", confusion_matrix(y_true, y_pred))

def get_numeric_columns(df):
    """Return, in frame order, the numeric columns that hold at least one non-missing value."""
    return [
        name
        for name in df.columns
        if pd.api.types.is_numeric_dtype(df[name]) and df[name].notna().any()
    ]

def detect_snapshot_target(df):
    """Return the name of the severity/target column in ``df``, or None.

    Candidates are tried in priority order. An exact spelling match always
    wins. If no candidate matches exactly, a second pass ignores letter case
    and surrounding whitespace in the column names, so headers such as
    ``"SEVERITY"`` or ``" Severity "`` are still found. The value returned is
    always the column label as it appears in ``df``.
    """
    candidates = [
        "Severity","severity","SeverityScore","severity_score","Severity_Label","severity_label",
        "SeverityClass","severityClass","sev","label","target"
    ]
    # Pass 1: exact names, in candidate priority order.
    for c in candidates:
        if c in df.columns:
            return c
    # Pass 2: case/whitespace-insensitive. The first column seen wins for each
    # normalised spelling.
    normalised = {}
    for col in df.columns:
        if isinstance(col, str):
            normalised.setdefault(col.strip().casefold(), col)
    for c in candidates:
        hit = normalised.get(c.casefold())
        if hit is not None:
            return hit
    return None

def make_synthetic_severity(df):
    """Build a 0..10 severity proxy from whichever known vitals ``df`` contains.

    Each available numeric vital is median-imputed and z-scored (population
    std, with a small epsilon in the denominator); the z-scores are summed
    per row and the sum is min-max scaled to roughly 0..10.

    Returns a Series indexed like ``df``, or ``None`` when no known vital
    column with a numeric dtype is present.
    """
    known_vitals = (
        "Heart_Rate", "Blood_Pressure_Systolic", "Blood_Pressure_Diastolic",
        "Respiratory_Rate", "Temperature", "Oxygen_Saturation",
        "Blood_Sugar", "Cholesterol_Total", "Weight", "BMI",
    )
    usable = [
        name for name in known_vitals
        if name in df.columns and pd.api.types.is_numeric_dtype(df[name])
    ]
    if len(usable) == 0:
        return None  # nothing to synthesize from

    frame = df[usable].copy()
    frame = frame.fillna(frame.median())
    # every vital contributes with equal weight and the same sign
    zscores = (frame - frame.mean()) / (frame.std(ddof=0) + 1e-6)
    total = zscores.sum(axis=1)
    return (total - total.min()) / (total.max() - total.min() + 1e-6) * 10.0

# ---------- 1) SNAPSHOT ----------
# Snapshot stage: load the snapshot CSV, pick (or synthesize) a severity target,
# fit a gradient-boosted regressor on a 70/30 split, and print regression
# metrics plus a three-class view of the same predictions.
print("[SNAPSHOT] Loading & preparing data…")
snapshot_ok = False  # flipped to True once the snapshot model has been fit and scored
if os.path.exists(SNAPSHOT_PATH):
    snap = pd.read_csv(SNAPSHOT_PATH)
    # find target
    tgt_col = detect_snapshot_target(snap)
    if tgt_col is None:
        warnings.warn(
            "No explicit severity target found (e.g., 'Severity'). "
            "Attempting to synthesize a target from vitals."
        )
        synth = make_synthetic_severity(snap)
        if synth is None:
            warnings.warn("Could not synthesize severity (vitals missing). Skipping SNAPSHOT training.")
        else:
            # NOTE(review): the synthetic target is derived from vitals that remain in
            # the feature matrix below, so the scores measure how well the model
            # re-learns that formula rather than an independent severity label.
            snap["__SeveritySynth__"] = synth
            tgt_col = "__SeveritySynth__"
    if tgt_col is not None:
        # build features: everything except identifiers and the target itself
        drop_cols = [c for c in ["Patient_ID","Patient_Name", tgt_col] if c in snap.columns]
        Xs = snap.drop(columns=drop_cols, errors="ignore")
        # object-dtype columns are treated as categorical
        cat_cols_s = [c for c in Xs.columns if Xs[c].dtype == "object"]
        num_cols_s = [c for c in Xs.columns if c not in cat_cols_s]
        # keep only columns with a numeric dtype; anything that is neither object
        # nor numeric ends up in neither list handed to the ColumnTransformer
        num_cols_s = [c for c in num_cols_s if pd.api.types.is_numeric_dtype(Xs[c])]

        preproc_snap = ColumnTransformer([
            ("num", Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]), num_cols_s),
            ("cat", Pipeline([("imputer", SimpleImputer(strategy="most_frequent")), ("ohe", OneHotEncoder(handle_unknown="ignore"))]), cat_cols_s)
        ])
        # assumes the target column is numeric or numeric-like — TODO confirm for label-style targets
        y_s = snap[tgt_col].astype(float)

        X_tr_s, X_te_s, y_tr_s, y_te_s = train_test_split(
            Xs, y_s, train_size=0.7, test_size=0.3, random_state=RANDOM_SEED
        )

        sreg = GradientBoostingRegressor(
            n_estimators=400, learning_rate=0.06, max_depth=3,
            min_samples_leaf=5, subsample=0.9, random_state=RANDOM_SEED
        )
        pipe_s = Pipeline([("preprocess", preproc_snap), ("model", sreg)])
        pipe_s.fit(X_tr_s, y_tr_s)

        pred_s = pipe_s.predict(X_te_s)
        mae  = mean_absolute_error(y_te_s, pred_s)
        rmse = np.sqrt(((pred_s - y_te_s)**2).mean())
        r2   = r2_score(y_te_s, pred_s)
        print("\n=== SNAPSHOT (Severity regression) ===")
        print(f"Train size: {len(X_tr_s)} | Test size: {len(X_te_s)}")
        print(f"Target: {tgt_col}")
        print(f"MAE : {mae:.3f}\nRMSE: {rmse:.3f}\nR^2 : {r2:.3f}")

        # Bin true and predicted severities into Low/Moderate/High using
        # fixed cutpoints for a quick triage view (always applied)
        c1, c2 = 2.5, 5.6
        try:
            snap_cls_true = pd.cut(y_te_s, bins=[-1e9,c1,c2,1e9], labels=["Low","Moderate","High"]).astype(str)
            snap_cls_pred = pd.cut(pred_s,  bins=[-1e9,c1,c2,1e9], labels=["Low","Moderate","High"]).astype(str)
            prf(snap_cls_true, snap_cls_pred, title=f"SNAPSHOT classes via GBR cutpoints (c1={c1}, c2={c2})")
        except Exception as e:
            # best-effort view: a binning/reporting failure is downgraded to a warning
            warnings.warn(f"Could not compute class bins for snapshot: {e}")

        snapshot_ok = True
    else:
        print("SNAPSHOT section skipped (no usable target).")
else:
    warnings.warn(f"Snapshot file not found at: {SNAPSHOT_PATH}. Skipping SNAPSHOT training.")

# ---------- 2) HISTORY ----------
# History stage, part 1: load the weekly history CSV and separate label from features.
# Unlike the snapshot stage, a missing file or label column is a hard error.
print("\n[HISTORY] Loading & preparing data…")
if not os.path.exists(HISTORY_PATH):
    raise FileNotFoundError(f"History file not found at: {HISTORY_PATH}")

hist = pd.read_csv(HISTORY_PATH)
if "Trend_Status" not in hist.columns:
    raise ValueError("Trend_Status not found in emr_history.csv")

y = hist["Trend_Status"].astype(str)
# Severity_Week* columns are removed from the features together with the label and the ID
severity_cols = [c for c in hist.columns if c.startswith("Severity_Week")]
id_cols = ["Trend_Status", "Patient_ID"]
X_raw = hist.drop(columns=[c for c in id_cols + severity_cols if c in hist.columns], errors="ignore").copy()

ALL_BASES = [
    "Blood_Pressure_Systolic","Blood_Pressure_Diastolic","Heart_Rate","Temperature",
    "Respiratory_Rate","Oxygen_Saturation","Blood_Sugar","Cholesterol_Total","Weight","BMI"
]
# NOTE(review): only Week1 and Week5 are checked here, while the trend features
# read Week1 through Week5; a file missing Week2-4 would fail later with a KeyError.
bases_present = [b for b in ALL_BASES if f"{b}_Week1" in X_raw.columns and f"{b}_Week5" in X_raw.columns]
if not bases_present:
    raise ValueError("No *_Week1..Week5 vital series found in history dataset.")

class TrendFeatureEngineer(BaseEstimator, TransformerMixin):
    """Turn five weekly readings per vital into trend statistics.

    For every base name ``b`` in ``bases`` the input frame must contain the
    columns ``b_Week1`` .. ``b_Week5``. Per base, in this order, the output
    holds: delta (week 5 minus week 1), mean, population std, least-squares
    slope against the week number, a 0/1 stability flag (largest absolute
    week-to-week step strictly below ``eps``), range, and mean absolute step.
    Two row-level columns follow: the fraction of bases flagged stable and
    the mean absolute step over all bases.

    Rows with missing readings yield NaN in the affected statistics and a
    stability flag of 0 for that base.

    Parameters
    ----------
    bases : list of str
        Column-name prefixes of the weekly series.
    eps : float, default 1.6
        Threshold on the largest absolute week-to-week step for "stable".
    """

    # Per-base output columns, in emission order.
    _PER_BASE = ("delta", "mean", "std", "slope", "stable", "range", "mean_abs_step")

    def __init__(self, bases, eps=1.6):
        self.bases = bases
        self.eps = eps
        self.feature_names_ = None

    def fit(self, X, y=None):
        """Record the output feature names; nothing is learned from ``X``."""
        names = [f"{b}_{stat}" for b in self.bases for stat in self._PER_BASE]
        names += ["global_stability_ratio", "global_mean_abs_step"]
        self.feature_names_ = names
        return self

    def transform(self, X):
        """Return a float array of shape ``(len(X), 7 * len(bases) + 2)``.

        ``X`` must be a DataFrame. An empty frame yields an array with zero
        rows but the full number of columns.
        """
        n_rows = len(X)
        if not self.bases:
            # No series: stability ratio and global step are both defined as 0.
            return np.zeros((n_rows, 2), dtype=float)

        # Closed-form least-squares slope on weeks 1..5:
        # sum((w - mean_w) * v) / sum((w - mean_w)^2)
        weeks = np.arange(1, 6, dtype=float)
        w_centered = weeks - weeks.mean()
        w_ss = float(np.dot(w_centered, w_centered))

        blocks, stable_flags, abs_steps = [], [], []
        for b in self.bases:
            vals = X[[f"{b}_Week{i}" for i in range(1, 6)]].to_numpy(dtype=float)
            step_mag = np.abs(np.diff(vals, axis=1))
            stable = (step_mag.max(axis=1) < self.eps).astype(float)
            blocks.append(np.column_stack([
                vals[:, -1] - vals[:, 0],
                vals.mean(axis=1),
                vals.std(axis=1, ddof=0),
                vals @ w_centered / w_ss,
                stable,
                vals.max(axis=1) - vals.min(axis=1),
                step_mag.mean(axis=1),
            ]))
            stable_flags.append(stable)
            abs_steps.append(step_mag)

        stability_ratio = np.sum(stable_flags, axis=0) / len(self.bases)
        global_step = np.hstack(abs_steps).mean(axis=1)
        blocks.append(np.column_stack([stability_ratio, global_step]))
        return np.hstack(blocks)

    def get_feature_names_out(self, input_features=None):
        """Return the feature names recorded by ``fit`` as an array."""
        return np.array(self.feature_names_)

# Stratified 70/30 split on the label, seeded for repeatability.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, train_size=0.7, test_size=0.3, random_state=RANDOM_SEED, stratify=y
)

def build_design(df):
    """Assemble the model input: raw numerics, engineered trend features, raw categoricals.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw history features; must contain the ``*_Week1..Week5`` columns for
        every base in the module-level ``bases_present``.

    Returns
    -------
    tuple
        ``(out, num_cols, cat_cols, tcols)``: the assembled frame (indexed
        like ``df``, columns ordered numeric, trend, categorical) and the
        names of the raw numeric, object-dtype and engineered columns.
    """
    cat_cols = [c for c in df.columns if df[c].dtype == "object"]
    num_cols = [c for c in df.columns if c not in cat_cols]

    eng = TrendFeatureEngineer(bases_present, eps=1.6)
    trend = eng.fit_transform(df)
    tcols = eng.get_feature_names_out().tolist()

    # Build the three pieces and join them once; inserting the engineered
    # columns one at a time fragments the frame's internal blocks.
    num_df = pd.DataFrame(df[num_cols].to_numpy(dtype=float), columns=num_cols, index=df.index)
    trend_df = pd.DataFrame(trend, columns=tcols, index=df.index)
    out = pd.concat([num_df, trend_df, df[cat_cols]], axis=1)
    return out, num_cols, cat_cols, tcols

# History stage, part 2: build design matrices, fit the weighted HGB pipeline,
# report the baseline, then collect class probabilities for the alpha search.
X_tr_df, num_cols_tr, cat_cols_tr, trend_cols = build_design(X_train_raw)
X_te_df, _, _, _ = build_design(X_test_raw)
# every non-categorical column (raw numerics plus engineered trend features)
num_cols_all = [c for c in X_tr_df.columns if c not in cat_cols_tr]

num_pipe = Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())])
cat_pipe = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")), ("ohe", OneHotEncoder(handle_unknown="ignore"))])
preproc = ColumnTransformer([("num", num_pipe, num_cols_all), ("cat", cat_pipe, cat_cols_tr)])

# class-balanced sample weights: n_samples / (n_classes * class_count),
# with the Stable class additionally multiplied by 1.8
classes, counts = np.unique(y_train, return_counts=True)
base_w = {c: (len(y_train) / (len(classes) * cnt)) for c, cnt in zip(classes, counts)}
class_weight = {c: (base_w[c] * (1.8 if c=="Stable" else 1.0)) for c in classes}
sample_weight = np.array([class_weight[yy] for yy in y_train])

hgb = HistGradientBoostingClassifier(
    learning_rate=0.10, max_iter=500, max_depth=8, min_samples_leaf=8,
    l2_regularization=0.02, random_state=RANDOM_SEED
)
pipe_h = Pipeline([("preprocess", preproc), ("model", hgb)])
# the "model__" prefix routes the weights to the classifier step of the pipeline
pipe_h.fit(X_tr_df, y_train, model__sample_weight=sample_weight)

y_base = pipe_h.predict(X_te_df)
prf(y_test, y_base, "HISTORY baseline (HGB, weighted)")

# constrained alpha to improve Stable w/out hurting others
proba = pipe_h.predict_proba(X_te_df)
cls = pipe_h.named_steps["model"].classes_
# class label -> column position in proba; an index is None when the label is absent
idx = {c:i for i,c in enumerate(cls)}
stable_idx = idx.get("Stable", None); impr_idx=idx.get("Improving", None); wors_idx=idx.get("Worsening", None)

def recalls(y_true, y_pred):
    """Map every label present in ``y_true`` to its recall under ``y_pred``.

    Labels that occur only in ``y_pred`` are not included; undefined recalls
    are reported as 0.
    """
    present = np.unique(y_true)
    per_class = precision_recall_fscore_support(
        y_true, y_pred, labels=present, average=None, zero_division=0
    )
    # index 1 of the (precision, recall, f1, support) tuple is recall
    return dict(zip(present, per_class[1]))

# Baseline recalls for the two classes that the alpha search must not degrade.
rec0 = recalls(y_test, y_base)
rI0, rW0 = rec0.get("Improving",0.0), rec0.get("Worsening",0.0)

# Candidate multipliers for the Stable probability (1.0 leaves it unscaled).
ALPHAS=[1.0,1.2,1.4,1.6,1.8,2.0]

def reweight(P, a):
    """Scale the Stable column of probability matrix ``P`` by ``a`` and renormalize rows.

    Works on a copy; ``P`` itself is not modified. When the module-level
    ``stable_idx`` is ``None`` the copy is returned unchanged.
    """
    scaled = P.copy()
    if stable_idx is None:
        return scaled
    scaled[:, stable_idx] *= a
    row_totals = scaled.sum(axis=1, keepdims=True)
    return scaled / row_totals

# Grid-search the Stable multiplier. A candidate is kept only if neither the
# Improving nor the Worsening recall falls below its baseline value; survivors
# are ranked by (Stable F1, Stable recall, macro F1, alpha).
# NOTE(review): alpha is selected and reported on the same test split, so the
# "final" metrics are optimistically biased; a separate validation split would
# avoid that.
best=None
for a in ALPHAS:
    Q=reweight(proba,a)
    y_hat=cls[Q.argmax(axis=1)]
    rc=recalls(y_test,y_hat)
    # the 1e-9 slack keeps float noise from rejecting an unchanged recall
    if rc.get("Improving",0.0)+1e-9 < rI0 or rc.get("Worsening",0.0)+1e-9 < rW0:
        continue
    labels=np.unique(y_test)
    _, _, f1s, _ = precision_recall_fscore_support(y_test, y_hat, labels=labels, average=None, zero_division=0)
    f1_map={l:f for l,f in zip(labels,f1s)}
    f1S=f1_map.get("Stable",0.0)
    _, _, f1_macro = precision_recall_fscore_support(y_test, y_hat, average="macro", zero_division=0)[:3]
    # Tuples compare lexicographically; alpha differs on every iteration, so the
    # comparison is settled before it would reach the y_hat array.
    cand=(f1S, rc.get("Stable",0.0), f1_macro, a, y_hat)
    if (best is None) or (cand>best): best=cand

if best is not None:
    f1S, rS, f1M, a_star, y_final = best
    prf(y_test, y_final, f"HISTORY final (alpha={a_star})")
else:
    print("No alpha met constraints; showing baseline only.")

print("\nDone.")


[SNAPSHOT] Loading & preparing data…





=== SNAPSHOT (Severity regression) ===
Train size: 700 | Test size: 300
Target: __SeveritySynth__
MAE : 0.250
RMSE: 0.315
R^2 : 0.954

=== SNAPSHOT classes via GBR cutpoints (c1=2.5, c2=5.6) ===
Accuracy: 0.9367
Weighted -> P:0.9374 R:0.9367 F1:0.9357
Macro    -> P:0.9566 R:0.8712 F1:0.9083

Per-class report:
              precision    recall  f1-score   support

        High       0.94      0.87      0.90        84
         Low       1.00      0.77      0.87        13
    Moderate       0.93      0.98      0.95       203

    accuracy                           0.94       300
   macro avg       0.96      0.87      0.91       300
weighted avg       0.94      0.94      0.94       300

Confusion matrix (rows=true, cols=pred):
 [[ 73   0  11]
 [  0  10   3]
 [  5   0 198]]

[HISTORY] Loading & preparing data…

=== HISTORY baseline (HGB, weighted) ===
Accuracy: 0.7933
Weighted -> P:0.7614 R:0.7933 F1:0.7674
Macro    -> P:0.6835 R:0.6437 F1:0.6408

Per-class report:
              precision 

In [6]:
# ============================================
# EMR Snapshot + History — Resilient Fast Trainer
# - Snapshot: severity regression (auto-detect target or synthesize)
# - History : trend classification with oversampling + constrained alpha
# ============================================

import os
import numpy as np
import pandas as pd
import warnings

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support, mean_absolute_error, r2_score
)
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.over_sampling import RandomOverSampler  # <-- new for balancing

RANDOM_SEED = 42
SNAPSHOT_PATH = r"C:\Users\aayus\Downloads\emr_snapshot.csv"
HISTORY_PATH  = r"C:\Users\aayus\Downloads\emr_history.csv"

# ----------------- helpers -----------------
def prf(y_true, y_pred, title):
    """Print a classification summary under a titled banner.

    Reports overall accuracy, weighted- and macro-averaged
    precision/recall/F1, scikit-learn's per-class report and the confusion
    matrix. Undefined metrics are reported as 0 (``zero_division=0``).
    Nothing is returned.
    """
    acc = accuracy_score(y_true, y_pred)
    # Both averages are computed before the first print call.
    weighted = precision_recall_fscore_support(y_true, y_pred, average="weighted", zero_division=0)
    macro = precision_recall_fscore_support(y_true, y_pred, average="macro", zero_division=0)

    header_lines = [
        f"\n=== {title} ===",
        f"Accuracy: {acc:.4f}",
        f"Weighted -> P:{weighted[0]:.4f} R:{weighted[1]:.4f} F1:{weighted[2]:.4f}",
        f"Macro    -> P:{macro[0]:.4f} R:{macro[1]:.4f} F1:{macro[2]:.4f}",
        "\nPer-class report:",
    ]
    for line in header_lines:
        print(line)
    print(classification_report(y_true, y_pred, zero_division=0))
    print("Confusion matrix (rows=true, cols=pred):\n", confusion_matrix(y_true, y_pred))

def detect_snapshot_target(df):
    """Return the first known severity/target column name found in ``df``.

    Names are tried in a fixed priority order (exact, case-sensitive match);
    ``None`` is returned when none of them is a column of ``df``.
    """
    preferred = (
        "Severity", "severity", "SeverityScore", "severity_score",
        "Severity_Label", "severity_label", "SeverityClass", "severityClass",
        "sev", "label", "target",
    )
    return next((name for name in preferred if name in df.columns), None)

def make_synthetic_severity(df):
    """Build a 0..10 severity proxy from whichever known vitals ``df`` contains.

    Each available numeric vital is median-imputed and z-scored (population
    std, with a small epsilon in the denominator); the z-scores are summed
    per row and the sum is min-max scaled to roughly 0..10.

    Returns a Series indexed like ``df``, or ``None`` when no known vital
    column with a numeric dtype is present.
    """
    known_vitals = (
        "Heart_Rate", "Blood_Pressure_Systolic", "Blood_Pressure_Diastolic",
        "Respiratory_Rate", "Temperature", "Oxygen_Saturation",
        "Blood_Sugar", "Cholesterol_Total", "Weight", "BMI",
    )
    usable = [
        name for name in known_vitals
        if name in df.columns and pd.api.types.is_numeric_dtype(df[name])
    ]
    if len(usable) == 0:
        return None  # nothing to synthesize from

    frame = df[usable].copy()
    frame = frame.fillna(frame.median())
    # every vital contributes with equal weight and the same sign
    zscores = (frame - frame.mean()) / (frame.std(ddof=0) + 1e-6)
    total = zscores.sum(axis=1)
    return (total - total.min()) / (total.max() - total.min() + 1e-6) * 10.0

# ---------- 1) SNAPSHOT ----------
# Snapshot stage: load the snapshot CSV, pick (or synthesize) a severity target,
# fit a gradient-boosted regressor on a 70/30 split, and print regression
# metrics plus a three-class view of the same predictions.
print("[SNAPSHOT] Loading & preparing data…")
snapshot_ok = False  # flipped to True once the snapshot model has been fit and scored
if os.path.exists(SNAPSHOT_PATH):
    snap = pd.read_csv(SNAPSHOT_PATH)
    tgt_col = detect_snapshot_target(snap)
    if tgt_col is None:
        warnings.warn("No explicit severity target found. Attempting to synthesize from vitals.")
        synth = make_synthetic_severity(snap)
        if synth is None:
            warnings.warn("Could not synthesize severity (vitals missing). Skipping SNAPSHOT training.")
        else:
            # NOTE(review): the synthetic target is derived from vitals that remain in
            # the feature matrix below, so the scores measure how well the model
            # re-learns that formula rather than an independent severity label.
            snap["_SeveritySynth_"] = synth
            tgt_col = "_SeveritySynth_"
    if tgt_col is not None:
        # features: everything except identifiers and the target itself
        drop_cols = [c for c in ["Patient_ID","Patient_Name", tgt_col] if c in snap.columns]
        Xs = snap.drop(columns=drop_cols, errors="ignore")
        # object-dtype columns are categorical; of the rest only numeric dtypes are kept
        cat_cols_s = [c for c in Xs.columns if Xs[c].dtype == "object"]
        num_cols_s = [c for c in Xs.columns if c not in cat_cols_s]
        num_cols_s = [c for c in num_cols_s if pd.api.types.is_numeric_dtype(Xs[c])]

        preproc_snap = ColumnTransformer([
            ("num", Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]), num_cols_s),
            ("cat", Pipeline([("imputer", SimpleImputer(strategy="most_frequent")), ("ohe", OneHotEncoder(handle_unknown="ignore"))]), cat_cols_s)
        ])
        # assumes the target column is numeric or numeric-like — TODO confirm for label-style targets
        y_s = snap[tgt_col].astype(float)

        X_tr_s, X_te_s, y_tr_s, y_te_s = train_test_split(
            Xs, y_s, train_size=0.7, test_size=0.3, random_state=RANDOM_SEED
        )

        sreg = GradientBoostingRegressor(
            n_estimators=400, learning_rate=0.06, max_depth=3,
            min_samples_leaf=5, subsample=0.9, random_state=RANDOM_SEED
        )
        pipe_s = Pipeline([("preprocess", preproc_snap), ("model", sreg)])
        pipe_s.fit(X_tr_s, y_tr_s)

        pred_s = pipe_s.predict(X_te_s)
        mae  = mean_absolute_error(y_te_s, pred_s)
        rmse = np.sqrt(((pred_s - y_te_s)**2).mean())
        r2   = r2_score(y_te_s, pred_s)
        print("\n=== SNAPSHOT (Severity regression) ===")
        print(f"Train size: {len(X_tr_s)} | Test size: {len(X_te_s)}")
        print(f"Target: {tgt_col}")
        print(f"MAE : {mae:.3f}\nRMSE: {rmse:.3f}\nR^2 : {r2:.3f}")

        # Bin true and predicted severities into Low/Moderate/High with fixed cutpoints
        c1, c2 = 2.5, 5.6
        try:
            snap_cls_true = pd.cut(y_te_s, bins=[-1e9,c1,c2,1e9], labels=["Low","Moderate","High"]).astype(str)
            snap_cls_pred = pd.cut(pred_s,  bins=[-1e9,c1,c2,1e9], labels=["Low","Moderate","High"]).astype(str)
            prf(snap_cls_true, snap_cls_pred, title=f"SNAPSHOT classes via GBR cutpoints (c1={c1}, c2={c2})")
        except Exception as e:
            # best-effort view: a binning/reporting failure is downgraded to a warning
            warnings.warn(f"Could not compute class bins for snapshot: {e}")

        snapshot_ok = True
    else:
        print("SNAPSHOT section skipped (no usable target).")
else:
    warnings.warn(f"Snapshot file not found at: {SNAPSHOT_PATH}. Skipping SNAPSHOT training.")

# ---------- 2) HISTORY ----------
# History stage, part 1: load the weekly history CSV and separate label from features.
# Unlike the snapshot stage, a missing file or label column is a hard error.
print("\n[HISTORY] Loading & preparing data…")
if not os.path.exists(HISTORY_PATH):
    raise FileNotFoundError(f"History file not found at: {HISTORY_PATH}")

hist = pd.read_csv(HISTORY_PATH)
if "Trend_Status" not in hist.columns:
    raise ValueError("Trend_Status not found in emr_history.csv")

y = hist["Trend_Status"].astype(str)
# Severity_Week* columns are removed from the features together with the label and the ID
severity_cols = [c for c in hist.columns if c.startswith("Severity_Week")]
id_cols = ["Trend_Status", "Patient_ID"]
X_raw = hist.drop(columns=[c for c in id_cols + severity_cols if c in hist.columns], errors="ignore").copy()

ALL_BASES = [
    "Blood_Pressure_Systolic","Blood_Pressure_Diastolic","Heart_Rate","Temperature",
    "Respiratory_Rate","Oxygen_Saturation","Blood_Sugar","Cholesterol_Total","Weight","BMI"
]
# NOTE(review): only Week1 and Week5 are checked here, while the trend features
# read Week1 through Week5; a file missing Week2-4 would fail later with a KeyError.
bases_present = [b for b in ALL_BASES if f"{b}_Week1" in X_raw.columns and f"{b}_Week5" in X_raw.columns]
if not bases_present:
    raise ValueError("No *_Week1..Week5 vital series found in history dataset.")

class TrendFeatureEngineer(BaseEstimator, TransformerMixin):
    """Turn five weekly readings per vital into trend statistics.

    For every base name ``b`` in ``bases`` the input frame must contain the
    columns ``b_Week1`` .. ``b_Week5``. Per base, in this order, the output
    holds: delta (week 5 minus week 1), mean, population std, least-squares
    slope against the week number, a 0/1 stability flag (largest absolute
    week-to-week step strictly below ``eps``), range, and mean absolute step.
    Two row-level columns follow: the fraction of bases flagged stable and
    the mean absolute step over all bases.

    Rows with missing readings yield NaN in the affected statistics and a
    stability flag of 0 for that base.

    Parameters
    ----------
    bases : list of str
        Column-name prefixes of the weekly series.
    eps : float, default 1.6
        Threshold on the largest absolute week-to-week step for "stable".
    """

    # Per-base output columns, in emission order.
    _PER_BASE = ("delta", "mean", "std", "slope", "stable", "range", "mean_abs_step")

    def __init__(self, bases, eps=1.6):
        self.bases = bases
        self.eps = eps
        self.feature_names_ = None

    def fit(self, X, y=None):
        """Record the output feature names; nothing is learned from ``X``."""
        names = [f"{b}_{stat}" for b in self.bases for stat in self._PER_BASE]
        names += ["global_stability_ratio", "global_mean_abs_step"]
        self.feature_names_ = names
        return self

    def transform(self, X):
        """Return a float array of shape ``(len(X), 7 * len(bases) + 2)``.

        ``X`` must be a DataFrame. An empty frame yields an array with zero
        rows but the full number of columns.
        """
        n_rows = len(X)
        if not self.bases:
            # No series: stability ratio and global step are both defined as 0.
            return np.zeros((n_rows, 2), dtype=float)

        # Closed-form least-squares slope on weeks 1..5:
        # sum((w - mean_w) * v) / sum((w - mean_w)^2)
        weeks = np.arange(1, 6, dtype=float)
        w_centered = weeks - weeks.mean()
        w_ss = float(np.dot(w_centered, w_centered))

        blocks, stable_flags, abs_steps = [], [], []
        for b in self.bases:
            vals = X[[f"{b}_Week{i}" for i in range(1, 6)]].to_numpy(dtype=float)
            step_mag = np.abs(np.diff(vals, axis=1))
            stable = (step_mag.max(axis=1) < self.eps).astype(float)
            blocks.append(np.column_stack([
                vals[:, -1] - vals[:, 0],
                vals.mean(axis=1),
                vals.std(axis=1, ddof=0),
                vals @ w_centered / w_ss,
                stable,
                vals.max(axis=1) - vals.min(axis=1),
                step_mag.mean(axis=1),
            ]))
            stable_flags.append(stable)
            abs_steps.append(step_mag)

        stability_ratio = np.sum(stable_flags, axis=0) / len(self.bases)
        global_step = np.hstack(abs_steps).mean(axis=1)
        blocks.append(np.column_stack([stability_ratio, global_step]))
        return np.hstack(blocks)

    def get_feature_names_out(self, input_features=None):
        """Return the feature names recorded by ``fit`` as an array."""
        return np.array(self.feature_names_)

def build_design(df):
    """Assemble the model input: raw numerics, engineered trend features, raw categoricals.

    Returns ``(out, num_cols, cat_cols, tcols)``: the joined frame (indexed
    like ``df``) and the names of the raw numeric, object-dtype and
    engineered trend columns, in that column order.
    """
    cat_cols = [c for c in df.columns if df[c].dtype == "object"]
    num_cols = [c for c in df.columns if c not in cat_cols]

    engineer = TrendFeatureEngineer(bases_present, eps=1.6)
    trend_values = engineer.fit_transform(df)
    tcols = engineer.get_feature_names_out().tolist()

    # numeric block, engineered block, categorical block: joined side by side
    pieces = [
        pd.DataFrame(df[num_cols].to_numpy(dtype=float), columns=num_cols, index=df.index),
        pd.DataFrame(trend_values, columns=tcols, index=df.index),
        df[cat_cols].copy(),
    ]
    return pd.concat(pieces, axis=1), num_cols, cat_cols, tcols

# History stage, part 2: stratified 70/30 split, oversample the training part,
# fit the HGB pipeline, report the baseline, then collect class probabilities.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, train_size=0.7, test_size=0.3, random_state=RANDOM_SEED, stratify=y
)

# oversample the training split only; the test split keeps its original class mix
ros = RandomOverSampler(random_state=RANDOM_SEED)
X_train_res, y_train_res = ros.fit_resample(X_train_raw, y_train)

X_tr_df, num_cols_tr, cat_cols_tr, trend_cols = build_design(X_train_res)
X_te_df, _, _, _ = build_design(X_test_raw)
# every non-categorical column (raw numerics plus engineered trend features)
num_cols_all = [c for c in X_tr_df.columns if c not in cat_cols_tr]

num_pipe = Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())])
cat_pipe = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")), ("ohe", OneHotEncoder(handle_unknown="ignore"))])
preproc = ColumnTransformer([("num", num_pipe, num_cols_all), ("cat", cat_pipe, cat_cols_tr)])

hgb = HistGradientBoostingClassifier(
    learning_rate=0.10, max_iter=500, max_depth=8, min_samples_leaf=8,
    l2_regularization=0.02, random_state=RANDOM_SEED
)
pipe_h = Pipeline([("preprocess", preproc), ("model", hgb)])
# no sample weights here: class balance comes from the resampled training set
pipe_h.fit(X_tr_df, y_train_res)

y_base = pipe_h.predict(X_te_df)
prf(y_test, y_base, "HISTORY baseline (oversampled + HGB)")

# constrained alpha for Stable
proba = pipe_h.predict_proba(X_te_df)
cls = pipe_h.named_steps["model"].classes_
# class label -> column position in proba; an index is None when the label is absent
idx = {c:i for i,c in enumerate(cls)}
stable_idx = idx.get("Stable", None); impr_idx=idx.get("Improving", None); wors_idx=idx.get("Worsening", None)

def recalls(y_true, y_pred):
    """Map every label present in ``y_true`` to its recall under ``y_pred``.

    Labels that occur only in ``y_pred`` are not included; undefined recalls
    are reported as 0.
    """
    present = np.unique(y_true)
    per_class = precision_recall_fscore_support(
        y_true, y_pred, labels=present, average=None, zero_division=0
    )
    # index 1 of the (precision, recall, f1, support) tuple is recall
    return dict(zip(present, per_class[1]))

# Baseline recalls for the two classes that the alpha search must not degrade.
rec0 = recalls(y_test, y_base)
rI0, rW0 = rec0.get("Improving",0.0), rec0.get("Worsening",0.0)

# Candidate multipliers for the Stable probability (1.0 leaves it unscaled).
ALPHAS=[1.0,1.2,1.4,1.6,1.8,2.0]

def reweight(P, a):
    """Scale the Stable column of probability matrix ``P`` by ``a`` and renormalize rows.

    Works on a copy; ``P`` itself is not modified. When the module-level
    ``stable_idx`` is ``None`` the copy is returned unchanged.
    """
    scaled = P.copy()
    if stable_idx is None:
        return scaled
    scaled[:, stable_idx] *= a
    row_totals = scaled.sum(axis=1, keepdims=True)
    return scaled / row_totals

# Grid-search the Stable multiplier. A candidate is kept only if neither the
# Improving nor the Worsening recall falls below its baseline value; survivors
# are ranked by (Stable F1, Stable recall, macro F1, alpha).
# NOTE(review): alpha is selected and reported on the same test split, so the
# "final" metrics are optimistically biased; a separate validation split would
# avoid that.
best=None
for a in ALPHAS:
    Q=reweight(proba,a)
    y_hat=cls[Q.argmax(axis=1)]
    rc=recalls(y_test,y_hat)
    # the 1e-9 slack keeps float noise from rejecting an unchanged recall
    if rc.get("Improving",0.0)+1e-9 < rI0 or rc.get("Worsening",0.0)+1e-9 < rW0:
        continue
    labels=np.unique(y_test)
    _, _, f1s, _ = precision_recall_fscore_support(y_test, y_hat, labels=labels, average=None, zero_division=0)
    f1_map={l:f for l,f in zip(labels,f1s)}
    f1S=f1_map.get("Stable",0.0)
    _, _, f1_macro = precision_recall_fscore_support(y_test, y_hat, average="macro", zero_division=0)[:3]
    # Tuples compare lexicographically; alpha differs on every iteration, so the
    # comparison is settled before it would reach the y_hat array.
    cand=(f1S, rc.get("Stable",0.0), f1_macro, a, y_hat)
    if (best is None) or (cand>best): best=cand

if best is not None:
    f1S, rS, f1M, a_star, y_final = best
    prf(y_test, y_final, f"HISTORY final (alpha={a_star})")
else:
    print("No alpha met constraints; showing baseline only.")

print("\nDone.")

[SNAPSHOT] Loading & preparing data…





=== SNAPSHOT (Severity regression) ===
Train size: 700 | Test size: 300
Target: _SeveritySynth_
MAE : 0.250
RMSE: 0.315
R^2 : 0.954

=== SNAPSHOT classes via GBR cutpoints (c1=2.5, c2=5.6) ===
Accuracy: 0.9367
Weighted -> P:0.9374 R:0.9367 F1:0.9357
Macro    -> P:0.9566 R:0.8712 F1:0.9083

Per-class report:
              precision    recall  f1-score   support

        High       0.94      0.87      0.90        84
         Low       1.00      0.77      0.87        13
    Moderate       0.93      0.98      0.95       203

    accuracy                           0.94       300
   macro avg       0.96      0.87      0.91       300
weighted avg       0.94      0.94      0.94       300

Confusion matrix (rows=true, cols=pred):
 [[ 73   0  11]
 [  0  10   3]
 [  5   0 198]]

[HISTORY] Loading & preparing data…

=== HISTORY baseline (oversampled + HGB) ===
Accuracy: 0.7933
Weighted -> P:0.7510 R:0.7933 F1:0.7622
Macro    -> P:0.6532 R:0.6320 F1:0.6206

Per-class report:
              precisio

In [13]:
# ============================================
# EMR Snapshot + History — Resilient Fast Trainer
# - Snapshot: severity regression (auto-detect target or synthesize)
# - History : trend classification with Stable boost
# ============================================

import os
import numpy as np
import pandas as pd
import warnings

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support, mean_absolute_error, r2_score
)
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.over_sampling import RandomOverSampler  # (unused here, ok to keep)

RANDOM_SEED = 42
SNAPSHOT_PATH = r"C:\Users\aayus\Downloads\emr-smart\data\emr_snapshot.csv"
HISTORY_PATH  = r"C:\Users\aayus\Downloads\emr-smart\data\emr_history.csv"

# ----------------- helpers -----------------
def prf(y_true, y_pred, title):
    """Print a classification summary under a titled banner.

    Reports overall accuracy, weighted- and macro-averaged
    precision/recall/F1, scikit-learn's per-class report and the confusion
    matrix. Undefined metrics are reported as 0 (``zero_division=0``).
    Nothing is returned.
    """
    acc = accuracy_score(y_true, y_pred)
    # Both averages are computed before the first print call.
    weighted = precision_recall_fscore_support(y_true, y_pred, average="weighted", zero_division=0)
    macro = precision_recall_fscore_support(y_true, y_pred, average="macro", zero_division=0)

    header_lines = [
        f"\n=== {title} ===",
        f"Accuracy: {acc:.4f}",
        f"Weighted -> P:{weighted[0]:.4f} R:{weighted[1]:.4f} F1:{weighted[2]:.4f}",
        f"Macro    -> P:{macro[0]:.4f} R:{macro[1]:.4f} F1:{macro[2]:.4f}",
        "\nPer-class report:",
    ]
    for line in header_lines:
        print(line)
    print(classification_report(y_true, y_pred, zero_division=0))
    print("Confusion matrix (rows=true, cols=pred):\n", confusion_matrix(y_true, y_pred))

def detect_snapshot_target(df):
    """Return the first known severity/target column name found in ``df``.

    Names are tried in a fixed priority order (exact, case-sensitive match);
    ``None`` is returned when none of them is a column of ``df``.
    """
    preferred = (
        "Severity", "severity", "SeverityScore", "severity_score",
        "Severity_Label", "severity_label", "SeverityClass", "severityClass",
        "sev", "label", "target",
    )
    return next((name for name in preferred if name in df.columns), None)

def make_synthetic_severity(df):
    """Build a 0..10 severity proxy from whichever known vitals ``df`` contains.

    Each available numeric vital is median-imputed and z-scored (population
    std, with a small epsilon in the denominator); the z-scores are summed
    per row and the sum is min-max scaled to roughly 0..10.

    Returns a Series indexed like ``df``, or ``None`` when no known vital
    column with a numeric dtype is present.
    """
    known_vitals = (
        "Heart_Rate", "Blood_Pressure_Systolic", "Blood_Pressure_Diastolic",
        "Respiratory_Rate", "Temperature", "Oxygen_Saturation",
        "Blood_Sugar", "Cholesterol_Total", "Weight", "BMI",
    )
    usable = [
        name for name in known_vitals
        if name in df.columns and pd.api.types.is_numeric_dtype(df[name])
    ]
    if len(usable) == 0:
        return None  # nothing to synthesize from

    subset = df[usable]
    frame = subset.copy().fillna(subset.median())
    # every vital contributes with equal weight and the same sign
    zscores = (frame - frame.mean()) / (frame.std(ddof=0) + 1e-6)
    total = zscores.sum(axis=1)
    return (total - total.min()) / (total.max() - total.min() + 1e-6) * 10.0

# ---------- 1) SNAPSHOT ----------
# Snapshot stage: load the snapshot CSV, pick (or synthesize) a severity target,
# fit a gradient-boosted regressor on a 70/30 split, and print regression
# metrics plus a three-class view of the same predictions.
print("[SNAPSHOT] Loading & preparing data…")
pipe_s = None  # defined up front so it exists (as None) even when training is skipped

if os.path.exists(SNAPSHOT_PATH):
    snap = pd.read_csv(SNAPSHOT_PATH)
    tgt_col = detect_snapshot_target(snap)
    if tgt_col is None:
        warnings.warn("No explicit severity target found. Attempting to synthesize from vitals.")
        synth = make_synthetic_severity(snap)
        if synth is not None:
            # NOTE(review): the synthetic target is derived from vitals that remain in
            # the feature matrix below, so the scores measure how well the model
            # re-learns that formula rather than an independent severity label.
            snap["_SeveritySynth_"] = synth
            tgt_col = "_SeveritySynth_"
    if tgt_col is not None:
        # features: everything except identifiers and the target itself
        drop_cols = [c for c in ["Patient_ID","Patient_Name", tgt_col] if c in snap.columns]
        Xs = snap.drop(columns=drop_cols, errors="ignore")
        # object-dtype columns are categorical; of the rest only numeric dtypes are kept
        cat_cols_s = [c for c in Xs.columns if Xs[c].dtype == "object"]
        num_cols_s = [c for c in Xs.columns if c not in cat_cols_s and pd.api.types.is_numeric_dtype(Xs[c])]

        preproc_snap = ColumnTransformer([
            ("num", Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]), num_cols_s),
            ("cat", Pipeline([("imputer", SimpleImputer(strategy="most_frequent")), ("ohe", OneHotEncoder(handle_unknown="ignore"))]), cat_cols_s)
        ])
        # assumes the target column is numeric or numeric-like — TODO confirm for label-style targets
        y_s = snap[tgt_col].astype(float)

        X_tr_s, X_te_s, y_tr_s, y_te_s = train_test_split(
            Xs, y_s, train_size=0.7, test_size=0.3, random_state=RANDOM_SEED
        )

        sreg = GradientBoostingRegressor(
            n_estimators=400, learning_rate=0.06, max_depth=3,
            min_samples_leaf=5, subsample=0.9, random_state=RANDOM_SEED
        )
        pipe_s = Pipeline([("preprocess", preproc_snap), ("model", sreg)])
        pipe_s.fit(X_tr_s, y_tr_s)

        pred_s = pipe_s.predict(X_te_s)
        mae  = mean_absolute_error(y_te_s, pred_s)
        rmse = np.sqrt(((pred_s - y_te_s)**2).mean())
        r2   = r2_score(y_te_s, pred_s)
        print("\n=== SNAPSHOT (Severity regression) ===")
        print(f"Train size: {len(X_tr_s)} | Test size: {len(X_te_s)}")
        print(f"Target: {tgt_col}")
        print(f"MAE : {mae:.3f}\nRMSE: {rmse:.3f}\nR^2 : {r2:.3f}")

        # Bin true and predicted severities into Low/Moderate/High with fixed cutpoints
        c1, c2 = 2.5, 5.6
        try:
            snap_cls_true = pd.cut(y_te_s, bins=[-1e9,c1,c2,1e9], labels=["Low","Moderate","High"]).astype(str)
            snap_cls_pred = pd.cut(pred_s,  bins=[-1e9,c1,c2,1e9], labels=["Low","Moderate","High"]).astype(str)
            prf(snap_cls_true, snap_cls_pred, title=f"SNAPSHOT classes via GBR cutpoints (c1={c1}, c2={c2})")
        except Exception as e:
            # best-effort view: a binning/reporting failure is downgraded to a warning
            warnings.warn(f"Could not compute class bins: {e}")
    else:
        warnings.warn("Snapshot present but no usable target found; skipping snapshot training.")
else:
    warnings.warn(f"Snapshot file not found at: {SNAPSHOT_PATH}")

# ---------- 2) HISTORY ----------
# History stage, part 1: load the weekly history CSV and separate label from features.
# Unlike the snapshot stage, a missing file or label column is a hard error.
print("\n[HISTORY] Loading & preparing data…")
if not os.path.exists(HISTORY_PATH):
    raise FileNotFoundError(f"History file not found at: {HISTORY_PATH}")

hist = pd.read_csv(HISTORY_PATH)
if "Trend_Status" not in hist.columns:
    raise ValueError("Trend_Status not found in emr_history.csv")

y = hist["Trend_Status"].astype(str)
# NOTE(review): only Trend_Status and Patient_ID are removed; any Severity_Week*
# columns present in the CSV stay in the feature set — confirm this is intended
# and does not leak the label.
id_cols = ["Trend_Status", "Patient_ID"]
X_raw = hist.drop(columns=[c for c in id_cols if c in hist.columns], errors="ignore").copy()

ALL_BASES = [
    "Blood_Pressure_Systolic","Blood_Pressure_Diastolic","Heart_Rate","Temperature",
    "Respiratory_Rate","Oxygen_Saturation","Blood_Sugar","Cholesterol_Total","Weight","BMI"
]
# NOTE(review): only Week1 and Week5 are checked here, while the trend features
# read Week1 through Week5; a file missing Week2-4 would fail later with a KeyError.
bases_present = [b for b in ALL_BASES if f"{b}_Week1" in X_raw.columns and f"{b}_Week5" in X_raw.columns]
if not bases_present:
    raise ValueError("No *_Week1..Week5 series found in history dataset.")

class TrendFeatureEngineer(BaseEstimator, TransformerMixin):
    """Turn five weekly readings per vital into trend statistics.

    For every base name ``b`` in ``bases`` the input frame must contain the
    columns ``b_Week1`` .. ``b_Week5``. Per base, in this order, the output
    holds: delta (week 5 minus week 1), mean, population std, least-squares
    slope against the week number, a 0/1 stability flag (largest absolute
    week-to-week step strictly below ``eps``), range, mean absolute step,
    coefficient of variation (std / (mean + 1e-6)), median absolute
    deviation, and absolute slope. Two row-level columns follow: the
    fraction of bases flagged stable and the mean absolute step over all
    bases.

    Rows with missing readings yield NaN in the affected statistics and a
    stability flag of 0 for that base.

    Parameters
    ----------
    bases : list of str
        Column-name prefixes of the weekly series.
    eps : float, default 1.6
        Threshold on the largest absolute week-to-week step for "stable".
    """

    # Per-base output columns, in emission order.
    _PER_BASE = (
        "delta", "mean", "std", "slope", "stable",
        "range", "mean_abs_step", "cv", "mad", "abs_slope",
    )

    def __init__(self, bases, eps=1.6):
        self.bases = bases
        self.eps = eps
        self.feature_names_ = None

    def fit(self, X, y=None):
        """Record the output feature names; nothing is learned from ``X``."""
        names = [f"{b}_{stat}" for b in self.bases for stat in self._PER_BASE]
        names += ["global_stability_ratio", "global_mean_abs_step"]
        self.feature_names_ = names
        return self

    def transform(self, X):
        """Return a float array of shape ``(len(X), 10 * len(bases) + 2)``.

        ``X`` must be a DataFrame. An empty frame yields an array with zero
        rows but the full number of columns.
        """
        n_rows = len(X)
        if not self.bases:
            # No series: stability ratio and global step are both defined as 0.
            return np.zeros((n_rows, 2), dtype=float)

        # Closed-form least-squares slope on weeks 1..5:
        # sum((w - mean_w) * v) / sum((w - mean_w)^2)
        weeks = np.arange(1, 6, dtype=float)
        w_centered = weeks - weeks.mean()
        w_ss = float(np.dot(w_centered, w_centered))

        blocks, stable_flags, abs_steps = [], [], []
        for b in self.bases:
            vals = X[[f"{b}_Week{i}" for i in range(1, 6)]].to_numpy(dtype=float)
            meanv = vals.mean(axis=1)
            stdv = vals.std(axis=1, ddof=0)
            slope = vals @ w_centered / w_ss
            step_mag = np.abs(np.diff(vals, axis=1))
            stable = (step_mag.max(axis=1) < self.eps).astype(float)
            row_median = np.median(vals, axis=1, keepdims=True)
            blocks.append(np.column_stack([
                vals[:, -1] - vals[:, 0],
                meanv,
                stdv,
                slope,
                stable,
                vals.max(axis=1) - vals.min(axis=1),
                step_mag.mean(axis=1),
                stdv / (meanv + 1e-6),
                np.median(np.abs(vals - row_median), axis=1),
                np.abs(slope),
            ]))
            stable_flags.append(stable)
            abs_steps.append(step_mag)

        stability_ratio = np.sum(stable_flags, axis=0) / len(self.bases)
        global_step = np.hstack(abs_steps).mean(axis=1)
        blocks.append(np.column_stack([stability_ratio, global_step]))
        return np.hstack(blocks)

    def get_feature_names_out(self, input_features=None):
        """Return the feature names recorded by ``fit`` as an array."""
        return np.array(self.feature_names_)

def build_design(df, bases=None, eps=1.6):
    """Assemble the model design frame: numeric columns, trend features, categoricals.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature frame. Columns with ``object`` dtype are treated as
        categorical; every other column is cast to float.
    bases : sequence of str, optional
        Base names passed to ``TrendFeatureEngineer``. Defaults to the
        module-level ``bases_present`` (presumably prepared earlier in the
        notebook; it is not defined in this function).
    eps : float, default 1.6
        Stability threshold forwarded to ``TrendFeatureEngineer``.

    Returns
    -------
    out : pandas.DataFrame
        Columns in the order numeric, trend, categorical, indexed like ``df``.
    num_cols, cat_cols, tcols : list
        Names of the numeric, categorical and engineered trend columns.
    """
    if bases is None:
        bases = bases_present

    cat_cols = [c for c in df.columns if df[c].dtype == "object"]
    cat_set = set(cat_cols)
    num_cols = [c for c in df.columns if c not in cat_set]

    eng = TrendFeatureEngineer(bases, eps=eps)
    trend = eng.fit_transform(df)
    tcols = eng.get_feature_names_out().tolist()
    # Force a 2-D shape so column slicing below also works for a zero-row frame.
    trend = np.asarray(trend).reshape(len(df), len(tcols))

    X_num = df[num_cols].to_numpy(dtype=float)

    # Collect every column first and build the frame in one go. Inserting
    # columns one at a time fragments the frame and triggers pandas'
    # PerformanceWarning. Dict insertion keeps the same "later assignment
    # overwrites, position stays" semantics as repeated ``out[c] = ...``.
    data = {c: X_num[:, j] for j, c in enumerate(num_cols)}
    data.update({c: trend[:, i] for i, c in enumerate(tcols)})
    data.update({c: df[c].values for c in cat_cols})
    out = pd.DataFrame(data, index=df.index)
    return out, num_cols, cat_cols, tcols

# Stratified 70/30 hold-out split of the raw history frame.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=RANDOM_SEED,
    stratify=y,
)

# Build the design frames for each split separately.
X_tr_df, num_cols_tr, cat_cols_tr, trend_cols = build_design(X_train_raw)
X_te_df, _, _, _ = build_design(X_test_raw)

# Everything that is not categorical (raw numerics + trend features) goes
# through median imputation and scaling; categoricals are one-hot encoded.
num_cols_all = [col for col in X_tr_df.columns if col not in cat_cols_tr]
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])
preproc = ColumnTransformer([
    ("num", num_pipe, num_cols_all),
    ("cat", cat_pipe, cat_cols_tr),
])

# Inverse-frequency ("balanced") class weights, with an extra 1.8x factor
# applied to the 'Stable' class.
classes, counts = np.unique(y_train, return_counts=True)
base_w = dict(zip(classes, len(y_train) / (len(classes) * counts)))
class_weight = {
    label: (w * (1.8 if label == "Stable" else 1.0))
    for label, w in base_w.items()
}
sample_weight = np.array([class_weight[label] for label in y_train])

hgb = HistGradientBoostingClassifier(
    learning_rate=0.10,
    max_iter=500,
    max_depth=8,
    min_samples_leaf=8,
    l2_regularization=0.02,
    random_state=RANDOM_SEED,
)
pipe_h = Pipeline([("preprocess", preproc), ("model", hgb)])
pipe_h.fit(X_tr_df, y_train, model__sample_weight=sample_weight)

# NOTE(review): the report title mentions oversampling, but the code in this
# section only applies sample weights -- confirm the wording is intended.
y_base = pipe_h.predict(X_te_df)
prf(y_test, y_base, "HISTORY baseline with oversampling + new features")

# === Alpha reweighting: class probabilities and column lookup ===
proba = pipe_h.predict_proba(X_te_df)
cls = pipe_h.named_steps["model"].classes_
idx = dict(zip(cls, range(len(cls))))
stable_idx = idx.get("Stable")
impr_idx = idx.get("Improving")
wors_idx = idx.get("Worsening")

def recalls(y_true, y_pred):
    """Return a ``{label: recall}`` dict for every label occurring in ``y_true``.

    Labels that appear only in ``y_pred`` are not reported; recall for a
    label with no predicted positives is 0 (``zero_division=0``).
    """
    present = np.unique(y_true)
    per_class = precision_recall_fscore_support(
        y_true, y_pred, labels=present, average=None, zero_division=0
    )
    # per_class is (precision, recall, fscore, support); recall is element 1.
    return dict(zip(present, per_class[1]))

# Baseline recalls of the un-reweighted predictions; the alpha search below
# rejects any alpha that drops either of these.
rec0 = recalls(y_test, y_base)
rI0 = rec0.get("Improving", 0.0)
rW0 = rec0.get("Worsening", 0.0)

# Candidate multipliers for the 'Stable' probability column (1.0 = unchanged).
ALPHAS = [1.0, 1.2, 1.4, 1.6, 1.8, 2.0]

def reweight(P, a):
    """Scale the 'Stable' probability column by ``a`` and renormalise each row.

    ``P`` is left untouched; a new array is returned. When the module-level
    ``stable_idx`` is ``None`` the result is a plain copy of ``P``.
    """
    if stable_idx is None:
        return P.copy()
    scaled = P.copy()
    scaled[:, stable_idx] *= a
    row_totals = scaled.sum(axis=1, keepdims=True)
    return scaled / row_totals

# Pick the alpha that maximises Stable F1 (ties broken by Stable recall, then
# macro F1, then the larger alpha), subject to Improving/Worsening recall not
# falling below the baseline values rI0 / rW0.
#
# NOTE(review): alpha is selected on y_test and the final report is printed on
# the same y_test, so the "final" metrics are optimistic; consider tuning on a
# separate validation split.

# The label set of y_test does not change between alphas, so compute it once.
labels = np.unique(y_test)

best = None      # (f1S, rS, f1_macro, alpha, y_hat) of the best admissible alpha
best_key = None  # comparable prefix of `best` (everything except the y_hat array)
for a in ALPHAS:
    Q = reweight(proba, a)
    y_hat = cls[Q.argmax(axis=1)]
    rc = recalls(y_test, y_hat)
    # Hard constraint: neither Improving nor Worsening recall may drop
    # (1e-9 tolerance for floating-point noise).
    if rc.get("Improving", 0.0) + 1e-9 < rI0 or rc.get("Worsening", 0.0) + 1e-9 < rW0:
        continue
    _, _, f1s, _ = precision_recall_fscore_support(
        y_test, y_hat, labels=labels, average=None, zero_division=0
    )
    f1_map = dict(zip(labels, f1s))
    f1S = f1_map.get("Stable", 0.0)
    _, _, f1_macro = precision_recall_fscore_support(
        y_test, y_hat, average="macro", zero_division=0
    )[:3]
    # Compare on the scalar key only. Putting y_hat (an ndarray) inside the
    # compared tuple raises ValueError whenever all scalar fields tie, e.g.
    # when ALPHAS contains a duplicate value.
    key = (f1S, rc.get("Stable", 0.0), f1_macro, a)
    cand = key + (y_hat,)
    if best_key is None or key > best_key:
        best_key, best = key, cand

if best is not None:
    f1S, rS, f1M, a_star, y_final = best
    prf(y_test, y_final, f"HISTORY final (alpha={a_star})")
else:
    print("No alpha met constraints; showing baseline only.")

print("\nDone.")

# --- Save artifacts for inference (JUPYTER-SAFE) ---
import joblib

# Artifacts are written to ./models under the current working directory,
# which also works when running inside Jupyter.
BASE_DIR = os.path.abspath(os.getcwd())
ART_DIR = os.path.join(BASE_DIR, "models")
os.makedirs(ART_DIR, exist_ok=True)

# The snapshot regressor is optional: persist it (and its class cutpoints)
# only when a global `pipe_s` exists and is not None.
if globals().get("pipe_s") is not None:
    snapshot_model_path = os.path.join(ART_DIR, "snapshot_regressor.joblib")
    snapshot_cuts_path = os.path.join(ART_DIR, "snapshot_cutpoints.txt")
    joblib.dump(pipe_s, snapshot_model_path)
    with open(snapshot_cuts_path, "w", encoding="utf-8") as f:
        f.write("c1=2.5\nc2=5.6\n")

# The history classifier pipeline is always saved.
history_model_path = os.path.join(ART_DIR, "best_history_classifier.joblib")
joblib.dump(pipe_h, history_model_path)
print(f"Saved models → {ART_DIR}")


[SNAPSHOT] Loading & preparing data…





=== SNAPSHOT (Severity regression) ===
Train size: 700 | Test size: 300
Target: _SeveritySynth_
MAE : 0.250
RMSE: 0.315
R^2 : 0.954

=== SNAPSHOT classes via GBR cutpoints (c1=2.5, c2=5.6) ===
Accuracy: 0.9367
Weighted -> P:0.9374 R:0.9367 F1:0.9357
Macro    -> P:0.9566 R:0.8712 F1:0.9083

Per-class report:
              precision    recall  f1-score   support

        High       0.94      0.87      0.90        84
         Low       1.00      0.77      0.87        13
    Moderate       0.93      0.98      0.95       203

    accuracy                           0.94       300
   macro avg       0.96      0.87      0.91       300
weighted avg       0.94      0.94      0.94       300

Confusion matrix (rows=true, cols=pred):
 [[ 73   0  11]
 [  0  10   3]
 [  5   0 198]]

[HISTORY] Loading & preparing data…


  out[c] = trend[:, i]
  out[c] = trend[:, i]
  out[c] = trend[:, i]
  out[c] = df[c].values
  out[c] = trend[:, i]
  out[c] = trend[:, i]
  out[c] = trend[:, i]
  out[c] = df[c].values



=== HISTORY baseline with oversampling + new features ===
Accuracy: 0.9533
Weighted -> P:0.9562 R:0.9533 F1:0.9505
Macro    -> P:0.9667 R:0.8923 F1:0.9196

Per-class report:
              precision    recall  f1-score   support

   Improving       0.92      1.00      0.96       132
      Stable       1.00      0.69      0.82        39
   Worsening       0.98      0.98      0.98       129

    accuracy                           0.95       300
   macro avg       0.97      0.89      0.92       300
weighted avg       0.96      0.95      0.95       300

Confusion matrix (rows=true, cols=pred):
 [[132   0   0]
 [  9  27   3]
 [  2   0 127]]

=== HISTORY final (alpha=2.0) ===
Accuracy: 0.9600
Weighted -> P:0.9620 R:0.9600 F1:0.9581
Macro    -> P:0.9710 R:0.9094 F1:0.9336

Per-class report:
              precision    recall  f1-score   support

   Improving       0.94      1.00      0.97       132
      Stable       1.00      0.74      0.85        39
   Worsening       0.98      0.98      0.9