In [1]:
import os, json
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

SYSTEM = (
    "You are a clinical summarizer for an EMR assistant. "
    "Given structured patient fusion data, produce ONE concise paragraph (≤150 words) that: "
    "1) summarizes current status (only abnormal vitals), "
    "2) describes 5-week trend (Improving/Stable/Worsening + notable deltas), "
    "3) lists alerts and practical next steps. "
    "Use calibrated language ('suggests', 'consider'). Do not diagnose or prescribe. No PII beyond patient_id."
)

USER_TEMPLATE = (
    "FUSION JSON:\n{fusion}\n\n"
    "Write one paragraph. Include:\n"
    "- Current status with abnormal vitals only.\n"
    "- Trend summary with notable changes.\n"
    "- Alerts + suggested next steps (brief).\n"
    "End with: 'This supports decisions and is not a diagnosis.'"
)

def summarize_fusion(fu: dict, model="gpt-4.1-mini"):
    user = USER_TEMPLATE.format(fusion=json.dumps(fu, ensure_ascii=False))
    resp = client.chat.completions.create(
        model=model,
        temperature=0.2,
        messages=[{"role":"system","content":SYSTEM},
                  {"role":"user","content":user}]
    )
    return resp.choices[0].message.content.strip()

if __name__ == "__main__":
    ndjson_path = os.path.join(os.path.dirname(__file__), "outputs", "fusion.ndjson")
    out_path    = os.path.join(os.path.dirname(__file__), "outputs", "summaries.txt")
    assert os.path.exists(ndjson_path), "Run predict_and_fuse.py first."
    with open(ndjson_path, "r", encoding="utf-8") as f, open(out_path, "w", encoding="utf-8") as w:
        for line in f:
            fu = json.loads(line)
            para = summarize_fusion(fu)
            w.write(f"{fu['patient_id']}: {para}\n\n")
    print(f"Wrote summaries → {out_path}")


ModuleNotFoundError: No module named 'openai'

In [2]:
!pip install openai


Defaulting to user installation because normal site-packages is not writeable
Collecting openai
  Downloading openai-1.108.0-py3-none-any.whl.metadata (29 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.11.0-cp312-cp312-win_amd64.whl.metadata (5.3 kB)
Downloading openai-1.108.0-py3-none-any.whl (948 kB)
   ---------------------------------------- 0.0/948.1 kB ? eta -:--:--
   ----------- ---------------------------- 262.1/948.1 kB ? eta -:--:--
   --------------------------------- ------ 786.4/948.1 kB 2.2 MB/s eta 0:00:01
   ---------------------------------------- 948.1/948.1 kB 2.3 MB/s eta 0:00:00
Downloading jiter-0.11.0-cp312-cp312-win_amd64.whl (203 kB)
Installing collected packages: jiter, openai
Successfully installed jiter-0.11.0 openai-1.108.0




In [6]:
# Summarize EMR fusion records with OpenAI (uses .env)
# Paths assume your project layout under C:\Users\aayus\Downloads\emr-smart

import os, json, textwrap
from pathlib import Path
from dotenv import load_dotenv

# ---------- config ----------
ROOT      = Path(r"C:\Users\aayus\Downloads\emr-smart")
OUT_DIR   = ROOT / "outputs"
IN_PATH   = OUT_DIR / "fusion.ndjson"
OUT_PATH  = OUT_DIR / "summaries.txt"
MODEL     = "gpt-4o-mini"
MAX_WORDS = 150  # soft cap; prompt asks for ≤150 words

# ---------- env / client ----------
load_dotenv()  # loads .env from current working dir or parents
api_key = os.getenv("OPENAI_API_KEY")
if not api_key or not api_key.strip():
    raise RuntimeError(
        "OPENAI_API_KEY missing. Create a .env file in your project root with:\n"
        "OPENAI_API_KEY=sk-... (your key)\n"
        f"Expected .env near: {ROOT}"
    )

from openai import OpenAI
client = OpenAI(api_key=api_key)

# ---------- prompts ----------
SYSTEM = (
    "You are a clinical summarizer for an EMR assistant. "
    "Given structured patient fusion data, produce ONE concise paragraph (≤150 words) that: "
    "1) starts with quick context (patient_id if available), "
    "2) states current severity and confidence, "
    "3) describes 5-week trend with probabilities, "
    "4) surfaces any alerts, and "
    "5) ends with a short ‘next steps’ line using calibrated language (e.g., ‘consider review/monitoring’). "
    "Avoid diagnoses/prescriptions and PII beyond patient_id. Neutral tone."
)

def make_user_prompt(rec: dict) -> str:
    pid      = rec.get("patient_id", "unknown")
    snap     = rec.get("snapshot") or {}
    hist     = rec.get("history")  or {}
    alerts   = rec.get("alerts")   or []

    sev_num  = snap.get("severity_numeric", "")
    sev_lab  = snap.get("severity_label", "")
    trend    = hist.get("trend", "")
    proba    = hist.get("proba", {})

    # flatten probs in deterministic order
    p_imp = proba.get("Improving",  None)
    p_sta = proba.get("Stable",     None)
    p_wor = proba.get("Worsening",  None)

    # compact text the model can use reliably
    lines = [
        f"patient_id: {pid}",
        f"severity_now: {sev_num} ({sev_lab})",
        f"trend_5w: {trend}",
        f"proba: Improving={p_imp}, Stable={p_sta}, Worsening={p_wor}",
        f"alerts: {alerts}",
        f"limit_words: {MAX_WORDS}",
    ]
    return "\n".join(lines)

# ---------- IO checks ----------
if not IN_PATH.exists():
    raise FileNotFoundError(f"Missing input: {IN_PATH} (run fuse_infer_generate.py first)")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---------- run ----------
written = 0
with open(IN_PATH, "r", encoding="utf-8") as fin, open(OUT_PATH, "w", encoding="utf-8") as fout:
    for line in fin:
        line = line.strip()
        if not line:
            continue
        rec = json.loads(line)

        # Build the prompt
        user_prompt = make_user_prompt(rec)

        # Call OpenAI
        try:
            resp = client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": SYSTEM},
                    {"role": "user",   "content": user_prompt},
                ],
                temperature=0.2,
            )
            text = (resp.choices[0].message.content or "").strip()
            # safety: hard-trim if the model exceeded requested length
            if len(text.split()) > MAX_WORDS + 10:
                text = " ".join(text.split()[:MAX_WORDS]) + "..A."
        except Exception as e:
            text = f"(generation error: {e})"

        pid = rec.get("patient_id", "unknown")
        fout.write(f"[{pid}]\n{text}\n\n")
        written += 1

print(f"Wrote {written} summaries → {OUT_PATH}")


RuntimeError: OPENAI_API_KEY missing. Create a .env file in your project root with:
OPENAI_API_KEY=sk-... (your key)
Expected .env near: C:\Users\aayus\Downloads\emr-smart

In [7]:
# Summarize EMR fusion records with OpenAI (uses .env)
# Paths assume your project layout under C:\Users\aayus\Downloads\emr-smart

import os, json, textwrap
from pathlib import Path
from dotenv import load_dotenv

# ---------- config ----------
ROOT      = Path(r"C:\Users\aayus\Downloads\emr-smart")
OUT_DIR   = ROOT / "outputs"
IN_PATH   = OUT_DIR / "fusion.ndjson"
OUT_PATH  = OUT_DIR / "summaries.txt"
MODEL     = "gpt-4o-mini"
MAX_WORDS = 150  # soft cap; prompt asks for ≤150 words

# ---------- env / client ----------
load_dotenv()  # loads .env from current working dir or parents
api_key = os.getenv("OPENAI_API_KEY")
if not api_key or not api_key.strip():
    raise RuntimeError(
        "OPENAI_API_KEY missing. Create a .env file in your project root with:\n"
        "OPENAI_API_KEY=sk-... (your key)\n"
        f"Expected .env near: {ROOT}"
    )

from openai import OpenAI
client = OpenAI(api_key=api_key)

# ---------- prompts ----------
SYSTEM = (
    "You are a clinical summarizer for an EMR assistant. "
    "Given structured patient fusion data, produce ONE concise paragraph (≤150 words) that: "
    "1) starts with quick context (patient_id if available), "
    "2) states current severity and confidence, "
    "3) describes 5-week trend with probabilities, "
    "4) surfaces any alerts, and "
    "5) ends with a short ‘next steps’ line using calibrated language (e.g., ‘consider review/monitoring’). "
    "Avoid diagnoses/prescriptions and PII beyond patient_id. Neutral tone."
)

def make_user_prompt(rec: dict) -> str:
    pid      = rec.get("patient_id", "unknown")
    snap     = rec.get("snapshot") or {}
    hist     = rec.get("history")  or {}
    alerts   = rec.get("alerts")   or []

    sev_num  = snap.get("severity_numeric", "")
    sev_lab  = snap.get("severity_label", "")
    trend    = hist.get("trend", "")
    proba    = hist.get("proba", {})

    # flatten probs in deterministic order
    p_imp = proba.get("Improving",  None)
    p_sta = proba.get("Stable",     None)
    p_wor = proba.get("Worsening",  None)

    # compact text the model can use reliably
    lines = [
        f"patient_id: {pid}",
        f"severity_now: {sev_num} ({sev_lab})",
        f"trend_5w: {trend}",
        f"proba: Improving={p_imp}, Stable={p_sta}, Worsening={p_wor}",
        f"alerts: {alerts}",
        f"limit_words: {MAX_WORDS}",
    ]
    return "\n".join(lines)

# ---------- IO checks ----------
if not IN_PATH.exists():
    raise FileNotFoundError(f"Missing input: {IN_PATH} (run fuse_infer_generate.py first)")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---------- run ----------
written = 0
with open(IN_PATH, "r", encoding="utf-8") as fin, open(OUT_PATH, "w", encoding="utf-8") as fout:
    for line in fin:
        line = line.strip()
        if not line:
            continue
        rec = json.loads(line)

        # Build the prompt
        user_prompt = make_user_prompt(rec)

        # Call OpenAI
        try:
            resp = client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": SYSTEM},
                    {"role": "user",   "content": user_prompt},
                ],
                temperature=0.2,
            )
            text = (resp.choices[0].message.content or "").strip()
            # safety: hard-trim if the model exceeded requested length
            if len(text.split()) > MAX_WORDS + 10:
                text = " ".join(text.split()[:MAX_WORDS]) + "..A."
        except Exception as e:
            text = f"(generation error: {e})"

        pid = rec.get("patient_id", "unknown")
        fout.write(f"[{pid}]\n{text}\n\n")
        written += 1

print(f"Wrote {written} summaries → {OUT_PATH}")


RuntimeError: OPENAI_API_KEY missing. Create a .env file in your project root with:
OPENAI_API_KEY=sk-... (your key)
Expected .env near: C:\Users\aayus\Downloads\emr-smart

In [8]:
# Robust OpenAI key loader + summarizer
import os, json
from pathlib import Path
from dotenv import load_dotenv

# --- Point to your project root explicitly ---
ROOT = Path(r"C:\Users\aayus\Downloads\emr-smart")
ENV_PATH = ROOT / ".env"

# Load .env from explicit path (works even if CWD is elsewhere)
if not ENV_PATH.exists():
    raise RuntimeError(f".env not found at: {ENV_PATH}\nCreate it with a single line:\nOPENAI_API_KEY=sk-...")

load_dotenv(dotenv_path=str(ENV_PATH))
api_key = os.getenv("OPENAI_API_KEY")
if not api_key or not api_key.strip():
    raise RuntimeError(
        "OPENAI_API_KEY not found after loading .env.\n"
        "Check for typos (exact name), no quotes, and restart the kernel."
    )

from openai import OpenAI
client = OpenAI(api_key=api_key)

# ----- paths -----
BASE_OUT = ROOT / "outputs"
IN_PATH  = BASE_OUT / "fusion.ndjson"
OUT_PATH = BASE_OUT / "summaries.txt"

if not IN_PATH.exists():
    raise FileNotFoundError(f"Missing input: {IN_PATH} (run fuse_infer_generate.py first)")

system_prompt = (
    "You are a clinical assistant. Summarize EMR outputs for a clinician in 5–7 sentences. "
    "Start with quick patient context, describe current severity, 5-week trend, key abnormal vitals, and any alerts. "
    "Use concise, neutral, non-alarming language and avoid definitive diagnoses; suggest clinical actions when appropriate."
)

written = 0
with open(IN_PATH, "r", encoding="utf-8") as fin, open(OUT_PATH, "w", encoding="utf-8") as fout:
    for line in fin:
        line = line.strip()
        if not line:
            continue
        rec = json.loads(line)
        pid = rec.get("patient_id", "unknown")
        snapshot = rec.get("snapshot") or {}
        history  = rec.get("history")  or {}
        alerts   = rec.get("alerts")   or []

        user_prompt = (
            f"Patient ID: {pid}\n"
            f"Current severity: {snapshot.get('severity_numeric')} ({snapshot.get('severity_label')})\n"
            f"Trend: {history.get('trend')}\n"
            f"Class probabilities: {history.get('proba')}\n"
            f"Alerts: {alerts}\n"
            "Summarize for a clinician."
        )

        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user",   "content": user_prompt},
            ],
            temperature=0.2,
        )
        text = (resp.choices[0].message.content or "").strip()
        fout.write(f"[{pid}]\n{text}\n\n")
        written += 1

print(f"Wrote {written} summaries -> {OUT_PATH}")


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

In [11]:
# --- OpenAI EMR Summarizer (robust .env loading) ---

import os, json
from pathlib import Path
from dotenv import load_dotenv
from openai import OpenAI

# Project root + outputs
ROOT = Path(r"C:\Users\aayus\Downloads\emr-smart")
ENV_PATH = ROOT / ".env"
BASE_OUT = ROOT / "outputs"
IN_PATH  = BASE_OUT / "fusion.ndjson"
OUT_PATH = BASE_OUT / "summaries.txt"

# 1) Load .env robustly (handles wrong CWD)
if not ENV_PATH.exists():
    raise RuntimeError(f".env not found at: {ENV_PATH}\nCreate it with one line:\nOPENAI_API_KEY=sk-...")

# Try multiple encodings in case file was saved with BOM/UTF-16
loaded = False
for enc in ("utf-8", "utf-8-sig", "utf-16", "utf-16-le", "utf-16-be"):
    try:
        if load_dotenv(dotenv_path=str(ENV_PATH), override=True, encoding=enc):
            if os.getenv("OPENAI_API_KEY"):
                loaded = True
                break
    except Exception:
        pass
if not loaded:
    raise RuntimeError(
        f"Could not read OPENAI_API_KEY from {ENV_PATH}. "
        "Re-save the file as UTF-8 (no quotes) with content:\nOPENAI_API_KEY=sk-..."
    )

api_key = os.getenv("OPENAI_API_KEY")
if not api_key or not api_key.strip():
    raise RuntimeError("OPENAI_API_KEY not loaded. Check your .env text and restart the kernel.")

client = OpenAI(api_key=api_key)

# 2) Validate inputs
if not IN_PATH.exists():
    raise FileNotFoundError(f"Missing input: {IN_PATH}\nRun fuse_infer_generate.py first.")
BASE_OUT.mkdir(parents=True, exist_ok=True)

# 3) Prompts
system_prompt = (
    "You are a clinical assistant. Summarize EMR outputs for a clinician in 5–7 sentences. "
    "Start with quick patient context, describe current severity, 5-week trend, key abnormal vitals, and any alerts. "
    "Use concise, neutral, non-alarming language and avoid definitive diagnoses; suggest clinical actions when appropriate. "
    "Use calibrated language (e.g., 'suggests', 'consider'); do not include PII beyond patient_id."
)

# 4) Generate summaries
written = 0
with open(IN_PATH, "r", encoding="utf-8") as fin, open(OUT_PATH, "w", encoding="utf-8") as fout:
    for line in fin:
        line = line.strip()
        if not line:
            continue

        rec = json.loads(line)
        pid = rec.get("patient_id", "unknown")
        snapshot = rec.get("snapshot") or {}
        history  = rec.get("history")  or {}
        alerts   = rec.get("alerts")   or []

        user_prompt = (
            f"Patient ID: {pid}\n"
            f"Current severity: {snapshot.get('severity_numeric')} ({snapshot.get('severity_label')})\n"
            f"Trend: {history.get('trend')}\n"
            f"Class probabilities: {history.get('proba')}\n"
            f"Alerts: {alerts}\n"
            "Summarize for a clinician."
        )

        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user",   "content": user_prompt},
            ],
            temperature=0.2,
        )
        text = (resp.choices[0].message.content or "").strip()
        fout.write(f"[{pid}]\n{text}\n\n")
        written += 1

print(f"Wrote {written} summaries -> {OUT_PATH}")


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [12]:
# summaries_generate.py — resilient OpenAI summarizer with retries, throttle, resume, cache

import os, json, time, hashlib
from pathlib import Path
from dotenv import load_dotenv
from openai import OpenAI
from openai import APIConnectionError, RateLimitError, InternalServerError, OpenAIError

# ---------------- Paths ----------------
ROOT      = Path(r"C:\Users\aayus\Downloads\emr-smart")
DATA_DIR  = ROOT / "outputs"
IN_PATH   = DATA_DIR / "fusion.ndjson"
OUT_TXT   = DATA_DIR / "summaries.txt"
OUT_CSV   = DATA_DIR / "summaries.csv"
CACHE_JL  = DATA_DIR / "summaries_cache.ndjson"   # patient_id + fingerprint -> summary

# ---------------- Config knobs ----------------
MODEL_NAME          = "gpt-4o-mini"
MAX_PER_RUN         = 200          #  set smaller while testing (e.g., 20)
START_AT_INDEX      = 0            #  start from a later index to resume a large batch
SLEEP_BETWEEN_CALLS = 2.0          #  seconds between requests to avoid rate caps
MAX_RETRIES         = 7            #  total attempts per patient on 429/5xx
TEMPERATURE         = 0.2

SYSTEM_PROMPT = (
    "You are a clinical assistant. Summarize EMR outputs for a clinician in 5–7 sentences. "
    "Start with quick patient context, then current severity, 5-week trend, and noteworthy vitals or risks. "
    "Reference model probabilities only if helpful (e.g., 'high likelihood of Worsening'). "
    "Use calibrated, non-alarming language (e.g., 'suggests', 'consider'); avoid firm diagnoses or prescriptions. "
    "Do not include PII beyond patient_id. End with: 'This supports decisions and is not a diagnosis.'"
)

USER_TEMPLATE = (
    "Patient ID: {pid}\n"
    "Current severity: {sev_num} ({sev_label})\n"
    "Trend: {trend}\n"
    "Class probabilities: {proba}\n"
    "Alerts: {alerts}\n"
    "Summarize succinctly for a clinician."
)

# --------------- Helpers ----------------
def ensure_env():
    env_path = ROOT / ".env"
    if not env_path.exists():
        raise RuntimeError(f".env not found at {env_path}. Create a UTF-8 file with:\nOPENAI_API_KEY=sk-...")

    loaded = False
    for enc in ("utf-8", "utf-8-sig", "utf-16", "utf-16-le", "utf-16-be"):
        try:
            if load_dotenv(dotenv_path=str(env_path), override=True, encoding=enc):
                if os.getenv("OPENAI_API_KEY"):
                    loaded = True
                    break
        except Exception:
            pass
    if not loaded:
        raise RuntimeError("Could not read OPENAI_API_KEY from .env. Re-save as UTF-8 (no quotes).")

def stable_fingerprint(obj) -> str:
    """Hash the fusion record to skip re-summarization if unchanged."""
    txt = json.dumps(obj, sort_keys=True, ensure_ascii=False)
    return hashlib.sha256(txt.encode("utf-8")).hexdigest()

def load_cache():
    seen = {}
    if CACHE_JL.exists():
        with open(CACHE_JL, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                rec = json.loads(line)
                seen[(rec["patient_id"], rec["fingerprint"])] = rec["summary"]
    return seen

def append_cache(pid, fp, summary):
    with open(CACHE_JL, "a", encoding="utf-8") as f:
        f.write(json.dumps({"patient_id": pid, "fingerprint": fp, "summary": summary}, ensure_ascii=False) + "\n")

def append_outputs(pid, summary):
    OUT_TXT.parent.mkdir(parents=True, exist_ok=True)
    with open(OUT_TXT, "a", encoding="utf-8") as ft:
        ft.write(f"[{pid}]\n{summary}\n\n")
    # also CSV (append header if missing)
    make_header = not OUT_CSV.exists()
    with open(OUT_CSV, "a", encoding="utf-8") as fc:
        if make_header:
            fc.write("patient_id,summary\n")
        # simple CSV escaping for commas/quotes/newlines
        clean = summary.replace('"', '""').replace("\n", " ").strip()
        fc.write(f'"{pid}","{clean}"\n')

def call_with_retries(client, messages):
    delay = 2.0
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            resp = client.chat.completions.create(
                model=MODEL_NAME,
                messages=messages,
                temperature=TEMPERATURE,
            )
            return (resp.choices[0].message.content or "").strip()
        except RateLimitError as e:
            # Honor Retry-After if present
            retry_after = None
            try:
                retry_after = float(getattr(getattr(e, "response", None), "headers", {}).get("Retry-After", None))
            except Exception:
                pass
            sleep_for = retry_after if retry_after else delay
            print(f"[429] Rate limited. Sleeping {sleep_for:.1f}s (attempt {attempt}/{MAX_RETRIES})")
            time.sleep(sleep_for)
            delay = min(delay * 2, 60)  # cap backoff
        except (InternalServerError, APIConnectionError) as e:
            print(f"[{type(e).__name__}] transient error. Sleeping {delay:.1f}s (attempt {attempt}/{MAX_RETRIES})")
            time.sleep(delay)
            delay = min(delay * 2, 60)
        except OpenAIError as e:
            # Other non-retriable API errors — return a note but keep pipeline alive
            return f"Summary unavailable due to API error: {str(e)[:200]}"
    return "Summary unavailable after repeated API rate limits."

def build_user_prompt(rec):
    pid     = rec.get("patient_id", "unknown")
    snap    = rec.get("snapshot") or {}
    hist    = rec.get("history")  or {}
    alerts  = rec.get("alerts")   or []

    return USER_TEMPLATE.format(
        pid=pid,
        sev_num=snap.get("severity_numeric"),
        sev_label=snap.get("severity_label"),
        trend=hist.get("trend"),
        proba=hist.get("proba"),
        alerts="; ".join(alerts) if alerts else "None"
    )

# --------------- Main ----------------
def main():
    ensure_env()
    api_key = os.getenv("OPENAI_API_KEY")
    client = OpenAI(api_key=api_key)

    if not IN_PATH.exists():
        raise FileNotFoundError(f"Missing input {IN_PATH}. Run fuse_infer_generate.py first.")

    cache = load_cache()
    done = 0
    total = 0

    # Read all lines first so indexing works
    with open(IN_PATH, "r", encoding="utf-8") as f:
        fusion_lines = [ln for ln in (ln.strip() for ln in f) if ln]

    print(f"Found {len(fusion_lines)} fusion records.")
    for idx, line in enumerate(fusion_lines):
        total += 1
        if idx < START_AT_INDEX:
            continue
        if done >= MAX_PER_RUN:
            break

        rec = json.loads(line)
        pid = rec.get("patient_id", f"row_{idx}")
        fp  = stable_fingerprint(rec)

        # Skip if exact same record was summarized earlier
        if (pid, fp) in cache:
            summary = cache[(pid, fp)]
            append_outputs(pid, summary)
            done += 1
            continue

        user_prompt = build_user_prompt(rec)
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user",   "content": user_prompt},
        ]

        # Call API with retries/backoff
        summary = call_with_retries(client, messages)
        append_outputs(pid, summary)
        append_cache(pid, fp, summary)
        done += 1

        # Gentle throttle to avoid 429s
        time.sleep(SLEEP_BETWEEN_CALLS)

    print(f"Completed {done} summaries (out of {total})")
    print(f"Outputs:\n - {OUT_TXT}\n - {OUT_CSV}\n - {CACHE_JL}")

if __name__ == "__main__":
    main()


Found 1000 fusion records.
[429] Rate limited. Sleeping 2.0s (attempt 1/7)
[429] Rate limited. Sleeping 4.0s (attempt 2/7)
[429] Rate limited. Sleeping 8.0s (attempt 3/7)
[429] Rate limited. Sleeping 16.0s (attempt 4/7)
[429] Rate limited. Sleeping 32.0s (attempt 5/7)
[429] Rate limited. Sleeping 60.0s (attempt 6/7)
[429] Rate limited. Sleeping 60.0s (attempt 7/7)
[429] Rate limited. Sleeping 2.0s (attempt 1/7)
[429] Rate limited. Sleeping 4.0s (attempt 2/7)
[429] Rate limited. Sleeping 8.0s (attempt 3/7)
[429] Rate limited. Sleeping 16.0s (attempt 4/7)
[429] Rate limited. Sleeping 32.0s (attempt 5/7)
[429] Rate limited. Sleeping 60.0s (attempt 6/7)
[429] Rate limited. Sleeping 60.0s (attempt 7/7)
[429] Rate limited. Sleeping 2.0s (attempt 1/7)
[429] Rate limited. Sleeping 4.0s (attempt 2/7)
[429] Rate limited. Sleeping 8.0s (attempt 3/7)
[429] Rate limited. Sleeping 16.0s (attempt 4/7)
[429] Rate limited. Sleeping 32.0s (attempt 5/7)
[429] Rate limited. Sleeping 60.0s (attempt 6/7)
[4

PermissionError: [Errno 13] Permission denied: 'C:\\Users\\aayus\\Downloads\\emr-smart\\outputs\\summaries.csv'

In [None]:
# --- OpenAI EMR Summarizer (limit to 10 requests) ---

import os, json, time, random
from pathlib import Path
from dotenv import load_dotenv
from openai import OpenAI

# === Paths ===
ROOT = Path(r"C:\Users\aayus\Downloads\emr-smart")
ENV_PATH = ROOT / ".env"
BASE_OUT = ROOT / "outputs"
IN_PATH  = BASE_OUT / "fusion.ndjson"
OUT_PATH = BASE_OUT / "summaries_10.txt"

# === Load .env (robust encodings) ===
if not ENV_PATH.exists():
    raise RuntimeError(f".env not found at: {ENV_PATH}\nPut a line like:\nOPENAI_API_KEY=sk-...")

loaded = False
for enc in ("utf-8", "utf-8-sig", "utf-16", "utf-16-le", "utf-16-be"):
    try:
        if load_dotenv(dotenv_path=str(ENV_PATH), override=True, encoding=enc):
            if os.getenv("OPENAI_API_KEY"):
                loaded = True
                break
    except Exception:
        pass
if not loaded:
    raise RuntimeError(
        f"Could not read OPENAI_API_KEY from {ENV_PATH}. "
        "Re-save the file as UTF-8 (no quotes) with content:\nOPENAI_API_KEY=sk-..."
    )

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# === Inputs ===
if not IN_PATH.exists():
    raise FileNotFoundError(f"Missing input: {IN_PATH}\nRun fuse_infer_generate.py first.")
BASE_OUT.mkdir(parents=True, exist_ok=True)

# === Config: make at most N calls ===
MAX_REQUESTS = 10
MODEL = "gpt-4o-mini"
SLEEP_BETWEEN_CALLS = 2.0   # seconds, gentle throttle
MAX_RETRIES = 2              # per request
BACKOFF = [2, 4]             # on 429

system_prompt = (
    "You are a clinical assistant. Summarize EMR outputs for a clinician in 5–7 sentences. "
    "Start with quick patient context, describe current severity, 5-week trend, any key abnormalities, and alerts. "
    "Use concise, neutral, non-alarming language; avoid definitive diagnoses; suggest reasonable next steps. "
    "Use calibrated language (e.g., 'suggests', 'consider'); do not include PII beyond patient_id."
)

# --- Read fusion NDJSON and pick up to 10 records ---
records = []
with open(IN_PATH, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            records.append(json.loads(line))
        except Exception:
            continue

if not records:
    raise RuntimeError("No records found in fusion.ndjson")

# Option A: take the first 10
# subset = records[:MAX_REQUESTS]

# Option B (default here): random sample 10 to get variety
random.seed(42)
subset = random.sample(records, min(MAX_REQUESTS, len(records)))

print(f"Generating summaries for {len(subset)} patients…")

def one_summary(rec):
    pid = rec.get("patient_id", "unknown")
    snapshot = rec.get("snapshot") or {}
    history  = rec.get("history")  or {}
    alerts   = rec.get("alerts")   or []

    user_prompt = (
        f"Patient ID: {pid}\n"
        f"Current severity: {snapshot.get('severity_numeric')} ({snapshot.get('severity_label')})\n"
        f"Trend: {history.get('trend')}\n"
        f"Class probabilities: {history.get('proba')}\n"
        f"Alerts: {alerts}\n"
        "Summarize for a clinician."
    )

    # retry loop (handles rate limits briefly)
    last_err = None
    for attempt in range(1, MAX_RETRIES + 2):  # e.g., attempts = 1..3 if MAX_RETRIES=2
        try:
            resp = client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user",   "content": user_prompt},
                ],
                temperature=0.2,
            )
            text = (resp.choices[0].message.content or "").strip()
            return pid, text, None
        except Exception as e:
            msg = str(e)
            last_err = msg
            # If it's an insufficient_quota error, no point in retrying more than once
            if "insufficient_quota" in msg or "You exceeded your current quota" in msg:
                return pid, None, "insufficient_quota"
            # If it's a 429 rate limit, back off gently
            if "429" in msg or "Rate limit" in msg or "rate" in msg.lower():
                sleep_for = BACKOFF[min(attempt-1, len(BACKOFF)-1)]
                print(f"[429] Sleeping {sleep_for}s (attempt {attempt}/{MAX_RETRIES+1})")
                time.sleep(sleep_for)
                continue
            # Other transient errors: short sleep and retry
            time.sleep(1.0)
            continue
    return pid, None, last_err or "unknown_error"

written = 0
failed  = 0

with open(OUT_PATH, "w", encoding="utf-8") as fout:
    for i, rec in enumerate(subset, start=1):
        pid, text, err = one_summary(rec)
        if err == "insufficient_quota":
            print("Stopped: insufficient quota on your OpenAI account.")
            break
        if text:
            fout.write(f"[{pid}]\n{text}\n\n")
            written += 1
            print(f"✓ {i}/{len(subset)}  -> wrote summary for patient {pid}")
        else:
            failed += 1
            print(f"✗ {i}/{len(subset)}  -> failed for patient {pid}: {err}")
        time.sleep(SLEEP_BETWEEN_CALLS)

print(f"\nDone. Wrote {written} summaries, failed {failed}.")
print(f"Output: {OUT_PATH}")


In [None]:
# summarize_10.py — make only 10 LLM requests

import os, json, time, math
from pathlib import Path
from dotenv import load_dotenv
from openai import OpenAI, RateLimitError, APIConnectionError, APIStatusError

# ---------- paths ----------
ROOT = Path(r"C:\Users\aayus\Downloads\emr-smart")
ENV_PATH = ROOT / ".env"
BASE_OUT = ROOT / "outputs"
IN_PATH  = BASE_OUT / "fusion.ndjson"
OUT_PATH = BASE_OUT / "summaries.txt"

# ---------- load API key (robust encodings) ----------
if not ENV_PATH.exists():
    raise RuntimeError(f".env not found at: {ENV_PATH}\nPut: OPENAI_API_KEY=sk-...")

loaded = False
for enc in ("utf-8", "utf-8-sig", "utf-16", "utf-16-le", "utf-16-be"):
    try:
        if load_dotenv(dotenv_path=str(ENV_PATH), override=True, encoding=enc):
            if os.getenv("OPENAI_API_KEY"):
                loaded = True
                break
    except Exception:
        pass
if not loaded:
    raise RuntimeError("Could not read OPENAI_API_KEY from .env (save as UTF-8, no quotes).")

api_key = os.getenv("OPENAI_API_KEY")
if not api_key or not api_key.strip():
    raise RuntimeError("OPENAI_API_KEY missing after load. Check .env.")

client = OpenAI(api_key=api_key)

# ---------- inputs ----------
if not IN_PATH.exists():
    raise FileNotFoundError(f"Missing input file: {IN_PATH} (run fuse_infer_generate.py first)")
BASE_OUT.mkdir(parents=True, exist_ok=True)

# ---------- prompts ----------
SYSTEM = (
    "You are a clinical assistant. Summarize EMR outputs for a clinician in ~5–7 sentences. "
    "Start with quick patient context, describe current severity, 5-week trend, key abnormal vitals, and any alerts. "
    "Use neutral, concise language; avoid diagnoses; suggest actions with calibrated terms (e.g., 'consider'). "
    "Do not include PII beyond patient_id."
)

def build_user_prompt(rec: dict) -> str:
    pid = rec.get("patient_id", "unknown")
    snapshot = rec.get("snapshot") or {}
    history  = rec.get("history")  or {}
    alerts   = rec.get("alerts")   or []
    return (
        f"Patient ID: {pid}\n"
        f"Current severity: {snapshot.get('severity_numeric')} ({snapshot.get('severity_label')})\n"
        f"Trend: {history.get('trend')}\n"
        f"Class probabilities: {history.get('proba')}\n"
        f"Alerts: {alerts}\n"
        "Write one concise, clinician-facing paragraph (≤150 words). "
        "End with: 'This supports decisions and is not a diagnosis.'"
    )

# ---------- small retry wrapper (max 3) ----------
def ask_llm(prompt: str, max_retries: int = 3):
    delay = 2.0
    for attempt in range(1, max_retries + 1):
        try:
            resp = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": SYSTEM},
                    {"role": "user",   "content": prompt},
                ],
                temperature=0.2,
            )
            return (resp.choices[0].message.content or "").strip()
        except RateLimitError:
            if attempt == max_retries:
                return f"[LLM skipped after rate limit attempts: {attempt}]"
            time.sleep(delay)
            delay *= 2
        except (APIConnectionError, APIStatusError) as e:
            if attempt == max_retries:
                return f"[LLM skipped due to API error: {getattr(e, 'message', str(e))}]"
            time.sleep(delay)
            delay *= 2
        except Exception as e:
            # Any other unexpected error — don’t crash the run
            return f"[LLM skipped due to unexpected error: {e}]"

# ---------- run for only 10 records ----------
MAX_REQUESTS = 10
written = 0

with open(IN_PATH, "r", encoding="utf-8") as fin, open(OUT_PATH, "w", encoding="utf-8") as fout:
    for line_idx, line in enumerate(fin, start=1):
        if written >= MAX_REQUESTS:
            break
        line = line.strip()
        if not line:
            continue
        try:
            rec = json.loads(line)
        except json.JSONDecodeError:
            continue

        pid = rec.get("patient_id", f"row_{line_idx}")
        prompt = build_user_prompt(rec)
        text = ask_llm(prompt, max_retries=3)

        fout.write(f"[{pid}]\n{text}\n\n")
        written += 1
        # tiny spacing to be gentle even when not rate-limited
        time.sleep(0.5)

print(f"Made {written} requests. Wrote summaries → {OUT_PATH}")


In [None]:
# summarize_10.py — make at most 10 LLM requests; fallback when rate limited

import os, json, time, csv
from pathlib import Path
from dotenv import load_dotenv
from openai import OpenAI
from openai import OpenAIError, RateLimitError

ROOT = Path(r"C:\Users\aayus\Downloads\emr-smart")
ENV_PATH = ROOT / ".env"
BASE_OUT = ROOT / "outputs"
IN_PATH  = BASE_OUT / "fusion.ndjson"
OUT_PATH = BASE_OUT / "summaries.csv"

# ---------- robust .env load ----------
if not ENV_PATH.exists():
    raise RuntimeError(f".env not found at: {ENV_PATH}\nPut: OPENAI_API_KEY=sk-...")

loaded = False
for enc in ("utf-8", "utf-8-sig", "utf-16", "utf-16-le", "utf-16-be"):
    try:
        if load_dotenv(dotenv_path=str(ENV_PATH), override=True, encoding=enc):
            if os.getenv("OPENAI_API_KEY"):
                loaded = True
                break
    except Exception:
        pass
if not loaded:
    raise RuntimeError(
        f"Could not read OPENAI_API_KEY from {ENV_PATH}. "
        "Re-save the file as UTF-8 (no quotes) with content:\nOPENAI_API_KEY=sk-..."
    )

api_key = os.getenv("OPENAI_API_KEY", "").strip()
if not api_key:
    raise RuntimeError("OPENAI_API_KEY not loaded; fix your .env and restart the kernel.")

client = OpenAI(api_key=api_key)

# ---------- input checks ----------
if not IN_PATH.exists():
    raise FileNotFoundError(f"Missing input: {IN_PATH}. Run your fuse script first.")
BASE_OUT.mkdir(parents=True, exist_ok=True)

# ---------- prompts ----------
SYSTEM_PROMPT = (
    "You are a clinical assistant. Summarize EMR outputs for a clinician in 5–7 sentences. "
    "Open with quick context; describe current severity, 5-week trend, notable abnormalities, and any alerts. "
    "Use concise, neutral, non-alarmist language and calibrated phrasing (e.g., 'suggests', 'consider'). "
    "Avoid diagnoses and prescriptions. No PII beyond patient_id."
)

def fallback_summary(rec: dict) -> str:
    """Build a safe paragraph if the API is unavailable."""
    pid = rec.get("patient_id", "unknown")
    snap = rec.get("snapshot") or {}
    hist = rec.get("history") or {}
    alerts = rec.get("alerts") or []
    sev_val = snap.get("severity_numeric")
    sev_lab = snap.get("severity_label") or "Unknown"
    trend   = hist.get("trend") or "Unknown"
    proba   = hist.get("proba") or {}
    p_imp = proba.get("Improving")
    p_sta = proba.get("Stable")
    p_wor = proba.get("Worsening")

    parts = [
        f"Patient {pid}: current severity ≈ {sev_val if sev_val is not None else 'N/A'} ({sev_lab}).",
        f"Five-week trend classified as {trend}.",
        f"Class probabilities — Improving: {p_imp if p_imp is not None else 'N/A'}, Stable: {p_sta if p_sta is not None else 'N/A'}, Worsening: {p_wor if p_wor is not None else 'N/A'}.",
    ]
    if alerts:
        parts.append("Alerts: " + "; ".join(alerts) + ".")
    parts.append("Overall, findings suggest the above pattern and should be interpreted alongside clinical judgment. This supports decisions and is not a diagnosis.")
    return " ".join(parts)

def summarize_with_llm(rec: dict) -> str:
    """Try OpenAI once with short output. If 429/other error, raise for caller to fallback."""
    pid = rec.get("patient_id", "unknown")
    snap = rec.get("snapshot") or {}
    hist = rec.get("history") or {}
    alerts = rec.get("alerts") or []

    user_prompt = (
        f"Patient ID: {pid}\n"
        f"Current severity: {snap.get('severity_numeric')} ({snap.get('severity_label')})\n"
        f"Trend: {hist.get('trend')}\n"
        f"Class probabilities: {hist.get('proba')}\n"
        f"Alerts: {alerts}\n"
        "Write one concise paragraph (≤150 words) for a clinician."
    )

    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user",   "content": user_prompt},
        ],
        temperature=0.2,
        max_tokens=220,  # short output to reduce token usage
        n=1,
    )
    return (resp.choices[0].message.content or "").strip()

# ---------- main: only 10 requests ----------
rows = []
with open(IN_PATH, "r", encoding="utf-8") as f:
    all_lines = [ln for ln in f if ln.strip()]

N = min(10, len(all_lines))
print(f"Found {len(all_lines)} fusion records. Will summarize first {N}.")

for i in range(N):
    rec = json.loads(all_lines[i])
    pid = rec.get("patient_id", f"row_{i+1}")

    try:
        text = summarize_with_llm(rec)
    except RateLimitError:
        # If you are out of quota or rate-limited, use a clear fallback
        text = fallback_summary(rec)
    except OpenAIError:
        text = fallback_summary(rec)
    except Exception:
        text = fallback_summary(rec)

    rows.append({"patient_id": pid, "summary": text})

    # gentle pacing to avoid bursts; tweak as needed
    time.sleep(1.0)

# write CSV
with open(OUT_PATH, "w", newline="", encoding="utf-8") as fout:
    writer = csv.DictWriter(fout, fieldnames=["patient_id", "summary"])
    writer.writeheader()
    writer.writerows(rows)

print(f"Wrote {len(rows)} summaries -> {OUT_PATH}")


In [13]:
# --- EMR Summaries: single-call LLM with automatic local fallback ---

import os, json, math
from pathlib import Path
import pandas as pd

ROOT      = Path(r"C:\Users\aayus\Downloads\emr-smart")
OUT_DIR   = ROOT / "outputs"
IN_NDJSON = OUT_DIR / "fusion.ndjson"
OUT_CSV   = OUT_DIR / "summaries.csv"

# how many summaries to produce now
MAX_RECORDS = 10          # keep small while testing
BATCH_SIZE  = 10          # all 10 in a single LLM call
MODEL       = "gpt-4o-mini"

# ---------- helpers ----------
def local_summarize_one(rec: dict) -> str:
    """Deterministic, no-LLM fallback summary."""
    pid = rec.get("patient_id","unknown")
    hist = rec.get("history") or {}
    snap = rec.get("snapshot") or {}
    alerts = rec.get("alerts") or []

    sev_num = snap.get("severity_numeric", None)
    sev_lab = snap.get("severity_label", "")
    trend   = hist.get("trend", "")
    proba   = hist.get("proba", {})

    # pick top class prob
    if isinstance(proba, dict) and proba:
        top_class = max(proba.items(), key=lambda kv: kv[1])[0]
        top_prob  = max(proba.values())
    else:
        top_class, top_prob = "", None

    bits = []
    bits.append(f"Patient {pid}:")
    if sev_num is not None:
        bits.append(f"current severity ≈ {sev_num:.2f} ({sev_lab or 'Unknown'}).")
    else:
        bits.append(f"current severity not available.")
    if trend:
        bits.append(f"5-week trend: {trend}.")
    if top_class:
        if top_prob is not None:
            bits.append(f"Model confidence leans {top_class} (~{top_prob:.3f}).")
        else:
            bits.append(f"Model confidence leans {top_class}.")
    if alerts:
        bits.append("Alerts: " + "; ".join(alerts) + ".")
    # simple guidance text
    if sev_lab == "High" or trend == "Worsening":
        bits.append("Consider closer monitoring, revisit medications, or additional evaluation as clinically indicated.")
    else:
        bits.append("Continue routine care; monitor for changes.")

    bits.append("This supports decisions and is not a diagnosis.")
    return " ".join(bits)

def load_fusion_rows(path: Path, limit: int):
    recs = []
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= limit: break
            line = line.strip()
            if not line: continue
            recs.append(json.loads(line))
    return recs

def to_rows_for_csv(records, texts):
    rows = []
    for rec, txt in zip(records, texts):
        rows.append({"patient_id": rec.get("patient_id","unknown"), "summary": txt})
    return rows

# ---------- main ----------
if not IN_NDJSON.exists():
    raise FileNotFoundError(f"Missing {IN_NDJSON}. Run your fuse_infer_generate step first.")

records = load_fusion_rows(IN_NDJSON, MAX_RECORDS)
print(f"Loaded {len(records)} fusion records.")

# Try a single batched OpenAI call (optional, only if key present)
from dotenv import load_dotenv
load_dotenv(dotenv_path=str(ROOT/".env"))

api_key = os.getenv("OPENAI_API_KEY", "").strip()
use_llm = bool(api_key)

all_summaries = []

if use_llm:
    try:
        # Prepare one user prompt that includes up to BATCH_SIZE records
        batches = [records[i:i+BATCH_SIZE] for i in range(0, len(records), BATCH_SIZE)]
        from openai import OpenAI
        client = OpenAI(api_key=api_key)

        for batch in batches:
            # single prompt for multiple patients
            lines = []
            for rec in batch:
                pid = rec.get("patient_id","unknown")
                hist = rec.get("history") or {}
                snap = rec.get("snapshot") or {}
                alerts = rec.get("alerts") or []

                sev_num = snap.get("severity_numeric", None)
                sev_lab = snap.get("severity_label", "")
                trend   = hist.get("trend", "")
                proba   = hist.get("proba", {})

                lines.append(json.dumps({
                    "patient_id": pid,
                    "severity_numeric": sev_num,
                    "severity_label": sev_lab,
                    "trend": trend,
                    "proba": proba,
                    "alerts": alerts
                }, ensure_ascii=False))

            user_prompt = (
                "You are a clinical assistant. For each JSON line below, write ONE concise 5–7 sentence paragraph "
                "for clinicians: start with patient context, state current severity and 5-week trend, comment on class "
                "probabilities (confidence) if useful, include any alerts, and end with "
                "'This supports decisions and is not a diagnosis.'\n\n"
                "Return output as NDJSON where each line = "
                "{\"patient_id\":\"...\",\"summary\":\"...\"} in the same order.\n\n"
                "INPUT LINES:\n" + "\n".join(lines)
            )

            resp = client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role":"system","content":"Be concise, neutral, and clinically appropriate; avoid prescriptions."},
                    {"role":"user","content": user_prompt}
                ],
                temperature=0.2,
                max_tokens=900,   # enough for ~10 short paragraphs
            )
            text = (resp.choices[0].message.content or "").strip()

            # parse NDJSON back
            for ln in text.splitlines():
                ln = ln.strip()
                if not ln:
                    continue
                try:
                    obj = json.loads(ln)
                    all_summaries.append(obj.get("summary","").strip())
                except Exception:
                    # if parsing fails, fall back for that line
                    idx = len(all_summaries)
                    if idx < len(batch):
                        all_summaries.append(local_summarize_one(batch[idx]))
                    else:
                        all_summaries.append("Summary unavailable (parse error).")
        # safety: if lengths mismatch, fill with local
        if len(all_summaries) < len(records):
            for i in range(len(all_summaries), len(records)):
                all_summaries.append(local_summarize_one(records[i]))

    except Exception as e:
        print(f"[LLM unavailable: {e}] Falling back to local summaries.")
        all_summaries = [local_summarize_one(r) for r in records]
else:
    print("[No OPENAI_API_KEY] Using local summaries.")
    all_summaries = [local_summarize_one(r) for r in records]

# save
OUT_DIR.mkdir(parents=True, exist_ok=True)
rows = to_rows_for_csv(records, all_summaries)
pd.DataFrame(rows).to_csv(OUT_CSV, index=False, encoding="utf-8")
print(f"Wrote {len(rows)} summaries -> {OUT_CSV}")

# show a preview
for i in range(min(3, len(rows))):
    print(f"\n[{rows[i]['patient_id']}]\n{rows[i]['summary']}\n")


Loaded 10 fusion records.


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

In [14]:
import os
from pathlib import Path
from dotenv import load_dotenv

ROOT = Path(r"C:\Users\aayus\Downloads\emr-smart")
assert (ROOT/".env").exists(), ".env not found where expected"

# try multiple encodings in case an editor added a BOM
loaded = False
for enc in ("utf-8", "utf-8-sig"):
    if load_dotenv(dotenv_path=str(ROOT/".env"), override=True, encoding=enc):
        if os.getenv("OPENAI_API_KEY"):
            loaded = True
            break
assert loaded, "Could not load OPENAI_API_KEY from .env"

key = os.getenv("OPENAI_API_KEY")
print("Loaded OPENAI_API_KEY:", key[:10] + "..." if key else None)


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

In [1]:
import os
k = os.getenv("OPENAI_API_KEY")
print("Key detected?" , bool(k), "| length:", len(k) if k else 0)


Key detected? True | length: 164


In [2]:
# --- Summarize up to 10 patients with gentle pacing to avoid 429s ---

import os, json, time, csv
from pathlib import Path
from openai import OpenAI

ROOT = Path(r"C:\Users\aayus\Downloads\emr-smart")
IN_PATH  = ROOT / "outputs" / "fusion.ndjson"
OUT_CSV  = ROOT / "outputs" / "summaries.csv"
OUT_TXT  = ROOT / "outputs" / "summaries.txt"

# limits/pacing
LIMIT = 10          # <= change this later (e.g., 25, 50) once it works
SLEEP_BETWEEN = 3   # seconds between requests to be gentle

# 1) sanity checks
if not IN_PATH.exists():
    raise FileNotFoundError(f"Missing {IN_PATH}. Run fuse_infer_generate.py first.")

api_key = os.getenv("OPENAI_API_KEY", "").strip()
if not api_key:
    raise RuntimeError("OPENAI_API_KEY not visible in this kernel. Restart kernel or Windows session if needed.")

client = OpenAI(api_key=api_key)

system_prompt = (
    "You are a clinical assistant. Summarize EMR outputs for a clinician in 5–7 sentences. "
    "Include patient context, current severity (numeric + label if present), 5-week trend, key risks, and alerts. "
    "Use concise, neutral, non-alarming language and calibrated phrasing ('suggests', 'consider'); "
    "avoid diagnoses or prescriptions; no PII beyond patient_id."
)

def build_user_prompt(rec: dict) -> str:
    pid = rec.get("patient_id", "unknown")
    snap = rec.get("snapshot") or {}
    hist = rec.get("history")  or {}
    alerts = rec.get("alerts") or []

    return (
        f"Patient ID: {pid}\n"
        f"Current severity: {snap.get('severity_numeric')} ({snap.get('severity_label')})\n"
        f"Trend: {hist.get('trend')}\n"
        f"Class probabilities: {hist.get('proba')}\n"
        f"Alerts: {alerts}\n"
        "Write one concise paragraph for a clinician."
    )

# 2) load first N fusion rows
fusion = []
with open(IN_PATH, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= LIMIT: break
        line=line.strip()
        if not line: continue
        fusion.append(json.loads(line))

print(f"Loaded {len(fusion)} fusion records (target {LIMIT}).")

# 3) make outputs dir and open writers
OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
csv_rows = []
txt_chunks = []

def call_llm(user_prompt: str) -> str:
    """One gentle call to the API. On 429, return a clear placeholder."""
    try:
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user",   "content": user_prompt},
            ],
            temperature=0.2,
            max_tokens=220,  # short, to reduce costs/rate pressure
        )
        return (resp.choices[0].message.content or "").strip()
    except Exception as e:
        # If it's a rate limit or any other error, return a short stub
        return f"Summary unavailable (API error: {type(e).__name__})."

# 4) loop with gentle pacing
for i, rec in enumerate(fusion, start=1):
    pid = rec.get("patient_id", f"row_{i}")
    prompt = build_user_prompt(rec)
    text = call_llm(prompt)

    # collect outputs
    csv_rows.append({"patient_id": pid, "summary": text})
    txt_chunks.append(f"[{pid}]\n{text}\n")

    print(f"[{pid}] done.")
    time.sleep(SLEEP_BETWEEN)

# 5) write CSV + TXT
with open(OUT_CSV, "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=["patient_id","summary"])
    w.writeheader()
    for r in csv_rows: w.writerow(r)

with open(OUT_TXT, "w", encoding="utf-8") as f:
    f.write("\n".join(txt_chunks))

print("Saved:")
print(" ", OUT_CSV)
print(" ", OUT_TXT)


Loaded 10 fusion records (target 10).
[1] done.
[2] done.
[3] done.
[4] done.
[5] done.
[6] done.
[7] done.
[8] done.
[9] done.
[10] done.


PermissionError: [Errno 13] Permission denied: 'C:\\Users\\aayus\\Downloads\\emr-smart\\outputs\\summaries.csv'

In [3]:
# --- Summarize up to 10 patients with gentle pacing to avoid 429s ---

import os, json, time, csv
from pathlib import Path
from openai import OpenAI

ROOT = Path(r"C:\Users\aayus\Downloads\emr-smart")
IN_PATH  = ROOT / "outputs" / "fusion.ndjson"
OUT_CSV  = ROOT / "outputs" / "summaries.csv"
OUT_TXT  = ROOT / "outputs" / "summaries.txt"

# limits/pacing
LIMIT = 10          # <= change this later (e.g., 25, 50) once it works
SLEEP_BETWEEN = 3   # seconds between requests to be gentle

# 1) sanity checks
if not IN_PATH.exists():
    raise FileNotFoundError(f"Missing {IN_PATH}. Run fuse_infer_generate.py first.")

api_key = os.getenv("OPENAI_API_KEY", "").strip()
if not api_key:
    raise RuntimeError("OPENAI_API_KEY not visible in this kernel. Restart kernel or Windows session if needed.")

client = OpenAI(api_key=api_key)

system_prompt = (
    "You are a clinical assistant. Summarize EMR outputs for a clinician in 5–7 sentences. "
    "Include patient context, current severity (numeric + label if present), 5-week trend, key risks, and alerts. "
    "Use concise, neutral, non-alarming language and calibrated phrasing ('suggests', 'consider'); "
    "avoid diagnoses or prescriptions; no PII beyond patient_id."
)

def build_user_prompt(rec: dict) -> str:
    pid = rec.get("patient_id", "unknown")
    snap = rec.get("snapshot") or {}
    hist = rec.get("history")  or {}
    alerts = rec.get("alerts") or []

    return (
        f"Patient ID: {pid}\n"
        f"Current severity: {snap.get('severity_numeric')} ({snap.get('severity_label')})\n"
        f"Trend: {hist.get('trend')}\n"
        f"Class probabilities: {hist.get('proba')}\n"
        f"Alerts: {alerts}\n"
        "Write one concise paragraph for a clinician."
    )

# 2) load first N fusion rows
fusion = []
with open(IN_PATH, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= LIMIT: break
        line=line.strip()
        if not line: continue
        fusion.append(json.loads(line))

print(f"Loaded {len(fusion)} fusion records (target {LIMIT}).")

# 3) make outputs dir and open writers
OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
csv_rows = []
txt_chunks = []

def call_llm(user_prompt: str) -> str:
    """One gentle call to the API. On 429, return a clear placeholder."""
    try:
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user",   "content": user_prompt},
            ],
            temperature=0.2,
            max_tokens=220,  # short, to reduce costs/rate pressure
        )
        return (resp.choices[0].message.content or "").strip()
    except Exception as e:
        # If it's a rate limit or any other error, return a short stub
        return f"Summary unavailable (API error: {type(e).__name__})."

# 4) loop with gentle pacing
for i, rec in enumerate(fusion, start=1):
    pid = rec.get("patient_id", f"row_{i}")
    prompt = build_user_prompt(rec)
    text = call_llm(prompt)

    # collect outputs
    csv_rows.append({"patient_id": pid, "summary": text})
    txt_chunks.append(f"[{pid}]\n{text}\n")

    print(f"[{pid}] done.")
    time.sleep(SLEEP_BETWEEN)

# 5) write CSV + TXT
with open(OUT_CSV, "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=["patient_id","summary"])
    w.writeheader()
    for r in csv_rows: w.writerow(r)

with open(OUT_TXT, "w", encoding="utf-8") as f:
    f.write("\n".join(txt_chunks))

print("Saved:")
print(" ", OUT_CSV)
print(" ", OUT_TXT)


Loaded 10 fusion records (target 10).
[1] done.
[2] done.
[3] done.
[4] done.
[5] done.
[6] done.
[7] done.
[8] done.
[9] done.
[10] done.


PermissionError: [Errno 13] Permission denied: 'C:\\Users\\aayus\\Downloads\\emr-smart\\outputs\\summaries.csv'

In [4]:
# --- EMR Summaries: robust writer with 10-record cap and .env handling ---

import os, json, csv, time, tempfile, shutil
from datetime import datetime
from pathlib import Path

# ---- paths ----
ROOT = Path(r"C:\Users\aayus\Downloads\emr-smart")
OUT_DIR = ROOT / "outputs"
IN_PATH  = OUT_DIR / "fusion.ndjson"
OUT_CSV  = OUT_DIR / "summaries.csv"
OUT_TXT  = OUT_DIR / "summaries.txt"

OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---- load .env robustly (utf-8 or utf-8-sig) ----
from dotenv import load_dotenv
loaded = False
for enc in ("utf-8", "utf-8-sig"):
    try:
        if load_dotenv(dotenv_path=str(ROOT/".env"), override=True, encoding=enc):
            if os.getenv("OPENAI_API_KEY"):
                loaded = True
                break
    except Exception:
        pass

api_key = (os.getenv("OPENAI_API_KEY") or "").strip()
use_llm = bool(api_key)

# ---- read up to 10 fusion records ----
if not IN_PATH.exists():
    raise FileNotFoundError(f"Missing input: {IN_PATH}. Run fuse_infer_generate.py first.")

records = []
with open(IN_PATH, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        records.append(json.loads(line))
        if len(records) >= 10:  # cap at 10 requests
            break

print(f"Loaded {len(records)} fusion records (target 10).")

# ---- OpenAI client (optional) ----
client = None
if use_llm:
    from openai import OpenAI
    client = OpenAI(api_key=api_key)

system_prompt = (
    "You are a clinical assistant. Summarize EMR outputs for a clinician in 5–7 sentences. "
    "Start with quick patient context, describe current severity, 5-week trend, key abnormal vitals, and any alerts. "
    "Use concise, neutral, non-alarming language and avoid definitive diagnoses; suggest clinical actions when appropriate. "
    "Use calibrated language (e.g., 'suggests', 'consider'); do not include PII beyond patient_id."
)

def build_user_prompt(rec):
    pid = rec.get("patient_id", "unknown")
    snapshot = rec.get("snapshot") or {}
    history  = rec.get("history")  or {}
    alerts   = rec.get("alerts")   or []
    return (
        f"Patient ID: {pid}\n"
        f"Current severity: {snapshot.get('severity_numeric')} ({snapshot.get('severity_label')})\n"
        f"Trend: {history.get('trend')}\n"
        f"Class probabilities: {history.get('proba')}\n"
        f"Alerts: {alerts}\n"
        "Summarize for a clinician (5–7 sentences)."
    )

def local_template_summary(rec):
    """Fallback when no API key or rate-limited."""
    pid = rec.get("patient_id","unknown")
    snapshot = rec.get("snapshot") or {}
    history  = rec.get("history")  or {}
    alerts   = rec.get("alerts")   or []
    sev = snapshot.get("severity_label") or "N/A"
    sevn = snapshot.get("severity_numeric")
    trend = history.get("trend") or "N/A"
    probs = history.get("proba") or {}
    alert_txt = "; ".join(alerts) if alerts else "No immediate alerts."
    return (
        f"Patient {pid}: Current severity {sevn} ({sev}). "
        f"Five-week trend classified as {trend}. "
        f"Model confidence — Improving: {probs.get('Improving','-')}, Stable: {probs.get('Stable','-')}, Worsening: {probs.get('Worsening','-')}. "
        f"Alerts: {alert_txt}. "
        "Consider reviewing vitals evolution and corroborating with clinical notes; this is decision support, not a diagnosis."
    )

summaries = []

# ---- generate summaries (with graceful fallback) ----
for i, rec in enumerate(records, start=1):
    try:
        if client:
            resp = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user",   "content": build_user_prompt(rec)},
                ],
                temperature=0.2,
            )
            text = (resp.choices[0].message.content or "").strip()
        else:
            text = local_template_summary(rec)
        summaries.append({"patient_id": rec.get("patient_id","unknown"), "summary": text})
        print(f"[{i}] done.")
        # Tiny pause to be gentle with API, even for 10
        time.sleep(0.3)
    except Exception as e:
        # fallback for rate limits or any other error
        text = local_template_summary(rec)
        summaries.append({"patient_id": rec.get("patient_id","unknown"),
                          "summary": text + " (Note: LLM fallback used due to API error.)"})
        print(f"[{i}] fallback used -> {e.__class__.__name__}: {e}")

# ---- safe write helpers (atomic with fallback filename on Windows lock) ----
def safe_write_text(path: Path, content: str) -> Path:
    path = Path(path)
    try:
        with tempfile.NamedTemporaryFile("w", delete=False, dir=str(path.parent), encoding="utf-8", newline="") as tmp:
            tmp.write(content)
            tmp_name = tmp.name
        try:
            os.replace(tmp_name, path)  # atomic on Windows when possible
            return path
        except PermissionError:
            # destination is locked, write timestamped file
            ts = datetime.now().strftime("%Y%m%d_%H%M%S")
            alt = path.with_name(f"{path.stem}_{ts}{path.suffix}")
            shutil.move(tmp_name, alt)
            return alt
    except PermissionError:
        # parent folder permission or file lock before temp write
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        alt = path.with_name(f"{path.stem}_{ts}{path.suffix}")
        with open(alt, "w", encoding="utf-8", newline="") as f:
            f.write(content)
        return alt

def safe_write_csv(path: Path, rows: list[dict], fieldnames: list[str]) -> Path:
    path = Path(path)
    # write to temp first
    try:
        with tempfile.NamedTemporaryFile("w", delete=False, dir=str(path.parent), encoding="utf-8", newline="") as tmp:
            w = csv.DictWriter(tmp, fieldnames=fieldnames)
            w.writeheader()
            for r in rows:
                w.writerow(r)
            tmp_name = tmp.name
        try:
            os.replace(tmp_name, path)
            return path
        except PermissionError:
            ts = datetime.now().strftime("%Y%m%d_%H%M%S")
            alt = path.with_name(f"{path.stem}_{ts}{path.suffix}")
            shutil.move(tmp_name, alt)
            return alt
    except PermissionError:
        # parent or temp blocked — write to timestamped directly
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        alt = path.with_name(f"{path.stem}_{ts}{path.suffix}")
        with open(alt, "w", encoding="utf-8", newline="") as f:
            w = csv.DictWriter(f, fieldnames=fieldnames)
            w.writeheader()
            for r in rows:
                w.writerow(r)
        return alt

# ---- write outputs (atomic, with fallback) ----
csv_path = safe_write_csv(OUT_CSV, summaries, fieldnames=["patient_id","summary"])
txt_blob = "\n\n".join(f"[{s['patient_id']}]\n{s['summary']}" for s in summaries)
txt_path = safe_write_text(OUT_TXT, txt_blob)

print("Wrote:")
print(" -", csv_path)
print(" -", txt_path)
print("Done.")


Loaded 10 fusion records (target 10).
[1] fallback used -> RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
[2] fallback used -> RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
[3] fallback used -> RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-erro

In [5]:
pip install google-generativeai python-dotenv


Defaulting to user installation because normal site-packages is not writeable
Collecting google-generativeai
  Downloading google_generativeai-0.8.5-py3-none-any.whl.metadata (3.9 kB)
Collecting google-ai-generativelanguage==0.6.15 (from google-generativeai)
  Downloading google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Collecting google-api-core (from google-generativeai)
  Downloading google_api_core-2.25.1-py3-none-any.whl.metadata (3.0 kB)
Collecting google-api-python-client (from google-generativeai)
  Downloading google_api_python_client-2.182.0-py3-none-any.whl.metadata (7.0 kB)
Collecting google-auth>=2.15.0 (from google-generativeai)
  Downloading google_auth-2.40.3-py2.py3-none-any.whl.metadata (6.2 kB)
Collecting proto-plus<2.0.0dev,>=1.22.3 (from google-ai-generativelanguage==0.6.15->google-generativeai)
  Downloading proto_plus-1.26.1-py3-none-any.whl.metadata (2.2 kB)
Collecting googleapis-common-protos<2.0.0,>=1.56.2 (from google-api-core->google-ge



In [7]:
# --- EMR LLM Summaries via Google Gemini (10 requests) ---
# Requires: pip install google-generativeai python-dotenv
# Files:
#   - Input : C:\Users\aayus\Downloads\emr-smart\outputs\fusion.ndjson
#   - Output: C:\Users\aayus\Downloads\emr-smart\outputs\summaries_gemini.csv / .txt

import os, json, time, csv
from pathlib import Path
from dotenv import load_dotenv

# ---------- paths ----------
ROOT = Path(r"C:\Users\aayus\Downloads\emr-smart")
OUT_DIR = ROOT / "outputs"
IN_PATH  = OUT_DIR / "fusion.ndjson"
CSV_PATH = OUT_DIR / "summaries_gemini.csv"
TXT_PATH = OUT_DIR / "summaries_gemini.txt"

OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---------- load .env robustly ----------
ENV_PATH = ROOT / ".env"
loaded = False
for enc in ("utf-8", "utf-8-sig"):
    try:
        if load_dotenv(dotenv_path=str(ENV_PATH), override=True, encoding=enc):
            if os.getenv("GEMINI_API_KEY"):
                loaded = True
                break
    except Exception:
        pass
if not loaded:
    raise RuntimeError(
        f"Could not read GEMINI_API_KEY from {ENV_PATH}. "
        "Create a UTF-8 .env with:\nGEMINI_API_KEY=AIzaSyDkhQyzvDh4WCN5UGAvuYQFu3RpDsUKlqM"
    )

GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "").strip()
GEMINI_MODEL   = os.getenv("GEMINI_MODEL", "gemini-2.0-flash").strip()  # change to 'gemini-2.5-flash' if your key supports it

# ---------- import & configure gemini ----------
import google.generativeai as genai
genai.configure(api_key=GEMINI_API_KEY)

# Build model with a system instruction
SYSTEM = (
    "You are a clinical assistant. Summarize EMR outputs for a clinician in 5–7 sentences. "
    "Start with patient context, describe current severity (with label), 5-week trend, any notable vitals, and the alerts. "
    "Use neutral, calibrated language (e.g., 'suggests', 'consider'), do not diagnose or prescribe, and avoid PII beyond patient_id. "
    "End with: 'This supports decisions and is not a diagnosis.'"
)
model = genai.GenerativeModel(
    model_name=GEMINI_MODEL,
    system_instruction=SYSTEM
)

# ---------- read fusion records ----------
if not IN_PATH.exists():
    raise FileNotFoundError(f"Missing input: {IN_PATH}. Run your fuse_infer_generate step first.")

records = []
with open(IN_PATH, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            records.append(json.loads(line))
        except Exception:
            # skip malformed
            continue

# limit to first 10
BATCH_LIMIT = 10
records = records[:BATCH_LIMIT]
print(f"Loaded {len(records)} fusion records (target {BATCH_LIMIT}).")

# ---------- helper: one summary with retries + fallback ----------
def summarize_one(rec, attempt=0, max_attempts=3):
    pid = rec.get("patient_id", "unknown")
    snapshot = rec.get("snapshot") or {}
    history  = rec.get("history")  or {}
    alerts   = rec.get("alerts")   or []

    user_prompt = (
        f"Patient ID: {pid}\n"
        f"Current severity: {snapshot.get('severity_numeric')} ({snapshot.get('severity_label')})\n"
        f"Trend: {history.get('trend')}\n"
        f"Class probabilities: {history.get('proba')}\n"
        f"Alerts: {alerts}\n"
        "Summarize for a clinician."
    )

    try:
        resp = model.generate_content(
            user_prompt,
            generation_config={
                "temperature": 0.2,
                "max_output_tokens": 300
            }
        )
        text = (resp.text or "").strip()
        if not text:
            raise RuntimeError("Empty response")
        return text, None
    except Exception as e:
        err = f"{type(e).__name__}: {e}"
        # basic backoff
        if attempt + 1 < max_attempts:
            time.sleep(1.5 * (attempt + 1))
            return summarize_one(rec, attempt=attempt+1, max_attempts=max_attempts)
        # fallback summary if API fails
        snapshot_label = snapshot.get("severity_label") or "Unknown"
        trend_label    = history.get("trend") or "Unknown"
        proba_str      = history.get("proba") or {}
        alert_str      = "; ".join(alerts) if alerts else "None"
        fallback = (
            f"Patient {pid}: Current severity appears '{snapshot_label}'. "
            f"Trend over 5 weeks: {trend_label}. "
            f"Class probabilities: {proba_str}. "
            f"Alerts: {alert_str}. "
            "Overall summary generated without LLM due to API limits. "
            "This supports decisions and is not a diagnosis."
        )
        return fallback, err

# ---------- run & collect ----------
rows = []
for i, rec in enumerate(records, start=1):
    text, err = summarize_one(rec)
    if err:
        print(f"[{i}] fallback used -> {err}")
    else:
        print(f"[{i}] done.")
    rows.append({"patient_id": rec.get("patient_id", "unknown"), "summary": text})

# ---------- write outputs (CSV + TXT) ----------
# Use temp files to avoid PermissionError if the file was left open elsewhere
csv_tmp = OUT_DIR / (CSV_PATH.stem + ".tmp.csv")
txt_tmp = OUT_DIR / (TXT_PATH.stem + ".tmp.txt")

# CSV
with open(csv_tmp, "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=["patient_id","summary"])
    w.writeheader()
    w.writerows(rows)
# TXT
with open(txt_tmp, "w", encoding="utf-8") as f:
    for r in rows:
        f.write(f"[{r['patient_id']}]\n{r['summary']}\n\n")

# Replace old files atomically
if CSV_PATH.exists():
    CSV_PATH.unlink(missing_ok=True)
if TXT_PATH.exists():
    TXT_PATH.unlink(missing_ok=True)
csv_tmp.replace(CSV_PATH)
txt_tmp.replace(TXT_PATH)

print("Wrote:")
print(" ", CSV_PATH)
print(" ", TXT_PATH)
print("Done.")


RuntimeError: Could not read GEMINI_API_KEY from C:\Users\aayus\Downloads\emr-smart\.env. Create a UTF-8 .env with:
GEMINI_API_KEY=AIzaSyDkhQyzvDh4WCN5UGAvuYQFu3RpDsUKlqM

In [8]:
# --- EMR LLM Summaries via Google Gemini (robust .env, 10 requests) ---
# Requires: pip install google-generativeai python-dotenv

import os, json, time, csv, io, sys
from pathlib import Path
from dotenv import load_dotenv

# ---------- paths ----------
ROOT = Path(r"C:\Users\aayus\Downloads\emr-smart")
OUT_DIR = ROOT / "outputs"
IN_PATH  = OUT_DIR / "fusion.ndjson"
CSV_PATH = OUT_DIR / "summaries_gemini.csv"
TXT_PATH = OUT_DIR / "summaries_gemini.txt"
ENV_PATH = ROOT / ".env"

OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---------- OPTIONAL: hardcode fallback (last resort) ----------
# If you want to bypass .env entirely, uncomment the next line and paste your key:
# os.environ["GEMINI_API_KEY"] = "AIza...YOUR_KEY..."

def _try_load_env():
    """Load .env with multiple encodings; fall back to manual parse if needed."""
    # 1) Already set in environment?
    if os.getenv("GEMINI_API_KEY"):
        return True, "envvar-present"

    # 2) Try python-dotenv with utf-8 and utf-8-sig
    if ENV_PATH.exists():
        for enc in ("utf-8", "utf-8-sig"):
            try:
                ok = load_dotenv(dotenv_path=str(ENV_PATH), override=True, encoding=enc)
                if ok and os.getenv("GEMINI_API_KEY"):
                    return True, f"dotenv-{enc}"
            except Exception:
                pass

        # 3) Manual parse (handles BOM/odd encodings)
        try:
            with open(ENV_PATH, "rb") as fb:
                raw = fb.read()
            # decode forgivingly
            text = raw.decode("utf-8-sig", errors="ignore")
            key = None
            for line in text.splitlines():
                s = line.strip()
                if not s or s.startswith("#"):
                    continue
                if s.upper().startswith("GEMINI_API_KEY="):
                    key = s.split("=", 1)[1].strip()
                    # strip optional surrounding quotes
                    if (key.startswith('"') and key.endswith('"')) or (key.startswith("'") and key.endswith("'")):
                        key = key[1:-1]
                    break
            if key:
                os.environ["GEMINI_API_KEY"] = key
                # optional model line
                for line in text.splitlines():
                    s = line.strip()
                    if s.upper().startswith("GEMINI_MODEL="):
                        val = s.split("=", 1)[1].strip()
                        if (val.startswith('"') and val.endswith('"')) or (val.startswith("'") and val.endswith("'")):
                            val = val[1:-1]
                        os.environ["GEMINI_MODEL"] = val
                        break
                return True, "manual-parse"
        except Exception:
            pass

    # 4) Nothing worked
    return False, "not-found-or-unreadable"

ok, how = _try_load_env()
if not ok:
    raise RuntimeError(
        f"Could not read GEMINI_API_KEY from {ENV_PATH} and none set in environment.\n"
        "Fix by creating a UTF-8 .env with:\n"
        "GEMINI_API_KEY=AIza...your_key\n"
        "GEMINI_MODEL=gemini-2.5-flash\n"
        "Or set it for this session: os.environ['GEMINI_API_KEY']='AIza...'\n"
    )

GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "").strip()
GEMINI_MODEL   = os.getenv("GEMINI_MODEL", "gemini-2.5-flash").strip()
if not GEMINI_API_KEY:
    raise RuntimeError("GEMINI_API_KEY resolved empty after loading. Check your .env formatting (no quotes).")

print(f"[.env] load mode: {how}; model={GEMINI_MODEL}")

# ---------- import & configure gemini ----------
try:
    import google.generativeai as genai
except Exception:
    raise RuntimeError("Missing dependency. Run: pip install google-generativeai python-dotenv")

genai.configure(api_key=GEMINI_API_KEY)

SYSTEM = (
    "You are a clinical assistant. Summarize EMR outputs for a clinician in 5–7 sentences. "
    "Start with patient context, describe current severity (with label), 5-week trend, any notable vitals, and the alerts. "
    "Use neutral, calibrated language (e.g., 'suggests', 'consider'), do not diagnose or prescribe, and avoid PII beyond patient_id. "
    "End with: 'This supports decisions and is not a diagnosis.'"
)

try:
    model = genai.GenerativeModel(
        model_name=GEMINI_MODEL,
        system_instruction=SYSTEM
    )
except Exception as e:
    raise RuntimeError(f"Gemini model init failed: {e}")

# ---------- read fusion records ----------
if not IN_PATH.exists():
    raise FileNotFoundError(f"Missing input: {IN_PATH}. Run your fuse_infer_generate step first.")

records = []
with open(IN_PATH, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            records.append(json.loads(line))
        except Exception:
            continue

BATCH_LIMIT = 10
records = records[:BATCH_LIMIT]
print(f"Loaded {len(records)} fusion records (target {BATCH_LIMIT}).")

# ---------- summarize helper ----------
def summarize_one(rec, attempt=0, max_attempts=3):
    pid = rec.get("patient_id", "unknown")
    snapshot = rec.get("snapshot") or {}
    history  = rec.get("history")  or {}
    alerts   = rec.get("alerts")   or []

    user_prompt = (
        f"Patient ID: {pid}\n"
        f"Current severity: {snapshot.get('severity_numeric')} ({snapshot.get('severity_label')})\n"
        f"Trend: {history.get('trend')}\n"
        f"Class probabilities: {history.get('proba')}\n"
        f"Alerts: {alerts}\n"
        "Summarize for a clinician."
    )

    try:
        resp = model.generate_content(
            user_prompt,
            generation_config={
                "temperature": 0.2,
                "max_output_tokens": 300
            }
        )
        text = (resp.text or "").strip()
        if not text:
            raise RuntimeError("Empty response")
        return text, None
    except Exception as e:
        err = f"{type(e).__name__}: {e}"
        if attempt + 1 < max_attempts:
            time.sleep(1.2 * (attempt + 1))
            return summarize_one(rec, attempt=attempt+1, max_attempts=max_attempts)
        # fallback
        snapshot_label = snapshot.get("severity_label") or "Unknown"
        trend_label    = history.get("trend") or "Unknown"
        proba_str      = history.get("proba") or {}
        alert_str      = "; ".join(alerts) if alerts else "None"
        fallback = (
            f"Patient {pid}: Current severity appears '{snapshot_label}'. "
            f"Trend over 5 weeks: {trend_label}. "
            f"Class probabilities: {proba_str}. "
            f"Alerts: {alert_str}. "
            "Overall summary generated without LLM due to API limits. "
            "This supports decisions and is not a diagnosis."
        )
        return fallback, err

# ---------- run 10 requests ----------
rows = []
for i, rec in enumerate(records, start=1):
    text, err = summarize_one(rec)
    if err:
        print(f"[{i}] fallback used -> {err}")
    else:
        print(f"[{i}] done.")
    rows.append({"patient_id": rec.get("patient_id", "unknown"), "summary": text})

# ---------- write outputs (atomic replace) ----------
csv_tmp = OUT_DIR / (CSV_PATH.stem + ".tmp.csv")
txt_tmp = OUT_DIR / (TXT_PATH.stem + ".tmp.txt")

with open(csv_tmp, "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=["patient_id","summary"])
    w.writeheader()
    w.writerows(rows)

with open(txt_tmp, "w", encoding="utf-8") as f:
    for r in rows:
        f.write(f"[{r['patient_id']}]\n{r['summary']}\n\n")

if CSV_PATH.exists(): CSV_PATH.unlink(missing_ok=True)
if TXT_PATH.exists(): TXT_PATH.unlink(missing_ok=True)
csv_tmp.replace(CSV_PATH)
txt_tmp.replace(TXT_PATH)

print("Wrote:")
print(" ", CSV_PATH)
print(" ", TXT_PATH)
print("Done.")


RuntimeError: Could not read GEMINI_API_KEY from C:\Users\aayus\Downloads\emr-smart\.env and none set in environment.
Fix by creating a UTF-8 .env with:
GEMINI_API_KEY=AIza...your_key
GEMINI_MODEL=gemini-2.5-flash
Or set it for this session: os.environ['GEMINI_API_KEY']='AIza...'


In [10]:
# --- EMR LLM Summaries via Google Gemini (robust .env, 10 requests) ---
# Requires: pip install google-generativeai python-dotenv

import os, json, time, csv, io, sys
from pathlib import Path
from dotenv import load_dotenv

# ---------- paths ----------
ROOT = Path(r"C:\Users\aayus\Downloads\emr-smart")
OUT_DIR = ROOT / "outputs"
IN_PATH  = OUT_DIR / "fusion.ndjson"
CSV_PATH = OUT_DIR / "summaries_gemini.csv"
TXT_PATH = OUT_DIR / "summaries_gemini.txt"
ENV_PATH = ROOT / ".env"

OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---------- OPTIONAL: hardcode fallback (last resort) ----------
# If you want to bypass .env entirely, uncomment the next line and paste your key:
# os.environ["GEMINI_API_KEY"] = "AIza...YOUR_KEY..."

def _try_load_env():
    """Load .env with multiple encodings; fall back to manual parse if needed."""
    # 1) Already set in environment?
    if os.getenv("GEMINI_API_KEY"):
        return True, "envvar-present"

    # 2) Try python-dotenv with utf-8 and utf-8-sig
    if ENV_PATH.exists():
        for enc in ("utf-8", "utf-8-sig"):
            try:
                ok = load_dotenv(dotenv_path=str(ENV_PATH), override=True, encoding=enc)
                if ok and os.getenv("GEMINI_API_KEY"):
                    return True, f"dotenv-{enc}"
            except Exception:
                pass

        # 3) Manual parse (handles BOM/odd encodings)
        try:
            with open(ENV_PATH, "rb") as fb:
                raw = fb.read()
            # decode forgivingly
            text = raw.decode("utf-8-sig", errors="ignore")
            key = None
            for line in text.splitlines():
                s = line.strip()
                if not s or s.startswith("#"):
                    continue
                if s.upper().startswith("GEMINI_API_KEY="):
                    key = s.split("=", 1)[1].strip()
                    # strip optional surrounding quotes
                    if (key.startswith('"') and key.endswith('"')) or (key.startswith("'") and key.endswith("'")):
                        key = key[1:-1]
                    break
            if key:
                os.environ["GEMINI_API_KEY"] = key
                # optional model line
                for line in text.splitlines():
                    s = line.strip()
                    if s.upper().startswith("GEMINI_MODEL="):
                        val = s.split("=", 1)[1].strip()
                        if (val.startswith('"') and val.endswith('"')) or (val.startswith("'") and val.endswith("'")):
                            val = val[1:-1]
                        os.environ["GEMINI_MODEL"] = val
                        break
                return True, "manual-parse"
        except Exception:
            pass

    # 4) Nothing worked
    return False, "not-found-or-unreadable"

ok, how = _try_load_env()
if not ok:
    raise RuntimeError(
        f"Could not read GEMINI_API_KEY from {ENV_PATH} and none set in environment.\n"
        "Fix by creating a UTF-8 .env with:\n"
        "GEMINI_API_KEY=AIza...your_key\n"
        "GEMINI_MODEL=gemini-2.5-flash\n"
        "Or set it for this session: os.environ['GEMINI_API_KEY']='AIza...'\n"
    )

GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "").strip()
GEMINI_MODEL   = os.getenv("GEMINI_MODEL", "gemini-2.5-flash").strip()
if not GEMINI_API_KEY:
    raise RuntimeError("GEMINI_API_KEY resolved empty after loading. Check your .env formatting (no quotes).")

print(f"[.env] load mode: {how}; model={GEMINI_MODEL}")

# ---------- import & configure gemini ----------
try:
    import google.generativeai as genai
except Exception:
    raise RuntimeError("Missing dependency. Run: pip install google-generativeai python-dotenv")

genai.configure(api_key=GEMINI_API_KEY)

SYSTEM = (
    "You are a clinical assistant. Summarize EMR outputs for a clinician in 5–7 sentences. "
    "Start with patient context, describe current severity (with label), 5-week trend, any notable vitals, and the alerts. "
    "Use neutral, calibrated language (e.g., 'suggests', 'consider'), do not diagnose or prescribe, and avoid PII beyond patient_id. "
    "End with: 'This supports decisions and is not a diagnosis.'"
)

try:
    model = genai.GenerativeModel(
        model_name=GEMINI_MODEL,
        system_instruction=SYSTEM
    )
except Exception as e:
    raise RuntimeError(f"Gemini model init failed: {e}")

# ---------- read fusion records ----------
if not IN_PATH.exists():
    raise FileNotFoundError(f"Missing input: {IN_PATH}. Run your fuse_infer_generate step first.")

records = []
with open(IN_PATH, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            records.append(json.loads(line))
        except Exception:
            continue

BATCH_LIMIT = 10
records = records[:BATCH_LIMIT]
print(f"Loaded {len(records)} fusion records (target {BATCH_LIMIT}).")

# ---------- summarize helper ----------
def summarize_one(rec, attempt=0, max_attempts=3):
    pid = rec.get("patient_id", "unknown")
    snapshot = rec.get("snapshot") or {}
    history  = rec.get("history")  or {}
    alerts   = rec.get("alerts")   or []

    user_prompt = (
        f"Patient ID: {pid}\n"
        f"Current severity: {snapshot.get('severity_numeric')} ({snapshot.get('severity_label')})\n"
        f"Trend: {history.get('trend')}\n"
        f"Class probabilities: {history.get('proba')}\n"
        f"Alerts: {alerts}\n"
        "Summarize for a clinician."
    )

    try:
        resp = model.generate_content(
            user_prompt,
            generation_config={
                "temperature": 0.2,
                "max_output_tokens": 300
            }
        )
        text = (resp.text or "").strip()
        if not text:
            raise RuntimeError("Empty response")
        return text, None
    except Exception as e:
        err = f"{type(e).__name__}: {e}"
        if attempt + 1 < max_attempts:
            time.sleep(1.2 * (attempt + 1))
            return summarize_one(rec, attempt=attempt+1, max_attempts=max_attempts)
        # fallback
        snapshot_label = snapshot.get("severity_label") or "Unknown"
        trend_label    = history.get("trend") or "Unknown"
        proba_str      = history.get("proba") or {}
        alert_str      = "; ".join(alerts) if alerts else "None"
        fallback = (
            f"Patient {pid}: Current severity appears '{snapshot_label}'. "
            f"Trend over 5 weeks: {trend_label}. "
            f"Class probabilities: {proba_str}. "
            f"Alerts: {alert_str}. "
            "Overall summary generated without LLM due to API limits. "
            "This supports decisions and is not a diagnosis."
        )
        return fallback, err

# ---------- run 10 requests ----------
rows = []
for i, rec in enumerate(records, start=1):
    text, err = summarize_one(rec)
    if err:
        print(f"[{i}] fallback used -> {err}")
    else:
        print(f"[{i}] done.")
    rows.append({"patient_id": rec.get("patient_id", "unknown"), "summary": text})

# ---------- write outputs (atomic replace) ----------
csv_tmp = OUT_DIR / (CSV_PATH.stem + ".tmp.csv")
txt_tmp = OUT_DIR / (TXT_PATH.stem + ".tmp.txt")

with open(csv_tmp, "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=["patient_id","summary"])
    w.writeheader()
    w.writerows(rows)

with open(txt_tmp, "w", encoding="utf-8") as f:
    for r in rows:
        f.write(f"[{r['patient_id']}]\n{r['summary']}\n\n")

if CSV_PATH.exists(): CSV_PATH.unlink(missing_ok=True)
if TXT_PATH.exists(): TXT_PATH.unlink(missing_ok=True)
csv_tmp.replace(CSV_PATH)
txt_tmp.replace(TXT_PATH)

print("Wrote:")
print(" ", CSV_PATH)
print(" ", TXT_PATH)
print("Done.")


RuntimeError: Could not read GEMINI_API_KEY from C:\Users\aayus\Downloads\emr-smart\.env and none set in environment.
Fix by creating a UTF-8 .env with:
GEMINI_API_KEY=AIza...your_key
GEMINI_MODEL=gemini-2.5-flash
Or set it for this session: os.environ['GEMINI_API_KEY']='AIza...'


In [2]:
import os
os.environ["GEMINI_API_KEY"] = "AIzaSyDc7_BiAtyxG3BKmeNS2uOcXgi0zLoIiAI"
os.environ["GEMINI_MODEL"]   = "gemini-2.5-flash"  # or gemini-2.0-flash if that's your plan
print("Gemini key present?", bool(os.getenv("GEMINI_API_KEY")))


Gemini key present? True


In [2]:
# --- EMR LLM Summaries via Google Gemini (safe + rate-limited) ---
# pip install google-generativeai python-dotenv

import os, json, time, csv, re
from pathlib import Path

ROOT    = Path(r"C:\Users\aayus\Downloads\emr-smart")
OUT_DIR = ROOT / "outputs"
IN_PATH = OUT_DIR / "fusion.ndjson"
CSV_OUT = OUT_DIR / "summaries_gemini.csv"
TXT_OUT = OUT_DIR / "summaries_gemini.txt"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---------- env (use existing env; read .env only if needed) ----------
if not os.getenv("GEMINI_API_KEY"):
    try:
        from dotenv import load_dotenv
        for enc in ("utf-8", "utf-8-sig"):
            if load_dotenv(dotenv_path=str(ROOT/".env"), override=False, encoding=enc):
                if os.getenv("GEMINI_API_KEY"):
                    break
    except Exception:
        pass

GEMINI_API_KEY = (os.getenv("GEMINI_API_KEY") or "").strip()
GEMINI_MODEL   = (os.getenv("GEMINI_MODEL") or "gemini-2.5-flash").strip()
if not GEMINI_API_KEY:
    raise RuntimeError("GEMINI_API_KEY missing. Put it in .env or set os.environ['GEMINI_API_KEY'].")

# ---------- model ----------
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold

genai.configure(api_key=GEMINI_API_KEY)

SYSTEM = (
    "You are a clinical assistant. Summarize EMR outputs for a clinician in 5–7 sentences. "
    "State patient context, current severity (with label), 5-week trend, any notable vitals, and the alerts. "
    "Use neutral, calibrated language (e.g., 'suggests', 'consider'); avoid diagnoses or prescriptions; "
    "no PII beyond patient_id. End with: 'This supports decisions and is not a diagnosis.'"
)

# Relax safety so summaries aren’t blocked as ‘medical advice’
SAFETY = {
    HarmCategory.HARM_CATEGORY_MEDICAL: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    HarmCategory.HARM_CATEGORY_SEXUAL: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
}

model = genai.GenerativeModel(
    model_name=GEMINI_MODEL,
    system_instruction=SYSTEM,
    safety_settings=SAFETY
)

# ---------- load fusion ----------
if not IN_PATH.exists():
    raise FileNotFoundError(f"Missing input: {IN_PATH} (run fuse_infer_generate first).")

records = []
with open(IN_PATH, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:
            try:
                records.append(json.loads(line))
            except Exception:
                pass

# keep small batch to stay under free-tier limits
BATCH_LIMIT    = 5     # if you need 10 later, set to 10
SLEEP_BETWEEN  = 7.0   # seconds between requests (<= ~8/min)
MAX_ATTEMPTS   = 3
records = records[:BATCH_LIMIT]
print(f"Loaded {len(records)} fusion records (target {BATCH_LIMIT}).")

# ---------- helpers ----------
def extract_text(resp):
    """
    Robustly extract text from a Gemini response.
    If blocked (no parts), return "".
    """
    try:
        if hasattr(resp, "text") and resp.text:
            return resp.text.strip()
    except Exception:
        pass

    # fall back: scan candidates/parts
    try:
        for cand in getattr(resp, "candidates", []) or []:
            # finish_reason 2 = SAFETY (blocked). Skip these.
            if getattr(cand, "finish_reason", None) == 2:
                continue
            parts = getattr(getattr(cand, "content", None), "parts", []) or []
            chunks = []
            for p in parts:
                # for text parts
                if hasattr(p, "text") and p.text:
                    chunks.append(p.text)
                # for dict-like parts
                elif isinstance(p, dict) and p.get("text"):
                    chunks.append(p["text"])
            if chunks:
                return "\n".join(chunks).strip()
    except Exception:
        pass
    return ""

def parse_retry_delay_seconds(msg: str, default_sec: float = 25.0) -> float:
    m = re.search(r"retry_delay\s*\{\s*seconds:\s*([0-9]+)", msg)
    if m:
        try:
            return float(m.group(1))
        except Exception:
            pass
    return default_sec

def fallback_summary(rec, note: str):
    pid      = rec.get("patient_id","unknown")
    snapshot = rec.get("snapshot") or {}
    history  = rec.get("history")  or {}
    alerts   = rec.get("alerts")   or []
    return (
        f"Patient {pid}: Current severity appears '{(snapshot.get('severity_label') or 'Unknown')}'. "
        f"Trend over 5 weeks: {(history.get('trend') or 'Unknown')}. "
        f"Class probabilities: {(history.get('proba') or {})}. "
        f"Alerts: {('; '.join(alerts)) if alerts else 'None'}. "
        f"(LLM fallback: {note}) This supports decisions and is not a diagnosis."
    )

def summarize_one(rec):
    prompt = (
        f"Patient ID: {rec.get('patient_id','unknown')}\n"
        f"Current severity: { (rec.get('snapshot') or {}).get('severity_numeric') } "
        f"({ (rec.get('snapshot') or {}).get('severity_label') })\n"
        f"Trend: { (rec.get('history') or {}).get('trend') }\n"
        f"Class probabilities: { (rec.get('history') or {}).get('proba') }\n"
        f"Alerts: { rec.get('alerts') or [] }\n"
        "Summarize for a clinician."
    )

    last_err = None
    for attempt in range(1, MAX_ATTEMPTS+1):
        try:
            resp = model.generate_content(
                prompt,
                generation_config={"temperature": 0.2, "max_output_tokens": 300}
            )
            text = extract_text(resp)
            if text:
                return text, None
            # blocked or empty
            last_err = "Empty/blocked response (finish_reason likely SAFETY)"
            # small wait then try again once
            time.sleep(0.8 * attempt)
        except Exception as e:
            last_err = f"{type(e).__name__}: {e}"
            # handle 429 with server-provided delay if present
            if "ResourceExhausted" in str(type(e)) or "429" in str(e):
                time.sleep(parse_retry_delay_seconds(str(e), 25.0))
            else:
                time.sleep(1.2 * attempt)

    # fallback after retries
    return fallback_summary(rec, note=last_err), last_err

# ---------- run ----------
rows = []
for i, rec in enumerate(records, start=1):
    txt, err = summarize_one(rec)
    if err:
        print(f"[{i}] fallback used -> {err}")
    else:
        print(f"[{i}] done.")
    rows.append({"patient_id": rec.get("patient_id","unknown"), "summary": txt})
    # respect free-tier per-minute limits
    if i < len(records):
        time.sleep(SLEEP_BETWEEN)

# ---------- write outputs atomically ----------
tmp_csv = OUT_DIR / (CSV_OUT.stem + ".tmp.csv")
tmp_txt = OUT_DIR / (TXT_OUT.stem + ".tmp.txt")

with open(tmp_csv, "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=["patient_id","summary"])
    w.writeheader()
    w.writerows(rows)

with open(tmp_txt, "w", encoding="utf-8") as f:
    for r in rows:
        f.write(f"[{r['patient_id']}]\n{r['summary']}\n\n")

if CSV_OUT.exists(): CSV_OUT.unlink(missing_ok=True)
if TXT_OUT.exists(): TXT_OUT.unlink(missing_ok=True)
tmp_csv.replace(CSV_OUT)
tmp_txt.replace(TXT_OUT)

print("Wrote:")
print(" ", CSV_OUT)
print(" ", TXT_OUT)
print("Done.")


AttributeError: type object 'HarmCategory' has no attribute 'HARM_CATEGORY_MEDICAL'

In [3]:
import os, json
import google.generativeai as genai

# ---------- API key ----------
API_KEY = os.getenv("GEMINI_API_KEY")
if not API_KEY:
    raise RuntimeError("Set GEMINI_API_KEY in your environment first")
genai.configure(api_key=API_KEY)

# ---------- Model ----------
SYSTEM = (
    "You are a clinical assistant. Summarize EMR outputs for a clinician in 5–7 sentences. "
    "State patient context, current severity (with label), 5-week trend, any notable vitals, and the alerts. "
    "Use neutral, calibrated language (e.g., 'suggests', 'consider'); avoid diagnoses or prescriptions; "
    "no PII beyond patient_id. End with: 'This supports decisions and is not a diagnosis.'"
)

model = genai.GenerativeModel(
    model_name="gemini-2.5-flash",
    system_instruction=SYSTEM
)

# ---------- Example record ----------
fusion_example = {
    "patient_id": "P001",
    "snapshot": {"severity_score": 0.67, "severity_class": "Moderate"},
    "history": {"trend_label": "Stable"},
    "derived": {"alerts": ["BP elevated", "O2 borderline"]}
}

prompt = f"Summarize this EMR record:\n{json.dumps(fusion_example, indent=2)}"

# ---------- Generate ----------
resp = model.generate_content(prompt)

summary = (resp.text or "").strip()
if not summary:
    summary = "No text returned by Gemini."

# ---------- Save ----------
out_path = "emr_summary.txt"
with open(out_path, "w", encoding="utf-8") as f:
    f.write(summary)

print(f"Written summary to {out_path}\n")
print(summary)

Written summary to emr_summary.txt

For patient P001, the current assessment indicates a severity score of 0.67, categorizing the patient's condition as Moderate. Over the past five weeks, the patient's trend has been assessed as Stable. Notable vital sign alerts include elevated blood pressure and borderline oxygen saturation. These findings suggest a need to consider these parameters. The system has generated alerts for both elevated BP and borderline O2. This supports decisions and is not a diagnosis.
