# LLM Tutorial

**Objective**: Predict whether a COVID‑19 patient had a severe outcome - an inpatient or emergency encounter - within 30 days after onset of COVID.

**Data Source**: SyntheticMass [COVID-19 10K, CSV](https://mitre.box.com/shared/static/9iglv8kbs1pfi7z8phjl9sbpjk08spze.zip)

## Import Dataset

We'll use three tables: `patients`, `encounters` and `conditions`.

In [1]:
import pandas as pd

# Load the three tables used in this tutorial, one CSV file per table.
patients, encounters, conditions = map(
    pd.read_csv,
    (
        "../10k_synthea_covid19_csv/patients.csv",
        "../10k_synthea_covid19_csv/encounters.csv",
        "../10k_synthea_covid19_csv/conditions.csv",
    ),
)

## Prepare Dataset

Prepare the dataset for use in our prompts:

- **Make dates easier to work with**: Convert date strings to datetime
- **Determine first onset of COVID**: Identify patients with COVID conditions and determine the date of first onset
- **Label training examples**: by whether or not they had a severe encounter within 30 days of first onset of COVID
- **Build Featureset**: Identify features for each training example based on demographic and comorbidity info
- **Convert Dataframe to Text**: Convert examples from dataframe rows to a consistent text format "Patient Cards"

### Make dates easier to work with

In [2]:
# Names of every date-like column that may appear in any of the three tables.
parse_dates = [
    "START",
    "STOP",
    "DATE",
    "BIRTHDATE",
    "DEATHDATE",
    "ONSET",
    "RECORDED_DATE",
]

# normalize date columns
for df in (patients, encounters, conditions):
    for col in parse_dates:
        if col not in df.columns:
            continue
        # Parse as UTC (unparseable values become NaT), then drop the timezone
        # so every date column is timezone-naive and directly comparable.
        as_utc = pd.to_datetime(df[col], errors="coerce", utc=True)
        df[col] = as_utc.dt.tz_localize(None)

### Determine first onset of COVID per patient

In [3]:
# Rows of `conditions` whose description mentions COVID (case-insensitive;
# missing descriptions count as no match).
# NOTE(review): this is a substring match, so any description containing
# "COVID" is included -- confirm that is the intended cohort.
mentions_covid = conditions["DESCRIPTION"].str.contains(
    "COVID", case=False, na=False
)
covid_cond = conditions.loc[mentions_covid].copy()

# Earliest START of a COVID condition per patient, indexed by PATIENT.
first_onset = (
    covid_cond.groupby("PATIENT")["START"]
    .min()
    .rename("COVID_ONSET")
)

### Label training examples

In [4]:
# Attach each patient's first COVID onset to every one of their encounters.
# The inner join drops encounters of patients that have no COVID onset.
enc_join_covid = encounters.merge(
    first_onset, left_on="PATIENT", right_index=True, how="inner"
)

# An encounter is inside the window when it starts on or after the first
# onset of COVID and no later than 30 days after it (both ends inclusive).
onset_per_encounter = enc_join_covid["COVID_ONSET"]
window = enc_join_covid["START"].between(
    onset_per_encounter,
    onset_per_encounter + pd.Timedelta(days=30),
)

# Severe encounters are the inpatient / emergency classes, compared in
# lowercase so the casing of ENCOUNTERCLASS does not matter.
is_severe_class = enc_join_covid["ENCOUNTERCLASS"].str.lower().isin(
    ["inpatient", "emergency"]
)
severe_in_window = enc_join_covid.loc[window & is_severe_class]

# True for every patient with at least one severe encounter in the window;
# patients without one are simply absent from this Series.
severe_hit = severe_in_window.groupby("PATIENT").size().gt(0)

# One 0/1 label per COVID patient: absent patients are filled with False
# before the conversion to int.
label = (
    severe_hit.reindex(first_onset.index, fill_value=False)
    .astype(int)
    .rename("severe_30d")
)

### Build Featureset

In [5]:
# determine age at onset
# Birth date of each COVID patient, aligned to the index of `first_onset`.
birthdate_by_patient = first_onset.to_frame().join(
    patients.set_index("Id")["BIRTHDATE"],
    how="left",
)["BIRTHDATE"]

# Whole days between birth and first onset, converted to years.
days_lived = (first_onset - birthdate_by_patient).dt.days
# NOTE(review): round() gives the nearest year, not completed years
# (e.g. 12.6 years becomes 13) -- confirm this is the intended definition.
age_at_onset = (days_lived / 365.25).round()

# Substrings that identify a comorbidity in a condition description.
# They are matched case-insensitively against DESCRIPTION below.
COMORBID_KEYWORDS = [
    "diabetes",
    "hypertension",
    "asthma",
    "copd",
    "coronary artery",
    "heart failure",
    "obesity",
    "chronic kidney",
    "ckd",
    "cancer",
    "immunodefic",
    "hyperlipid",
]
# merge other conditions with first onset of COVID
pre = conditions.merge(
    first_onset,
    left_on="PATIENT",
    right_index=True,
    how="inner",
)
# keep only conditions that started strictly before onset of COVID
pre = pre[pre["START"] < pre["COVID_ONSET"]]
# for each patient, create one boolean column per comorbidity keyword
# indicating whether any of their pre-onset conditions mentions it.
# The match is case-insensitive (the keywords are lowercase while
# descriptions may be capitalized) and literal rather than regex-based.
flags = pre.assign(**{
    kw: pre["DESCRIPTION"].str.contains(
        kw, case=False, regex=False, na=False
    )
    for kw in COMORBID_KEYWORDS
}).groupby("PATIENT")[COMORBID_KEYWORDS].max().astype(bool)

# construct our dataset: one row per COVID patient, indexed by PATIENT
patients_by_id = patients.set_index("Id")

frame = pd.DataFrame(index=first_onset.index)
frame["age_at_onset"] = age_at_onset.astype("Int64")
frame["gender"] = patients_by_id["GENDER"].reindex(frame.index)

# make sure flags has an entry for every patient in frame; patients without
# any row in flags get False for every comorbidity
flags = flags.reindex(frame.index, fill_value=False)

# "PATIENT" here refers to frame's index level of that name
frame = frame.join(flags, how="left", on="PATIENT")
frame = frame.join(label, how="left")
# drop any row that ended up without a label
frame = frame.dropna(subset=["severe_30d"])

frame.head()

Unnamed: 0_level_0,age_at_onset,gender,diabetes,hypertension,asthma,copd,coronary artery,heart failure,obesity,chronic kidney,ckd,cancer,immunodefic,hyperlipid,severe_30d
PATIENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0000b247-1def-417a-a783-41c8682be022,12,F,False,False,False,False,False,False,False,False,False,False,False,False,0
00049ee8-5953-4edd-a277-b9c1b1a7f16b,35,M,True,False,False,False,False,False,True,False,False,False,False,False,0
00079a57-24a8-430f-b4f8-a1cf34f90060,29,F,False,False,False,False,False,False,False,False,False,False,False,False,1
0008a63c-c95c-46c2-9ef3-831d68892019,27,M,False,False,False,False,False,False,True,False,False,False,False,False,1
00093cdd-a9f0-4ad8-87e9-53534501f008,67,F,False,False,False,False,False,False,True,False,False,False,False,False,1


### Convert Dataframe to Text

We now have all of our examples labeled, but LLMs work better with consistent, compact text inputs, so we convert each row into a short “patient card”.

In [6]:
import textwrap

def patient_card(df, pid) -> str:
    """Render one patient's row of ``df`` as a short text "patient card".

    Args:
        df: Frame indexed by patient id with ``age_at_onset`` and ``gender``
            columns, plus (optionally) one column per entry of
            ``COMORBID_KEYWORDS``.
        pid: Index label of the patient to render.

    Returns:
        A multi-line string that starts and ends with a newline and lists
        age, gender and the title-cased, alphabetically sorted comorbidities
        ("None" when no flag is set), followed by the definition of
        ``severe_30d``.
    """
    record = df.loc[pid]

    # Collect the flagged comorbidities; a missing column counts as not set.
    present = set()
    for keyword in COMORBID_KEYWORDS:
        if bool(record.get(keyword, False)):
            present.add(keyword.title())
    comorb_str = ", ".join(sorted(present)) or "None"

    # A missing age is shown as "Unknown" instead of failing the int cast.
    if pd.isna(record["age_at_onset"]):
        age = "Unknown"
    else:
        age = int(record["age_at_onset"])
    gender = str(record["gender"])

    return textwrap.dedent(f"""
    Patient Info:
      - Age at onset of COVID: {age}
      - Gender: {gender}
      - Known Comorbidities: {comorb_str}
    Definition of severe_30d: inpatient or emergency encounter within 30 days after COVID onset.
    """)

# Show the card of the first patient in the frame as a quick visual check.
patient_zero_id = frame.index[0]
patient_zero_card = patient_card(frame, patient_zero_id)
print(patient_zero_card)


Patient Info:
  - Age at onset of COVID: 12
  - Gender: F
  - Known Comorbidities: None
Definition of severe_30d: inpatient or emergency encounter within 30 days after COVID onset.



## Zero-Shot Prompts

Now that we have a dataset and a way to generate examples, we'll start by making simple zero-shot prompts.

First we need to set up the OpenAI API.

In [7]:
import getpass
# Prompt for the OpenAI API key without echoing it, then report whether a
# non-blank value was entered (the key itself is never printed).
openai_key = getpass.getpass("MY_API_KEY: ")
key_is_set = bool(openai_key and openai_key.strip())
print(
    "OPENAI_KEY is non-empty" if key_is_set
    else "OPENAI_KEY is empty or not set"
)


OPENAI_KEY is non-empty


### Ask GPT 5 Nano to classify some patients without providing any examples.

In [12]:
from openai import OpenAI
import os, json, random, hashlib, pathlib as pl
# The model name can be overridden through the OPENAI_MODEL environment
# variable; "gpt-5-nano" is used when it is not set.
MODEL = os.environ.get("OPENAI_MODEL", "gpt-5-nano")
# Fixed seed so that anything drawn from RNG is reproducible.
RNG = random.Random(3)
# API client authenticated with the key entered above.
client = OpenAI(api_key=openai_key)

# Cached results live in a "_cache" folder under the current working directory.
PROJ = pl.Path.cwd()
CACHE = PROJ / "_cache"
CACHE.mkdir(exist_ok=True)


# simple cache to control costs
def _key(s: str) -> str:
    """Return a short hex fingerprint of ``s`` used to name cache files."""
    # SHA-1 serves only as a content fingerprint here, not for security.
    return hashlib.sha1(s.encode()).hexdigest()[:16]


def cache_call(tag: str, prompt: str, fn):
    """Return the cached result for ``(tag, prompt)``, computing it if needed.

    Args:
        tag: Short label that prefixes the cache file name.
        prompt: Text whose fingerprint identifies the cache entry.
        fn: Zero-argument callable invoked only on a cache miss.

    Returns:
        The value produced by ``fn`` on a miss, or the stored value on a hit.
        Results are stored as JSON so that a hit returns the same value the
        original call produced (a string such as "123" or "true" comes back
        as a string, not as a number or boolean). Values that cannot be
        encoded as JSON are stored as their ``str()`` text.
    """
    path = CACHE / f"{tag}-{_key(prompt)}.json"
    if path.exists():
        cached = path.read_text(encoding="utf-8")
        try:
            return json.loads(cached)
        except json.JSONDecodeError:
            # Entry holds plain text rather than JSON (e.g. written by an
            # earlier version of this cache); return it unchanged.
            return cached

    out = fn()
    try:
        payload = json.dumps(out)
    except (TypeError, ValueError):
        # Not JSON-serializable: fall back to the textual representation.
        payload = str(out)
    path.write_text(payload, encoding="utf-8")
    return out

# System prompt sent ahead of the conversation on every request: it states the
# task, defines a severe outcome, and asks for a one-word answer.
BASE_SYSTEM = (
    "You are a clinical risk scorer working on determining whether a patient is likely to have a severe outcome within 30 days of COVID onset. "
    "A severe outcome is defined as an inpatient or emergency encounter. "
    "Task: in a single word, classify the patient as 'severe' or 'non-severe'."
)

def llm_predict(card_text: str, shots=None) -> list[str]:
    """Ask the model to classify one patient card, optionally with examples.

    Args:
        card_text: Patient card text; the classification instruction is
            appended directly after it.
        shots: Optional iterable of ``(example_card, example_output)`` pairs.
            Each pair is replayed as a user message followed by an assistant
            message before the real question. ``None`` or an empty value
            produces a zero-shot prompt.

    Returns:
        Normally a one-element list holding the stripped model output. If the
        cache hands back a list it is returned as-is; a dict is returned as a
        one-element list containing its JSON encoding.
    """
    messages = []
    for ex_card, ex_out in shots or ():
        messages.append({"role": "user", "content": ex_card})
        messages.append({"role": "assistant", "content": str(ex_out)})
    messages.append({"role": "user", "content": card_text + "Predict severe_30d as 'severe' or 'non-severe'."})

    def _call():
        r = client.responses.create(
            model=MODEL,
            input=[{"role": "system", "content": BASE_SYSTEM}, *messages],
        )
        return r.output_text.strip() if hasattr(r, "output_text") else ""

    # The cache key covers everything that influences the answer: the model,
    # the system prompt and the conversation. Keying on the messages alone
    # would return stale answers after MODEL or BASE_SYSTEM changes.
    cache_key = json.dumps(
        {"model": MODEL, "system": BASE_SYSTEM, "messages": messages}
    )
    out = cache_call("risk", cache_key, _call)
    if isinstance(out, list):
        return out
    if isinstance(out, dict):
        return [json.dumps(out)]
    return [str(out).strip()]

# sanity check: zero-shot prediction for the first patient in the frame
sanity_prediction = llm_predict(patient_card(frame, patient_zero_id))
sanity_prediction[0]


'non-severe'