In [1]:
# --- EDIT THESE TWO ONLY ---
DATA_ROOT = r"C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\root"  
# 🔼 now pointing to your Dropbox "root" folder that contains patients/, conditions/, etc.

OUT_ROOT  = r"C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients"

# Bronze/Silver target dirs
BRONZE_DIR = fr"{OUT_ROOT}\Output1"   # raw parquet
SILVER_DIR = fr"{OUT_ROOT}\Output2"   # cleaned parquet

# Entities we’ll ingest
ENTITIES = [
    "patients", "conditions", "encounters", "medications",
    "observations", "allergies", "careplans", "immunizations", "procedures"
]


In [2]:
from pathlib import Path

DATA_ROOT = r"C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\root"  # <- adjust if needed

root = Path(DATA_ROOT)
print("DATA_ROOT exists:", root.exists())
print("DATA_ROOT resolves to:", root.resolve())

# list immediate children so we can see folders present
print("Children:", [p.name for p in root.iterdir()] if root.exists() else "N/A")


DATA_ROOT exists: True
DATA_ROOT resolves to: C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\root
Children: ['allergies', 'careplans', 'conditions', 'encounters', 'immunizations', 'medications', 'observations', 'patients', 'procedures']


In [3]:
DATA_ROOT = r"C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\root"
OUT_ROOT  = r"C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients"


In [4]:

from pathlib import Path
root = Path(DATA_ROOT)
print("DATA_ROOT exists:", root.exists(), "| resolves to:", root.resolve())
print("Children (first 20):", [p.name for p in list(root.iterdir())[:20]])


DATA_ROOT exists: True | resolves to: C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\root
Children (first 20): ['allergies', 'careplans', 'conditions', 'encounters', 'immunizations', 'medications', 'observations', 'patients', 'procedures']


In [11]:
from pathlib import Path

ENTITIES = ["patients","conditions","encounters","medications",
            "observations","allergies","careplans","immunizations","procedures"]

def find_entity_dir(entity: str) -> str | None:
    root = Path(DATA_ROOT)
    # look for a folder literally named the entity, at any depth
    for p in root.rglob(entity):
        if p.is_dir() and list(p.glob("*.csv")):   # ensure it actually has CSVs
            return str(p.resolve())
    return None

for e in ENTITIES:
    print(e, "→", find_entity_dir(e))


patients → C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\root\patients
conditions → C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\root\conditions
encounters → C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\root\encounters
medications → C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\root\medications
observations → C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\root\observations
allergies → C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\root\allergies
careplans → C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\root\careplans
immunizations → C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\root\immunizations
procedures → C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\root\procedures


In [12]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = (SparkSession.builder
         .appName("synthea-etl")
         .config("spark.sql.parquet.compression.codec","snappy")
         .getOrCreate())

def bronze_parquet(entity: str) -> str:
    return fr"{OUT_ROOT}\Output1\{entity}.parquet"

ingest_report = []
for e in ENTITIES:
    d = find_entity_dir(e)
    if not d:
        print(f"⚠️  Skipping {e}: could not find CSV folder under {DATA_ROOT}")
        continue

    print(f"→ Reading {e} from {d} (recursive)")
    df = (spark.read.option("header", True)
                     .option("recursiveFileLookup", "true")
                     .csv(d))   # points to the folder; Spark finds all CSVs under it

    outp = bronze_parquet(e)
    (df.write.mode("overwrite").parquet(outp))
    ingest_report.append((e, df.count(), len(df.columns), outp))

ingest_report


→ Reading patients from C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\root\patients (recursive)
→ Reading conditions from C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\root\conditions (recursive)
→ Reading encounters from C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\root\encounters (recursive)
→ Reading medications from C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\root\medications (recursive)
→ Reading observations from C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\root\observations (recursive)
→ Reading allergies from C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\root\allergies (recursive)
→ Reading careplans from C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\root\careplans (recursive)
→ Reading immunizations from C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\root\immunizations (recursive)
→ Reading procedures from C:

[('patients',
  133443,
  17,
  'C:\\Users\\sohel\\Dropbox\\Ariana\\Interview_Data\\Python_Model\\patients\\Output1\\patients.parquet'),
 ('conditions',
  485284,
  6,
  'C:\\Users\\sohel\\Dropbox\\Ariana\\Interview_Data\\Python_Model\\patients\\Output1\\conditions.parquet'),
 ('encounters',
  1257518,
  7,
  'C:\\Users\\sohel\\Dropbox\\Ariana\\Interview_Data\\Python_Model\\patients\\Output1\\encounters.parquet'),
 ('medications',
  398481,
  8,
  'C:\\Users\\sohel\\Dropbox\\Ariana\\Interview_Data\\Python_Model\\patients\\Output1\\medications.parquet'),
 ('observations',
  5383758,
  7,
  'C:\\Users\\sohel\\Dropbox\\Ariana\\Interview_Data\\Python_Model\\patients\\Output1\\observations.parquet'),
 ('allergies',
  52211,
  6,
  'C:\\Users\\sohel\\Dropbox\\Ariana\\Interview_Data\\Python_Model\\patients\\Output1\\allergies.parquet'),
 ('careplans',
  797496,
  9,
  'C:\\Users\\sohel\\Dropbox\\Ariana\\Interview_Data\\Python_Model\\patients\\Output1\\careplans.parquet'),
 ('immunizations',
 

In [13]:
from pyspark.sql import functions as F, types as T

DATE_RX = r"^\d{4}-\d{2}-\d{2}$"   # 2020-12-31

patients = spark.read.parquet(r"C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\Output1\patients.parquet")
conditions = spark.read.parquet(r"C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\Output1\conditions.parquet")

patients_s = (
    patients
      .withColumn("patient_id", F.upper("Id"))
      .withColumn("birthdate",
          F.when(F.col("BIRTHDATE").rlike(DATE_RX), F.to_date("BIRTHDATE")).otherwise(F.lit(None).cast(T.DateType()))
      )
      .withColumn("deathdate",
          F.when(F.col("DEATHDATE").rlike(DATE_RX), F.to_date("DEATHDATE")).otherwise(F.lit(None).cast(T.DateType()))
      )
      .withColumn("gender", F.upper(F.trim("GENDER")))
      .withColumn("age_years",
          F.floor(F.months_between(F.current_date(), F.col("birthdate"))/12)
      )
      .select("patient_id","gender","birthdate","deathdate","age_years")
)

conditions_s = (
    conditions
      .withColumn("patient_id", F.upper("PATIENT"))
      .withColumn("start_date",
          F.when(F.col("START").rlike(DATE_RX), F.to_date("START")).otherwise(F.lit(None).cast(T.DateType()))
      )
      .withColumn("stop_date",
          F.when(F.col("STOP").rlike(DATE_RX), F.to_date("STOP")).otherwise(F.lit(None).cast(T.DateType()))
      )
      .withColumn("is_active", F.col("stop_date").isNull())
      .select("patient_id","DESCRIPTION","CODE","start_date","stop_date","is_active")
)

patients_s.write.mode("overwrite").parquet(r"C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\Output2\patients_clean.parquet")
conditions_s.write.mode("overwrite").parquet(r"C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\Output2\conditions_clean.parquet")


In [14]:
spark.conf.set("spark.sql.ansi.enabled", "false")


In [15]:
patients.select("BIRTHDATE").filter(~F.col("BIRTHDATE").rlike(DATE_RX)).show(20, truncate=False)
conditions.select("START").filter(~F.col("START").rlike(DATE_RX)).show(20, truncate=False)


+----------------------------------------------+
|BIRTHDATE                                     |
+----------------------------------------------+
|S99957786                                     |
|0e769953-331e-40c0-ab30-e4eeadf347f9          |
|M                                             |
|S99989445                                     |
|bd303fee-fcb0-4e62-a89e-5a29199107c0          |
|1952-04-01d90ce1c7-574c-4df6-aa06-fb12fc590b06|
|white                                         |
|b8e021b8-ead0-4953-8d63-5fad90acf985          |
|white                                         |
|2f8ab7e4-479a-4f9a-aded-9fa32c69b723          |
|253e0804-e557-4fba-8314-1fa0f321194c          |
|Mrs.                                          |
|1951-02-21eea2956b-60bf-45f5-84a0-ad8eef8eb84a|
|fb4f3feb-e4c5-4f9a-b459-7e12ff0bb524          |
|8177 Travis Rest Wareham MA 02571 US          |
|1937-05-0552caee87-dc30-441f-98bf-7bc59c049419|
|S99953828                                     |
|1989-09-20ab7668f5-

In [16]:
# Inspect malformed birthdates
patients.select("BIRTHDATE") \
        .filter(~F.col("BIRTHDATE").rlike(DATE_RX)) \
        .distinct() \
        .show(20, truncate=False)

# Inspect malformed condition start dates
conditions.select("START") \
          .filter(~F.col("START").rlike(DATE_RX)) \
          .distinct() \
          .show(20, truncate=False)


+----------------------------------------------+
|BIRTHDATE                                     |
+----------------------------------------------+
|b8e021b8-ead0-4953-8d63-5fad90acf985          |
|0e769953-331e-40c0-ab30-e4eeadf347f9          |
|a86fba66-0500-4566-b62e-5477345dd69a          |
|1952-04-01d90ce1c7-574c-4df6-aa06-fb12fc590b06|
|1937-05-0552caee87-dc30-441f-98bf-7bc59c049419|
|Mrs.                                          |
|S99953828                                     |
|1925-10-229f5284d6-27bc-4d88-baf5-e631c756cfcc|
|M                                             |
|white                                         |
|999-55-4216                                   |
|bd303fee-fcb0-4e62-a89e-5a29199107c0          |
|fb4f3feb-e4c5-4f9a-b459-7e12ff0bb524          |
|999-31-4368                                   |
|S99957786                                     |
|1989-09-20ab7668f5-e1c7-43cd-8d97-18aec145f1bb|
|1951-02-21eea2956b-60bf-45f5-84a0-ad8eef8eb84a|
|irish              

In [17]:
patients_clean = patients.filter(F.col("BIRTHDATE").rlike(DATE_RX))


In [18]:
from pyspark.sql import functions as F, types as T

IN_PAT  = r"C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\Output1\patients.parquet"
IN_COND = r"C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\Output1\conditions.parquet"

OUT_SILVER = r"C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\Output2"


In [19]:
patients   = spark.read.parquet(IN_PAT)
conditions = spark.read.parquet(IN_COND)


In [20]:
DATE_RE = r'(\d{4}-\d{2}-\d{2})'   # capture group 1 = the date we want


In [21]:
# patients
valid_birth = patients.filter(F.col("BIRTHDATE").rlike(r"^\d{4}-\d{2}-\d{2}$")).count()
invalid_birth = patients.filter(~F.col("BIRTHDATE").rlike(r"^\d{4}-\d{2}-\d{2}$")).count()
print("patients BIRTHDATE → valid:", valid_birth, "| invalid:", invalid_birth)

# conditions
valid_start = conditions.filter(F.col("START").rlike(r"^\d{4}-\d{2}-\d{2}$")).count()
invalid_start = conditions.filter(~F.col("START").rlike(r"^\d{4}-\d{2}-\d{2}$")).count()
print("conditions START → valid:", valid_start, "| invalid:", invalid_start)


patients BIRTHDATE → valid: 133184 | invalid: 205
conditions START → valid: 485284 | invalid: 0


In [22]:
patients_clean_drop = patients.filter(F.col("BIRTHDATE").rlike(r"^\d{4}-\d{2}-\d{2}$"))
conditions_clean_drop = conditions.filter(F.col("START").rlike(r"^\d{4}-\d{2}-\d{2}$") | F.col("START").isNull())


In [23]:
# Patients
patients_s = (
    patients
      .withColumn("patient_id", F.upper("Id"))
      .withColumn("birthdate_str", F.regexp_extract(F.col("BIRTHDATE"), DATE_RE, 1))
      .withColumn("deathdate_str", F.regexp_extract(F.col("DEATHDATE"), DATE_RE, 1))
      .withColumn("birthdate", F.to_date("birthdate_str"))
      .withColumn("deathdate", F.to_date("deathdate_str"))
      .withColumn("gender", F.upper(F.trim("GENDER")))
      .withColumn("age_years", F.floor(F.months_between(F.current_date(), F.col("birthdate"))/12))
      .select("patient_id","gender","birthdate","deathdate","age_years")
)

# Conditions
conditions_s = (
    conditions
      .withColumn("patient_id", F.upper("PATIENT"))
      .withColumn("start_str", F.regexp_extract(F.col("START"), DATE_RE, 1))
      .withColumn("stop_str",  F.regexp_extract(F.col("STOP"),  DATE_RE, 1))
      .withColumn("start_date", F.to_date("start_str"))
      .withColumn("stop_date",  F.to_date("stop_str"))
      .withColumn("is_active",  F.col("stop_date").isNull())
      .select("patient_id","DESCRIPTION","CODE","start_date","stop_date","is_active")
)


In [25]:
patients_s.write.mode("overwrite").parquet(fr"{OUT_SILVER}\patients_clean.parquet")
conditions_s.write.mode("overwrite").parquet(fr"{OUT_SILVER}\conditions_clean.parquet")


In [26]:
p = spark.read.parquet(fr"{OUT_SILVER}\patients_clean.parquet")
c = spark.read.parquet(fr"{OUT_SILVER}\conditions_clean.parquet")

p.select("birthdate").filter(F.col("birthdate").isNull()).show(5)   # ideally few or none
c.select("start_date").filter(F.col("start_date").isNull()).show(5)

p.show(5, truncate=False)
c.show(5, truncate=False)


+---------+
|birthdate|
+---------+
|     NULL|
|     NULL|
|     NULL|
|     NULL|
|     NULL|
+---------+
only showing top 5 rows
+----------+
|start_date|
+----------+
+----------+

+------------------------------------+------+----------+----------+---------+
|patient_id                          |gender|birthdate |deathdate |age_years|
+------------------------------------+------+----------+----------+---------+
|CE82E429-A89E-4DD0-B288-A572E3B8C17F|F     |1996-04-09|NULL      |29       |
|DAEF6412-4C0B-49FF-8752-D0BBBD51963B|F     |2001-04-22|NULL      |24       |
|61ADAE8E-6CF7-4180-B369-9E6340E84609|F     |2005-07-02|2006-01-21|20       |
|94EB9D8D-112E-4D73-8AA3-8C4F9D2A5BE5|F     |1975-12-10|NULL      |49       |
|F273EC3F-9ADB-4834-95D9-4D0B788B9E1E|F     |1995-08-28|NULL      |29       |
+------------------------------------+------+----------+----------+---------+
only showing top 5 rows
+------------------------------------+------------------------------------------------+--

In [27]:
from pathlib import Path

DATA_ROOT = r"C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients\root"
OUT_ROOT  = r"C:\Users\sohel\Dropbox\Ariana\Interview_Data\Python_Model\patients"

OUT1 = str(Path(OUT_ROOT) / "Output1")
OUT2 = str(Path(OUT_ROOT) / "Output2")

IN_PAT_BRONZE  = str(Path(OUT1) / "patients.parquet")
IN_COND_BRONZE = str(Path(OUT1) / "conditions.parquet")

IN_PAT_SILVER  = str(Path(OUT2) / "patients_clean.parquet")
IN_COND_SILVER = str(Path(OUT2) / "conditions_clean.parquet")

IN_PAT_SILVER, IN_COND_SILVER


('C:\\Users\\sohel\\Dropbox\\Ariana\\Interview_Data\\Python_Model\\patients\\Output2\\patients_clean.parquet',
 'C:\\Users\\sohel\\Dropbox\\Ariana\\Interview_Data\\Python_Model\\patients\\Output2\\conditions_clean.parquet')

In [28]:
from pyspark.sql import functions as F

p = spark.read.parquet(IN_PAT_SILVER)
c = spark.read.parquet(IN_COND_SILVER)

# Null counts + rates (patients)
p_nulls = (
    p.select(
        F.count(F.when(F.col("birthdate").isNull(), 1)).alias("null_birthdate"),
        F.count(F.when(F.col("deathdate").isNull(), 1)).alias("null_deathdate"),
        F.count("*").alias("rows"),
    )
    .withColumn("pct_null_birthdate", F.col("null_birthdate") / F.col("rows"))
    .withColumn("pct_null_deathdate", F.col("null_deathdate") / F.col("rows"))
)
p_nulls.show(truncate=False)

# Null counts + rates (conditions)
c_nulls = (
    c.select(
        F.count(F.when(F.col("start_date").isNull(), 1)).alias("null_start"),
        F.count(F.when(F.col("stop_date").isNull(), 1)).alias("null_stop"),
        F.count("*").alias("rows"),
    )
    .withColumn("pct_null_start", F.col("null_start") / F.col("rows"))
    .withColumn("pct_null_stop", F.col("null_stop") / F.col("rows"))
)
c_nulls.show(truncate=False)


+--------------+--------------+------+---------------------+------------------+
|null_birthdate|null_deathdate|rows  |pct_null_birthdate   |pct_null_deathdate|
+--------------+--------------+------+---------------------+------------------+
|240           |99683         |133443|0.0017985207167105057|0.7470080858493888|
+--------------+--------------+------+---------------------+------------------+

+----------+---------+------+--------------+------------------+
|null_start|null_stop|rows  |pct_null_start|pct_null_stop     |
+----------+---------+------+--------------+------------------+
|0         |219720   |485284|0.0           |0.4527658031173498|
+----------+---------+------+--------------+------------------+



In [29]:
spark.read.parquet(IN_PAT_SILVER).show(5, truncate=False)
spark.read.parquet(IN_COND_SILVER).show(5, truncate=False)


+------------------------------------+------+----------+----------+---------+
|patient_id                          |gender|birthdate |deathdate |age_years|
+------------------------------------+------+----------+----------+---------+
|CE82E429-A89E-4DD0-B288-A572E3B8C17F|F     |1996-04-09|NULL      |29       |
|DAEF6412-4C0B-49FF-8752-D0BBBD51963B|F     |2001-04-22|NULL      |24       |
|61ADAE8E-6CF7-4180-B369-9E6340E84609|F     |2005-07-02|2006-01-21|20       |
|94EB9D8D-112E-4D73-8AA3-8C4F9D2A5BE5|F     |1975-12-10|NULL      |49       |
|F273EC3F-9ADB-4834-95D9-4D0B788B9E1E|F     |1995-08-28|NULL      |29       |
+------------------------------------+------+----------+----------+---------+
only showing top 5 rows
+------------------------------------+------------------------------------------------+---------+----------+----------+---------+
|patient_id                          |DESCRIPTION                                     |CODE     |start_date|stop_date |is_active|
+-------------

In [30]:
p = spark.read.parquet(IN_PAT_SILVER)
c = spark.read.parquet(IN_COND_SILVER)

print("Patients_clean schema:")
p.printSchema()
p.show(5, truncate=False)

print("Conditions_clean schema:")
c.printSchema()
c.show(5, truncate=False)


Patients_clean schema:
root
 |-- patient_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- birthdate: date (nullable = true)
 |-- deathdate: date (nullable = true)
 |-- age_years: long (nullable = true)

+------------------------------------+------+----------+----------+---------+
|patient_id                          |gender|birthdate |deathdate |age_years|
+------------------------------------+------+----------+----------+---------+
|CE82E429-A89E-4DD0-B288-A572E3B8C17F|F     |1996-04-09|NULL      |29       |
|DAEF6412-4C0B-49FF-8752-D0BBBD51963B|F     |2001-04-22|NULL      |24       |
|61ADAE8E-6CF7-4180-B369-9E6340E84609|F     |2005-07-02|2006-01-21|20       |
|94EB9D8D-112E-4D73-8AA3-8C4F9D2A5BE5|F     |1975-12-10|NULL      |49       |
|F273EC3F-9ADB-4834-95D9-4D0B788B9E1E|F     |1995-08-28|NULL      |29       |
+------------------------------------+------+----------+----------+---------+
only showing top 5 rows
Conditions_clean schema:
root
 |-- patient_id: s

In [31]:
print("Patients:", p.count())
print("Conditions:", c.count())


Patients: 133443
Conditions: 485284


In [32]:
c.groupBy("DESCRIPTION").count().orderBy(F.desc("count")).show(10, truncate=False)


+------------------------------------+-----+
|DESCRIPTION                         |count|
+------------------------------------+-----+
|Viral sinusitis (disorder)          |78859|
|Acute viral pharyngitis (disorder)  |43557|
|Acute bronchitis (disorder)         |35626|
|Prediabetes                         |34448|
|Hypertension                        |30322|
|Chronic sinusitis (disorder)        |24753|
|Normal pregnancy                    |15492|
|Otitis media                        |12420|
|Streptococcal sore throat (disorder)|10484|
|Coronary Heart Disease              |8511 |
+------------------------------------+-----+
only showing top 10 rows


In [33]:
p.select("age_years").describe().show()


+-------+-----------------+
|summary|        age_years|
+-------+-----------------+
|  count|           133203|
|   mean|  56.740981809719|
| stddev|27.26726639062234|
|    min|                8|
|    max|              119|
+-------+-----------------+



In [34]:
patient_example = {
  "patient_id": "1234",
  "demographics": {"gender": "F", "birthdate": "1980-01-01", "age_years": 45},
  "conditions": [
    {"description": "Hypertension", "code": "38341003", "start": "2019-04-10", "stop": None, "active": True},
    {"description": "Prediabetes",  "code": "15777000", "start": "2021-07-05", "stop": None, "active": True}
  ]
}
print(patient_example)


{'patient_id': '1234', 'demographics': {'gender': 'F', 'birthdate': '1980-01-01', 'age_years': 45}, 'conditions': [{'description': 'Hypertension', 'code': '38341003', 'start': '2019-04-10', 'stop': None, 'active': True}, {'description': 'Prediabetes', 'code': '15777000', 'start': '2021-07-05', 'stop': None, 'active': True}]}


In [35]:
import json

patient_json = """
{
  "patient_id": "1234",
  "demographics": {"gender": "F", "birthdate": "1980-01-01", "age_years": 45},
  "conditions": [
    {"description": "Hypertension", "code": "38341003", "start": "2019-04-10", "stop": null, "active": true},
    {"description": "Prediabetes",  "code": "15777000", "start": "2021-07-05", "stop": null, "active": true}
  ]
}
"""

parsed = json.loads(patient_json)   # convert JSON string → Python dict
print(parsed)


{'patient_id': '1234', 'demographics': {'gender': 'F', 'birthdate': '1980-01-01', 'age_years': 45}, 'conditions': [{'description': 'Hypertension', 'code': '38341003', 'start': '2019-04-10', 'stop': None, 'active': True}, {'description': 'Prediabetes', 'code': '15777000', 'start': '2021-07-05', 'stop': None, 'active': True}]}


In [36]:
from pydantic import BaseModel
from typing import Optional, List
import json

class Condition(BaseModel):
    description: Optional[str] = None
    code: Optional[str] = None
    start: Optional[str] = None
    stop: Optional[str] = None
    active: Optional[bool] = None

class Demographics(BaseModel):
    gender: Optional[str] = None
    birthdate: Optional[str] = None
    age_years: Optional[int] = None

class PatientFacts(BaseModel):
    patient_id: str
    demographics: Demographics
    conditions: List[Condition] = []

patient_example = {
    "patient_id": "1234",
    "demographics": {"gender": "F", "birthdate": "1980-01-01", "age_years": 45},
    "conditions": [
        {"description": "Hypertension", "code": "38341003", "start": "2019-04-10", "stop": None, "active": True},
        {"description": "Prediabetes",  "code": "15777000", "start": "2021-07-05", "stop": None, "active": True}
    ]
}

validated = PatientFacts(**patient_example)

# EITHER:
print(validated.model_dump_json(indent=2))
# OR:
# print(json.dumps(validated.model_dump(), indent=2))


{
  "patient_id": "1234",
  "demographics": {
    "gender": "F",
    "birthdate": "1980-01-01",
    "age_years": 45
  },
  "conditions": [
    {
      "description": "Hypertension",
      "code": "38341003",
      "start": "2019-04-10",
      "stop": null,
      "active": true
    },
    {
      "description": "Prediabetes",
      "code": "15777000",
      "start": "2021-07-05",
      "stop": null,
      "active": true
    }
  ]
}


In [37]:
def build_prompt(facts: PatientFacts) -> str:
    # Dump Pydantic object to JSON
    compact = facts.model_dump()

    # Optional: clip lists so prompt stays short
    compact["conditions"] = compact.get("conditions", [])[:5]

    prompt = f"""
You are a medical assistant writing a brief, friendly note to a patient.
Use ONLY the facts supplied. Do not invent diagnoses or dates.
Keep the language plain (grade 6–8), about 4–6 sentences.

Facts (JSON):
{json.dumps(compact, indent=2)}

Instruction:
- Summarize recent conditions (prioritize 'active' ones).
- Mention demographics (age, gender).
- Add a friendly reminder for follow-up or healthy living.
- End with: "Contact your care team if symptoms change."

Write the message addressed directly to the patient:
"""
    return prompt.strip()


In [40]:
def build_prompt(facts):
    return f"""Summarize the following patient data into a short narrative:

Patient ID: {facts.patient_id}
Age: {facts.demographics['age']}
Sex: {facts.demographics['sex']}
Diagnosis: {facts.baseline['cancer_type']} stage {facts.baseline['stage']}
Visits: {facts.visits}
Medications: {facts.medications}
Labs: {facts.labs}
Adverse events: {facts.adverse_events}
Next visit: {facts.next_visit}

Narrative:
"""


In [41]:
from transformers import pipeline, set_seed

def generate_narrative(facts, model_name="google/flan-t5-small"):
    set_seed(42)
    prompt = build_prompt(facts)

    gen = pipeline("text2text-generation", model=model_name, device=-1)
    out = gen(prompt, max_length=200, do_sample=False)
    return out[0]["generated_text"].strip()

# Example run (assuming `validated` is your PatientFacts dict or Pydantic model)
narrative = generate_narrative(validated)
print("Generated Narrative:\n")
print(narrative)


TypeError: 'Demographics' object is not subscriptable