# Synthea Gold Transformations
Transforms cleaned Silver-layer tables into curated Gold-layer tables ready for reporting and dashboarding.


In [0]:
from pyspark.sql.functions import (
    col, to_date, year, month, dayofmonth,
    datediff, when, lpad, length, lit,
    current_date, floor, broadcast, trim,
    countDistinct, sum as spark_sum
)

In [0]:
# Widgets for workflow compatibility
dbutils.widgets.text("silver_base_path", "/FileStore/silver")
dbutils.widgets.text("gold_base_path", "/FileStore/gold")

silver_base = dbutils.widgets.get("silver_base_path")
gold_base = dbutils.widgets.get("gold_base_path")


Function and call to create fact_encounters_gold table (enriched fields from patients and organizations.)

In [0]:
def transform_fact_encounters_gold(
    encounters_path: str,
    patients_path: str,
    organizations_path: str,
    output_path: str
):
    """
    Creates Gold fact_encounters table with 
    """
    print("📥 Loading Silver tables...")
    encounters = spark.read.format("delta").load(encounters_path)
    patients = spark.read.format("delta").load(patients_path)
    organizations = spark.read.format("delta").load(organizations_path)

    print("🔄 Transforming data...")
    df = encounters.withColumn("start_date", to_date("start")) \
                   .withColumn("year", year("start")) \
                   .withColumn("month", month("start")) \
                   .withColumn("day", dayofmonth("start")) \
                   .withColumn("encounter_duration_minutes", 
                               (col("stop").cast("long") - col("start").cast("long")) / 60) \
                   .withColumn("cost_bucket", when(col("total_claim_cost") < 100, "<$100")
                                               .when(col("total_claim_cost") < 500, "$100–499")
                                               .otherwise("$500+"))

    df = df.join(
        broadcast(patients.select("id", "birthdate")), #broadcast to send silver patient info to all nodes, small table relative to encounters
        df.patient == patients.id,
        "left"
    ).withColumn("age_at_encounter", 
        (datediff(col("start_date"), col("birthdate")) / 365.25).cast("int")
    ).drop(patients.id)

    df = df.join(
        broadcast(organizations.select("id", "city", "state", "zip_cleaned")), #broadcast again (optimize bcs smaller table organization, reduce shuffle)
        df.organization == organizations.id,
        "left"
    ).withColumnRenamed("zip_cleaned", "org_zip").drop(organizations.id)

    # testing output 
    display(df.limit(10))

    # uncomment when ready to write delta table 
    print(f"📝 Writing to Gold path: {output_path}")
    df.write.format("delta").mode("overwrite").save(output_path)

    print(f"✅ Done! Row count: {df.count()}")


In [0]:
transform_fact_encounters_gold(
    encounters_path=f"{silver_base}/encounters",
    patients_path=f"{silver_base}/patients",
    organizations_path=f"{silver_base}/organizations",
    output_path=f"{gold_base}/fact_encounters"
)


📥 Loading Silver tables...
🔄 Transforming data...


id,start,stop,patient,organization,provider,payer,encounterclass,code,description,base_encounter_cost,total_claim_cost,payer_coverage,reasoncode,reasondescription,ingestion_timestamp,source_file,start_date,year,month,day,encounter_duration_minutes,cost_bucket,birthdate,age_at_encounter,city,state,org_zip
96c9dae7-5534-9b62-ff62-f9070d16cccc,2021-10-15T04:28:53.000+0000,2021-10-15T04:43:53.000+0000,0e4068e0-4312-9a70-8f4d-46d1ddb9ae7c,87f3b181-1c52-3161-a15a-ea2d9a04b35f,ade28ca4-9796-33b6-91d2-5656ecd6b2a5,26aab0cd-6aba-3e1b-ac5b-05c8867e762c,wellness,410620009,Well child visit (procedure),136.8,347.38,0.0,,,2025-06-16T02:50:02.257+0000,/FileStore/tables/encounters.csv,2021-10-15,2021,10,15,15.0,$100–499,2021-10-14,0,NORTH ANDOVER,MA,18455
4f7d84f3-8207-7c46-3291-3e5ed0bfe0df,2015-05-29T11:05:06.000+0000,2015-05-29T11:20:06.000+0000,8a4869c4-4545-219f-2c65-fc44f98f3edf,e7b77e8f-b607-3273-aef9-ad1bca48813f,5e4867ba-cd40-310b-bcc8-13320f4a81ed,e03e23c9-4df1-3eb6-a62d-f70f02301496,wellness,410620009,Well child visit (procedure),136.8,272.8,0.0,,,2025-06-16T02:50:02.257+0000,/FileStore/tables/encounters.csv,2015-05-29,2015,5,29,15.0,$100–499,2013-12-20,1,NATICK,MA,17603
b7d27489-fe6c-bd22-a90c-789c1aa2a7ce,2015-06-13T11:05:06.000+0000,2015-06-13T11:20:06.000+0000,8a4869c4-4545-219f-2c65-fc44f98f3edf,05fcf001-ab24-3d7d-85f7-f64942d82738,40f1fbee-320e-3a02-902f-bb55d650ac67,e03e23c9-4df1-3eb6-a62d-f70f02301496,outpatient,185345009,Encounter for symptom (procedure),85.55,85.55,0.0,65363002.0,Otitis media (disorder),2025-06-16T02:50:02.257+0000,/FileStore/tables/encounters.csv,2015-06-13,2015,6,13,15.0,<$100,2013-12-20,1,FRAMINGHAM,MA,17014
92f15155-d985-4d86-caef-feeac8a22454,2015-11-27T11:05:06.000+0000,2015-11-27T11:20:06.000+0000,8a4869c4-4545-219f-2c65-fc44f98f3edf,e7b77e8f-b607-3273-aef9-ad1bca48813f,5e4867ba-cd40-310b-bcc8-13320f4a81ed,e03e23c9-4df1-3eb6-a62d-f70f02301496,wellness,410620009,Well child visit (procedure),136.8,272.8,0.0,,,2025-06-16T02:50:02.257+0000,/FileStore/tables/encounters.csv,2015-11-27,2015,11,27,15.0,$100–499,2013-12-20,1,NATICK,MA,17603
e8085e9a-5531-0fae-2d81-a3ee076fc878,2016-05-27T11:05:06.000+0000,2016-05-27T11:20:06.000+0000,8a4869c4-4545-219f-2c65-fc44f98f3edf,e7b77e8f-b607-3273-aef9-ad1bca48813f,5e4867ba-cd40-310b-bcc8-13320f4a81ed,e03e23c9-4df1-3eb6-a62d-f70f02301496,wellness,410620009,Well child visit (procedure),136.8,1135.6,0.0,,,2025-06-16T02:50:02.257+0000,/FileStore/tables/encounters.csv,2016-05-27,2016,5,27,15.0,$500+,2013-12-20,2,NATICK,MA,17603
01d9dc5f-d51d-bf34-7198-dbf8f52e6052,2021-11-19T04:28:53.000+0000,2021-11-19T04:43:53.000+0000,0e4068e0-4312-9a70-8f4d-46d1ddb9ae7c,87f3b181-1c52-3161-a15a-ea2d9a04b35f,ade28ca4-9796-33b6-91d2-5656ecd6b2a5,26aab0cd-6aba-3e1b-ac5b-05c8867e762c,wellness,410620009,Well child visit (procedure),136.8,1135.6,0.0,,,2025-06-16T02:50:02.257+0000,/FileStore/tables/encounters.csv,2021-11-19,2021,11,19,15.0,$500+,2021-10-14,0,NORTH ANDOVER,MA,18455
23998ffd-5bca-e28e-915c-42763fd6e7f0,2016-10-06T11:05:06.000+0000,2016-10-06T12:05:06.000+0000,8a4869c4-4545-219f-2c65-fc44f98f3edf,4887dc20-9ef6-3cc2-9c56-858388049a50,3d93c4aa-a703-3538-b7c9-1e9950006112,e03e23c9-4df1-3eb6-a62d-f70f02301496,emergency,50849002,Emergency room admission (procedure),146.18,146.18,0.0,110030002.0,Concussion injury of brain (disorder),2025-06-16T02:50:02.257+0000,/FileStore/tables/encounters.csv,2016-10-06,2016,10,6,60.0,$100–499,2013-12-20,2,NATICK,MA,17604
1a82bcb4-4af2-1e09-5d17-1b9de6e50323,2016-11-25T11:05:06.000+0000,2016-11-25T11:20:06.000+0000,8a4869c4-4545-219f-2c65-fc44f98f3edf,e7b77e8f-b607-3273-aef9-ad1bca48813f,5e4867ba-cd40-310b-bcc8-13320f4a81ed,e03e23c9-4df1-3eb6-a62d-f70f02301496,wellness,410620009,Well child visit (procedure),136.8,272.8,0.0,,,2025-06-16T02:50:02.257+0000,/FileStore/tables/encounters.csv,2016-11-25,2016,11,25,15.0,$100–499,2013-12-20,2,NATICK,MA,17603
d88400c9-90d5-5244-a5e8-beef337c655c,2016-12-05T11:05:06.000+0000,2016-12-05T11:20:06.000+0000,8a4869c4-4545-219f-2c65-fc44f98f3edf,05fcf001-ab24-3d7d-85f7-f64942d82738,40f1fbee-320e-3a02-902f-bb55d650ac67,e03e23c9-4df1-3eb6-a62d-f70f02301496,ambulatory,185349003,Encounter for check up (procedure),85.55,85.55,0.0,62106007.0,Concussion with no loss of consciousness (disorder),2025-06-16T02:50:02.257+0000,/FileStore/tables/encounters.csv,2016-12-05,2016,12,5,15.0,<$100,2013-12-20,2,FRAMINGHAM,MA,17014
b31c0b80-b8bc-8dad-7361-fefb4cf784d3,2017-05-26T11:05:06.000+0000,2017-05-26T11:20:06.000+0000,8a4869c4-4545-219f-2c65-fc44f98f3edf,e7b77e8f-b607-3273-aef9-ad1bca48813f,5e4867ba-cd40-310b-bcc8-13320f4a81ed,e03e23c9-4df1-3eb6-a62d-f70f02301496,wellness,410620009,Well child visit (procedure),136.8,136.8,0.0,,,2025-06-16T02:50:02.257+0000,/FileStore/tables/encounters.csv,2017-05-26,2017,5,26,15.0,$100–499,2013-12-20,3,NATICK,MA,17603


📝 Writing to Gold path: /FileStore/gold/fact_encounters
✅ Done! Row count: 176093


Function and call to create fact_conditions table (enriched with age at condition and date parts).

In [0]:

def transform_fact_conditions_gold(
    conditions_path: str,
    patients_path: str,
    output_path: str
):
    """
    Creates Gold 
    """
    print("📥 Loading Silver tables...")
    conditions = spark.read.format("delta").load(conditions_path)
    patients = spark.read.format("delta").load(patients_path)

    print("🔄 Transforming data...")
    df = conditions.withColumn("start_date", to_date("start")) \
                   .withColumn("year", year("start")) \
                   .withColumn("month", month("start")) \
                   .withColumn("day", dayofmonth("start"))

    df = df.join(
        broadcast(patients.select("id", "birthdate")),
        df.patient == patients.id,
        "left"
    ).withColumn("age_at_condition", 
        (datediff(col("start_date"), col("birthdate")) / 365.25).cast("int")
    ).drop(patients.id)

    # Optional display for dev inspection
    display(df.limit(10))

    # Uncomment when ready to write
    print(f"📝 Writing to Gold path: {output_path}")
    df.write.format("delta").mode("overwrite").save(output_path)

    print(f"✅ Done! Row count: {df.count()}")


In [0]:
transform_fact_conditions_gold(
    patients_path=f"{silver_base}/patients",
    conditions_path=f"{silver_base}/conditions",
    output_path=f"{gold_base}/fact_conditions"
) 

📥 Loading Silver tables...
🔄 Transforming data...


start,stop,patient,encounter,system,code,description,ingestion_timestamp,source_file,condition_type,start_date,year,month,day,birthdate,age_at_condition
2015-05-29,2016-05-27,8a4869c4-4545-219f-2c65-fc44f98f3edf,4f7d84f3-8207-7c46-3291-3e5ed0bfe0df,SNOMED-CT,314529007,Medication review due (situation),2025-06-16T02:50:32.469+0000,/FileStore/tables/conditions.csv,situation,2015-05-29,2015,5,29,2013-12-20,1
2021-10-14,2021-11-18,0e4068e0-4312-9a70-8f4d-46d1ddb9ae7c,96c9dae7-5534-9b62-ff62-f9070d16cccc,SNOMED-CT,314529007,Medication review due (situation),2025-06-16T02:50:32.469+0000,/FileStore/tables/conditions.csv,situation,2021-10-14,2021,10,14,2021-10-14,0
2015-06-13,2015-11-27,8a4869c4-4545-219f-2c65-fc44f98f3edf,b7d27489-fe6c-bd22-a90c-789c1aa2a7ce,SNOMED-CT,65363002,Otitis media (disorder),2025-06-16T02:50:32.469+0000,/FileStore/tables/conditions.csv,disorder,2015-06-13,2015,6,13,2013-12-20,1
2016-10-06,2016-12-05,8a4869c4-4545-219f-2c65-fc44f98f3edf,23998ffd-5bca-e28e-915c-42763fd6e7f0,SNOMED-CT,110030002,Concussion injury of brain (disorder),2025-06-16T02:50:32.469+0000,/FileStore/tables/conditions.csv,disorder,2016-10-06,2016,10,6,2013-12-20,2
2016-10-06,2016-12-05,8a4869c4-4545-219f-2c65-fc44f98f3edf,23998ffd-5bca-e28e-915c-42763fd6e7f0,SNOMED-CT,62106007,Concussion with no loss of consciousness (disorder),2025-06-16T02:50:32.469+0000,/FileStore/tables/conditions.csv,disorder,2016-10-06,2016,10,6,2013-12-20,2
2016-11-25,2019-12-06,8a4869c4-4545-219f-2c65-fc44f98f3edf,1a82bcb4-4af2-1e09-5d17-1b9de6e50323,SNOMED-CT,314529007,Medication review due (situation),2025-06-16T02:50:32.469+0000,/FileStore/tables/conditions.csv,situation,2016-11-25,2016,11,25,2013-12-20,2
2022-01-20,2022-01-20,0e4068e0-4312-9a70-8f4d-46d1ddb9ae7c,dabf85d7-5cdf-ff0b-1374-9b14f06893e5,SNOMED-CT,314529007,Medication review due (situation),2025-06-16T02:50:32.469+0000,/FileStore/tables/conditions.csv,situation,2022-01-20,2022,1,20,2021-10-14,0
2006-02-19,2020-04-08,c0ec9264-5dda-fea6-ddea-edf27c0e846b,1d0b7aa1-cb2a-1b69-acb5-39aac257236c,SNOMED-CT,232353008,Perennial allergic rhinitis with seasonal variation (disorder),2025-06-16T02:50:32.469+0000,/FileStore/tables/conditions.csv,disorder,2006-02-19,2006,2,19,2001-04-04,4
2016-05-11,2016-05-11,c0ec9264-5dda-fea6-ddea-edf27c0e846b,3ec5e7c4-4362-eedf-aa7e-5bddcacba3c4,SNOMED-CT,314529007,Medication review due (situation),2025-06-16T02:50:32.469+0000,/FileStore/tables/conditions.csv,situation,2016-05-11,2016,5,11,2001-04-04,15
2016-05-11,2016-05-26,c0ec9264-5dda-fea6-ddea-edf27c0e846b,3ec5e7c4-4362-eedf-aa7e-5bddcacba3c4,SNOMED-CT,66383009,Gingivitis (disorder),2025-06-16T02:50:32.469+0000,/FileStore/tables/conditions.csv,disorder,2016-05-11,2016,5,11,2001-04-04,15


📝 Writing to Gold path: /FileStore/gold/fact_conditions
✅ Done! Row count: 106336


Function and call to create fact_patient_summary_gold table (aggregating total cost, conditions, and encounters per patient).


In [0]:

def transform_fact_patient_summary_gold(
    encounters_path: str,
    conditions_path: str,
    patients_path: str,
    output_path: str
):

    print("📥 Loading Silver tables...")
    encounters = spark.read.format("delta").load(encounters_path)
    conditions = spark.read.format("delta").load(conditions_path)
    patients = spark.read.format("delta").load(patients_path)

    print("🔄 Aggregating patient-level metrics...")

    # Reference date for age calc (e.g., snapshot of Jan 1, 2025)
    reference_date = to_date(lit("2025-01-01"))

    # Aggregate encounters
    #repartition to support parallelism, reshuffling data across partitions based on patient, avoid skew  
    encounter_agg = encounters \
        .repartition("patient") \
        .groupBy("patient") \
        .agg(spark_sum("total_claim_cost").alias("total_claim_cost"),
        countDistinct("id").alias("num_encounters"))

    # Aggregate conditions
    condition_agg = conditions \
        .repartition("patient") \
        .groupBy("patient") \
        .agg(countDistinct("code").alias("num_conditions"))

    # Build patient summary
    df = patients.select("id", "birthdate", "gender") \
        .join(encounter_agg, patients.id == encounter_agg.patient, "left") \
        .join(condition_agg, patients.id == condition_agg.patient, "left") \
        .drop(encounter_agg.patient) \
        .drop(condition_agg.patient)

    # Add age and bucket
    df = df.withColumn("age", (datediff(reference_date, col("birthdate")) / 365.25).cast("int")) \
        .withColumn("age_bucket", when(col("age") < 18, "<18")
                                    .when(col("age") < 30, "18-29")
                                    .when(col("age") < 45, "30-44")
                                    .when(col("age") < 65, "45-64")
                                    .otherwise("65+")) \
        .withColumnRenamed("id", "patient_id") \
        .fillna(0)
    # Optional display for dev inspection
    display(df.limit(10))

    # Uncomment when ready to write
    print(f"📝 Writing to Gold path: {output_path}")
    df.write.format("delta").mode("overwrite").save(output_path)

    print(f"✅ Done! Row count: {df.count()}")


In [0]:
transform_fact_patient_summary_gold(
    encounters_path = f"{silver_base}/encounters",
    conditions_path= f"{silver_base}/conditions",
    patients_path= f"{silver_base}/patients",
    output_path= f"{gold_base}/fact_patient_summary"
)

📥 Loading Silver tables...
🔄 Aggregating patient-level metrics...


patient_id,birthdate,gender,total_claim_cost,num_encounters,num_conditions,age,age_bucket
0e4068e0-4312-9a70-8f4d-46d1ddb9ae7c,2021-10-14,M,8631.600000000002,14,4,3,<18
8a4869c4-4545-219f-2c65-fc44f98f3edf,2013-12-20,F,26143.67,24,6,11,<18
c0ec9264-5dda-fea6-ddea-edf27c0e846b,2001-04-04,M,464309.52,64,12,23,18-29
44efb83f-e2f9-d51f-4c9c-f246c94a6836,2007-04-22,F,135240.73999999996,41,16,17,<18
4d14f82e-4161-579c-0229-4be23711676b,2010-07-07,M,28308.99,22,6,14,<18
9e8bdf4c-12c3-d7ed-7aa6-93ac1a9d8aba,1974-05-16,F,80042.07,42,19,50,45-64
347be68f-7625-f7cd-1171-a0031767dfe8,1964-01-19,F,224522.62000000008,53,35,60,45-64
eeb5567d-5663-1ba9-b028-32c50a6fb5ba,1962-04-14,M,133423.64999999994,48,20,62,45-64
24746653-4b85-e66c-bfbe-b7f7e936e06d,2003-05-14,M,17113.83,15,12,21,18-29
bff652a6-0d09-2c89-2642-976aad2ca872,1964-03-14,F,53515.01,30,17,60,45-64


📝 Writing to Gold path: /FileStore/gold/fact_patient_summary
✅ Done! Row count: 2920


Dimension gold tables - patient, provider, and organization. 

In [0]:
def transform_dim_patient_gold(patients_path: str, output_path: str):

    print("📥 Loading Silver patients table...")
    df = spark.read.format("delta").load(patients_path)

    print("🔄 Transforming patient dimension...")

    df = df.select(
        col("id").alias("patient_id"),
        trim(col("gender")).alias("gender"),
        to_date("birthdate").alias("birthdate"),
        to_date("deathdate").alias("deathdate"),
        trim(col("ethnicity")).alias("ethnicity"),
        trim(col("race")).alias("race"),
        col("income").cast("int"),
        trim(col("county")).alias("county"),
        trim(col("state")).alias("state"),
        col("zip_clean").cast("string").alias("zip")
    ).withColumn(
        "age",
        floor(
            datediff(
                when(col("deathdate").isNotNull(), col("deathdate")).otherwise(current_date()),
                col("birthdate")
            ) / 365.25
        )
    ).withColumn(
        "age_group",
        when(col("age") < 18, "0–17")
        .when(col("age") < 35, "18–34")
        .when(col("age") < 50, "35–49")
        .when(col("age") < 65, "50–64")
        .otherwise("65+")
    )


    # Optional preview
    display(df.limit(10))

    # Uncomment to write to disk
    print(f"💾 Writing to: {output_path}")
    df.write.format("delta").mode("overwrite").save(output_path)

    print(f"✅ dim_patient_gold written! Row count: {df.count()}")


In [0]:
transform_dim_patient_gold(
    patients_path= f"{silver_base}/patients",
    output_path= f"{gold_base}/dim_patients"
)

📥 Loading Silver patients table...
🔄 Transforming patient dimension...


patient_id,gender,birthdate,deathdate,ethnicity,race,income,county,state,zip,age,age_group
0e4068e0-4312-9a70-8f4d-46d1ddb9ae7c,M,2021-10-14,,nonhispanic,white,466824,Essex County,Massachusetts,1810,3,0–17
8a4869c4-4545-219f-2c65-fc44f98f3edf,F,2013-12-20,,nonhispanic,white,39521,Middlesex County,Massachusetts,1702,11,0–17
c0ec9264-5dda-fea6-ddea-edf27c0e846b,M,2001-04-04,,nonhispanic,white,18695,Hampden County,Massachusetts,1085,24,18–34
44efb83f-e2f9-d51f-4c9c-f246c94a6836,F,2007-04-22,,nonhispanic,white,169528,Plymouth County,Massachusetts,2341,18,18–34
4d14f82e-4161-579c-0229-4be23711676b,M,2010-07-07,,hispanic,asian,534744,Middlesex County,Massachusetts,1852,14,0–17
9e8bdf4c-12c3-d7ed-7aa6-93ac1a9d8aba,F,1974-05-16,,nonhispanic,white,83939,Middlesex County,Massachusetts,0,51,50–64
347be68f-7625-f7cd-1171-a0031767dfe8,F,1964-01-19,,nonhispanic,white,60987,Hampden County,Massachusetts,1009,61,50–64
eeb5567d-5663-1ba9-b028-32c50a6fb5ba,M,1962-04-14,1997-05-26,nonhispanic,white,301,Bristol County,Massachusetts,2723,35,35–49
24746653-4b85-e66c-bfbe-b7f7e936e06d,M,2003-05-14,,nonhispanic,white,107830,Hampden County,Massachusetts,1020,22,18–34
bff652a6-0d09-2c89-2642-976aad2ca872,F,1964-03-14,,nonhispanic,white,112624,Bristol County,Massachusetts,2743,61,50–64


💾 Writing to: /FileStore/gold/dim_patients
✅ dim_patient_gold written! Row count: 2920


In [0]:
def transform_dim_provider_gold(providers_path: str, output_path: str):
    """
    Transforms the Silver provider table into a cleaned Gold dimension table.
    """
    print("📥 Loading Silver providers table...")
    df = spark.read.format("delta").load(providers_path)

    print("🔄 Transforming provider dimension...")

    df = df.select(
        col("id").alias("provider_id"),
        trim(col("name")).alias("provider_name"),
        trim(col("speciality")).alias("specialty"),
        trim(col("address")).alias("address"),
        trim(col("city")).alias("city"),
        trim(col("state")).alias("state"),
        col("zip_clean").cast("string").alias("zip")
    )

    # Optional preview
    display(df.limit(10))

    # Uncomment to write to disk
    print(f"💾 Writing to: {output_path}")
    df.write.format("delta").mode("overwrite").save(output_path)

    print(f"✅ dim_provider_gold written! Row count: {df.count()}")


In [0]:
transform_dim_provider_gold(
    providers_path= f"{silver_base}/providers",
    output_path= f"{gold_base}/dim_providers"
)


📥 Loading Silver providers table...
🔄 Transforming provider dimension...


provider_id,provider_name,specialty,address,city,state,zip
b4fddf7d-8270-3555-8b5e-0b2ebeeb522f,Ted Reilly,GENERAL PRACTICE,881 Main Street,Fitchburg,MA,1420
a62208fe-9bf8-3a29-82ad-2b9c40c2c87a,Tiffaney Brakus,GENERAL PRACTICE,461 WALNUT AVE,JAMAICA PLAIN,MA,21302
ecbbed09-e689-3f52-9f9e-a15ad70518a7,Aleen Lueilwitz,GENERAL PRACTICE,19 TACOMA ST,WORCESTER,MA,16053
32b5d003-3d81-3b92-9981-8b7847ede274,Gustavo Armstrong,GENERAL PRACTICE,66 WASHINGTON ST,STOUGHTON,MA,20722
070a18e5-869b-3232-8113-8ba49be6f77b,Olympia Ward,GENERAL PRACTICE,512 MAIN STREET SUITE 211,SHREWSBURY,MA,15456
4c4ffa32-f352-3dfb-8a2c-0c95be7a4872,Ambrose Feeney,GENERAL PRACTICE,37 ROUTE 6A,SANDWICH,MA,25631
019d7b1f-8b17-3b65-bf49-8a12bd95835e,Ronald Emard,GENERAL PRACTICE,60 HOSPITAL RD,LEOMINSTER,MA,14533
d4a0701b-e6be-39b6-a5fc-b41eb2451061,Gisele Lehner,GENERAL PRACTICE,1400 VFW Parkway,West Roxbury,MA,2132
6eff0a54-349a-3125-8447-f1aa434fabda,Brittanie Krajcik,GENERAL PRACTICE,94 MAIN STREET,HYANNIS,MA,26013
aa4ffaf5-9b5f-3c08-8781-234a82ea41a4,Jerrod Harris,GENERAL PRACTICE,512 MAIN ST,HOLDEN,MA,15202


💾 Writing to: /FileStore/gold/dim_providers
✅ dim_provider_gold written! Row count: 1012


In [0]:
def transform_dim_organization_gold(organizations_path: str, output_path: str):
    """
    Transforms the Silver organization table into a cleaned Gold dimension table.
    """
    print("📥 Loading Silver organizations table...")
    df = spark.read.format("delta").load(organizations_path)

    print("🔄 Transforming organization dimension...")

    df = df.select(
        col("id").alias("organization_id"),
        trim(col("name")).alias("organization_name"),
        trim(col("address")).alias("address"),
        trim(col("city")).alias("city"),
        trim(col("state")).alias("state"),
        col("zip_cleaned").cast("string").alias("zip")
    )

    # Optional preview
    display(df.limit(10))

    # Uncomment to write to disk
    print(f"💾 Writing to: {output_path}")
    df.write.format("delta").mode("overwrite").save(output_path)

    print(f"✅ dim_organization_gold written! Row count: {df.count()}")


In [0]:
transform_dim_organization_gold(
    organizations_path= f"{silver_base}/organizations",
    output_path= f"{gold_base}/dim_organizations"
)


📥 Loading Silver organizations table...
🔄 Transforming organization dimension...


organization_id,organization_name,address,city,state,zip
74ab949d-17ac-3309-83a0-13b4405c66aa,Fitchburg Outpatient Clinic,881 Main Street,Fitchburg,MA,1420
588f6ce6-b8db-3588-8189-29db2680a313,BOSTON HEALTH CARE FOR THE HOMELESS PROGRAM INC,461 WALNUT AVE,JAMAICA PLAIN,MA,21302
b6398e07-4967-31a5-807f-380039a1f303,EDWARD M KENNEDY COMMUNITY HEALTH CENTER INC,19 TACOMA ST,WORCESTER,MA,16053
faffaf6a-ee1a-3673-b0b0-421a9c249244,ACTIVATED BY WELLNESS LLC,66 WASHINGTON ST,STOUGHTON,MA,20722
17a4bae5-8b64-34d7-8144-b428be027bd0,NURSE ON CALL,512 MAIN STREET SUITE 211,SHREWSBURY,MA,15456
4112b8b1-59df-3255-a7ca-f42ee0a4cb2e,CAPE HERITAGE REHABILITATION & HEALTH CARE CENTER,37 ROUTE 6A,SANDWICH,MA,25631
e09d4c49-c2ef-3b0f-9a46-3719d9219306,UMASS MEMORIAL HEALTHALLIANCE CLINTON HOSPITAL INC,60 HOSPITAL RD,LEOMINSTER,MA,14533
e76b5eb0-0c9d-3593-b5da-3c0e9a97bb96,VA Boston Healthcare System West Roxbury Campus,1400 VFW Parkway,West Roxbury,MA,2132
c241b977-4131-32e4-9957-e0a00b2a1e5f,DUFFY HEALTH CENTER,94 MAIN STREET,HYANNIS,MA,26013
e33fbb39-3d48-356e-b719-6942a94c09d9,RHAPSODY HOME HEALTH AGENCY LLC,512 MAIN ST,HOLDEN,MA,15202


💾 Writing to: /FileStore/gold/dim_organizations
✅ dim_organization_gold written! Row count: 1012


Register tables to hive metastore

In [0]:
gold_base_path = "/FileStore/gold"

tables_to_register = [
    "fact_encounters",
    "fact_conditions",
    "fact_patient_summary",
    "dim_patients",
    "dim_providers",
    "dim_organizations"
]

for table_name in tables_to_register:
    path = f"{gold_base_path}/{table_name}"
    print(f"Registering {table_name} at {path}")
    
    spark.sql(f"DROP TABLE IF EXISTS {table_name}")
    
    spark.sql(f"""
        CREATE TABLE {table_name}
        USING DELTA
        LOCATION '{path}'
    """)


Registering fact_encounters at /FileStore/gold/fact_encounters
Registering fact_conditions at /FileStore/gold/fact_conditions
Registering fact_patient_summary at /FileStore/gold/fact_patient_summary
Registering dim_patients at /FileStore/gold/dim_patients
Registering dim_providers at /FileStore/gold/dim_providers
Registering dim_organizations at /FileStore/gold/dim_organizations


Inspecting 

In [0]:
silver_enc_df = spark.read.format("delta").load(f"{silver_base}/encounters")
display(silver_enc_df.limit(10))

silver_enc_df.printSchema()

id,start,stop,patient,organization,provider,payer,encounterclass,code,description,base_encounter_cost,total_claim_cost,payer_coverage,reasoncode,reasondescription,ingestion_timestamp,source_file
96c9dae7-5534-9b62-ff62-f9070d16cccc,2021-10-15T04:28:53.000+0000,2021-10-15T04:43:53.000+0000,0e4068e0-4312-9a70-8f4d-46d1ddb9ae7c,87f3b181-1c52-3161-a15a-ea2d9a04b35f,ade28ca4-9796-33b6-91d2-5656ecd6b2a5,26aab0cd-6aba-3e1b-ac5b-05c8867e762c,wellness,410620009,Well child visit (procedure),136.8,347.38,0.0,,,2025-06-16T02:50:02.257+0000,/FileStore/tables/encounters.csv
4f7d84f3-8207-7c46-3291-3e5ed0bfe0df,2015-05-29T11:05:06.000+0000,2015-05-29T11:20:06.000+0000,8a4869c4-4545-219f-2c65-fc44f98f3edf,e7b77e8f-b607-3273-aef9-ad1bca48813f,5e4867ba-cd40-310b-bcc8-13320f4a81ed,e03e23c9-4df1-3eb6-a62d-f70f02301496,wellness,410620009,Well child visit (procedure),136.8,272.8,0.0,,,2025-06-16T02:50:02.257+0000,/FileStore/tables/encounters.csv
b7d27489-fe6c-bd22-a90c-789c1aa2a7ce,2015-06-13T11:05:06.000+0000,2015-06-13T11:20:06.000+0000,8a4869c4-4545-219f-2c65-fc44f98f3edf,05fcf001-ab24-3d7d-85f7-f64942d82738,40f1fbee-320e-3a02-902f-bb55d650ac67,e03e23c9-4df1-3eb6-a62d-f70f02301496,outpatient,185345009,Encounter for symptom (procedure),85.55,85.55,0.0,65363002.0,Otitis media (disorder),2025-06-16T02:50:02.257+0000,/FileStore/tables/encounters.csv
92f15155-d985-4d86-caef-feeac8a22454,2015-11-27T11:05:06.000+0000,2015-11-27T11:20:06.000+0000,8a4869c4-4545-219f-2c65-fc44f98f3edf,e7b77e8f-b607-3273-aef9-ad1bca48813f,5e4867ba-cd40-310b-bcc8-13320f4a81ed,e03e23c9-4df1-3eb6-a62d-f70f02301496,wellness,410620009,Well child visit (procedure),136.8,272.8,0.0,,,2025-06-16T02:50:02.257+0000,/FileStore/tables/encounters.csv
e8085e9a-5531-0fae-2d81-a3ee076fc878,2016-05-27T11:05:06.000+0000,2016-05-27T11:20:06.000+0000,8a4869c4-4545-219f-2c65-fc44f98f3edf,e7b77e8f-b607-3273-aef9-ad1bca48813f,5e4867ba-cd40-310b-bcc8-13320f4a81ed,e03e23c9-4df1-3eb6-a62d-f70f02301496,wellness,410620009,Well child visit (procedure),136.8,1135.6,0.0,,,2025-06-16T02:50:02.257+0000,/FileStore/tables/encounters.csv
01d9dc5f-d51d-bf34-7198-dbf8f52e6052,2021-11-19T04:28:53.000+0000,2021-11-19T04:43:53.000+0000,0e4068e0-4312-9a70-8f4d-46d1ddb9ae7c,87f3b181-1c52-3161-a15a-ea2d9a04b35f,ade28ca4-9796-33b6-91d2-5656ecd6b2a5,26aab0cd-6aba-3e1b-ac5b-05c8867e762c,wellness,410620009,Well child visit (procedure),136.8,1135.6,0.0,,,2025-06-16T02:50:02.257+0000,/FileStore/tables/encounters.csv
23998ffd-5bca-e28e-915c-42763fd6e7f0,2016-10-06T11:05:06.000+0000,2016-10-06T12:05:06.000+0000,8a4869c4-4545-219f-2c65-fc44f98f3edf,4887dc20-9ef6-3cc2-9c56-858388049a50,3d93c4aa-a703-3538-b7c9-1e9950006112,e03e23c9-4df1-3eb6-a62d-f70f02301496,emergency,50849002,Emergency room admission (procedure),146.18,146.18,0.0,110030002.0,Concussion injury of brain (disorder),2025-06-16T02:50:02.257+0000,/FileStore/tables/encounters.csv
1a82bcb4-4af2-1e09-5d17-1b9de6e50323,2016-11-25T11:05:06.000+0000,2016-11-25T11:20:06.000+0000,8a4869c4-4545-219f-2c65-fc44f98f3edf,e7b77e8f-b607-3273-aef9-ad1bca48813f,5e4867ba-cd40-310b-bcc8-13320f4a81ed,e03e23c9-4df1-3eb6-a62d-f70f02301496,wellness,410620009,Well child visit (procedure),136.8,272.8,0.0,,,2025-06-16T02:50:02.257+0000,/FileStore/tables/encounters.csv
d88400c9-90d5-5244-a5e8-beef337c655c,2016-12-05T11:05:06.000+0000,2016-12-05T11:20:06.000+0000,8a4869c4-4545-219f-2c65-fc44f98f3edf,05fcf001-ab24-3d7d-85f7-f64942d82738,40f1fbee-320e-3a02-902f-bb55d650ac67,e03e23c9-4df1-3eb6-a62d-f70f02301496,ambulatory,185349003,Encounter for check up (procedure),85.55,85.55,0.0,62106007.0,Concussion with no loss of consciousness (disorder),2025-06-16T02:50:02.257+0000,/FileStore/tables/encounters.csv
b31c0b80-b8bc-8dad-7361-fefb4cf784d3,2017-05-26T11:05:06.000+0000,2017-05-26T11:20:06.000+0000,8a4869c4-4545-219f-2c65-fc44f98f3edf,e7b77e8f-b607-3273-aef9-ad1bca48813f,5e4867ba-cd40-310b-bcc8-13320f4a81ed,e03e23c9-4df1-3eb6-a62d-f70f02301496,wellness,410620009,Well child visit (procedure),136.8,136.8,0.0,,,2025-06-16T02:50:02.257+0000,/FileStore/tables/encounters.csv


root
 |-- id: string (nullable = true)
 |-- start: timestamp (nullable = true)
 |-- stop: timestamp (nullable = true)
 |-- patient: string (nullable = true)
 |-- organization: string (nullable = true)
 |-- provider: string (nullable = true)
 |-- payer: string (nullable = true)
 |-- encounterclass: string (nullable = true)
 |-- code: long (nullable = true)
 |-- description: string (nullable = true)
 |-- base_encounter_cost: double (nullable = true)
 |-- total_claim_cost: double (nullable = true)
 |-- payer_coverage: double (nullable = true)
 |-- reasoncode: long (nullable = true)
 |-- reasondescription: string (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = true)
 |-- source_file: string (nullable = true)



In [0]:
silver_pt_df = spark.read.format("delta").load(f"{silver_base}/patients")
display(silver_pt_df.limit(10))

silver_pt_df.printSchema()

id,birthdate,deathdate,ssn,drivers,passport,prefix,first,middle,last,suffix,maiden,marital,race,ethnicity,gender,birthplace,address,city,state,county,fips,zip,lat,lon,healthcare_expenses,healthcare_coverage,income,ingestion_timestamp,source_file,zip_clean
0e4068e0-4312-9a70-8f4d-46d1ddb9ae7c,2021-10-14,,999-38-2519,,,,Raymundo,,Heidenreich,,,,white,nonhispanic,M,Pembroke Massachusetts US,420 Considine Junction Apt 72,Andover,Massachusetts,Essex County,25009.0,1810,42.64495575427654,-71.13183416887576,8850.9,0.0,466824,2025-06-16T02:49:34.548+0000,/FileStore/tables/patients.csv,1810
8a4869c4-4545-219f-2c65-fc44f98f3edf,2013-12-20,,999-94-6872,,,,Nereida,,Kuhic,,,,white,nonhispanic,F,Boston Massachusetts US,123 Roob Village Apt 40,Framingham,Massachusetts,Middlesex County,25017.0,1702,42.24606356195159,-71.38883115932856,33253.86,0.0,39521,2025-06-16T02:49:34.548+0000,/FileStore/tables/patients.csv,1702
c0ec9264-5dda-fea6-ddea-edf27c0e846b,2001-04-04,,999-76-8765,S99954162,X49765874X,Mr.,Raleigh,Judson,Bahringer,,,,white,nonhispanic,M,Boston Massachusetts US,131 Hammes Center Unit 42,Westfield,Massachusetts,Hampden County,25013.0,1085,42.093468849128016,-72.71562215086846,7914.49,678786.52,18695,2025-06-16T02:49:34.548+0000,/FileStore/tables/patients.csv,1085
44efb83f-e2f9-d51f-4c9c-f246c94a6836,2007-04-22,,999-95-2157,S99940739,,Ms.,Aileen,Jo,Kshlerin,,,,white,nonhispanic,F,Arlington Massachusetts US,1039 Welch Wall Suite 24,Hanson,Massachusetts,Plymouth County,25023.0,2341,42.04757746473025,-70.83745414282974,47809.73,126196.68,169528,2025-06-16T02:49:34.548+0000,/FileStore/tables/patients.csv,2341
4d14f82e-4161-579c-0229-4be23711676b,2010-07-07,,999-11-2103,,,,Benton,Man,Gibson,,,,asian,hispanic,M,Stoughton Massachusetts US,336 Crona Extension Apt 15,Lowell,Massachusetts,Middlesex County,25017.0,1852,42.69448876805488,-71.30703702120817,49504.87,1558.4,534744,2025-06-16T02:49:34.548+0000,/FileStore/tables/patients.csv,1852
9e8bdf4c-12c3-d7ed-7aa6-93ac1a9d8aba,1974-05-16,,999-15-2255,S99918620,X5436800X,Mrs.,Ciera,Chandra,Rau,,Runolfsson,M,white,nonhispanic,F,Boston Massachusetts US,295 Schumm Avenue Unit 81,Concord,Massachusetts,Middlesex County,,0,42.42278423424035,-71.35563481371928,812193.47,95005.26,83939,2025-06-16T02:49:34.548+0000,/FileStore/tables/patients.csv,0
347be68f-7625-f7cd-1171-a0031767dfe8,1964-01-19,,999-60-3550,S99938509,X60693692X,Ms.,Na,Ivette,Wisoky,,,S,white,nonhispanic,F,Barnstable Massachusetts US,395 Wintheiser Fork,Palmer,Massachusetts,Hampden County,25013.0,1009,42.17226735420984,-72.29717698800074,774948.39,281585.66,60987,2025-06-16T02:49:34.548+0000,/FileStore/tables/patients.csv,1009
eeb5567d-5663-1ba9-b028-32c50a6fb5ba,1962-04-14,1997-05-26,999-68-1934,S99964118,X28983285X,Mr.,Alejandro,Vincent,Barton,,,M,white,nonhispanic,M,West Concord Massachusetts US,300 Adams Well,Fall River,Massachusetts,Bristol County,25005.0,2723,41.629837233869424,-71.19027877578944,52836.71,126104.55,301,2025-06-16T02:49:34.548+0000,/FileStore/tables/patients.csv,2723
24746653-4b85-e66c-bfbe-b7f7e936e06d,2003-05-14,,999-36-9949,S99955117,X27536353X,Mr.,Lyndon,,Schulist,,,,white,nonhispanic,M,Quincy Massachusetts US,307 Bahringer Underpass Apt 43,Chicopee,Massachusetts,Hampden County,25013.0,1020,42.133299812845074,-72.5481968912779,37442.28,25155.05,107830,2025-06-16T02:49:34.548+0000,/FileStore/tables/patients.csv,1020
bff652a6-0d09-2c89-2642-976aad2ca872,1964-03-14,,999-69-6955,S99969654,X70893291X,Mrs.,Ema,Georgetta,Pollich,,Reynolds,D,white,nonhispanic,F,Framingham Massachusetts US,708 Waters Rue,New Bedford,Massachusetts,Bristol County,25005.0,2743,41.76299959526842,-70.85718820215224,1040250.98,254657.62,112624,2025-06-16T02:49:34.548+0000,/FileStore/tables/patients.csv,2743


root
 |-- id: string (nullable = true)
 |-- birthdate: date (nullable = true)
 |-- deathdate: date (nullable = true)
 |-- ssn: string (nullable = true)
 |-- drivers: string (nullable = true)
 |-- passport: string (nullable = true)
 |-- prefix: string (nullable = true)
 |-- first: string (nullable = true)
 |-- middle: string (nullable = true)
 |-- last: string (nullable = true)
 |-- suffix: string (nullable = true)
 |-- maiden: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- race: string (nullable = true)
 |-- ethnicity: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- birthplace: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- county: string (nullable = true)
 |-- fips: integer (nullable = true)
 |-- zip: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- healthcare_expenses: double (nullable = true)
 |--

In [0]:
silver_prv_df = spark.read.format("delta").load(f"{silver_base}/providers")
display(silver_prv_df.limit(10))

silver_prv_df.printSchema()

id,organization,name,gender,speciality,address,city,state,zip,lat,lon,encounters,procedures,ingestion_timestamp,source_file,zip_clean
b4fddf7d-8270-3555-8b5e-0b2ebeeb522f,74ab949d-17ac-3309-83a0-13b4405c66aa,Ted Reilly,M,GENERAL PRACTICE,881 Main Street,Fitchburg,MA,1420,42.586487,-71.80521,5687,0,2025-06-16T02:50:21.619+0000,/FileStore/tables/providers.csv,1420
a62208fe-9bf8-3a29-82ad-2b9c40c2c87a,588f6ce6-b8db-3588-8189-29db2680a313,Tiffaney Brakus,F,GENERAL PRACTICE,461 WALNUT AVE,JAMAICA PLAIN,MA,21302331,42.3115876,-71.09800136347226,161,0,2025-06-16T02:50:21.619+0000,/FileStore/tables/providers.csv,21302
ecbbed09-e689-3f52-9f9e-a15ad70518a7,b6398e07-4967-31a5-807f-380039a1f303,Aleen Lueilwitz,F,GENERAL PRACTICE,19 TACOMA ST,WORCESTER,MA,16053516,42.30230485,-71.76614692511056,94,0,2025-06-16T02:50:21.619+0000,/FileStore/tables/providers.csv,16053
32b5d003-3d81-3b92-9981-8b7847ede274,faffaf6a-ee1a-3673-b0b0-421a9c249244,Gustavo Armstrong,M,GENERAL PRACTICE,66 WASHINGTON ST,STOUGHTON,MA,20722571,42.14415820061801,-71.10378319512402,372,0,2025-06-16T02:50:21.619+0000,/FileStore/tables/providers.csv,20722
070a18e5-869b-3232-8113-8ba49be6f77b,17a4bae5-8b64-34d7-8144-b428be027bd0,Olympia Ward,F,GENERAL PRACTICE,512 MAIN STREET SUITE 211,SHREWSBURY,MA,15456406,42.2951095,-71.71808471110307,86,0,2025-06-16T02:50:21.619+0000,/FileStore/tables/providers.csv,15456
4c4ffa32-f352-3dfb-8a2c-0c95be7a4872,4112b8b1-59df-3255-a7ca-f42ee0a4cb2e,Ambrose Feeney,M,GENERAL PRACTICE,37 ROUTE 6A,SANDWICH,MA,25631801,41.7663294,-70.5147600346998,11,0,2025-06-16T02:50:21.619+0000,/FileStore/tables/providers.csv,25631
019d7b1f-8b17-3b65-bf49-8a12bd95835e,e09d4c49-c2ef-3b0f-9a46-3719d9219306,Ronald Emard,F,GENERAL PRACTICE,60 HOSPITAL RD,LEOMINSTER,MA,14533290,42.54031915,-71.76312990486304,7508,0,2025-06-16T02:50:21.619+0000,/FileStore/tables/providers.csv,14533
d4a0701b-e6be-39b6-a5fc-b41eb2451061,e76b5eb0-0c9d-3593-b5da-3c0e9a97bb96,Gisele Lehner,F,GENERAL PRACTICE,1400 VFW Parkway,West Roxbury,MA,2132,42.2793,-71.1657,953,0,2025-06-16T02:50:21.619+0000,/FileStore/tables/providers.csv,2132
6eff0a54-349a-3125-8447-f1aa434fabda,c241b977-4131-32e4-9957-e0a00b2a1e5f,Brittanie Krajcik,F,GENERAL PRACTICE,94 MAIN STREET,HYANNIS,MA,26013146,41.6562802,-70.27448000204822,163,0,2025-06-16T02:50:21.619+0000,/FileStore/tables/providers.csv,26013
aa4ffaf5-9b5f-3c08-8781-234a82ea41a4,e33fbb39-3d48-356e-b719-6942a94c09d9,Jerrod Harris,M,GENERAL PRACTICE,512 MAIN ST,HOLDEN,MA,15202039,42.33457565,-71.83651795564862,7,0,2025-06-16T02:50:21.619+0000,/FileStore/tables/providers.csv,15202


root
 |-- id: string (nullable = true)
 |-- organization: string (nullable = true)
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- speciality: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- encounters: integer (nullable = true)
 |-- procedures: integer (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = true)
 |-- source_file: string (nullable = true)
 |-- zip_clean: string (nullable = true)

