In [1]:
# ---------------------------------------------------------
# Notebook: 01_silver_layer_transformation
# Purpose : Transform Bronze tables -> Silver tables
# ---------------------------------------------------------


StatementMeta(, 89d25c2a-4739-46d0-9d2e-560b8799a3db, 3, Finished, Available, Finished)

In [2]:
from pyspark.sql.functions import col, lit, current_timestamp
from pyspark.sql import functions as F

StatementMeta(, 89d25c2a-4739-46d0-9d2e-560b8799a3db, 4, Finished, Available, Finished)

In [3]:
# Utility: normalize column names (lower-case, spaces -> underscores; CamelCase is NOT split)
def to_snake_case(df):
    """Return ``df`` with every column name lower-cased and spaces turned into underscores.

    CamelCase boundaries are not split: ``"PatientId"`` becomes ``"patientid"``,
    not ``"patient_id"``. Code that runs after this helper should therefore
    refer to columns by their lower-cased names.

    Args:
        df: DataFrame whose ``columns`` are to be normalized.

    Returns:
        A new DataFrame with the same columns, in the same order, under the
        normalized names.
    """
    # Rename every column in one positional pass rather than chaining a
    # withColumnRenamed call (and one more node in the query plan) per column.
    normalized_names = [name.lower().replace(" ", "_") for name in df.columns]
    return df.toDF(*normalized_names)

StatementMeta(, 89d25c2a-4739-46d0-9d2e-560b8799a3db, 5, Finished, Available, Finished)

In [4]:
# =========================================================
# 1) Patients
# =========================================================
# Read the raw Bronze patients table.
bronze_patients = spark.read.table("bronze_patients")

silver_patients = (
    to_snake_case(bronze_patients)
    # to_snake_case has already lower-cased the column names, so the key is
    # "patientid" at this point. The previous "PatientId" spelling only
    # resolved while spark.sql.caseSensitive stayed at its default (false).
    # NOTE(review): dropDuplicates does not define which duplicate row is kept.
    .dropDuplicates(["patientid"])                       # Deduplication
    # Keep the codes "M" and "F" as-is; every other value (NULL included,
    # since the WHEN condition is then not true) is mapped to "U".
    .withColumn("gender", F.when(col("gender").isin("M", "F"), col("gender")).otherwise("U"))
    .withColumn("load_date", current_timestamp())        # Audit column
)

# Full refresh: the Silver table is replaced on every run.
silver_patients.write.mode("overwrite").saveAsTable("silver_patients")

StatementMeta(, 89d25c2a-4739-46d0-9d2e-560b8799a3db, 6, Finished, Available, Finished)

In [5]:
# =========================================================
# 2) Providers
# =========================================================
# Read the raw Bronze providers table.
bronze_providers = spark.read.table("bronze_providers")

silver_providers = (
    to_snake_case(bronze_providers)
    # Column names are lower-cased by to_snake_case, so deduplicate on
    # "providerid" rather than relying on case-insensitive resolution of
    # the original "ProviderId" spelling.
    .dropDuplicates(["providerid"])
    .withColumn("load_date", current_timestamp())        # Audit column
)

# Full refresh: the Silver table is replaced on every run.
silver_providers.write.mode("overwrite").saveAsTable("silver_providers")

StatementMeta(, 89d25c2a-4739-46d0-9d2e-560b8799a3db, 7, Finished, Available, Finished)

In [6]:
# =========================================================
# 3) Locations
# =========================================================
# Read the raw Bronze locations table.
bronze_locations = spark.read.table("bronze_locations")

silver_locations = (
    to_snake_case(bronze_locations)
    # Column names are lower-cased by to_snake_case, so deduplicate on
    # "locationid" rather than relying on case-insensitive resolution of
    # the original "LocationId" spelling.
    .dropDuplicates(["locationid"])
    .withColumn("load_date", current_timestamp())        # Audit column
)

# Full refresh: the Silver table is replaced on every run.
silver_locations.write.mode("overwrite").saveAsTable("silver_locations")

StatementMeta(, 89d25c2a-4739-46d0-9d2e-560b8799a3db, 8, Finished, Available, Finished)

In [7]:
# =========================================================
# 4) Encounters
# =========================================================
# Read the raw Bronze encounters table.
bronze_encounters = spark.read.table("bronze_encounters")

silver_encounters = (
    to_snake_case(bronze_encounters)
    # Column names are lower-cased by to_snake_case, so deduplicate on
    # "encounterid" rather than relying on case-insensitive resolution of
    # the original "EncounterId" spelling.
    .dropDuplicates(["encounterid"])
    .withColumn("load_date", current_timestamp())        # Audit column
)

# Full refresh: the Silver table is replaced on every run.
silver_encounters.write.mode("overwrite").saveAsTable("silver_encounters")

StatementMeta(, 89d25c2a-4739-46d0-9d2e-560b8799a3db, 9, Finished, Available, Finished)

In [8]:
# =========================================================
# 5) Diagnoses
# =========================================================
# Read the raw Bronze diagnoses table.
bronze_diagnoses = spark.read.table("bronze_diagnoses")

silver_diagnoses = (
    to_snake_case(bronze_diagnoses)
    # Column names are lower-cased by to_snake_case, so deduplicate on
    # "diagnosisid" rather than relying on case-insensitive resolution of
    # the original "DiagnosisId" spelling.
    .dropDuplicates(["diagnosisid"])
    .withColumn("load_date", current_timestamp())        # Audit column
)

# Full refresh: the Silver table is replaced on every run.
silver_diagnoses.write.mode("overwrite").saveAsTable("silver_diagnoses")

StatementMeta(, 89d25c2a-4739-46d0-9d2e-560b8799a3db, 10, Finished, Available, Finished)

In [9]:
# =========================================================
# 6) Procedures
# =========================================================
# Read the raw Bronze procedures table.
bronze_procedures = spark.read.table("bronze_procedures")

silver_procedures = (
    to_snake_case(bronze_procedures)
    # Column names are lower-cased by to_snake_case, so deduplicate on
    # "procedureid" rather than relying on case-insensitive resolution of
    # the original "ProcedureId" spelling.
    .dropDuplicates(["procedureid"])
    .withColumn("load_date", current_timestamp())        # Audit column
)

# Full refresh: the Silver table is replaced on every run.
silver_procedures.write.mode("overwrite").saveAsTable("silver_procedures")

StatementMeta(, 89d25c2a-4739-46d0-9d2e-560b8799a3db, 11, Finished, Available, Finished)

In [10]:
# =========================================================
# 7) Labs
# =========================================================
# Read the raw Bronze labs table.
bronze_labs = spark.read.table("bronze_labs")

silver_labs = (
    to_snake_case(bronze_labs)
    # Column names are lower-cased by to_snake_case, so deduplicate on
    # "labresultid" rather than relying on case-insensitive resolution of
    # the original "LabResultId" spelling.
    .dropDuplicates(["labresultid"])
    .withColumn("load_date", current_timestamp())        # Audit column
)

# Full refresh: the Silver table is replaced on every run.
silver_labs.write.mode("overwrite").saveAsTable("silver_labs")

StatementMeta(, 89d25c2a-4739-46d0-9d2e-560b8799a3db, 12, Finished, Available, Finished)

In [11]:
# =========================================================
# 8) Vitals
# =========================================================
# Read the raw Bronze vitals table.
bronze_vitals = spark.read.table("bronze_vitals")

silver_vitals = (
    to_snake_case(bronze_vitals)
    # Column names are lower-cased by to_snake_case, so deduplicate on
    # "vitalid" rather than relying on case-insensitive resolution of
    # the original "VitalId" spelling.
    .dropDuplicates(["vitalid"])
    .withColumn("load_date", current_timestamp())        # Audit column
)

# Full refresh: the Silver table is replaced on every run.
silver_vitals.write.mode("overwrite").saveAsTable("silver_vitals")

StatementMeta(, 89d25c2a-4739-46d0-9d2e-560b8799a3db, 13, Finished, Available, Finished)

In [12]:
# =========================================================
# 9) Medications
# =========================================================
# Read the raw Bronze medications table.
bronze_medications = spark.read.table("bronze_medications")

silver_medications = (
    to_snake_case(bronze_medications)
    # Column names are lower-cased by to_snake_case, so deduplicate on
    # "medicationid" rather than relying on case-insensitive resolution of
    # the original "MedicationId" spelling.
    .dropDuplicates(["medicationid"])
    .withColumn("load_date", current_timestamp())        # Audit column
)

# Full refresh: the Silver table is replaced on every run.
silver_medications.write.mode("overwrite").saveAsTable("silver_medications")

StatementMeta(, 89d25c2a-4739-46d0-9d2e-560b8799a3db, 14, Finished, Available, Finished)

In [13]:
# =========================================================
# 10) Payers
# =========================================================
# Read the raw Bronze payers table.
bronze_payers = spark.read.table("bronze_payers")

silver_payers = (
    to_snake_case(bronze_payers)
    # Column names are lower-cased by to_snake_case, so deduplicate on
    # "payerid" rather than relying on case-insensitive resolution of
    # the original "PayerId" spelling.
    .dropDuplicates(["payerid"])
    .withColumn("load_date", current_timestamp())        # Audit column
)

# Full refresh: the Silver table is replaced on every run.
silver_payers.write.mode("overwrite").saveAsTable("silver_payers")

StatementMeta(, 89d25c2a-4739-46d0-9d2e-560b8799a3db, 15, Finished, Available, Finished)

In [14]:
# =========================================================
# 11) Claims
# =========================================================
# Read the raw Bronze claims table.
bronze_claims = spark.read.table("bronze_claims")

silver_claims = (
    to_snake_case(bronze_claims)
    # Column names are lower-cased by to_snake_case, so deduplicate on
    # "claimid" rather than relying on case-insensitive resolution of
    # the original "ClaimId" spelling.
    .dropDuplicates(["claimid"])
    .withColumn("load_date", current_timestamp())        # Audit column
)

# Full refresh: the Silver table is replaced on every run.
silver_claims.write.mode("overwrite").saveAsTable("silver_claims")

StatementMeta(, 89d25c2a-4739-46d0-9d2e-560b8799a3db, 16, Finished, Available, Finished)

In [15]:
# =========================================================
# 12) Devices
# =========================================================
# Read the raw Bronze devices table.
bronze_devices = spark.read.table("bronze_devices")

silver_devices = (
    to_snake_case(bronze_devices)
    # Column names are lower-cased by to_snake_case, so deduplicate on
    # "deviceid" rather than relying on case-insensitive resolution of
    # the original "DeviceId" spelling.
    .dropDuplicates(["deviceid"])
    .withColumn("load_date", current_timestamp())        # Audit column
)

# Full refresh: the Silver table is replaced on every run.
silver_devices.write.mode("overwrite").saveAsTable("silver_devices")

# Last statement of this cell: emit a completion marker once the devices write has returned.
print("✅ Bronze ➝ Silver transformation complete")


StatementMeta(, 89d25c2a-4739-46d0-9d2e-560b8799a3db, 17, Finished, Available, Finished)

✅ Bronze ➝ Silver transformation complete
