In [0]:
#Imports
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [0]:
#Register the storage-account access key on the Spark session so abfss:// paths can be read.
#Both values are placeholders; avoid committing a real key to the notebook.
adls_key_conf_name = "fs.azure.account.key.<storage-account-name>.dfs.core.windows.net"
adls_key_conf_value = "<your-storage-account-access-key>"
spark.conf.set(adls_key_conf_name, adls_key_conf_value)

In [0]:
#Bronze and silver locations differ only in the container; the account host and folder are shared.
adls_account_and_folder = "@<storage-account-name>.dfs.core.windows.net/patient_flow"
bronze_path = "abfss://<bronze-container>" + adls_account_and_folder
silver_path = "abfss://<silver-container>" + adls_account_and_folder

In [0]:
#Open the bronze Delta table as a streaming source.
bronze_stream_reader = spark.readStream.format("delta")
bronze_df = bronze_stream_reader.load(bronze_path)

In [0]:
#Schema used to parse the JSON payload.
#Both timestamp fields are declared as strings at this stage.
patient_fields = [
    ("patient_id", StringType()),
    ("gender", StringType()),
    ("age", IntegerType()),
    ("department", StringType()),
    ("admission_time", StringType()),
    ("discharge_time", StringType()),
    ("bed_id", IntegerType()),
    ("hospital_id", IntegerType()),
]
schema = StructType(
    [StructField(field_name, field_type) for field_name, field_type in patient_fields]
)

In [0]:
#Parse the raw_json string into a struct using the schema, then flatten the
#struct so each field becomes a top-level column.
parsed_struct = from_json(col("raw_json"), schema).alias("data")
parsed_df = bronze_df.select(parsed_struct).select("data.*")

In [0]:
#Convert the two string time columns to Timestamp, one column at a time.
clean_df = parsed_df
for time_col in ("admission_time", "discharge_time"):
    clean_df = clean_df.withColumn(time_col, to_timestamp(time_col))

In [0]:
#Replace unusable admission times with the current timestamp.
#An admission time is unusable when it is NULL or lies in the future.
admission_is_missing = col("admission_time").isNull()
admission_in_future = col("admission_time") > current_timestamp()
repaired_admission_time = when(
    admission_is_missing | admission_in_future, current_timestamp()
).otherwise(col("admission_time"))
clean_df = clean_df.withColumn("admission_time", repaired_admission_time)

In [0]:
#Ages above 100 are treated as invalid and replaced with a random integer in 1..90.
#Every other value (including NULL) is kept unchanged.
age_is_invalid = col("age") > 100
random_age = floor(rand()*90+1).cast("int")
clean_df = clean_df.withColumn(
    "age", when(age_is_invalid, random_age).otherwise(col("age"))
)

In [0]:
#Schema evolution: the columns the silver table is expected to contain, in order.
expected_cols = [
    "patient_id",
    "gender",
    "age",
    "department",
    "admission_time",
    "discharge_time",
    "bed_id",
    "hospital_id",
]

In [0]:
#Add every expected column that is missing from clean_df as a typed NULL.
#A bare lit(None) produces a NullType column; Delta drops NullType columns from
#the written data and does not support them in streaming writes, so each
#placeholder is cast to the type that column has elsewhere in this notebook.
expected_col_types = {
    "patient_id": "string",
    "gender": "string",
    "age": "int",
    "department": "string",
    "admission_time": "timestamp",
    "discharge_time": "timestamp",
    "bed_id": "int",
    "hospital_id": "int",
}
for col_name in expected_cols:
    if col_name not in clean_df.columns:
        #Names without a known type fall back to string so the column is never NullType.
        clean_df = clean_df.withColumn(
            col_name, lit(None).cast(expected_col_types.get(col_name, "string"))
        )

In [0]:
#Append the cleaned stream to the silver Delta table.
#The checkpoint lives next to the table, at the silver path with a "_checkpoint" suffix.
silver_checkpoint_path = silver_path + "_checkpoint"
silver_writer = (
    clean_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("mergeSchema","true")
    .option("checkpointLocation", silver_checkpoint_path)
)
#Keep start() as the last expression so the cell still shows the StreamingQuery handle.
silver_writer.start(silver_path)

<pyspark.sql.streaming.query.StreamingQuery at 0x7f95e82d8cb0>

In [0]:
#Batch-read the silver Delta table and render it.
silver_df = spark.read.format("delta").load(silver_path)
display(silver_df)

patient_id,gender,age,department,admission_time,discharge_time,bed_id,hospital_id
7c695297-ab01-4768-a22f-a84850ac2f03,Male,14,ICU,2025-09-20T10:06:18.39701Z,2025-09-23T10:06:18.39701Z,202,3
1b454d59-6ac9-4f63-b8f7-8a7724c3d75a,Male,83,ICU,2025-09-21T04:09:09.508741Z,2025-09-21T05:09:09.508741Z,122,5
689f1794-9c04-4702-825a-460e163e434d,Male,90,ICU,2025-09-22T02:00:39.168436Z,2025-09-23T17:00:39.168436Z,64,3
44e1522d-85b8-4ad7-b48f-87ae09e514b1,Male,60,ICU,2025-09-20T10:04:10.312014Z,2025-09-21T15:04:10.312014Z,11,7
22ca9d33-3338-41dd-934a-3029449da44e,Male,8,ICU,2025-09-21T12:08:54.498591Z,2025-09-22T10:08:54.498591Z,242,1
41e90d88-c0ec-4995-9e9c-d84b4fbd16c9,Male,86,ICU,2025-09-22T00:58:36.090703Z,2025-09-24T17:58:36.090703Z,77,5
405979d9-99d1-4fc8-97fe-b7f210888b14,Male,56,ICU,2025-09-22T22:59:08.112334Z,2025-09-23T01:59:08.112334Z,11,7
8f2cbf66-cc51-4197-9242-317d381dc065,Male,12,ICU,2025-09-20T21:07:43.457015Z,2025-09-21T19:07:43.457015Z,91,5
e6499126-3cce-429a-8405-bd49be4c3f51,Male,93,ICU,2025-09-20T20:55:33.974561Z,2025-09-23T19:55:33.974561Z,99,3
14252983-56dd-47a6-bfce-d506ae68bf03,Male,37,ICU,2025-09-22T08:55:56.988113Z,2025-09-24T00:55:56.988113Z,68,3
