In [0]:
# Load the bronze-layer readmission table as the starting point for the
# silver transformations below.
bronze_df = spark.read.table("bronze_patient_readmission")

# Preview only a handful of rows: the table still carries row-level
# identifier columns at this stage, so avoid rendering the whole dataset
# into the notebook output. `bronze_df` itself is left untouched.
bronze_df.limit(20).display()


In [0]:
from pyspark.sql.functions import col, when

# Binary target column: 1 when `readmitted` is exactly "<30", 0 for every
# other value. A null `readmitted` does not satisfy the condition, so it
# also falls through to 0.
is_readmit_30d = col("readmitted") == "<30"
readmit_flag = when(is_readmit_30d, 1).otherwise(0)

silver_df = bronze_df.withColumn("readmit_30d", readmit_flag)


In [0]:
# Columns removed before the feature table is written.
# `readmitted` is the raw column the `readmit_30d` target was derived from,
# so keeping it would expose the label directly. The two *_id / *_nbr
# columns are presumably row identifiers -- TODO confirm against the bronze
# schema.
leakage_cols = [
    "readmitted",
    "encounter_id",
    "patient_nbr",
]

# DataFrame.drop ignores names that are not present, so re-running this
# cell is harmless.
silver_df = silver_df.drop(*leakage_cols)


In [0]:
# Replace nulls with the "Unknown" placeholder. Because the fill value is a
# string, Spark applies it to string-typed columns only; nulls in numeric
# columns are left as they are.
silver_df = silver_df.na.fill("Unknown")


In [0]:
from pyspark.sql.functions import regexp_extract

# Bucket on the lower bound of the bracketed age range rather than on the
# full literal, so both "[40-50]" and "[40-50)" spellings land in the same
# bucket. regexp_extract returns "" when the value is not a bracketed range
# (for example the "Unknown" placeholder written by the null-fill step) and
# null when `age` itself is null.
age_lower_bound = regexp_extract(col("age"), r"^\[(\d+)-", 1)

# Young:  lower bound 0, 10 or 20
# Middle: lower bound 30, 40 or 50
# Senior: any other recognised range
# Unknown: missing or unparseable age. Previously these rows were silently
# counted as "Senior" by the catch-all branch.
silver_df = silver_df.withColumn(
    "age_bucket",
    when(age_lower_bound.isin("0", "10", "20"), "Young")
    .when(age_lower_bound.isin("30", "40", "50"), "Middle")
    .when(age_lower_bound != "", "Senior")
    .otherwise("Unknown")
)


In [0]:
from pyspark.sql.functions import expr

# Combined utilisation: the sum of the three `number_*` columns.
# NOTE: ordinary Spark arithmetic applies, so a null in any one of the
# inputs makes the whole score null for that row.
utilization_total = (
    col("number_inpatient")
    + col("number_emergency")
    + col("number_outpatient")
)

silver_df = silver_df.withColumn("utilization_score", utilization_total)


In [0]:
# Flag column: 1 when `change` is exactly "Ch", 0 for any other value
# (a null `change` also yields 0 via the otherwise branch).
change_flag = when(col("change") == "Ch", 1).otherwise(0)

silver_df = silver_df.withColumn("treatment_changed", change_flag)


In [0]:
# Persist the engineered features as a Delta table. "overwrite" mode
# replaces whatever the table currently contains on every run.
(
    silver_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("silver_patient_features")
)


In [0]:
# Sanity check against the table that was just written (not the in-memory
# DataFrame): total row count and the number of rows whose readmit_30d
# flag is 1.
summary_query = """
SELECT
  COUNT(*) AS total_rows,
  SUM(readmit_30d) AS high_risk_patients
FROM silver_patient_features
"""

spark.sql(summary_query).display()
