### 📄 Patient Readmission Feature Engineering (Bronze → Silver)

In [0]:
# Read the bronze table into a DataFrame and render it for a first look at the data.

bronze_df = spark.table("bronze_patient_readmission")
bronze_df.display()


In [0]:
# Derive the binary label 'readmit_30d': 1 when 'readmitted' equals "<30",
# otherwise 0 (every other value, including null, falls through to 0).

from pyspark.sql.functions import col, when

readmitted_within_30d = col("readmitted") == "<30"
silver_df = bronze_df.withColumn(
    "readmit_30d", when(readmitted_within_30d, 1).otherwise(0)
)


In [0]:
# Remove columns that must not reach a model:
#   - 'readmitted' is the raw column the 'readmit_30d' label was derived from
#   - 'encounter_id' / 'patient_nbr' are presumably row identifiers - TODO confirm

leakage_cols = [
    "readmitted",
    "encounter_id",
    "patient_nbr",
]
silver_df = silver_df.drop(*leakage_cols)


In [0]:
# Replace nulls with the placeholder 'Unknown'.
# A string fill value is only applied to string-typed columns; nulls in
# columns of other types are left as they are.

silver_df = silver_df.na.fill("Unknown")


In [0]:
# Bucket the 10-year 'age' bands into Young (<30), Middle (30-59) and Senior (60+).
#
# Each band is matched in two spellings: half-open ("[0-10)") and closed
# ("[0-10]"). The closed spelling is what this cell originally compared against;
# the half-open spelling is how the public UCI diabetes readmission data writes
# its age bands. NOTE(review): assumes the bronze table follows one of these two
# spellings - confirm with a distinct() on 'age'.
#
# Values that match no band (for example the 'Unknown' placeholder written by
# the earlier fill step) are labelled "Unknown" instead of being silently
# counted as "Senior".

def _age_bands(*decade_starts):
    """Return both bracket spellings of each 10-year band starting at the given ages."""
    return [
        f"[{start}-{start + 10}{closer}"
        for start in decade_starts
        for closer in (")", "]")
    ]


silver_df = silver_df.withColumn(
    "age_bucket",
    when(col("age").isin(*_age_bands(0, 10, 20)), "Young")
    .when(col("age").isin(*_age_bands(30, 40, 50)), "Middle")
    .when(col("age").isin(*_age_bands(60, 70, 80, 90)), "Senior")
    .otherwise("Unknown")
)


In [0]:
# utilization_score = inpatient + emergency + outpatient visit counts.
#
# Each count is cast to INT inside the expression so the score is an integer
# regardless of how the bronze columns are typed. The df_silver pipeline later
# in this notebook casts the same three columns to int before summing them, so
# this keeps both pipelines consistent. If a column is already an integer the
# cast is a no-op. A null in any of the three counts yields a null score.

from pyspark.sql.functions import expr

silver_df = silver_df.withColumn(
    "utilization_score",
    expr(
        "CAST(number_inpatient AS INT)"
        " + CAST(number_emergency AS INT)"
        " + CAST(number_outpatient AS INT)"
    )
)


In [0]:
# Flag rows whose 'change' column equals "Ch": 'treatment_changed' is 1 for
# those rows and 0 for every other value (including null).

treatment_was_changed = col("change") == "Ch"
silver_df = silver_df.withColumn(
    "treatment_changed", when(treatment_was_changed, 1).otherwise(0)
)


In [0]:
%sql
-- Remove any previous version of the silver table before it is rewritten;
-- IF EXISTS makes this a no-op when the table is absent.
DROP TABLE IF EXISTS silver_patient_features;

In [0]:
from pyspark.sql.functions import col, when

# -------------------------
# Save clean Silver table
# -------------------------
# 'treatment_changed' is already added to silver_df by the preceding feature
# cell, so it is not recomputed here.
#
# overwriteSchema lets the overwrite succeed even when an existing table has a
# different column set; the df_silver_fixed write later in this notebook uses
# the same option.
(
    silver_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("silver_patient_features")
)

In [0]:
# Sanity-check the saved Silver table: total row count, plus the sum of the
# 'readmit_30d' flag (reported as high_risk_patients).

validation_query = """
SELECT
  COUNT(*) AS total_rows,
  SUM(readmit_30d) AS high_risk_patients
FROM silver_patient_features
"""
spark.sql(validation_query).display()


In [0]:
# =========================
# SILVER - load Bronze
# =========================
# Read the bronze table from the 'default' schema into `df`.

df = spark.read.table("default.bronze_patient_readmission")

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

# Obtain the active Spark session (or create one if none exists).
spark = SparkSession.builder.getOrCreate()

# Load the bronze table from the 'default' schema.
df = spark.table("default.bronze_patient_readmission")

# Visit-count columns that are cast to integer in place.
visit_count_cols = ("number_inpatient", "number_emergency", "number_outpatient")

# Binary flag: 1 if 'readmitted' equals "<30", else 0.
df_silver = df.withColumn(
    "readmit_30d", when(col("readmitted") == "<30", 1).otherwise(0)
)

# Replace each visit-count column with its integer-cast version.
for visit_col in visit_count_cols:
    df_silver = df_silver.withColumn(visit_col, col(visit_col).cast("int"))



In [0]:
# Add 'utilization_score': the sum of the inpatient, emergency and outpatient
# visit counts for each row.

inpatient_visits = col("number_inpatient")
emergency_visits = col("number_emergency")
outpatient_visits = col("number_outpatient")

df_silver = df_silver.withColumn(
    "utilization_score", inpatient_visits + emergency_visits + outpatient_visits
)

In [0]:


%sql
-- Remove any previous version of the silver table in the 'default' schema;
-- IF EXISTS makes this a no-op when the table is absent.
DROP TABLE IF EXISTS default.silver_patient_features;

In [0]:
# Build df_silver_fixed from df_silver with 'number_outpatient' cast to int.
# (The cell that builds df_silver applies the same cast to this column.)

from pyspark.sql.functions import col

outpatient_as_int = col("number_outpatient").cast("int")
df_silver_fixed = df_silver.withColumn("number_outpatient", outpatient_as_int)

In [0]:
# Persist df_silver_fixed as the table default.silver_patient_features,
# replacing existing rows and (via overwriteSchema) the existing schema.
# NOTE(review): an earlier cell writes silver_df to 'silver_patient_features';
# if the session's current schema is 'default', this write replaces that
# table - confirm which pipeline is meant to own it.

silver_writer = (
    df_silver_fixed.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
silver_writer.saveAsTable("default.silver_patient_features")

In [0]:
%sql
-- Show the column definitions of the saved silver table to verify its schema.
DESCRIBE TABLE default.silver_patient_features;