In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import monotonically_increasing_id, col, count, row_number, lit, current_timestamp, coalesce, max
from pyspark.sql.window import Window
from delta.tables import DeltaTable

In [0]:
# Root folders of the two storage layers this notebook works with.
# gold_path  : output layer (the gold-layer cells visible here spell this path out literally).
# silver_path: input layer; table names are appended to it when reading.
gold_path = "/mnt/mock_prajwal/Healthcare_practice/gold/"
silver_path = "/mnt/mock_prajwal/Healthcare_practice/silver/"

In [0]:

# Read the patient details Delta table from the silver layer.
patient_details_path = silver_path + "PatientDetails day 1"
df_patinent = spark.read.load(patient_details_path, format="delta")


In [0]:
from pyspark.sql.functions import when, col

# Derive categorical status labels from the raw bmi / hba1c measurements.
#
# Each branch is evaluated in order and only states its exclusive upper bound,
# so the bands are contiguous: a value such as 24.95 or 6.45 always lands in
# exactly one band instead of falling through every branch and becoming NULL.
# A NULL measurement still yields a NULL status because no branch matches and
# there is deliberately no otherwise() default.
bmi = col("bmi")
df_selected = df_patinent.withColumn(
    "bmi_status",
    when(bmi < 18.5, "Underweight")
    .when(bmi < 25.0, "Normal weight")
    .when(bmi < 30.0, "Overweight")
    .when(bmi < 35.0, "Obesity (Class 1)")
    .when(bmi < 40.0, "Obesity (Class 2)")
    .when(bmi >= 40.0, "Obesity (Class 3 - Severe/Extreme)")
)

hba1c = col("hba1c")
df_selected = df_selected.withColumn(
    "hba1c_status",
    when(hba1c < 5.7, "Normal")
    .when(hba1c < 6.5, "Pre-Diabetes")
    .when(hba1c >= 6.5, "Diabetes")
)

display(df_selected)

In [0]:
# The raw measurements are dropped now that the status labels exist.
raw_measure_cols = ("bmi", "hba1c")
df_selected = df_selected.drop(*raw_measure_cols)

# Identify high-risk patients for specialized attention.

In [0]:
# A patient is flagged as high-risk only when every criterion below holds;
# chaining where() calls keeps only rows that satisfy all of them.
df_high_risk_patients = (
    df_selected
    .where(F.col("bmi_status").like("Obesity%"))
    .where(F.col("hba1c_status") == "Diabetes")
    .where(F.col("heartissues") == "Yes")
    .where(F.col("anytransplants") == "Yes")
    .where(F.col("cancerhistory") == "Yes")
    .where(F.col("numberofmajorsurgeries") > 2)
    .where(F.col("smoker") == "Yes")
)

display(df_high_risk_patients)

In [0]:
# Columns carried into the patient dimension, in output order.
dim_patient_columns = [
    "patientid", "title", "name", "dob", "gender", "bloodtype",
    "chronicconditions", "contactnumber", "emailaddress", "profession",
    "address", "city", "state", "country", "zipcode",
    "heartissues", "anytransplants", "cancerhistory",
    "numberofmajorsurgeries", "smoker", "bmi_status", "hba1c_status",
]

# Keep a single row per business key (patientid).
deduped_patients = df_selected.select(*dim_patient_columns).dropDuplicates(["patientid"])

# Number the remaining rows 1..N in patientid order.
# NOTE: a window without partitionBy is evaluated over the whole dataset at once.
window_spec = Window.orderBy("patientid")
df_selected = deduped_patients.withColumn("patient_key", F.row_number().over(window_spec))

display(df_selected)

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

# Column layout of the patient dimension as (name, Spark type, nullable).
# The business attributes are nullable; the surrogate key and the audit
# columns (except end_date) are required.
_dim_patient_fields = [
    ("patientid", StringType(), True),
    ("title", StringType(), True),
    ("name", StringType(), True),
    ("dob", DateType(), True),
    ("gender", StringType(), True),
    ("bloodtype", StringType(), True),
    ("chronicconditions", StringType(), True),
    ("contactnumber", StringType(), True),
    ("emailaddress", StringType(), True),
    ("profession", StringType(), True),
    ("address", StringType(), True),
    ("city", StringType(), True),
    ("state", StringType(), True),
    ("country", StringType(), True),
    ("zipcode", StringType(), True),
    ("heartissues", StringType(), True),
    ("anytransplants", StringType(), True),
    ("cancerhistory", StringType(), True),
    ("numberofmajorsurgeries", IntegerType(), True),
    ("smoker", StringType(), True),
    ("bmi_status", StringType(), True),
    ("hba1c_status", StringType(), True),
    ("patient_key", IntegerType(), False),
    ("start_date", TimestampType(), False),
    ("end_date", TimestampType(), True),
    ("is_active", BooleanType(), False),
    ("last_modified", TimestampType(), False),
]

# NOTE(review): `schema` is not referenced again in the cells visible here;
# the table itself is defined by the SQL DDL below.
schema = StructType([
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in _dim_patient_fields
])

# Make sure the target database exists before creating the table in it.
create_schema_sql = "CREATE SCHEMA IF NOT EXISTS Prajwal_Mock"
spark.sql(create_schema_sql)

# Create the Delta dimension table at its gold-layer location (no-op when it already exists).
create_dim_patient_sql = """
    CREATE TABLE IF NOT EXISTS Prajwal_Mock.Dim_Patient (
        patientid STRING,
        title STRING,
        name STRING,
        dob DATE,
        gender STRING,
        bloodtype STRING,
        chronicconditions STRING,
        contactnumber STRING,
        emailaddress STRING,
        profession STRING,
        address STRING,
        city STRING,
        state STRING,
        country STRING,
        zipcode STRING,
        heartissues STRING,
        anytransplants STRING,
        cancerhistory STRING,
        numberofmajorsurgeries INT,
        smoker STRING,
        bmi_status STRING,
        hba1c_status STRING,
        patient_key INT NOT NULL,
        start_date TIMESTAMP NOT NULL,
        end_date TIMESTAMP,
        is_active BOOLEAN NOT NULL,
        last_modified TIMESTAMP NOT NULL
    )
        USING DELTA 
    LOCATION "/mnt/mock_prajwal/Healthcare_practice/gold/dim_patient"
"""
spark.sql(create_dim_patient_sql)

In [0]:
from pyspark.sql.functions import coalesce, max, lit, row_number, col, current_timestamp
from pyspark.sql.window import Window
from delta.tables import DeltaTable

# Slowly-changing-dimension (type 2) load of Prajwal_Mock.Dim_Patient:
#   1. give every incoming row a surrogate key above the current maximum,
#   2. expire the active target row of each patient whose tracked attributes changed,
#   3. append a fresh active row for every patient that has no active row left
#      (brand-new patients and the patients expired in step 2).

# Load target table
target_df = spark.read.table("Prajwal_Mock.Dim_Patient")

# Highest surrogate key issued so far; 0 when the table is empty or has no key column.
# (`max` here is pyspark.sql.functions.max, imported above, not the Python builtin.)
if 'patient_key' in target_df.columns:
    max_key = target_df.agg(coalesce(max("patient_key"), lit(0))).collect()[0][0]
else:
    max_key = 0

# Number the incoming rows in patientid order.
# NOTE: a window without partitionBy is evaluated over the whole dataset at once.
window_spec = Window.orderBy("patientid")

# Offset the row number by max_key so new keys never collide with existing ones.
# Keys are assigned to every source row, including unchanged patients that are
# not inserted below, so issued keys may contain gaps.
source_df_keyed = df_selected \
    .withColumn("rn", row_number().over(window_spec)) \
    .withColumn("patient_key", col("rn") + lit(max_key)) \
    .drop("rn")

# Add audit columns to source: every incoming row starts out as the active version.
source_df_audit_col = source_df_keyed \
    .withColumn("start_date", current_timestamp()) \
    .withColumn("end_date", lit(None).cast("timestamp")) \
    .withColumn("is_active", lit(True)) \
    .withColumn("last_modified", current_timestamp())

# Convert target to DeltaTable
target_table = DeltaTable.forName(spark, "Prajwal_Mock.Dim_Patient")

# Merge condition (on business key and active rows)
merge_condition = "target.patientid = source.patientid AND target.is_active = true"

# Attributes whose change closes the current version of a patient row.
tracked_columns = [
    "name", "dob", "gender", "bloodtype", "contactnumber", "emailaddress",
    "address", "city", "state", "country", "zipcode",
]

# `<=>` is Spark SQL's null-safe equality. A plain `!=` evaluates to NULL when
# either side is NULL, so a value changing to or from NULL would not be
# detected and the stale row would stay active.
change_condition = " OR ".join(
    f"NOT (target.{column_name} <=> source.{column_name})"
    for column_name in tracked_columns
)

# Perform Expiration: close the active row of every patient whose tracked attributes changed.
target_table.alias("target") \
    .merge(source_df_audit_col.alias("source"), merge_condition) \
    .whenMatchedUpdate(
        condition=change_condition,
        set={
            "end_date": current_timestamp(),
            "is_active": lit(False),
            "last_modified": current_timestamp()
        }
    ).execute()

# Insert a new active row for every source patient that has no active row in
# the target after the expiration step: patients seen for the first time and
# patients whose previous version was just expired. Unchanged patients still
# have an active row and are skipped by the anti-join.
updated_target_df = spark.read.table("Prajwal_Mock.Dim_Patient").filter("is_active=true").select("patientid")
insert_df = source_df_audit_col.join(updated_target_df, on="patientid", how="left_anti")

# NOTE: insert_df is lazy; it is evaluated when this write runs.
insert_df.write.format("delta").mode("append").option("mergeSchema", "true").save("/mnt/mock_prajwal/Healthcare_practice/gold/dim_patient")

In [0]:
# Report how many rows the most recent SCD2 load expired and inserted.
#
# The counts are read from the Delta commit history instead of being recomputed:
#   * filtering on `last_modified = current_timestamp()` compares against the
#     time of *this* query, which does not equal the timestamp written by the
#     earlier merge, so it reports no rows;
#   * `insert_df` is lazy, so counting it re-runs its anti-join against the
#     table after the append and no longer reflects what was inserted.
recent_commits = target_table.history(20)

# Latest MERGE commit = the expiration step of the most recent load.
last_merge = (
    recent_commits
    .filter(col("operation") == "MERGE")
    .orderBy(col("version").desc())
    .first()
)

if last_merge is None:
    # No merge found in the inspected history window: nothing to report.
    updated_count = 0
    inserted_count = 0
else:
    merge_metrics = last_merge["operationMetrics"] or {}
    # Metric values are stored as strings in the history table.
    updated_count = int(merge_metrics.get("numTargetRowsUpdated", 0))

    # First WRITE commit after that merge = the append of new row versions.
    append_after_merge = (
        recent_commits
        .filter((col("operation") == "WRITE") & (col("version") > last_merge["version"]))
        .orderBy(col("version").asc())
        .first()
    )
    if append_after_merge is None:
        inserted_count = 0
    else:
        write_metrics = append_after_merge["operationMetrics"] or {}
        inserted_count = int(write_metrics.get("numOutputRows", 0))

# Display the counts
display(spark.createDataFrame([(updated_count, inserted_count)], ["Updated Records", "Inserted Records"]))

In [0]:
# Read the patient dimension back from its storage location to inspect the result of the load.
dim_patient_location = "/mnt/mock_prajwal/Healthcare_practice/gold/dim_patient"
df_dim_patient = spark.read.load(dim_patient_location, format="delta")
display(df_dim_patient)

In [0]:
# spark.sql("DROP TABLE IF EXISTS Prajwal_Mock.Dim_Patient")

In [0]:
# df_adm = spark.read.format("delta").load(silver_path + "Admissions")
# df_bill = spark.read.format("delta").load(silver_path + "Billing")
# df_proc = spark.read.format("delta").load(silver_path + "Procedures")
# df_test = spark.read.format("delta").load(silver_path + "test")

In [0]:
# df_adm.printSchema()
# df_bill.printSchema()
# df_proc.printSchema()
# df_test.printSchema()