In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import monotonically_increasing_id, col, count, row_number, lit, current_timestamp, coalesce, max
from pyspark.sql.window import Window
from delta.tables import DeltaTable

In [0]:
# Storage roots used throughout the notebook. Both values end with a trailing
# slash because other cells build full paths by plain string concatenation
# (e.g. gold_path + "Dim_Patient").
gold_path = "/mnt/mock_prajwal/Healthcare_practice/gold/"      # output (gold) layer root
silver_path = "/mnt/mock_prajwal/Healthcare_practice/silver/"  # input (silver) layer root

In [0]:
# Load the silver-layer PatientDetails Delta table and preview it.
patient_details_path = silver_path + "PatientDetails"
df = spark.read.format("delta").load(patient_details_path)
display(df)

In [0]:
# Columns carried from PatientDetails into the patient dimension.
selected_columns = [
    "patientid",
    "bmi",
    "hba1c",
    "heartissues",
    "anytransplants",
    "cancerhistory",
    "numberofmajorsurgeries",
    "smoker",
]

# Unpartitioned window ordered by patientid; row_number() over it yields 1..N.
window_spec = Window.orderBy("patientid")

# Keep one row per patientid, then attach the sequential key column.
# NOTE(review): "patinet_key" looks like a misspelling of "patient_key"; it is
# left unchanged here because the Dim_Patient table DDL uses the same spelling.
df_selected = (
    df.select(*selected_columns)
    .dropDuplicates(["patientid"])
    .withColumn("patinet_key", F.row_number().over(window_spec))
)
display(df_selected)

In [0]:
# PySpark schema that mirrors the Dim_Patient DDL below, column for column.
# NOTE(review): `schema` is not referenced by any other visible cell; it is kept
# in case code outside this view relies on it.
schema = StructType([
    StructField("patientid", StringType(), True),
    StructField("bmi", DoubleType(), True),
    StructField("hba1c", DoubleType(), True),
    StructField("heartissues", StringType(), True),
    StructField("anytransplants", StringType(), True),
    StructField("cancerhistory", StringType(), True),
    StructField("numberofmajorsurgeries", IntegerType(), True),
    StructField("smoker", StringType(), True),
    StructField("patinet_key", IntegerType(), False)
])

# Create the schema if it does not exist
spark.sql("CREATE SCHEMA IF NOT EXISTS Prajwal_Mock")

# Storage location of the dimension, derived from gold_path so the table always
# points at the same folder the write cell saves to (gold_path + "Dim_Patient")
# instead of relying on a second hard-coded copy of the path.
dim_patient_location = gold_path + "Dim_Patient"

# Register the Delta table at that location with the explicit column list.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS Prajwal_Mock.Dim_Patient (
        patientid STRING,
        bmi DOUBLE,
        hba1c DOUBLE,
        heartissues STRING,
        anytransplants STRING,
        cancerhistory STRING,
        numberofmajorsurgeries INTEGER,
        smoker STRING,
        patinet_key INT NOT NULL
    )
    USING DELTA
    LOCATION "{dim_patient_location}"
""")

In [0]:
# Persist the dimension to the gold layer as Delta; "overwrite" mode replaces
# whatever data already exists at that path.
dim_patient_writer = df_selected.write.format("delta").mode("overwrite")
dim_patient_writer.save(gold_path + "Dim_Patient")

In [0]:
# Read the patient dimension back directly from its gold-layer storage path.
# The path is built from the notebook-level gold_path rather than re-assigning
# gold_path and repeating the full literal here, so a change made in the config
# cell is not silently undone by this cell.
# NOTE(review): the "product_dim_*" names look carried over from another
# notebook; they are kept because the count cell references product_dim_df.
product_dim_df = spark.read.format("delta").load(gold_path + "Dim_Patient")

# Read the same dimension through its catalog table
product_dim_table_df = spark.table("Prajwal_Mock.Dim_Patient")

# Display both so the path-based and table-based reads can be compared
display(product_dim_df)
display(product_dim_table_df)

In [0]:
# Count records in the gold layer
gold_count = product_dim_df.count()

# Re-read the silver source table and count its raw rows
silver_df = spark.read.format("delta").load(silver_path + "PatientDetails")
silver_count = silver_df.count()

# The gold dimension keeps one row per patientid (dropDuplicates on patientid),
# so the raw silver row count is only comparable when silver has no duplicates.
# The number of distinct patientid values is the figure gold should match.
silver_distinct_count = silver_df.select("patientid").distinct().count()

# Display counts; the distinct count is appended as a third column so the two
# original columns keep their names and positions.
display(
    spark.createDataFrame(
        [(silver_count, gold_count, silver_distinct_count)],
        ["Silver Layer Count", "Gold Layer Count", "Silver Distinct PatientId Count"],
    )
)