# GOLD FACT

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import monotonically_increasing_id, col, count, row_number, lit, current_timestamp, coalesce, max
from pyspark.sql.window import Window
from delta.tables import DeltaTable

In [0]:
# Storage locations: both medallion layers live under one mounted root.
lake_root = "/mnt/mock_prajwal/Healthcare_practice/"
silver_path = lake_root + "silver/"  # input layer read by this notebook
gold_path = lake_root + "gold/"      # output layer written by this notebook

In [0]:
def _load_silver(table_name):
    """Return the silver-layer Delta table stored under `silver_path + table_name`."""
    return spark.read.format("delta").load(silver_path + table_name)


# One DataFrame per silver table used to build the fact table.
df_adm = _load_silver("Admissions")
df_bill = _load_silver("Billing")
df_proc = _load_silver("Procedures")
df_test = _load_silver("Tests")
# Loaded here but not referenced by any later cell visible in this notebook.
df_dtr = _load_silver("Doctor")

In [0]:
# Assemble the wide fact frame with full outer joins, so a row that has no
# match in the other table is still kept (with nulls on the missing side).
# NOTE(review): Billing and Tests are each matched on patientid alone; if a
# patient has several rows in both tables, every billing/test combination is
# emitted — confirm this row multiplication is intended.
procedures_with_admissions = df_proc.alias("p").join(
    df_adm.alias("a"), on=["patientid", "doctorid"], how="outer"
)
with_billing = procedures_with_admissions.join(
    df_bill.alias("b"), on=["patientid"], how="outer"
)
fact_df = with_billing.join(df_test.alias("t"), on=["patientid"], how="outer")

display(fact_df)

In [0]:
# Columns carried forward into the fact table, in output order.
selected_columns = [
    "patientid",
    "doctorid",
    "proceduredate",
    "proceduretype",
    "outcome",
    "department",
    "admission_date",
    "discharge_date",
    "billingid",
    "totalamount",
    "amountpaid",
    "paymentstatus",
    "paymentdate",
    "patientcategory",
    "paymentmode",
    "paymentreference",
    "billingdepartment",
    "testtype",
    "testdate",
    "testresults",
    "cost",
]

# select() accepts the list of column names directly.
fact_df_selected = fact_df.select(selected_columns)

display(fact_df_selected)

In [0]:
# Add a surrogate key to every fact row.
# monotonically_increasing_id() produces unique, increasing 64-bit values that
# are not guaranteed to be consecutive; the +1 shifts the sequence so the
# smallest possible id is 1 instead of 0.
# The Column API is reached through the `F` alias imported in the notebook's
# first cell, so no extra import is needed in this cell.
# NOTE(review): the generator starts over on every run, so ids produced by
# separate runs that are appended to the same table can repeat — confirm
# whether activityid has to be unique across loads.
fact_df_with_id = fact_df_selected.withColumn(
    "activityid", F.monotonically_increasing_id() + F.lit(1)
)

display(fact_df_with_id)

In [0]:
# Print the column names and types of the keyed fact frame for inspection.
fact_df_with_id.printSchema()

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, DateType, LongType

# Column layout of the gold fact table. This is the single source of truth:
# the CREATE TABLE column list below is generated from it, so the two cannot
# drift apart.
schema = StructType([
    StructField("patientid", StringType(), True),
    StructField("doctorid", StringType(), True),
    StructField("proceduredate", DateType(), True),
    StructField("proceduretype", StringType(), True),
    StructField("outcome", StringType(), True),
    StructField("department", StringType(), True),
    StructField("admission_date", DateType(), True),
    StructField("discharge_date", DateType(), True),
    StructField("billingid", StringType(), True),
    StructField("totalamount", DoubleType(), True),
    StructField("amountpaid", DoubleType(), True),
    StructField("paymentstatus", StringType(), True),
    StructField("paymentdate", DateType(), True),
    StructField("patientcategory", StringType(), True),
    StructField("paymentmode", StringType(), True),
    StructField("paymentreference", StringType(), True),
    StructField("billingdepartment", StringType(), True),
    StructField("testtype", StringType(), True),
    StructField("testdate", DateType(), True),
    StructField("testresults", StringType(), True),
    StructField("cost", DoubleType(), True),
    StructField("activityid", LongType(), False)
])

# Render one "name TYPE" entry per field. simpleString() gives the Spark SQL
# type name (string, date, double, bigint); BIGINT is the same type as LONG.
# Nullability is deliberately not emitted: the table definition carries no
# NOT NULL constraints.
column_ddl = ",\n        ".join(
    f"{field.name} {field.dataType.simpleString().upper()}" for field in schema.fields
)

# Create the table if it does not exist, registered over the Delta location in
# the gold layer (built from gold_path so it follows the configured root).
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS Prajwal_Mock.Fact_table (
        {column_ddl}
    )
    USING DELTA
    LOCATION "{gold_path}Dim_fact"
""")

In [0]:
# Append the keyed fact rows to the Delta location in the gold layer. The path
# is derived from gold_path so it follows the configured root instead of
# repeating the literal.
# mergeSchema lets the write add DataFrame columns that the existing Delta
# table does not have yet.
# NOTE(review): the silver tables are read in full and the result is appended,
# so running this notebook again adds the same rows a second time — confirm
# whether append (rather than overwrite or a merge) is intended.
fact_output_path = gold_path + "Dim_fact"

(
    fact_df_with_id.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .save(fact_output_path)
)

In [0]:

# Read the written fact data back two ways for a side-by-side look.
# The variable names still say "product_dim" but both frames hold fact data;
# the names are kept unchanged so nothing that refers to them breaks.

# 1) Straight from the Delta location in the gold layer.
product_dim_df = spark.read.format("delta").load(gold_path + "Dim_fact")

# 2) Through the registered table Prajwal_Mock.Fact_table.
product_dim_table_df = spark.table("Prajwal_Mock.Fact_table")

# Display the dataframes
display(product_dim_df)
display(product_dim_table_df)

In [0]:
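# Manual cleanup (kept commented out): uncomment and run to drop the
# Prajwal_Mock.Fact_table definition.
# spark.sql("DROP TABLE IF EXISTS Prajwal_Mock.Fact_table")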