In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import monotonically_increasing_id, col, count, row_number, lit, current_timestamp, coalesce, max
from pyspark.sql.window import Window
from delta.tables import DeltaTable

In [0]:
# Base folders of the gold and silver layers.
# Both keep their trailing slash: other cells append the table name directly
# (for example silver_path + "Doctor").
gold_path = "/mnt/mock_prajwal/Healthcare_practice/gold/"
silver_path = "/mnt/mock_prajwal/Healthcare_practice/silver/"

In [0]:
# Load the silver-layer Doctor table (Delta format) and preview it.
df = spark.read.load(silver_path + "Doctor", format="delta")
display(df)

In [0]:
from pyspark.sql.functions import col
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# The surrogate key is a row_number over all rows ordered by doctorid.
# The window has no partitionBy, so the numbering runs 1..N across the whole frame.
window_spec = Window.orderBy("doctorid")

# Keep only the dimension attributes, reduce to one row per doctorid
# (which of several duplicate rows survives is not specified here),
# then attach the surrogate key.
df_selected = (
    df.select("doctorid", "name", "specialty", "department", "degree", "joined_date")
    .dropDuplicates(["doctorid"])
    .withColumn("doctor_key", row_number().over(window_spec))
)

display(df_selected)

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, TimestampType, BooleanType

# Column layout of the doctor dimension as a StructType. It mirrors the DDL
# below column for column; nothing in this cell reads it.
schema = StructType([
    StructField("doctorid", StringType(), True),
    StructField("name", StringType(), True),
    StructField("specialty", StringType(), True),
    StructField("department", StringType(), True),
    StructField("degree", StringType(), True),
    StructField("joined_date", DateType(), True),
    StructField("doctor_key", IntegerType(), False)
])

# Create the schema if it does not exist
spark.sql("CREATE SCHEMA IF NOT EXISTS Prajwal_Mock")

# Create the Delta table at the gold location. The LOCATION is derived from
# gold_path (which ends with "/") so it always points at the same folder the
# write cell saves to, instead of a second hard-coded copy of the path.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS Prajwal_Mock.Dim_Doctor (
        doctorid STRING,
        name STRING,
        specialty STRING,
        department STRING,
        degree STRING,
        joined_date DATE,
        doctor_key INT NOT NULL
    )
        USING DELTA 
    LOCATION "{gold_path}Dim_Doctor"
""")

In [0]:
# Persist the doctor dimension as Delta in the gold layer, replacing any existing data.
df_selected.write.save(gold_path + "Dim_Doctor", format="delta", mode="overwrite")

In [0]:
# Read the doctor dimension back straight from its Delta folder in the gold layer.
# gold_path is already defined in the paths cell, so it is reused here rather
# than re-declared and then bypassed with a hard-coded path.
# NOTE: the variable names still say "product"; they are kept unchanged because
# the count cell reads product_dim_df.
product_dim_df = spark.read.format("delta").load(gold_path + "Dim_Doctor")

# Read the same dimension through its registered table
product_dim_table_df = spark.table("Prajwal_Mock.Dim_Doctor")

# Display both so the path-based and the table-based reads can be compared
display(product_dim_df)
display(product_dim_table_df)

In [0]:
# Count records in the gold layer
gold_count = product_dim_df.count()

# Re-read the silver Doctor table and count its rows
silver_df = spark.read.format("delta").load(silver_path + "Doctor")
silver_count = silver_df.count()

# The gold dimension is built with one row per doctorid, so the raw silver row
# count only equals the gold count when silver has no repeated doctorid.
# The distinct doctorid count is the figure the gold count should match.
silver_distinct_count = silver_df.select("doctorid").distinct().count()

# Display counts (the distinct count is appended as the last column)
display(spark.createDataFrame(
    [(silver_count, gold_count, silver_distinct_count)],
    ["Silver Layer Count", "Gold Layer Count", "Silver Distinct doctorid Count"],
))

In [0]:
# Load the remaining silver-layer Delta tables, in this order:
# Admissions, Billing, PatientDetails, Procedures, test.
df_d, df_e, df_f, df_g, df_h = (
    spark.read.format("delta").load(silver_path + table_name)
    for table_name in ("Admissions", "Billing", "PatientDetails", "Procedures", "test")
)


In [0]:
# Dump the schema of every loaded silver table.
# Output order: Admissions, Billing, PatientDetails, Procedures, test, Doctor.
dataframes = [df_d, df_e, df_f, df_g, df_h, df]

for frame in dataframes:
    frame.printSchema()