In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from datetime import datetime
import os
import time

In [0]:
%run ../logs/logs_notebook

In [0]:
%run ../utilities/Futuredate

In [0]:
%run ../utilities/Pastdate

In [0]:
# Storage roots for the bronze (input) and silver (output) layers.
# Both keep their trailing slash because later cells append the dataset
# folder name directly, e.g. bronze_path + "Admissions".
bronze_path, silver_path = (
    "/mnt/mock_prajwal/Healthcare_practice/bronze/",
    "/mnt/mock_prajwal/Healthcare_practice/silver/",
)

In [0]:
# Ingest the bronze "Admissions" parquet data, normalise its column names and
# date columns, publish it to the silver layer as Delta, and record
# PROCESSING / COMPLETED / FAILED audit entries through log_message
# (not defined in this notebook; presumably provided by the %run of
# ../logs/logs_notebook - confirm).

# Logging context is initialised BEFORE the try block so the except handler can
# always build a FAILED entry. Previously a failure on the first dbutils call
# raised NameError inside the handler and hid the real error.
file_path = bronze_path + "Admissions"
file_type = "parquet"
Layer = "silver"
file_size_kb = None      # unknown until the file listing succeeds
file_mod_time = None     # unknown until the file listing succeeds
processed_by = "unknown"
start_time = time.time()

try:
    # List the bronze folder once; dbutils.fs.ls raises if the path is missing,
    # which is now reported through the FAILED log entry below.
    files = [entry.name for entry in dbutils.fs.ls(bronze_path)]

    # Only proceed when the 'Admissions/' folder is present in the bronze layer.
    if 'Admissions/' in files:
        # Collect file metadata for the audit log (taken from the first entry
        # in the Admissions folder).
        file_name = os.path.basename(file_path).split(".")[0]
        file_info = dbutils.fs.ls(file_path)[0]
        file_size_kb = file_info.size / 1024
        # modificationTime is divided by 1000 before conversion, i.e. it is
        # treated as epoch milliseconds.
        file_mod_time = datetime.fromtimestamp(file_info.modificationTime / 1000)
        processed_by = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags().apply('user')

        # Processing time is measured from here, as in the original flow.
        start_time = time.time()
        log_message(file_path, file_type, file_size_kb, file_mod_time, None, "PROCESSING", 0, processed_by, f"Reading Parquet File {file_name}", Layer)

        # Load the parquet data into a DataFrame.
        df = spark.read.format("parquet").load(file_path)

        # Standardise column names: lower-case, spaces replaced by underscores.
        df_adm = df.toDF(*[c.lower().replace(" ", "_") for c in df.columns])

        # date_format_udf_Policy_future is not defined in this notebook
        # (presumably it comes from ../utilities/Futuredate - confirm).
        futuredate = udf(date_format_udf_Policy_future, DateType())

        df_adm = df_adm.withColumn("admission_date", futuredate(col("admission_date"))) \
                       .withColumn("discharge_date", futuredate(col("discharge_date")))

        record_count = df_adm.count()

        # Write first, then log COMPLETED: a failed write must never leave a
        # COMPLETED entry behind in the audit log.
        df_adm.write.mode("overwrite").format("delta").partitionBy("ingestion_time").option("overwriteSchema", "true").save(silver_path + "Admissions")

        processing_time_sec = int(time.time() - start_time)
        log_message(file_path, file_type, file_size_kb, file_mod_time, record_count, "COMPLETED", processing_time_sec, processed_by, f"Successfully processed {file_path}", Layer)
    else:
        # Make the no-op visible instead of silently skipping the load.
        print(f"'Admissions/' not found under {bronze_path}; nothing was processed.")
except Exception as e:
    processing_time_sec = int(time.time() - start_time)
    try:
        log_message(file_path, file_type, file_size_kb, file_mod_time, 0, "FAILED", processing_time_sec, processed_by, f"Error processing file {file_path}: {str(e)}", Layer)
    except Exception as log_error:
        # A logging problem must not replace the original failure.
        print(f"Could not write FAILED log entry: {log_error}")
    # Bare raise re-raises the original exception with its traceback intact.
    raise



In [0]:
# Load the Admissions Delta table back from the silver layer and render it,
# so the published result can be inspected.
df_silver_admissions = spark.read.load(silver_path + "Admissions", format="delta")
display(df_silver_admissions)

In [0]:
# Show the audit-log entries whose processed_time falls on today's date.
log_path = "/mnt/mock_prajwal/Healthcare_practice/logs"
df_logs = spark.read.format("delta").load(log_path)
# Compare against current_date() instead of a hard-coded literal so the
# "today" filter stays correct whenever the notebook is re-run.
# NOTE(review): current_date() is evaluated in the Spark session time zone -
# confirm that matches how processed_time is written.
df_logs_today = df_logs.filter(df_logs['processed_time'].cast("date") == current_date())
display(df_logs_today)

In [0]:
# Read the raw Admissions parquet data from the bronze layer.
df_adm = spark.read.parquet(bronze_path + "Admissions")

In [0]:
# Standardise column names: lower-case, with spaces turned into underscores.
normalised_names = [name.lower().replace(" ", "_") for name in df_adm.columns]
df_adm = df_adm.toDF(*normalised_names)

In [0]:
# Register date_format_udf_Policy_future as a UDF returning DateType and run
# both admission date columns through it. The helper is not defined in this
# notebook (presumably it comes from ../utilities/Futuredate - confirm).
futuredate = udf(date_format_udf_Policy_future, DateType())
for date_column in ("admission_date", "discharge_date"):
    df_adm = df_adm.withColumn(date_column, futuredate(col(date_column)))

display(df_adm)

Writing to silver path


In [0]:
# Publish the Admissions frame to the silver layer as a Delta table,
# replacing existing data and schema, partitioned by ingestion_time.
(
    df_adm.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("ingestion_time")
    .save(silver_path + "Admissions")
)

# Billing

In [0]:
# Read the raw Billing parquet data from the bronze layer.
df = spark.read.parquet(bronze_path + "Billing")

In [0]:
# Lower-case every Billing column name, then preview the frame.
df = df.toDF(*map(str.lower, df.columns))
display(df)

In [0]:
from pyspark.sql.functions import col, round

# Cast both monetary columns to double and round them to two decimal places.
# `round` here is the Spark column function imported above, not the builtin.
for amount_column in ("amountpaid", "totalamount"):
    df = df.withColumn(amount_column, round(col(amount_column).cast("double"), 2))
display(df)

In [0]:
from pyspark.sql.functions import col, lit

# Make sure totalamount is a double, then replace missing totals with the
# placeholder value 11111.
# NOTE(review): 11111 looks like a sentinel for "unknown" - confirm downstream
# consumers treat it that way.
df = (
    df.withColumn("totalamount", col("totalamount").cast("double"))
    .na.fill({"totalamount": 11111})
)
display(df)

In [0]:
# Canonical spellings for payment modes; any value not listed here is kept
# exactly as it is.
payment_mode_aliases = (
    ("NetBanking", "Net Banking"),
    ("Credit Card CC", "Credit Card"),
    ("DebitCard", "Debit Card"),
)

# Build one chained when(...).when(...) expression from the alias table.
mode_expr = None
for raw_value, clean_value in payment_mode_aliases:
    is_match = col("paymentmode") == raw_value
    mode_expr = when(is_match, clean_value) if mode_expr is None else mode_expr.when(is_match, clean_value)

df = df.withColumn("paymentmode", mode_expr.otherwise(col("paymentmode")))
display(df)

In [0]:
# List the unique billingdepartment values present in the Billing data.
distinct_values = df.select("billingdepartment").dropDuplicates()
display(distinct_values)

In [0]:
from pyspark.sql.functions import col, sum

# Count nulls per Billing column: each null is flagged as 1 and the flags are
# summed. `sum` here is the Spark aggregate imported above, not the builtin.
null_flags = []
for column_name in df.columns:
    null_flags.append(sum(col(column_name).isNull().cast("int")).alias(column_name))
null_counts = df.select(null_flags)
display(null_counts)

In [0]:
# Publish the Billing frame to the silver layer as a Delta table,
# replacing existing data and schema, partitioned by ingestion_time.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("ingestion_time")
    .save(silver_path + "Billing")
)

# DOCTOR

In [0]:
# Read the raw Doctor parquet data from the bronze layer and preview it.
df_dr = spark.read.parquet(bronze_path + "Doctor")
display(df_dr)

In [0]:
# Trim leading/trailing whitespace from every Doctor column name.
df_dr = df_dr.toDF(*map(str.strip, df_dr.columns))
display(df_dr)

In [0]:
# Standardise Doctor column names: lower-case, spaces replaced by underscores.
normalised_names = [name.lower().replace(" ", "_") for name in df_dr.columns]
df_dr = df_dr.toDF(*normalised_names)

display(df_dr)

In [0]:
# Register date_format_udf_Policy_future as a UDF returning DateType and use
# it to rewrite joined_date. The helper is not defined in this notebook
# (presumably it comes from ../utilities/Futuredate - confirm).
futuredate = udf(date_format_udf_Policy_future, DateType())
joined_date_converted = futuredate(col("joined_date"))
df_dr = df_dr.withColumn("joined_date", joined_date_converted)

display(df_dr)
                         


In [0]:
# List the unique department values present in the Doctor data.
distinct_speciality = df_dr.select("department").dropDuplicates()
display(distinct_speciality)

In [0]:
from pyspark.sql.functions import regexp_replace

# Normalise the dotted degree spelling "M.B.B.S" to "MBBS".
# The pattern is a regular expression, so each dot is escaped. Left unescaped,
# "." matches ANY character and would also rewrite unrelated text such as
# "MaBbBcS".
df_dr = df_dr.withColumn("degree", regexp_replace(col("degree"), r"M\.B\.B\.S", "MBBS"))
display(df_dr)

In [0]:
from pyspark.sql.functions import col, sum

# Count nulls per Doctor column: each null is flagged as 1 and the flags are
# summed. `sum` here is the Spark aggregate imported above, not the builtin.
null_flags = []
for column_name in df_dr.columns:
    null_flags.append(sum(col(column_name).isNull().cast("int")).alias(column_name))
null_counts = df_dr.select(null_flags)
display(null_counts)

In [0]:
# Publish the Doctor frame to the silver layer as a Delta table,
# replacing existing data and schema, partitioned by ingestion_time.
(
    df_dr.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("ingestion_time")
    .save(silver_path + "Doctor")
)