In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from datetime import datetime
import os
import time

In [0]:
%run ../logs/logs_notebook

In [0]:
%run ../utilities/Futuredate

In [0]:
%run ../utilities/Pastdate

In [0]:
# Storage roots for the two layers this notebook touches:
# bronze is read from, silver is written to. Both keep their trailing slash
# because dataset folder names are appended to them by plain concatenation.
bronze_path, silver_path = (
    "/mnt/mock_prajwal/Healthcare_practice/bronze/",
    "/mnt/mock_prajwal/Healthcare_practice/silver/",
)

In [0]:
# Silver-layer cleansing of the "PatientDetails day 1" bronze dataset.
# Flow: list the bronze folder -> log PROCESSING -> read parquet -> normalise
# column names and values -> count -> write Delta to silver -> log COMPLETED.
# Any failure is logged as FAILED and the original exception is re-raised.

# Everything the except-handler reads is initialised up front. Without this, a
# failure that happens before the metadata is collected (for example the bronze
# path cannot be listed) raised a NameError from inside the handler and hid the
# real error.
dataset_name = "PatientDetails day 1"
start_time = time.time()
file_path = bronze_path + dataset_name
file_type = "parquet"
Layer = "silver"
file_size_kb = None
file_mod_time = None
processed_by = None

try:
    # List the bronze folder once and reuse the result for both checks.
    bronze_entries = dbutils.fs.ls(bronze_path)
    files = [entry.name for entry in bronze_entries]

    # The dataset folder is matched by its listed name, which ends with "/".
    # NOTE(review): when the folder is absent the cell finishes silently without
    # writing any log entry - confirm that this is intended.
    if dataset_name + "/" in files:
        # File metadata that accompanies every log entry
        file_name = os.path.basename(file_path)
        file_extension = file_name.split(".")[-1]
        file_name = file_name.split(".")[0]
        # Size and modification time come from the FIRST entry inside the folder only
        file_info = dbutils.fs.ls(file_path)[0]
        file_size_kb = file_info.size / 1024
        file_mod_time = datetime.fromtimestamp(file_info.modificationTime / 1000)  # ms -> s
        processed_by = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags().apply('user')

        # The processing clock starts just before the PROCESSING entry is logged
        start_time = time.time()
        log_message(file_path, file_type, file_size_kb, file_mod_time, None, "PROCESSING", 0, processed_by, f"Reading Parquet File {file_name}", Layer)

        # Load the parquet folder into a DataFrame
        df = spark.read.format("parquet").load(file_path)

        # date_format_udf is not defined in this notebook; presumably it is provided
        # by one of the %run utility notebooks - TODO confirm which one.
        pastdate = udf(date_format_udf, DateType())
        df = df.withColumn("dob", pastdate(col("dob")))

        # Column names: lower-case, then strip spaces
        df = df.toDF(*[c.lower().replace(" ", "") for c in df.columns])

        # Defaults for missing title / gender
        df = df.withColumn("title", coalesce(col("title"), lit("Mr.")))
        df = df.withColumn("gender", coalesce(col("gender"), lit("Male")))

        # Gender spelling fixes. The "Oth" pattern also matches inside an already
        # correct "Other" (producing "Otherer"), which the last replacement undoes.
        df = df.withColumn("gender", regexp_replace("gender", "Oth", "Other"))
        df = df.withColumn("gender", regexp_replace("gender", "M ale", "Male"))
        df = df.withColumn("gender", regexp_replace("gender", "Fe male", "Female"))
        df = df.withColumn("gender", regexp_replace("gender", "Otherer", "Other"))

        # Title normalisation.
        # NOTE(review): the "." in "Professor." and " Ms." is an unescaped regex
        # wildcard (matches any character) - confirm whether a literal dot was meant.
        df = df.withColumn("title", regexp_replace("title", "Doctor", "Dr."))
        df = df.withColumn("title", regexp_replace("title", "Miss", "Ms."))
        df = df.withColumn("title", regexp_replace("title", "Professor.", "Prof."))
        df = df.withColumn("title", regexp_replace("title", " Ms.", "Ms."))

        # Null placeholder for chronicconditions_* columns, address and city.
        # NOTE(review): "Unknow" looks like a typo for "Unknown"; it is kept as-is
        # because downstream consumers may already depend on this exact value.
        df = df.select([
            when(col(c).isNull(), "Unknow").otherwise(col(c)).alias(c)
            if c.startswith("chronicconditions_") or c in ["address", "city"]
            else col(c)
            for c in df.columns
        ])

        # ZIP codes: keep digits only, then left-pad with zeros to 5 characters
        # (lpad also shortens values longer than 5 characters to 5).
        df = df.withColumn("zipcode", regexp_replace(col("zipcode"), "[^0-9]", ""))
        df = df.withColumn("zipcode", lpad(col("zipcode"), 5, "0"))

        # Email: drop every character outside letters, digits and @ . _ -
        df = df.withColumn('emailaddress', regexp_replace('emailaddress', r'[^a-zA-Z0-9@._-]', ''))

        # Phone numbers: keep digits only
        df = df.withColumn("contactnumber", regexp_replace(col("contactnumber"), "[^0-9]", ""))

        # Flag columns: map the listed raw encodings to Yes/No, leave anything else untouched
        df = df.withColumn("cancerhistory",
                        when(col("cancerhistory") == "TRUE", "Yes")
                        .when(col("cancerhistory") == "0", "No")
                        .when(col("cancerhistory") == "1", "Yes")
                        .otherwise(col("cancerhistory")))

        df = df.withColumn("anytransplants",
                        when(col("anytransplants") == "FALSE", "No")
                        .otherwise(col("anytransplants")))

        df = df.withColumn("smoker",
                        when(col("smoker") == "0", "No")
                        .when(col("smoker") == "1", "Yes")
                        .otherwise(col("smoker")))

        # Placeholders for values that are still null after cleansing
        df = df.withColumn("dob", when(col("dob").isNull(), lit("1999-01-01")).otherwise(col("dob")).cast("date"))
        df = df.withColumn("contactnumber", when(col("contactnumber").isNull(), lit("00000000")).otherwise(col("contactnumber")))

        record_count = df.count()

        # Write BEFORE logging COMPLETED, so that a failed write is reported only
        # as FAILED instead of COMPLETED followed by FAILED.
        df.write.mode("overwrite").format("delta").partitionBy("ingestion_time").option("overwriteSchema", "true").save(silver_path + dataset_name)

        # Processing time now covers the write as well
        processing_time_sec = int(time.time() - start_time)
        log_message(file_path, file_type, file_size_kb, file_mod_time, record_count, "COMPLETED", processing_time_sec, processed_by, f"Successfully processed {file_path}", Layer)
# handle exceptions
except Exception as e:
    processing_time_sec = int(time.time() - start_time)
    try:
        log_message(file_path, file_type, file_size_kb, file_mod_time, 0, "FAILED", processing_time_sec, processed_by, f"Error processing file {file_path}: {str(e)}", Layer)
    except Exception as log_error:
        # Failing to record the log entry must not hide the original error
        print(f"Could not write FAILED log entry: {log_error}")
    # Bare raise re-raises the original exception with its traceback intact
    raise



In [0]:
# Reload the Delta table stored under the silver "PatientDetails day 1" folder
# and render it for a visual check.
# NOTE(review): the variable name says "admissions" but the folder holds patient
# details; the name is kept because other cells may reference it.
silver_patient_details_path = silver_path + "PatientDetails day 1"
df_silver_admissions = spark.read.load(silver_patient_details_path, format="delta")
display(df_silver_admissions)

In [0]:
# Show the pipeline log entries whose processed_time falls on the current date.
log_path = "/mnt/mock_prajwal/Healthcare_practice/logs"
df_logs = spark.read.format("delta").load(log_path)
# current_date() replaces the hard-coded "2025-05-19" literal, which made "today"
# mean that one fixed day no matter when the notebook was run.
# current_date comes from the notebook's `from pyspark.sql.functions import *`.
df_logs_today = df_logs.filter(df_logs['processed_time'].cast("date") == current_date())
display(df_logs_today)

In [0]:
# from pyspark.sql.functions import coalesce, lit

# df = df.withColumn("title", coalesce(col("title"), lit("Mr.")))
# df = df.withColumn("gender", coalesce(col("gender"), lit("Male")))
# display(df)

In [0]:
# from pyspark.sql.functions import regexp_replace

# df = df.withColumn("gender", regexp_replace("gender", "Oth", "Other"))
# df = df.withColumn("gender", regexp_replace("gender", "M ale", "Male"))
# df = df.withColumn("gender", regexp_replace("gender", "Fe male", "Female"))
# display(df)

In [0]:
# df = df.withColumn("gender", regexp_replace("gender", "Otherer", "Other"))
# display(df)

In [0]:
# from pyspark.sql.functions import regexp_replace

# df = df.withColumn("title", regexp_replace("title", "Doctor", "Dr."))
# df = df.withColumn("title", regexp_replace("title", "Miss", "Ms."))
# df = df.withColumn("title", regexp_replace("title", "Professor.", "Prof."))
# df = df.withColumn("title", regexp_replace("title", " Ms.", "Ms."))
# display(df)

In [0]:
# from pyspark.sql.functions import when

# df = df.select([when(col(c).isNull(), "Unknow").otherwise(col(c)).alias(c) if c.startswith("chronicconditions_") or c in ["address", "city"] else col(c) for c in df.columns])
# display(df)

In [0]:
# df = df.withColumn("cancerhistory", 
#                    when(col("cancerhistory") == "TRUE", "Yes")
#                    .when(col("cancerhistory") == "0", "No")
#                    .when(col("cancerhistory") == "1", "Yes")
#                    .otherwise(col("cancerhistory")))

# display(df)

In [0]:
# df = df.withColumn("anytransplants", 
#                    when(col("anytransplants") == "FALSE", "No")
#                    .otherwise(col("anytransplants")))

# display(df)

In [0]:
# df = df.withColumn("smoker", 
#                    when(col("smoker") == "0", "No")
#                    .when(col("smoker") == "1", "Yes")
#                    .otherwise(col("smoker")))

# display(df)

In [0]:
# from pyspark.sql.functions import col, lit, when

# df = df.withColumn("dob", when(col("dob").isNull(), lit("1999-01-01")).otherwise(col("dob")).cast("date"))
# df = df.withColumn("contactnumber", when(col("contactnumber").isNull(), lit("00000000")).otherwise(col("contactnumber")))

# display(df)

In [0]:
# from pyspark.sql.functions import col, sum

# null_counts = df.select([sum(col(c).isNull().cast("int")).alias(c) for c in df.columns])
# display(null_counts)