In [0]:
# Importing the required libraries
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import os
from datetime import datetime
import time
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType, IntegerType
import pyspark.sql.functions as F

In [0]:
%run ../logs/logs_notebook

In [0]:
%run ../utilities/Futuredate

In [0]:
# Storage locations used throughout the notebook:
# the bronze output directory and the raw Procedures JSON input file
bronze_path = "/mnt/mock_prajwal/Healthcare_practice/bronze/"
source_path = "/mnt/mock_prajwal/Healthcare_practice/source_file/Procedures.json"

In [0]:
# Load the multi-line Procedures JSON document and stamp every row with the
# time it was ingested, then render the result for inspection
df = (
    spark.read
    .format("json")
    .option("multiline", "true")
    .load(source_path)
    .withColumn("ingestion_time", current_timestamp())
)
display(df)

In [0]:
# Expose date_format_udf_Policy_future as a Spark UDF returning DateType.
# The helper is not defined in this notebook — presumably it comes from the
# %run'd Futuredate utility notebook; confirm its exact conversion rules there.
futuredate = udf(f=date_format_udf_Policy_future, returnType=DateType())

# Replace proceduredate with the UDF's output for the same column
df = df.withColumn(
    "proceduredate",
    futuredate(col("proceduredate")),
)
display(df)

In [0]:
# Second Procedures extract, delivered as CSV: the first row holds the column
# names and Spark infers the column types from the data
df_proc = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("dbfs:/mnt/mock_prajwal/Healthcare_practice/source_file/Procedures.csv")
)


In [0]:

# Stamp the CSV-sourced rows with the ingestion timestamp (the same column name
# the JSON frame carries), then render the frame for inspection
df_proc = df_proc.withColumn("ingestion_time", current_timestamp())
display(df_proc)

In [0]:
# Combine the JSON and CSV procedure rows into one frame; unionByName matches
# columns by name rather than by position
union_rec = df.unionByName(df_proc)
display(union_rec)

In [0]:
# Persist the combined rows as Parquet under <bronze_path>/Procedures,
# replacing whatever was stored there before
union_rec.write.mode("overwrite").parquet(bronze_path + "Procedures")

In [0]:
# Bronze-layer load of the Procedures JSON file with audit logging.
# A PROCESSING entry is logged first, followed by COMPLETED on success or
# FAILED (with the error text) on any exception, which is then re-raised.

# Every value the except handler logs is initialised up front, so a failure on
# an early statement (e.g. the file listing) is reported as itself instead of
# being replaced by a NameError raised inside the handler.
file_path = source_path
file_type = "json"
Layer = "bronze"
file_size_kb = None    # unknown until the file listing succeeds
file_mod_time = None   # unknown until the file listing succeeds
processed_by = None    # unknown until the notebook context lookup succeeds
start_time = time.time()

try:
    # Log file details
    file_name = os.path.basename(file_path)
    file_extension = file_name.split(".")[-1]
    file_name = file_name.split(".")[0]
    file_info = dbutils.fs.ls(file_path)[0]
    file_size_kb = file_info.size / 1024
    # modificationTime is divided by 1000 before conversion, i.e. treated as milliseconds
    file_mod_time = datetime.fromtimestamp(file_info.modificationTime / 1000)
    processed_by = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags().apply('user')

    # The processing clock starts once the file metadata has been collected
    start_time = time.time()
    log_message(file_path, file_type, file_size_kb, file_mod_time, None, "PROCESSING", 0, processed_by, f"Reading json File {file_name}", Layer)

    # Read the JSON source file
    df = spark.read.option("multiline","true").format("json").load(source_path)

    # Add ingestion_time column
    df = df.withColumn("ingestion_time", current_timestamp())

    # Load the rows already stored in bronze and combine them with the new rows.
    # NOTE(review): this re-adds the JSON rows to bronze on every run and does not
    # apply the futuredate conversion to proceduredate — confirm this is intended.
    df_proc = spark.read.format("parquet").load(bronze_path + "Procedures")

    # Match columns by name so a differing column order cannot misalign the data
    union_rec = df.unionByName(df_proc)

    # Record count (rows read from the source file) and processing time
    record_count = df.count()
    processing_time_sec = int(time.time() - start_time)

    # Write to Parquet.
    # union_rec is evaluated lazily from the bronze files, so overwriting the
    # target directly would overwrite the very path the plan still reads from.
    # The result is written to a staging directory first and then swapped in.
    target_path = bronze_path + file_name
    staging_path = target_path + "_staging"
    union_rec.write.format("parquet").mode("overwrite").save(staging_path)
    dbutils.fs.rm(target_path, True)
    dbutils.fs.mv(staging_path, target_path, True)

    # Final status
    log_message(file_path, file_type, file_size_kb, file_mod_time, record_count, "COMPLETED", processing_time_sec, processed_by, f"Successfully processed {file_path}", Layer)

except Exception as e:
    processing_time_sec = int(time.time() - start_time)
    try:
        log_message(file_path, file_type, file_size_kb, file_mod_time, 0, "FAILED", processing_time_sec, processed_by, f"Error processing file {file_path}: {str(e)}", Layer)
    except Exception as log_error:
        # A failure while logging must not hide the original processing error
        print(f"Could not write FAILED log entry: {log_error}")
    # Bare raise re-raises the original exception with its traceback intact
    raise

In [0]:
# Read the bronze Procedures dataset back from Parquet and render it
df_bronze = spark.read.parquet(bronze_path + "Procedures")
display(df_bronze)

In [0]:
# Print the column names and data types of the bronze dataset as Spark read them back
df_bronze.printSchema()

In [0]:
# Count the rows in the bronze dataset; the bare name on the last line makes
# the notebook render the number as the cell output
df_bronze_count = df_bronze.count()
df_bronze_count

In [0]:
# Load the log records stored in Delta format at log_path and render them
log_path = "/mnt/mock_prajwal/Healthcare_practice/logs"
df_logs = spark.read.load(log_path, format="delta")
display(df_logs)