In [0]:
# Importing the required libraries
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import os
from datetime import datetime
import time
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType, IntegerType
import pyspark.sql.functions as F

In [0]:
%run ../logs/logs_notebook

In [0]:
# Storage locations used by this notebook.
# Bronze-layer output directory (the trailing slash is relied on when the file name is appended).
bronze_path = "/mnt/mock_prajwal/Healthcare_practice/bronze/"
# Raw CSV file to ingest.
source_path = "/mnt/mock_prajwal/Healthcare_practice/source_file/Doctor.csv"

In [0]:
# Ingest the source CSV into the Bronze layer as Parquet and record
# PROCESSING / COMPLETED / FAILED entries through log_message
# (log_message is expected to come from the %run'd logs notebook).

# Everything the failure handler reads is bound BEFORE the try block, so an early
# error (for example the source file not being found) is logged and re-raised as
# itself instead of being replaced by a NameError raised inside the handler.
start_time = time.time()
file_path = source_path
file_type = "csv"
Layer = "Bronze"
# NOTE(review): assumes log_message accepts None for these three fields, as it
# already does for record_count in the PROCESSING call below - confirm.
file_size_kb = None    # unknown until the file listing succeeds
file_mod_time = None   # unknown until the file listing succeeds
processed_by = None    # unknown until the notebook context has been read

try:
    # Base name and extension of the source file. splitext only strips the final
    # suffix, so dots inside the name are kept and a missing extension yields "".
    file_name, file_extension = os.path.splitext(os.path.basename(file_path))
    file_extension = file_extension.lstrip(".")

    # File metadata for the log entry
    file_info = dbutils.fs.ls(file_path)[0]
    file_size_kb = file_info.size / 1024
    # modificationTime is divided by 1000 before conversion (presumably epoch milliseconds)
    file_mod_time = datetime.fromtimestamp(file_info.modificationTime / 1000)
    processed_by = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags().apply('user')

    # Log start of processing
    log_message(file_path, file_type, file_size_kb, file_mod_time, None, "PROCESSING", 0, processed_by, f"Reading CSV File {file_name}", Layer)

    # Read CSV file
    df = spark.read.option("delimiter", ",").option("inferSchema", "true").csv(source_path, header=True, quote='"', escape='"')

    # Add ingestion_time column
    df = df.withColumn("ingestion_time", current_timestamp())

    # Record count
    record_count = df.count()

    # Write to Parquet
    df.write.format("parquet").mode("overwrite").save(bronze_path + file_name)

    # Measure the elapsed time only after the write, so the logged duration
    # covers the whole read -> count -> write sequence.
    processing_time_sec = int(time.time() - start_time)

    # Final status
    log_message(file_path, file_type, file_size_kb, file_mod_time, record_count, "COMPLETED", processing_time_sec, processed_by, f"Successfully processed {file_path}", Layer)

except Exception as e:
    processing_time_sec = int(time.time() - start_time)
    try:
        log_message(file_path, file_type, file_size_kb, file_mod_time, 0, "FAILED", processing_time_sec, processed_by, f"Error processing file {file_path}: {str(e)}", Layer)
    except Exception as log_error:
        # A failure while logging must not hide the ingestion error itself.
        print(f"Could not write FAILED log entry: {log_error}")
    # Bare raise re-raises the original ingestion error with its traceback intact.
    raise

In [0]:
# Load the Parquet output written for this file back from the Bronze layer
# and render it for a visual check.
df_bronze = spark.read.parquet(f"{bronze_path}{file_name}")
display(df_bronze)

In [0]:
# Show the log entries written today.
log_path = "/mnt/mock_prajwal/Healthcare_practice/logs"
df_logs = spark.read.format("delta").load(log_path)

# Compare against the current date instead of a hard-coded literal, so re-running
# the notebook on any day shows that day's entries.
# NOTE(review): both sides are evaluated by Spark, so "today" presumably follows
# the Spark session time zone - confirm this matches how processed_time is written.
df_logs_today = df_logs.filter(F.col("processed_time").cast("date") == F.current_date())
display(df_logs_today)