In [0]:
# Import required modules, types and functions
import os
from datetime import datetime
import time
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType, IntegerType
import pyspark.sql.functions as F


# Storage locations: raw source files, bronze output directory and the Delta log table
source_path = "/mnt/Prajwal/Capstone_Project/Source_Files"
bronze_path = "/mnt/Prajwal/Capstone_Project/bronze/"
log_path = "/mnt/Prajwal/Capstone_Project/logs"

# The listing is neither assigned nor displayed here.
# NOTE(review): presumably kept as an early check that the source directory is reachable - confirm.
dbutils.fs.ls(source_path)

# Source files to ingest (names are relative to source_path)
files = [
    "Loanpayments.csv",
    "bankcustomer_source1_day0.csv",
    "bankcustomer_source2_day0.json",
    "bankloandetails.csv",
]

# Schema of the log table; every column is nullable
log_schema = (
    StructType()
    .add("timestamp", TimestampType(), True)
    .add("log_level", StringType(), True)
    .add("message", StringType(), True)
    .add("file_type", StringType(), True)
    .add("file_size_kb", DoubleType(), True)
    .add("file_mod_time", TimestampType(), True)
    .add("record_count", IntegerType(), True)
    .add("status", StringType(), True)
    .add("processing_time_sec", IntegerType(), True)
    .add("processed_by", StringType(), True)
    .add("processed_time", TimestampType(), True)
)

# Function to log message with additional metadata
def log_message(source_file, file_type, file_size_kb, file_mod_time, record_count, status, processing_time_sec, processed_by, message):
    """
    Append one entry to the Delta log table at ``log_path`` and echo the message to stdout.

    Args:
        source_file: Path of the file the entry refers to. Accepted for call-site
            compatibility; ``log_schema`` has no column for it, so it is not persisted.
        file_type: Label of the file format (e.g. "CSV", "JSON", "UNKNOWN").
        file_size_kb: File size in kilobytes, or None when unknown.
        file_mod_time: Last-modified time of the file, or None when unknown.
        record_count: Number of records read, or None when not yet known.
        status: Processing status (e.g. "PROCESSING", "COMPLETED", "FAILED").
        processing_time_sec: Elapsed processing time in whole seconds.
        processed_by: Identifier of the user running the notebook.
        message: Free-text description of the event.
    """
    # One timestamp for the whole entry so "timestamp", "processed_time" and the
    # console echo agree with each other.
    now = datetime.now()

    # Failures are recorded as ERROR so they can be filtered by log_level;
    # every other status stays at INFO.
    log_level = "ERROR" if status == "FAILED" else "INFO"

    log_entry = [(now, log_level, message, file_type, file_size_kb, file_mod_time, record_count, status, processing_time_sec, processed_by, now)]
    log_df = spark.createDataFrame(log_entry, log_schema)
    log_df.write.format("delta").mode("append").save(log_path)
    print(f"{now} - [{message}]")


# Function to process files in a directory and log metadata
def process_files_in_directory(files, bronze_path):
    """
    Read each listed source file, add an ingestion timestamp and write it to the bronze layer as Parquet.

    Each stage (read, write, completion, failure) is recorded through ``log_message``.

    Args:
        files: Iterable of file names (including extension) located under the
            module-level ``source_path``. CSV and JSON files are supported;
            any other extension is logged as "FAILED" and skipped.
        bronze_path: Destination directory. Each file is written to a
            sub-directory named after the file without its extension, using
            overwrite mode.

    Raises:
        Exception: Any error raised while inspecting, reading or writing a file
            is logged with status "FAILED" and re-raised, which stops the
            processing of the remaining files.
    """
    # The notebook user is the same for every file, so resolve it once up front.
    processed_by = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags().apply('user')

    for file_name in files:
        file_path = os.path.join(source_path, file_name)

        # Initialise everything the failure handler reports, so that an early
        # error (for example a missing file) is logged instead of being masked
        # by a NameError on a not-yet-assigned variable.
        file_type = "UNKNOWN"
        file_size_kb = None
        file_mod_time = None
        start_time = time.time()

        try:
            # splitext keeps dots that belong to the base name ("a.b.csv" -> "a.b"),
            # so differently named files cannot collapse onto one bronze directory.
            base_name, dotted_extension = os.path.splitext(file_name)
            file_extension = dotted_extension.lstrip(".").lower()

            # Log file details
            file_info = dbutils.fs.ls(file_path)[0]
            file_size_kb = file_info.size / 1024
            file_mod_time = datetime.fromtimestamp(file_info.modificationTime / 1000)

            # Start timing the actual read/write work for this file
            start_time = time.time()

            if file_extension == "csv":
                file_type = "CSV"
                log_message(file_path, file_type, file_size_kb, file_mod_time, None, "PROCESSING", 0, processed_by, f"Reading CSV File {base_name}")
                df = spark.read.format("csv").option("header", "true").load(file_path)

            elif file_extension == "json":
                file_type = "JSON"
                log_message(file_path, file_type, file_size_kb, file_mod_time, None, "PROCESSING", 0, processed_by, f"Reading Json File {base_name}")
                df = spark.read.option("multiline", "true").format("json").load(file_path)

            else:
                log_message(file_path, file_type, file_size_kb, file_mod_time, 0, "FAILED", 0, processed_by, f"Unsupported file format: {file_extension}. Only CSV and JSON files are supported.")
                continue

            # Adding ingestion time
            df = df.withColumn("ingest_time", F.current_timestamp())

            # Record count and time taken to read
            record_count = df.count()
            processing_time_sec = int(time.time() - start_time)
            log_message(file_path, file_type, file_size_kb, file_mod_time, record_count, "PROCESSING", processing_time_sec, processed_by, f"Successfully read {record_count} records from {file_path}")

            # Writing data to output path; os.path.join works whether or not
            # bronze_path ends with a slash.
            log_message(file_path, file_type, file_size_kb, file_mod_time, record_count, "PROCESSING", processing_time_sec, processed_by, f"Writing data to {bronze_path}")
            df.write.format("parquet").mode("overwrite").save(os.path.join(bronze_path, base_name))

            # Final status; re-measure so the reported time includes the write
            processing_time_sec = int(time.time() - start_time)
            log_message(file_path, file_type, file_size_kb, file_mod_time, record_count, "COMPLETED", processing_time_sec, processed_by, f"Successfully processed {file_path}")
        except Exception as e:
            processing_time_sec = int(time.time() - start_time)
            log_message(file_path, file_type, file_size_kb, file_mod_time, 0, "FAILED", processing_time_sec, processed_by, f"Error processing file {file_path}: {str(e)}")
            # Bare raise keeps the original traceback intact
            raise


# Run the bronze ingestion for the configured source files; any failure is re-raised by the function
process_files_in_directory(files, bronze_path)
            




In [0]:
# Load the Delta log table and render it for inspection.
# log_path is set again here so this cell does not rely on the first cell's variables.
log_path = "/mnt/Prajwal/Capstone_Project/logs"
log_df = spark.read.load(log_path, format="delta")
display(log_df)

In [0]:
%run ../Silver/Capstone_Silver_bankcustomer