The silver zone is responsible for cleaning, standardization, and applying business rules.
Auto Optimize & Small File Compaction (Delta Optimization)
Memory & Shuffle Optimizations (if transformations occur)
Data Deduplication (Removing Duplicates)
Adjusting Shuffle Partitions & Enabling AQE
Data Type Optimization (Ensuring efficient storage & performance)


Auto Optimize & Compact Small Files

Enables Delta Lake's Auto Optimize wherever necessary.


In [0]:
# Switch on Delta optimized writes and auto-compaction for this Spark session
# by issuing each SET statement in turn.
for _delta_setting in (
    "SET spark.databricks.delta.optimizeWrite.enabled = true",
    "SET spark.databricks.delta.autoCompact.enabled = true",
):
    spark.sql(_delta_setting)

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from datetime import datetime
import os
import time

In [0]:
%run ../logs/logs_notebook

In [0]:
%run ../utilities/Futuredate

In [0]:
%run ../utilities/Pastdate

In [0]:
# Root folders of the bronze (input) and silver (output) layers.
# Both keep their trailing slash because dataset names are appended directly.
bronze_path, silver_path = (
    "/mnt/mock_prajwal/Mock2/bronze/",
    "/mnt/mock_prajwal/Mock2/silver/",
)

In [0]:
# Silver-layer load: for each bronze dataset that is present, read it,
# standardize it and overwrite the matching Delta table under silver_path.
# Progress is recorded through log_message: PROCESSING when a dataset is picked
# up, COMPLETED once it has been written, FAILED if anything raises.

# Seed the logging context up front so the except handler can always build a
# FAILED entry, even when the failure happens before any dataset was picked up
# (for example when listing bronze_path itself raises).
file_path = bronze_path
file_type = "parquet"
Layer = "silver"
file_size_kb = None
file_mod_time = None
processed_by = None
start_time = time.time()

try:
    # A single listing tells us which datasets exist; an empty folder simply
    # matches neither branch below.
    files = [entry.name for entry in dbutils.fs.ls(bronze_path)]

    # ------------------------------------------------------------------
    # FF_Customer_Details_Day0
    # ------------------------------------------------------------------
    if 'FF_Customer_Details_Day0/' in files:
        # Collect the metadata that goes into every log entry for this dataset.
        file_path = bronze_path + "FF_Customer_Details_Day0"
        file_name = os.path.basename(file_path).split(".")[0]
        # NOTE(review): only the first entry of the folder is inspected, so the
        # logged size/modification time describe that one entry, not the whole
        # dataset - confirm this is intended.
        file_info = dbutils.fs.ls(file_path)[0]
        file_size_kb = file_info.size / 1024
        file_mod_time = datetime.fromtimestamp(file_info.modificationTime / 1000)
        processed_by = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags().apply('user')

        # The clock starts once the metadata is known.
        start_time = time.time()
        log_message(file_path, file_type, file_size_kb, file_mod_time, None, "PROCESSING", 0, processed_by, f"Reading Parquet File {file_name}", Layer)

        # Load the bronze parquet data.
        df = spark.read.format("parquet").load(file_path)

        # Source column -> snake_case column. "ingestion_time" already has its
        # target name, so it needs no entry here.
        customer_details_renames = {
            "Row ID": "row_id",
            "Order Priority": "order_priority",
            "Discount": "discount",
            "Unit Price": "unit_price",
            "Shipping Cost": "shipping_cost",
            "Customer ID": "customer_id",
            "Customer Name": "customer_name",
            "Ship Mode": "ship_mode",
            "Customer Segment": "customer_segment",
            "Product Category": "product_category",
            "Product Sub-Category": "product_sub_category",
            "Product Container": "product_container",
            "Product Name": "product_name",
            "Product Base Margin": "product_base_margin",
            "Country": "country",
            "Region": "region",
            "State or Province": "state",
            "City": "city",
            "Postal Code": "postal_code",
            "Order Date": "order_date",
            "Ship Date": "ship_date",
            "Profit": "profit",
            "Quantity ordered new": "quantity_ordered_new",
            "Sales": "sales",
            "Order ID": "order_id",
            "Customer_DOB": "customer_dob",
            "Customer_Maritial_Status": "customer_marital_status",
            "Gender": "gender",
            "Valid_From": "valid_from",
            "Valid_To": "valid_to",
            "Sales_Rep_Id": "sales_rep_id",
            "Sales_Rep_Name": "sales_rep_name",
        }
        df_renamed = df
        for source_name, target_name in customer_details_renames.items():
            df_renamed = df_renamed.withColumnRenamed(source_name, target_name)

        # Normalize the date columns through the shared UDF (declared to
        # return DateType).
        futuredate = udf(date_format_udf_Policy_future, DateType())
        df_renamed = df_renamed.withColumn("ship_date", futuredate(col("ship_date")))
        df_renamed = df_renamed.withColumn("valid_from", futuredate(col("valid_from")))
        # Cast the fallback to a date: coalescing a date with a bare string
        # literal would otherwise widen the whole column to string.
        df_renamed = df_renamed.withColumn("order_date", coalesce(futuredate(col("order_date")), lit("2017-01-01").cast(DateType())))
        # NOTE(review): valid_to is not passed through the UDF, so its type
        # depends on the bronze schema; the default is left as a plain literal.
        df_re = df_renamed.withColumn("valid_to", coalesce(col("valid_to"), lit("2018-12-31")))

        # Missing priorities become "Not Specified"; every other value is kept
        # unchanged.
        df_re = df_re.withColumn("order_priority", coalesce(col("order_priority"), lit("Not Specified")))

        # Default margin for rows that have none.
        df_re = df_re.withColumn("product_base_margin", when(col("product_base_margin").isNull(), 0.11).otherwise(col("product_base_margin")))

        # Standardize ZIP codes: keep digits only, then left-pad to 5 characters.
        df_re = df_re.withColumn("postal_code", regexp_replace(col("postal_code"), "[^0-9]", ""))
        df_re = df_re.withColumn("postal_code", lpad(col("postal_code"), 5, "0"))

        record_count = df_re.count()

        # Write first and report COMPLETED only afterwards, so a failed write
        # can never leave a COMPLETED entry behind.
        df_re.write.mode("overwrite").format("delta").partitionBy("ingestion_time").option("overwriteSchema", "true").save(silver_path + "FF_Customer_Details_Day0")
        processing_time_sec = int(time.time() - start_time)
        log_message(file_path, file_type, file_size_kb, file_mod_time, record_count, "COMPLETED", processing_time_sec, processed_by, f"Successfully processed {file_path}", Layer)

    # ------------------------------------------------------------------
    # CustContact
    # ------------------------------------------------------------------
    if 'CustContact/' in files:
        # Collect the metadata that goes into every log entry for this dataset.
        file_path = bronze_path + "CustContact"
        file_name = os.path.basename(file_path).split(".")[0]
        # NOTE(review): as above, only the first entry of the folder is inspected.
        file_info = dbutils.fs.ls(file_path)[0]
        file_size_kb = file_info.size / 1024
        file_mod_time = datetime.fromtimestamp(file_info.modificationTime / 1000)
        processed_by = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags().apply('user')

        start_time = time.time()
        # NOTE(review): the message mentions "Json and txt" although the data is
        # read as parquet; the text is kept as-is in case log consumers rely on it.
        log_message(file_path, file_type, file_size_kb, file_mod_time, None, "PROCESSING", 0, processed_by, f"Reading Json and txt File {file_name}", Layer)

        # Load the bronze parquet data.
        df = spark.read.format("parquet").load(file_path)

        # Source column -> snake_case column ("ingestion_time" is already fine).
        cust_contact_renames = {
            "Customer_ID": "customer_id",
            "Email": "email",
            "Phone_Number": "phone_number",
        }
        for source_name, target_name in cust_contact_renames.items():
            df = df.withColumnRenamed(source_name, target_name)

        # Strip characters outside letters, digits and @ . _ - from email.
        df = df.withColumn('email', regexp_replace('email', r'[^a-zA-Z0-9@._-]', ''))

        # Keep digits only in phone numbers.
        df = df.withColumn("phone_number", regexp_replace(col("phone_number"), "[^0-9]", ""))

        record_count = df.count()

        # Same ordering as above: write, then report COMPLETED.
        df.write.mode("overwrite").format("delta").partitionBy("ingestion_time").option("overwriteSchema", "true").save(silver_path + "CustContact")
        processing_time_sec = int(time.time() - start_time)
        log_message(file_path, file_type, file_size_kb, file_mod_time, record_count, "COMPLETED", processing_time_sec, processed_by, f"Successfully processed {file_path}", Layer)
except Exception as e:
    processing_time_sec = int(time.time() - start_time)
    try:
        log_message(file_path, file_type, file_size_kb, file_mod_time, 0, "FAILED", processing_time_sec, processed_by, f"Error processing file {file_path}: {str(e)}", Layer)
    except Exception as log_error:
        # Logging is best-effort here: a problem while writing the FAILED entry
        # must not hide the error that actually broke the load.
        print(f"Unable to record FAILED log entry: {log_error}")
    # Bare raise re-raises the original exception with its traceback intact.
    raise

In [0]:
# Read back the Delta log table written by log_message and show it.
log_path = "/mnt/mock_prajwal/Mock2/logs"
_logs_reader = spark.read.format("delta")
df_logs = _logs_reader.load(log_path)
display(df_logs)

In [0]:
# Read back both silver Delta tables and show them for a visual check.
silver_path = "/mnt/mock_prajwal/Mock2/silver/"

df_customer_details, df_cust_contact = (
    spark.read.format("delta").load(silver_path + table_name)
    for table_name in ("FF_Customer_Details_Day0", "CustContact")
)

for _silver_df in (df_customer_details, df_cust_contact):
    display(_silver_df)