The silver zone is responsible for cleaning and standardizing the data and for applying business rules.
Auto Optimize & Small File Compaction (Delta Optimization)
Memory & Shuffle Optimizations (if transformations occur)
Data Deduplication (Removing Duplicates)
Adjusting Shuffle Partitions & Enabling AQE
Data Type Optimization (Ensuring efficient storage & performance)


Auto Optimize & Compact Small Files

Enables Delta Lake's Auto Optimize wherever necessary.


In [0]:
# Enable Delta optimized writes and auto compaction for this Spark session
# by issuing one SET statement per setting.
for set_statement in (
    "SET spark.databricks.delta.optimizeWrite.enabled = true",
    "SET spark.databricks.delta.autoCompact.enabled = true",
):
    spark.sql(set_statement)

In [0]:
# Importing necessary libraries
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.sql.functions import substring
from pyspark.sql.types import DateType

In [0]:
# Bronze-layer location of the South region insurance extract.
bronze_path = "/mnt/mock_prajwal/bronze/Insurance_details_South_0"

# Load the Parquet files at that location into a DataFrame.
df = spark.read.load(bronze_path, format="parquet")

In [0]:
# Raw bronze column name -> snake_case silver column name.
# Keys are spelled exactly as the original rename calls spelled them.
south_column_renames = {
    "Customer ID": "customer_id",
    "Customer Name": "customer_name",
    "Customer_Segment": "customer_segment",
    "Maritial_Status": "marital_status",
    "Gender": "gender",
    "Effective_Start_Dt": "effective_start_date",
    "Effective_End_Dt": "effective_end_date",
    "Policy_Type_Id": "policy_type_id",
    "Policy_Id": "policy_id",
    "Premium_Amt": "premium_amount",
    "Policy_Start_Dt": "policy_start_date",
    "Policy_End_Dt": "policy_end_date",
    "Next_Premium_Dt": "next_premium_date",
    "Actual_Premium_Paid_Dt": "actual_premium_paid_date",
    "Country": "country",
    "Region": "region",
    "State or Province": "state",
    "City": "city",
    "Postal Code": "postal_code",
    "Total_Policy_Amt": "total_policy_amount",
    "Premium_Amt_Paid_TillDate": "premium_amt_paid_tilldate",
}

# Apply the renames one by one, in the order listed above.
for raw_name, clean_name in south_column_renames.items():
    df = df.withColumnRenamed(raw_name, clean_name)

In [0]:
from datetime import datetime
from pyspark.sql.functions import udf, col
from pyspark.sql.types import DateType
from pyspark.sql.functions import col, datediff, to_date, floor
from pyspark.sql.functions import lit, col, when
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
from pyspark.sql.types import DateType

def date_format_udf(date_str):
    """Parse a date string written in one of several known layouts.

    The layouts are tried in order and the first one that matches wins.
    ``strptime`` resolves a two-digit year (``%y``) to 1969-2068; when that
    lands on a year later than the current one, the date is moved back one
    century (e.g. ``'15-Mar-60'`` -> 1960-03-15). Four-digit years are
    explicit in the input and are returned exactly as written.

    Args:
        date_str: Raw date text, or ``None``.

    Returns:
        A ``datetime.date``, or ``None`` when ``date_str`` is ``None`` or
        matches none of the layouts.
    """
    if date_str is None:
        return None

    formats = ['%d-%b-%y', '%b %d-%Y', '%m/%d/%Y', '%m/_%d/%Y', '%m/ %b/%Y', '%d-%b-%Y', '%m/%d/%y', '%d-%m-%Y']
    current_year = datetime.now().year

    for fmt in formats:
        try:
            date_obj = datetime.strptime(date_str, fmt)
        except ValueError:
            # This layout does not match; try the next one.
            continue

        # Only a two-digit year can be placed in the wrong century by
        # strptime, so the correction is limited to '%y' layouts.
        if '%y' in fmt and date_obj.year > current_year:
            date_obj = date_obj.replace(year=date_obj.year - 100)
        return date_obj.date()

    # No layout matched.
    return None

# Wrap the Python date parser as a Spark UDF that yields DateType values.
fix_dob_udf = udf(date_format_udf, DateType())

df_new = df.withColumn("DOB", fix_dob_udf(col("DOB")))

# Number the rows by customer_id so that two specific rows can be patched
# by position.
# NOTE(review): a window without partitionBy pulls every row into a single
# partition, and the numbering is only stable if customer_id is unique --
# confirm, or patch by customer_id instead of by row position.
window_spec = Window.orderBy("customer_id")

# Add row_number column
df_new = df_new.withColumn("row_number", row_number().over(window_spec))

# Manually set the DOB of row 269 and row 271.
# 1982 is not a leap year, so "1982-02-29" is not a real date: casting it
# gives NULL (or fails under ANSI mode). The last valid day of that month,
# 1982-02-28, is used instead, mirroring the 30-day-April fix for row 271.
df_new = df_new.withColumn(
    "DOB",
    when(col("row_number") == 269, lit("1982-02-28").cast(DateType()))
    .when(col("row_number") == 271, lit("1994-04-30").cast(DateType()))
    .otherwise(col("DOB")),
)

# The helper column is only needed for the patch above.
df_new = df_new.drop("row_number")

# Parse the effective and policy start dates with the same UDF.
df_new = df_new.withColumn("effective_start_date", fix_dob_udf(col("effective_start_date")))
df_new = df_new.withColumn("effective_end_date", fix_dob_udf(col("effective_end_date")))

df_new = df_new.withColumn("policy_start_date", fix_dob_udf(col("policy_start_date")))


# This UDF is used to parse the policy end date column.

def date_format_udf_Policy(date_str):
    """Parse a policy end date written in one of several known layouts.

    The layouts are tried in order and the first one that matches wins.
    The parsed date is returned as-is: years later than the current year
    are kept, with no century adjustment.

    Args:
        date_str: Raw date text, or ``None``.

    Returns:
        A ``datetime.date``, or ``None`` when ``date_str`` is ``None`` or
        matches none of the layouts.
    """
    if date_str is None:
        return None

    formats = ['%d-%b-%y', '%b %d-%Y', '%m/%d/%Y', '%m/_%d/%Y', '%m/ %b/%Y', '%d-%b-%Y', '%m/%d/%y', '%d-%m-%Y']

    for fmt in formats:
        try:
            return datetime.strptime(date_str, fmt).date()
        except ValueError:
            # This layout does not match; try the next one.
            continue

    # No layout matched.
    return None

# Spark UDF wrapper around the policy-end-date parser.
fix_dob_udf_new = udf(date_format_udf_Policy, DateType())

# Parse the remaining raw date columns into DateType values.
df_new = (
    df_new
    .withColumn("policy_end_date", fix_dob_udf_new(col("policy_end_date")))
    .withColumn("next_premium_date", fix_dob_udf(col("next_premium_date")))
    .withColumn("actual_premium_paid_date", fix_dob_udf(col("actual_premium_paid_date")))
)

# ZIP codes: keep digits only, then left-pad with zeros to a width of five.
zip_digits_only = regexp_replace(col("postal_code"), "[^0-9]", "")
df_new = df_new.withColumn("postal_code", zip_digits_only)
df_new = df_new.withColumn("postal_code", lpad(col("postal_code"), 5, "0"))

# Monetary columns are stored as integers.
for amount_column in ("premium_amount", "total_policy_amount", "premium_amt_paid_tilldate"):
    df_new = df_new.withColumn(amount_column, col(amount_column).cast("integer"))

# Country is a constant; a missing or empty region falls back to "South".
df_new = df_new.withColumn("country", lit("USA"))
region_is_missing = col("region").isNull() | (col("region") == "")
df_new = df_new.withColumn("region", when(region_is_missing, "South").otherwise(col("region")))

# Age at policy start: days between DOB and policy_start_date, divided by
# 365 and rounded down.
age_at_policy_start = floor(datediff(col("policy_start_date"), col("DOB")) / 365)
df_new = df_new.withColumn("age", age_at_policy_start)

# effective_end_date is overwritten with the policy end date.
df_new = df_new.withColumn("effective_end_date", col("policy_end_date"))

from pyspark.sql.functions import col, datediff, lit, floor

# Tenure in whole years: days from policy_start_date to
# actual_premium_paid_date, divided by 365 and rounded down.
tenure_in_years = floor(datediff(col("actual_premium_paid_date"), col("policy_start_date")) / 365)
df_new = df_new.withColumn("customer_tenure_years", tenure_in_years)




In [0]:
# First pass of honorific clean-up on customer_name.
# Each (pattern, replacement) pair is applied in order; all patterns are
# case-insensitive and anchored at the start of the name.
south_first_pass_titles = [
    (r"(?i)^\s*doctor\s+", "Dr."),
    (r"(?i)^\s+professor\s+", "Prof."),
    (r"(?i)^\s*Mistress\s*", "Mrs"),
]

Named_df = df_new
for title_pattern, title_text in south_first_pass_titles:
    Named_df = Named_df.withColumn(
        "customer_name",
        regexp_replace(col("customer_name"), title_pattern, title_text),
    )

In [0]:
# Second pass of honorific clean-up on customer_name. The rules run in the
# listed order, and later rules act on the output of earlier ones.
south_second_pass_titles = [
    (r"(?i)^\s*doctor\s*", "Dr"),
    (r"(?i)^\s*professor\s*", "Prof"),
    (r"(?i)^\s*Mrs\s*", "Mrs."),
    (r"^(?i)Dr!\s*([A-Za-z]+)\*\*?\s*([A-Za-z]+)$", "DR.$1 $2"),
    (r"^(?i)Dr\.([A-Za-z])", "Dr. $1"),
    (r"^\s*(?i)Dr\.", "Dr."),
]

Named_final = Named_df
for title_pattern, title_text in south_second_pass_titles:
    Named_final = Named_final.withColumn(
        "customer_name",
        regexp_replace(col("customer_name"), title_pattern, title_text),
    )

# Finally, put "Prof. " in front of the word that follows a leading "Prof".
prof_with_name = regexp_replace(col("customer_name"), r"(?i)^\s*Prof\s*([A-Za-z]+)", "Prof. $1")
df_south = Named_final.withColumn("customer_name", prof_with_name)

display(df_south)

# READING WEST DATA AND UNIONING WITH SOUTH

In [0]:
# Bronze-layer location of the West region insurance extract.
bronze_path_west = "/mnt/mock_prajwal/bronze/Insurance_details_West_0"

# Load the Parquet files at that location into a DataFrame.
df_west = spark.read.load(bronze_path_west, format="parquet")

In [0]:
# Preview the West bronze data as loaded, before any renaming or cleaning.
display(df_west)

In [0]:
# Raw bronze column name -> snake_case silver column name.
# Keys are spelled exactly as the original rename calls spelled them.
west_column_renames = {
    "Customer ID": "customer_id",
    "Customer Name": "customer_name",
    "Customer_Segment": "customer_segment",
    "Maritial_Status": "marital_status",
    "Gender": "gender",
    "Effective_Start_Dt": "effective_start_date",
    "Effective_End_Dt": "effective_end_date",
    "Policy_Type_Id": "policy_type_id",
    "Policy_Id": "policy_id",
    "Premium_Amt": "premium_amount",
    "Policy_Start_Dt": "policy_start_date",
    "Policy_End_Dt": "policy_end_date",
    "Next_Premium_Dt": "next_premium_date",
    "Actual_Premium_Paid_Dt": "actual_premium_paid_date",
    "Country": "country",
    "Region": "region",
    "State or Province": "state",
    "City": "city",
    "Postal Code": "postal_code",
    "Total_Policy_Amt": "total_policy_amount",
    "Premium_Amt_Paid_TillDate": "premium_amt_paid_tilldate",
}

# Apply the renames one by one, in the order listed above.
for raw_name, clean_name in west_column_renames.items():
    df_west = df_west.withColumnRenamed(raw_name, clean_name)

display(df_west)

In [0]:
# First pass of honorific clean-up on the West customer_name column.
# Each (pattern, replacement) pair is applied in order; all patterns are
# case-insensitive and anchored at the start of the name.
west_first_pass_titles = [
    (r"(?i)^\s*doctor\s+", "Dr."),
    (r"(?i)^\s+professor\s+", "Prof."),
    (r"(?i)^\s*Mistress\s*", "Mrs"),
]

Named_df = df_west
for title_pattern, title_text in west_first_pass_titles:
    Named_df = Named_df.withColumn(
        "customer_name",
        regexp_replace(col("customer_name"), title_pattern, title_text),
    )


In [0]:
# Second pass of honorific clean-up on the West customer_name column. The
# rules run in the listed order, and later rules act on the output of
# earlier ones.
west_second_pass_titles = [
    (r"(?i)^\s*doctor\s*", "Dr"),
    (r"(?i)^\s*professor\s*", "Prof"),
    (r"(?i)^\s*Mrs\s*", "Mrs."),
    (r"^(?i)Dr!\s*([A-Za-z]+)\*\*?\s*([A-Za-z]+)$", "DR.$1 $2"),
    (r"^(?i)Dr\.([A-Za-z])", "Dr. $1"),
    (r"^\s*(?i)Dr\.", "Dr."),
    (r"(?i)^\s*Prof\s*([A-Za-z]+)", "Prof. $1"),
]

Named_final = Named_df
for title_pattern, title_text in west_second_pass_titles:
    Named_final = Named_final.withColumn(
        "customer_name",
        regexp_replace(col("customer_name"), title_pattern, title_text),
    )

In [0]:
# Parse the West DOB strings into dates with the shared date UDF.
parsed_dob = fix_dob_udf(col("DOB"))
df_west = Named_final.withColumn("DOB", parsed_dob)

display(df_west)

In [0]:
# Parse the effective and policy start dates with the shared date UDF.
for date_column in ("effective_start_date", "effective_end_date", "policy_start_date"):
    df_west = df_west.withColumn(date_column, fix_dob_udf(col(date_column)))

# The policy end date goes through its own UDF.
df_west = df_west.withColumn("policy_end_date", fix_dob_udf_new(col("policy_end_date")))

# Premium dates use the shared date UDF again.
df_new = df_west
for date_column in ("next_premium_date", "actual_premium_paid_date"):
    df_new = df_new.withColumn(date_column, fix_dob_udf(col(date_column)))

# ZIP codes: keep digits only, then left-pad with zeros to a width of five.
zip_digits_only = regexp_replace(col("postal_code"), "[^0-9]", "")
df_new = df_new.withColumn("postal_code", zip_digits_only)
df_new = df_new.withColumn("postal_code", lpad(col("postal_code"), 5, "0"))

# Monetary columns are stored as integers.
for amount_column in ("premium_amount", "total_policy_amount", "premium_amt_paid_tilldate"):
    df_new = df_new.withColumn(amount_column, col(amount_column).cast("integer"))

# Country is a constant for every row.
df_west = df_new.withColumn("country", lit("USA"))


display(df_west)

In [0]:
# Fill missing payment dates with a fixed default of 2010-01-01.
# The default is cast to DateType so both branches of the when/otherwise
# are dates; a bare string literal here would let Spark promote the whole
# column to string.
default_paid_date = lit("2010-01-01").cast(DateType())
df_west = df_west.withColumn(
    "actual_premium_paid_date",
    when(col("actual_premium_paid_date").isNull(), default_paid_date)
    .otherwise(col("actual_premium_paid_date")),
)

# Drop rows that have no customer_id.
df_west = df_west.filter(col("customer_id").isNotNull())

# One-row summary: for each column, the number of NULL values remaining.
null_counts = df_west.select([count(when(col(c).isNull(), c)).alias(c) for c in df_west.columns])
display(null_counts)

In [0]:
# Age at policy start: days between DOB and policy_start_date, divided by
# 365 and rounded down.
age_at_policy_start = floor(datediff(col("policy_start_date"), col("DOB")) / 365)
df_new = df_west.withColumn("age", age_at_policy_start)

# effective_end_date is overwritten with the policy end date.
df_new = df_new.withColumn("effective_end_date", col("policy_end_date"))

from pyspark.sql.functions import col, datediff, lit, floor

# Tenure in whole years: days from policy_start_date to
# actual_premium_paid_date, divided by 365 and rounded down.
tenure_in_years = floor(datediff(col("actual_premium_paid_date"), col("policy_start_date")) / 365)
df_west = df_new.withColumn("customer_tenure_years", tenure_in_years)

In [0]:
# Preview the West DataFrame after the age and customer_tenure_years columns were added.
display(df_west)

In [0]:
# Stack the South and West DataFrames into one.
# unionByName matches columns by name; a plain union() matches them by
# position and would silently mix up values if the two regions ever had
# their columns in a different order.
df_combined = df_south.unionByName(df_west)
display(df_combined)

In [0]:
from pyspark.sql.functions import col, datediff, when

# Late payment category, based on days_late = actual_premium_paid_date
# minus next_premium_date (in days):
#   negative   -> "Paid Early"
#   0 to 30    -> "Slightly Late"
#   31 to 90   -> "Moderately Late"
#   above 90   -> "Severely Late"
# There is no default branch, so a NULL days_late gives a NULL category.
# NOTE(review): a payment made exactly on the due date (0 days) is labelled
# "Slightly Late" -- confirm this is intended.

days_late_expr = datediff(col("actual_premium_paid_date"), col("next_premium_date"))
df_combined = df_combined.withColumn("days_late", days_late_expr)

late_days = col("days_late")
payment_category = (
    when(late_days < 0, "Paid Early")
    .when(late_days.between(0, 30), "Slightly Late")
    .when((late_days > 30) & (late_days <= 90), "Moderately Late")
    .when(late_days > 90, "Severely Late")
)
df_combined = df_combined.withColumn("late_payment_category", payment_category)

display(df_combined)

In [0]:
# Silver-layer target for the combined South + West data.
silver_path = "/mnt/mock_prajwal/silver/Insurance_details_South_West"

# Configure the writer step by step: replace any existing data (and its
# schema), store as Delta, and partition by the ingestion_time column.
silver_writer = (
    df_combined.write
    .mode("overwrite")
    .format("delta")
    .partitionBy("ingestion_time")
    .option("overwriteSchema", "true")
)
silver_writer.save(silver_path)