In [0]:
"""
The silver zone is responsible for cleaning and standardizing the data and for applying business rules.
Auto Optimize & Small File Compaction (Delta Optimization)
Memory & Shuffle Optimizations (if transformations occur)
Data Deduplication (Removing Duplicates)
Handling Data Skew (Broadcast Joins & Salting for large joins)
Adjusting Shuffle Partitions & Enabling AQE
Data Type Optimization (Ensuring efficient storage & performance)

"""

In [0]:
%python
from pyspark.sql.functions import *
from datetime import datetime

# Source (bronze) and destination (silver) locations for the customer-details data.
bronze_path = "/mnt/Prajwal/Retail_sales_usecase/bronzeCDetails_day1"
silver_path = "/mnt/Prajwal/Retail_sales_usecase/Silver/SilverCDetails_day1"

# Load the bronze parquet data and standardise two column names.
# NOTE(review): "street_addres" is kept exactly as written — confirm it matches
# the actual bronze column name (it may be a typo for "street_address").
customers_raw = (
    spark.read.format("parquet").load(bronze_path)
    .withColumnRenamed("street_addres", "address")
    .withColumnRenamed("email_id", "email")
)

# Replace nulls in the listed columns with placeholder values.
customers_filled = customers_raw.fillna({
    'first_name': 'Unknown',
    'last_name': 'Unknown',
    'email': 'noemail@example.com',
    'address': 'No Address',
    'state': 'Unknown',
    'zipcode': '00000',
    'contact_no': '000-000-0000'
})

# Trim surrounding whitespace in string columns only. Wrapping every column in
# trim() would silently cast non-string columns (numbers, timestamps) to string,
# so those columns are passed through unchanged.
customers_trimmed = customers_filled.select([
    trim(col(name)).alias(name) if dtype == "string" else col(name)
    for name, dtype in customers_filled.dtypes
])

# `df` is the cleaned frame consumed by the write step further down.
df = (
    customers_trimmed
    # Capitalize the first letter of each word in first_name and last_name.
    .withColumn('first_name', initcap(col('first_name')))
    .withColumn('last_name', initcap(col('last_name')))
    # Strip every character that is not a letter, digit, '@', '.', '_' or '-' from email.
    .withColumn('email', regexp_replace('email', r'[^a-zA-Z0-9@._-]', ''))
    # Keep digits only, then left-pad with '0' to 5 characters (assuming US zip codes).
    # The explicit cast keeps this working if zipcode is stored as a number.
    .withColumn('zipcode', lpad(regexp_replace(col('zipcode').cast('string'), r'[^0-9]', ''), 5, '0'))
    # Keep a single row per customer_id.
    # NOTE(review): which duplicate survives is not controlled here — confirm that is acceptable.
    .dropDuplicates(["customer_id"])
    # Add ingestion_time as ingest_time rendered in yyyy-MM-dd HH:mm:ss form
    # (the original ingest_time column is kept as well).
    .withColumn('ingestion_time', date_format(col('ingest_time'), 'yyyy-MM-dd HH:mm:ss'))
)

def _has_delta_log(path):
    """Return True when `path` contains a non-empty `_delta_log` directory."""
    try:
        return bool(dbutils.fs.ls(f"{path}/_delta_log"))
    except Exception:
        # Listing failed (e.g. the directory does not exist): treat the path as
        # "not a Delta table", which falls back to cleaning it before the write.
        return False


# Clean the target directory only when it is not already a Delta table.
# Deleting an existing Delta table first would discard its transaction log
# (history / time travel) and leave no silver data at all if the write below fails.
if not _has_delta_log(silver_path):
    dbutils.fs.rm(silver_path, True)

# Write the cleaned data to the silver path.
# overwriteSchema lets the overwrite replace the schema of an existing Delta table.
# NOTE(review): ingestion_time is a second-granular string; if it varies per row this
# creates one partition per distinct value — confirm this partitioning is intended.
(
    df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("ingestion_time")
    .save(silver_path)
)

# Build a one-row log entry for this run.
# This point is only reached when the preceding statements in the cell did not
# raise, which is why the status is recorded as "success".
log_data = [(bronze_path, silver_path, "success", datetime.now())]
log_schema = ["bronze_path", "silver_path", "status", "timestamp"]
log_df = spark.createDataFrame(log_data, log_schema)

# Append the entry to the logging table so that earlier runs are preserved;
# writing with "overwrite" would leave only the most recent run in the log.
log_df.write.format("delta").mode("append").save("/mnt/Prajwal/Retail_sales_usecase/Silver_logs")