In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from delta.tables import DeltaTable
from pyspark.sql.window import Window

In [0]:
# Obtain (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("read bronze data")
    .getOrCreate()
)

# Read the bronze customer Delta data that feeds the rest of the notebook.
bronze_volume = "/Volumes/customer_360/customer_360_bronze/bronze_customer_volume"
df_customer_bronze = (
    spark.read
    .format("delta")
    .load(bronze_volume)
)

In [0]:
# Incremental-load watermark.
# The newest data_arrival_timestamp already stored at silver_path decides which
# bronze rows still have to be processed; with no watermark every row is taken.
src_bronze_path="/Volumes/customer_360/customer_360_bronze/bronze_customer_volume"
silver_path="/Volumes/customer_360/customer_360_silver/silver_customer_volume"

max_ts = None  # None means "no watermark yet" -> full load

if DeltaTable.isDeltaTable(spark, silver_path):
    # The watermark is read from the table at silver_path, so name and log it as such.
    silver_table = DeltaTable.forPath(spark, silver_path)
    # `max` is pyspark.sql.functions.max (via the star import), not the builtin.
    max_ts = silver_table.toDF().select(max("data_arrival_timestamp")).collect()[0][0]
    if max_ts is None:
        # The aggregate returns NULL when the table has no rows.
        print("Silver table is empty. Will load all records.")
else:
    print("Silver table not found. Will load all records.")

# Filter source for incremental load.
# Compare against None explicitly so a legitimate but falsy watermark value
# cannot be mistaken for "no watermark".
if max_ts is not None:
    df = df_customer_bronze.filter(col("data_arrival_timestamp") > max_ts)
else:
    df = df_customer_bronze  # first load, take all records

print(f"Number of records to load: {df.count()}")

In [0]:
# Keep a single row per (customer_id, data_arrival_timestamp) combination.
dedup_keys = ["customer_id", "data_arrival_timestamp"]
df = df.dropDuplicates(dedup_keys)


In [0]:
# Discard rows in which any of the mandatory attributes is null.
required_columns = ["customer_id", "customer_name", "region", "data_arrival_timestamp"]
df = df.na.drop(how="any", subset=required_columns)

In [0]:
# Replace nulls in the optional attributes with explicit placeholder values.
# The postal-code placeholder is the string "00000": the bare literal 00000 is
# just the integer 0 in Python, so the five-digit form was being lost.
# NOTE(review): Spark is expected to cast a dict replacement to the column's
# type, so a numeric postal_code column would still receive 0 — confirm against
# the actual postal_code schema.
df = df.fillna({
    "segment": "Unknown",
    "country": "Unknown",
    "city": "Unknown",
    "state": "Unknown",
    "postal_code": "00000",
    "age": 0,
})

In [0]:
# Retain only rows whose customer_id begins with the "CUST" prefix.
has_cust_prefix = col("customer_id").startswith("CUST")
df = df.where(has_cust_prefix)

In [0]:
# Reject rows whose customer_name starts with a digit.
name_starts_with_digit = col("customer_name").rlike("^[0-9]")
df = df.where(~name_starts_with_digit)

In [0]:
# Reset ages below 0 or above 105 to 0; every other value is kept unchanged.
age_out_of_range = (col("age") < 0) | (col("age") > 105)
cleaned_age = when(age_out_of_range, 0).otherwise(col("age"))
df = df.withColumn("age", cleaned_age)

In [0]:
# Add the is_deleted column, set to False for every row in this batch.
not_deleted = lit(False)
df = df.withColumn("is_deleted", not_deleted)

In [0]:
# Render the cleaned batch for visual inspection.
# NOTE(review): `display()` is not a core PySpark DataFrame method — presumably
# supplied by the notebook runtime (e.g. Databricks); confirm before running elsewhere.
df.display()

In [0]:
# Append the cleaned batch to the silver Delta table, partitioned by region.
# The target is `silver_path` — the same location the incremental watermark is
# read from — so the read and write sides cannot drift apart.
# mergeSchema lets new columns in the batch be added to the table schema.
(
    df.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .partitionBy("region")
    .save(silver_path)
)


In [0]:
from datetime import datetime
from pyspark.sql.functions import count, lit, max
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, LongType

# Audit logging: record how many rows this run produced and the newest
# data_arrival_timestamp among them, then append that to the ETL audit table.

# Compute both figures in ONE aggregation (a single Spark job) instead of a
# separate count() followed by a second max() pass over the same lineage.
# An aggregation without grouping always yields exactly one row; on an empty
# batch the count is 0 and the max is NULL/None, so no special-casing is needed.
batch_stats = df.agg(
    count(lit(1)).alias("records_loaded"),
    max("data_arrival_timestamp").alias("max_data_timestamp"),
).first()

records_count = batch_stats["records_loaded"]
max_data_ts_row = batch_stats["max_data_timestamp"]  # None when the batch is empty

# Flag empty batches in the logged table name.
msg = " (Source Empty)" if records_count == 0 else ""

# Use Python datetime for load_time.
# NOTE(review): datetime.now() is timezone-naive — confirm the cluster/session
# timezone matches what audit consumers expect.
load_time = datetime.now()

# Define schema explicitly so the audit row's types do not depend on inference
# (max_data_timestamp may be None).
schema = StructType([
    StructField("layer", StringType(), True),
    StructField("table_name", StringType(), True),
    StructField("load_time", TimestampType(), True),
    StructField("records_loaded", LongType(), True),
    StructField("max_data_timestamp", TimestampType(), True)
])

# Prepare audit data (even if 0 rows)
data = [("silver", f"silver_customer{msg}", load_time, records_count, max_data_ts_row)]

# Create DataFrame
df_audit = spark.createDataFrame(data, schema)

# Append to audit table
df_audit.write.format("delta") \
    .mode("append") \
    .save("/Volumes/customer_360/audit/audit_volume/etl_audit")

print(f"Audit log updated successfully. Records loaded: {records_count}")
