In [0]:
from delta.tables import DeltaTable
from pyspark.sql.functions import max, col


In [0]:
# Source landing folder (the trailing * globs every entry in it) and the
# location of the bronze Delta table.
source_path = "/Volumes/customer_360/customer_360_source/source_sales_volume/*"
bronze_path = "/Volumes/customer_360/customer_360_bronze/bronze_sales_volume/"

# Load every matching CSV into one DataFrame. The first line of each file is
# read as the header, and column types are inferred from the data.
df_sales = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(source_path)
)


In [0]:

# Determine the incremental-load watermark: the largest data_arrival_timestamp
# already stored in bronze, or None when there is nothing to compare against.
# NOTE: `max` here is pyspark.sql.functions.max (imported at the top of the
# notebook), not the Python builtin.
max_ts = None  # stays None on the first load (no bronze table yet)
if DeltaTable.isDeltaTable(spark, bronze_path):
    bronze_table = DeltaTable.forPath(spark, bronze_path)
    # A global max() always yields exactly one row; its value is NULL (None)
    # when the table holds no non-null data_arrival_timestamp values.
    max_ts_row = bronze_table.toDF().select(max("data_arrival_timestamp")).collect()[0]
    max_ts = max_ts_row[0]
    if max_ts is None:
        print("Bronze table is empty. Will load all records.")
else:
    print("Bronze table not found. Will load all records.")

# Keep only source rows strictly newer than the watermark.
# Compare against None explicitly instead of relying on truthiness: the column
# type comes from inferSchema, so a legitimate but falsy watermark (e.g. 0 or
# an empty string) must still be applied rather than triggering a full reload
# that would append duplicates to bronze.
if max_ts is not None:
    df_to_load = df_sales.filter(col("data_arrival_timestamp") > max_ts)
else:
    df_to_load = df_sales  # no watermark: take all records

# count() is an action, so this line runs a Spark job over the source data.
print(f"Number of records to load: {df_to_load.count()}")




In [0]:
# Write the incremental slice to the bronze Delta location in append mode,
# so rows that are already stored there are left untouched.
(
    df_to_load.write
    .format("delta")
    .mode("append")
    .save(bronze_path)
)

print("New records appended to bronze successfully.")

In [0]:
from datetime import datetime
from pyspark.sql.functions import max
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, LongType

# Compute the row count and the newest data_arrival_timestamp of the loaded
# slice in ONE Spark job. df_to_load is lazy, so separate count() and max()
# actions would each re-read the source; a single global aggregation avoids
# the second scan and guarantees both figures come from the same pass.
# A global aggregation over zero rows still returns one row (count = 0,
# max = NULL), so the "nothing loaded" case needs no special handling.
load_stats = df_to_load.selectExpr(
    "count(*) AS records_count",
    "max(data_arrival_timestamp) AS max_data_timestamp",
).first()

# Number of records in this load (0 when nothing new arrived)
records_count = load_stats["records_count"]

# Newest timestamp in this load, or None when no rows were loaded.
# (Despite the name, this holds the scalar value, not a Row.)
max_data_ts_row = load_stats["max_data_timestamp"]

# Use Python datetime for load_time (driver clock at the time this cell runs)
load_time = datetime.now()

# Define the audit schema explicitly so the types are fixed even when
# max_data_timestamp is None and could not be inferred from the data.
schema = StructType([
    StructField("layer", StringType(), True),
    StructField("table_name", StringType(), True),
    StructField("load_time", TimestampType(), True),
    StructField("records_loaded", LongType(), True),
    StructField("max_data_timestamp", TimestampType(), True)
])

# One audit row per run, written even if 0 rows were loaded
data = [("bronze", "bronze_sales", load_time, records_count, max_data_ts_row)]

# Create DataFrame
df_audit = spark.createDataFrame(data, schema)

# Append to audit table
df_audit.write.format("delta") \
    .mode("append") \
    .save("/Volumes/customer_360/audit/audit_volume/etl_audit")

print(f"Audit log updated successfully. Records loaded: {records_count}")
