In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from delta.tables import *

In [0]:
# Table naming configuration. Names are built as <catalog>.<schema>.<table>.
catalog_name = "walmart_purchases"

# Schemas holding the bronze (input) and silver (output) layers.
bronze_schema = ".".join([catalog_name, "bronze"])
silver_schema = ".".join([catalog_name, "silver"])

# Fully qualified names of the table read and the table written by this notebook.
bronze_table = ".".join([bronze_schema, "bronze_purchases"])
silver_table = ".".join([silver_schema, "silver_purchases"])

In [0]:
# Load the bronze purchases table; every later cell derives from this DataFrame.
bronze_df = spark.read.table(bronze_table)

In [0]:
# Preview the bronze data before any cleaning is applied.
# NOTE(review): `.display()` is not defined anywhere in this notebook; presumably it is
# supplied by the notebook runtime (e.g. Databricks) — confirm before running elsewhere.
bronze_df.display()

### Data Cleaning and Type Casting

In [0]:
# Columns whose values get leading/trailing whitespace stripped.
string_columns = [
    "Customer_ID",
    "Gender",
    "City",
    "Category",
    "Product_Name",
    "Payment_Method",
    "Discount_Applied",
    "Repeat_Customer",
]

# ------Trim whitespace for string columns------
cleaned_bronze_df = bronze_df
for column_name in string_columns:
    cleaned_bronze_df = cleaned_bronze_df.withColumn(column_name, trim(col(column_name)))

# ------Cast data types-----
# Purchase_Date is parsed with the "M/d/yyyy" pattern; ingestion_date is cast to a date.
parsed_purchase_date = to_date(col("Purchase_Date"), "M/d/yyyy")
ingestion_as_date = col("ingestion_date").cast("date")

cleaned_bronze_df = cleaned_bronze_df.withColumn("Purchase_Date", parsed_purchase_date)
cleaned_bronze_df = cleaned_bronze_df.withColumn("ingestion_date", ingestion_as_date)

### Drop Duplicates

In [0]:
# Rows that agree on all four of these columns are treated as the same purchase;
# only one row per combination is kept.
duplicate_key_columns = [
    "Customer_ID",
    "Product_Name",
    "Purchase_Date",
    "Purchase_Amount",
]
cleaned_bronze_df = cleaned_bronze_df.dropDuplicates(duplicate_key_columns)

### Handling Null Values

In [0]:
# Discard any row that has a null in one of the required columns.
required_columns = ["Customer_ID", "Purchase_Date", "Purchase_Amount"]
cleaned_bronze_df = cleaned_bronze_df.na.drop(subset=required_columns)

### Date Normalization: Derive Year, Month, Day, and Week Columns

In [0]:
# Derive calendar parts from Purchase_Date, one new column per part.
purchase_date_col = col("Purchase_Date")
date_part_expressions = {
    "Year": year(purchase_date_col),
    "Month": month(purchase_date_col),
    "Day": dayofmonth(purchase_date_col),
    "Week": weekofyear(purchase_date_col),
}

# withColumn replaces a column of the same name, so re-running this cell is harmless.
for part_name, part_expression in date_part_expressions.items():
    cleaned_bronze_df = cleaned_bronze_df.withColumn(part_name, part_expression)


### Data Quality Checks

In [0]:
# Payment methods accepted into the silver layer; any other value is filtered out.
valid_payment_methods = ["Cash on Delivery", "Credit Card", "Debit Card", "UPI"]

# Row-level quality rules. A row is kept only when every rule evaluates to true.
has_customer_id = col("Customer_ID").isNotNull()
has_valid_payment_method = col("Payment_Method").isin(valid_payment_methods)
has_positive_amount = col("Purchase_Amount") > 0

passing_rows = cleaned_bronze_df.where(
    has_customer_id & has_valid_payment_method & has_positive_amount
)

# Stamp each surviving row with the time this notebook processed it.
silver_df = passing_rows.withColumn("processing_timestamp", current_timestamp())

In [0]:
# Writing data to silver table
#
# Strategy:
#   * silver table already exists -> MERGE (upsert) the current batch into it
#   * silver table does not exist -> create it from the current batch
#
# The merge key must identify exactly one purchase. The de-duplication step keeps
# rows that differ only by Product_Name or Purchase_Amount, so matching on
# (Customer_ID, Purchase_Date) alone lets several source rows hit the same target
# row. Delta rejects such a merge, and the coarse key would otherwise overwrite
# unrelated purchases. The key below is the same column set the notebook
# de-duplicates on, so each source row matches at most one target row.
merge_key_columns = ["Customer_ID", "Product_Name", "Purchase_Date", "Purchase_Amount"]

# `<=>` is null-safe equality. Product_Name is not covered by the null filter, and a
# plain `=` never matches NULL keys, which would re-insert those rows on every run.
merge_condition = " AND ".join(f"t.{key} <=> s.{key}" for key in merge_key_columns)

if spark.catalog.tableExists(silver_table):
    delta_silver = DeltaTable.forName(spark, silver_table)

    (
        delta_silver.alias("t")
        .merge(silver_df.alias("s"), merge_condition)
        .whenMatchedUpdateAll()         # refresh purchases already present in silver
        .whenNotMatchedInsertAll()      # insert purchases not yet in silver
        .execute()
    )

else:
    # No target to merge into yet: create the table from this batch.
    silver_df.write \
        .format("delta") \
        .mode("overwrite") \
        .saveAsTable(silver_table)


In [0]:
%sql
-- Inspect the rows currently stored in the silver purchases table.
SELECT *
FROM walmart_purchases.silver.silver_purchases