* Remove duplicates
* Clean state values
* Possibly join with geolocation to enrich

In [0]:
# Read the bronze-layer customers table and preview it.
df_customer = spark.table("mycatalog.olist_ecommerce_bronze.customers")
display(df_customer)

# DATA CLEANING

- CITY and STATE COLUMNS
  - CONVERT TO UPPERCASE
  - TRIM LEADING AND TRAILING SPACES

In [0]:
from pyspark.sql.functions import *

In [0]:
# Normalize the location columns: uppercase first, then strip leading/trailing spaces.
city_normalized = trim(upper(col("customer_city")))
state_normalized = trim(upper(col("customer_state")))

df_silver = (
    df_customer
    .withColumn("customer_city", city_normalized)
    .withColumn("customer_state", state_normalized)
)
    


# REMOVE DUPLICATES
- BY THE CUSTOMER ID COLUMN

In [0]:
# Keep a single row per customer_id. dropDuplicates does not specify which
# of several duplicate rows is the one retained.
dedup_keys = ["customer_id"]
dedup_silver = df_silver.dropDuplicates(dedup_keys)

## DATA QUALITY CHECKS
- NOT NULL `customer_id`
- VALID STATE LENGTH (2 CHARACTERS)

In [0]:
# Data-quality rule: discard rows that have no customer_id.
has_customer_id = col("customer_id").isNotNull()
dqnull_silver = dedup_silver.where(has_customer_id)

In [0]:
# Data-quality rule: keep only rows whose state code is exactly two characters long.
state_is_two_chars = length(col("customer_state")) == 2
dqlen_silver = dqnull_silver.where(state_is_two_chars)

### ZIP Code Is Not a Number (It’s an Identifier)

- Even though it contains digits, a ZIP code:

❌ Is not used for math
❌ Should not be summed
❌ Should not be averaged
❌ Should not be used in calculations


We store identifiers like this as strings, not integers.

- Leading Zero Problem (Very Important)
If you store a ZIP code as an integer:
01234 → becomes 1234

You permanently lose the leading zero.


In [0]:
# Store the ZIP prefix as a 5-character string, left-padded so leading zeros survive.
# lpad's pad argument must be a string, so the fill character is given as "0".
df_customer_final = dqlen_silver.withColumn(
    "customer_zip_code_prefix",
    lpad(col("customer_zip_code_prefix").cast("string"), 5, "0"),
)

### SCD 1 CUSTOMER TABLE

In [0]:
from delta.tables import DeltaTable

# Fully qualified name of the silver target table.
catalog_name = "mycatalog"
schema_name = "olist_ecommerce_silver"
table_name = "customers"
target_table = f"{catalog_name}.{schema_name}.{table_name}"

# Attributes overwritten in place when a customer already exists (SCD type 1).
# customer_id is the merge key, so it is only written on insert.
scd1_columns = [
    "customer_unique_id",
    "customer_zip_code_prefix",
    "customer_city",
    "customer_state",
    "source_file",
    "source_file_timestamp",
]
update_values = {name: col(f"s.{name}") for name in scd1_columns}
insert_values = {"customer_id": col("s.customer_id"), **update_values}

# Check the qualified name directly instead of scanning the schema listing:
# the listing also contains temporary views, which could produce a false match.
if spark.catalog.tableExists(target_table):
    print(f"Table {target_table} exists")
    delta_table = DeltaTable.forName(spark, target_table)

    # SCD1: update matched rows, insert new ones, and delete target rows whose
    # customer_id no longer appears in the source, so the table mirrors the source.
    (
        delta_table.alias("t")
        .merge(
            source=df_customer_final.alias("s"),
            condition=col("t.customer_id") == col("s.customer_id"),
        )
        .whenMatchedUpdate(set=update_values)
        .whenNotMatchedInsert(values=insert_values)
        .whenNotMatchedBySourceDelete()
        .execute()
    )
else:
    # First load: create the table from the cleaned DataFrame.
    print(f"Table {target_table} does NOT exist")
    df_customer_final.write.format("delta").mode("overwrite").saveAsTable(target_table)

###  Join with Geolocation Table (Optional Enrichment)

- You can enrich customers in Silver by joining with:
  - olist_geolocation_dataset

### UPPERCASE AND TRIM GEOLOCATION CITY AND STATE

In [0]:
# NOTE(review): df_geolocation is not defined in the cells visible here —
# confirm it is loaded earlier in the notebook.
# Normalize the geolocation text columns: strip surrounding spaces, then uppercase.
geo_city_clean = upper(trim(col("geolocation_city")))
geo_state_clean = upper(trim(col("geolocation_state")))

upper_geo_silver = (
    df_geolocation
    .withColumn("geolocation_city", geo_city_clean)
    .withColumn("geolocation_state", geo_state_clean)
)

## CONVERT ZIP CODE COLUMN TO STRING AND LEFT-PAD WITH ZEROS TO 5 CHARACTERS

In [0]:
# Cast the geolocation ZIP prefix to a 5-character string, left-padded with zeros.
# lpad's pad argument must be a string, so the fill character is given as "0".
df_geolocation_final = upper_geo_silver.withColumn(
    "geolocation_zip_code_prefix",
    lpad(col("geolocation_zip_code_prefix").cast("string"), 5, "0"),
)

### JOIN THE CUSTOMER TABLE WITH GEOLOCATION TABLE

In [0]:
# Read back the silver customers table and attach latitude/longitude by ZIP prefix.
silver_customers_df = spark.table("mycatalog.olist_ecommerce_silver.customers")

zip_prefix_matches = (
    silver_customers_df.customer_zip_code_prefix
    == df_geolocation_final.geolocation_zip_code_prefix
)

# All customer columns, plus the coordinates from the geolocation side.
output_columns = [
    "left.customer_id",
    "left.customer_unique_id",
    "left.customer_zip_code_prefix",
    "left.customer_city",
    "left.customer_state",
    "left.source_file",
    "left.source_file_timestamp",
    "right.geolocation_lat",
    "right.geolocation_lng",
]

# Left join keeps every customer, including those with no matching geolocation row.
# NOTE(review): if geolocation holds several rows per ZIP prefix, each match
# produces its own output row and the customer is repeated — confirm this is intended.
df_customer_geolocation = (
    silver_customers_df.alias("left")
    .join(df_geolocation_final.alias("right"), zip_prefix_matches, "left")
    .select(*output_columns)
)

In [0]:
# Persist the enriched customers as a Delta table. The explicit overwrite mode lets
# this cell be re-run; without a mode, saveAsTable raises if the table already exists.
df_customer_geolocation.write.format("delta").mode("overwrite").saveAsTable(
    "mycatalog.olist_ecommerce_silver.customer_geolocation"
)