In [0]:
# DEFINE CUSTOMER BRONZE TABLE PATH
bronze_merchants_df = spark.table("finance_fraudworkspace.bronze_managed.merchants_bronze")

In [0]:
# CLEAN MERCHANTS DATA
from pyspark.sql.functions import col, when, lit, upper, trim
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Start from bronze merchants
merchants_silver = bronze_merchants_df

# CLEAN MERCHANTS NAME
merchants_silver = merchants_silver.withColumn(
    "Merchant_name",
    when(col("Merchant_name").isNull(), lit("UNKNOWN_MERCHANT"))
    .otherwise(trim(col("Merchant_name")))
)

# CLEAN MERCHANTS CATEGORY
merchants_silver = merchants_silver.withColumn(
    "Merchant_category",
    when(col("Merchant_category").isNull(), lit("OTHER"))
    .otherwise(upper(trim(col("Merchant_category"))))
)

# CLEAN COUNTRY CODE
merchants_silver = merchants_silver.withColumn(
    "Merchant_country_code",
    when(col("Merchant_country_code").isNull(), lit("UNK"))
    .otherwise(upper(trim(col("Merchant_country_code"))))
)

# CLEAN RISK LEVEL
merchants_silver = merchants_silver.withColumn(
    "Risk_level",
    when(col("Risk_level").isNull(), lit("UNKNOWN"))
    .otherwise(upper(trim(col("Risk_level"))))
)

# REMOVE DUPLICATES
window_spec = Window.partitionBy("Merchant_id").orderBy(col("Merchant_id"))

merchants_silver = (
    merchants_silver
        .withColumn("row_num", row_number().over(window_spec))
        .filter(col("row_num") == 1)
        .drop("row_num")
)

display(merchants_silver)

Merchant_id,Merchant_name,Merchant_category,Merchant_country_code,Risk_level
M001,Amazon,RETAIL,USA,LOW
M002,OnlineCasino,GAMBLING,MT,HIGH
M003,Airline,TRAVEL,UK,MEDIUM
M004,ElectronicShop,RETAIL,DE,LOW
M005,CryptoExhange,FINANCE,UNK,HIGH
M006,UnknownStore,OTHER,USA,UNKNOWN


In [0]:
silver_merchants_clean = "finance_fraudworkspace.silver_managed.merchants_silver"

merchants_silver.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable(silver_merchants_clean)

display(silver_merchants_clean)

'finance_fraudworkspace.silver_managed.merchants_silver'