In [0]:
# DEFINE CUSTOMER BRONZE TABLE PATH
bronze_customers_df = spark.table("finance_fraudworkspace.bronze_managed.customers_bronze")

In [0]:
# CLEAN CUSTOMER DATA
from pyspark.sql.functions import col, to_timestamp, to_date

clean_customers_df = (
    bronze_customers_df
    .withColumn("created_timestamp", to_timestamp(col("created_timestamp"), "dd-MM-yyyy HH:mm"))
    .withColumn("DOB", to_date(col("DOB"), "dd-MM-yyyy"))
)


In [0]:
from pyspark.sql.functions import when, lit, concat, upper
# CLEAN CUSTOMER_ID
customers_silver = clean_customers_df.filter(col("Customer_id").isNotNull())

# CLEAN FULL NAME 
customers_silver = customers_silver.withColumn(
    "Full_name", when(col("Full_name").isNull(), lit("Unknown"))
    .otherwise(col("Full_name"))
)

# CLEAN DOB
customers_silver = customers_silver.withColumn(
    "DOB", when(col("DOB").isNull(), lit("2001-01-01"))
    .otherwise(col("DOB"))
)

# CLEAN EMAIL
customers_silver = customers_silver.withColumn(
    "Email", when(col("Email").isNull(), 
                   concat(col("Customer_id"), lit("@noemail.com")))
    .otherwise(col("Email"))
)

# CLEAN RISK LEVEL
customers_silver = customers_silver.withColumn(
    "Risk_level", when(col("Risk_level").isNull(), lit("UNKNOWN"))
    .otherwise(upper(col("Risk_level")))
)

# REMOVE DUPLICATES
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

window_spec = Window.partitionBy("customer_id").orderBy(col("created_timestamp").desc())

customers_silver = (
    customers_silver
        .withColumn("row_num", row_number().over(window_spec))
        .filter(col("row_num") == 1)
        .drop("row_num")
)

display(customers_silver)

Customer_id,Full_name,DOB,Email,Risk_level,created_timestamp
C001,Alice Brown,1988-05-12,alice_brown@outlook.com,LOW,2023-01-10T09:15:00Z
C002,Bob Smith,1975-09-22,C002@noemail.com,MEDIUM,2023-02-14T11:20:00Z
C003,Charlie Lee,2001-01-01,charlielee@gmail.com,HIGH,2023-03-01T14:05:00Z
C004,David Wilson,1985-03-18,davidwilson1985@gmail.com,UNKNOWN,2023-04-12T16:45:00Z
C005,Unknown,2000-07-25,evadunne@gmail.com,HIGH,2023-05-20T10:30:00Z
C006,Fatima Khan,1998-12-01,fatimakhxn@gmail.com,LOW,2023-06-18T08:10:00Z


In [0]:
silver_customer_clean = "finance_fraudworkspace.silver_managed.customers_silver"

customers_silver.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable(silver_customer_clean)

display(silver_customer_clean)

'finance_fraudworkspace.silver_managed.customers_silver'