# Init

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Read from bronze

In [0]:
# Load the raw ERP customer table from the bronze layer and preview it.
df = spark.read.table("workspace.bronze.erp_cust_az12")

df.display()

# Trimming

In [0]:
# Strip leading/trailing whitespace from every string-typed column.
# Start from the bronze frame and accumulate one trimmed column per iteration.
df1 = df

for field in df.schema.fields:
    if isinstance(field.dataType, StringType):
        # Chain on df1 (the accumulated result), not df: rebuilding from df
        # on each pass would throw away the columns trimmed so far and leave
        # only the last string column trimmed.
        df1 = df1.withColumn(field.name, trim(col(field.name)))

df1.display()

# Date formatting

In [0]:
# Cast BDATE to a date, then blank out (null) any value that is today or later.
bdate_as_date = col("BDATE").cast(DateType())

df2 = df1.withColumn(
    "BDATE",
    when(bdate_as_date >= current_date(), None).otherwise(bdate_as_date),
)
df2.display()


# Normalisation

In [0]:
# Normalise GEN in a single pass:
#   "M"  -> "Male"
#   "F"  -> "Female"
#   null -> "n/a"
#   anything else is kept as-is.
gen = col("GEN")

df3 = df2.withColumn(
    "GEN",
    when(gen == "M", "Male")
    .when(gen == "F", "Female")
    # A null GEN fails both equality tests above and lands here.
    .when(gen.isNull(), "n/a")
    .otherwise(gen),
)
df3.display()

# Customer ID clean up

In [0]:
# Remove the "NASAW" prefix from customer IDs that carry it; other IDs pass through unchanged.
cid = col("CID")
has_nasaw_prefix = cid.startswith("NASAW")
# "NASAW" is 5 characters, so the remainder starts at position 6.
cid_without_prefix = substring(cid, 6, length(cid))

df4 = df3.withColumn(
    "CID",
    when(has_nasaw_prefix, cid_without_prefix).otherwise(cid),
)

df4.display()


# Renaming the column names

In [0]:
# Source column name -> silver-layer column name.
rename_map = dict(
    CID="customer_id",
    BDATE="birth_date",
    GEN="gender",
)

# Apply the renames one at a time on top of the cleaned frame.
df5 = df4
for source_name in rename_map:
    df5 = df5.withColumnRenamed(source_name, rename_map[source_name])

df5.display()

# Writing to Silver table

In [0]:
# Persist the cleaned customers to the silver layer, replacing any existing table contents.
(
    df5.write
    .mode("overwrite")
    .saveAsTable("workspace.silver.erp_customers")
)

# Sanity check in silver table

In [0]:
%sql
-- Preview a handful of rows from the freshly written silver table.
SELECT *
FROM workspace.silver.erp_customers
LIMIT 10