# Init

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Read from Bronze

In [0]:
# Read the Bronze source table into a DataFrame and preview it.
bronze_table = "workspace.bronze.erp_loc_a101"
df = spark.read.table(bronze_table)

df.display()

# Trim string columns

In [0]:
# Strip leading/trailing whitespace from every string column.
#
# The projection is built once and applied with a single select(): calling
# withColumn() in a loop adds one projection per column and can produce very
# large query plans. Column order and names are preserved.
trimmed_cols = []
for field in df.schema.fields:
    # Back-tick quote the name so that names containing dots (or back-ticks)
    # resolve to the literal column rather than to a nested field.
    column_ref = col("`" + field.name.replace("`", "``") + "`")
    if isinstance(field.dataType, StringType):
        column_ref = trim(column_ref).alias(field.name)
    trimmed_cols.append(column_ref)

df1 = df.select(*trimmed_cols)

df1.display()


# Formatting the CID

In [0]:
# Remove every "-" from CID; the column is overwritten in place.
cid_without_dashes = regexp_replace("CID", "-", "")
df2 = df1.withColumn("CID", cid_without_dashes)

display(df2)


# Country normalisation

In [0]:
# Normalise country values:
#   "USA" / "US"   -> "United States"
#   "DE"           -> "Germany"
#   NULL or blank  -> "n/a"
#   anything else is kept unchanged
cntry_col = col("cntry")

# Blank / whitespace-only values are treated as missing as well as NULL;
# otherwise they would pass through as an empty country string.
cntry_missing = cntry_col.isNull() | (trim(cntry_col) == "")

df3 = df2.withColumn(
    "cntry",
    when(cntry_col.isin("USA", "US"), "United States")
    .when(cntry_col == "DE", "Germany")
    .when(cntry_missing, "n/a")
    .otherwise(cntry_col),
)

display(df3)

# Renaming columns

In [0]:
# Source column name -> Silver column name.
rename_map = {"CID": "customer_id", "CNTRY": "country"}

# Apply the renames one at a time, starting from the normalised frame.
# withColumnRenamed is a no-op when the source column is not in the schema.
df4 = df3
for source_name in rename_map:
    df4 = df4.withColumnRenamed(source_name, rename_map[source_name])

df4.display()

# Write to Silver Table

In [0]:
# Persist the transformed frame, replacing any existing contents of the Silver table.
silver_table = "workspace.silver.erp_location"
df4.write.saveAsTable(silver_table, mode="overwrite")

# Sanity check in Silver table

In [0]:
%sql
-- Preview up to 10 rows of the Silver table that was just written.
SELECT *
FROM workspace.silver.erp_location
LIMIT 10