# Init

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Read from bronze table

In [0]:
# Load the raw CRM product rows from the bronze layer; `df` is the starting
# frame for the cleaning steps that follow.
df = spark.read.table("workspace.bronze.crm_prd_info")

df.display()


# Trim whitespace from string columns

In [0]:
# Strip leading/trailing whitespace from every string-typed column.
# `df1` is the accumulator: each withColumn must be applied to `df1` itself.
# Chaining from `df` instead would throw away the previous iterations and
# leave only the last string column trimmed.
df1 = df

for field in df.schema.fields:
  # Non-string columns (numbers, dates, ...) are passed through untouched.
  if isinstance(field.dataType, StringType):
    df1 = df1.withColumn(field.name, trim(col(field.name)))

display(df1)

# Product key parsing

In [0]:
# Split the raw product key into two parts:
#   - characters 1-5 become `cat_id`, with "-" replaced by "_"
#   - everything from character 7 onward replaces `prd_key`
# length(prd_key) is passed as the substring length so the slice always runs
# to the end of the string.
raw_key = col("prd_key")
category_expr = regexp_replace(substring(raw_key, 1, 5), "-", "_")
remainder_expr = substring(raw_key, 7, length(raw_key))

df2 = df1.withColumn("cat_id", category_expr)
df3 = df2.withColumn("prd_key", remainder_expr)

df3.display()

# Cost cleanup (replace missing costs with 0)

In [0]:
# Replace NULL product costs with 0; non-NULL costs are kept as they are.
cost_or_zero = coalesce(col("prd_cost"), lit(0))
df4 = df3.withColumn("prd_cost", cost_or_zero)

display(df4)

# Product line normalisation

In [0]:
# Expand the one-letter product-line codes (matched case-insensitively) into
# readable labels. Any other value, including NULL, becomes "n/a".
line_code = upper(col("prd_line"))
line_labels = [
  ("M", "Mountain"),
  ("R", "Road"),
  ("S", "Other Sales"),
  ("T", "Touring"),
]

# Build the WHEN ... WHEN ... chain in the order the codes are listed above.
first_code, first_label = line_labels[0]
line_expr = when(line_code == first_code, first_label)
for code, label in line_labels[1:]:
  line_expr = line_expr.when(line_code == code, label)

df5 = df4.withColumn("prd_line", line_expr.otherwise("n/a"))

df5.display()

# Date casting

In [0]:
# Cast the start and end date columns to Spark's DATE type, one column per step.
start_as_date = col("prd_start_dt").cast("date")
end_as_date = col("prd_end_dt").cast("date")

df6 = df5.withColumn("prd_start_dt", start_as_date)
df7 = df6.withColumn("prd_end_dt", end_as_date)

df7.display()


# Renaming the columns

In [0]:
# Map the abbreviated source column names to descriptive names.
# Keys are the current column names, values are the replacements.
rename_map = {
  "prd_id": "product_id",
  "prd_key": "product_number",
  "cat_id": "category_id",
  "prd_nm": "product_name",
  "prd_line": "product_line",
  "prd_cost": "product_cost",
  "prd_start_dt": "start_date",
  "prd_end_dt": "end_date",
}

# Apply all renames in a single call; names absent from the frame are ignored.
df8 = df7.withColumnsRenamed(rename_map)

df8.display()

# Sanity check of the final DataFrame

In [0]:
# Quick visual check: show a 10-row preview of the renamed frame.
display(df8.limit(10))

# Writing to silver table

In [0]:
# Persist the cleaned product frame as a Delta table, replacing any existing
# contents of the target table.
# NOTE(review): the table name ends in "crm_prodcuts" -- this looks like a typo
# for "crm_products". It is left unchanged because other jobs may already read
# the table under this name; confirm and rename in a coordinated change.
silver_writer = df8.write.format("delta").mode("overwrite")
silver_writer.saveAsTable("workspace.silver.crm_prodcuts")