In [0]:
from pyspark.sql.functions import col, trim, lower, when, regexp_replace, to_date, year, month

In [0]:
# Remove all widgets currently defined on this notebook.
dbutils.widgets.removeAll()

In [0]:
# Notebook configuration (hard-coded values).
table_name = "ecomm_sales"      # table name, shared by the source and the target
target_schema_name = "silver"   # schema the result is written to
_env = "dev"                    # first component of the three-part table names

In [0]:
# Schema the source table is read from.
input_schema = "bronze"
# Three-part name of the source table: <_env>.<input_schema>.<table_name>.
input_table_name = f"{_env}.{input_schema}.{table_name}"

In [0]:
# Load the source table named by input_table_name into a DataFrame.
df = spark.read.table(input_table_name)

### Transformation, data cleansing, and feature engineering.

In [0]:
# Bring every column to its target data type and normalise the text columns.


def _normalised_text(name):
    """Return the column `name` lower-cased, then stripped of surrounding whitespace."""
    return trim(lower(col(name)))


def _as_double(name):
    """Return the column `name` cast to double."""
    return col(name).cast("double")


casted_df = (
    df
    # Date is parsed with the day-month-year pattern "dd-MM-yyyy".
    .withColumn("Date", to_date(col("Date"), "dd-MM-yyyy"))
    .withColumn("Product_Category", _normalised_text("Product_Category"))
    .withColumn("Price", _as_double("Price"))
    .withColumn("Discount", _as_double("Discount"))
    .withColumn("Customer_Segment", _normalised_text("Customer_Segment"))
    .withColumn("Marketing_Spend", _as_double("Marketing_Spend"))
    .withColumn("Units_Sold", col("Units_Sold").cast("int"))
)

In [0]:
# Data cleansing
# - Reduce the two text columns to the letters a-z.
# - Drop rows that have no Date or no usable Product_Category.
# - A negative Price becomes null; negative Discount, Marketing_Spend and
#   Units_Sold become 0.
cleansed_df = (
    casted_df
    # Strip non-letters BEFORE filtering so the filter checks the final
    # category value. Replacing an existing column with withColumn keeps its
    # position, so the output column order is unchanged.
    .withColumn("Product_Category", regexp_replace(col("Product_Category"), "[^a-z]", ""))
    .withColumn("Customer_Segment", regexp_replace(col("Customer_Segment"), "[^a-z]", ""))
    .filter(
        col("Date").isNotNull()
        & col("Product_Category").isNotNull()
        # A category containing no letters at all is "" after the regexp,
        # which is as unusable as a null one, so it is dropped as well.
        & (col("Product_Category") != "")
    )
    .withColumn("Price", when(col("Price") < 0, None).otherwise(col("Price")))
    .withColumn("Discount", when(col("Discount") < 0, 0).otherwise(col("Discount")))
    .withColumn("Marketing_Spend", when(col("Marketing_Spend") < 0, 0).otherwise(col("Marketing_Spend")))
    .withColumn("Units_Sold", when(col("Units_Sold") < 0, 0).otherwise(col("Units_Sold")))
)

In [0]:
# Feature engineering: derive calendar parts of the order date and a revenue figure.
order_date = col("Date")

# NOTE(review): this subtracts Discount directly from Price, i.e. it treats
# Discount as an absolute amount in the same unit as Price — confirm it is
# not a percentage.
discounted_price = col("Price") - col("Discount")

featured_df = (
    cleansed_df
    .withColumn("order_month", month(order_date))
    .withColumn("order_year", year(order_date))
    # Revenue is null whenever Price, Discount or Units_Sold is null.
    .withColumn("Revenue", discounted_price * col("Units_Sold"))
)

### Saving to the silver table.

In [0]:
# Three-part name of the target table: <_env>.<target_schema_name>.<table_name>.
target_table_name = f"{_env}.{target_schema_name}.{table_name}"

# Replace the table contents with featured_df, written in Delta format with
# the mergeSchema option enabled.
(
    featured_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(target_table_name)
)