# Gold Layer

### Deriving values from the data validated in the Silver layer and building a visual dashboard from them


In [0]:
from pyspark.sql import functions as F

# Name of the Silver-layer table this notebook reads. It holds the table name
# that this cell was already loading, so the variable is the single place to
# change it (previously it was assigned a different name and never used).
silver_table = "validated_sales"

# Inclusive reporting window applied to ORDERDATE (ISO yyyy-MM-dd).
GOLD_START_DATE = "2003-01-06"
GOLD_END_DATE = "2005-05-31"

# Load from Silver table
df_silver = spark.table(silver_table)

# Gold layer business transformations:
# 1. Keep only rows whose order date falls inside the reporting window
# 2. Aggregate: total sales, average sales, distinct orders and row count by year/month
# This cell only previews the result; it does not write a table.

# Compare on the calendar date rather than the raw column: if ORDERDATE carries
# a time of day, a plain `<= "2005-05-31"` would drop orders placed after
# midnight on the final day of the window.
order_day = F.to_date(F.col("ORDERDATE"))
window_start = F.lit(GOLD_START_DATE).cast("date")
window_end = F.lit(GOLD_END_DATE).cast("date")

df_gold = (
    df_silver
    .filter(order_day.between(window_start, window_end))  # between() is inclusive on both ends
    .groupBy(
        F.year("ORDERDATE").alias("year"),
        F.month("ORDERDATE").alias("month")
    )
    .agg(
        F.sum("SALES").alias("total_sales"),
        F.avg("SALES").alias("avg_sales"),
        F.countDistinct("ORDERNUMBER").alias("unique_orders"),
        F.count("*").alias("row_count")
    )
    .orderBy("year", "month")
)

# Show the first five months as a quick sanity check.
display(df_gold.limit(5))

year,month,total_sales,avg_sales,unique_orders,row_count
2003,1,129753.6,3327.0153846153844,5,39
2003,2,140836.19,3435.029024390244,3,41
2003,3,174504.90000000002,3490.0980000000004,6,50
2003,4,201609.55,3476.0267241379315,7,58
2003,5,192673.11,3321.950172413792,6,58


### Monthly sales summary

In [0]:
# Monthly Sales Summary
# Grouping keys: calendar year and month taken from ORDERDATE.
monthly_keys = [
    F.year("ORDERDATE").alias("year"),
    F.month("ORDERDATE").alias("month"),
]

# Metrics computed for every (year, month) group.
monthly_metrics = [
    F.sum("SALES").alias("total_sales"),
    F.avg("SALES").alias("avg_sales"),
    F.countDistinct("ORDERNUMBER").alias("unique_orders"),
    F.count("*").alias("row_count"),
]

df_monthly = (
    df_silver
    .groupBy(*monthly_keys)
    .agg(*monthly_metrics)
    .sort("year", "month")
)

# Save to Gold Layer as a Delta table, replacing any previous contents.
df_monthly.write.saveAsTable("gold_sales_monthly", format="delta", mode="overwrite")

# Visualize in Databricks
display(df_monthly)

year,month,total_sales,avg_sales,unique_orders,row_count
2003,1,129753.6,3327.0153846153844,5,39
2003,2,140836.19,3435.029024390244,3,41
2003,3,174504.90000000002,3490.0980000000004,6,50
2003,4,201609.55,3476.0267241379315,7,58
2003,5,192673.11,3321.950172413792,6,58
2003,6,168082.56000000003,3653.968695652175,6,46
2003,7,187731.88,3754.6376,6,50
2003,8,197809.3,3410.505172413793,5,58
2003,9,263973.36,3473.333684210526,8,76
2003,10,568290.97,3596.7782911392405,17,158


Databricks visualization. Run in Databricks to view.

### Yearly Sales Summary

In [0]:
# Yearly Sales Summary
# One row per calendar year of ORDERDATE with total, average and distinct-order metrics.
sales_by_year = df_silver.groupBy(F.year("ORDERDATE").alias("year"))

df_yearly = sales_by_year.agg(
    F.sum("SALES").alias("total_sales"),
    F.avg("SALES").alias("avg_sales"),
    F.countDistinct("ORDERNUMBER").alias("unique_orders"),
).sort("year")

# Save to Gold Layer as a Delta table, replacing any previous contents.
df_yearly.write.saveAsTable("gold_sales_yearly", format="delta", mode="overwrite")

# Visualize (bar chart in Databricks)
display(df_yearly)

year,total_sales,avg_sales,unique_orders
2003,3516979.540000001,3516.9795400000016,104
2004,4724162.600000002,3512.3885501858754,144
2005,1791486.7099999993,3747.880146443513,59


Databricks visualization. Run in Databricks to view.

### Top 10 performing products

In [0]:
# Top 10 Products
# Sum SALES per product code, then keep the ten largest totals.
product_totals = df_silver.groupBy("PRODUCTCODE").agg(
    F.sum("SALES").alias("total_sales")
)
df_top_products = product_totals.sort(F.col("total_sales").desc()).limit(10)

# Save to Gold Layer as a Delta table, replacing any previous contents.
df_top_products.write.saveAsTable("gold_top_products", format="delta", mode="overwrite")

# Visualize (bar chart for top products)
display(df_top_products)

PRODUCTCODE,total_sales
S18_3232,288245.4199999999
S10_1949,191073.03
S10_4698,170401.07000000004
S12_1108,168585.32000000004
S18_2238,154623.95
S12_3891,145332.03999999995
S24_3856,140626.90000000002
S12_2823,140006.16
S18_1662,139421.97
S12_1099,137177.00999999998


Databricks visualization. Run in Databricks to view.

### Top 10 Customers

In [0]:
# Top 10 Customers
# Sum SALES per customer name, then keep the ten largest totals.
customer_totals = df_silver.groupBy("CUSTOMERNAME").agg(
    F.sum("SALES").alias("total_sales")
)
df_top_customers = customer_totals.sort(F.col("total_sales").desc()).limit(10)

# Save to Gold Layer as a Delta table, replacing any previous contents.
df_top_customers.write.saveAsTable("gold_top_customers", format="delta", mode="overwrite")

# Visualize (bar chart for customers)
display(df_top_customers)

CUSTOMERNAME,total_sales
Euro Shopping Channel,912294.1100000008
Mini Gifts Distributors Ltd.,654858.0600000004
"Australian Collectors, Co.",200995.41
Muscle Machine Inc,197736.9400000001
La Rochelle Gifts,180124.9
"Dragon Souveniers, Ltd.",172989.68
Land of Toys Inc.,164069.44000000003
The Sharp Gifts Warehouse,160010.27000000002
"AV Stores, Co.",157807.80999999994
"Anna's Decorations, Ltd",153996.13000000003


Databricks visualization. Run in Databricks to view.