KPI Summary Dashboard

In [0]:
%sql
drop table online_retail.sales_by_country


In [0]:
spark.conf.set(
    "fs.azure.account.key.project02etlstorage.dfs.core.windows.net",
    "hjU3pTklfet5UlyVsEQa+j78mA2oCqSZCwmmTcvbvP2WTU6ZkTjUvxyWiV+sm+Fsz3IvUdE7rJi2+AStKYSWxg==")

In [0]:
from pyspark.sql.functions import col, sum, countDistinct, when

dst_df = spark.read.table("online_retail.sales_cleaned")

customer_summary_df = dst_df.filter(col("CustomerID") != 0) \
    .groupBy("CustomerID") \
    .agg(
        sum("TotalAmount").alias("TotalRevenue"),       
        countDistinct("InvoiceNo").alias("NumberOfOrders"),
        sum("Quantity").alias("TotalItemsBought")
    ) \
    .withColumn("CustomerTier",                           
                when(col("TotalRevenue") > 10000, "Platinum")
                .when(col("TotalRevenue") > 5000, "Gold")
                .when(col("TotalRevenue") > 1000, "Silver")
                .otherwise("Bronze")
               ) \
    .orderBy(col("TotalRevenue").desc())


customer_summary_df.write.mode("overwrite").format("delta").saveAsTable("online_retail.customer_summary")

customer_summary_df.write.mode("overwrite").parquet(f"abfss://destination@project02etlstorage.dfs.core.windows.net/customer_revenue")

display(customer_summary_df)

CustomerID,TotalRevenue,NumberOfOrders,TotalItemsBought,CustomerTier
14646,280206.02,73,196915,Platinum
18102,259657.3,60,64124,Platinum
17450,194550.78999999992,46,69993,Platinum
16446,168472.5,2,80997,Platinum
14911,143825.06000000003,201,80265,Platinum
12415,124914.53,21,77374,Platinum
14156,117379.63,55,57885,Platinum
17511,91062.38,31,64549,Platinum
16029,81024.84000000001,63,40208,Platinum
12346,77183.6,1,74215,Platinum


Monthly Sales Trends

In [0]:
monthly_trends_df = dst_df.groupBy("Year", "Month") \
    .agg(sum("TotalAmount").alias("MonthlyRevenue"),
         sum("Quantity").alias("MonthlyUnitsSold"),
         countDistinct("InvoiceNo").alias("TransactionCount"),
         countDistinct("CustomerID").alias("UniqueCustomers")) \
    .orderBy("Year", "Month")

monthly_trends_df.write.mode("overwrite").format("delta").saveAsTable("online_retail.monthly_trends")
monthly_trends_df.write.mode("overwrite").parquet(f"abfss://destination@project02etlstorage.dfs.core.windows.net/monthly_trends_df")
display(monthly_trends_df)


Year,Month,MonthlyRevenue,MonthlyUnitsSold,TransactionCount,UniqueCustomers
2010,12,823746.1399999646,359239,1559,886
2011,1,691364.5600000108,387785,1086,742
2011,2,523631.8900000278,283555,1100,759
2011,3,717639.3600000187,377526,1454,975
2011,4,537808.6200000144,308815,1246,857
2011,5,770536.0200000107,395738,1681,1057
2011,6,761739.9000000219,389213,1533,992
2011,7,719221.1900000272,401759,1475,950
2011,8,759138.3800000154,421770,1361,936
2011,9,1058590.169999997,570820,1837,1267


Top Selling Products

In [0]:
top_products_df = dst_df.groupBy("StockCode", "Description") \
    .agg(sum("TotalAmount").alias("TotalRevenue"),
         sum("Quantity").alias("Total_Number_items_sold"),
         countDistinct("InvoiceNo").alias("TimesOrdered")) \
    .orderBy(col("TotalRevenue").desc())

top_products_df.write.mode("overwrite").format("delta").saveAsTable("online_retail.top_products_by_revenue")
top_products_df.write.mode("overwrite").parquet(f"abfss://destination@project02etlstorage.dfs.core.windows.net/top_products_df")
display(top_products_df)


StockCode,Description,TotalRevenue,Total_Number_items_sold,TimesOrdered
DOT,DOTCOM POSTAGE,206248.77,706,706
22423,REGENCY CAKESTAND 3 TIER,174484.73999999982,13879,1988
23843,"PAPER CRAFT , LITTLE BIRDIE",168469.6,80995,1
85123A,WHITE HANGING HEART T-LIGHT HOLDER,104340.29000000036,37599,2189
47566,PARTY BUNTING,99504.33,18295,1685
85099B,JUMBO BAG RED RETROSPOT,94340.04999999978,48474,2089
23166,MEDIUM CERAMIC TOP STORAGE JAR,81700.92000000003,78033,247
M,Manual,78110.27,7224,289
POST,POSTAGE,78101.88,3150,1126
23084,RABBIT NIGHT LIGHT,66964.98999999986,30788,994


Sales by Country 

In [0]:
from pyspark.sql.functions import col, sum, countDistinct

sales_by_country_df = (
    dst_df.groupBy("Country")
    .agg(
        sum("TotalAmount").alias("TotalRevenue"),
        countDistinct("InvoiceNo").alias("NumberOfOrders"),
        countDistinct("CustomerID").alias("UniqueCustomers")
    )
    .orderBy(col("TotalRevenue").desc())
)

# Save to Delta
sales_by_country_df.write.mode("overwrite").format("delta").saveAsTable("online_retail.sales_by_country")

# Save as Parquet in ADLS
sales_by_country_df.write.mode("overwrite").parquet(
    "abfss://destination@project02etlstorage.dfs.core.windows.net/sales_by_country"
)

display(sales_by_country_df)

Country,TotalRevenue,NumberOfOrders,UniqueCustomers
United Kingdom,9025222.0799982,18019,3921
Netherlands,285446.3399999997,94,9
EIRE,283453.9599999995,288,4
Germany,228867.14000000065,457,94
France,209715.10999999996,392,88
Australia,138521.30999999994,57,9
Spain,61577.11000000006,90,30
Switzerland,57089.90000000009,54,22
Belgium,41196.33999999999,98,25
Sweden,38378.33,36,8
