In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.types import *
from pyspark.sql import functions as F
from datetime import datetime,timedelta
from zoneinfo import ZoneInfo

In [0]:
# Register the access key for the `bmwstorageacc` ADLS Gen2 account on the Spark
# session so the abfss:// paths on that account can be read and written.
# The key is pulled from a Databricks secret scope instead of being hard-coded.
spark.conf.set(
    "fs.azure.account.key.bmwstorageacc.dfs.core.windows.net",
    dbutils.secrets.get(scope = "bmwanalytics", key = "bmwstorevalut")
)

In [0]:
# --- Pipeline configuration ---------------------------------------------------
# Base locations in the lake. "sliver" is the literal folder name used in the
# storage paths, so the spelling is kept as-is.
storage = "abfss://bmwstorage@bmwstorageacc.dfs.core.windows.net"
sliver_path = f"{storage}/sliver/bmw_sales/sales"
gold_path = f"{storage}/gold/bmw_sales/sales/"


def latest_version_name(entries, prefix="V-"):
    """Return the newest version folder name among ``entries``, or ``None``.

    ``entries`` is an iterable of objects exposing ``name`` and ``isDir()``
    (the shape returned by ``dbutils.fs.ls``). Only directories whose name
    starts with ``prefix`` are considered, and the greatest name by plain
    string comparison wins. A trailing "/" on the chosen directory name is
    removed so the result has the same shape as a freshly generated
    ``V-YYYYMMDD-HHMMSS`` label.
    """
    candidates = [
        entry.name
        for entry in entries
        if entry.isDir() and entry.name.startswith(prefix)
    ]
    if not candidates:
        return None
    return max(candidates).rstrip("/")


# Default label: a fresh timestamp in Asia/Kolkata time. It is only kept when no
# existing version folder can be resolved from the silver location below.
PIPELINE_VERSION = "V-" + datetime.now(ZoneInfo("Asia/Kolkata")).strftime("%Y%m%d-%H%M%S")

try:
    resolved_version = latest_version_name(dbutils.fs.ls(sliver_path))
except Exception as e:
    # Best-effort: keep the notebook running, but say clearly that the silver
    # listing failed and which (generated) version label is being used instead.
    print(
        f"WARNING: could not list {sliver_path} ({e!r}); "
        f"falling back to generated PIPELINE_VERSION={PIPELINE_VERSION}"
    )
else:
    if resolved_version is not None:
        PIPELINE_VERSION = resolved_version
    else:
        print(
            f"WARNING: no 'V-' version folders found under {sliver_path}; "
            f"using generated PIPELINE_VERSION={PIPELINE_VERSION}"
        )

In [0]:
# Read the silver Delta table for the resolved pipeline version and preview it.
silver_version_path = f"{sliver_path}/{PIPELINE_VERSION}"
sliver = spark.read.load(silver_version_path, format="delta")
display(sliver)

In [0]:
# Columns carried from the silver table into the gold working set, in output order.
gold_columns = [
    "vehicle_id", "model", "year", "region",
    "color", "fuel_type", "transmission",
    "engine_size", "mileage", "price",
    "sales_volume", "sales_classification",
    "revenue_usd", "price_brand",
]
gold_df = sliver.select(*gold_columns)

In [0]:
# Preview the gold working set, most recent year first.
gold_df.sort(F.desc("year")).display()

In [0]:
# Show the column names of the gold working set as the cell output.
gold_df.columns

In [0]:
# Window over each (model, year, region) partition, ordered by sales_volume descending.
# NOTE(review): `window` is not referenced by any cell visible here, and the name
# shadows `window` pulled in by `from pyspark.sql.functions import *` — confirm it
# is still needed.
window = (
    Window
    .partitionBy("model", "year", "region")
    .orderBy(F.desc("sales_volume"))
)

In [0]:
# Aggregate the gold working set to one row per (year, model, region).
#
# `vehicle_id` is intentionally not a grouping key: the year-over-year step that
# consumes this frame partitions by (region, model) and orders by year, so its
# lag() only means "previous year" when each (region, model, year) combination
# appears exactly once. Grouping by `vehicle_id` as well would leave several
# rows per year in a partition and make lag() compare rows within the same year.
gold_year_region = (
    gold_df
    .groupBy("year", "model", "region")
    .agg(
        F.sum("sales_volume").alias("Total_Sales_Volume"),
        F.sum("revenue_usd").alias("Total_Revenue"),
        F.avg("price").alias("Avg_Price_USD"),
        F.avg("mileage").alias("Avg_Mileage_KM"),
        F.avg("engine_size").alias("Avg_Engine_Size_L"),
    )
)


In [0]:
def _growth_pct(current, previous):
    """Build a Column holding the percentage change of ``current`` vs ``previous``.

    Both arguments are column names. The result is NULL (typed double) when
    ``previous`` is NULL or 0; otherwise it is
    ``(current - previous) / previous * 100`` rounded to 2 decimal places.
    """
    cur = F.col(current)
    prev = F.col(previous)
    undefined = prev.isNull() | (prev == 0)
    return F.when(undefined, F.lit(None).cast("double")).otherwise(
        F.round(((cur - prev) / prev) * 100, 2)
    )


# Year-over-year window: within each (region, model), rows are ordered by year
# ASCENDING so that lag(..., 1) returns the value of the preceding (earlier)
# year. Ordering by year descending would make lag() return the following year.
# NOTE(review): lag(1) takes the previous *row*, so this assumes one row per
# (region, model, year) in `gold_year_region` and no gaps between years —
# confirm against the aggregation that produces it.
w_year = Window.partitionBy("region", "model").orderBy(F.col("year").asc())

gold_yearly_yoy = (
    gold_year_region
    # Previous-year totals for the same (region, model).
    .withColumn("Prev_Year_Sales_Volume", F.lag("Total_Sales_Volume", 1).over(w_year))
    .withColumn("Prev_Year_Revenue", F.lag("Total_Revenue", 1).over(w_year))
    # Percentage growth versus the previous year (NULL when there is no usable base).
    .withColumn("YoY_Sales_Growth_Pct", _growth_pct("Total_Sales_Volume", "Prev_Year_Sales_Volume"))
    .withColumn("YoY_Revenue_Growth_Pct", _growth_pct("Total_Revenue", "Prev_Year_Revenue"))
    # Keep only rows that have a previous year and strictly positive growth in
    # both sales volume and revenue.
    .filter(
        F.col("Prev_Year_Sales_Volume").isNotNull()
        & F.col("Prev_Year_Revenue").isNotNull()
        & (F.col("YoY_Sales_Growth_Pct") > 0)
        & (F.col("YoY_Revenue_Growth_Pct") > 0)
    )
)

# Preview the result, most recent year first.
gold_yearly_yoy.orderBy(F.col("year").desc()).display()

In [0]:
# Persist the YoY table as Delta under the gold location for this pipeline version.
# `gold_path` ends with "/" and the version label may carry a trailing "/", so
# both are stripped before joining to avoid "//" or a dangling "/" in the target.
yoy_target_path = f"{gold_path.rstrip('/')}/yearly_yoy/{PIPELINE_VERSION.rstrip('/')}"

(
    gold_yearly_yoy.write.format("delta")
    .mode("overwrite")
    # Allow the stored schema to be replaced when the output columns change.
    .option("overwriteSchema", "true")
    .save(yoy_target_path)
)