In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.types import *
from pyspark.sql import functions as F
from datetime import datetime,timedelta
from zoneinfo import ZoneInfo

In [0]:
# Register the access key for the "bmwstorageacc" storage account on the Spark
# session. The key is read from a Databricks secret scope at run time, so it
# never appears in the notebook source.
spark.conf.set(
    "fs.azure.account.key.bmwstorageacc.dfs.core.windows.net",
    dbutils.secrets.get(scope="bmwanalytics", key="bmwstorevalut"),
)

In [0]:
# Lake locations. The layer folder is spelled "sliver" in the storage path and
# is kept exactly as-is because it is a runtime path.
storage = "abfss://bmwstorage@bmwstorageacc.dfs.core.windows.net"
sliver_path = f"{storage}/sliver/bmw_sales/sales"
gold_path = f"{storage}/gold/bmw_sales/sales/"


def _latest_version_name(entries, prefix="V-"):
    """Return the newest version-folder name found in ``entries``, or ``None``.

    Parameters
    ----------
    entries : iterable
        Objects exposing ``name`` and ``isDir()``, as returned by
        ``dbutils.fs.ls``.
    prefix : str
        Only directories whose name starts with this prefix are considered.

    Returns
    -------
    str or None
        Name of the directory that sorts last by name, with any trailing "/"
        removed so it can be embedded in other paths; ``None`` when no
        directory matches.
    """
    version_dirs = [
        entry for entry in entries
        if entry.isDir() and entry.name.startswith(prefix)
    ]
    if not version_dirs:
        return None
    # sorted(...)[-1] instead of max(): this notebook star-imports
    # pyspark.sql.functions, which rebinds ``max`` to a Spark column function.
    newest = sorted(version_dirs, key=lambda entry: entry.name)[-1]
    # dbutils.fs.ls reports directory names with a trailing "/"; strip it so
    # the version reads "V-..." wherever it is interpolated.
    return newest.name.rstrip("/")


# This notebook only reads an existing sliver version, so a listing failure or
# an empty listing is fatal: stop here with a clear message rather than
# continuing with a version folder that does not exist.
try:
    _sliver_entries = dbutils.fs.ls(sliver_path)
except Exception as e:
    raise RuntimeError(
        f"Could not list version folders under {sliver_path}"
    ) from e

PIPELINE_VERSION = _latest_version_name(_sliver_entries)
if PIPELINE_VERSION is None:
    raise FileNotFoundError(
        f"No 'V-*' version folder found under {sliver_path}"
    )

In [0]:
# Read the Delta table stored under the selected version folder of the sliver
# layer, then render it in the notebook.
sliver = (
    spark.read
    .format("delta")
    .load(f"{sliver_path}/{PIPELINE_VERSION}")
)
display(sliver)

In [0]:
# Columns carried from the sliver table into the gold layer, in output order.
gold_columns = [
    "vehicle_id",
    "model",
    "year",
    "region",
    "color",
    "fuel_type",
    "transmission",
    "engine_size",
    "mileage",
    "price",
    "sales_volume",
    "sales_classification",
    "revenue_usd",
    "price_brand",
]

# Project the sliver frame down to exactly those columns.
gold_df = sliver.select(*gold_columns)

In [0]:
# Preview the projected rows, most recent year first.
gold_df.orderBy(F.desc("year")).display()

In [0]:
# Target Spark type for each numeric column; applied in this order.
numeric_casts = {
    "year": "int",
    "price": "double",
    "sales_volume": "long",
    "engine_size": "double",
    "mileage": "double",
}

# Start from the projected frame and replace each column with its cast version.
# withColumn on an existing name replaces it in place, so column order is kept.
base = gold_df
for column_name, spark_type in numeric_casts.items():
    base = base.withColumn(column_name, F.col(column_name).cast(spark_type))

# Recompute revenue from the cast price and volume so it always equals
# price * sales_volume, overriding the value carried over from the sliver table.
base = base.withColumn(
    "revenue_usd",
    (F.col("price") * F.col("sales_volume")).cast("double"),
)

In [0]:
# Units sold for every (year, fuel_type, transmission) combination.
gold_year_fuel_trans = (
    base.groupBy("year", "fuel_type", "transmission")
    .agg(F.sum("sales_volume").alias("Sales_Volume"))
)

# Units sold per year across all combinations. The share calculation below no
# longer reads this frame; it is still defined (lazily, at no cost) in case
# cells outside this view reference it.
fuel_year_totals = (
    gold_year_fuel_trans
    .groupBy("year")
    .agg(F.sum("Sales_Volume").alias("Year_Total_Sales"))
)

# Share of each combination within its year, as a percentage rounded to 2 dp.
# The yearly total comes from a window over "year" rather than a join back to
# fuel_year_totals: an equality join never matches NULL keys, so groups with a
# NULL year would silently get a NULL share, whereas a window keeps all NULL
# years in one partition. It also avoids a self-join.
year_window = Window.partitionBy("year")

gold_year_fuel_trans_final = (
    gold_year_fuel_trans
    .withColumn(
        "Sales_Share_Pct",
        F.round(
            (F.col("Sales_Volume") / F.sum("Sales_Volume").over(year_window)) * 100,
            2,
        ),
    )
    # Oldest year first; within a year, best-selling combination first.
    .orderBy("year", F.desc("Sales_Volume"))
)

display(gold_year_fuel_trans_final)


In [0]:
# Destination: one folder per pipeline version under the gold table's directory.
gold_output_path = gold_path + f"gold_year_fuel_trans_final/{PIPELINE_VERSION}"

# Delta writer that replaces whatever is at the destination, including its
# schema if the column set has changed.
delta_writer = (
    gold_year_fuel_trans_final.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
delta_writer.save(gold_output_path)