In [0]:
from pyspark.sql.functions import col,year,month,dayofmonth,to_date,from_json,current_timestamp,sum,to_timestamp,when
from pyspark.sql.types import StructField,StringType,StructType,StringType,IntegerType,DoubleType,TimestampType
from pyspark.sql import *
from datetime import datetime
from zoneinfo import ZoneInfo

In [0]:
# Register the storage-account access key on the Spark session so the
# abfss:// paths used in this notebook can be read and written. The key value
# is pulled from a Databricks secret scope instead of being written inline.
account_key_conf = "fs.azure.account.key.bmwstorageacc.dfs.core.windows.net"
spark.conf.set(
    account_key_conf,
    dbutils.secrets.get(scope="bmwanalytics", key="bmwstorevalut"),
)

In [0]:
# Run-scoped configuration: every execution gets its own version label
# (wall-clock time in Asia/Kolkata), and all silver outputs/checkpoints are
# written beneath a directory named after that label.
PIPELINE_VERSION = "V-"+datetime.now(ZoneInfo("Asia/Kolkata")).strftime("%Y-%m-%d %H:%M:%S")

# Storage roots.
storage = "abfss://bmwstorage@bmwstorageacc.dfs.core.windows.net"
bronze_path = f"{storage}/bronze/bmw_sales/sales/"
sliver_store = f"{storage}/sliver"

# Output locations for this run.
# NOTE(review): `snapshot` uses "bmw_Sales" (capital S) while `ckpt2` below uses
# "bmw_sales". These are different directories on a case-sensitive store;
# left unchanged here because readers may already depend on the existing
# output location -- confirm which spelling is intended.
snapshot = f"{sliver_store}/bmw_Sales/snapshot/{PIPELINE_VERSION}"
sliver_out = f"{sliver_store}/bmw_sales/sales/{PIPELINE_VERSION}"

# Checkpoint roots, one per streaming query (ckpt1 is the same path as sliver_out).
ckpt1 = f"{sliver_store}/bmw_sales/sales/{PIPELINE_VERSION}"
ckpt2 = f"{sliver_store}/bmw_sales/snapshot/{PIPELINE_VERSION}"

# Dedicated checkpoint sub-directories. These previously interpolated an
# undefined name `ckpt`, which raised NameError and would also have given
# both streams the same checkpoint path; each now derives from its own root.
sliver_cpkt = f"{ckpt1}/_checkpoint"
snapshot_cpkt = f"{ckpt2}/_checkpoint"

In [0]:
def pick_latest_version(entries):
    """Return the name of the newest "V-" version directory, or None.

    Args:
        entries: iterable of listing entries exposing ``isDir()`` and ``name``
            (the shape returned by ``dbutils.fs.ls``).

    Returns:
        The greatest ``name`` (plain string ordering) among entries that are
        directories and whose name starts with "V-"; ``None`` when no entry
        qualifies.
    """
    version_names = [
        entry.name
        for entry in entries
        if entry.isDir() and entry.name.startswith("V-")
    ]
    return max(version_names) if version_names else None


# Bind `latest` up front so later cells never hit a NameError (or silently
# reuse a value left over from an earlier run) when the listing succeeds but
# contains no "V-" directory.
latest = None
try:
    latest = pick_latest_version(dbutils.fs.ls(bronze_path))
except Exception as e:
    # Best-effort lookup: report the failure and leave `latest` as None.
    print(e)

if latest is None:
    print("No 'V-' version directory found; latest is None")
else:
    print(f"Latest version is {latest}")

In [0]:
# Streaming reader over the selected bronze version directory (Delta format).
stream_source_path = f"{bronze_path}/{latest}"
bronze_read = spark.readStream.format("delta").load(stream_source_path)

In [0]:
# Batch (non-streaming) read of the same bronze version directory, used for
# interactive inspection in the cells below.
batch_source_path = f"{bronze_path}/{latest}"
bronze_df = spark.read.format("delta").load(batch_source_path)
bronze_df.display()

In [0]:
# (field name, Spark type) pairs describing the JSON payload stored in the
# bronze `raw_json` column. Order here is the column order after parsing.
sales_event_fields = [
    ("event_id", StringType()),
    ("event_timestamp", TimestampType()),
    ("vehicle_id", StringType()),
    ("model", StringType()),
    ("year", IntegerType()),
    ("region", StringType()),
    ("color", StringType()),
    ("fuel_type", StringType()),
    ("transmission", StringType()),
    ("engine_size", DoubleType()),
    ("mileage", IntegerType()),
    ("price", DoubleType()),
    ("sales_volume", IntegerType()),
    ("sales_classification", StringType()),
]

# Every field is declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in sales_event_fields
])

# One parse expression shared by the streaming and the batch DataFrame: decode
# `raw_json` into a struct column named "data", then keep only its fields.
parsed_payload = from_json(col("raw_json"), schema)

prase_df_stream = bronze_read.withColumn("data", parsed_payload).select("data.*")
prase_df = bronze_df.withColumn("data", parsed_payload).select("data.*")

In [0]:
# Preview the parsed batch DataFrame (the fields extracted from `raw_json`).
prase_df.display()

In [0]:
# Show the parsed batch rows sorted by `year`, highest first.
newest_year_first = prase_df.orderBy(col("year").desc())
newest_year_first.display()

In [0]:
# Per-column null counts for the parsed batch data: each column is mapped to
# 1 where it is null (0 otherwise) and summed, keeping the column's own name.
# `sum` here is the pyspark.sql.functions aggregate imported at the top of the
# notebook, not Python's builtin.
null_count_exprs = [
    sum(col(column_name).isNull().cast("int")).alias(column_name)
    for column_name in prase_df.columns
]
check_null = prase_df.select(*null_count_exprs)
check_null.display()

In [0]:
# Streaming transform: pass `event_timestamp` through to_timestamp and add
# `revenue_usd` as price multiplied by sales volume.
revenue_expr = col("price") * col("sales_volume")
clean_df = (
    prase_df_stream
    .withColumn("event_timestamp", to_timestamp("event_timestamp"))
    .withColumn("revenue_usd", revenue_expr)
)

In [0]:
# Approximate 33rd / 66th percentile of `price`, computed once on the batch
# DataFrame (relative error 0.01) and then applied as fixed cut-offs to the
# streaming DataFrame.
# NOTE(review): the two-value unpacking assumes approxQuantile returns exactly
# two numbers -- confirm behaviour when `price` has no non-null values.
q1, q2 = prase_df.approxQuantile("price", [0.33, 0.66], 0.01)

# Label each row by which side of the cut-offs its price falls on; anything
# not matched by the two conditions is labelled "High".
price_tier = (
    when(col("price") <= q1, "Low")
    .when(col("price") <= q2, "Medium")
    .otherwise("High")
)
clean_df = clean_df.withColumn("price_brand", price_tier)

In [0]:
def start_delta_append_stream(df, output_path, checkpoint_root):
    """Start an append-mode Delta streaming write and return its query handle.

    Args:
        df: streaming DataFrame to write.
        output_path: destination path passed to ``.start()``.
        checkpoint_root: directory under which this query's checkpoint is
            kept; the checkpoint itself goes in ``<checkpoint_root>/_checkpoint``.

    Returns:
        The object returned by ``writeStream...start()``.
    """
    return (
        df.writeStream
        .format("delta")
        # Keep checkpoint state in its own underscore-prefixed sub-directory.
        # `ckpt1` is the same path as `sliver_out`, so using the root directly
        # would scatter checkpoint files through the table directory itself.
        .option("checkpointLocation", f"{checkpoint_root}/_checkpoint")
        .outputMode("append")
        .option("mergeSchema", "true")
        .option("overwriteSchema", "true")
        .start(output_path)
    )


# Two independent streams write the same cleaned data: one to the versioned
# sales location and one to the versioned snapshot location, each with its own
# checkpoint.
query1 = start_delta_append_stream(clean_df, sliver_out, ckpt1)
query2 = start_delta_append_stream(clean_df, snapshot, ckpt2)