In [0]:
# 📥 Notebook 01: Ingest Sales Data from ADLS Gen2

# Step 1: Set up access to ADLS Gen2 using SAS token
# A SAS token is a credential, so it is read from a Databricks secret scope at run
# time rather than pasted into the notebook source. This keeps it out of version
# control and exports, and lets the token be rotated without editing this cell
# (SAS tokens carry a fixed expiry, so a pasted value stops working once it lapses).
# Any token that was ever saved in this notebook's history should be revoked.
# TODO: the scope/key names below are placeholders — create the secret (or point
# these at an existing one) before running.
SAS_SECRET_SCOPE = "adls"
SAS_SECRET_KEY = "storageprojectend-sas-token"
sas_token = dbutils.secrets.get(scope=SAS_SECRET_SCOPE, key=SAS_SECRET_KEY)

# Every ABFS setting below is keyed by the storage account's DFS endpoint.
storage_host = "storageprojectend.dfs.core.windows.net"

spark.conf.set(f"fs.azure.account.auth.type.{storage_host}", "SAS")
spark.conf.set(f"fs.azure.sas.token.provider.type.{storage_host}",
               "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider")
spark.conf.set(f"fs.azure.sas.fixed.token.{storage_host}", sas_token)

# Step 2: Read CSV file from raw container
# The first line of the file is treated as the header; no schema is inferred here,
# so every column is loaded as a string.
df = spark.read.option("header", "true").csv(f"abfss://raw@{storage_host}/sales_data_sample.csv")
df.display()


OrderDate,Region,SalesPerson,Model,UnitsSold,Price,Revenue
2024-01-15,North,Alice,Sedan,10,20000,200000
2024-02-20,South,Bob,SUV,15,30000,450000
2024-03-05,East,Charlie,Hatchback,20,15000,300000
2024-03-15,West,David,SUV,5,32000,160000
2024-04-10,Central,Eve,Sedan,12,21000,252000


In [0]:
# 📘 Notebook 02: Data Transformation in Databricks

from pyspark.sql.functions import col, to_date

# Step 1: Load data from raw ADLS
# Header row supplies the column names; values arrive as strings and are typed below.
raw_sales_csv = "abfss://raw@storageprojectend.dfs.core.windows.net/sales_data_sample.csv"
df = spark.read.csv(raw_sales_csv, header=True)

# Step 2: Cast data types
# Parse the order date first, then apply the numeric casts one column at a time.
numeric_casts = [("UnitsSold", "int"), ("Price", "double"), ("Revenue", "double")]

df_cleaned = df.withColumn("OrderDate", to_date(col("OrderDate"), "yyyy-MM-dd"))
for column_name, spark_type in numeric_casts:
    df_cleaned = df_cleaned.withColumn(column_name, col(column_name).cast(spark_type))

# Step 3: Optional filtering or enrichment
# Keep only rows with a positive unit count (rows where UnitsSold is null are
# excluded too, since the comparison is not true for them).
df_final = df_cleaned.where(col("UnitsSold") > 0)

# Preview
df_final.display()




OrderDate,Region,SalesPerson,Model,UnitsSold,Price,Revenue
2024-01-15,North,Alice,Sedan,10,20000.0,200000.0
2024-02-20,South,Bob,SUV,15,30000.0,450000.0
2024-03-05,East,Charlie,Hatchback,20,15000.0,300000.0
2024-03-15,West,David,SUV,5,32000.0,160000.0
2024-04-10,Central,Eve,Sedan,12,21000.0,252000.0


In [0]:
# 📤 Notebook 03: Export Transformed Data to Curated Layer

# Persist the transformed dataframe as Parquet, replacing whatever is already at
# the target path.
# NOTE(review): no partitionBy() is applied, so the output is not split by any
# column; and the target is the `raw` container although this notebook is titled
# "Curated Layer" — confirm the intended destination.
parquet_target = "abfss://raw@storageprojectend.dfs.core.windows.net/sales_data/"
df_final.write.parquet(parquet_target, mode="overwrite")

# Optionally expose the same data as a table, replacing any existing table of
# that name. A fresh writer is used on purpose so the Parquet format chosen above
# does not carry over to the table write.
df_final.write.saveAsTable("sales_data_curated", mode="overwrite")