In [None]:
# Synapse Notebook for Data Transformation

# Import necessary libraries
from pyspark.sql import SparkSession

# Pipeline overview:
#   raw sales JSON  ->  totals per (region, product)  ->  Parquet output
#
# Storage locations are declared up front so both ends of the pipeline are
# visible in one place.
# NOTE(review): `<storage_account>` looks like a template placeholder that
# has to be substituted with a real account name before running -- confirm.
raw_data_path = "abfss://raw@<storage_account>.dfs.core.windows.net/sales_data/"
processed_data_path = "abfss://processed@<storage_account>.dfs.core.windows.net/sales_summary/"

# Step 1: Obtain the Spark session used by every read/write below.
spark = (
    SparkSession.builder
    .appName("TransformSalesData")
    .getOrCreate()
)

# Step 2: Read the raw JSON records into a DataFrame.
raw_df = spark.read.json(raw_data_path)

# Print a preview of the raw rows so the input can be checked by eye.
raw_df.show()

# Step 3: Sum `amount` for each (region, product) pair.
# The grouped `sum` names its output column "sum(amount)"; it is renamed in
# the same chain so downstream consumers see "total_sales" instead.
sales_summary = (
    raw_df.groupBy("region", "product")
    .sum("amount")
    .withColumnRenamed("sum(amount)", "total_sales")
)

# Print a preview of the aggregated result.
sales_summary.show()

# Step 4: Write the summary as Parquet.
# "overwrite" mode replaces whatever already exists at the target path.
(
    sales_summary.write
    .mode("overwrite")
    .parquet(processed_data_path)
)

# Log completion
print("Data transformation completed successfully.")
