## Exploring Delta Lake in Microsoft Fabric: A Relational Perspective
**by Jesus Lopez Martin - https://www.syntax.es - December 2024**

The Setup: We worked in a Microsoft Fabric Workspace on an F2 capacity, using a Lakehouse with Schemas and a Notebook in PySpark and Scala SQL. Our primary dataset? The "sales" 100M table from [Contoso Data Generator](https://github.com/sql-bi/Contoso-Data-Generator-V2-Data/releases/tag/ready-to-use-data). 

In [None]:
from pyspark.sql.functions import col, to_date
import datetime

# Source Parquet file in Azure Blob Storage and the Delta tables this cell populates.
parquet_path = "wasbs://<your_container>@<your_storageaccount>.blob.core.windows.net/parquet100m/sales.parquet"
table_name = "bronze_wwi.sales"
version_mapping_table = "bronze_wwi.sales_version_mapping"

# Read the complete Parquet file.
print(f"Reading data from: {parquet_path}")
sales_df = spark.read.format("parquet").load(parquet_path)

# Compute both bounds of OrderDate in a single Spark job instead of one job per bound.
min_date, max_date = sales_df.selectExpr("MIN(OrderDate)", "MAX(OrderDate)").collect()[0]
if min_date is None or max_date is None:
    # MIN/MAX return NULL when there are no non-null OrderDate values; fail here with a
    # clear message instead of a TypeError when the bounds are compared later on.
    raise ValueError(f"No OrderDate values found in {parquet_path}; nothing to load.")
print(f"Range of dates in OrderDate: {min_date} a {max_date}")

# Normalise the bounds to calendar dates. If OrderDate is a timestamp column the
# bounds carry a time of day, and stepping one day at a time from min_date could
# overshoot max_date on the final day, silently skipping that day's data.
start_date = min_date.date() if isinstance(min_date, datetime.datetime) else min_date
end_date = max_date.date() if isinstance(max_date, datetime.datetime) else max_date

# Collected (date string, Delta table version) pairs, one per loaded day.
sales_version_mapping = []

# The Delta table is built day by day, starting from the first day.
current_date = start_date

# Append one Delta commit per calendar day that has data, and record which table
# version each day produced.
while current_date <= end_date:
    # Select the rows whose OrderDate falls on the current day.
    current_day_str = current_date.strftime("%Y-%m-%d")
    daily_data = sales_df.filter(to_date(col("OrderDate")) == current_day_str)

    # count() triggers a Spark job, so run it once and reuse the result.
    record_count = daily_data.count()

    if record_count > 0:
        print(f"Loading data for day: {current_day_str}, record count: {record_count}")

        # Tag the commit with the simulated day. The setting is session-wide, so it is
        # cleared right after the write to keep it from leaking into unrelated commits
        # (for example the mapping table written after this loop).
        spark.conf.set("spark.databricks.delta.commitInfo.userMetadata", f"Simulated day: {current_day_str}")
        try:
            # Append the day's rows to the Delta table (created on the first write).
            daily_data.write.format("delta").mode("append").saveAsTable(table_name)
        finally:
            spark.conf.unset("spark.databricks.delta.commitInfo.userMetadata")

        # History is returned newest first; only the latest entry is needed, so limit
        # the query instead of collecting the whole (growing) history every day.
        history = spark.sql(f"DESCRIBE HISTORY {table_name} LIMIT 1").collect()
        latest_version = history[0]['version']

        # Remember which table version corresponds to this day.
        sales_version_mapping.append((current_day_str, latest_version))

    # Advance to the next calendar day.
    current_date += datetime.timedelta(days=1)

# Build a DataFrame from the (date, version) pairs and save it as a Delta table.
# The schema is given explicitly because Spark cannot infer one from an empty list,
# which is what the mapping would be if no day contained any rows.
mapping_df = spark.createDataFrame(sales_version_mapping, "date string, version long")
mapping_df.write.format("delta").mode("overwrite").saveAsTable(version_mapping_table)

print("Load complete and map generated.")