# 📓 2. Ingest Data
## Ingestion Module: Initial Load Benchmarking for TC.01.x (Delta) & TC.02.x (Warehouse)

### Ensure `DataSourceLakehouse` & `BenchmarkLakehouse` are connected as data sources before running.
- `BenchmarkWarehouse` is connected automatically by the code below.
- This notebook performs initial loads for each update strategy target table in both Delta and Warehouse.
- Note that reading the source with `df = spark.read.parquet(base_file)` preserves timezone information, which the SQL endpoints need in order to work correctly.
- Metrics are captured for the first load in each location.
- All initial load rows will have `update_type = 'insert'`.

In [None]:
import time
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, FloatType
from pyspark.sql.functions import col, lit

In [None]:
# --- Notebook configuration -------------------------------------------------
# Names of the Fabric items this notebook reads from and writes to.
source_lakehouse = "DataSourceLakehouse"
target_lakehouse = "BenchmarkLakehouse"
target_warehouse = "BenchmarkWarehouse"

# Size of the base dataset to load; selects which base parquet file is read.
row_count = 10000  # Change as needed

# Fully qualified OneLake path of the base parquet file.
_onelake_root = "abfss://FabricBenchmarking@onelake.dfs.fabric.microsoft.com"
base_file = (
    f"{_onelake_root}/{source_lakehouse}.Lakehouse"
    f"/Files/base/base_{row_count}_parquet.parquet"
)
input_path_parquet = base_file

# One target table per update strategy, in both the lakehouse (Delta) and the
# warehouse. The first entry of each list is the one metrics are logged for.
_load_strategies = ("refresh", "compare", "increment")
delta_tables = [f"delta_{strategy}_load" for strategy in _load_strategies]
warehouse_tables = [f"wh_table_{strategy}_load" for strategy in _load_strategies]

In [None]:
# Column layout of the benchmark metrics table as (name, Spark type) pairs.
# Every column is declared nullable.
_metric_columns = [
    ("test_case_id", StringType),
    ("timestamp", TimestampType),
    ("format", StringType),
    ("location", StringType),
    ("rows", IntegerType),
    ("update_strategy", StringType),
    ("ingest_time_s", FloatType),
    ("spinup_time_s", FloatType),
    ("storage_size_mb", FloatType),
    ("query_type", StringType),
    ("query_time_s", FloatType),
    ("cu_used", FloatType),
    ("notes", StringType),
]

# Schema used when building metric rows as DataFrames before appending them.
metrics_schema = StructType(
    [StructField(column, spark_type(), True) for column, spark_type in _metric_columns]
)

# DDL for the metrics table in the target lakehouse; the columns mirror
# metrics_schema one for one. IF NOT EXISTS makes re-running this cell safe.
_create_metrics_ddl = f"""
CREATE TABLE IF NOT EXISTS {target_lakehouse}.metrics (
    test_case_id STRING,
    timestamp TIMESTAMP,
    format STRING,
    location STRING,
    rows INT,
    update_strategy STRING,
    ingest_time_s FLOAT,
    spinup_time_s FLOAT,
    storage_size_mb FLOAT,
    query_type STRING,
    query_time_s FLOAT,
    cu_used FLOAT,
    notes STRING
)
"""

spark.sql(_create_metrics_ddl)

## TC.01.x: Initial Load to Delta Tables

Loads synthetic data into three Delta tables, preparing for future update strategy benchmarks.
Metrics are captured for `delta_refresh_load` only.
All initial rows will have `update_type = 'insert'`.

In [None]:
# TC.01.x: write the base dataset to every Delta target table and append one
# metrics row (for the first table only) to the lakehouse metrics table.

# Spinup timing (for symmetry with prior logic). Taken before the session is
# used for the read so the timer does not wrap a session that has already done
# work in this cell.
# NOTE(review): if the notebook session is already running, getOrCreate()
# returns it immediately and this duration will be close to zero - confirm
# that is the intended baseline.
tc01x_spinup_start = time.time()
spark = SparkSession.builder.getOrCreate()
tc01x_spinup_end = time.time()
tc01x_spinup_duration = tc01x_spinup_end - tc01x_spinup_start

# Read source data (base load)
df = spark.read.parquet(base_file)

# Ingest timing and write to each Delta target table
metrics_logged = False
for table in delta_tables:
    ingest_start = time.time()
    df.write.mode("overwrite").saveAsTable(f"{target_lakehouse}.{table}")
    ingest_end = time.time()
    ingest_duration = ingest_end - ingest_start

    # Metrics are recorded once, for the first table written.
    if metrics_logged:
        continue

    # Best-effort storage measurement: a failure must not abort the load, but
    # it is reported so a NaN in the metrics table can be explained.
    try:
        try:
            from notebookutils import mssparkutils
        except ImportError:
            # Fall back to the bare module name for runtimes that expose it.
            import mssparkutils
        # NOTE(review): verify this path form is accepted by fs.ls for the
        # target lakehouse; the listing is also non-recursive, so only
        # entries directly under the table folder are summed.
        table_path = f"/lakehouse/{target_lakehouse}/Tables/{table}"
        storage_files = mssparkutils.fs.ls(table_path)
        storage_size_mb = sum(f.size for f in storage_files) / (1024 * 1024)
    except Exception as exc:
        print(f"Warning: could not measure storage size for {table}: {exc!r}")
        storage_size_mb = float('nan')

    # Row order must match metrics_schema.
    metrics_tc01x = [
        (
            "TC.01.x",
            datetime.now(),
            "Delta",
            "Tables",
            row_count,
            "Full Refresh",
            ingest_duration,
            tc01x_spinup_duration,
            storage_size_mb,
            "N/A",
            float('nan'),
            float(row_count), # using row_count in lieu of cu_used and match FloatType schema
            f"Initial load to {table} (Delta)"
        )
    ]
    spark.createDataFrame(metrics_tc01x, schema=metrics_schema).write.mode('append').saveAsTable(f'{target_lakehouse}.metrics')
    print(f"Delta initial load complete for {table} | Ingest time: {ingest_duration:.2f}s | Storage: {storage_size_mb:.2f} MB")
    metrics_logged = True

## TC.02.x: Initial Load to Warehouse Tables

Loads synthetic data into three Warehouse tables, preparing for future update strategy benchmarks.
Metrics are captured for `wh_table_refresh_load` only.
All initial rows will have `update_type = 'insert'`.

In [None]:
# TC.02.x: write the base dataset (df, read in the TC.01.x cell) to every
# Warehouse target table and append one metrics row (for the first table
# only) to the lakehouse metrics table.

# NOTE(review): this import appears to be needed for its side effect of making
# `synapsesql` available on the DataFrame writer; `Constants` itself is not
# referenced below - confirm against the Fabric Spark connector docs.
from com.microsoft.spark.fabric import Constants

# Ingest timing and write to each Warehouse target table
metrics_logged = False
for table in warehouse_tables:
    ingest_start = time.time()
    df.write.mode("overwrite").synapsesql(f"{target_warehouse}.dbo.{table}")
    ingest_end = time.time()
    ingest_duration = ingest_end - ingest_start

    # Metrics are recorded once, for the first table written.
    if metrics_logged:
        continue

    # Warehouse storage size usually not directly available, so it is
    # recorded as NaN.
    storage_size_mb = float('nan')

    # Row order must match metrics_schema.
    metrics_tc02x = [
        (
            "TC.02.x",
            datetime.now(),
            "Warehouse",
            "Tables",
            row_count,
            "Full Refresh",
            ingest_duration,
            float('nan'),  # spinup time N/A
            storage_size_mb,
            "N/A",
            float('nan'),
            float(row_count), # using row_count in lieu of cu_used and match FloatType schema
            f"Initial load to {table} (Warehouse)"
        )
    ]
    spark.createDataFrame(metrics_tc02x, schema=metrics_schema).write.mode('append').saveAsTable(f'{target_lakehouse}.metrics')
    print(f"Warehouse initial load complete for {table} | Ingest time: {ingest_duration:.2f}s")
    metrics_logged = True

In [None]:
# Final status banner: two lines, emitted with a single call.
print(
    "Completion",
    "Initial loads for Delta and Warehouse tables (TC.01.x & TC.02.x) completed and metrics logged.",
    sep="\n",
)