# 📓 2. Ingest Data
## Ingestion Module: Sequential Benchmark for TC01, TC07, TC13

### Ensure `DataSourceLakehouse` & `BenchmarkLakehouse` are connected as data sources before running.
- For TC13, data is written to a Warehouse from the notebook using the Spark connector for Fabric Data Warehouse (`synapsesql`).

In [None]:
import time
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, FloatType

In [None]:
# --- Benchmark configuration -------------------------------------------------
# Names of the Fabric items this notebook reads from and writes to.
source_lakehouse = "DataSourceLakehouse"
target_lakehouse = "BenchmarkLakehouse"
target_warehouse = "BenchmarkWarehouse"

# Number of rows in the generated base file; also used in file/table names below.
row_count = 10000  # Change as needed

# Source Parquet file in the source lakehouse's Files area (OneLake abfss URI).
input_path_parquet = (
    "abfss://FabricBenchmarking@onelake.dfs.fabric.microsoft.com/"
    f"{source_lakehouse}.Lakehouse/Files/base/base_{row_count}_parquet.parquet"
)

# Delta table written for TC07, and the path handed to the shortcut call.
delta_table_name = "target_table_delta"
delta_table_path = "/".join(["", "Tables", target_lakehouse, delta_table_name])

# Warehouse table written for TC13.
warehouse_table_name = "data_{}".format(row_count)

In [None]:
# Schema of one benchmark result row. Every column is nullable; the column
# order here must match the tuple order used when metric rows are built.
metrics_schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in (
        ("test_case_id", StringType),
        ("timestamp", TimestampType),
        ("format", StringType),
        ("location", StringType),
        ("rows", IntegerType),
        ("update_strategy", StringType),
        ("ingest_time_s", FloatType),
        ("spinup_time_s", FloatType),
        ("storage_size_mb", FloatType),
        ("query_type", StringType),
        ("query_time_s", FloatType),
        ("cu_used", FloatType),
        ("notes", StringType),
    )
])

# Create the metrics table in the target lakehouse on first run; later runs
# leave the existing table (and its rows) untouched.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {target_lakehouse}.metrics (
    test_case_id STRING,
    timestamp TIMESTAMP,
    format STRING,
    location STRING,
    rows INT,
    update_strategy STRING,
    ingest_time_s FLOAT,
    spinup_time_s FLOAT,
    storage_size_mb FLOAT,
    query_type STRING,
    query_time_s FLOAT,
    cu_used FLOAT,
    notes STRING
)
""")

## TC01: Lakehouse Parquet File Ingest (from DataSourceLakehouse to BenchmarkLakehouse)

In [None]:
# TC01: read the generated Parquet file from the source lakehouse, save it as a
# table in the target lakehouse, and append one row of timings to the metrics table.

# Spin-up: time spent obtaining the Spark session.
tc01_spinup_start = time.time()
spark = SparkSession.builder.getOrCreate()
tc01_spinup_end = time.time()
tc01_spinup_duration = tc01_spinup_end - tc01_spinup_start

# Ingest: read the source file and write it out as a table.
tc01_ingest_start = time.time()
try:
    df = spark.read.format("parquet").load(input_path_parquet)
except Exception as e:
    error_text = str(e)
    source_looks_missing = (
        "Path does not exist" in error_text
        or "not found" in error_text.lower()
        or ("Operation failed" in error_text and "Bad Request" in error_text)
    )
    if source_looks_missing:
        # Give a hint for the most likely cause before surfacing the error.
        print(f"Error: {input_path_parquet} not found. Did you run 1.GenerateData first?")
    # The error is always re-raised; later cells depend on this ingest having run.
    raise

df.write.mode("overwrite").saveAsTable(f"{target_lakehouse}.target_table_parquet")
tc01_ingest_end = time.time()
tc01_ingest_duration = tc01_ingest_end - tc01_ingest_start

# Storage size for TC01 (best effort: a failure here must not abort the benchmark).
# NOTE(review): confirm this path form resolves for the attached lakehouse; an
# abfss:// URI or a path relative to the default lakehouse may be required.
tc01_table_path = f"/lakehouse/{target_lakehouse}/Tables/target_table_parquet"
try:
    # The utilities module is exposed as `mssparkutils` (via `notebookutils`);
    # the previous `import msparkutils` always failed, so the size was always NaN.
    from notebookutils import mssparkutils
    # Only the entries directly under the table folder are summed.
    storage_files = mssparkutils.fs.ls(tc01_table_path)
    tc01_storage_size_mb = sum(entry.size for entry in storage_files) / (1024 * 1024)
except Exception as storage_error:
    # Report why the size is missing instead of silently recording NaN.
    print(f"Warning: could not measure storage size at {tc01_table_path}: {storage_error}")
    tc01_storage_size_mb = float('nan')

# One metrics row; field order matches metrics_schema.
metrics_tc01 = [
    (
        "TC01",
        datetime.now(),
        "Parquet",
        "Files",
        row_count,
        "Full Refresh",
        tc01_ingest_duration,
        tc01_spinup_duration,
        tc01_storage_size_mb,
        "N/A",
        float('nan'),  # query_time_s: not measured in this notebook
        float('nan'),  # cu_used: not measured in this notebook
        "No tabular access"
    )
]
spark.createDataFrame(metrics_tc01, schema=metrics_schema).write.mode('append').saveAsTable(f'{target_lakehouse}.metrics')

## TC07: Shortcut to Delta Table Ingest (from BenchmarkLakehouse table, shortcut created in BenchmarkWarehouse)

In [None]:
# TC07: write the source data as a Delta table in the target lakehouse, try to
# expose it to the warehouse through a shortcut, and log one metrics row.
# Requires the TC01 cell to have run (uses `spark`, tc01_ingest_duration and
# tc01_spinup_duration).

# Spin-up: (re)create the Delta table the shortcut will point at.
tc07_spinup_start = time.time()
# Ensure Delta table exists in BenchmarkLakehouse
df = spark.read.format("parquet").load(input_path_parquet)
df.write.mode("overwrite").format("delta").saveAsTable(f"{target_lakehouse}.{delta_table_name}")
tc07_spinup_end = time.time()
tc07_spinup_duration = tc07_spinup_end - tc07_spinup_start

# Ingest: shortcut creation plus metadata refresh. This is best effort; any
# failure is captured in the notes column rather than aborting the run.
tc07_ingest_start = time.time()
try:
    # The utilities module is exposed as `mssparkutils` (via `notebookutils`);
    # the previous `import msparkutils` always raised ModuleNotFoundError.
    from notebookutils import mssparkutils
    shortcut_name = "shortcut_to_delta"
    # NOTE(review): could not confirm that `warehouse.createShortcut` and
    # `warehouse.refreshShortcuts` exist in notebookutils; if they do not, the
    # failure is recorded in tc07_notes. Verify against the Fabric docs.
    mssparkutils.warehouse.createShortcut(
        shortcutName=shortcut_name,
        sourceLakehouse=target_lakehouse,
        sourcePath=delta_table_path,
        targetWarehouse=target_warehouse
    )
    shortcut_sync_start = time.time()
    mssparkutils.warehouse.refreshShortcuts(target_warehouse)
    shortcut_sync_end = time.time()
    tc07_notes = f"Shortcut created; Metadata sync delay: {shortcut_sync_end-shortcut_sync_start:.2f}s"
except Exception as e:
    tc07_notes = f"Shortcut creation failed: {str(e)}"
tc07_ingest_end = time.time()
tc07_ingest_duration = tc07_ingest_end - tc07_ingest_start

# Reported totals include the TC01 timings on top of this cell's own timings.
tc07_total_ingest_duration = tc01_ingest_duration + tc07_ingest_duration
tc07_total_spinup_duration = tc01_spinup_duration + tc07_spinup_duration

# One metrics row; field order matches metrics_schema.
metrics_tc07 = [
    (
        "TC07",
        datetime.now(),
        "Shortcut to Delta",
        "Tables",
        row_count,
        "Full Refresh",
        tc07_total_ingest_duration,
        tc07_total_spinup_duration,
        float('nan'),  # storage_size_mb: not measured for TC07
        "N/A",
        float('nan'),  # query_time_s: not measured in this notebook
        float('nan'),  # cu_used: not measured in this notebook
        tc07_notes
    )
]
spark.createDataFrame(metrics_tc07, schema=metrics_schema).write.mode('append').saveAsTable(f'{target_lakehouse}.metrics')

## TC13: Warehouse Table Ingest (physical copy from BenchmarkLakehouse Delta to BenchmarkWarehouse via T-SQL)

This cell writes the DataFrame to the Warehouse using the Spark connector (`synapsesql`).

In [None]:
# Using spark connector to warehouse
from com.microsoft.spark.fabric import Constants
from pyspark.sql.functions import col

# Make sure timestamps have time zone
for c in df.columns:
    if dict(df.dtypes)[c] == "timestamp_ntz":
        df = df.withColumn(c, col(c).cast("timestamp"))

# Write data from a DataFrame to the warehouse
df.write.mode("overwrite").synapsesql(f"{target_warehouse}.dbo.{warehouse_table_name}")

## Verify the data written to the warehouse
# df_verify = spark.read.synapsesql(f"{target_warehouse}.dbo.{warehouse_table_name}")
#display(df_verify.limit(10))

In [None]:
# After running the T-SQL cell, log metrics for TC13
tc13_spinup_start = time.time()  # Optionally, set this before running T-SQL if timing is needed
tc13_spinup_end = time.time()
tc13_spinup_duration = tc13_spinup_end - tc13_spinup_start
tc13_ingest_duration = tc13_spinup_end - tc13_spinup_start

tc13_total_ingest_duration = tc01_ingest_duration + tc13_ingest_duration
tc13_total_spinup_duration = tc01_spinup_duration + tc13_spinup_duration

metrics_tc13 = [
    (
        "TC13",
        datetime.now(),
        "Warehouse",
        "Tables",
        row_count,
        "Full Refresh",
        tc13_total_ingest_duration,
        tc13_total_spinup_duration,
        float('nan'),
        "N/A",
        float('nan'),
        float('nan'),
        "Physical copy to SQL table via T-SQL"
    )
]
spark.createDataFrame(metrics_tc13, schema=metrics_schema).write.mode('append').saveAsTable(f'{target_lakehouse}.metrics')

print("Completion")
print("All three scenarios (TC01, TC07, TC13) completed and metrics logged.")