# PySpark Merge Performance Test

**Purpose:** Merge all 36 source tables into a single bronze_sales table

**Source:** `dbo.bronze_YYYY_MM` (36 tables)

**Target:** `dbo.pyspark_bronze_sales`

**Lakehouse:** sales_LH (schema-enabled)

**Comparison:** This will be compared against dbt Jobs performance

In [1]:
import time

# Configuration
SOURCE_SCHEMA = "dbo"
SOURCE_PREFIX = "bronze"
TARGET_TABLE = "dbo.pyspark_bronze_sales"
YEARS = [2023, 2024, 2025]
MONTHS = range(1, 13)

print("="*60)
print("PYSPARK MERGE PERFORMANCE TEST")
print("="*60)
print(f"Source: {SOURCE_SCHEMA}.{SOURCE_PREFIX}_YYYY_MM (36 tables)")
print(f"Target: {TARGET_TABLE}")
print("Method: PySpark DataFrame UNION")
print("="*60)

StatementMeta(, 1401965e-9a9a-4e18-b8ce-9bbfc89f2de3, 3, Finished, Available, Finished)

PYSPARK MERGE PERFORMANCE TEST
Source: dbo.bronze_YYYY_MM (36 tables)
Target: dbo.pyspark_bronze_sales
Method: PySpark DataFrame UNION


In [2]:
# PERFORMANCE TEST START
print("\nStarting performance test...")
start_time = time.time()

df_merged = None
tables_merged = 0

for year in YEARS:
    for month in MONTHS:
        # Table name with schema: dbo.bronze_YYYY_MM
        table_name = f"{SOURCE_SCHEMA}.{SOURCE_PREFIX}_{year}_{month:02d}"
        
        try:
            df = spark.table(table_name)
            
            if df_merged is None:
                df_merged = df
            else:
                df_merged = df_merged.union(df)
            
            tables_merged += 1
        except Exception as e:
            print(f"  Error reading {table_name}: {type(e).__name__}: {e}")

read_time = time.time() - start_time
print(f"\nRead & Union: {read_time:.2f} seconds ({tables_merged} tables)")

StatementMeta(, 1401965e-9a9a-4e18-b8ce-9bbfc89f2de3, 4, Finished, Available, Finished)


Starting performance test...

Read & Union: 36.95 seconds (36 tables)


In [3]:
# Write merged data to target table in dbo schema
write_start = time.time()

df_merged.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(TARGET_TABLE)

write_time = time.time() - write_start
total_time = time.time() - start_time

print(f"Write Time:   {write_time:.2f} seconds")

StatementMeta(, 1401965e-9a9a-4e18-b8ce-9bbfc89f2de3, 5, Finished, Available, Finished)

Write Time:   121.97 seconds


In [4]:
# Verify row count
row_count = spark.table(TARGET_TABLE).count()

print("\n" + "="*60)
print("PYSPARK MERGE - RESULTS")
print("="*60)
print(f"Source Tables:     {SOURCE_SCHEMA}.{SOURCE_PREFIX}_YYYY_MM")
print(f"Target Table:      {TARGET_TABLE}")
print(f"Tables Merged:     {tables_merged}")
print(f"Total Rows:        {row_count:,}")
print(f"")
print(f"Read/Union Time:   {read_time:.2f} seconds")
print(f"Write Time:        {write_time:.2f} seconds")
print(f"")
print(f">>> TOTAL TIME:    {total_time:.2f} seconds <<<")
print("="*60)

StatementMeta(, 1401965e-9a9a-4e18-b8ce-9bbfc89f2de3, 6, Finished, Available, Finished)


PYSPARK MERGE - RESULTS
Source Tables:     dbo.bronze_YYYY_MM
Target Table:      dbo.pyspark_bronze_sales
Tables Merged:     36
Total Rows:        3,600,000

Read/Union Time:   36.95 seconds
Write Time:        121.97 seconds

>>> TOTAL TIME:    160.38 seconds <<<


In [5]:
# Show sample
print(f"Sample data from {TARGET_TABLE}:")
spark.table(TARGET_TABLE).show(5)

StatementMeta(, 1401965e-9a9a-4e18-b8ce-9bbfc89f2de3, 7, Finished, Available, Finished)

Sample data from dbo.pyspark_bronze_sales:
+--------------+-----------+---------+-------------+----------+----------+----------+-----------+---------+----------+-----------------+----------+-----------+-----+-------+---------------+----------+--------+------------+-----------+----------+----------+---------------+------------+-----------------+---------------------+------------------+-----------------------+--------------+------------+-----------+----------+---------------+-------------+--------------+----------+
|      order_id|customer_id|driver_id|restaurant_id|order_date|order_time|order_year|order_month|order_day|order_hour|order_day_of_week|is_weekend|       city|state|country|restaurant_type|item_count|subtotal|delivery_fee|service_fee|tax_amount|tip_amount|discount_amount|total_amount|prep_time_minutes|delivery_time_minutes|total_time_minutes|delivery_distance_miles|payment_method|order_status|device_type|promo_code|customer_rating|driver_rating|is_first_order|is_reorder|
+----

In [6]:
# Verify distribution by year
print("Row distribution by year:")
spark.sql(f"""
    SELECT order_year, COUNT(*) as row_count
    FROM {TARGET_TABLE}
    GROUP BY order_year
    ORDER BY order_year
""").show()

StatementMeta(, 1401965e-9a9a-4e18-b8ce-9bbfc89f2de3, 8, Finished, Available, Finished)

Row distribution by year:
+----------+---------+
|order_year|row_count|
+----------+---------+
|      2023|  1200000|
|      2024|  1200000|
|      2025|  1200000|
+----------+---------+

