# 02 - Format Benchmark: CSV vs. Parquet vs. Delta Lake

**Objective:** Compare I/O performance and storage efficiency across different file formats.

This notebook measures:
1. **Read Performance**: Time to load data from each format
2. **Storage Efficiency**: Disk space used by each format
3. **Query Performance**: Aggregation speed (columnar vs. row-oriented)
4. **Filter Pushdown**: Predicate pushdown effectiveness
5. **Column Projection**: Cost of reading only a subset of columns

---

## Setup and Imports

In [None]:
# Add src directory to path
import sys
from pathlib import Path

# Resolve the project layout from the current working directory: the project
# root is its parent, and the importable sources live in "<root>/src".
notebook_dir = Path.cwd()
project_root = notebook_dir.parent
src_dir = project_root.joinpath("src")

# Index 0 so that src is searched before anything already on the import path.
sys.path.insert(0, str(src_dir))

for label, location in (("Project root", project_root), ("Src directory", src_dir)):
    print(f"{label}: {location}")

In [None]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import project modules
from config import (
    get_data_path,
    FACT_SALES_TABLE,
    DIM_CUSTOMERS_TABLE,
    SPARK_APP_NAME,
    PLOTS_DIR
)
from benchmark_utils import BenchmarkTimer, get_directory_size_mb, print_benchmark_summary

# Plot styling for the notebook: apply the seaborn "whitegrid" theme first,
# then set the default figure size to 12x6.
sns.set_theme(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✓ All imports successful")

## Initialize Spark Session

In [None]:
# Build (or reuse) a local Spark session with the Delta Lake SQL extension and
# catalog enabled, using all local cores.
spark_settings = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.driver.memory": "4g",
    "spark.sql.shuffle.partitions": "8",
}

session_builder = (
    SparkSession.builder
    .appName(f"{SPARK_APP_NAME} - Format Benchmark")
    .master("local[*]")
)
for setting_key, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_key, setting_value)

spark = session_builder.getOrCreate()

print(f"✓ Spark {spark.version} session initialized")
print(f"✓ App Name: {spark.sparkContext.appName}")

## Storage Size Comparison

First, let's compare the disk space used by each format.

In [None]:
# Calculate storage sizes for all formats
#
# One row is collected per (table, format) whose data path exists on disk.
# Paths that are missing are remembered and reported instead of being
# skipped silently, so an incomplete comparison is visible to the reader.
storage_columns = ['Table', 'Format', 'Size_MB']
storage_data = []
missing_paths = []

for table_name in [FACT_SALES_TABLE, DIM_CUSTOMERS_TABLE]:
    for fmt in ['csv', 'parquet', 'delta']:
        path = get_data_path(fmt, table_name)
        if not path.exists():
            missing_paths.append(path)
            continue
        storage_data.append({
            'Table': table_name,
            'Format': fmt.upper(),
            'Size_MB': get_directory_size_mb(path),
        })

# Create DataFrame. Passing the column names explicitly keeps the expected
# columns present even when no path was found, so code that selects
# storage_df['Table'] does not fail on an empty result.
storage_df = pd.DataFrame(storage_data, columns=storage_columns)

# Display results
print("="*60)
print("STORAGE SIZE COMPARISON")
print("="*60)
print(storage_df.to_string(index=False))
print("="*60)

for path in missing_paths:
    print(f"⚠ Skipped (path not found): {path}")

In [None]:
# Visualize storage comparison
#
# One bar chart per table, side by side. Colors are keyed by format name so a
# format that is missing from the data cannot shift another format's color.
format_colors = {'CSV': '#e74c3c', 'PARQUET': '#3498db', 'DELTA': '#2ecc71'}
fallback_color = '#95a5a6'
tables_to_plot = [FACT_SALES_TABLE, DIM_CUSTOMERS_TABLE]

if storage_df.empty:
    print("⚠ No storage data available - nothing to plot")
else:
    fig, axes = plt.subplots(1, len(tables_to_plot), figsize=(14, 5))

    for ax, table_name in zip(axes, tables_to_plot):
        table_data = storage_df[storage_df['Table'] == table_name]

        ax.set_title(f'{table_name} - Storage Size Comparison', fontsize=14, fontweight='bold')
        ax.set_ylabel('Size (MB)', fontsize=12)
        ax.set_xlabel('Format', fontsize=12)
        ax.grid(axis='y', alpha=0.3)

        if table_data.empty:
            # Leave the axes titled but empty when a table has no data on disk.
            continue

        ax.bar(
            table_data['Format'],
            table_data['Size_MB'],
            color=[format_colors.get(fmt, fallback_color) for fmt in table_data['Format']],
        )

        # Add value labels on bars. The offset is given in points rather than
        # data units so the label sits just above the bar at any size scale.
        for i, v in enumerate(table_data['Size_MB']):
            ax.annotate(
                f'{v:.1f}',
                xy=(i, v),
                xytext=(0, 3),
                textcoords='offset points',
                ha='center',
                va='bottom',
                fontweight='bold',
            )

    plt.tight_layout()

    plot_path = PLOTS_DIR / 'storage_comparison.png'
    # savefig does not create missing directories.
    plot_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"✓ Plot saved to: {plot_path}")

## Benchmark 1: Full Scan Read Performance

Measure how long it takes to read and count all records from each format.

In [None]:
# Benchmark 1 / CSV: time loading fact_sales from CSV (header row, inferred
# schema) followed by a row count.
with BenchmarkTimer(
    "Read CSV - fact_sales (Full Scan)",
    description="Load and count all records from CSV",
    spark=spark,
    clear_cache=True
) as timer:
    csv_reader = spark.read.options(header="true", inferSchema="true")
    csv_df = csv_reader.csv(str(get_data_path("csv", FACT_SALES_TABLE)))
    count = csv_df.count()
    print(f"Records: {count:,}")

In [None]:
# Benchmark 1 / Parquet: time loading fact_sales from Parquet followed by a
# row count.
with BenchmarkTimer(
    "Read Parquet - fact_sales (Full Scan)",
    description="Load and count all records from Parquet",
    spark=spark,
    clear_cache=True
) as timer:
    parquet_location = str(get_data_path("parquet", FACT_SALES_TABLE))
    parquet_df = spark.read.format("parquet").load(parquet_location)
    count = parquet_df.count()
    print(f"Records: {count:,}")

In [None]:
# Benchmark 1 / Delta: time loading fact_sales from the Delta table followed
# by a row count.
with BenchmarkTimer(
    "Read Delta - fact_sales (Full Scan)",
    description="Load and count all records from Delta",
    spark=spark,
    clear_cache=True
) as timer:
    delta_location = str(get_data_path("delta", FACT_SALES_TABLE))
    delta_df = spark.read.load(delta_location, format="delta")
    count = delta_df.count()
    print(f"Records: {count:,}")

## Benchmark 2: Columnar Aggregation Performance

Test aggregation performance to highlight the advantage of columnar formats.
We'll aggregate sales by product category.

In [None]:
# Benchmark 2 / CSV: time a per-category sum of amount and transaction count,
# sorted by category and collected to the driver.
with BenchmarkTimer(
    "CSV - Aggregation by Category",
    description="GroupBy product_category and sum amount",
    spark=spark,
    clear_cache=True
):
    csv_reader = spark.read.options(header="true", inferSchema="true")
    csv_df = csv_reader.csv(str(get_data_path("csv", FACT_SALES_TABLE)))
    category_totals = csv_df.groupBy("product_category").agg(
        F.sum("amount").alias("total_amount"),
        F.count("*").alias("num_transactions")
    )
    result = category_totals.orderBy("product_category").collect()
    print(f"Categories: {len(result)}")

In [None]:
# Benchmark 2 / Parquet: time a per-category sum of amount and transaction
# count, sorted by category and collected to the driver.
with BenchmarkTimer(
    "Parquet - Aggregation by Category",
    description="GroupBy product_category and sum amount",
    spark=spark,
    clear_cache=True
):
    parquet_location = str(get_data_path("parquet", FACT_SALES_TABLE))
    parquet_df = spark.read.format("parquet").load(parquet_location)
    category_totals = parquet_df.groupBy("product_category").agg(
        F.sum("amount").alias("total_amount"),
        F.count("*").alias("num_transactions")
    )
    result = category_totals.orderBy("product_category").collect()
    print(f"Categories: {len(result)}")

In [None]:
# Benchmark 2 / Delta: time a per-category sum of amount and transaction
# count, sorted by category and collected to the driver.
with BenchmarkTimer(
    "Delta - Aggregation by Category",
    description="GroupBy product_category and sum amount",
    spark=spark,
    clear_cache=True
):
    delta_location = str(get_data_path("delta", FACT_SALES_TABLE))
    delta_df = spark.read.load(delta_location, format="delta")
    category_totals = delta_df.groupBy("product_category").agg(
        F.sum("amount").alias("total_amount"),
        F.count("*").alias("num_transactions")
    )
    result = category_totals.orderBy("product_category").collect()
    print(f"Categories: {len(result)}")

## Benchmark 3: Filter Pushdown (Predicate Pushdown)

Test how well each format supports predicate pushdown optimization.
We'll filter for a single product category (`Electronics`) and count the matching rows.

In [None]:
# Benchmark 3 / CSV: time counting the rows whose product_category is
# "Electronics".
with BenchmarkTimer(
    "CSV - Filter Electronics Category",
    description="Filter product_category = 'Electronics' and count",
    spark=spark,
    clear_cache=True
):
    csv_reader = spark.read.options(header="true", inferSchema="true")
    csv_df = csv_reader.csv(str(get_data_path("csv", FACT_SALES_TABLE)))
    is_electronics = F.col("product_category") == "Electronics"
    filtered = csv_df.where(is_electronics)
    count = filtered.count()
    print(f"Filtered records: {count:,}")

In [None]:
# Benchmark 3 / Parquet: time counting the rows whose product_category is
# "Electronics" (expected to benefit from predicate pushdown).
with BenchmarkTimer(
    "Parquet - Filter Electronics Category",
    description="Filter product_category = 'Electronics' and count",
    spark=spark,
    clear_cache=True
):
    parquet_location = str(get_data_path("parquet", FACT_SALES_TABLE))
    parquet_df = spark.read.format("parquet").load(parquet_location)
    is_electronics = F.col("product_category") == "Electronics"
    filtered = parquet_df.where(is_electronics)
    count = filtered.count()
    print(f"Filtered records: {count:,}")

In [None]:
# Benchmark 3 / Delta: time counting the rows whose product_category is
# "Electronics" (expected to benefit from data skipping).
with BenchmarkTimer(
    "Delta - Filter Electronics Category",
    description="Filter product_category = 'Electronics' and count",
    spark=spark,
    clear_cache=True
):
    delta_location = str(get_data_path("delta", FACT_SALES_TABLE))
    delta_df = spark.read.load(delta_location, format="delta")
    is_electronics = F.col("product_category") == "Electronics"
    filtered = delta_df.where(is_electronics)
    count = filtered.count()
    print(f"Filtered records: {count:,}")

## Benchmark 4: Selective Column Read

Test columnar format advantage when reading only specific columns.

In [None]:
# Benchmark 4 / CSV: time projecting customer_id and amount, then counting rows.
# NOTE(review): count() does not need the selected column values, so Spark's
# optimizer may prune the projection and this may not measure a true
# two-column read - confirm with explain().
with BenchmarkTimer(
    "CSV - Select 2 Columns",
    description="Read only customer_id and amount columns",
    spark=spark,
    clear_cache=True
):
    csv_reader = spark.read.options(header="true", inferSchema="true")
    csv_df = csv_reader.csv(str(get_data_path("csv", FACT_SALES_TABLE)))
    two_columns = csv_df.select("customer_id", "amount")
    result = two_columns.count()
    print(f"Records: {result:,}")

In [None]:
# Benchmark 4 / Parquet: time projecting customer_id and amount, then counting
# rows (expected to be much faster than CSV).
# NOTE(review): count() does not need the selected column values, so Spark's
# optimizer may prune the projection and this may not measure a true
# two-column read - confirm with explain().
with BenchmarkTimer(
    "Parquet - Select 2 Columns",
    description="Read only customer_id and amount columns",
    spark=spark,
    clear_cache=True
):
    parquet_location = str(get_data_path("parquet", FACT_SALES_TABLE))
    parquet_df = spark.read.format("parquet").load(parquet_location)
    two_columns = parquet_df.select("customer_id", "amount")
    result = two_columns.count()
    print(f"Records: {result:,}")

In [None]:
# Benchmark 4 / Delta: time projecting customer_id and amount, then counting
# rows (expected to benefit from the columnar layout as well).
# NOTE(review): count() does not need the selected column values, so Spark's
# optimizer may prune the projection and this may not measure a true
# two-column read - confirm with explain().
with BenchmarkTimer(
    "Delta - Select 2 Columns",
    description="Read only customer_id and amount columns",
    spark=spark,
    clear_cache=True
):
    delta_location = str(get_data_path("delta", FACT_SALES_TABLE))
    delta_df = spark.read.load(delta_location, format="delta")
    two_columns = delta_df.select("customer_id", "amount")
    result = two_columns.count()
    print(f"Records: {result:,}")

## Results Analysis and Visualization

In [None]:
# Load and analyze benchmark results
from config import BENCHMARK_LOG_FILE
import csv

# Read the log file, keeping only successful runs of this notebook's benchmarks.
#
# Only the Full Scan test names mention the table name; the aggregation,
# filter and column-select names ("CSV - Aggregation by Category", ...) do
# not, so a plain 'fact_sales' substring filter would miss them. Match on the
# benchmark names used in this notebook instead.
format_benchmark_names = tuple(
    template.format(fmt=fmt)
    for fmt in ('CSV', 'Parquet', 'Delta')
    for template in (
        'Read {fmt} - fact_sales (Full Scan)',
        '{fmt} - Aggregation by Category',
        '{fmt} - Filter Electronics Category',
        '{fmt} - Select 2 Columns',
    )
)

benchmark_results = []
with open(BENCHMARK_LOG_FILE, 'r', encoding='utf-8', newline='') as f:
    for row in csv.DictReader(f):
        test_name = row.get('test_name') or ''
        is_format_benchmark = any(name in test_name for name in format_benchmark_names)
        if is_format_benchmark and row.get('status') == 'SUCCESS':
            benchmark_results.append(row)

# Create DataFrame for analysis. When nothing matched, fall back to an empty
# frame that still has the columns used below, so the cell reports "no data"
# instead of raising KeyError.
results_df = pd.DataFrame(benchmark_results)
if results_df.empty:
    results_df = pd.DataFrame(columns=['test_name', 'duration_seconds'])
    print("⚠ No successful format benchmark runs found in the log")
results_df['duration_seconds'] = results_df['duration_seconds'].astype(float)

# Display recent benchmarks
print("\nRecent Benchmark Results:")
print(results_df[['test_name', 'duration_seconds']].tail(12).to_string(index=False))

In [None]:
# Create performance comparison visualization
# Lookup tables used to classify a logged test name. Each table is scanned in
# order and the first marker found in the name wins.
_FORMAT_MARKERS = ('CSV', 'Parquet', 'Delta')
_TEST_TYPE_MARKERS = (
    ('Full Scan', 'Full Scan'),
    ('Aggregation', 'Aggregation'),
    ('Filter', 'Filter'),
    ('Select 2 Columns', 'Column Select'),
)


def extract_format_and_test(test_name):
    """Classify a benchmark test name by file format and benchmark type.

    Matching is by case-sensitive substring.

    Args:
        test_name: The test name string to classify.

    Returns:
        A ``(format, test_type)`` tuple. ``format`` is 'CSV', 'Parquet',
        'Delta' or 'Unknown'; ``test_type`` is 'Full Scan', 'Aggregation',
        'Filter', 'Column Select' or 'Other'.
    """
    fmt = next(
        (marker for marker in _FORMAT_MARKERS if marker in test_name),
        'Unknown',
    )
    test = next(
        (label for marker, label in _TEST_TYPE_MARKERS if marker in test_name),
        'Other',
    )
    return fmt, test

# Only use the most recent format benchmarks (last 12 entries)
recent_results = results_df.tail(12).copy()

# Classify every test name. Plain lists are assigned (rather than applying a
# function that returns pd.Series) so this also works when there are no rows.
parsed_names = [extract_format_and_test(name) for name in recent_results['test_name']]
recent_results['format'] = [fmt for fmt, _ in parsed_names]
recent_results['test_type'] = [test for _, test in parsed_names]

# Colors are keyed by format name. pivot_table sorts its columns
# alphabetically (CSV, Delta, Parquet), so a positional color list would give
# Parquet and Delta different colors here than in a chart that lists the
# formats as CSV, Parquet, Delta.
format_colors = {'CSV': '#e74c3c', 'Parquet': '#3498db', 'Delta': '#2ecc71'}

# Filter for comparison tests
comparison_tests = recent_results[
    recent_results['test_type'].isin(['Full Scan', 'Aggregation', 'Filter', 'Column Select'])
    & recent_results['format'].isin(list(format_colors))
]

if not comparison_tests.empty:
    # Pivot data for grouped bars. 'last' keeps the most recent run when the
    # same benchmark appears more than once in the window.
    pivot_data = comparison_tests.pivot_table(
        index='test_type',
        columns='format',
        values='duration_seconds',
        aggfunc='last'
    )
    formats_present = [fmt for fmt in format_colors if fmt in pivot_data.columns]
    pivot_data = pivot_data[formats_present]

    # Create grouped bar chart
    fig, ax = plt.subplots(figsize=(14, 6))
    pivot_data.plot(
        kind='bar',
        ax=ax,
        color=[format_colors[fmt] for fmt in formats_present],
        width=0.7,
    )

    ax.set_title('Format Performance Comparison - fact_sales', fontsize=16, fontweight='bold')
    ax.set_ylabel('Duration (seconds)', fontsize=12)
    ax.set_xlabel('Benchmark Type', fontsize=12)
    ax.legend(title='Format', fontsize=11)
    ax.grid(axis='y', alpha=0.3)
    plt.xticks(rotation=45, ha='right')

    plt.tight_layout()

    plot_path = PLOTS_DIR / 'format_performance_comparison.png'
    # savefig does not create missing directories.
    plot_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"✓ Plot saved to: {plot_path}")
else:
    print("⚠ Not enough data for visualization")

## Summary Report

In [None]:
# Print the benchmark summary produced by benchmark_utils, followed by a
# static list of the performance characteristics each format is expected to show.
print_benchmark_summary()

section_rule = "=" * 80
key_findings = """
Expected Performance Characteristics:

1. STORAGE EFFICIENCY:
   - Parquet/Delta: 50-70% smaller than CSV due to compression
   - Columnar format benefits: Better compression ratios

2. FULL SCAN PERFORMANCE:
   - CSV: Slowest (row-oriented, no compression)
   - Parquet/Delta: Faster (columnar, compressed)

3. AGGREGATION PERFORMANCE:
   - Columnar formats excel: Only read required columns
   - CSV must read all columns regardless

4. FILTER PUSHDOWN:
   - Parquet/Delta: Support predicate pushdown
   - Delta: Additional data skipping with statistics

5. COLUMN PROJECTION:
   - Parquet/Delta: Huge advantage when selecting few columns
   - CSV: Must read entire row even for single column
"""

print(f"\n{section_rule}")
print("KEY FINDINGS")
print(section_rule)
print(key_findings)
print(section_rule)
print("\n✓ Format benchmark completed!")
print("Next step: Run notebook 03_join_optimization.ipynb")
print(section_rule)