# Data Engine Performance Comparison

This notebook compares the performance of **pandas**, **Polars**, and **DuckDB** across identical operations to help choose the best engine for different use cases.

## Test Scenarios
1. **Data Loading**: CSV reading performance
2. **Basic Operations**: Filtering, sorting, grouping
3. **Aggregations**: Complex groupby operations
4. **Joins**: Inner and left joins on large datasets
5. **Window Functions**: Ranking and moving averages
6. **Memory Usage**: Change in process memory (RSS) recorded around every operation
7. **ML Integration**: Feature engineering pipeline

## Decision Matrix
Based on results, we'll provide recommendations for:
- Small datasets (< 100K rows)
- Medium datasets (100K - 1M rows)
- Large datasets (> 1M rows)
- Memory-constrained environments
- SQL-heavy workflows
- ML pipeline integration

In [None]:
import pandas as pd
import polars as pl
import duckdb
import numpy as np
import time
import psutil
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Memory monitoring
import tracemalloc
from memory_profiler import profile

# Plotting defaults used by the charts further down
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Record which library versions this benchmark ran against
print("Package versions:")
for label, module in (("pandas", pd), ("polars", pl), ("duckdb", duckdb)):
    print(f"{label}: {module.__version__}")

## Test Data Generation

We'll create identical datasets for fair comparison across engines.

In [None]:
def generate_test_data(n_rows: int, save_path: str = None) -> dict:
    """
    Generate one synthetic transactions dataset and expose it to every engine.

    NumPy's global random state is re-seeded with 42 on each call, so the
    random columns are reproducible for a given ``n_rows``.

    Args:
        n_rows: Number of rows to generate
        save_path: Optional path of the CSV file to write. Missing parent
            directories are created first. Nothing is written when the value
            is falsy.

    Returns:
        Dict with the keys 'pandas' (pandas DataFrame), 'polars' (Polars
        DataFrame built from the same column data) and 'csv_path' (the
        ``save_path`` argument, unchanged)
    """
    np.random.seed(42)

    # Generate base data
    data = {
        'customer_id': np.random.randint(1, 100000, n_rows),
        'transaction_id': [f'TXN_{i:08d}' for i in range(n_rows)],
        'amount': np.random.exponential(50, n_rows).round(2),
        'merchant_category': np.random.choice(['grocery', 'gas', 'restaurant', 'retail', 'online'], n_rows),
        'transaction_date': pd.date_range('2023-01-01', periods=n_rows, freq='1min'),
        'is_fraud': np.random.choice([0, 1], n_rows, p=[0.99, 0.01]),
        'account_balance': np.random.normal(5000, 2000, n_rows).round(2),
        'merchant_id': np.random.randint(1, 10000, n_rows),
        'payment_method': np.random.choice(['credit', 'debit', 'cash'], n_rows),
        'location_lat': np.random.uniform(25, 50, n_rows),
        'location_lon': np.random.uniform(-125, -65, n_rows)
    }

    # Create pandas DataFrame
    df_pandas = pd.DataFrame(data)

    # Save to CSV for consistent loading
    if save_path:
        # to_csv does not create directories, so make sure the target exists
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        df_pandas.to_csv(save_path, index=False)

    # Create Polars DataFrame
    df_polars = pl.DataFrame(data)

    return {
        'pandas': df_pandas,
        'polars': df_polars,
        'csv_path': save_path
    }

# Row counts to benchmark, smallest first
sizes = [10000, 100000, 1000000]  # 10K, 100K, 1M rows
datasets = {}

print("Generating test datasets...")
for size in sizes:
    # Each size gets its own CSV so the loading test can read it back later
    csv_path = f"../data/test_data_{size}.csv"
    datasets[size] = generate_test_data(size, csv_path)
    print(f"Generated {size:,} rows dataset")

# Report the in-memory footprint of the pandas copy of each dataset
print("\nDataset sizes (MB):")
for size, bundle in datasets.items():
    df = bundle['pandas']
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2
    print(f"{size:,} rows: {memory_mb:.2f} MB")

## Performance Measurement Framework

In [None]:
class PerformanceTracker:
    """
    Track execution time and memory usage for operations.

    Each call to ``measure_operation`` appends one record (a dict with the
    keys listed in ``RESULT_COLUMNS``) to ``self.results``.
    """

    # Keys of every stored record, in the order they are written
    RESULT_COLUMNS = (
        'operation',
        'engine',
        'execution_time',
        'initial_memory_mb',
        'peak_memory_mb',
        'memory_delta_mb',
        'success',
        'error',
    )

    def __init__(self):
        self.results = []

    def measure_operation(self, operation_name: str, engine: str, operation_func, *args, **kwargs):
        """
        Measure time and memory for an operation.

        Args:
            operation_name: Label stored in the 'operation' field
            engine: Label stored in the 'engine' field
            operation_func: Callable to run; receives ``*args`` and ``**kwargs``

        Returns:
            Whatever ``operation_func`` returns, or None if it raised. A
            failure is recorded with ``success=False`` and printed rather
            than re-raised, so one failing engine does not stop the run.
        """
        # Get initial memory
        process = psutil.Process()
        initial_memory = process.memory_info().rss / 1024**2  # MB

        # perf_counter is monotonic and high resolution, unlike wall-clock time.time
        start_time = time.perf_counter()

        try:
            # Execute operation
            result = operation_func(*args, **kwargs)
            execution_time = time.perf_counter() - start_time
        except Exception as e:
            execution_time = time.perf_counter() - start_time

            self.results.append({
                'operation': operation_name,
                'engine': engine,
                'execution_time': execution_time,
                'initial_memory_mb': initial_memory,
                'peak_memory_mb': initial_memory,
                'memory_delta_mb': 0,
                'success': False,
                'error': str(e)
            })

            print(f"Error in {engine} {operation_name}: {e}")
            return None

        # NOTE: this is the RSS sampled once after the call finished, not a
        # true high-water mark; the key name is kept for compatibility.
        peak_memory = process.memory_info().rss / 1024**2  # MB
        memory_delta = peak_memory - initial_memory

        # Store results
        self.results.append({
            'operation': operation_name,
            'engine': engine,
            'execution_time': execution_time,
            'initial_memory_mb': initial_memory,
            'peak_memory_mb': peak_memory,
            'memory_delta_mb': memory_delta,
            'success': True,
            'error': None
        })

        return result

    def get_results_df(self):
        """
        Return results as pandas DataFrame.

        The frame always has the ``RESULT_COLUMNS`` columns, even when no
        operation has been recorded yet, so column lookups never fail.
        """
        return pd.DataFrame(self.results, columns=list(self.RESULT_COLUMNS))

    def clear_results(self):
        """Clear stored results."""
        self.results = []

# Initialize tracker
tracker = PerformanceTracker()

print("Performance tracking framework initialized")

## Test 1: Data Loading Performance

In [None]:
def test_data_loading(size: int):
    """
    Test CSV loading performance across engines.

    Reads ``../data/test_data_{size}.csv`` once per engine through the shared
    ``tracker``. pandas and Polars are asked to parse the date column so that
    all three engines return typed data (DuckDB's ``read_csv_auto`` infers
    column types itself); later tests use ``.dt`` accessors on that column.

    Args:
        size: Row count of the dataset, used to build the CSV file name

    Returns:
        Tuple ``(df_pandas, df_polars, df_duckdb)``; an entry is None when
        that engine's load failed (the tracker records the error). The
        DuckDB entry is a pandas DataFrame produced by ``fetchdf()``.
    """
    csv_path = f"../data/test_data_{size}.csv"

    print(f"\nTesting data loading for {size:,} rows...")

    # pandas
    def load_pandas():
        return pd.read_csv(csv_path, parse_dates=['transaction_date'])

    df_pandas = tracker.measure_operation(f'load_csv_{size}', 'pandas', load_pandas)

    # Polars
    def load_polars():
        return pl.read_csv(csv_path, try_parse_dates=True)

    df_polars = tracker.measure_operation(f'load_csv_{size}', 'polars', load_polars)

    # DuckDB
    def load_duckdb():
        # The context manager closes the connection once the frame is fetched
        with duckdb.connect() as conn:
            return conn.execute(f"SELECT * FROM read_csv_auto('{csv_path}')").fetchdf()

    df_duckdb = tracker.measure_operation(f'load_csv_{size}', 'duckdb', load_duckdb)

    return df_pandas, df_polars, df_duckdb

# Test loading for all dataset sizes
loaded_data = {}
for size in sizes:
    loaded_data[size] = test_data_loading(size)

print("\nData loading tests completed")

## Test 2: Basic Operations

In [None]:
def test_basic_operations(size: int):
    """
    Test basic operations: filtering, sorting, selecting.

    Runs each operation once per engine on the frames stored in
    ``loaded_data[size]`` and records the measurements in ``tracker``.

    Args:
        size: Row count key of the dataset in ``loaded_data``
    """
    print(f"\nTesting basic operations for {size:,} rows...")

    # Get data
    df_pandas, df_polars, df_duckdb = loaded_data[size]

    def run_duckdb(sql: str):
        """Run ``sql`` on a fresh connection and return the result frame."""
        # Register the frame explicitly: the SQL names a variable that lives
        # in this enclosing function, not in the frame that calls execute().
        with duckdb.connect() as conn:
            conn.register('df_duckdb', df_duckdb)
            return conn.execute(sql).fetchdf()

    # Test 1: Filter operation
    print("  Testing filtering...")

    # pandas filter
    def pandas_filter():
        return df_pandas[df_pandas['amount'] > 100]

    tracker.measure_operation(f'filter_{size}', 'pandas', pandas_filter)

    # Polars filter
    def polars_filter():
        return df_polars.filter(pl.col('amount') > 100)

    tracker.measure_operation(f'filter_{size}', 'polars', polars_filter)

    # DuckDB filter
    def duckdb_filter():
        return run_duckdb("SELECT * FROM df_duckdb WHERE amount > 100")

    tracker.measure_operation(f'filter_{size}', 'duckdb', duckdb_filter)

    # Test 2: Sort operation
    print("  Testing sorting...")

    # pandas sort
    def pandas_sort():
        return df_pandas.sort_values(['amount', 'transaction_date'])

    tracker.measure_operation(f'sort_{size}', 'pandas', pandas_sort)

    # Polars sort
    def polars_sort():
        return df_polars.sort(['amount', 'transaction_date'])

    tracker.measure_operation(f'sort_{size}', 'polars', polars_sort)

    # DuckDB sort
    def duckdb_sort():
        return run_duckdb("SELECT * FROM df_duckdb ORDER BY amount, transaction_date")

    tracker.measure_operation(f'sort_{size}', 'duckdb', duckdb_sort)

    # Test 3: Column selection and transformation
    print("  Testing column operations...")

    # pandas select and transform
    def pandas_select():
        return df_pandas[['customer_id', 'amount', 'merchant_category']].assign(
            amount_log=lambda x: np.log1p(x['amount'])
        )

    tracker.measure_operation(f'select_transform_{size}', 'pandas', pandas_select)

    # Polars select and transform
    def polars_select():
        return df_polars.select([
            pl.col('customer_id'),
            pl.col('amount'),
            pl.col('merchant_category'),
            pl.col('amount').log1p().alias('amount_log')
        ])

    tracker.measure_operation(f'select_transform_{size}', 'polars', polars_select)

    # DuckDB select and transform
    def duckdb_select():
        return run_duckdb("""
            SELECT 
                customer_id,
                amount,
                merchant_category,
                ln(1 + amount) as amount_log
            FROM df_duckdb
        """)

    tracker.measure_operation(f'select_transform_{size}', 'duckdb', duckdb_select)

# Test basic operations for all sizes
for size in sizes:
    test_basic_operations(size)

print("\nBasic operations tests completed")

## Test 3: Aggregation Operations

In [None]:
def test_aggregations(size: int):
    """
    Test aggregation operations: groupby, multiple aggregations.

    Runs a single-key and a two-key aggregation once per engine on the frames
    stored in ``loaded_data[size]`` and records the measurements in
    ``tracker``.

    Args:
        size: Row count key of the dataset in ``loaded_data``
    """
    print(f"\nTesting aggregations for {size:,} rows...")

    # Get data
    df_pandas, df_polars, df_duckdb = loaded_data[size]

    def run_duckdb(sql: str):
        """Run ``sql`` on a fresh connection and return the result frame."""
        # Register the frame explicitly: the SQL names a variable that lives
        # in this enclosing function, not in the frame that calls execute().
        with duckdb.connect() as conn:
            conn.register('df_duckdb', df_duckdb)
            return conn.execute(sql).fetchdf()

    # Test 1: Simple groupby
    print("  Testing simple groupby...")

    # pandas groupby
    def pandas_groupby():
        return df_pandas.groupby('merchant_category')['amount'].agg(['mean', 'sum', 'count'])

    tracker.measure_operation(f'groupby_simple_{size}', 'pandas', pandas_groupby)

    # Polars groupby
    def polars_groupby():
        return df_polars.group_by('merchant_category').agg([
            pl.col('amount').mean().alias('amount_mean'),
            pl.col('amount').sum().alias('amount_sum'),
            pl.col('amount').count().alias('amount_count')
        ])

    tracker.measure_operation(f'groupby_simple_{size}', 'polars', polars_groupby)

    # DuckDB groupby
    def duckdb_groupby():
        return run_duckdb("""
            SELECT 
                merchant_category,
                AVG(amount) as amount_mean,
                SUM(amount) as amount_sum,
                COUNT(amount) as amount_count
            FROM df_duckdb 
            GROUP BY merchant_category
        """)

    tracker.measure_operation(f'groupby_simple_{size}', 'duckdb', duckdb_groupby)

    # Test 2: Complex groupby with multiple columns
    print("  Testing complex groupby...")

    # pandas complex groupby
    def pandas_complex_groupby():
        return df_pandas.groupby(['merchant_category', 'payment_method']).agg({
            'amount': ['mean', 'std', 'min', 'max'],
            'is_fraud': 'sum',
            'customer_id': 'nunique'
        })

    tracker.measure_operation(f'groupby_complex_{size}', 'pandas', pandas_complex_groupby)

    # Polars complex groupby
    def polars_complex_groupby():
        return df_polars.group_by(['merchant_category', 'payment_method']).agg([
            pl.col('amount').mean().alias('amount_mean'),
            pl.col('amount').std().alias('amount_std'),
            pl.col('amount').min().alias('amount_min'),
            pl.col('amount').max().alias('amount_max'),
            pl.col('is_fraud').sum().alias('fraud_sum'),
            pl.col('customer_id').n_unique().alias('unique_customers')
        ])

    tracker.measure_operation(f'groupby_complex_{size}', 'polars', polars_complex_groupby)

    # DuckDB complex groupby
    def duckdb_complex_groupby():
        return run_duckdb("""
            SELECT 
                merchant_category,
                payment_method,
                AVG(amount) as amount_mean,
                STDDEV(amount) as amount_std,
                MIN(amount) as amount_min,
                MAX(amount) as amount_max,
                SUM(is_fraud) as fraud_sum,
                COUNT(DISTINCT customer_id) as unique_customers
            FROM df_duckdb 
            GROUP BY merchant_category, payment_method
        """)

    tracker.measure_operation(f'groupby_complex_{size}', 'duckdb', duckdb_complex_groupby)

# Test aggregations for all sizes
for size in sizes:
    test_aggregations(size)

print("\nAggregation tests completed")

## Test 4: Join Operations

In [None]:
def test_joins(size: int):
    """
    Test join operations with lookup tables.

    Builds a 10,000-row merchant lookup table (NumPy's global random state is
    re-seeded with 42 first), then measures an inner join and a left join
    followed by an aggregation, once per engine, on the frames stored in
    ``loaded_data[size]``.

    Args:
        size: Row count key of the dataset in ``loaded_data``
    """
    print(f"\nTesting joins for {size:,} rows...")

    # Get data
    df_pandas, df_polars, df_duckdb = loaded_data[size]

    # Create lookup table
    np.random.seed(42)
    lookup_data = {
        'merchant_id': range(1, 10001),
        'merchant_name': [f'Merchant_{i}' for i in range(1, 10001)],
        'merchant_city': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'], 10000),
        'merchant_type': np.random.choice(['chain', 'local', 'franchise'], 10000)
    }

    lookup_pandas = pd.DataFrame(lookup_data)
    lookup_polars = pl.DataFrame(lookup_data)
    lookup_duckdb = pd.DataFrame(lookup_data)  # For DuckDB registration

    def run_duckdb(sql: str):
        """Run ``sql`` on a fresh connection and return the result frame."""
        # Register both frames explicitly: the SQL names variables that live
        # in this enclosing function, not in the frame that calls execute().
        with duckdb.connect() as conn:
            conn.register('df_duckdb', df_duckdb)
            conn.register('lookup_duckdb', lookup_duckdb)
            return conn.execute(sql).fetchdf()

    # Test 1: Inner join
    print("  Testing inner join...")

    # pandas inner join
    def pandas_inner_join():
        return df_pandas.merge(lookup_pandas, on='merchant_id', how='inner')

    tracker.measure_operation(f'inner_join_{size}', 'pandas', pandas_inner_join)

    # Polars inner join
    def polars_inner_join():
        return df_polars.join(lookup_polars, on='merchant_id', how='inner')

    tracker.measure_operation(f'inner_join_{size}', 'polars', polars_inner_join)

    # DuckDB inner join
    def duckdb_inner_join():
        return run_duckdb("""
            SELECT t.*, l.merchant_name, l.merchant_city, l.merchant_type
            FROM df_duckdb t
            INNER JOIN lookup_duckdb l ON t.merchant_id = l.merchant_id
        """)

    tracker.measure_operation(f'inner_join_{size}', 'duckdb', duckdb_inner_join)

    # Test 2: Left join with aggregation
    print("  Testing left join with aggregation...")

    # pandas left join + aggregation
    def pandas_left_join_agg():
        joined = df_pandas.merge(lookup_pandas, on='merchant_id', how='left')
        return joined.groupby('merchant_city')['amount'].agg(['sum', 'mean', 'count'])

    tracker.measure_operation(f'left_join_agg_{size}', 'pandas', pandas_left_join_agg)

    # Polars left join + aggregation
    def polars_left_join_agg():
        return (df_polars
                .join(lookup_polars, on='merchant_id', how='left')
                .group_by('merchant_city')
                .agg([
                    pl.col('amount').sum().alias('amount_sum'),
                    pl.col('amount').mean().alias('amount_mean'),
                    pl.col('amount').count().alias('amount_count')
                ]))

    tracker.measure_operation(f'left_join_agg_{size}', 'polars', polars_left_join_agg)

    # DuckDB left join + aggregation
    def duckdb_left_join_agg():
        return run_duckdb("""
            SELECT 
                l.merchant_city,
                SUM(t.amount) as amount_sum,
                AVG(t.amount) as amount_mean,
                COUNT(t.amount) as amount_count
            FROM df_duckdb t
            LEFT JOIN lookup_duckdb l ON t.merchant_id = l.merchant_id
            GROUP BY l.merchant_city
        """)

    tracker.measure_operation(f'left_join_agg_{size}', 'duckdb', duckdb_left_join_agg)

# Test joins for all sizes (skip largest for memory constraints)
for size in sizes[:2]:  # Only test on smaller datasets for joins
    test_joins(size)

print("\nJoin tests completed")

## Test 5: Window Functions

In [None]:
def test_window_functions(size: int):
    """
    Test window functions: ranking, moving averages.

    Measures a dense rank of ``amount`` within each merchant category and a
    7-row moving average of ``amount`` per customer, once per engine, on the
    frames stored in ``loaded_data[size]``.

    Args:
        size: Row count key of the dataset in ``loaded_data``
    """
    print(f"\nTesting window functions for {size:,} rows...")

    # Get data
    df_pandas, df_polars, df_duckdb = loaded_data[size]

    def run_duckdb(sql: str):
        """Run ``sql`` on a fresh connection and return the result frame."""
        # Register the frame explicitly: the SQL names a variable that lives
        # in this enclosing function, not in the frame that calls execute().
        with duckdb.connect() as conn:
            conn.register('df_duckdb', df_duckdb)
            return conn.execute(sql).fetchdf()

    # Test 1: Ranking within groups
    print("  Testing ranking...")

    # pandas ranking
    def pandas_ranking():
        return df_pandas.assign(
            amount_rank=df_pandas.groupby('merchant_category')['amount'].rank(method='dense', ascending=False)
        )

    tracker.measure_operation(f'ranking_{size}', 'pandas', pandas_ranking)

    # Polars ranking
    def polars_ranking():
        return df_polars.with_columns(
            pl.col('amount').rank(method='dense', descending=True)
            .over('merchant_category')
            .alias('amount_rank')
        )

    tracker.measure_operation(f'ranking_{size}', 'polars', polars_ranking)

    # DuckDB ranking
    def duckdb_ranking():
        return run_duckdb("""
            SELECT *,
                DENSE_RANK() OVER (PARTITION BY merchant_category ORDER BY amount DESC) as amount_rank
            FROM df_duckdb
        """)

    tracker.measure_operation(f'ranking_{size}', 'duckdb', duckdb_ranking)

    # Test 2: Moving averages
    print("  Testing moving averages...")

    # pandas moving average
    def pandas_moving_avg():
        sorted_df = df_pandas.sort_values(['customer_id', 'transaction_date'])
        rolling_mean = (
            sorted_df.groupby('customer_id')['amount']
            .rolling(window=7, min_periods=1)
            .mean()
            # Drop only the group level so the original row labels remain;
            # assign() aligns on the index, and a fresh 0..n-1 index would
            # attach each average to the wrong row of the sorted frame.
            .reset_index(level=0, drop=True)
        )
        return sorted_df.assign(amount_ma_7=rolling_mean)

    tracker.measure_operation(f'moving_avg_{size}', 'pandas', pandas_moving_avg)

    # Polars moving average
    def polars_moving_avg():
        return (df_polars
                .sort(['customer_id', 'transaction_date'])
                .with_columns(
                    pl.col('amount').rolling_mean(window_size=7, min_periods=1)
                    .over('customer_id')
                    .alias('amount_ma_7')
                ))

    tracker.measure_operation(f'moving_avg_{size}', 'polars', polars_moving_avg)

    # DuckDB moving average
    def duckdb_moving_avg():
        return run_duckdb("""
            SELECT *,
                AVG(amount) OVER (
                    PARTITION BY customer_id 
                    ORDER BY transaction_date 
                    ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
                ) as amount_ma_7
            FROM df_duckdb
            ORDER BY customer_id, transaction_date
        """)

    tracker.measure_operation(f'moving_avg_{size}', 'duckdb', duckdb_moving_avg)

# Test window functions for smaller datasets
for size in sizes[:2]:  # Only test on smaller datasets
    test_window_functions(size)

print("\nWindow function tests completed")

## Test 6: ML Pipeline Integration

In [None]:
def test_ml_integration(size: int):
    """
    Test ML pipeline integration: feature engineering, preprocessing.

    Builds the same feature set once per engine on the frames stored in
    ``loaded_data[size]``: time features, log and z-score of ``amount``,
    one-hot encoded categories and per-customer aggregates.

    Args:
        size: Row count key of the dataset in ``loaded_data``

    Returns:
        Tuple ``(result_pandas, result_polars, result_duckdb)``; an entry is
        None when that engine's pipeline failed (the tracker records the
        error).
    """
    print(f"\nTesting ML integration for {size:,} rows...")

    # Get data
    df_pandas, df_polars, df_duckdb = loaded_data[size]

    # Feature engineering pipeline
    print("  Testing feature engineering...")

    # pandas feature engineering
    def pandas_feature_engineering():
        df = df_pandas.copy()

        # A CSV round trip can leave the dates as strings, which breaks the
        # .dt accessor below; this leaves an already-datetime column as is.
        df['transaction_date'] = pd.to_datetime(df['transaction_date'])

        # Create features
        df['hour'] = df['transaction_date'].dt.hour
        df['day_of_week'] = df['transaction_date'].dt.dayofweek
        df['is_weekend'] = df['day_of_week'].isin([5, 6])
        df['amount_log'] = np.log1p(df['amount'])
        df['amount_normalized'] = (df['amount'] - df['amount'].mean()) / df['amount'].std()

        # Categorical encoding
        df = pd.get_dummies(df, columns=['merchant_category', 'payment_method'], prefix=['cat', 'pay'])

        # Customer aggregations
        customer_stats = df.groupby('customer_id')['amount'].agg(['mean', 'std', 'count']).add_prefix('customer_')
        df = df.merge(customer_stats, left_on='customer_id', right_index=True)

        return df

    result_pandas = tracker.measure_operation(f'ml_features_{size}', 'pandas', pandas_feature_engineering)

    # Polars feature engineering
    def polars_feature_engineering():
        base = df_polars

        # Same guard as the pandas pipeline: parse string dates before .dt
        if base.schema['transaction_date'] == pl.Utf8:
            base = base.with_columns(pl.col('transaction_date').str.to_datetime())

        # Create base features
        df = base.with_columns([
            pl.col('transaction_date').dt.hour().alias('hour'),
            pl.col('transaction_date').dt.weekday().alias('day_of_week'),
            pl.col('amount').log1p().alias('amount_log'),
            ((pl.col('amount') - pl.col('amount').mean()) / pl.col('amount').std()).alias('amount_normalized')
        ]).with_columns(
            pl.col('day_of_week').is_in([6, 7]).alias('is_weekend')
        )

        # One-hot encoding
        df = df.to_dummies(['merchant_category', 'payment_method'])

        # Customer aggregations
        customer_stats = df.group_by('customer_id').agg([
            pl.col('amount').mean().alias('customer_mean'),
            pl.col('amount').std().alias('customer_std'),
            pl.col('amount').count().alias('customer_count')
        ])

        df = df.join(customer_stats, on='customer_id')

        return df

    result_polars = tracker.measure_operation(f'ml_features_{size}', 'polars', polars_feature_engineering)

    # DuckDB feature engineering
    def duckdb_feature_engineering():
        # One connection for both statements (the temp table lives in it);
        # the context manager closes it once the result has been fetched.
        with duckdb.connect() as conn:
            # Register the frame explicitly: the SQL names a variable that
            # lives in the enclosing function, not in this frame.
            conn.register('df_duckdb', df_duckdb)

            # Create customer stats first
            conn.execute("""
                CREATE TEMP TABLE customer_stats AS
                SELECT 
                    customer_id,
                    AVG(amount) as customer_mean,
                    STDDEV(amount) as customer_std,
                    COUNT(amount) as customer_count
                FROM df_duckdb
                GROUP BY customer_id
            """)

            # Main feature engineering query
            return conn.execute("""
                SELECT 
                    t.*,
                    EXTRACT(hour FROM transaction_date) as hour,
                    EXTRACT(dow FROM transaction_date) as day_of_week,
                    CASE WHEN EXTRACT(dow FROM transaction_date) IN (6, 0) THEN true ELSE false END as is_weekend,
                    LN(1 + amount) as amount_log,
                    (amount - (SELECT AVG(amount) FROM df_duckdb)) / (SELECT STDDEV(amount) FROM df_duckdb) as amount_normalized,
                    CASE WHEN merchant_category = 'grocery' THEN 1 ELSE 0 END as cat_grocery,
                    CASE WHEN merchant_category = 'gas' THEN 1 ELSE 0 END as cat_gas,
                    CASE WHEN merchant_category = 'restaurant' THEN 1 ELSE 0 END as cat_restaurant,
                    CASE WHEN merchant_category = 'retail' THEN 1 ELSE 0 END as cat_retail,
                    CASE WHEN merchant_category = 'online' THEN 1 ELSE 0 END as cat_online,
                    CASE WHEN payment_method = 'credit' THEN 1 ELSE 0 END as pay_credit,
                    CASE WHEN payment_method = 'debit' THEN 1 ELSE 0 END as pay_debit,
                    CASE WHEN payment_method = 'cash' THEN 1 ELSE 0 END as pay_cash,
                    cs.customer_mean,
                    cs.customer_std,
                    cs.customer_count
                FROM df_duckdb t
                LEFT JOIN customer_stats cs ON t.customer_id = cs.customer_id
            """).fetchdf()

    result_duckdb = tracker.measure_operation(f'ml_features_{size}', 'duckdb', duckdb_feature_engineering)

    return result_pandas, result_polars, result_duckdb

# Test ML integration for smaller datasets
ml_results = {}
for size in sizes[:2]:  # Only test on smaller datasets
    ml_results[size] = test_ml_integration(size)

print("\nML integration tests completed")

## Performance Analysis and Visualization

In [None]:
# Collect every recorded measurement into one table
results_df = tracker.get_results_df()

# Keep only the operations that completed without raising
successful_results = results_df.loc[results_df['success'] == True].copy()

total_count = len(results_df)
success_count = len(successful_results)
print(f"Total operations tested: {total_count}")
print(f"Successful operations: {success_count}")
print(f"Failed operations: {total_count - success_count}")

# Show the first ten successful measurements
print("\nSample results:")
sample_columns = ['operation', 'engine', 'execution_time', 'memory_delta_mb']
print(successful_results[sample_columns].head(10))

In [None]:
# Create comprehensive performance visualizations:
# a 2x2 figure with time per operation, memory per operation,
# time versus dataset size, and per-engine summary scores.
fig, axes = plt.subplots(2, 2, figsize=(20, 15))
fig.suptitle('Data Engine Performance Comparison', fontsize=16, fontweight='bold')

# 1. Execution Time Comparison
ax1 = axes[0, 0]
execution_pivot = successful_results.pivot_table(
    index='operation', 
    columns='engine', 
    values='execution_time', 
    aggfunc='mean'
)

execution_pivot.plot(kind='bar', ax=ax1, width=0.8)
ax1.set_title('Average Execution Time by Operation', fontweight='bold')
ax1.set_ylabel('Time (seconds)')
ax1.set_xlabel('Operation')
ax1.legend(title='Engine')
ax1.tick_params(axis='x', rotation=45)

# 2. Memory Usage Comparison
ax2 = axes[0, 1]
memory_pivot = successful_results.pivot_table(
    index='operation', 
    columns='engine', 
    values='memory_delta_mb', 
    aggfunc='mean'
)

memory_pivot.plot(kind='bar', ax=ax2, width=0.8)
ax2.set_title('Average Memory Usage by Operation', fontweight='bold')
ax2.set_ylabel('Memory Delta (MB)')
ax2.set_xlabel('Operation')
ax2.legend(title='Engine')
ax2.tick_params(axis='x', rotation=45)

# 3. Performance by Dataset Size
ax3 = axes[1, 0]
# Extract dataset size from operation name (kept as a string column)
successful_results['dataset_size'] = successful_results['operation'].str.extract(r'_(\d+)$')[0]
size_perf = successful_results.groupby(['dataset_size', 'engine'])['execution_time'].mean().unstack()
# The sizes are strings, so order them by numeric value rather than lexically
size_perf = size_perf.loc[sorted(size_perf.index, key=int)]

size_perf.plot(kind='line', ax=ax3, marker='o', linewidth=2, markersize=6)
ax3.set_title('Execution Time vs Dataset Size', fontweight='bold')
ax3.set_ylabel('Average Time (seconds)')
ax3.set_xlabel('Dataset Size (rows)')
ax3.legend(title='Engine')
ax3.set_yscale('log')

# 4. Engine Performance Summary (Radar Chart Data)
ax4 = axes[1, 1]
engine_means = successful_results.groupby('engine').agg({
    'execution_time': 'mean',
    'memory_delta_mb': 'mean'
})
engine_summary = engine_means.round(3)

# Scores are reciprocals (lower raw value is better, so invert).
# They use the unrounded means: a sub-millisecond mean rounds to 0.0 and
# would otherwise divide by zero.
engine_summary['speed_score'] = 1 / engine_means['execution_time']
# A negative mean delta (memory released) counts as zero usage so the score
# cannot turn negative; the small constant avoids division by zero.
engine_summary['memory_score'] = 1 / (engine_means['memory_delta_mb'].clip(lower=0) + 0.1)

# Plot normalized scores
engines = engine_summary.index
x_pos = range(len(engines))

width = 0.35
ax4.bar([x - width/2 for x in x_pos], engine_summary['speed_score'], width, label='Speed Score', alpha=0.8)
ax4.bar([x + width/2 for x in x_pos], engine_summary['memory_score'], width, label='Memory Score', alpha=0.8)

ax4.set_title('Engine Performance Scores\n(Higher is Better)', fontweight='bold')
ax4.set_ylabel('Normalized Score')
ax4.set_xlabel('Engine')
ax4.set_xticks(x_pos)
ax4.set_xticklabels(engines)
ax4.legend()

plt.tight_layout()
# savefig does not create directories, so make sure the report folder exists
report_path = Path('../artifacts/reports/engine_comparison.png')
report_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(report_path, dpi=300, bbox_inches='tight')
plt.show()

print("\nEngine Performance Summary:")
print(engine_summary[['execution_time', 'memory_delta_mb']])

In [None]:
# Detailed performance breakdown by operation type.
# Maps the operation-name prefix (the name without its "_<rows>" suffix)
# to a display label.
operation_types = {
    'load_csv': 'Data Loading',
    'filter': 'Filtering',
    'sort': 'Sorting',
    'select_transform': 'Selection/Transform',
    'groupby_simple': 'Simple Aggregation',
    'groupby_complex': 'Complex Aggregation',
    'inner_join': 'Inner Join',
    'left_join_agg': 'Join + Aggregation',
    'ranking': 'Window Functions',
    'moving_avg': 'Moving Average',
    'ml_features': 'ML Feature Engineering'
}

# Extract operation type by stripping the trailing row count. Matching a
# fixed number of name segments instead would cut 'left_join_agg_<rows>'
# down to 'left_join', which has no entry above and would become 'Other'.
operation_keys = successful_results['operation'].str.replace(r'_\d+$', '', regex=True)
successful_results['operation_type'] = operation_keys.map(operation_types).fillna('Other')

# Create detailed comparison table
comparison_table = successful_results.groupby(['operation_type', 'engine']).agg({
    'execution_time': ['mean', 'std'],
    'memory_delta_mb': ['mean', 'std']
}).round(4)

print("Detailed Performance Comparison by Operation Type:")
print("=" * 80)
print(comparison_table)

## Engine Recommendations and Decision Matrix

In [None]:
def generate_recommendations(results_df):
    """
    Build the decision matrix from the measured results.

    For every label in the module-level ``operation_types`` mapping and each
    of the three benchmarked dataset sizes, picks the engine with the lowest
    execution time and the engine with the lowest memory delta. A fixed set
    of use-case recommendations is added under 'Use Case Specific'.

    Args:
        results_df: Results frame with 'operation_type', 'dataset_size',
            'engine', 'execution_time' and 'memory_delta_mb' columns

    Returns:
        Dict keyed by category label; the size categories map operation type
        to the winning engines and their formatted measurements.
    """
    # Dataset size (as stored in 'dataset_size') -> category label
    size_labels = {
        '10000': 'Small Datasets (< 100K rows)',
        '100000': 'Medium Datasets (100K - 1M rows)',
        '1000000': 'Large Datasets (> 1M rows)',
    }
    recommendations = {label: {} for label in size_labels.values()}

    # Analyze performance by dataset size
    for operation_type in operation_types.values():
        per_type = results_df[results_df['operation_type'] == operation_type]
        if per_type.empty:
            continue

        for size_key, label in size_labels.items():
            per_size = per_type[per_type['dataset_size'] == size_key]
            if per_size.empty:
                continue

            # Rows holding the minimum of each metric
            fastest = per_size.loc[per_size['execution_time'].idxmin()]
            leanest = per_size.loc[per_size['memory_delta_mb'].idxmin()]

            recommendations[label][operation_type] = {
                'best_speed': fastest['engine'],
                'best_memory': leanest['engine'],
                'speed_time': f"{fastest['execution_time']:.4f}s",
                'memory_usage': f"{leanest['memory_delta_mb']:.2f}MB"
            }

    # Use case specific recommendations (fixed, not derived from the results)
    recommendations['Use Case Specific'] = {
        'SQL-Heavy Analytics': {
            'recommended': 'duckdb',
            'reason': 'Native SQL support, optimized for analytical queries'
        },
        'Memory-Constrained Environments': {
            'recommended': 'polars',
            'reason': 'Efficient memory usage and lazy evaluation'
        },
        'Existing Pandas Workflows': {
            'recommended': 'pandas',
            'reason': 'Drop-in replacement, extensive ecosystem'
        },
        'Large-Scale Data Processing': {
            'recommended': 'polars',
            'reason': 'Parallel processing and query optimization'
        },
        'Complex Window Functions': {
            'recommended': 'duckdb',
            'reason': 'Advanced SQL window function support'
        },
        'ML Feature Engineering': {
            'recommended': 'polars',
            'reason': 'Fast transformations and memory efficiency'
        }
    }

    return recommendations

# Build the decision matrix from the successful measurements
recommendations = generate_recommendations(successful_results)

print("ENGINE SELECTION DECISION MATRIX")
print("=" * 50)
print()

for category, ops in recommendations.items():
    # Category heading, underlined to the length of its label
    print(f"\n{category.upper()}:")
    print("-" * len(category))

    if category != 'Use Case Specific':
        # Size categories: winning engines per operation type
        for operation, rec in ops.items():
            print(f"\n  {operation}:")
            print(f"    Best Speed: {rec['best_speed'].upper()} ({rec['speed_time']})")
            print(f"    Best Memory: {rec['best_memory'].upper()} ({rec['memory_usage']})")
        continue

    # Use-case entries carry a recommendation and its reason instead
    for use_case, rec in ops.items():
        print(f"\n  {use_case}:")
        print(f"    Recommended: {rec['recommended'].upper()}")
        print(f"    Reason: {rec['reason']}")

## Summary Statistics and Winner Analysis

In [None]:
# Calculate winner statistics
def analyze_winners(results_df):
    """
    Count, per engine, how many operations it won on speed and on memory.

    An operation is only scored when it has at least two result rows. The
    winner of a metric is the engine on the row with the lowest value.

    Args:
        results_df: Results frame with 'operation', 'engine',
            'execution_time' and 'memory_delta_mb' columns

    Returns:
        Dict ``{'speed': {engine: wins}, 'memory': {engine: wins}}``
    """
    tallies = {'speed': {}, 'memory': {}}
    # Metric name -> column whose minimum decides the winner
    metric_columns = (('speed', 'execution_time'), ('memory', 'memory_delta_mb'))

    for _, rows in results_df.groupby('operation', sort=False):
        if len(rows) < 2:  # Need at least 2 engines to compare
            continue

        for metric, column in metric_columns:
            champion = rows.loc[rows[column].idxmin(), 'engine']
            tallies[metric][champion] = tallies[metric].get(champion, 0) + 1

    return tallies

# Tally per-operation wins for each engine
winners = analyze_winners(successful_results)

print("WINNER ANALYSIS")
print("=" * 30)
print()

# (heading, key into `winners`) for each leaderboard, in display order
leaderboards = (
    ("Speed Champions (Number of operations won):", 'speed'),
    ("\nMemory Champions (Number of operations won):", 'memory'),
)

for heading, metric in leaderboards:
    print(heading)
    # Most wins first; the sort is stable, so tied engines keep their order
    ranked = sorted(winners[metric].items(), key=lambda pair: pair[1], reverse=True)
    for engine, wins in ranked:
        print(f"  {engine.upper()}: {wins} operations")

# Overall performance scores
print("\n" + "=" * 50)
print("OVERALL PERFORMANCE SCORES")
print("=" * 50)

# Per-engine mean/median/std of both metrics. Aggregating with a dict of
# lists yields two-level columns: (metric, statistic).
engine_scores = successful_results.groupby('engine').agg({
    'execution_time': ['mean', 'median', 'std'],
    'memory_delta_mb': ['mean', 'median', 'std']
}).round(4)

print("\nExecution Time Statistics:")
print(engine_scores['execution_time'])

print("\nMemory Usage Statistics:")
print(engine_scores['memory_delta_mb'])


def _describe_ratio(value, baseline, below_label, above_label):
    """Format value/baseline as 'N.NNx (label)', or 'n/a' when no ratio is meaningful."""
    # A zero, negative or missing baseline (or a missing value) gives a ratio
    # that is undefined or misleading, so report that instead of a number.
    if pd.isna(value) or not baseline > 0:
        return "n/a (no positive baseline to compare against)"
    ratio = value / baseline
    if ratio < 1:
        label = below_label
    elif ratio > 1:
        label = above_label
    else:
        # Covers the baseline engine compared with itself
        label = 'same'
    return f"{ratio:.2f}x ({label})"


# Calculate relative performance
baseline_engine = 'pandas'
if baseline_engine in engine_scores.index:
    print(f"\nRelative Performance (vs {baseline_engine.upper()}):")
    baseline_time = engine_scores.loc[baseline_engine, ('execution_time', 'mean')]
    baseline_memory = engine_scores.loc[baseline_engine, ('memory_delta_mb', 'mean')]

    for engine in engine_scores.index:
        engine_time = engine_scores.loc[engine, ('execution_time', 'mean')]
        engine_memory = engine_scores.loc[engine, ('memory_delta_mb', 'mean')]

        print(f"  {engine.upper()}:")
        print(f"    Speed: {_describe_ratio(engine_time, baseline_time, 'faster', 'slower')}")
        print(f"    Memory: {_describe_ratio(engine_memory, baseline_memory, 'less', 'more')}")

## Save Results and Generate Report

In [None]:
import json

# Save detailed results
output_dir = Path('../artifacts/reports')
output_dir.mkdir(parents=True, exist_ok=True)

# Save raw results
results_df.to_csv(output_dir / 'engine_comparison_results.csv', index=False)

# engine_scores has two-level (metric, statistic) columns, so a plain
# to_dict() would produce tuple keys. json.dump rejects tuple keys with a
# TypeError, and the `default=` hook is only consulted for values, never for
# keys. Nest the statistics under each metric name instead:
#   {metric: {statistic: {engine: value}}}
performance_summary = {
    metric: engine_scores[metric].to_dict()
    for metric in engine_scores.columns.get_level_values(0).unique()
}

# Save performance summary
summary_report = {
    'test_summary': {
        'total_operations': len(results_df),
        'successful_operations': len(successful_results),
        'engines_tested': list(results_df['engine'].unique()),
        'dataset_sizes': sizes,
        'operation_types': list(operation_types.values())
    },
    'winners': winners,
    'recommendations': recommendations,
    'performance_summary': performance_summary
}

# default=str stringifies any remaining non-JSON values (e.g. NumPy scalars)
with open(output_dir / 'engine_comparison_summary.json', 'w', encoding='utf-8') as f:
    json.dump(summary_report, f, indent=2, default=str)

print(f"Results saved to {output_dir}")
print("\nFiles generated:")
print("  - engine_comparison_results.csv: Detailed performance data")
print("  - engine_comparison_summary.json: Summary and recommendations")
print("  - engine_comparison.png: Performance visualization")

# Closing banner followed by the key takeaways
banner_rule = "=" * 70
print("\n" + banner_rule)
print("ENGINE COMPARISON COMPLETE")
print(banner_rule)

print("\nKey Takeaways:")
takeaways = (
    "1. Each engine has its strengths for different use cases",
    "2. Consider dataset size, operation type, and constraints",
    "3. Use the decision matrix above for engine selection",
    "4. Test with your specific data and workload for best results",
)
for takeaway in takeaways:
    print(takeaway)