# Polars Engine Testing and Analysis

This notebook demonstrates Polars capabilities, lazy evaluation benefits, and performance comparisons with pandas for the ML Pipeline Framework.

## Topics Covered:
- ⚡ Lazy evaluation with Polars
- 🚀 Parallel processing capabilities
- 💾 Memory efficiency vs pandas
- 🔧 Complex aggregations and window functions
- 🌊 Streaming large datasets
- 🤝 Integration with ML pipeline

## Setup and Imports

In [None]:
import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import psutil
import os
import sys
from datetime import datetime, timedelta
from pathlib import Path
import warnings
# Silence every warning category for the rest of the session. This also hides
# library deprecation warnings, so remove it when debugging version issues.
warnings.filterwarnings('ignore')

# Add project root to path
# (the parent of the current working directory goes first on sys.path)
sys.path.insert(0, os.path.abspath('..'))

# Configure plotting
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Report library versions and machine resources so results can be compared
# across environments. Memory figures are bytes divided by 1024**3.
print(f"Polars version: {pl.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"System CPU cores: {psutil.cpu_count()}")
print(f"System memory: {psutil.virtual_memory().total / 1024**3:.1f} GB")
print(f"Available memory: {psutil.virtual_memory().available / 1024**3:.1f} GB")

# Set Polars configuration for better performance.
# pl.Config exposes setters (set_*) but no matching getter methods, so the
# chosen values are kept in named constants and reported from those.
STREAMING_CHUNK_SIZE = 25000   # rows per chunk for the streaming engine
STRING_DISPLAY_LENGTH = 50     # max characters shown per string cell

pl.Config.set_streaming_chunk_size(STREAMING_CHUNK_SIZE)
pl.Config.set_fmt_str_lengths(STRING_DISPLAY_LENGTH)

print(f"\nPolars configuration:")
print(f"  Streaming chunk size: {STREAMING_CHUNK_SIZE}")
print(f"  String display length: {STRING_DISPLAY_LENGTH}")

## Test Data Generation

In [None]:
def generate_polars_test_data(n_rows=100000, save_path=None):
    """Generate a synthetic order dataset for the Polars/pandas comparisons.

    The global NumPy RNG is re-seeded with 42 on every call, so the same
    ``n_rows`` always yields the same frame.

    Args:
        n_rows: Number of rows to generate.
        save_path: Optional CSV destination. Missing parent directories are
            created; a bare file name (no directory part) is written to the
            current working directory.

    Returns:
        A pandas DataFrame with 15 columns: ids, numeric measures, string
        categories, a boolean flag, two timestamp columns and the derived
        ``total_amount`` / ``days_to_ship``. About 2% of ``rating`` is NaN.
    """
    np.random.seed(42)

    # randint needs low < high. n_rows // 10 drops to 0 or 1 for tiny
    # datasets, so clamp the exclusive upper bound. For n_rows >= 20 this is
    # the same bound as before, so the generated values are unchanged.
    customer_id_upper = max(2, n_rows // 10)

    # Generate data with various data types. The order of the np.random calls
    # fixes the random stream, so keep the dict entries in this order.
    data = {
        # Integer columns
        'id': range(n_rows),
        'customer_id': np.random.randint(1, customer_id_upper, n_rows),
        'product_id': np.random.randint(1, 1000, n_rows),
        'quantity': np.random.randint(1, 10, n_rows),

        # Float columns
        'price': np.random.lognormal(3, 1, n_rows),
        'discount': np.random.beta(2, 8, n_rows),
        'rating': np.random.normal(4.0, 1.0, n_rows),

        # String columns
        'category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Home', 'Sports'], n_rows),
        'region': np.random.choice(['North', 'South', 'East', 'West'], n_rows),
        'status': np.random.choice(['Active', 'Inactive', 'Pending'], n_rows),

        # Boolean column
        'is_premium': np.random.choice([True, False], n_rows, p=[0.3, 0.7]),

        # Date columns (ship_date is always exactly one day after order_date)
        'order_date': pd.date_range('2020-01-01', periods=n_rows, freq='15min'),
        'ship_date': pd.date_range('2020-01-02', periods=n_rows, freq='15min'),
    }

    # Create DataFrame
    df = pd.DataFrame(data)

    # Add calculated columns
    df['total_amount'] = df['price'] * df['quantity'] * (1 - df['discount'])
    df['days_to_ship'] = (df['ship_date'] - df['order_date']).dt.days

    # Blank out 2% of the ratings so null handling gets exercised downstream
    missing_indices = np.random.choice(df.index, size=int(0.02 * len(df)), replace=False)
    df.loc[missing_indices, 'rating'] = np.nan

    if save_path:
        # os.path.dirname returns '' for a bare file name, and makedirs('')
        # raises, so only create a directory when there is one to create.
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        df.to_csv(save_path, index=False)
        print(f"Test data saved to {save_path}")

    return df

# Row counts of the benchmark CSVs. Later cells look files up by these keys.
test_sizes = [50000, 100000, 250000]
polars_test_files = {}

for size in test_sizes:
    file_path = f'../data/polars_test_{size}.csv'
    polars_test_files[size] = file_path

    # Reuse a CSV left by an earlier run; generation is the slow part.
    if os.path.exists(file_path):
        print(f"Polars test data with {size:,} rows already exists")
        continue

    print(f"Generating Polars test data with {size:,} rows...")
    df = generate_polars_test_data(size, file_path)

# Report each file with its on-disk size in MB (bytes / 1024**2).
print("\nPolars test files:")
for size, path in polars_test_files.items():
    file_size = os.path.getsize(path) / 1024**2
    print(f"  {size:,} rows: {path} ({file_size:.1f} MB)")

## Lazy Evaluation Demonstration

In [None]:
# Walkthrough of the lazy API: build a LazyFrame, compose a query, inspect the
# plan, then execute it. Each phase is timed separately, so the statement
# order in this cell is significant.
print("🧠 Polars Lazy Evaluation Demo")
print("=" * 40)

# Load data with Polars (lazy)
test_file = polars_test_files[100000]

print("\n1. Creating Lazy Frame (no data loaded yet)...")
start_time = time.time()

# Create lazy frame - this is instant
lazy_df = pl.scan_csv(test_file)

lazy_create_time = time.time() - start_time
print(f"   Lazy frame creation time: {lazy_create_time:.6f}s (instant!)")
# NOTE(review): reading .schema on a LazyFrame may have to resolve the schema
# from the file; newer Polars releases suggest collect_schema() - confirm for
# the installed version.
print(f"   Schema: {lazy_df.schema}")

print("\n2. Building query plan (still no execution)...")
query_start = time.time()

# Complex query - still no execution
# "discount_amount" is computed but never used after the aggregation below.
lazy_query = (
    lazy_df
    .filter(pl.col("price") > 50)
    .with_columns([
        pl.col("total_amount").alias("amount"),
        (pl.col("price") * pl.col("discount")).alias("discount_amount")
    ])
    .group_by(["category", "region"])
    .agg([
        pl.col("amount").sum().alias("total_sales"),
        pl.col("amount").mean().alias("avg_sale"),
        pl.col("quantity").sum().alias("total_quantity"),
        pl.count().alias("transaction_count")
    ])
    # Post-aggregation filter: keeps only groups whose summed sales exceed 1000
    .filter(pl.col("total_sales") > 1000)
    .sort("total_sales", descending=True)
)

query_build_time = time.time() - query_start
print(f"   Query plan build time: {query_build_time:.6f}s")

# Show the query plan
print("\n3. Query Plan (optimized):")
print(lazy_query.explain())

print("\n4. Executing query (now data is actually processed)...")
exec_start = time.time()

# collect() is the call that actually reads the CSV and runs the plan
result = lazy_query.collect()

exec_time = time.time() - exec_start
print(f"   Query execution time: {exec_time:.3f}s")
print(f"   Result shape: {result.shape}")
print(f"\nFirst 5 results:")
print(result.head())

# Static talking points; these lines are not derived from the measurements above.
print(f"\n📊 Lazy Evaluation Benefits:")
print(f"   • Query optimization before execution")
print(f"   • No intermediate memory usage")
print(f"   • Predicate pushdown (filters applied early)")
print(f"   • Projection pushdown (only needed columns)")

## Performance Comparison: Polars vs Pandas

In [None]:
def benchmark_polars_vs_pandas(file_path, operation_name, polars_func, pandas_func):
    """Time one operation in Polars and in pandas and compare the two.

    Each callable is run once with ``file_path``; Polars runs first. Elapsed
    time comes from ``time.perf_counter`` (monotonic, high resolution), and
    memory is the change in this process's resident set size (RSS) across the
    call, in MB. RSS deltas are a coarse signal: they can be zero or negative
    when the allocator reuses or releases pages.

    Args:
        file_path: Path handed unchanged to both callables.
        operation_name: Label used in the printed output and in the result.
        polars_func: Callable ``(file_path) -> result`` using Polars.
        pandas_func: Callable ``(file_path) -> result`` using pandas.

    Returns:
        dict with keys ``operation``, ``polars_time``, ``pandas_time``,
        ``speedup`` (pandas_time / polars_time), ``polars_memory``,
        ``pandas_memory`` and ``memory_ratio`` (pandas / polars). A ratio is
        ``inf`` when its denominator is not positive.
    """
    process = psutil.Process()

    print(f"\n🚀 Benchmarking {operation_name}...")

    # Polars (lazy). The memory snapshots are taken outside the timed region
    # so the psutil calls do not count towards the measurement.
    start_memory = process.memory_info().rss / 1024**2
    start_time = time.perf_counter()

    # The result stays bound to a name so it is still alive when RSS is read.
    polars_result = polars_func(file_path)

    polars_time = time.perf_counter() - start_time
    polars_memory = process.memory_info().rss / 1024**2 - start_memory

    # Pandas
    start_memory = process.memory_info().rss / 1024**2
    start_time = time.perf_counter()

    pandas_result = pandas_func(file_path)

    pandas_time = time.perf_counter() - start_time
    pandas_memory = process.memory_info().rss / 1024**2 - start_memory

    # Calculate speedup; guard against a zero or negative denominator
    speedup = pandas_time / polars_time if polars_time > 0 else float('inf')
    memory_ratio = pandas_memory / polars_memory if polars_memory > 0 else float('inf')

    results = {
        'operation': operation_name,
        'polars_time': polars_time,
        'pandas_time': pandas_time,
        'speedup': speedup,
        'polars_memory': polars_memory,
        'pandas_memory': pandas_memory,
        'memory_ratio': memory_ratio
    }

    print(f"   Polars: {polars_time:.3f}s, {polars_memory:.1f}MB")
    print(f"   Pandas: {pandas_time:.3f}s, {pandas_memory:.1f}MB")
    print(f"   Speedup: {speedup:.2f}x, Memory reduction: {memory_ratio:.2f}x")

    return results

# Define benchmark operations
def polars_groupby_agg(file_path):
    """Polars side of the group-by benchmark.

    Lazily scans the CSV, groups by category and region, and returns the
    collected frame with total/average sales, total quantity and row count.
    """
    aggregations = [
        pl.col("total_amount").sum().alias("total_sales"),
        pl.col("total_amount").mean().alias("avg_sale"),
        pl.col("quantity").sum().alias("total_qty"),
        pl.count().alias("count")
    ]
    grouped = pl.scan_csv(file_path).group_by(["category", "region"])
    return grouped.agg(aggregations).collect()

def pandas_groupby_agg(file_path):
    """Pandas side of the group-by benchmark.

    Reads the whole CSV, groups by category and region, and returns a frame
    with two-level columns: sum and mean of total_amount, sum of quantity,
    and count of id.
    """
    frame = pd.read_csv(file_path)
    aggregations = {
        'total_amount': ['sum', 'mean'],
        'quantity': 'sum',
        'id': 'count'
    }
    grouped = frame.groupby(["category", "region"])
    return grouped.agg(aggregations)

def polars_filter_sort(file_path):
    """Polars side of the filter-and-sort benchmark.

    Keeps premium rows priced above 100 and returns them ordered by
    total_amount (descending), then order_date (ascending).
    """
    is_expensive = pl.col("price") > 100
    is_premium = pl.col("is_premium") == True
    query = (
        pl.scan_csv(file_path)
        .filter(is_expensive & is_premium)
        .sort(["total_amount", "order_date"], descending=[True, False])
    )
    return query.collect()

def pandas_filter_sort(file_path):
    """Pandas side of the filter-and-sort benchmark.

    Keeps premium rows priced above 100 and returns them ordered by
    total_amount (descending), then order_date (ascending).
    """
    frame = pd.read_csv(file_path)
    keep = frame['price'].gt(100) & frame['is_premium'].eq(True)
    return frame.loc[keep].sort_values(
        by=['total_amount', 'order_date'],
        ascending=[False, True],
    )

def polars_window_function(file_path):
    """Polars side of the window-function benchmark.

    Adds each row's ordinal rank of total_amount within its category and the
    mean price of its customer, then collects the full frame.
    """
    rank_in_category = (
        pl.col("total_amount")
        .rank(method="ordinal")
        .over("category")
        .alias("rank_in_category")
    )
    customer_avg_price = (
        pl.col("price")
        .mean()
        .over("customer_id")
        .alias("customer_avg_price")
    )
    lazy_frame = pl.scan_csv(file_path)
    return lazy_frame.with_columns([rank_in_category, customer_avg_price]).collect()

def pandas_window_function(file_path):
    """Pandas side of the window-function benchmark.

    Adds each row's rank of total_amount within its category (ties broken by
    row order) and the mean price of its customer, and returns the frame.
    """
    frame = pd.read_csv(file_path)
    by_category = frame.groupby('category')['total_amount']
    by_customer = frame.groupby('customer_id')['price']
    return frame.assign(
        rank_in_category=by_category.rank(method='first'),
        customer_avg_price=by_customer.transform('mean'),
    )

# Run benchmarks
# Each entry is (label, polars callable, pandas callable); both callables take
# the CSV path as their only argument.
benchmark_operations = [
    ('GroupBy Aggregation', polars_groupby_agg, pandas_groupby_agg),
    ('Filter & Sort', polars_filter_sort, pandas_filter_sort),
    ('Window Functions', polars_window_function, pandas_window_function)
]

# benchmark_results is read again by the visualization and recommendation cells.
benchmark_results = []
test_file = polars_test_files[100000]

for op_name, polars_func, pandas_func in benchmark_operations:
    # A failing operation is reported and skipped so the remaining benchmarks
    # still run; it simply has no entry in benchmark_results.
    try:
        result = benchmark_polars_vs_pandas(test_file, op_name, polars_func, pandas_func)
        benchmark_results.append(result)
    except Exception as e:
        print(f"Error in {op_name}: {e}")

# Summary
if benchmark_results:
    results_df = pd.DataFrame(benchmark_results)
    
    # NOTE: benchmark_polars_vs_pandas reports inf for a ratio whose denominator
    # is not positive; a single inf makes the corresponding mean inf as well.
    print(f"\n📊 Performance Summary:")
    print(f"   Average speedup: {results_df['speedup'].mean():.2f}x")
    print(f"   Average memory reduction: {results_df['memory_ratio'].mean():.2f}x")
    print(f"   Best speedup: {results_df['speedup'].max():.2f}x ({results_df.loc[results_df['speedup'].idxmax(), 'operation']})")

## Complex Aggregations and Window Functions

In [None]:
# Four independent lazy queries over the 100k-row CSV. Each one ends in
# collect(), so the file is scanned once per query.
print("🔧 Advanced Polars Operations Demo")
print("=" * 40)

# Load data
df = pl.scan_csv(polars_test_files[100000])

print("\n1. Complex Aggregations with Multiple Grouping Levels")
# Groups by three keys (category, region, status) and keeps groups with more
# than 100 transactions.
complex_agg = (
    df
    .group_by(["category", "region", "status"])
    .agg([
        pl.col("total_amount").sum().alias("total_sales"),
        pl.col("total_amount").mean().alias("avg_sale"),
        pl.col("total_amount").std().alias("sales_std"),
        pl.col("total_amount").quantile(0.5).alias("median_sale"),
        pl.col("total_amount").quantile(0.95).alias("p95_sale"),
        pl.col("customer_id").n_unique().alias("unique_customers"),
        pl.count().alias("transaction_count")
    ])
    .filter(pl.col("transaction_count") > 100)
    .sort("total_sales", descending=True)
    .collect()
)

print(f"   Result shape: {complex_agg.shape}")
print("   Top 5 category-region combinations:")
print(complex_agg.head())

print("\n2. Advanced Window Functions")
window_analysis = (
    df
    .with_columns([
        # Ranking within groups
        pl.col("total_amount").rank(method="ordinal", descending=True).over("category").alias("category_rank"),
        
        # Per-customer statistics: whole-partition aggregates broadcast back
        # to every row of that customer (not rolling windows)
        pl.col("total_amount").mean().over("customer_id").alias("customer_avg_spend"),
        pl.col("total_amount").std().over("customer_id").alias("customer_spend_volatility"),
        
        # Percentage of total
        (pl.col("total_amount") / pl.col("total_amount").sum().over("category") * 100).alias("pct_of_category_sales"),
        
        # Lead/lag operations (simulated with shift)
        # No sort is applied first, so "previous"/"next" presumably follow the
        # row order of the file within each customer - confirm if order matters.
        pl.col("total_amount").shift(1).over(["customer_id"]).alias("prev_purchase_amount"),
        pl.col("total_amount").shift(-1).over(["customer_id"]).alias("next_purchase_amount")
    ])
    .filter(pl.col("category_rank") <= 10)  # Top 10 in each category
    .collect()
)

print(f"   Window analysis result shape: {window_analysis.shape}")
print("   Sample with window functions:")
print(window_analysis.select([
    "customer_id", "category", "total_amount", "category_rank", 
    "customer_avg_spend", "pct_of_category_sales"
]).head())

print("\n3. Time-based Analysis")
time_analysis = (
    df
    .with_columns([
        # order_date is parsed as text here; the format string must match how
        # the CSV was written.
        pl.col("order_date").str.to_datetime("%Y-%m-%d %H:%M:%S").alias("order_datetime"),
    ])
    .with_columns([
        pl.col("order_datetime").dt.year().alias("year"),
        pl.col("order_datetime").dt.month().alias("month"),
        # weekday and hour are derived but not used by the grouping below
        pl.col("order_datetime").dt.weekday().alias("weekday"),
        pl.col("order_datetime").dt.hour().alias("hour")
    ])
    .group_by(["year", "month", "category"])
    .agg([
        pl.col("total_amount").sum().alias("monthly_sales"),
        pl.col("total_amount").count().alias("monthly_transactions"),
        pl.col("customer_id").n_unique().alias("monthly_customers")
    ])
    .sort(["year", "month", "monthly_sales"], descending=[True, True, True])
    .collect()
)

print(f"   Time analysis result shape: {time_analysis.shape}")
print("   Monthly trends sample:")
print(time_analysis.head())

print("\n4. Conditional Aggregations")
# Conditional sums/counts are written as when/then/otherwise(0) followed by
# sum(), so rows that fail the condition contribute zero.
conditional_agg = (
    df
    .group_by("category")
    .agg([
        # Conditional sums
        pl.when(pl.col("is_premium") == True)
        .then(pl.col("total_amount"))
        .otherwise(0)
        .sum()
        .alias("premium_sales"),
        
        pl.when(pl.col("discount") > 0.1)
        .then(pl.col("total_amount"))
        .otherwise(0)
        .sum()
        .alias("discounted_sales"),
        
        # Conditional counts
        pl.when(pl.col("rating") >= 4.0)
        .then(1)
        .otherwise(0)
        .sum()
        .alias("high_rating_count"),
        
        pl.col("total_amount").sum().alias("total_sales"),
        pl.count().alias("total_transactions")
    ])
    # Turn the conditional totals into percentages of the per-category totals
    .with_columns([
        (pl.col("premium_sales") / pl.col("total_sales") * 100).alias("premium_pct"),
        (pl.col("discounted_sales") / pl.col("total_sales") * 100).alias("discount_pct"),
        (pl.col("high_rating_count") / pl.col("total_transactions") * 100).alias("high_rating_pct")
    ])
    .collect()
)

print(f"   Conditional aggregation result:")
print(conditional_agg)

## Streaming Large Datasets

In [None]:
# Runs the same aggregation twice on the largest CSV - once with streaming=True
# and once with streaming=False - and records time and RSS change for each.
# The measurements depend on statement order, and the second run starts with
# whatever the first run left in the process and the OS file cache.
print("🌊 Polars Streaming Demo")
print("=" * 30)

# Use the largest test file
large_file = polars_test_files[250000]

print(f"\nProcessing file: {large_file}")
print(f"File size: {os.path.getsize(large_file) / 1024**2:.1f} MB")

print("\n1. Streaming Aggregation (processes data in chunks)")

start_time = time.time()
start_memory = psutil.Process().memory_info().rss / 1024**2

# Streaming query - processes data in chunks without loading everything
# NOTE(review): newer Polars releases select the streaming engine with a
# different collect() argument - confirm against the installed version.
streaming_result = (
    pl.scan_csv(large_file)
    .filter(pl.col("price") > 50)
    .group_by(["category", "region"])
    .agg([
        pl.col("total_amount").sum().alias("total_sales"),
        pl.col("total_amount").mean().alias("avg_sale"),
        pl.col("quantity").sum().alias("total_quantity"),
        pl.count().alias("transaction_count")
    ])
    .sort("total_sales", descending=True)
    .collect(streaming=True)  # Enable streaming
)

streaming_time = time.time() - start_time
# RSS delta in MB; may be zero or negative if pages were reused or released
streaming_memory = psutil.Process().memory_info().rss / 1024**2 - start_memory

print(f"   Streaming execution time: {streaming_time:.3f}s")
print(f"   Memory used: {streaming_memory:.1f} MB")
print(f"   Result shape: {streaming_result.shape}")
print("   Top results:")
print(streaming_result.head())

print("\n2. Compare with Non-streaming (loads all data)")

start_time = time.time()
start_memory = psutil.Process().memory_info().rss / 1024**2

# Non-streaming query
# (identical plan to the one above; only the collect() argument differs)
regular_result = (
    pl.scan_csv(large_file)
    .filter(pl.col("price") > 50)
    .group_by(["category", "region"])
    .agg([
        pl.col("total_amount").sum().alias("total_sales"),
        pl.col("total_amount").mean().alias("avg_sale"),
        pl.col("quantity").sum().alias("total_quantity"),
        pl.count().alias("transaction_count")
    ])
    .sort("total_sales", descending=True)
    .collect(streaming=False)  # Disable streaming
)

regular_time = time.time() - start_time
regular_memory = psutil.Process().memory_info().rss / 1024**2 - start_memory

print(f"   Regular execution time: {regular_time:.3f}s")
print(f"   Memory used: {regular_memory:.1f} MB")

print(f"\n📊 Streaming vs Regular:")
print(f"   Time difference: {(regular_time - streaming_time):.3f}s")
print(f"   Memory savings: {regular_memory - streaming_memory:.1f} MB")
# regular_memory is an RSS delta and is often exactly 0 (or negative) for a
# query this small, so the percentage is only meaningful - and only safe to
# compute - when the regular run showed a positive increase.
if regular_memory > 0:
    memory_reduction_pct = (regular_memory - streaming_memory) / regular_memory * 100
    print(f"   Memory reduction: {memory_reduction_pct:.1f}%")
else:
    print("   Memory reduction: n/a (regular run showed no measurable RSS increase)")

print("\n3. Streaming with Multiple Operations")

# Parses the order timestamp, flags rows above the overall mean sale, then
# aggregates per (month, category, flag) with streaming requested.
complex_streaming = (
    pl.scan_csv(large_file)
    .with_columns([
        pl.col("order_date").str.to_datetime("%Y-%m-%d %H:%M:%S").alias("order_datetime"),
    ])
    .with_columns([
        pl.col("order_datetime").dt.month().alias("month"),
        # Compares every row with the mean over the whole column.
        # NOTE(review): a whole-column mean needs all rows before any row can
        # be flagged, so this step may not run chunk-by-chunk - confirm with
        # explain() on the installed Polars version.
        (pl.col("total_amount") > pl.col("total_amount").mean()).alias("above_avg_sale")
    ])
    .group_by(["month", "category", "above_avg_sale"])
    .agg([
        pl.col("total_amount").sum().alias("sales"),
        pl.count().alias("count")
    ])
    .collect(streaming=True)
)

print(f"   Complex streaming result shape: {complex_streaming.shape}")
print("   Sample:")
print(complex_streaming.head())

# Static talking points; not derived from the measurements in this cell.
print("\n✅ Streaming Benefits:")
print("   • Process datasets larger than memory")
print("   • Consistent memory usage regardless of data size")
print("   • Automatic query optimization")
print("   • Parallelization across chunks")

## Memory Efficiency Analysis

In [None]:
# Section banner for the memory comparison; the underline is 45 characters.
print("💾 Memory Efficiency: Polars vs Pandas")
print("=" * 45)

def analyze_memory_usage(file_path, dataset_name):
    """Compare the memory cost of loading one CSV with pandas and with Polars.

    Each library reads the file eagerly, and the change in this process's
    resident set size (RSS) across the read is taken as "system memory used".
    The pandas frame is deleted before Polars is measured. RSS deltas are a
    coarse signal and can be zero or negative, so every derived ratio is
    guarded instead of dividing blindly.

    Args:
        file_path: CSV file to load.
        dataset_name: Label used in the printed output and in the result.

    Returns:
        dict with keys ``dataset``, ``file_size_mb``, ``pandas_memory``,
        ``polars_memory``, ``memory_ratio`` (pandas / polars, ``inf`` when the
        Polars delta is not positive) and ``memory_improvement_pct`` (``nan``
        when the pandas delta is not positive).
    """
    process = psutil.Process()

    print(f"\nAnalyzing {dataset_name}...")
    file_size = os.path.getsize(file_path) / 1024**2
    print(f"File size: {file_size:.1f} MB")

    # Pandas memory usage
    start_memory = process.memory_info().rss / 1024**2
    pandas_df = pd.read_csv(file_path)
    pandas_memory = process.memory_info().rss / 1024**2 - start_memory
    pandas_internal_memory = pandas_df.memory_usage(deep=True).sum() / 1024**2

    print(f"\nPandas:")
    print(f"  System memory used: {pandas_memory:.1f} MB")
    print(f"  DataFrame memory: {pandas_internal_memory:.1f} MB")
    print(f"  Memory overhead: {pandas_memory - pandas_internal_memory:.1f} MB")

    # Clear pandas dataframe so it does not count against the Polars reading
    del pandas_df

    # Polars memory usage
    start_memory = process.memory_info().rss / 1024**2
    polars_df = pl.read_csv(file_path)
    polars_memory = process.memory_info().rss / 1024**2 - start_memory

    # Let Polars report its own buffer sizes rather than guessing from dtypes;
    # this covers every dtype, not only Int64/Float64/Boolean/String.
    polars_estimated_memory = polars_df.estimated_size("mb")

    print(f"\nPolars:")
    print(f"  System memory used: {polars_memory:.1f} MB")
    print(f"  Estimated DataFrame memory: {polars_estimated_memory:.1f} MB")
    print(f"  Memory overhead: {polars_memory - polars_estimated_memory:.1f} MB")

    # Comparison. A non-positive RSS delta would make these ratios meaningless
    # or raise ZeroDivisionError, which the caller would swallow together with
    # the whole result.
    if pandas_memory > 0:
        memory_improvement = (pandas_memory - polars_memory) / pandas_memory * 100
    else:
        memory_improvement = float('nan')
    memory_ratio = pandas_memory / polars_memory if polars_memory > 0 else float('inf')

    print(f"\n📊 Comparison:")
    print(f"  Polars uses {memory_improvement:.1f}% less memory than Pandas")
    print(f"  Memory ratio (Pandas/Polars): {memory_ratio:.2f}x")

    return {
        'dataset': dataset_name,
        'file_size_mb': file_size,
        'pandas_memory': pandas_memory,
        'polars_memory': polars_memory,
        'memory_ratio': memory_ratio,
        'memory_improvement_pct': memory_improvement
    }

# Analyze different dataset sizes
# memory_comparisons is read again by the visualization and recommendation cells.
memory_comparisons = []

for size, file_path in polars_test_files.items():
    # A failing dataset is reported and skipped so the other sizes still run.
    try:
        result = analyze_memory_usage(file_path, f"{size:,} rows")
        memory_comparisons.append(result)
    except Exception as e:
        print(f"Error analyzing {size} rows: {e}")

# Summary
if memory_comparisons:
    memory_df = pd.DataFrame(memory_comparisons)
    
    print(f"\n📈 Memory Efficiency Summary:")
    print(memory_df.to_string(index=False, float_format='%.2f'))
    
    avg_improvement = memory_df['memory_improvement_pct'].mean()
    avg_ratio = memory_df['memory_ratio'].mean()
    
    print(f"\n🎯 Key Insights:")
    print(f"   • Average memory reduction: {avg_improvement:.1f}%")
    print(f"   • Average memory ratio: {avg_ratio:.2f}x less memory")
    print(f"   • Best improvement: {memory_df['memory_improvement_pct'].max():.1f}%")
    # The next line is printed unconditionally; it is not checked against the data.
    print(f"   • Consistent efficiency across dataset sizes")

## Integration with ML Pipeline

In [None]:
print("🤝 Polars Integration with ML Pipeline")
print("=" * 45)

# Load data with Polars
df = pl.scan_csv(polars_test_files[100000])

print("\n1. Data Preprocessing with Polars")

# Feature engineering pipeline. It is built in three with_columns stages
# because the later stages reference columns created by the earlier ones.
preprocessed = (
    df
    .with_columns([
        # Create derived features
        (pl.col("price") * pl.col("quantity")).alias("gross_amount"),
        (pl.col("discount") * 100).round(2).alias("discount_pct"),

        # Date features
        pl.col("order_date").str.to_datetime("%Y-%m-%d %H:%M:%S").alias("order_datetime"),
    ])
    .with_columns([
        pl.col("order_datetime").dt.month().alias("order_month"),
        pl.col("order_datetime").dt.weekday().alias("order_weekday"),
        pl.col("order_datetime").dt.hour().alias("order_hour"),
    ])
    .with_columns([
        # Categorical encoding (convert to numeric).
        # Casting to Categorical and taking the physical representation gives
        # each distinct string its own integer code. The previous hash(x) % k
        # approach was not reproducible (Python salts str hashes per process),
        # could map different values to the same code, and ran a Python
        # callback per row.
        pl.col("category").cast(pl.Categorical).to_physical().cast(pl.Int32).alias("category_encoded"),
        pl.col("region").cast(pl.Categorical).to_physical().cast(pl.Int32).alias("region_encoded"),
        pl.col("status").cast(pl.Categorical).to_physical().cast(pl.Int32).alias("status_encoded"),

        # Boolean to int
        pl.col("is_premium").cast(pl.Int8).alias("is_premium_int"),
    ])
    .collect()
)

# Report what the preprocessing step added: the set difference against the
# fifteen columns of the raw CSV.
print(f"   Preprocessed data shape: {preprocessed.shape}")
print(f"   New columns created: {set(preprocessed.columns) - set(['id', 'customer_id', 'product_id', 'quantity', 'price', 'discount', 'rating', 'category', 'region', 'status', 'is_premium', 'order_date', 'ship_date', 'total_amount', 'days_to_ship'])}")

print("\n2. Converting to ML-ready format")

# Select features for ML
feature_columns = [
    'price', 'quantity', 'discount_pct', 'rating', 'days_to_ship',
    'order_month', 'order_weekday', 'order_hour',
    'category_encoded', 'region_encoded', 'status_encoded', 'is_premium_int'
]

# Create feature matrix and target
ml_ready = (
    preprocessed
    .select(feature_columns + ['total_amount'])  # target variable
    .drop_nulls()  # Remove rows with missing values
)

print(f"   ML-ready data shape: {ml_ready.shape}")
print(f"   Features: {feature_columns}")
print(f"   Target: total_amount")

# Convert to numpy arrays for sklearn compatibility
X = ml_ready.select(feature_columns).to_numpy()
# select() yields a single-column 2-D array; flatten() turns it into a 1-D target
y = ml_ready.select('total_amount').to_numpy().flatten()

print(f"   Feature matrix shape: {X.shape}")
print(f"   Target vector shape: {y.shape}")

print("\n3. Quick ML Model Training (Demo)")

# scikit-learn is imported here rather than in the setup cell, so it is only
# required when this cell is run.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
# NOTE: the target total_amount is price * quantity * (1 - discount) and all
# three inputs are among the features, so a high score is expected here; this
# is a plumbing demo, not a modelling result.
model = RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"   Model performance:")
print(f"     R² Score: {r2:.4f}")
print(f"     RMSE: {np.sqrt(mse):.2f}")

# Feature importance (using Polars for analysis)
importance_df = pl.DataFrame({
    'feature': feature_columns,
    'importance': model.feature_importances_
}).sort('importance', descending=True)

print(f"\n   Top 5 important features:")
print(importance_df.head())

print("\n4. Data Export Options")

# Export preprocessed data
# Only the keys of this mapping are printed below; none of the callables is
# invoked in this cell. The 'pandas' entry ignores its path argument and
# returns the converted frame.
export_options = {
    'csv': lambda df, path: df.write_csv(path),
    'parquet': lambda df, path: df.write_parquet(path),
    'json': lambda df, path: df.write_ndjson(path),
    'pandas': lambda df, path: df.to_pandas()
}

print(f"   Available export formats: {list(export_options.keys())}")

# Example: Convert to pandas for scikit-learn
pandas_df = ml_ready.to_pandas()
print(f"   Converted to pandas: {pandas_df.shape}")
print(f"   Memory usage: {pandas_df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Static talking points; not derived from the measurements in this cell.
print("\n✅ Integration Benefits:")
print("   • Fast data preprocessing with lazy evaluation")
print("   • Memory efficient feature engineering")
print("   • Easy conversion to pandas/numpy for ML libraries")
print("   • Multiple export formats for different use cases")
print("   • Consistent performance across data sizes")

## Performance Visualization

In [None]:
# Create comprehensive performance visualizations
# 2x2 grid: [0,0] speedup per operation, [0,1] memory per dataset,
# [1,0] memory ratio per dataset, [1,1] combined score (drawn further below).
# Each panel falls back to a placeholder message when its input list is empty.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Polars vs Pandas Performance Analysis', fontsize=16)

# 1. Speed comparison
if benchmark_results:
    ops = [r['operation'] for r in benchmark_results]
    speedups = [r['speedup'] for r in benchmark_results]
    
    bars = axes[0, 0].bar(range(len(ops)), speedups, color='skyblue', alpha=0.8)
    axes[0, 0].set_xlabel('Operation')
    axes[0, 0].set_ylabel('Speedup (x faster)')
    axes[0, 0].set_title('Polars Speedup vs Pandas')
    axes[0, 0].set_xticks(range(len(ops)))
    axes[0, 0].set_xticklabels(ops, rotation=45, ha='right')
    # Reference line at 1x: bars above it mean Polars was faster
    axes[0, 0].axhline(y=1, color='red', linestyle='--', alpha=0.7, label='Same speed')
    axes[0, 0].legend()
    
    # Add value labels on bars
    for bar, speedup in zip(bars, speedups):
        axes[0, 0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
                       f'{speedup:.1f}x', ha='center', va='bottom')
else:
    axes[0, 0].text(0.5, 0.5, 'No benchmark data available', ha='center', va='center')
    axes[0, 0].set_title('Speed Comparison')

# 2. Memory efficiency
if memory_comparisons:
    datasets = [comp['dataset'] for comp in memory_comparisons]
    pandas_mem = [comp['pandas_memory'] for comp in memory_comparisons]
    polars_mem = [comp['polars_memory'] for comp in memory_comparisons]
    
    # Side-by-side bars: each library is offset half a bar width from the tick
    x = np.arange(len(datasets))
    width = 0.35
    
    axes[0, 1].bar(x - width/2, pandas_mem, width, label='Pandas', color='orange', alpha=0.8)
    axes[0, 1].bar(x + width/2, polars_mem, width, label='Polars', color='green', alpha=0.8)
    
    axes[0, 1].set_xlabel('Dataset Size')
    axes[0, 1].set_ylabel('Memory Usage (MB)')
    axes[0, 1].set_title('Memory Usage Comparison')
    axes[0, 1].set_xticks(x)
    axes[0, 1].set_xticklabels(datasets)
    axes[0, 1].legend()
else:
    axes[0, 1].text(0.5, 0.5, 'No memory data available', ha='center', va='center')
    axes[0, 1].set_title('Memory Usage Comparison')

# 3. Memory efficiency ratio
if memory_comparisons:
    ratios = [comp['memory_ratio'] for comp in memory_comparisons]
    
    # `datasets` comes from panel 2 above, which runs under the same condition
    axes[1, 0].bar(datasets, ratios, color='purple', alpha=0.7)
    axes[1, 0].set_xlabel('Dataset Size')
    axes[1, 0].set_ylabel('Memory Ratio (Pandas/Polars)')
    axes[1, 0].set_title('Memory Efficiency Ratio')
    axes[1, 0].axhline(y=1, color='red', linestyle='--', alpha=0.7, label='Same efficiency')
    axes[1, 0].legend()
    
    # Add value labels
    for i, ratio in enumerate(ratios):
        axes[1, 0].text(i, ratio + 0.05, f'{ratio:.1f}x', ha='center', va='bottom')
else:
    axes[1, 0].text(0.5, 0.5, 'No ratio data available', ha='center', va='center')
    axes[1, 0].set_title('Memory Efficiency Ratio')

# 4. Combined performance score
if benchmark_results and memory_comparisons:
    # Calculate combined score (speedup * memory efficiency).
    # The memory measurements are per dataset size, not per operation, so
    # every operation is scaled by the same average ratio - compute it once
    # instead of once per loop iteration.
    avg_memory_ratio = np.mean([comp['memory_ratio'] for comp in memory_comparisons])
    labels = [bench['operation'] for bench in benchmark_results]
    combined_scores = [bench['speedup'] * avg_memory_ratio for bench in benchmark_results]

    bars = axes[1, 1].bar(range(len(labels)), combined_scores, color='teal', alpha=0.8)
    axes[1, 1].set_xlabel('Operation')
    axes[1, 1].set_ylabel('Combined Score (Speed × Memory)')
    axes[1, 1].set_title('Overall Performance Advantage')
    axes[1, 1].set_xticks(range(len(labels)))
    axes[1, 1].set_xticklabels(labels, rotation=45, ha='right')

    # Add value labels
    for bar, score in zip(bars, combined_scores):
        axes[1, 1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.2,
                       f'{score:.1f}', ha='center', va='bottom')
else:
    axes[1, 1].text(0.5, 0.5, 'No combined data available', ha='center', va='center')
    axes[1, 1].set_title('Overall Performance Advantage')

# Render the 2x2 figure built above
plt.tight_layout()
plt.show()

# Summary statistics
print("\n📊 POLARS PERFORMANCE SUMMARY")
print("=" * 40)

# Each section is printed only when its measurements exist
if benchmark_results:
    avg_speedup = np.mean([r['speedup'] for r in benchmark_results])
    best_speedup = max([r['speedup'] for r in benchmark_results])
    print(f"\n⚡ Speed Performance:")
    print(f"   • Average speedup: {avg_speedup:.2f}x")
    print(f"   • Best speedup: {best_speedup:.2f}x")

if memory_comparisons:
    avg_memory_improvement = np.mean([comp['memory_improvement_pct'] for comp in memory_comparisons])
    avg_memory_ratio = np.mean([comp['memory_ratio'] for comp in memory_comparisons])
    print(f"\n💾 Memory Efficiency:")
    print(f"   • Average memory reduction: {avg_memory_improvement:.1f}%")
    print(f"   • Average memory efficiency: {avg_memory_ratio:.2f}x")

# Static talking points; these are printed regardless of the results above.
print(f"\n🎯 Key Advantages:")
print(f"   • Lazy evaluation with query optimization")
print(f"   • Automatic parallelization")
print(f"   • Streaming for large datasets")
print(f"   • Memory efficient columnar storage")
print(f"   • Easy integration with existing ML workflows")

## Best Practices and Recommendations

In [None]:
print("\n🎯 POLARS BEST PRACTICES AND RECOMMENDATIONS")
print("=" * 55)

# Static guidance, kept as (heading, bullet lines) pairs and printed in order.
recommendation_sections = (
    ("\n1. WHEN TO USE POLARS:", (
        "   ✅ Large datasets (> 1GB)",
        "   ✅ Complex aggregations and transformations",
        "   ✅ Memory-constrained environments",
        "   ✅ ETL pipelines with performance requirements",
        "   ✅ Time-series analysis with window functions",
    )),
    ("\n2. OPTIMIZATION TECHNIQUES:", (
        "   • Use lazy evaluation (scan_csv vs read_csv)",
        "   • Enable streaming for large datasets",
        "   • Filter early in the query pipeline",
        "   • Use select() to reduce columns before heavy operations",
        "   • Leverage built-in parallelization (no manual threading needed)",
    )),
    ("\n3. MEMORY MANAGEMENT:", (
        "   • Use appropriate data types (pl.Int32 vs pl.Int64)",
        "   • Leverage categorical data when appropriate",
        "   • Process data in chunks with streaming",
        "   • Avoid collecting() until absolutely necessary",
    )),
    ("\n4. INTEGRATION PATTERNS:", (
        "   • Use Polars for data preprocessing",
        "   • Convert to_pandas() only for ML model training",
        "   • Use to_numpy() for direct array operations",
        "   • Export to parquet for intermediate storage",
    )),
    ("\n5. COMMON PATTERNS:", (
        "   • Chain operations for query optimization",
        "   • Use with_columns() for feature engineering",
        "   • Leverage group_by().agg() for complex aggregations",
        "   • Use window functions for time-series features",
    )),
    ("\n6. PERFORMANCE TIPS:", (
        "   • Prefer Polars expressions over map_elements()",
        "   • Use join() instead of manual merging logic",
        "   • Batch similar operations together",
        "   • Monitor query plans with explain()",
    )),
    ("\n7. WHEN TO STICK WITH PANDAS:", (
        "   • Small datasets (< 100MB)",
        "   • Heavy integration with pandas ecosystem",
        "   • Complex custom functions not available in Polars",
        "   • Interactive data exploration (better visualization support)",
    )),
)

for heading, bullet_lines in recommendation_sections:
    print(heading)
    for bullet_line in bullet_lines:
        print(bullet_line)

# Performance recommendations based on our tests.
# Both lists are known to be non-empty inside this branch, so the best entry
# of each is reported directly.
if benchmark_results and memory_comparisons:
    print("\n📊 PERFORMANCE INSIGHTS (This System):")

    best_operation = max(benchmark_results, key=lambda entry: entry['speedup'])
    print(f"   • Best speedup achieved: {best_operation['speedup']:.2f}x ({best_operation['operation']})")

    best_memory = max(memory_comparisons, key=lambda entry: entry['memory_improvement_pct'])
    print(f"   • Best memory reduction: {best_memory['memory_improvement_pct']:.1f}% ({best_memory['dataset']})")

    print("   • Recommended for datasets > 50MB")
    print("   • Streaming recommended for datasets > 500MB")

# Closing banner
for closing_line in (
    "\n✅ Polars engine analysis complete!",
    "\n🚀 Ready for production use with:",
    "   • Significant performance improvements",
    "   • Reduced memory footprint",
    "   • Easy ML pipeline integration",
    "   • Scalable data processing capabilities",
):
    print(closing_line)