In [None]:
import os
from datetime import datetime, MINYEAR
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from recipies import Ingredients, Recipe
from recipies.constants import Accumulator
from recipies.selector import all_predictors
from recipies.steps import StepHistorical, StepSklearn
from sklearn.impute import MissingIndicator

In [None]:
# Seeded generator so the synthetic columns are identical on every run
rand_state = np.random.RandomState(42)

# Hourly timestamps for the two groups: six entries for id 1, four for id 2
timecolumn = pl.concat(
    [
        pl.datetime_range(datetime(MINYEAR, 1, 1, 0), datetime(MINYEAR, 1, 1, last_hour), "1h", eager=True)
        for last_hour in (5, 3)
    ]
)

# Toy dataset. The three random draws must stay in y -> x1 -> x2 order because
# they consume the same generator stream.
sample_columns = {
    "id": [1] * 6 + [2] * 4,
    "time": timecolumn,
    "y": rand_state.normal(size=(10,)),
    "x1": rand_state.normal(loc=10, scale=5, size=(10,)),
    "x2": rand_state.binomial(n=1, p=0.3, size=(10,)),
    "x3": pl.Series(list("abcacbcabc"), dtype=pl.Categorical),
    "x4": pl.Series(list("xyyxyyxxyx"), dtype=pl.Categorical),
}
df = pl.DataFrame(sample_columns)

# Blank out x1 at fixed row positions so there is something to flag and impute
rows_to_blank = [1, 2, 4, 7]
is_blanked = pl.int_range(pl.len()).is_in(rows_to_blank)
df = df.with_columns(pl.when(is_blanked).then(None).otherwise(pl.col("x1")).alias("x1"))

# Wrap the frame and declare the column roles used by the recipe
ing = Ingredients(df)
rec = Recipe(ing, outcomes=["y"], predictors=["x1", "x2", "x3", "x4"], groups=["id"], sequences=["time"])

# Steps are registered in this order: missing-value indicators, mean imputation
# of x1, then a historical accumulation configured with Accumulator.MEAN.
missing_indicator_step = StepSklearn(MissingIndicator(features="all"), sel=all_predictors())
rec.add_step(missing_indicator_step)
rec.add_step("impute", "x1", method="mean")
historical_mean_step = StepHistorical(sel=all_predictors(), fun=Accumulator.MEAN, suffix="mean_hist")
rec.add_step(historical_mean_step)

# Fit the recipe and transform the ingredients it was built from
df = rec.prep()
# Re-apply the fitted recipe to a copy of that frame (stand-in for e.g. a test set)
df2 = rec.bake(df.copy())

In [None]:
# Load the benchmark results.
# The CSV lives at a machine-specific location, so the path can be overridden
# through the BENCHMARK_RESULTS_CSV environment variable; the original
# location is kept as the fallback so existing setups keep working.
RESULTS_CSV = Path(
    os.environ.get(
        "BENCHMARK_RESULTS_CSV",
        "/Users/robin/Downloads/results_datasizes_[50, 100, 1000, 10000, 100000, 1000000]_seeds_[1, 2, 3, 4, 5]_datetime_2025-07-09_12-27-28.csv",
    )
)

# Fail early with a hint about the override instead of a bare path error
if not RESULTS_CSV.is_file():
    raise FileNotFoundError(
        f"Benchmark results not found at {RESULTS_CSV}. "
        "Set the BENCHMARK_RESULTS_CSV environment variable to the CSV location."
    )

results = pd.read_csv(RESULTS_CSV)

In [None]:
# Keep only the rows whose step name mentions "Historical"
# (case-insensitive; rows with a missing step name are dropped)
is_historical_step = results["step"].str.contains("Historical", case=False, na=False)
results = results[is_historical_step]

In [None]:
# Sanity check of the filtered frame: shape, remaining step names, full contents
print(
    f"Filtered dataset shape: {results.shape}",
    f"Steps after filtering: {results['step'].unique()}",
    results,
    sep="\n",
)

In [None]:
# Overview of the dataset: shape, columns, a preview, and the distinct
# data sizes / steps that are present
print(f"Dataset shape: {results.shape}")
print(f"\nColumn names:\n{results.columns.tolist()}")
print(f"\nFirst few rows:\n{results.head()}")
print(f"\nUnique data sizes: {sorted(results['data_size'].unique())}")
print(f"\nUnique steps: {results['step'].unique()}")

In [None]:
# Aggregate performance by data size (across all steps).
# The input is already aggregated per step, so this averages the per-step
# statistics once more within each data size.
metric_columns = [
    "duration_mean_Polars",
    "duration_mean_Pandas",
    "duration_std_Polars",
    "duration_std_Pandas",
    "memory_mean_Polars",
    "memory_mean_Pandas",
    "memory_std_Polars",
    "memory_std_Pandas",
    "speedup",
]

# Keep the aggregated values at full precision. Rounding them here would feed
# 4-decimal values into every ratio or coefficient of variation derived from
# `summary`; for sub-millisecond durations that distorts the result badly and
# a mean that rounds to 0.0 turns the ratio into inf/NaN.
summary = results.groupby("data_size")[metric_columns].agg("mean")

# Reset index to make data_size a regular column
summary = summary.reset_index()

# Rounding is applied for display only
print("Aggregated Performance by Data Size:")
print(summary.round(4))

In [None]:
# Pandas-to-Polars ratio of the mean duration and of the mean memory,
# one value per data size
for metric in ("duration", "memory"):
    pandas_mean = summary[f"{metric}_mean_Pandas"]
    polars_mean = summary[f"{metric}_mean_Polars"]
    summary[f"{metric}_ratio_pandas_vs_polars"] = (pandas_mean / polars_mean).round(2)

# Coefficient of variation (std expressed as a percentage of the mean) for
# each metric and backend; the loop order fixes the resulting column order
for metric in ("duration", "memory"):
    for backend in ("Polars", "Pandas"):
        spread = summary[f"{metric}_std_{backend}"]
        centre = summary[f"{metric}_mean_{backend}"]
        summary[f"{metric}_cv_{backend.lower()}"] = (spread / centre * 100).round(2)

# Show only the derived columns next to the data size
derived_columns = [
    "data_size",
    "duration_ratio_pandas_vs_polars",
    "memory_ratio_pandas_vs_polars",
    "duration_cv_polars",
    "duration_cv_pandas",
    "memory_cv_polars",
    "memory_cv_pandas",
]
print("\nEnhanced Performance Summary:")
print(summary[derived_columns])

In [None]:
# Six-panel dashboard (2 rows x 3 columns) comparing the two backends.
# Every panel uses the data size on a logarithmic x-axis.
plt.style.use("default")
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
(ax1, ax2, ax3), (ax4, ax5, ax6) = axes
x_sizes = summary["data_size"]

# Panels 1 and 2: mean +/- std per backend, log-log axes
backend_styles = (
    ("Polars", "o", "blue"),
    ("Pandas", "s", "orange"),
)
errorbar_panels = (
    (ax1, "duration", "Duration (seconds)", "Performance Comparison: Duration vs Data Size"),
    (ax2, "memory", "Memory Usage (MB)", "Memory Usage Comparison vs Data Size"),
)
for panel, metric, y_label, panel_title in errorbar_panels:
    for backend, marker, color in backend_styles:
        panel.errorbar(
            x_sizes,
            summary[f"{metric}_mean_{backend}"],
            yerr=summary[f"{metric}_std_{backend}"],
            label=backend,
            marker=marker,
            capsize=5,
            color=color,
        )
    panel.set_xscale("log")
    panel.set_yscale("log")
    panel.set_xlabel("Data Size")
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.legend()
    panel.grid(True, alpha=0.3)

# Panels 3, 4 and 6: a single series against a dashed reference line at y=1
# (for the Pandas/Polars ratios, values above 1 favour Polars)
reference_panels = (
    (
        ax3,
        "duration_ratio_pandas_vs_polars",
        "go-",
        "Equal performance",
        "Speed Ratio (Pandas/Polars)",
        "Speed Advantage: Pandas vs Polars\n(>1 means Polars is faster)",
    ),
    (
        ax4,
        "memory_ratio_pandas_vs_polars",
        "bo-",
        "Equal memory usage",
        "Memory Ratio (Pandas/Polars)",
        "Memory Efficiency: Pandas vs Polars\n(>1 means Polars uses less memory)",
    ),
    (
        ax6,
        "speedup",
        "mo-",
        "No speedup",
        "Speedup Factor",
        "Average Speedup Across All Steps",
    ),
)
for panel, column, line_format, reference_label, y_label, panel_title in reference_panels:
    panel.plot(x_sizes, summary[column], line_format, linewidth=2, markersize=8)
    panel.axhline(y=1, color="r", linestyle="--", alpha=0.7, label=reference_label)
    panel.set_xscale("log")
    panel.set_xlabel("Data Size")
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.legend()
    panel.grid(True, alpha=0.3)

# Panel 5: duration coefficient of variation, one line per backend
cv_series = (
    ("duration_cv_polars", "b^-", "Polars CV"),
    ("duration_cv_pandas", "ro-", "Pandas CV"),
)
for column, line_format, series_label in cv_series:
    ax5.plot(x_sizes, summary[column], line_format, linewidth=2, markersize=8, label=series_label)
ax5.set_xscale("log")
ax5.set_xlabel("Data Size")
ax5.set_ylabel("Coefficient of Variation (%)")
ax5.set_title("Duration Variability: CV Comparison")
ax5.legend()
ax5.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
# Text report: one section per data size with duration, memory and speedup
report_rule = "=" * 100
print(report_rule)
print("COMPREHENSIVE BENCHMARK RESULTS SUMMARY")
print(report_rule)

for _, row in summary.iterrows():
    # iterrows yields the row as one Series, so data_size is cast back to int
    # for the thousands-separator formatting
    size_in_rows = int(row["data_size"])
    speed_ratio = row["duration_ratio_pandas_vs_polars"]
    mem_ratio = row["memory_ratio_pandas_vs_polars"]

    # A ratio above 1 means the Pandas figure is larger, i.e. Polars wins
    if speed_ratio > 1:
        speed_winner = "Polars faster"
    else:
        speed_winner = "Pandas faster"
    if mem_ratio > 1:
        memory_winner = "Polars more efficient"
    else:
        memory_winner = "Pandas more efficient"

    section_lines = [
        f"\nData Size: {size_in_rows:,} rows",
        "-" * 60,
        "PERFORMANCE (Duration):",
        f"  Polars:  {row['duration_mean_Polars']:.4f} ± {row['duration_std_Polars']:.4f} seconds (CV: {row['duration_cv_polars']:.1f}%)",
        f"  Pandas:  {row['duration_mean_Pandas']:.4f} ± {row['duration_std_Pandas']:.4f} seconds (CV: {row['duration_cv_pandas']:.1f}%)",
        f"  Ratio:   {speed_ratio:.2f}x ({speed_winner})",
        "\nMEMORY USAGE:",
        f"  Polars:  {row['memory_mean_Polars']:.2f} ± {row['memory_std_Polars']:.2f} MB (CV: {row['memory_cv_polars']:.1f}%)",
        f"  Pandas:  {row['memory_mean_Pandas']:.2f} ± {row['memory_std_Pandas']:.2f} MB (CV: {row['memory_cv_pandas']:.1f}%)",
        f"  Ratio:   {mem_ratio:.2f}x ({memory_winner})",
        f"\nOVERALL SPEEDUP: {row['speedup']:.2f}x",
    ]
    print("\n".join(section_lines))

In [None]:
# Performance by step analysis
print("\n" + "=" * 100)
print("PERFORMANCE BY STEP ANALYSIS")
print("=" * 100)

# Per-step means across all data sizes, kept at full precision for the
# ratio computation below
step_means = results.groupby("step").agg(
    {
        "duration_mean_Polars": "mean",
        "duration_mean_Pandas": "mean",
        "memory_mean_Polars": "mean",
        "memory_mean_Pandas": "mean",
        "speedup": "mean",
    }
)

# The reported table shows the means rounded to 4 decimals, but the ratios are
# taken from the unrounded means: dividing already-rounded values distorts the
# ratio for very small durations and yields inf when a mean rounds to 0.0.
step_analysis = step_means.round(4)
step_analysis["duration_ratio"] = (
    step_means["duration_mean_Pandas"] / step_means["duration_mean_Polars"]
).round(2)
step_analysis["memory_ratio"] = (
    step_means["memory_mean_Pandas"] / step_means["memory_mean_Polars"]
).round(2)

print("\nAverage Performance by Step (across all data sizes):")
print(step_analysis)

# Find best and worst performing steps for each backend
print(f"\nFastest steps for Polars: {step_analysis.nsmallest(3, 'duration_mean_Polars').index.tolist()}")
print(f"Fastest steps for Pandas: {step_analysis.nsmallest(3, 'duration_mean_Pandas').index.tolist()}")
print(
    f"Steps where Polars has biggest advantage: {step_analysis.nlargest(3, 'duration_ratio').index.tolist()}"
)
print(
    f"Most memory efficient steps (Polars): {step_analysis.nsmallest(3, 'memory_mean_Polars').index.tolist()}"
)

In [None]:
# Write the three result tables to timestamped CSV files in the working directory
timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
summary_file = f"performance_summary_by_datasize_{timestamp}.csv"
step_file = f"performance_summary_by_step_{timestamp}.csv"
comparison_file = f"performance_comparison_{timestamp}.csv"

# Compact comparison table: the duration ratio is exported under a "speed" name
comparison_df = summary[
    ["data_size", "duration_ratio_pandas_vs_polars", "memory_ratio_pandas_vs_polars", "speedup"]
].rename(columns={"duration_ratio_pandas_vs_polars": "speed_ratio_pandas_vs_polars"})

# Aggregated summary by data size
summary.to_csv(summary_file, index=False)
print(f"\nData size performance summary exported to: {summary_file}")

# Step analysis; the index holds the step names, so it is written out too
step_analysis.to_csv(step_file, index=True)
print(f"Step performance summary exported to: {step_file}")

# Comparison table
comparison_df.to_csv(comparison_file, index=False)
print(f"Performance comparison exported to: {comparison_file}")

In [None]:
# Heatmap of the speedup for every (step, data size) combination
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))

# Reshape to a step x data_size matrix of speedup values
duration_heatmap_data = results.pivot(index="step", columns="data_size", values="speedup")

# Colour scale is centred on 1, the value at which both backends are equal
heatmap_options = {
    "annot": True,
    "fmt": ".2f",
    "cmap": "RdYlBu_r",
    "center": 1,
    "cbar_kws": {"label": "Speedup (Polars vs Pandas)"},
}
sns.heatmap(duration_heatmap_data, ax=heatmap_ax, **heatmap_options)

heatmap_ax.set_title("Speedup Heatmap: Polars vs Pandas\nAcross Data Sizes and Steps")
heatmap_ax.set_xlabel("Data Size")
heatmap_ax.set_ylabel("Processing Step")
heatmap_fig.tight_layout()
plt.show()

In [None]:
# Headline numbers for the whole benchmark
section_rule = "=" * 80
print("\n" + section_rule)
print("OVERALL BENCHMARK STATISTICS")
print(section_rule)

# Scope of the benchmark
tested_sizes = summary["data_size"]
print(f"Data sizes tested: {len(summary)} ({tested_sizes.min():,} to {tested_sizes.max():,} rows)")
print(f"Processing steps analyzed: {len(step_analysis)}")
print(f"Total benchmark combinations: {len(results)}")

# Speedup taken over the individual result rows
speedups = results["speedup"]
print("\nOverall Performance Summary:")
print(f"Average speedup across all tests: {speedups.mean():.2f}x")
print(f"Best speedup achieved: {speedups.max():.2f}x")
print(f"Worst speedup: {speedups.min():.2f}x")

# Duration: average of the per-data-size means, and the ratio of those averages
polars_avg_duration = summary["duration_mean_Polars"].mean()
pandas_avg_duration = summary["duration_mean_Pandas"].mean()
print("\nDuration Performance:")
print(f"Average Polars duration: {polars_avg_duration:.4f} seconds")
print(f"Average Pandas duration: {pandas_avg_duration:.4f} seconds")
print(f"Overall speed ratio: {(pandas_avg_duration / polars_avg_duration):.2f}x")

# Memory: same treatment as duration
polars_avg_memory = summary["memory_mean_Polars"].mean()
pandas_avg_memory = summary["memory_mean_Pandas"].mean()
print("\nMemory Performance:")
print(f"Average Polars memory: {polars_avg_memory:.2f} MB")
print(f"Average Pandas memory: {pandas_avg_memory:.2f} MB")
print(f"Overall memory ratio: {(pandas_avg_memory / polars_avg_memory):.2f}x")