From 3a5d62701366da8bc4ae800de813e11dafc3be79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Sun, 16 Nov 2025 21:58:29 -0800 Subject: [PATCH 1/3] test(performance): add cold start benchmarking infrastructure Add comprehensive tooling to measure and compare cold start performance across different branches and code changes. Changes: - Add test_cold_start.py: measures import times, module counts, and lazy loading status with 10 iterations per measurement - Add benchmark_cold_start.sh: automates running benchmarks on different git branches with stash/restore logic - Add compare_benchmarks.py: analyzes and visualizes differences between two benchmark runs with colored output - Add benchmark_results/ to .gitignore: exclude generated JSON data The benchmark suite validates: - Import time for runpod, runpod.serverless, and runpod.endpoint - Total module count and runpod-specific module count - Whether paramiko and SSH CLI modules are eagerly or lazy-loaded - Performance regression detection (fails if import > 1000ms) Usage: # Run on current branch uv run pytest tests/test_performance/test_cold_start.py # Compare two branches ./scripts/benchmark_cold_start.sh main feature-branch Results saved to benchmark_results/ as timestamped JSON files for historical comparison and CI/CD integration. --- .gitignore | 1 + scripts/benchmark_cold_start.sh | 134 ++++++++++++ scripts/compare_benchmarks.py | 174 ++++++++++++++++ tests/test_performance/__init__.py | 1 + tests/test_performance/test_cold_start.py | 240 ++++++++++++++++++++++ 5 files changed, 550 insertions(+) create mode 100755 scripts/benchmark_cold_start.sh create mode 100755 scripts/compare_benchmarks.py create mode 100644 tests/test_performance/__init__.py create mode 100644 tests/test_performance/test_cold_start.py diff --git a/.gitignore b/.gitignore index 36fa868c..eb63accf 100644 --- a/.gitignore +++ b/.gitignore @@ -141,3 +141,4 @@ runpod/_version.py .runpod_jobs.pkl *.lock +benchmark_results/ diff --git a/scripts/benchmark_cold_start.sh b/scripts/benchmark_cold_start.sh new file mode 100755 index 00000000..28d303c6 --- /dev/null +++ b/scripts/benchmark_cold_start.sh @@ -0,0 +1,134 @@ +#!/bin/bash +# Run cold start benchmarks on different git branches and compare results. +# +# Usage: +# ./scripts/benchmark_cold_start.sh # Run on current branch +# ./scripts/benchmark_cold_start.sh main # Run on main branch +# ./scripts/benchmark_cold_start.sh main feat/lazy # Compare two branches + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +RESULTS_DIR="$PROJECT_ROOT/benchmark_results" + +mkdir -p "$RESULTS_DIR" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +log_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Function to run benchmark on a branch +run_benchmark_on_branch() { + local branch=$1 + local output_file=$2 + + log_info "Running benchmark on branch: $branch" + + # Save current state + local current_branch=$(git branch --show-current) + local has_changes=$(git status --porcelain) + + if [ -n "$has_changes" ]; then + log_warn "Working directory has uncommitted changes" + log_warn "Stashing changes..." 
+ git stash push -m "Benchmark stash $(date +%s)" + fi + + # Checkout target branch + if [ "$branch" != "$current_branch" ]; then + log_info "Checking out branch: $branch" + git checkout "$branch" + fi + + # Install dependencies + log_info "Installing dependencies..." + uv sync --group test > /dev/null 2>&1 + + # Run benchmark + log_info "Running benchmark..." + cd "$PROJECT_ROOT" + uv run python tests/test_performance/test_cold_start.py > /dev/null 2>&1 + + # Copy latest result to output file + if [ -f "$RESULTS_DIR/cold_start_latest.json" ]; then + cp "$RESULTS_DIR/cold_start_latest.json" "$output_file" + log_info "Results saved to: $output_file" + else + log_error "Benchmark failed to produce results" + return 1 + fi + + # Restore original state + if [ "$branch" != "$current_branch" ]; then + log_info "Returning to branch: $current_branch" + git checkout "$current_branch" + fi + + if [ -n "$has_changes" ]; then + log_info "Restoring stashed changes..." + git stash pop > /dev/null 2>&1 + fi +} + +# Main script logic +if [ $# -eq 0 ]; then + # Run on current branch only + log_info "Running benchmark on current branch" + current_branch=$(git branch --show-current || echo "detached") + output_file="$RESULTS_DIR/cold_start_${current_branch//\//_}_$(date +%s).json" + + uv run python tests/test_performance/test_cold_start.py + + if [ -f "$RESULTS_DIR/cold_start_latest.json" ]; then + cp "$RESULTS_DIR/cold_start_latest.json" "$output_file" + log_info "Results saved to: $output_file" + log_info "Latest results: $RESULTS_DIR/cold_start_latest.json" + fi + +elif [ $# -eq 1 ]; then + # Run on specified branch + branch=$1 + output_file="$RESULTS_DIR/cold_start_${branch//\//_}_$(date +%s).json" + run_benchmark_on_branch "$branch" "$output_file" + +elif [ $# -eq 2 ]; then + # Compare two branches + baseline_branch=$1 + optimized_branch=$2 + + log_info "Comparing branches: $baseline_branch vs $optimized_branch" + + baseline_file="$RESULTS_DIR/cold_start_baseline_$(date +%s).json" + optimized_file="$RESULTS_DIR/cold_start_optimized_$(date +%s).json" + + # Run benchmarks + run_benchmark_on_branch "$baseline_branch" "$baseline_file" + run_benchmark_on_branch "$optimized_branch" "$optimized_file" + + # Compare results + log_info "Comparing results..." + uv run python "$SCRIPT_DIR/compare_benchmarks.py" "$baseline_file" "$optimized_file" + +else + log_error "Invalid number of arguments" + echo "Usage:" + echo " $0 # Run on current branch" + echo " $0 # Run on specified branch" + echo " $0 # Compare two branches" + exit 1 +fi diff --git a/scripts/compare_benchmarks.py b/scripts/compare_benchmarks.py new file mode 100755 index 00000000..e6b801d6 --- /dev/null +++ b/scripts/compare_benchmarks.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 +""" +Compare cold start benchmark results between two runs. 
+ +Usage: + python scripts/compare_benchmarks.py baseline.json optimized.json + python scripts/compare_benchmarks.py benchmark_results/cold_start_1234.json benchmark_results/cold_start_5678.json +""" + +import json +import sys +from pathlib import Path + + +def load_benchmark(file_path: str) -> dict: + """Load benchmark results from JSON file.""" + with open(file_path) as f: + return json.load(f) + + +def calculate_improvement(baseline: float, optimized: float) -> dict: + """Calculate improvement metrics.""" + diff = baseline - optimized + percent = (diff / baseline) * 100 if baseline > 0 else 0 + + return { + "diff_ms": round(diff, 2), + "percent": round(percent, 2), + "improved": diff > 0, + } + + +def compare_benchmarks(baseline_file: str, optimized_file: str): + """Compare two benchmark results and print analysis.""" + baseline = load_benchmark(baseline_file) + optimized = load_benchmark(optimized_file) + + print("=" * 70) + print("COLD START BENCHMARK COMPARISON") + print("=" * 70) + print(f"\nBaseline: {baseline_file}") + print(f"Optimized: {optimized_file}") + print() + + # Compare main measurements + print("IMPORT TIME COMPARISON") + print("-" * 70) + print( + f"{'Metric':<25} {'Baseline':>12} {'Optimized':>12} {'Δ ms':>10} {'Δ %':>8}" + ) + print("-" * 70) + + measurements = baseline["measurements"] + opt_measurements = optimized["measurements"] + + total_improvement_ms = 0 + total_baseline_ms = 0 + + for key in sorted(measurements.keys()): + if key in opt_measurements: + baseline_val = measurements[key]["mean"] + optimized_val = opt_measurements[key]["mean"] + improvement = calculate_improvement(baseline_val, optimized_val) + + symbol = "↓" if improvement["improved"] else "↑" + color = "\033[92m" if improvement["improved"] else "\033[91m" + reset = "\033[0m" + + print( + f"{key:<25} {baseline_val:>10.2f}ms {optimized_val:>10.2f}ms " + f"{color}{symbol}{improvement['diff_ms']:>8.2f}ms {improvement['percent']:>6.2f}%{reset}" + ) + + if key == "runpod_total": + total_improvement_ms = improvement["diff_ms"] + total_baseline_ms = baseline_val + + print("-" * 70) + + # Module counts + print("\nMODULE LOAD COMPARISON") + print("-" * 70) + + baseline_counts = baseline.get("module_counts", {}) + opt_counts = optimized.get("module_counts", {}) + + if baseline_counts and opt_counts: + total_diff = baseline_counts["total"] - opt_counts["total"] + filtered_diff = baseline_counts["filtered"] - opt_counts["filtered"] + + print(f"Total modules loaded:") + print( + f" Baseline: {baseline_counts['total']:>4} Optimized: {opt_counts['total']:>4} Δ: {total_diff:>4}" + ) + print(f"Runpod modules loaded:") + print( + f" Baseline: {baseline_counts['filtered']:>4} Optimized: {opt_counts['filtered']:>4} Δ: {filtered_diff:>4}" + ) + + # Lazy loading checks + print("\nLAZY LOADING STATUS") + print("-" * 70) + + checks = [ + ("paramiko_eagerly_loaded", "Paramiko"), + ("ssh_cli_loaded", "SSH CLI"), + ] + + for key, label in checks: + baseline_loaded = baseline.get(key, False) + opt_loaded = optimized.get(key, False) + + baseline_status = "LOADED" if baseline_loaded else "NOT LOADED" + opt_status = "LOADED" if opt_loaded else "NOT LOADED" + + if baseline_loaded and not opt_loaded: + status_symbol = "✓ NOW LAZY" + color = "\033[92m" + elif not baseline_loaded and opt_loaded: + status_symbol = "✗ NOW EAGER" + color = "\033[91m" + else: + status_symbol = "- NO CHANGE" + color = "\033[93m" + + reset = "\033[0m" + print( + f"{label:<20} Baseline: {baseline_status:<12} Optimized: {opt_status:<12} 
{color}{status_symbol}{reset}" + ) + + # Summary + print("\n" + "=" * 70) + print("SUMMARY") + print("=" * 70) + + if total_improvement_ms > 0: + percent_improvement = ( + total_improvement_ms / total_baseline_ms + ) * 100 + print(f"✓ Cold start improved by {total_improvement_ms:.2f}ms") + print( + f"✓ That's a {percent_improvement:.1f}% improvement over baseline" + ) + print( + f"✓ Baseline: {total_baseline_ms:.2f}ms → Optimized: {total_baseline_ms - total_improvement_ms:.2f}ms" + ) + elif total_improvement_ms < 0: + print( + f"✗ Cold start regressed by {abs(total_improvement_ms):.2f}ms" + ) + print(" Review changes - performance got worse!") + else: + print("- No significant change in cold start time") + + print("=" * 70) + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: python compare_benchmarks.py ") + sys.exit(1) + + baseline_file = sys.argv[1] + optimized_file = sys.argv[2] + + if not Path(baseline_file).exists(): + print(f"Error: Baseline file not found: {baseline_file}") + sys.exit(1) + + if not Path(optimized_file).exists(): + print(f"Error: Optimized file not found: {optimized_file}") + sys.exit(1) + + compare_benchmarks(baseline_file, optimized_file) diff --git a/tests/test_performance/__init__.py b/tests/test_performance/__init__.py new file mode 100644 index 00000000..11c36e26 --- /dev/null +++ b/tests/test_performance/__init__.py @@ -0,0 +1 @@ +"""Performance and benchmark tests for runpod.""" diff --git a/tests/test_performance/test_cold_start.py b/tests/test_performance/test_cold_start.py new file mode 100644 index 00000000..8e1f79a3 --- /dev/null +++ b/tests/test_performance/test_cold_start.py @@ -0,0 +1,240 @@ +""" +Cold start performance benchmarks for runpod package. + +These tests measure import times and memory usage to track cold start +performance across different branches and changes. +""" + +import json +import subprocess +import sys +import time +from pathlib import Path + + +def measure_import_time(module_name: str, iterations: int = 10) -> dict: + """ + Measure the time it takes to import a module in a fresh Python process. + + Args: + module_name: Name of the module to import + iterations: Number of iterations to average + + Returns: + dict with min, max, mean, and median times in milliseconds + """ + times = [] + + for _ in range(iterations): + result = subprocess.run( + [ + sys.executable, + "-c", + f"import time; start = time.perf_counter(); import {module_name}; " + f"print((time.perf_counter() - start) * 1000)", + ], + capture_output=True, + text=True, + timeout=10, + ) + + if result.returncode == 0: + times.append(float(result.stdout.strip())) + else: + raise RuntimeError( + f"Failed to import {module_name}: {result.stderr}" + ) + + times.sort() + return { + "min": round(times[0], 2), + "max": round(times[-1], 2), + "mean": round(sum(times) / len(times), 2), + "median": round(times[len(times) // 2], 2), + "iterations": iterations, + } + + +def count_loaded_modules(module_name: str, module_filter: str = None) -> dict: + """ + Count how many modules are loaded after importing a module. 
+ + Args: + module_name: Name of the module to import + module_filter: Optional filter to count specific module namespaces + + Returns: + dict with total count and filtered count + """ + script = f""" +import sys +import {module_name} + +all_modules = list(sys.modules.keys()) +total = len(all_modules) + +if {repr(module_filter)}: + filtered = [m for m in all_modules if {repr(module_filter)} in m] + print(f"{{total}},{{len(filtered)}}") +else: + print(f"{{total}},0") +""" + + result = subprocess.run( + [sys.executable, "-c", script], + capture_output=True, + text=True, + timeout=10, + ) + + if result.returncode == 0: + total, filtered = result.stdout.strip().split(",") + return {"total": int(total), "filtered": int(filtered)} + else: + raise RuntimeError(f"Failed to count modules: {result.stderr}") + + +def check_module_loaded(import_statement: str, module_to_check: str) -> bool: + """ + Check if a specific module is loaded after an import statement. + + Args: + import_statement: Python import statement to execute + module_to_check: Module name to check in sys.modules + + Returns: + True if module is loaded, False otherwise + """ + script = f""" +import sys +{import_statement} +print('yes' if '{module_to_check}' in sys.modules else 'no') +""" + + result = subprocess.run( + [sys.executable, "-c", script], + capture_output=True, + text=True, + timeout=10, + ) + + if result.returncode == 0: + return result.stdout.strip() == "yes" + else: + raise RuntimeError(f"Failed to check module: {result.stderr}") + + +def run_full_benchmark() -> dict: + """ + Run a comprehensive cold start benchmark suite. + + Returns: + dict with all benchmark results + """ + print("Running cold start benchmarks...") + print("-" * 60) + + benchmark_results = { + "timestamp": time.time(), + "python_version": sys.version, + "measurements": {}, + } + + # Measure main runpod import + print("Measuring 'import runpod'...") + benchmark_results["measurements"]["runpod_total"] = measure_import_time( + "runpod" + ) + print( + f" Mean: {benchmark_results['measurements']['runpod_total']['mean']}ms" + ) + + # Measure serverless-only import + print("Measuring 'import runpod.serverless'...") + benchmark_results["measurements"][ + "runpod_serverless" + ] = measure_import_time("runpod.serverless") + print( + f" Mean: {benchmark_results['measurements']['runpod_serverless']['mean']}ms" + ) + + # Measure endpoint import + print("Measuring 'import runpod.endpoint'...") + benchmark_results["measurements"]["runpod_endpoint"] = measure_import_time( + "runpod.endpoint" + ) + print( + f" Mean: {benchmark_results['measurements']['runpod_endpoint']['mean']}ms" + ) + + # Count loaded modules + print("Counting loaded modules...") + module_counts = count_loaded_modules("runpod", "runpod") + benchmark_results["module_counts"] = module_counts + print(f" Total modules: {module_counts['total']}") + print(f" Runpod modules: {module_counts['filtered']}") + + # Check if paramiko is loaded + print("Checking if paramiko is eagerly loaded...") + paramiko_loaded = check_module_loaded("import runpod", "paramiko") + benchmark_results["paramiko_eagerly_loaded"] = paramiko_loaded + print(f" Paramiko loaded: {paramiko_loaded}") + + # Check if CLI modules are loaded + print("Checking if CLI modules are loaded...") + cli_loaded = check_module_loaded("import runpod", "runpod.cli.groups.ssh") + benchmark_results["ssh_cli_loaded"] = cli_loaded + print(f" SSH CLI loaded: {cli_loaded}") + + # Measure heavy dependencies if they're loaded + if paramiko_loaded: + 
print("Measuring 'import paramiko' (since it's loaded)...") + try: + benchmark_results["measurements"][ + "paramiko" + ] = measure_import_time("paramiko") + print( + f" Mean: {benchmark_results['measurements']['paramiko']['mean']}ms" + ) + except Exception as e: + print(f" Failed: {e}") + + print("-" * 60) + print("Benchmark complete!") + + return benchmark_results + + +def test_cold_start_benchmark(tmp_path): + """ + Pytest test that runs the benchmark and saves results to a file. + """ + results = run_full_benchmark() + + # Save results to a timestamped file + output_dir = Path("benchmark_results") + output_dir.mkdir(exist_ok=True) + + timestamp = int(time.time()) + output_file = output_dir / f"cold_start_{timestamp}.json" + + with open(output_file, "w") as f: + json.dump(results, f, indent=2) + + print(f"\nResults saved to: {output_file}") + + # Also save as latest for easy comparison + latest_file = output_dir / "cold_start_latest.json" + with open(latest_file, "w") as f: + json.dump(results, f, indent=2) + + # Assert that import time is reasonable (adjust threshold as needed) + assert ( + results["measurements"]["runpod_total"]["mean"] < 1000 + ), "Import time exceeds 1000ms" + + +if __name__ == "__main__": + results = run_full_benchmark() + print("\nFull Results:") + print(json.dumps(results, indent=2)) From 3f29b04f2a1812dc2d3d978eed22864a86d6e71e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 18 Nov 2025 03:32:29 -0800 Subject: [PATCH 2/3] docs(performance): add comprehensive benchmarking usage guide Add detailed README for cold start benchmarking tools covering: - Quick start examples for common use cases - Tool documentation with usage patterns and output examples - Result file structure and naming conventions - Performance targets and interpretation guidance - CI/CD integration examples - Troubleshooting common issues The guide enables developers to effectively measure, compare, and validate cold start performance improvements across code changes. --- scripts/README.md | 285 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 285 insertions(+) create mode 100644 scripts/README.md diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 00000000..cfa29e64 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,285 @@ +# Cold Start Benchmarking + +Performance benchmarking tools for measuring and comparing cold start times across different code changes. + +## Quick Start + +```bash +# Run benchmark on current branch +uv run pytest tests/test_performance/test_cold_start.py + +# Compare two branches +./scripts/benchmark_cold_start.sh main my-feature-branch + +# Compare two existing result files +uv run python scripts/compare_benchmarks.py benchmark_results/cold_start_baseline.json benchmark_results/cold_start_latest.json +``` + +## What Gets Measured + +- **Import times**: `import runpod`, `import runpod.serverless`, `import runpod.endpoint` +- **Module counts**: Total modules loaded and runpod-specific modules +- **Lazy loading status**: Whether paramiko and SSH CLI are eagerly or lazy-loaded +- **Statistics**: Min, max, mean, median across 10 iterations per measurement + +## Tools + +### 1. test_cold_start.py + +Core benchmark test that measures import performance in fresh Python subprocesses. 
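The measurement strategy, condensed from `measure_import_time()` in the test module, is to time each import in a fresh interpreter so that module caching in the current process cannot skew the numbers. A minimal sketch of one sample (assuming the `runpod` package is importable):

```python
# One cold-start sample: spawn a fresh interpreter so sys.modules starts empty,
# then time the import inside that child process and read the result back.
import subprocess
import sys

def time_import_once(module_name: str) -> float:
    """Return the import time of `module_name` in milliseconds."""
    code = (
        "import time; start = time.perf_counter(); "
        f"import {module_name}; "
        "print((time.perf_counter() - start) * 1000)"
    )
    result = subprocess.run(
        [sys.executable, "-c", code],
        capture_output=True, text=True, timeout=10, check=True,
    )
    return float(result.stdout.strip())

samples = sorted(time_import_once("runpod") for _ in range(10))
print(f"fastest: {samples[0]:.2f}ms, slowest: {samples[-1]:.2f}ms")
```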
+ +```bash +# Run as pytest test +uv run pytest tests/test_performance/test_cold_start.py -v + +# Run as standalone script +uv run python tests/test_performance/test_cold_start.py + +# Results saved to: +# - benchmark_results/cold_start_.json +# - benchmark_results/cold_start_latest.json (always latest) +``` + +**Output Example:** +``` +Running cold start benchmarks... +------------------------------------------------------------ +Measuring 'import runpod'... + Mean: 273.29ms +Measuring 'import runpod.serverless'... + Mean: 332.18ms +Counting loaded modules... + Total modules: 582 + Runpod modules: 46 +Checking if paramiko is eagerly loaded... + Paramiko loaded: False +``` + +### 2. benchmark_cold_start.sh + +Automated benchmark runner that handles git branch switching, dependency installation, and result collection. + +```bash +# Run on current branch (no git operations) +./scripts/benchmark_cold_start.sh + +# Run on specific branch +./scripts/benchmark_cold_start.sh main + +# Compare two branches (runs both, then compares) +./scripts/benchmark_cold_start.sh main feature/lazy-loading +``` + +**Features:** +- Automatic stash/unstash of uncommitted changes +- Dependency installation per branch +- Safe branch switching with restoration +- Timestamped result files +- Automatic comparison when comparing branches + +**Safety:** +- Stashes uncommitted changes before switching branches +- Restores original branch after completion +- Handles errors gracefully + +### 3. compare_benchmarks.py + +Analyzes and visualizes differences between two benchmark runs with colored terminal output. + +```bash +uv run python scripts/compare_benchmarks.py +``` + +**Output Example:** +``` +====================================================================== +COLD START BENCHMARK COMPARISON +====================================================================== + +IMPORT TIME COMPARISON +---------------------------------------------------------------------- +Metric Baseline Optimized Δ ms Δ % +---------------------------------------------------------------------- +runpod_total 285.64ms 273.29ms ↓ 12.35ms 4.32% +runpod_serverless 376.33ms 395.14ms ↑ -18.81ms -5.00% +runpod_endpoint 378.61ms 399.36ms ↑ -20.75ms -5.48% + +MODULE LOAD COMPARISON +---------------------------------------------------------------------- +Total modules loaded: + Baseline: 698 Optimized: 582 Δ: 116 +Runpod modules loaded: + Baseline: 48 Optimized: 46 Δ: 2 + +LAZY LOADING STATUS +---------------------------------------------------------------------- +Paramiko Baseline: LOADED Optimized: NOT LOADED ✓ NOW LAZY +SSH CLI Baseline: LOADED Optimized: NOT LOADED ✓ NOW LAZY + +====================================================================== +SUMMARY +====================================================================== +✓ Cold start improved by 12.35ms +✓ That's a 4.3% improvement over baseline +✓ Baseline: 285.64ms → Optimized: 273.29ms +====================================================================== +``` + +**Color coding:** +- Green: Improvements (faster times, lazy loading achieved) +- Red: Regressions (slower times, eager loading introduced) +- Yellow: No change + +## Result Files + +All benchmark results are saved to `benchmark_results/` (gitignored). 
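Because each run is written as plain JSON, the files can be consumed by other tooling directly. For example, a small CI gate script might look like this (illustrative sketch; the 1000ms budget mirrors the assertion in `test_cold_start.py`):

```python
# Fail the build if the latest measured cold start exceeds the budget.
import json
import sys

BUDGET_MS = 1000  # same threshold as the pytest assertion

with open("benchmark_results/cold_start_latest.json") as f:
    results = json.load(f)

mean_ms = results["measurements"]["runpod_total"]["mean"]
print(f"import runpod: {mean_ms:.2f}ms (budget: {BUDGET_MS}ms)")
sys.exit(0 if mean_ms < BUDGET_MS else 1)
```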
+ +**File naming:** +- `cold_start_.json` - Timestamped result +- `cold_start_latest.json` - Always contains most recent result +- `cold_start_baseline.json` - Manually saved baseline for comparison + +**JSON structure:** +```json +{ + "timestamp": 1763179522.0437188, + "python_version": "3.8.20 (default, Oct 2 2024, 16:12:59) [Clang 18.1.8 ]", + "measurements": { + "runpod_total": { + "min": 375.97, + "max": 527.9, + "mean": 393.91, + "median": 380.4, + "iterations": 10 + } + }, + "module_counts": { + "total": 698, + "filtered": 48 + }, + "paramiko_eagerly_loaded": true, + "ssh_cli_loaded": true +} +``` + +## Common Workflows + +### Testing a Performance Optimization + +```bash +# 1. Save baseline on main branch +git checkout main +./scripts/benchmark_cold_start.sh +cp benchmark_results/cold_start_latest.json benchmark_results/cold_start_baseline.json + +# 2. Switch to feature branch +git checkout feature/my-optimization + +# 3. Run benchmark and compare +./scripts/benchmark_cold_start.sh +uv run python scripts/compare_benchmarks.py \ + benchmark_results/cold_start_baseline.json \ + benchmark_results/cold_start_latest.json +``` + +### Comparing Multiple Approaches + +```bash +# Compare three different optimization branches +./scripts/benchmark_cold_start.sh main > results_main.txt +./scripts/benchmark_cold_start.sh feature/approach-1 > results_1.txt +./scripts/benchmark_cold_start.sh feature/approach-2 > results_2.txt + +# Then compare each against baseline +uv run python scripts/compare_benchmarks.py \ + benchmark_results/cold_start_main_*.json \ + benchmark_results/cold_start_approach-1_*.json +``` + +### CI/CD Integration + +Add to your GitHub Actions workflow: + +```yaml +- name: Run cold start benchmark + run: | + uv run pytest tests/test_performance/test_cold_start.py --timeout=120 + +- name: Upload benchmark results + uses: actions/upload-artifact@v3 + with: + name: benchmark-results + path: benchmark_results/cold_start_latest.json +``` + +## Performance Targets + +Based on testing with Python 3.8: + +- **Cold start (import runpod)**: < 300ms (mean) +- **Serverless import**: < 400ms (mean) +- **Module count**: < 600 total modules +- **Test assertion**: Fails if import > 1000ms + +## Interpreting Results + +### Import Time Variance + +Subprocess-based measurements have inherent variance: +- First run in sequence: Often 20-50ms slower (Python startup overhead) +- Subsequent runs: More stable +- **Use median or mean** for comparison, not single runs + +### Module Count + +- **Fewer modules = faster cold start**: Each module has import overhead +- **Runpod-specific modules**: Should be minimal (40-50) +- **Total modules**: Includes stdlib and dependencies +- **Target reduction**: Removing 100+ modules typically saves 10-30ms + +### Lazy Loading Validation + +- `paramiko_eagerly_loaded: false` - Good for serverless workers +- `ssh_cli_loaded: false` - Good for SDK users +- These should only be `true` when CLI commands are invoked + +## Troubleshooting + +### High Variance in Results + +If you see >100ms variance between runs: +- System is under load +- Disk I/O contention +- Python bytecode cache issues + +**Solution:** Run multiple times and use median values. 
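A quick way to do that is to aggregate the per-run medians from the timestamped result files (illustrative; assumes the default `benchmark_results/` output directory):

```python
# Take the median of the per-run medians across all saved benchmark results.
import json
from pathlib import Path
from statistics import median

per_run_medians = []
for path in sorted(Path("benchmark_results").glob("cold_start_*.json")):
    if path.name == "cold_start_latest.json":
        continue  # the "latest" file duplicates the most recent timestamped run
    with open(path) as f:
        data = json.load(f)
    per_run_medians.append(data["measurements"]["runpod_total"]["median"])

if per_run_medians:
    print(f"{len(per_run_medians)} runs, median of medians: {median(per_run_medians):.2f}ms")
```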
+ +### benchmark_cold_start.sh Fails + +```bash +# Check git status +git status + +# Manually restore if script failed mid-execution +git checkout +git stash pop +``` + +### Import Errors During Benchmark + +Ensure dependencies are installed: +```bash +uv sync --group test +``` + +## Benchmark Accuracy + +- **Iterations**: 10 per measurement (configurable in test) +- **Process isolation**: Each measurement uses fresh subprocess +- **Python cache**: Cleared by subprocess creation +- **System state**: Cannot control OS-level caching + +For production performance testing, consider: +- Running on CI with consistent environment +- Multiple runs at different times +- Comparing trends over multiple commits From a77f08a8afe9c3eb51dc3c12602859aa529360d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 18 Nov 2025 03:47:38 -0800 Subject: [PATCH 3/3] fix(performance): address Copilot PR feedback Address code review feedback from PR #467: 1. Fix median calculation for even-length lists - Previously only returned single middle value - Now correctly averages the two middle values for even-length lists - Maintains correct behavior for odd-length lists 2. Update usage message to match documented pattern - Changed from "python" to "uv run python scripts/..." - Aligns with project's uv-based tooling conventions - Matches usage examples in README and throughout codebase These fixes improve statistical accuracy and documentation consistency. --- scripts/compare_benchmarks.py | 2 +- tests/test_performance/test_cold_start.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/compare_benchmarks.py b/scripts/compare_benchmarks.py index e6b801d6..e942f262 100755 --- a/scripts/compare_benchmarks.py +++ b/scripts/compare_benchmarks.py @@ -157,7 +157,7 @@ def compare_benchmarks(baseline_file: str, optimized_file: str): if __name__ == "__main__": if len(sys.argv) != 3: - print("Usage: python compare_benchmarks.py ") + print("Usage: uv run python scripts/compare_benchmarks.py ") sys.exit(1) baseline_file = sys.argv[1] diff --git a/tests/test_performance/test_cold_start.py b/tests/test_performance/test_cold_start.py index 8e1f79a3..a8e555ae 100644 --- a/tests/test_performance/test_cold_start.py +++ b/tests/test_performance/test_cold_start.py @@ -50,7 +50,11 @@ def measure_import_time(module_name: str, iterations: int = 10) -> dict: "min": round(times[0], 2), "max": round(times[-1], 2), "mean": round(sum(times) / len(times), 2), - "median": round(times[len(times) // 2], 2), + "median": round( + times[len(times) // 2] if len(times) % 2 == 1 else + (times[len(times) // 2 - 1] + times[len(times) // 2]) / 2, + 2 + ), "iterations": iterations, }
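As a sanity check on the corrected statistic, the patched expression averages the two middle values for an even-length sorted sample (illustrative):

```python
# Even-length sorted sample: the median is the mean of the two middle values.
times = [10.0, 12.0, 14.0, 20.0]
n = len(times)
median = times[n // 2] if n % 2 == 1 else (times[n // 2 - 1] + times[n // 2]) / 2
assert median == 13.0  # (12.0 + 14.0) / 2, not the 14.0 the old expression returned
```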