From 3a5d62701366da8bc4ae800de813e11dafc3be79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Sun, 16 Nov 2025 21:58:29 -0800 Subject: [PATCH 1/3] test(performance): add cold start benchmarking infrastructure Add comprehensive tooling to measure and compare cold start performance across different branches and code changes. Changes: - Add test_cold_start.py: measures import times, module counts, and lazy loading status with 10 iterations per measurement - Add benchmark_cold_start.sh: automates running benchmarks on different git branches with stash/restore logic - Add compare_benchmarks.py: analyzes and visualizes differences between two benchmark runs with colored output - Add benchmark_results/ to .gitignore: exclude generated JSON data The benchmark suite validates: - Import time for runpod, runpod.serverless, and runpod.endpoint - Total module count and runpod-specific module count - Whether paramiko and SSH CLI modules are eagerly or lazy-loaded - Performance regression detection (fails if import > 1000ms) Usage: # Run on current branch uv run pytest tests/test_performance/test_cold_start.py # Compare two branches ./scripts/benchmark_cold_start.sh main feature-branch Results saved to benchmark_results/ as timestamped JSON files for historical comparison and CI/CD integration. --- .gitignore | 1 + scripts/benchmark_cold_start.sh | 134 ++++++++++++ scripts/compare_benchmarks.py | 174 ++++++++++++++++ tests/test_performance/__init__.py | 1 + tests/test_performance/test_cold_start.py | 240 ++++++++++++++++++++++ 5 files changed, 550 insertions(+) create mode 100755 scripts/benchmark_cold_start.sh create mode 100755 scripts/compare_benchmarks.py create mode 100644 tests/test_performance/__init__.py create mode 100644 tests/test_performance/test_cold_start.py diff --git a/.gitignore b/.gitignore index 36fa868c..eb63accf 100644 --- a/.gitignore +++ b/.gitignore @@ -141,3 +141,4 @@ runpod/_version.py .runpod_jobs.pkl *.lock +benchmark_results/ diff --git a/scripts/benchmark_cold_start.sh b/scripts/benchmark_cold_start.sh new file mode 100755 index 00000000..28d303c6 --- /dev/null +++ b/scripts/benchmark_cold_start.sh @@ -0,0 +1,134 @@ +#!/bin/bash +# Run cold start benchmarks on different git branches and compare results. +# +# Usage: +# ./scripts/benchmark_cold_start.sh # Run on current branch +# ./scripts/benchmark_cold_start.sh main # Run on main branch +# ./scripts/benchmark_cold_start.sh main feat/lazy # Compare two branches + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +RESULTS_DIR="$PROJECT_ROOT/benchmark_results" + +mkdir -p "$RESULTS_DIR" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +log_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Function to run benchmark on a branch +run_benchmark_on_branch() { + local branch=$1 + local output_file=$2 + + log_info "Running benchmark on branch: $branch" + + # Save current state + local current_branch=$(git branch --show-current) + local has_changes=$(git status --porcelain) + + if [ -n "$has_changes" ]; then + log_warn "Working directory has uncommitted changes" + log_warn "Stashing changes..." 
+ git stash push -m "Benchmark stash $(date +%s)" + fi + + # Checkout target branch + if [ "$branch" != "$current_branch" ]; then + log_info "Checking out branch: $branch" + git checkout "$branch" + fi + + # Install dependencies + log_info "Installing dependencies..." + uv sync --group test > /dev/null 2>&1 + + # Run benchmark + log_info "Running benchmark..." + cd "$PROJECT_ROOT" + uv run python tests/test_performance/test_cold_start.py > /dev/null 2>&1 + + # Copy latest result to output file + if [ -f "$RESULTS_DIR/cold_start_latest.json" ]; then + cp "$RESULTS_DIR/cold_start_latest.json" "$output_file" + log_info "Results saved to: $output_file" + else + log_error "Benchmark failed to produce results" + return 1 + fi + + # Restore original state + if [ "$branch" != "$current_branch" ]; then + log_info "Returning to branch: $current_branch" + git checkout "$current_branch" + fi + + if [ -n "$has_changes" ]; then + log_info "Restoring stashed changes..." + git stash pop > /dev/null 2>&1 + fi +} + +# Main script logic +if [ $# -eq 0 ]; then + # Run on current branch only + log_info "Running benchmark on current branch" + current_branch=$(git branch --show-current || echo "detached") + output_file="$RESULTS_DIR/cold_start_${current_branch//\//_}_$(date +%s).json" + + uv run python tests/test_performance/test_cold_start.py + + if [ -f "$RESULTS_DIR/cold_start_latest.json" ]; then + cp "$RESULTS_DIR/cold_start_latest.json" "$output_file" + log_info "Results saved to: $output_file" + log_info "Latest results: $RESULTS_DIR/cold_start_latest.json" + fi + +elif [ $# -eq 1 ]; then + # Run on specified branch + branch=$1 + output_file="$RESULTS_DIR/cold_start_${branch//\//_}_$(date +%s).json" + run_benchmark_on_branch "$branch" "$output_file" + +elif [ $# -eq 2 ]; then + # Compare two branches + baseline_branch=$1 + optimized_branch=$2 + + log_info "Comparing branches: $baseline_branch vs $optimized_branch" + + baseline_file="$RESULTS_DIR/cold_start_baseline_$(date +%s).json" + optimized_file="$RESULTS_DIR/cold_start_optimized_$(date +%s).json" + + # Run benchmarks + run_benchmark_on_branch "$baseline_branch" "$baseline_file" + run_benchmark_on_branch "$optimized_branch" "$optimized_file" + + # Compare results + log_info "Comparing results..." + uv run python "$SCRIPT_DIR/compare_benchmarks.py" "$baseline_file" "$optimized_file" + +else + log_error "Invalid number of arguments" + echo "Usage:" + echo " $0 # Run on current branch" + echo " $0 # Run on specified branch" + echo " $0 # Compare two branches" + exit 1 +fi diff --git a/scripts/compare_benchmarks.py b/scripts/compare_benchmarks.py new file mode 100755 index 00000000..e6b801d6 --- /dev/null +++ b/scripts/compare_benchmarks.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 +""" +Compare cold start benchmark results between two runs. 
+ +Usage: + python scripts/compare_benchmarks.py baseline.json optimized.json + python scripts/compare_benchmarks.py benchmark_results/cold_start_1234.json benchmark_results/cold_start_5678.json +""" + +import json +import sys +from pathlib import Path + + +def load_benchmark(file_path: str) -> dict: + """Load benchmark results from JSON file.""" + with open(file_path) as f: + return json.load(f) + + +def calculate_improvement(baseline: float, optimized: float) -> dict: + """Calculate improvement metrics.""" + diff = baseline - optimized + percent = (diff / baseline) * 100 if baseline > 0 else 0 + + return { + "diff_ms": round(diff, 2), + "percent": round(percent, 2), + "improved": diff > 0, + } + + +def compare_benchmarks(baseline_file: str, optimized_file: str): + """Compare two benchmark results and print analysis.""" + baseline = load_benchmark(baseline_file) + optimized = load_benchmark(optimized_file) + + print("=" * 70) + print("COLD START BENCHMARK COMPARISON") + print("=" * 70) + print(f"\nBaseline: {baseline_file}") + print(f"Optimized: {optimized_file}") + print() + + # Compare main measurements + print("IMPORT TIME COMPARISON") + print("-" * 70) + print( + f"{'Metric':<25} {'Baseline':>12} {'Optimized':>12} {'Δ ms':>10} {'Δ %':>8}" + ) + print("-" * 70) + + measurements = baseline["measurements"] + opt_measurements = optimized["measurements"] + + total_improvement_ms = 0 + total_baseline_ms = 0 + + for key in sorted(measurements.keys()): + if key in opt_measurements: + baseline_val = measurements[key]["mean"] + optimized_val = opt_measurements[key]["mean"] + improvement = calculate_improvement(baseline_val, optimized_val) + + symbol = "↓" if improvement["improved"] else "↑" + color = "\033[92m" if improvement["improved"] else "\033[91m" + reset = "\033[0m" + + print( + f"{key:<25} {baseline_val:>10.2f}ms {optimized_val:>10.2f}ms " + f"{color}{symbol}{improvement['diff_ms']:>8.2f}ms {improvement['percent']:>6.2f}%{reset}" + ) + + if key == "runpod_total": + total_improvement_ms = improvement["diff_ms"] + total_baseline_ms = baseline_val + + print("-" * 70) + + # Module counts + print("\nMODULE LOAD COMPARISON") + print("-" * 70) + + baseline_counts = baseline.get("module_counts", {}) + opt_counts = optimized.get("module_counts", {}) + + if baseline_counts and opt_counts: + total_diff = baseline_counts["total"] - opt_counts["total"] + filtered_diff = baseline_counts["filtered"] - opt_counts["filtered"] + + print(f"Total modules loaded:") + print( + f" Baseline: {baseline_counts['total']:>4} Optimized: {opt_counts['total']:>4} Δ: {total_diff:>4}" + ) + print(f"Runpod modules loaded:") + print( + f" Baseline: {baseline_counts['filtered']:>4} Optimized: {opt_counts['filtered']:>4} Δ: {filtered_diff:>4}" + ) + + # Lazy loading checks + print("\nLAZY LOADING STATUS") + print("-" * 70) + + checks = [ + ("paramiko_eagerly_loaded", "Paramiko"), + ("ssh_cli_loaded", "SSH CLI"), + ] + + for key, label in checks: + baseline_loaded = baseline.get(key, False) + opt_loaded = optimized.get(key, False) + + baseline_status = "LOADED" if baseline_loaded else "NOT LOADED" + opt_status = "LOADED" if opt_loaded else "NOT LOADED" + + if baseline_loaded and not opt_loaded: + status_symbol = "✓ NOW LAZY" + color = "\033[92m" + elif not baseline_loaded and opt_loaded: + status_symbol = "✗ NOW EAGER" + color = "\033[91m" + else: + status_symbol = "- NO CHANGE" + color = "\033[93m" + + reset = "\033[0m" + print( + f"{label:<20} Baseline: {baseline_status:<12} Optimized: {opt_status:<12} 
{color}{status_symbol}{reset}" + ) + + # Summary + print("\n" + "=" * 70) + print("SUMMARY") + print("=" * 70) + + if total_improvement_ms > 0: + percent_improvement = ( + total_improvement_ms / total_baseline_ms + ) * 100 + print(f"✓ Cold start improved by {total_improvement_ms:.2f}ms") + print( + f"✓ That's a {percent_improvement:.1f}% improvement over baseline" + ) + print( + f"✓ Baseline: {total_baseline_ms:.2f}ms → Optimized: {total_baseline_ms - total_improvement_ms:.2f}ms" + ) + elif total_improvement_ms < 0: + print( + f"✗ Cold start regressed by {abs(total_improvement_ms):.2f}ms" + ) + print(" Review changes - performance got worse!") + else: + print("- No significant change in cold start time") + + print("=" * 70) + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: python compare_benchmarks.py ") + sys.exit(1) + + baseline_file = sys.argv[1] + optimized_file = sys.argv[2] + + if not Path(baseline_file).exists(): + print(f"Error: Baseline file not found: {baseline_file}") + sys.exit(1) + + if not Path(optimized_file).exists(): + print(f"Error: Optimized file not found: {optimized_file}") + sys.exit(1) + + compare_benchmarks(baseline_file, optimized_file) diff --git a/tests/test_performance/__init__.py b/tests/test_performance/__init__.py new file mode 100644 index 00000000..11c36e26 --- /dev/null +++ b/tests/test_performance/__init__.py @@ -0,0 +1 @@ +"""Performance and benchmark tests for runpod.""" diff --git a/tests/test_performance/test_cold_start.py b/tests/test_performance/test_cold_start.py new file mode 100644 index 00000000..8e1f79a3 --- /dev/null +++ b/tests/test_performance/test_cold_start.py @@ -0,0 +1,240 @@ +""" +Cold start performance benchmarks for runpod package. + +These tests measure import times and memory usage to track cold start +performance across different branches and changes. +""" + +import json +import subprocess +import sys +import time +from pathlib import Path + + +def measure_import_time(module_name: str, iterations: int = 10) -> dict: + """ + Measure the time it takes to import a module in a fresh Python process. + + Args: + module_name: Name of the module to import + iterations: Number of iterations to average + + Returns: + dict with min, max, mean, and median times in milliseconds + """ + times = [] + + for _ in range(iterations): + result = subprocess.run( + [ + sys.executable, + "-c", + f"import time; start = time.perf_counter(); import {module_name}; " + f"print((time.perf_counter() - start) * 1000)", + ], + capture_output=True, + text=True, + timeout=10, + ) + + if result.returncode == 0: + times.append(float(result.stdout.strip())) + else: + raise RuntimeError( + f"Failed to import {module_name}: {result.stderr}" + ) + + times.sort() + return { + "min": round(times[0], 2), + "max": round(times[-1], 2), + "mean": round(sum(times) / len(times), 2), + "median": round(times[len(times) // 2], 2), + "iterations": iterations, + } + + +def count_loaded_modules(module_name: str, module_filter: str = None) -> dict: + """ + Count how many modules are loaded after importing a module. 
+ + Args: + module_name: Name of the module to import + module_filter: Optional filter to count specific module namespaces + + Returns: + dict with total count and filtered count + """ + script = f""" +import sys +import {module_name} + +all_modules = list(sys.modules.keys()) +total = len(all_modules) + +if {repr(module_filter)}: + filtered = [m for m in all_modules if {repr(module_filter)} in m] + print(f"{{total}},{{len(filtered)}}") +else: + print(f"{{total}},0") +""" + + result = subprocess.run( + [sys.executable, "-c", script], + capture_output=True, + text=True, + timeout=10, + ) + + if result.returncode == 0: + total, filtered = result.stdout.strip().split(",") + return {"total": int(total), "filtered": int(filtered)} + else: + raise RuntimeError(f"Failed to count modules: {result.stderr}") + + +def check_module_loaded(import_statement: str, module_to_check: str) -> bool: + """ + Check if a specific module is loaded after an import statement. + + Args: + import_statement: Python import statement to execute + module_to_check: Module name to check in sys.modules + + Returns: + True if module is loaded, False otherwise + """ + script = f""" +import sys +{import_statement} +print('yes' if '{module_to_check}' in sys.modules else 'no') +""" + + result = subprocess.run( + [sys.executable, "-c", script], + capture_output=True, + text=True, + timeout=10, + ) + + if result.returncode == 0: + return result.stdout.strip() == "yes" + else: + raise RuntimeError(f"Failed to check module: {result.stderr}") + + +def run_full_benchmark() -> dict: + """ + Run a comprehensive cold start benchmark suite. + + Returns: + dict with all benchmark results + """ + print("Running cold start benchmarks...") + print("-" * 60) + + benchmark_results = { + "timestamp": time.time(), + "python_version": sys.version, + "measurements": {}, + } + + # Measure main runpod import + print("Measuring 'import runpod'...") + benchmark_results["measurements"]["runpod_total"] = measure_import_time( + "runpod" + ) + print( + f" Mean: {benchmark_results['measurements']['runpod_total']['mean']}ms" + ) + + # Measure serverless-only import + print("Measuring 'import runpod.serverless'...") + benchmark_results["measurements"][ + "runpod_serverless" + ] = measure_import_time("runpod.serverless") + print( + f" Mean: {benchmark_results['measurements']['runpod_serverless']['mean']}ms" + ) + + # Measure endpoint import + print("Measuring 'import runpod.endpoint'...") + benchmark_results["measurements"]["runpod_endpoint"] = measure_import_time( + "runpod.endpoint" + ) + print( + f" Mean: {benchmark_results['measurements']['runpod_endpoint']['mean']}ms" + ) + + # Count loaded modules + print("Counting loaded modules...") + module_counts = count_loaded_modules("runpod", "runpod") + benchmark_results["module_counts"] = module_counts + print(f" Total modules: {module_counts['total']}") + print(f" Runpod modules: {module_counts['filtered']}") + + # Check if paramiko is loaded + print("Checking if paramiko is eagerly loaded...") + paramiko_loaded = check_module_loaded("import runpod", "paramiko") + benchmark_results["paramiko_eagerly_loaded"] = paramiko_loaded + print(f" Paramiko loaded: {paramiko_loaded}") + + # Check if CLI modules are loaded + print("Checking if CLI modules are loaded...") + cli_loaded = check_module_loaded("import runpod", "runpod.cli.groups.ssh") + benchmark_results["ssh_cli_loaded"] = cli_loaded + print(f" SSH CLI loaded: {cli_loaded}") + + # Measure heavy dependencies if they're loaded + if paramiko_loaded: + 
print("Measuring 'import paramiko' (since it's loaded)...") + try: + benchmark_results["measurements"][ + "paramiko" + ] = measure_import_time("paramiko") + print( + f" Mean: {benchmark_results['measurements']['paramiko']['mean']}ms" + ) + except Exception as e: + print(f" Failed: {e}") + + print("-" * 60) + print("Benchmark complete!") + + return benchmark_results + + +def test_cold_start_benchmark(tmp_path): + """ + Pytest test that runs the benchmark and saves results to a file. + """ + results = run_full_benchmark() + + # Save results to a timestamped file + output_dir = Path("benchmark_results") + output_dir.mkdir(exist_ok=True) + + timestamp = int(time.time()) + output_file = output_dir / f"cold_start_{timestamp}.json" + + with open(output_file, "w") as f: + json.dump(results, f, indent=2) + + print(f"\nResults saved to: {output_file}") + + # Also save as latest for easy comparison + latest_file = output_dir / "cold_start_latest.json" + with open(latest_file, "w") as f: + json.dump(results, f, indent=2) + + # Assert that import time is reasonable (adjust threshold as needed) + assert ( + results["measurements"]["runpod_total"]["mean"] < 1000 + ), "Import time exceeds 1000ms" + + +if __name__ == "__main__": + results = run_full_benchmark() + print("\nFull Results:") + print(json.dumps(results, indent=2)) From 3f29b04f2a1812dc2d3d978eed22864a86d6e71e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 18 Nov 2025 03:32:29 -0800 Subject: [PATCH 2/3] docs(performance): add comprehensive benchmarking usage guide Add detailed README for cold start benchmarking tools covering: - Quick start examples for common use cases - Tool documentation with usage patterns and output examples - Result file structure and naming conventions - Performance targets and interpretation guidance - CI/CD integration examples - Troubleshooting common issues The guide enables developers to effectively measure, compare, and validate cold start performance improvements across code changes. --- scripts/README.md | 285 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 285 insertions(+) create mode 100644 scripts/README.md diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 00000000..cfa29e64 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,285 @@ +# Cold Start Benchmarking + +Performance benchmarking tools for measuring and comparing cold start times across different code changes. + +## Quick Start + +```bash +# Run benchmark on current branch +uv run pytest tests/test_performance/test_cold_start.py + +# Compare two branches +./scripts/benchmark_cold_start.sh main my-feature-branch + +# Compare two existing result files +uv run python scripts/compare_benchmarks.py benchmark_results/cold_start_baseline.json benchmark_results/cold_start_latest.json +``` + +## What Gets Measured + +- **Import times**: `import runpod`, `import runpod.serverless`, `import runpod.endpoint` +- **Module counts**: Total modules loaded and runpod-specific modules +- **Lazy loading status**: Whether paramiko and SSH CLI are eagerly or lazy-loaded +- **Statistics**: Min, max, mean, median across 10 iterations per measurement + +## Tools + +### 1. test_cold_start.py + +Core benchmark test that measures import performance in fresh Python subprocesses. 
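The measurement strategy, condensed from `measure_import_time()` in the test module, is to time each import in a fresh interpreter so that module caching in the current process cannot skew the numbers. A minimal sketch of one sample (assuming the `runpod` package is importable):

```python
# One cold-start sample: spawn a fresh interpreter so sys.modules starts empty,
# then time the import inside that child process and read the result back.
import subprocess
import sys

def time_import_once(module_name: str) -> float:
    """Return the import time of `module_name` in milliseconds."""
    code = (
        "import time; start = time.perf_counter(); "
        f"import {module_name}; "
        "print((time.perf_counter() - start) * 1000)"
    )
    result = subprocess.run(
        [sys.executable, "-c", code],
        capture_output=True, text=True, timeout=10, check=True,
    )
    return float(result.stdout.strip())

samples = sorted(time_import_once("runpod") for _ in range(10))
print(f"fastest: {samples[0]:.2f}ms, slowest: {samples[-1]:.2f}ms")
```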
+ +```bash +# Run as pytest test +uv run pytest tests/test_performance/test_cold_start.py -v + +# Run as standalone script +uv run python tests/test_performance/test_cold_start.py + +# Results saved to: +# - benchmark_results/cold_start_.json +# - benchmark_results/cold_start_latest.json (always latest) +``` + +**Output Example:** +``` +Running cold start benchmarks... +------------------------------------------------------------ +Measuring 'import runpod'... + Mean: 273.29ms +Measuring 'import runpod.serverless'... + Mean: 332.18ms +Counting loaded modules... + Total modules: 582 + Runpod modules: 46 +Checking if paramiko is eagerly loaded... + Paramiko loaded: False +``` + +### 2. benchmark_cold_start.sh + +Automated benchmark runner that handles git branch switching, dependency installation, and result collection. + +```bash +# Run on current branch (no git operations) +./scripts/benchmark_cold_start.sh + +# Run on specific branch +./scripts/benchmark_cold_start.sh main + +# Compare two branches (runs both, then compares) +./scripts/benchmark_cold_start.sh main feature/lazy-loading +``` + +**Features:** +- Automatic stash/unstash of uncommitted changes +- Dependency installation per branch +- Safe branch switching with restoration +- Timestamped result files +- Automatic comparison when comparing branches + +**Safety:** +- Stashes uncommitted changes before switching branches +- Restores original branch after completion +- Handles errors gracefully + +### 3. compare_benchmarks.py + +Analyzes and visualizes differences between two benchmark runs with colored terminal output. + +```bash +uv run python scripts/compare_benchmarks.py +``` + +**Output Example:** +``` +====================================================================== +COLD START BENCHMARK COMPARISON +====================================================================== + +IMPORT TIME COMPARISON +---------------------------------------------------------------------- +Metric Baseline Optimized Δ ms Δ % +---------------------------------------------------------------------- +runpod_total 285.64ms 273.29ms ↓ 12.35ms 4.32% +runpod_serverless 376.33ms 395.14ms ↑ -18.81ms -5.00% +runpod_endpoint 378.61ms 399.36ms ↑ -20.75ms -5.48% + +MODULE LOAD COMPARISON +---------------------------------------------------------------------- +Total modules loaded: + Baseline: 698 Optimized: 582 Δ: 116 +Runpod modules loaded: + Baseline: 48 Optimized: 46 Δ: 2 + +LAZY LOADING STATUS +---------------------------------------------------------------------- +Paramiko Baseline: LOADED Optimized: NOT LOADED ✓ NOW LAZY +SSH CLI Baseline: LOADED Optimized: NOT LOADED ✓ NOW LAZY + +====================================================================== +SUMMARY +====================================================================== +✓ Cold start improved by 12.35ms +✓ That's a 4.3% improvement over baseline +✓ Baseline: 285.64ms → Optimized: 273.29ms +====================================================================== +``` + +**Color coding:** +- Green: Improvements (faster times, lazy loading achieved) +- Red: Regressions (slower times, eager loading introduced) +- Yellow: No change + +## Result Files + +All benchmark results are saved to `benchmark_results/` (gitignored). 
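Because each run is written as plain JSON, the files can be consumed by other tooling directly. For example, a small CI gate script might look like this (illustrative sketch; the 1000ms budget mirrors the assertion in `test_cold_start.py`):

```python
# Fail the build if the latest measured cold start exceeds the budget.
import json
import sys

BUDGET_MS = 1000  # same threshold as the pytest assertion

with open("benchmark_results/cold_start_latest.json") as f:
    results = json.load(f)

mean_ms = results["measurements"]["runpod_total"]["mean"]
print(f"import runpod: {mean_ms:.2f}ms (budget: {BUDGET_MS}ms)")
sys.exit(0 if mean_ms < BUDGET_MS else 1)
```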
+ +**File naming:** +- `cold_start_.json` - Timestamped result +- `cold_start_latest.json` - Always contains most recent result +- `cold_start_baseline.json` - Manually saved baseline for comparison + +**JSON structure:** +```json +{ + "timestamp": 1763179522.0437188, + "python_version": "3.8.20 (default, Oct 2 2024, 16:12:59) [Clang 18.1.8 ]", + "measurements": { + "runpod_total": { + "min": 375.97, + "max": 527.9, + "mean": 393.91, + "median": 380.4, + "iterations": 10 + } + }, + "module_counts": { + "total": 698, + "filtered": 48 + }, + "paramiko_eagerly_loaded": true, + "ssh_cli_loaded": true +} +``` + +## Common Workflows + +### Testing a Performance Optimization + +```bash +# 1. Save baseline on main branch +git checkout main +./scripts/benchmark_cold_start.sh +cp benchmark_results/cold_start_latest.json benchmark_results/cold_start_baseline.json + +# 2. Switch to feature branch +git checkout feature/my-optimization + +# 3. Run benchmark and compare +./scripts/benchmark_cold_start.sh +uv run python scripts/compare_benchmarks.py \ + benchmark_results/cold_start_baseline.json \ + benchmark_results/cold_start_latest.json +``` + +### Comparing Multiple Approaches + +```bash +# Compare three different optimization branches +./scripts/benchmark_cold_start.sh main > results_main.txt +./scripts/benchmark_cold_start.sh feature/approach-1 > results_1.txt +./scripts/benchmark_cold_start.sh feature/approach-2 > results_2.txt + +# Then compare each against baseline +uv run python scripts/compare_benchmarks.py \ + benchmark_results/cold_start_main_*.json \ + benchmark_results/cold_start_approach-1_*.json +``` + +### CI/CD Integration + +Add to your GitHub Actions workflow: + +```yaml +- name: Run cold start benchmark + run: | + uv run pytest tests/test_performance/test_cold_start.py --timeout=120 + +- name: Upload benchmark results + uses: actions/upload-artifact@v3 + with: + name: benchmark-results + path: benchmark_results/cold_start_latest.json +``` + +## Performance Targets + +Based on testing with Python 3.8: + +- **Cold start (import runpod)**: < 300ms (mean) +- **Serverless import**: < 400ms (mean) +- **Module count**: < 600 total modules +- **Test assertion**: Fails if import > 1000ms + +## Interpreting Results + +### Import Time Variance + +Subprocess-based measurements have inherent variance: +- First run in sequence: Often 20-50ms slower (Python startup overhead) +- Subsequent runs: More stable +- **Use median or mean** for comparison, not single runs + +### Module Count + +- **Fewer modules = faster cold start**: Each module has import overhead +- **Runpod-specific modules**: Should be minimal (40-50) +- **Total modules**: Includes stdlib and dependencies +- **Target reduction**: Removing 100+ modules typically saves 10-30ms + +### Lazy Loading Validation + +- `paramiko_eagerly_loaded: false` - Good for serverless workers +- `ssh_cli_loaded: false` - Good for SDK users +- These should only be `true` when CLI commands are invoked + +## Troubleshooting + +### High Variance in Results + +If you see >100ms variance between runs: +- System is under load +- Disk I/O contention +- Python bytecode cache issues + +**Solution:** Run multiple times and use median values. 
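A quick way to do that is to aggregate the per-run medians from the timestamped result files (illustrative; assumes the default `benchmark_results/` output directory):

```python
# Take the median of the per-run medians across all saved benchmark results.
import json
from pathlib import Path
from statistics import median

per_run_medians = []
for path in sorted(Path("benchmark_results").glob("cold_start_*.json")):
    if path.name == "cold_start_latest.json":
        continue  # the "latest" file duplicates the most recent timestamped run
    with open(path) as f:
        data = json.load(f)
    per_run_medians.append(data["measurements"]["runpod_total"]["median"])

if per_run_medians:
    print(f"{len(per_run_medians)} runs, median of medians: {median(per_run_medians):.2f}ms")
```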
+ +### benchmark_cold_start.sh Fails + +```bash +# Check git status +git status + +# Manually restore if script failed mid-execution +git checkout +git stash pop +``` + +### Import Errors During Benchmark + +Ensure dependencies are installed: +```bash +uv sync --group test +``` + +## Benchmark Accuracy + +- **Iterations**: 10 per measurement (configurable in test) +- **Process isolation**: Each measurement uses fresh subprocess +- **Python cache**: Cleared by subprocess creation +- **System state**: Cannot control OS-level caching + +For production performance testing, consider: +- Running on CI with consistent environment +- Multiple runs at different times +- Comparing trends over multiple commits From a77f08a8afe9c3eb51dc3c12602859aa529360d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 18 Nov 2025 03:47:38 -0800 Subject: [PATCH 3/3] fix(performance): address Copilot PR feedback Address code review feedback from PR #467: 1. Fix median calculation for even-length lists - Previously only returned single middle value - Now correctly averages the two middle values for even-length lists - Maintains correct behavior for odd-length lists 2. Update usage message to match documented pattern - Changed from "python" to "uv run python scripts/..." - Aligns with project's uv-based tooling conventions - Matches usage examples in README and throughout codebase These fixes improve statistical accuracy and documentation consistency. --- scripts/compare_benchmarks.py | 2 +- tests/test_performance/test_cold_start.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/compare_benchmarks.py b/scripts/compare_benchmarks.py index e6b801d6..e942f262 100755 --- a/scripts/compare_benchmarks.py +++ b/scripts/compare_benchmarks.py @@ -157,7 +157,7 @@ def compare_benchmarks(baseline_file: str, optimized_file: str): if __name__ == "__main__": if len(sys.argv) != 3: - print("Usage: python compare_benchmarks.py ") + print("Usage: uv run python scripts/compare_benchmarks.py ") sys.exit(1) baseline_file = sys.argv[1] diff --git a/tests/test_performance/test_cold_start.py b/tests/test_performance/test_cold_start.py index 8e1f79a3..a8e555ae 100644 --- a/tests/test_performance/test_cold_start.py +++ b/tests/test_performance/test_cold_start.py @@ -50,7 +50,11 @@ def measure_import_time(module_name: str, iterations: int = 10) -> dict: "min": round(times[0], 2), "max": round(times[-1], 2), "mean": round(sum(times) / len(times), 2), - "median": round(times[len(times) // 2], 2), + "median": round( + times[len(times) // 2] if len(times) % 2 == 1 else + (times[len(times) // 2 - 1] + times[len(times) // 2]) / 2, + 2 + ), "iterations": iterations, }
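As a sanity check on the corrected statistic, the patched expression averages the two middle values for an even-length sorted sample (illustrative):

```python
# Even-length sorted sample: the median is the mean of the two middle values.
times = [10.0, 12.0, 14.0, 20.0]
n = len(times)
median = times[n // 2] if n % 2 == 1 else (times[n // 2 - 1] + times[n // 2]) / 2
assert median == 13.0  # (12.0 + 14.0) / 2, not the 14.0 the old expression returned
```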