# Query Profiling: mpc_orbits Performance Measurement

Measures query performance against the mpc_orbits table (1.51M rows) to inform
indexing, materialization, and interactive query budget decisions.

Each query runs 3 times; we report median timing and IQR.  EXPLAIN ANALYZE
reveals sequential vs. index scans and buffer hit rates.

In [1]:
import sys
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from lib.db import connect, timed_query, timed_explain, query_log
from lib.orbits import build_orbit_query, build_null_rates_query, build_value_distribution_query

## 1. Define Test Queries

Progressive complexity from unfiltered scan to JSONB extraction.

In [2]:
# Test query definitions: (label, query_builder_kwargs)
TEST_QUERIES = [
    ("1. Count all",
     dict(count_only=True)),
    
    ("2. Full scan (5 cols)",
     dict(columns=["packed_primary_provisional_designation", "q", "e", "i", "a"])),
    
    ("3. Filter: orbit_type_int=2 (Apollo)",
     dict(columns=["packed_primary_provisional_designation", "q", "e", "i", "a"],
          orbit_types=[2])),
    
    ("4. Filter: h > 22",
     dict(columns=["packed_primary_provisional_designation", "q", "e", "i", "a", "h"],
          h_range=(22, 35))),
    
    ("5. Filter: NEOs (q < 1.3)",
     dict(columns=["packed_primary_provisional_designation", "q", "e", "i", "a", "h"],
          q_range=(0, 1.3))),
    
    ("6. Combined: Apollo + H>22",
     dict(columns=["packed_primary_provisional_designation", "q", "e", "i", "a", "h"],
          orbit_types=[2], h_range=(22, 35))),
    
    ("7. NEAs + Tisserand",
     dict(columns=["packed_primary_provisional_designation", "q", "e", "i", "a"],
          orbit_types=[1, 2, 3, 4], include_tisserand=True)),
    
    ("8. Full scan + Tisserand",
     dict(columns=["packed_primary_provisional_designation", "q", "e", "i", "a"],
          include_tisserand=True)),
    
    ("9. JSONB: orbit_quality + SNR",
     dict(columns=["packed_primary_provisional_designation", "q", "e", "i", "a"],
          include_jsonb_fields=["orbit_quality", "snr"])),
    
    ("10. JSONB: all MOIDs",
     dict(columns=["packed_primary_provisional_designation", "earth_moid"],
          include_jsonb_fields=["mars_moid", "venus_moid", "jupiter_moid"])),
]

## 2. Run Each Query 3x, Collect Timings

In [3]:
N_RUNS = 3
results = []

with connect() as conn:
    for label, kwargs in TEST_QUERIES:
        sql, params = build_orbit_query(**kwargs)
        timings = []
        row_count = 0
        
        for run in range(N_RUNS):
            query_log.clear()
            df = timed_query(conn, sql, params, label=label)
            rec = query_log.records[-1]
            timings.append(rec.elapsed_sec)
            row_count = rec.row_count
            conn.rollback()  # reset transaction state
        
        results.append({
            "label": label,
            "row_count": row_count,
            "median_sec": np.median(timings),
            "q25_sec": np.percentile(timings, 25),
            "q75_sec": np.percentile(timings, 75),
            "min_sec": min(timings),
            "max_sec": max(timings),
            "all_timings": timings,
        })
        print(f"{label}: {row_count:>10,} rows, median {np.median(timings):.3f}s")

timing_df = pd.DataFrame(results)
timing_df

1. Count all:          1 rows, median 0.263s


2. Full scan (5 cols):  1,515,621 rows, median 29.466s


3. Filter: orbit_type_int=2 (Apollo):      4,601 rows, median 0.379s


4. Filter: h > 22:     30,475 rows, median 0.826s


5. Filter: NEOs (q < 1.3):     41,015 rows, median 1.145s


6. Combined: Apollo + H>22:      2,384 rows, median 0.493s


7. NEAs + Tisserand:      8,549 rows, median 0.355s


8. Full scan + Tisserand:  1,515,621 rows, median 38.052s


9. JSONB: orbit_quality + SNR:  1,515,621 rows, median 48.160s


10. JSONB: all MOIDs:  1,515,621 rows, median 66.114s


Unnamed: 0,label,row_count,median_sec,q25_sec,q75_sec,min_sec,max_sec,all_timings
0,1. Count all,1,0.26286,0.261383,0.283066,0.259906,0.303272,"[0.3032717921305448, 0.25990637484937906, 0.26..."
1,2. Full scan (5 cols),1515621,29.465859,25.695699,29.673297,21.925538,29.880735,"[29.880734791979194, 21.92553845909424, 29.465..."
2,3. Filter: orbit_type_int=2 (Apollo),4601,0.379099,0.376357,0.381163,0.373614,0.383226,"[0.3832258749753237, 0.3736143750138581, 0.379..."
3,4. Filter: h > 22,30475,0.826283,0.785807,0.903574,0.74533,0.980865,"[0.7453304589726031, 0.9808654170483351, 0.826..."
4,5. Filter: NEOs (q < 1.3),41015,1.14508,1.098201,1.145418,1.051321,1.145756,"[1.0513214168604463, 1.1457562500145286, 1.145..."
5,6. Combined: Apollo + H>22,2384,0.493183,0.474563,0.518557,0.455942,0.543931,"[0.543930625077337, 0.493183417012915, 0.45594..."
6,7. NEAs + Tisserand,8549,0.354632,0.349305,0.37204,0.343977,0.389448,"[0.34397683409042656, 0.389447791967541, 0.354..."
7,8. Full scan + Tisserand,1515621,38.052372,37.538163,39.521986,37.023953,40.991599,"[37.02395320893265, 40.99159908317961, 38.0523..."
8,9. JSONB: orbit_quality + SNR,1515621,48.160067,47.321763,90.239414,46.483458,132.31876,"[132.31876004184596, 48.16006704210304, 46.483..."
9,10. JSONB: all MOIDs,1515621,66.113841,64.655358,68.929911,63.196876,71.745982,"[66.11384079209529, 71.74598199990578, 63.1968..."


## 3. Query Time Bar Chart

In [4]:
fig = go.Figure()

fig.add_trace(go.Bar(
    y=timing_df["label"],
    x=timing_df["median_sec"],
    orientation="h",
    error_x=dict(
        type="data",
        symmetric=False,
        array=(timing_df["q75_sec"] - timing_df["median_sec"]).tolist(),
        arrayminus=(timing_df["median_sec"] - timing_df["q25_sec"]).tolist(),
    ),
    marker_color="#4363d8",
    hovertemplate="%{y}<br>Median: %{x:.3f}s<br>Rows: %{customdata:,}<extra></extra>",
    customdata=timing_df["row_count"],
))

fig.update_layout(
    title="Query Performance: mpc_orbits (1.51M rows)",
    xaxis_title="Median execution time (seconds)",
    yaxis=dict(autorange="reversed"),
    height=500,
    margin=dict(l=250),
)
fig.show()

## 4. Throughput: Rows vs. Time

In [5]:
# Exclude count_only queries (row_count = 1)
throughput_df = timing_df[timing_df["row_count"] > 1].copy()
throughput_df["rows_per_sec"] = throughput_df["row_count"] / throughput_df["median_sec"]

fig = px.scatter(
    throughput_df,
    x="row_count",
    y="median_sec",
    text="label",
    hover_data=["rows_per_sec"],
    log_x=True,
    title="Throughput: Row Count vs. Query Time",
    labels={"row_count": "Rows returned", "median_sec": "Median time (s)"},
)
fig.update_traces(textposition="top center", marker_size=10)
fig.show()

## 5. EXPLAIN ANALYZE on Key Queries

In [6]:
import json

explain_queries = [
    ("Full scan (5 cols)",
     dict(columns=["packed_primary_provisional_designation", "q", "e", "i", "a"])),
    ("Filter: orbit_type_int=2",
     dict(columns=["packed_primary_provisional_designation", "q", "e", "i", "a"],
          orbit_types=[2])),
    ("JSONB: orbit_quality + SNR",
     dict(columns=["packed_primary_provisional_designation"],
          include_jsonb_fields=["orbit_quality", "snr"])),
]

with connect() as conn:
    for label, kwargs in explain_queries:
        sql, params = build_orbit_query(**kwargs)
        plan = timed_explain(conn, sql, params, label=f"EXPLAIN: {label}")
        
        print(f"\n{'='*60}")
        print(f"EXPLAIN: {label}")
        print(f"{'='*60}")
        
        # Extract key metrics from the plan
        p = plan.get("Plan", plan)
        print(f"  Node Type:     {p.get('Node Type', 'N/A')}")
        print(f"  Actual Rows:   {p.get('Actual Rows', 'N/A'):,}")
        print(f"  Actual Time:   {p.get('Actual Total Time', 'N/A'):.1f} ms")
        print(f"  Shared Hit:    {p.get('Shared Hit Blocks', 'N/A'):,}")
        print(f"  Shared Read:   {p.get('Shared Read Blocks', 'N/A'):,}")
        
        # Full plan for inspection
        print(f"\n  Full plan:")
        print(json.dumps(plan, indent=2)[:2000])


EXPLAIN: Full scan (5 cols)
  Node Type:     Seq Scan
  Actual Rows:   1,515,621
  Actual Time:   1178.9 ms
  Shared Hit:    68,584
  Shared Read:   0

  Full plan:
{
  "Plan": {
    "Node Type": "Seq Scan",
    "Parallel Aware": false,
    "Async Capable": false,
    "Relation Name": "mpc_orbits",
    "Alias": "mpc_orbits",
    "Startup Cost": 0.0,
    "Total Cost": 98890.86,
    "Plan Rows": 1515343,
    "Plan Width": 40,
    "Actual Startup Time": 0.06,
    "Actual Total Time": 1178.916,
    "Actual Rows": 1515621,
    "Actual Loops": 1,
    "Shared Hit Blocks": 68584,
    "Shared Read Blocks": 0,
    "Shared Dirtied Blocks": 0,
    "Shared Written Blocks": 0,
    "Local Hit Blocks": 0,
    "Local Read Blocks": 0,
    "Local Dirtied Blocks": 0,
    "Local Written Blocks": 0,
    "Temp Read Blocks": 0,
    "Temp Written Blocks": 0
  },
  "Planning": {
    "Shared Hit Blocks": 98,
    "Shared Read Blocks": 0,
    "Shared Dirtied Blocks": 0,
    "Shared Written Blocks": 0,
    "Local 


EXPLAIN: Filter: orbit_type_int=2
  Node Type:     Gather
  Actual Rows:   4,601
  Actual Time:   258.0 ms
  Shared Hit:    68,584
  Shared Read:   0

  Full plan:
{
  "Plan": {
    "Node Type": "Gather",
    "Parallel Aware": false,
    "Async Capable": false,
    "Startup Cost": 1000.0,
    "Total Cost": 78002.56,
    "Plan Rows": 5051,
    "Plan Width": 40,
    "Actual Startup Time": 1.631,
    "Actual Total Time": 257.984,
    "Actual Rows": 4601,
    "Actual Loops": 1,
    "Workers Planned": 2,
    "Workers Launched": 2,
    "Single Copy": false,
    "Shared Hit Blocks": 68584,
    "Shared Read Blocks": 0,
    "Shared Dirtied Blocks": 0,
    "Shared Written Blocks": 0,
    "Local Hit Blocks": 0,
    "Local Read Blocks": 0,
    "Local Dirtied Blocks": 0,
    "Local Written Blocks": 0,
    "Temp Read Blocks": 0,
    "Temp Written Blocks": 0,
    "Plans": [
      {
        "Node Type": "Seq Scan",
        "Parent Relationship": "Outer",
        "Parallel Aware": true,
        "Async


EXPLAIN: JSONB: orbit_quality + SNR
  Node Type:     Seq Scan
  Actual Rows:   1,515,621
  Actual Time:   45665.0 ms
  Shared Hit:    12,992,593
  Shared Read:   0

  Full plan:
{
  "Plan": {
    "Node Type": "Seq Scan",
    "Parallel Aware": false,
    "Async Capable": false,
    "Relation Name": "mpc_orbits",
    "Alias": "mpc_orbits",
    "Startup Cost": 0.0,
    "Total Cost": 106467.57,
    "Plan Rows": 1515343,
    "Plan Width": 72,
    "Actual Startup Time": 0.511,
    "Actual Total Time": 45664.951,
    "Actual Rows": 1515621,
    "Actual Loops": 1,
    "Shared Hit Blocks": 12992593,
    "Shared Read Blocks": 0,
    "Shared Dirtied Blocks": 0,
    "Shared Written Blocks": 0,
    "Local Hit Blocks": 0,
    "Local Read Blocks": 0,
    "Local Dirtied Blocks": 0,
    "Local Written Blocks": 0,
    "Temp Read Blocks": 0,
    "Temp Written Blocks": 0
  },
  "Planning": {
    "Shared Hit Blocks": 9,
    "Shared Read Blocks": 0,
    "Shared Dirtied Blocks": 0,
    "Shared Written Block

## 6. Timing Variance (Box Plots)

In [7]:
# Explode all_timings into individual rows for box plot
box_rows = []
for _, row in timing_df.iterrows():
    for t in row["all_timings"]:
        box_rows.append({"label": row["label"], "time_sec": t})
box_df = pd.DataFrame(box_rows)

fig = px.box(
    box_df,
    y="label",
    x="time_sec",
    orientation="h",
    title="Query Timing Variance (3 runs each)",
    labels={"time_sec": "Execution time (seconds)", "label": ""},
)
fig.update_layout(height=500, margin=dict(l=250), yaxis=dict(autorange="reversed"))
fig.show()

## 7. Summary & Recommendations

In [8]:
print("Query Performance Summary")
print("=" * 60)
query_log.clear()

# Recompute summary from results
for r in results:
    print(f"  {r['label']:<40} {r['row_count']:>10,} rows  {r['median_sec']:>8.3f}s")

print("\nKey Findings:")
full_scan_time = timing_df.loc[timing_df["label"].str.contains("Full scan \\(5"), "median_sec"].values
jsonb_time = timing_df.loc[timing_df["label"].str.contains("JSONB: orbit_quality"), "median_sec"].values

if len(full_scan_time) > 0 and len(jsonb_time) > 0:
    overhead = jsonb_time[0] / full_scan_time[0]
    print(f"  - JSONB extraction overhead: {overhead:.1f}x vs flat column scan")

print("  - See EXPLAIN output above for index vs. sequential scan details")
print("  - Tisserand computation is pure arithmetic — negligible overhead expected")

Query Performance Summary
  1. Count all                                      1 rows     0.263s
  2. Full scan (5 cols)                     1,515,621 rows    29.466s
  3. Filter: orbit_type_int=2 (Apollo)          4,601 rows     0.379s
  4. Filter: h > 22                            30,475 rows     0.826s
  5. Filter: NEOs (q < 1.3)                    41,015 rows     1.145s
  6. Combined: Apollo + H>22                    2,384 rows     0.493s
  7. NEAs + Tisserand                           8,549 rows     0.355s
  8. Full scan + Tisserand                  1,515,621 rows    38.052s
  9. JSONB: orbit_quality + SNR             1,515,621 rows    48.160s
  10. JSONB: all MOIDs                      1,515,621 rows    66.114s

Key Findings:
  - JSONB extraction overhead: 1.6x vs flat column scan
  - See EXPLAIN output above for index vs. sequential scan details
  - Tisserand computation is pure arithmetic — negligible overhead expected
