# Evaluation: Query Runtime Variation


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import io
import json
import pathlib
import re

import ipywidgets as ipyw
import IPython.display as ipyd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as pgo
import seaborn as sns

from postbound.db import postgres
from postbound.experiments import workloads
from postbound.optimizer import jointree
from postbound.vis import fdl, optimizer as opt_viz, db as db_viz
from postbound.util import dataframe as df_utils, dicts as dict_utils

# Apply seaborn's "whitegrid" theme to the figures created in this notebook.
sns.set_theme(style="whitegrid")

def parse_join_tree(raw_jt):
    """Deserialize a JSON-encoded join order into a logical join tree.

    The JSON document is first parsed into a nested table sequence, which is then loaded as a
    ``jointree.LogicalJoinTree``.
    """
    nested_tables = jointree.parse_nested_table_sequence(json.loads(raw_jt))
    return jointree.LogicalJoinTree.load_from_list(nested_tables)


def parse_query_plan(raw_plan):
    """Deserialize a JSON-encoded Postgres EXPLAIN plan into a query execution plan."""
    explain_plan = postgres.PostgresExplainPlan(json.loads(raw_plan))
    return explain_plan.as_query_execution_plan()

# Result files are named "join-order-runtimes-<label>.csv". The label consists of a one or two digit family index
# followed by a single lowercase variant letter, e.g. "1a" or "33c".
result_file_pattern = r"join-order-runtimes-(?P<label>\d{1,2}[a-z])\.csv"
label_pattern = r"(?P<index>\d{1,2})(?P<variant>[a-z])"
root_dir = "results/query-runtime-variation"

# Collect the per-file frames and concatenate them once at the end. Calling pd.concat() inside the loop would copy
# all previously loaded rows again for every additional file.
result_sets = []

for result_file in pathlib.Path(root_dir).glob("join-order-runtimes-*.csv"):
    file_matcher = re.match(result_file_pattern, result_file.name)
    if not file_matcher:
        continue
    label = file_matcher.group("label")
    label_matcher = re.match(label_pattern, label)
    if not label_matcher:
        continue
    query_index, query_variant = label_matcher.group("index"), label_matcher.group("variant")
    query_index = int(query_index)

    current_result_set = pd.read_csv(result_file, converters={"join_order": parse_join_tree, "query_plan": parse_query_plan})
    current_result_set["query_family"] = query_index
    # Rows whose execution_time is not finite fall back to the value of their timeout column.
    current_result_set["effective_runtime"] = np.where(np.isfinite(current_result_set["execution_time"]),
                                                       current_result_set["execution_time"],
                                                       current_result_set["timeout"])
    result_sets.append(current_result_set)

# pd.concat() rejects an empty list, so fall back to an empty frame if no result file was found.
df = pd.concat(result_sets, ignore_index=True) if result_sets else pd.DataFrame()

# Expose the positional row number as an explicit "index" column. It is shown in the hover data of
# exec_time_distribution() and corresponds to the positions accepted by join_order() and query_plan().
df.reset_index(inplace=True)

# Notebook-wide selection state: the plotting functions fall back to `label` if no label is passed explicitly and
# store the rows of the selected query in `current_df`.
current_df = None
label = "1a"
workload = workloads.job()

def family_overview(family: int, *, ax = None):
    """Plot a KDE of the effective runtimes of all queries that belong to one query family.

    One density curve is drawn per query label of the family. The title reports the number of rows of the family
    and how many of them have an infinite execution time.

    Parameters
    ----------
    family : int
        The family to plot, i.e. a value of the ``query_family`` column of the global ``df``.
    ax : optional
        The matplotlib axes to draw on. Defaults to the current axes.
    """
    ax = plt.gca() if ax is None else ax

    # The frame is only read, so no defensive copy is required.
    family_df = df[df["query_family"] == family]
    n_queries = len(family_df)
    n_timeout = np.isinf(family_df["execution_time"]).sum()

    g = sns.kdeplot(data=family_df, x="effective_runtime", hue="label", ax=ax)

    g.set_title(f"Family: {family} (total = {n_queries}, timeouts = {n_timeout})")
    g.set_xlabel("Execution time [s]")
    # A KDE shows an estimated density, not an absolute number of join orders.
    g.set_ylabel("Density")


def all_families():
    """Plot the runtime KDE of every query family, one subplot per family, stacked vertically.

    The families are plotted in ascending order. Nothing is plotted if the global ``df`` contains no families.
    """
    families = sorted(df["query_family"].unique()) if "query_family" in df else []
    if not families:
        return

    # squeeze=False always yields a 2D array of axes. Without it, a single family would produce a bare Axes object
    # that cannot be indexed.
    fig, axes = plt.subplots(len(families), 1, figsize=(10, 5 * len(families)), squeeze=False)

    for family_ax, family in zip(axes[:, 0], families):
        family_overview(family, ax=family_ax)

    # Compute the layout after plotting, so that titles and axis labels are taken into account.
    fig.tight_layout(h_pad=5)

def update_hist(bins):
    """Draw the runtime histogram of the currently selected query using the given number of bins.

    Reads the ``current_df`` and ``label`` globals. `runtime_hist` registers this function as the callback of its
    bins slider.
    """
    timeout_count = np.isinf(current_df["execution_time"]).sum()
    total_count = len(current_df)

    hist = sns.histplot(data=current_df, x="effective_runtime", bins=bins)
    hist.set_xlabel("Execution time [s]")
    hist.set_ylabel("Number of join orders")
    hist.set_title(f"Query: {label} (Total={total_count} Timeout={timeout_count})")


def runtime_hist(current_label: str = ""):
    """Show an interactive histogram of the effective runtimes of one query.

    The number of bins can be adjusted with a slider. The selected rows are stored in the ``current_df`` global.

    Parameters
    ----------
    current_label : str, optional
        The query to plot. If given, it also becomes the new value of the ``label`` global. Otherwise the current
        value of ``label`` is used.
    """
    global label
    global current_df
    if current_label:
        label = current_label
    current_df = df[df["label"] == label].copy()

    # The slider bounds must satisfy min <= max. Queries with fewer than two rows would otherwise request max < min.
    max_bins = max(len(current_df), 2)
    bins_slider = ipyw.IntSlider(value=min(10, max_bins), min=2, max=max_bins, step=1, description="Bins")
    interactive_plot = ipyw.interactive(update_hist, bins=bins_slider)
    ipyd.display(interactive_plot)


def exec_time_distribution(current_label: str = ""):
    """Show all join orders of one query as a 2D scatter plot, colored by their effective runtime.

    The coordinates are computed by ``fdl.kamada_kawai_layout`` based on the inverse bottom-up similarity of the
    join orders. The selected rows are stored in the ``current_df`` global.

    Parameters
    ----------
    current_label : str, optional
        The query to plot. If given, it also becomes the new value of the ``label`` global. Otherwise the current
        value of ``label`` is used.
    """
    global label
    global current_df
    if current_label:
        label = current_label
    current_df = df[df["label"] == label].copy()

    def inverse_similarity(a, b):
        # The similarity is bounded from below to prevent a division by zero.
        return 1 / max(jointree.bottom_up_similarity(a, b), 1e-5)

    layout_coords = fdl.kamada_kawai_layout(current_df["join_order"], inverse_similarity)
    coord_df = df_utils.as_df(layout_coords, key_name="join_order", column_names=["x", "y"])

    runtime_columns = ["join_order", "effective_runtime", "execution_time", "index"]
    vis_df = coord_df.merge(current_df[runtime_columns], on="join_order")
    # The join orders are part of the hover data, so they are converted to their string representation.
    vis_df["join_order"] = vis_df["join_order"].apply(str)

    scatter = px.scatter(vis_df, x="x", y="y", color="effective_runtime",
                         hover_data=["join_order", "execution_time", "index"],
                         color_continuous_scale="viridis", height=750)
    # The coordinates carry no meaning of their own, hence the axes are hidden.
    scatter.update_xaxes(visible=False)
    scatter.update_yaxes(visible=False)
    scatter.update_layout(dragmode="pan", plot_bgcolor="white")
    scatter.show(config={"scrollZoom": True})


def best_join_orders(q: float = 0.1, current_label: str = ""):
    """Plot the join index of each table for the fastest join orders of one query.

    A join order is included if its effective runtime is at most the ``q``-quantile of all effective runtimes of the
    query. Each included join order is drawn as one line over the tables of the query. The selected rows are stored
    in the ``current_df`` global.

    Parameters
    ----------
    q : float, optional
        The quantile that bounds the included runtimes from above. Defaults to 0.1.
    current_label : str, optional
        The query to plot. If given, it also becomes the new value of the ``label`` global. Otherwise the current
        value of ``label`` is used.
    """
    global label
    global current_df
    if current_label:
        label = current_label
    current_df = df[df["label"] == label].copy()
    quant_df = current_df[current_df["effective_runtime"] <= current_df["effective_runtime"].quantile(q)].copy()

    # One column per table (values taken from jointree.join_depth) plus the columns that identify the join order.
    raw_data = dict_utils.aggregate(quant_df["join_order"].apply(jointree.join_depth)) | {"join_order": quant_df["join_order"],
                                                                                          "execution_time": quant_df["execution_time"],
                                                                                          "effective_runtime": quant_df["effective_runtime"]}
    wide_df = pd.DataFrame(raw_data)
    long_df = wide_df.melt(["join_order", "execution_time", "effective_runtime"], var_name="table", value_name="join_index")
    long_df["table"] = long_df["table"].apply(lambda tab: tab.full_name + "\n" + tab.alias)

    plotly_cfg = {"scrollZoom": True}
    # This function plots the fastest join orders, so the title must say "best" (it used to say "worst").
    title = f"Query {label} ({q * 100}% best, {len(quant_df)} join orders total)"
    # The hover columns are passed as a list, the same way join_order_differences() does it.
    p = px.line(long_df, x="table", y="join_index", hover_data=["execution_time"], color="join_order", title=title, height=500)
    p.update_layout(showlegend=False)
    p.show(config=plotly_cfg)


def worst_join_orders(q: float = 0.1, current_label: str = ""):
    """Plot the join index of each table for the slowest join orders of one query.

    A join order is included if its effective runtime is at least the ``1 - q``-quantile of all effective runtimes of
    the query. Each included join order is drawn as one line over the tables of the query. The selected rows are
    stored in the ``current_df`` global.

    Parameters
    ----------
    q : float, optional
        The fraction that determines the quantile ``1 - q`` which bounds the included runtimes from below.
        Defaults to 0.1.
    current_label : str, optional
        The query to plot. If given, it also becomes the new value of the ``label`` global. Otherwise the current
        value of ``label`` is used.
    """
    global label
    global current_df
    if current_label:
        label = current_label
    current_df = df[df["label"] == label].copy()
    quant_df = current_df[current_df["effective_runtime"] >= current_df["effective_runtime"].quantile(1 - q)].copy()

    # One column per table (values taken from jointree.join_depth) plus the columns that identify the join order.
    raw_data = dict_utils.aggregate(quant_df["join_order"].apply(jointree.join_depth)) | {"join_order": quant_df["join_order"],
                                                                                          "execution_time": quant_df["execution_time"],
                                                                                          "effective_runtime": quant_df["effective_runtime"]}
    wide_df = pd.DataFrame(raw_data)
    long_df = wide_df.melt(["join_order", "execution_time", "effective_runtime"], var_name="table", value_name="join_index")
    long_df["table"] = long_df["table"].apply(lambda tab: tab.full_name + "\n" + tab.alias)

    plotly_cfg = {"scrollZoom": True}
    title = f"Query {label} ({q * 100}% worst, {len(quant_df)} join orders total)"
    # The hover columns are passed as a list, the same way join_order_differences() does it.
    p = px.line(long_df, x="table", y="join_index", hover_data=["execution_time"], color="join_order", title=title, height=500)
    p.update_layout(showlegend=False)
    p.show(config=plotly_cfg)


def join_order_differences(q: float = 0.1, current_label: str = ""):
    """Plot the join index of each table for the fastest and the slowest join orders of one query in one figure.

    Join orders with an effective runtime of at most the ``q``-quantile are categorized as "best", those with an
    effective runtime of at least the ``1 - q``-quantile as "worst". Lines are colored by category. The selected rows
    are stored in the ``current_df`` global.

    Parameters
    ----------
    q : float, optional
        The fraction that determines both quantiles. Defaults to 0.1.
    current_label : str, optional
        The query to plot. If given, it also becomes the new value of the ``label`` global. Otherwise the current
        value of ``label`` is used.
    """
    global label
    global current_df
    if current_label:
        label = current_label
    current_df = df[df["label"] == label].copy()

    runtimes = current_df["effective_runtime"]
    fastest = current_df[runtimes <= runtimes.quantile(q)].copy()
    fastest["category"] = "best"
    slowest = current_df[runtimes >= runtimes.quantile(1 - q)].copy()
    slowest["category"] = "worst"
    quant_df = pd.concat([fastest, slowest])

    # One column per table (values taken from jointree.join_depth) plus the columns that describe the join order.
    id_columns = ["join_order", "execution_time", "effective_runtime", "category"]
    table_columns = dict_utils.aggregate(quant_df["join_order"].apply(jointree.join_depth))
    wide_df = pd.DataFrame(table_columns | {column: quant_df[column] for column in id_columns})
    long_df = wide_df.melt(id_columns, var_name="table", value_name="join_index")
    long_df["table"] = long_df["table"].apply(lambda tab: tab.full_name + "\n" + tab.alias)

    title = f"Query {label} ({q * 100}% best/worst, {len(quant_df)} join orders total)"
    fig = px.line(long_df, x="table", y="join_index", hover_data=["execution_time"],
                  color="category", symbol="join_order", title=title, height=500)
    fig.update_layout(showlegend=False)
    fig.show(config={"scrollZoom": True})


def join_order(*idx: int):
    """Render the join trees of the given result rows in a grid with at most two columns.

    Parameters
    ----------
    *idx : int
        Positions of the rows in the global ``df`` (as accepted by ``df.iloc``).

    Raises
    ------
    ValueError
        If no index is given.
    """
    if not idx:
        # Without this check the call fails inside matplotlib with a message about the number of subplot rows.
        raise ValueError("join_order() requires at least one row index")

    nrow = (len(idx) + 1) // 2
    ncol = 1 if len(idx) == 1 else 2

    fig_width = max(12 + (ncol - 2) * 4, 10)
    fig_height = max(9 + 7 * nrow, 10)
    # squeeze=False always yields a 2D array of axes, independent of the number of requested plots.
    fig, ax = plt.subplots(nrow, ncol, figsize=(fig_width, fig_height), squeeze=False)

    for plot_idx, graph_idx in enumerate(idx):
        query = df.iloc[graph_idx]

        query_label, join_tree, exec_time = query["label"], query["join_order"], round(query["execution_time"], 2)
        graph = opt_viz.plot_join_tree(join_tree)
        # Render the graph to PNG bytes and read them back as an image that matplotlib can display.
        graph_bin_buffer = io.BytesIO(graph.pipe(format="png"))
        graph_img = plt.imread(graph_bin_buffer)

        title = f"Query {query_label} (idx={graph_idx}, exec time={exec_time}s)"

        row_idx, col_idx = divmod(plot_idx, ncol)
        current_ax = ax[row_idx, col_idx]
        current_ax.imshow(graph_img)
        current_ax.axis("off")
        current_ax.set_title(title, {"fontsize": 10})

    # An odd number of plots leaves the last grid cell unused, so its empty axes are hidden.
    if len(idx) % ncol != 0:
        ax[-1, -1].axis("off")

    fig.subplots_adjust(wspace=0.2, hspace=0.3)
    fig.tight_layout()


def query_plan(*idx):
    """Render the query plans of the given result rows in a grid with at most two columns.

    Parameters
    ----------
    *idx
        Positions of the rows in the global ``df`` (as accepted by ``df.iloc``).

    Raises
    ------
    ValueError
        If no index is given.
    """
    if not idx:
        # Without this check the call fails inside matplotlib with a message about the number of subplot rows.
        raise ValueError("query_plan() requires at least one row index")

    nrow = (len(idx) + 1) // 2
    ncol = 1 if len(idx) == 1 else 2

    fig_width = max(16 + (ncol - 2) * 6, 10)
    fig_height = max(12 + 10 * nrow, 10)
    # squeeze=False always yields a 2D array of axes, independent of the number of requested plots.
    fig, ax = plt.subplots(nrow, ncol, figsize=(fig_width, fig_height), squeeze=False)

    for plot_idx, graph_idx in enumerate(idx):
        query = df.iloc[graph_idx]

        # The plan is stored as `plan` to avoid shadowing the name of this function.
        query_label, plan, exec_time = query["label"], query["query_plan"], round(query["execution_time"], 2)
        graph = db_viz.plot_query_plan(plan)
        # Render the graph to PNG bytes and read them back as an image that matplotlib can display.
        graph_bin_buffer = io.BytesIO(graph.pipe(format="png"))
        graph_img = plt.imread(graph_bin_buffer)

        title = f"Query {query_label} (idx={graph_idx}, exec time={exec_time}s)"

        row_idx, col_idx = divmod(plot_idx, ncol)
        current_ax = ax[row_idx, col_idx]
        current_ax.imshow(graph_img)
        current_ax.axis("off")
        current_ax.set_title(title, {"fontsize": 10})

    # An odd number of plots leaves the last grid cell unused, so its empty axes are hidden.
    if len(idx) % ncol != 0:
        ax[-1, -1].axis("off")

    fig.subplots_adjust(wspace=0.2, hspace=0.3)
    fig.tight_layout()

Available functions are:

- `all_families()` plots a KDE of the execution time distribution for all query types in a family (for all families).
- `family_overview(idx)` plots a KDE of the execution time distribution for all queries in a family.
- `runtime_hist(label)` plots a histogram of the execution time distribution for a specific query. Number of bins can be adjusted using the slider.
- `exec_time_distribution(label)` plots all join orders of a query as a point cloud, such that similar join orders are located close together. Each join order is colored by its execution time.
- `best_join_orders(pct, label)` and `worst_join_orders(pct, label)` plot the fraction $pct$ of all join orders for the given label that have the lowest/highest runtime, respectively.
- `join_order_differences(pct, label)` plots the fraction $pct$ of the best join orders together with the fraction $pct$ of the worst join orders for the given label.
- `join_order(idx)` and `query_plan(idx)` visualize the join order and the optimized query plan for one or more specific instances, identified by their row index.

Instead of passing the `label` parameter to each function, the `label` global variable can be set.
