# Evaluation: Query Runtime Variation


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import pathlib
import re

import ipywidgets as ipyw
import IPython.display as ipyd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

from postbound.experiments import workloads
from postbound.optimizer import jointree
from postbound.vis import fdl
from postbound.util import dataframe as df_utils, dicts as dict_utils

sns.set_theme(style="whitegrid")

def parse_join_tree(raw_jt):
    """Turn the serialized join order of a result file into a logical join tree.

    The raw text is decoded as JSON, translated into a join sequence by
    ``jointree.parse_nested_table_sequence`` and finally loaded via
    ``jointree.LogicalJoinTree.load_from_list``.
    """
    nested_tables = json.loads(raw_jt)
    return jointree.LogicalJoinTree.load_from_list(
        jointree.parse_nested_table_sequence(nested_tables)
    )

# Name layout of the per-query result files, e.g. "join-order-runtimes-12b.csv".
result_file_pattern = r"join-order-runtimes-(?P<label>\d{1,2}[a-z])\.csv"
# A query label is a numeric family index followed by a single-letter variant.
label_pattern = r"(?P<index>\d{1,2})(?P<variant>[a-z])"
root_dir = "results/query-runtime-variation"

# Gather the per-file frames and concatenate them once at the end. Growing a
# DataFrame via pd.concat inside the loop copies all previously loaded rows for
# each new file.
result_sets = []

for result_file in pathlib.Path(root_dir).glob("join-order-runtimes-*.csv"):
    file_matcher = re.match(result_file_pattern, result_file.name)
    if not file_matcher:
        continue
    # NOTE: `label` is also the module-level name the plotting functions fall
    # back to, so after this loop it holds the label of the last matched file.
    label = file_matcher.group("label")
    label_matcher = re.match(label_pattern, label)
    if not label_matcher:
        continue
    query_index, query_variant = label_matcher.group("index"), label_matcher.group("variant")
    query_index = int(query_index)

    current_result_set = pd.read_csv(result_file, converters={"join_order": parse_join_tree})
    current_result_set["query_family"] = query_index
    # Rows without a finite execution time get the value of their "timeout"
    # column as runtime instead.
    current_result_set["effective_runtime"] = np.where(np.isfinite(current_result_set["execution_time"]),
                                                       current_result_set["execution_time"],
                                                       current_result_set["timeout"])
    result_sets.append(current_result_set)

# pd.concat raises on an empty sequence, so keep the empty frame when no result
# file was found.
df = pd.concat(result_sets, ignore_index=True) if result_sets else pd.DataFrame()

# Rows of the query that is currently being inspected. Set by the per-query
# plotting functions and read by update_hist.
current_df = None

def family_overview(family: int, *, ax = None):
    """Plot the runtime distribution of one query family as a KDE.

    All rows of the global ``df`` whose ``query_family`` equals `family` are
    drawn, with one curve per value of the ``label`` column.

    Parameters
    ----------
    family
        Index of the query family to plot.
    ax
        Axes to draw on. Falls back to the current matplotlib axes.
    """
    target_ax = ax or plt.gca()

    family_rows = df[df["query_family"] == family].copy()
    total = len(family_rows)
    # An infinite execution time marks a run that hit its timeout.
    timeouts = np.isinf(family_rows.execution_time).sum()

    kde = sns.kdeplot(data=family_rows, x="effective_runtime", hue="label", ax=target_ax)

    kde.set_title(f"Family: {family} (total = {total}, timeouts = {timeouts})")
    kde.set_xlabel("Execution time [s]")
    # NOTE(review): a KDE shows a density on the y axis, so this label is
    # probably not accurate - confirm before changing the text.
    kde.set_ylabel("Number of join orders")
    

def all_families():
    """Plot the runtime KDE of every query family, one subplot per family.

    The families are taken from the ``query_family`` column of the global
    ``df`` and are stacked vertically in ascending order. Nothing is drawn if
    there are no families.
    """
    families = sorted(df["query_family"].unique())
    if not families:
        # plt.subplots rejects a grid without rows.
        return

    # squeeze=False always gives a 2D array of axes. Without it a single family
    # yields a bare Axes object that cannot be indexed.
    fig, axes = plt.subplots(len(families), 1, figsize=(10, 5 * len(families)), squeeze=False)
    fig.tight_layout(h_pad=5)

    for family_ax, family in zip(axes[:, 0], families):
        family_overview(family, ax=family_ax)

def update_hist(bins):
    """Draw the runtime histogram of the currently selected query.

    Reads the global ``current_df`` and ``label``; it is used as the callback
    of the bins slider created in ``runtime_hist``.

    Parameters
    ----------
    bins
        Number of histogram bins, forwarded to ``sns.histplot``.
    """
    hist = sns.histplot(data=current_df, x="effective_runtime", bins=bins)
    total = len(current_df)
    # An infinite execution time marks a run that hit its timeout.
    timeouts = np.isinf(current_df["execution_time"]).sum()
    hist.set(title=f"Query: {label} (Total={total} Timeout={timeouts})",
             xlabel="Execution time [s]",
             ylabel="Number of join orders")
    
    
def runtime_hist(current_label: str = ""):
    """Show an interactive runtime histogram for a single query.

    Parameters
    ----------
    current_label
        Label of the query to plot. If empty, the global ``label`` is used.
        Otherwise the global ``label`` is updated to this value.
    """
    global label
    global current_df
    label = current_label or label
    current_df = df[df["label"] == label].copy()

    # The slider needs min <= value <= max. Queries with fewer than two rows
    # would otherwise produce an invalid range.
    max_bins = max(len(current_df), 2)
    bins_slider = ipyw.IntSlider(value=min(10, max_bins), min=2, max=max_bins, step=1, description="Bins")
    interactive_plot = ipyw.interactive(update_hist, bins=bins_slider)
    ipyd.display(interactive_plot)


def exec_time_distribution(current_label: str = ""):
    """Show all join orders of a query as a 2D point cloud colored by runtime.

    The point coordinates come from ``fdl.kamada_kawai_layout``, using the
    inverse of ``jointree.bottom_up_similarity`` as pairwise distance.

    Parameters
    ----------
    current_label
        Label of the query to plot. If empty, the global ``label`` is used.
        Otherwise the global ``label`` is updated to this value.
    """
    global label
    global current_df
    label = current_label or label
    current_df = df[df["label"] == label].copy()

    def join_order_distance(a, b):
        # The similarity is clamped from below to avoid dividing by zero.
        return 1 / max(jointree.bottom_up_similarity(a, b), 1e-5)

    layout = fdl.kamada_kawai_layout(current_df["join_order"], join_order_distance)
    positions = df_utils.as_df(layout, key_name="join_order", column_names=["x", "y"])
    runtimes = current_df[["join_order", "effective_runtime", "execution_time"]]
    scatter_df = positions.merge(runtimes, on="join_order")
    # Join orders are shown as plain text in the hover box.
    scatter_df["join_order"] = scatter_df["join_order"].apply(str)

    figure = px.scatter(
        scatter_df,
        x="x",
        y="y",
        color="effective_runtime",
        hover_data=["join_order", "execution_time"],
        color_continuous_scale="viridis",
        height=750,
    )
    # The layout coordinates carry no meaning of their own, so hide both axes.
    figure.update_xaxes(visible=False)
    figure.update_yaxes(visible=False)
    figure.update_layout(dragmode="pan", plot_bgcolor="white")
    figure.show(config={"scrollZoom": True})
    

def _plot_join_depths(quant_df):
    """Draw one line per join order showing the join index of each table.

    `quant_df` needs the columns ``join_order``, ``execution_time`` and
    ``effective_runtime``.
    """
    # presumably join_depth maps each table of a join order to its position in
    # the tree and aggregate collects these mappings column-wise - verify
    # against the postbound helpers
    raw_data = dict_utils.aggregate(quant_df["join_order"].apply(jointree.join_depth)) | {
        "join_order": quant_df["join_order"],
        "execution_time": quant_df["execution_time"],
        "effective_runtime": quant_df["effective_runtime"],
    }
    wide_df = pd.DataFrame(raw_data)
    # Every remaining column is a table; turn them into (table, join_index) rows.
    long_df = wide_df.melt(["join_order", "execution_time", "effective_runtime"], var_name="table", value_name="join_index")
    long_df["table"] = long_df["table"].apply(lambda tab: tab.full_name + "\n" + tab.alias)

    plotly_cfg = {"scrollZoom": True}
    # hover_data is passed as a list, like in exec_time_distribution, because
    # not every plotly version accepts a bare column name here.
    p = px.line(long_df, x="table", y="join_index", hover_data=["execution_time"], color="join_order")
    p.update_layout(showlegend=False)
    p.show(config=plotly_cfg)


def best_join_orders(quantile: float = 0.1, current_label: str = ""):
    """Plot the join orders of a query with the lowest runtimes.

    Parameters
    ----------
    quantile
        Fraction of join orders to include. Only join orders whose effective
        runtime is at most the `quantile` quantile are plotted.
    current_label
        Label of the query to plot. If empty, the global ``label`` is used.
        Otherwise the global ``label`` is updated to this value.
    """
    global label
    global current_df
    label = current_label or label
    current_df = df[df["label"] == label].copy()
    threshold = current_df["effective_runtime"].quantile(quantile)
    quant_df = current_df[current_df["effective_runtime"] <= threshold].copy()
    _plot_join_depths(quant_df)


def worst_join_orders(quantile: float = 0.1, current_label: str = ""):
    """Plot the join orders of a query with the highest runtimes.

    Parameters
    ----------
    quantile
        Fraction of join orders to include. Only join orders whose effective
        runtime is at least the ``1 - quantile`` quantile are plotted.
    current_label
        Label of the query to plot. If empty, the global ``label`` is used.
        Otherwise the global ``label`` is updated to this value.
    """
    global label
    global current_df
    label = current_label or label
    current_df = df[df["label"] == label].copy()
    threshold = current_df["effective_runtime"].quantile(1 - quantile)
    quant_df = current_df[current_df["effective_runtime"] >= threshold].copy()
    _plot_join_depths(quant_df)

Available functions are:

- `all_families()` plots, for every query family, a KDE of the execution time distribution of all queries in that family.
- `family_overview(idx)` plots a KDE of the execution time distribution for all queries in a family.
- `runtime_hist(label)` plots a histogram of the execution time distribution for a specific query. Number of bins can be adjusted using the slider.
- `exec_time_distribution(label)` plots all join orders of a query as a point cloud, such that similar join orders are located closely together. Each join order is colored by its execution time.
- `best_join_orders(quantile, label)` and `worst_join_orders(quantile, label)` plot the given `quantile` fraction of all join orders for the given label, such that only the join orders with the lowest/highest runtime are included.

Instead of passing the `label` parameter to each function, the `label` global variable can be set.


In [3]:
# Query that the per-query plotting functions use when they are called without
# an explicit label.
label = "1a"