# UES workload analysis

## Data loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import re
import warnings

import natsort
import numpy as np
import pandas as pd
import seaborn as sns

from transform import db, mosp
from postgres import explain
from analysis import selection

In [3]:
sns.set_theme(style="whitegrid")

In [4]:
df_base = pd.read_csv("workloads/job-ues-results-rebuild.csv")
df_lin = pd.read_csv("workloads/job-ues-results-linearized.csv")

## Initial analysis

Get a rough overview of the workload performance:

In [5]:
df_base.groupby("run")["query_rt_total"].sum()

run
1    412.299860
2    411.238048
3    406.238064
4    409.906597
5    409.552837
Name: query_rt_total, dtype: float64

In [6]:
df_lin.groupby("run")["query_rt_total"].sum()

run
1    412.487329
2    407.745956
3    406.529721
4    402.119050
5    407.860975
Name: query_rt_total, dtype: float64

In [7]:
df_base = selection.best_query_repetition(df_base, "label", performance_col="query_rt_total").copy()
df_lin = selection.best_query_repetition(df_lin, "label", performance_col="query_rt_total").copy()

In [8]:
df_base["workload"] = "ues"
df_lin["workload"] = "ues_linearized"
df = pd.concat([df_base, df_lin])
df.sort_values(by="label", key=lambda _: np.argsort(natsort.index_natsorted(df["label"])),
               inplace=True)
df

Unnamed: 0,query,label,optimization_time,query_result,query_rt_total,run,workload
38,SELECT COUNT(*) FROM movie_companies AS mc JOI...,1a,0.004274,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",1.050499,3,ues
38,SELECT COUNT(*) FROM movie_companies AS mc JOI...,1a,0.004410,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",0.992252,1,ues_linearized
39,SELECT COUNT(*) FROM movie_info_idx AS mi_idx ...,1b,0.003848,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",0.555610,4,ues
39,SELECT COUNT(*) FROM movie_info_idx AS mi_idx ...,1b,0.003347,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",0.553253,3,ues_linearized
40,SELECT COUNT(*) FROM movie_companies AS mc JOI...,1c,0.005587,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",0.737995,1,ues
...,...,...,...,...,...,...,...
84,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,33a,0.363256,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",1.583707,2,ues_linearized
85,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,33b,0.009899,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",0.664327,1,ues
85,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,33b,0.009747,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",0.589906,2,ues_linearized
86,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,33c,0.010593,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",1.823239,2,ues


## Feature calculation

In [9]:
def parse_query_plans(sample: pd.Series) -> explain.PlanNode:
    query, plan, workload = sample["query"], sample["query_result"], sample["workload"]
    if workload == "ues":
        parsed_plan = explain.parse_explain_analyze(query, plan)
    elif workload == "ues_linearized":
        parsed_plan = explain.parse_explain_analyze(query, plan, with_subqueries=False)
    else:
        warnings.warn("Unknown workload '{}', assuming no subqueries".format(workload))
        parsed_plan = explain.parse_explain_analyze(query, plan, with_subqueries=False)
    return parsed_plan

In [10]:
df.query_result = df.query_result.apply(json.loads)
df["query"] = df["query"].apply(mosp.MospQuery.parse)
df["explain"] = df.query_result
df.query_result = df.apply(parse_query_plans, axis="columns")
df["subquery"] = df.query_result.apply(explain.PlanNode.extract_subqueries)
df = df.explode("subquery")
df



Unnamed: 0,query,label,optimization_time,query_result,query_rt_total,run,workload,explain,subquery
38,SELECT COUNT(*) FROM movie_companies AS mc JOI...,1a,0.004274,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,1.050499,3,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...
38,SELECT COUNT(*) FROM movie_companies AS mc JOI...,1a,0.004410,Hash Join (mi_idx.info_type_id = it.id) <- [Ha...,0.992252,1,ues_linearized,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",
39,SELECT COUNT(*) FROM movie_info_idx AS mi_idx ...,1b,0.003848,Hash Join (mc.company_type_id = ct.id) <- [Has...,0.555610,4,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",
39,SELECT COUNT(*) FROM movie_info_idx AS mi_idx ...,1b,0.003347,Hash Join (mc.company_type_id = ct.id) <- [Has...,0.553253,3,ues_linearized,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",
40,SELECT COUNT(*) FROM movie_companies AS mc JOI...,1c,0.005587,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.737995,1,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...
...,...,...,...,...,...,...,...,...,...
85,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,33b,0.009899,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.664327,1,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...
85,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,33b,0.009747,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.589906,2,ues_linearized,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",
86,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,33c,0.010593,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.823239,2,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...
86,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,33c,0.010593,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.823239,2,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...


In [11]:
df_ues = df[df.workload == "ues"].copy()
df_ues_lin = df[df.workload == "ues_linearized"].copy()

In [12]:
df_sqs = df_ues[~df_ues.subquery.isna()].copy()
df_sqs["pruned"] = df_sqs.subquery.apply(lambda sq: sq.any_pruned())
len(df_sqs)

105

In [13]:
df_sqs

Unnamed: 0,query,label,optimization_time,query_result,query_rt_total,run,workload,explain,subquery,pruned
38,SELECT COUNT(*) FROM movie_companies AS mc JOI...,1a,0.004274,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,1.050499,3,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...,False
40,SELECT COUNT(*) FROM movie_companies AS mc JOI...,1c,0.005587,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.737995,1,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...,False
74,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,2c,0.003486,Hash Join (mc.movie_id = t.id) <- [Hash Join (...,0.975779,1,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mc.company_id = cn.id) <- [Seq Scan...,False
87,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,3a,0.002976,Hash Join (mk.movie_id = t.id) <- [Hash Join (...,2.917895,5,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mk.keyword_id = k.id) <- [Seq Scan ...,False
88,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,3b,0.003484,Hash Join (mk.movie_id = t.id) <- [Hash Join (...,1.606580,3,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mk.keyword_id = k.id) <- [Seq Scan ...,False
...,...,...,...,...,...,...,...,...,...,...
84,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,33a,0.182369,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.629256,1,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False
85,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,33b,0.009899,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.664327,1,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False
85,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,33b,0.009899,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.664327,1,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False
86,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,33c,0.010593,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.823239,2,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False


In [14]:
df_ues_wide = pd.merge(
    df_sqs.drop(columns=["run", "workload"]),
    df_ues_lin.drop(columns=["run", "workload", "subquery"]),
    on="label", how="inner",
    suffixes=("_ues", "_ues_lin"))
df_ues_wide["sq_pred"] = df_ues_wide.subquery.apply(getattr, args=("join_pred",))
df_ues_wide.rename(columns={"query_rt_total_ues": "rt_ues", "query_rt_total_ues_lin": "rt_ues_lin"}, inplace=True)
len(df_ues_wide)

105

In [15]:
df_ues_wide

Unnamed: 0,query_ues,label,optimization_time_ues,query_result_ues,rt_ues,explain_ues,subquery,pruned,query_ues_lin,optimization_time_ues_lin,query_result_ues_lin,rt_ues_lin,explain_ues_lin,sq_pred
0,SELECT COUNT(*) FROM movie_companies AS mc JOI...,1a,0.004274,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,1.050499,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...,False,SELECT COUNT(*) FROM movie_companies AS mc JOI...,0.004410,Hash Join (mi_idx.info_type_id = it.id) <- [Ha...,0.992252,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mi_idx.info_type_id = it.id)
1,SELECT COUNT(*) FROM movie_companies AS mc JOI...,1c,0.005587,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.737995,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...,False,SELECT COUNT(*) FROM movie_companies AS mc JOI...,0.004804,Hash Join (mi_idx.info_type_id = it.id) <- [Ha...,0.701607,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mi_idx.info_type_id = it.id)
2,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,2c,0.003486,Hash Join (mc.movie_id = t.id) <- [Hash Join (...,0.975779,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mc.company_id = cn.id) <- [Seq Scan...,False,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,0.002770,Hash Join (mc.company_id = cn.id) <- [Hash Joi...,1.028141,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mc.company_id = cn.id)
3,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,3a,0.002976,Hash Join (mk.movie_id = t.id) <- [Hash Join (...,2.917895,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mk.keyword_id = k.id) <- [Seq Scan ...,False,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,0.002648,Hash Join (mk.keyword_id = k.id) <- [Hash Join...,2.656466,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mk.keyword_id = k.id)
4,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,3b,0.003484,Hash Join (mk.movie_id = t.id) <- [Hash Join (...,1.606580,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mk.keyword_id = k.id) <- [Seq Scan ...,False,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,0.004370,Hash Join (mk.keyword_id = k.id) <- [Hash Join...,1.438635,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mk.keyword_id = k.id)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,33a,0.182369,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.629256,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,0.363256,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.583707,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mi_idx1.info_type_id = it1.id)
101,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,33b,0.009899,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.664327,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,0.009747,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.589906,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mi_idx2.info_type_id = it2.id)
102,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,33b,0.009899,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.664327,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,0.009747,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.589906,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mi_idx1.info_type_id = it1.id)
103,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,33c,0.010593,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.823239,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,0.010535,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.756614,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mi_idx2.info_type_id = it2.id)


In [16]:
def subquery_join_partner(subquery):
    parent = subquery.parent
    left, right = parent.left, parent.right
    partner = left if subquery.join_pred == right.join_pred else right
    return partner

In [17]:
df_ues_wide["sq_fk_partner"] = df_ues_wide.apply(lambda row: row["query_result_ues"].lookup_subquery(row["sq_pred"]), axis="columns").apply(lambda sq: sq.base_table())
df_ues_wide["sq_fk_table"] = df_ues_wide.apply(lambda row: row["query_result_ues"].lookup_scan(row["sq_fk_partner"]), axis="columns")
df_ues_wide["sq_fk_rows"] = df_ues_wide["sq_fk_table"].apply(lambda tab: tab.incoming_rows(fallback_live=True))

df_ues_wide["sq_join"] = df_ues_wide.apply(lambda row: row["query_result_ues"].lookup_join(row["sq_pred"]), axis="columns")
df_ues_wide["sq_join_rows"] = df_ues_wide["sq_join"].apply(lambda join: join.lookup_subquery().count_result_tuples())
df_ues_wide["sq_join_duration"] = df_ues_wide["sq_join"].apply(lambda join: join.exec_time)

df_ues_wide["sq_join_partner"] = df_ues_wide["sq_join"].apply(subquery_join_partner)
df_ues_wide["sq_join_partner_duration"] = df_ues_wide["sq_join_partner"].apply(lambda join: join.exec_time)

df_ues_wide["node_pruned_ues"] = df_ues_wide.query_result_ues.apply(lambda plan: plan.any_pruned(exclude_subqueries=True))
df_ues_wide["node_pruned_ues_lin"] = df_ues_wide.query_result_ues_lin.apply(lambda plan: plan.any_pruned(exclude_subqueries=True))

In [18]:
df_ues_wide

Unnamed: 0,query_ues,label,optimization_time_ues,query_result_ues,rt_ues,explain_ues,subquery,pruned,query_ues_lin,optimization_time_ues_lin,...,sq_fk_partner,sq_fk_table,sq_fk_rows,sq_join,sq_join_rows,sq_join_duration,sq_join_partner,sq_join_partner_duration,node_pruned_ues,node_pruned_ues_lin
0,SELECT COUNT(*) FROM movie_companies AS mc JOI...,1a,0.004274,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,1.050499,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...,False,SELECT COUNT(*) FROM movie_companies AS mc JOI...,0.004410,...,movie_info_idx AS mi_idx,Seq Scan :: movie_info_idx,2367725,Hash Join (mi_idx.info_type_id = it.id) <- [Se...,250,0.119231,Hash Join (mc.movie_id = t.id) <- [Hash Join (...,0.885482,False,False
1,SELECT COUNT(*) FROM movie_companies AS mc JOI...,1c,0.005587,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.737995,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...,False,SELECT COUNT(*) FROM movie_companies AS mc JOI...,0.004804,...,movie_info_idx AS mi_idx,Seq Scan :: movie_info_idx,2367725,Hash Join (mi_idx.info_type_id = it.id) <- [Se...,250,0.108347,Hash Join (mc.movie_id = t.id) <- [Hash Join (...,0.602376,False,False
2,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,2c,0.003486,Hash Join (mc.movie_id = t.id) <- [Hash Join (...,0.975779,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mc.company_id = cn.id) <- [Seq Scan...,False,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,0.002770,...,movie_companies AS mc,Seq Scan :: movie_companies,4958296,Hash Join (mc.company_id = cn.id) <- [Seq Scan...,7,0.244117,Hash Join (t.id = mk.movie_id) <- [Seq Scan ::...,0.691910,False,False
3,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,3a,0.002976,Hash Join (mk.movie_id = t.id) <- [Hash Join (...,2.917895,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mk.keyword_id = k.id) <- [Seq Scan ...,False,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,0.002648,...,movie_keyword AS mk,Seq Scan :: movie_keyword,7480087,Hash Join (mk.keyword_id = k.id) <- [Seq Scan ...,14050,0.362651,Hash Join (mi.movie_id = t.id) <- [Seq Scan ::...,2.517675,False,False
4,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,3b,0.003484,Hash Join (mk.movie_id = t.id) <- [Hash Join (...,1.606580,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mk.keyword_id = k.id) <- [Seq Scan ...,False,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,0.004370,...,movie_keyword AS mk,Seq Scan :: movie_keyword,7480087,Hash Join (mk.keyword_id = k.id) <- [Seq Scan ...,14050,0.385987,Hash Join (mi.movie_id = t.id) <- [Seq Scan ::...,1.197622,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,33a,0.182369,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.629256,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,0.363256,...,movie_info_idx AS mi_idx1,Seq Scan :: movie_info_idx,2367725,Hash Join (mi_idx1.info_type_id = it1.id) <- [...,789155,0.135864,Hash Join (t1.kind_id = kt1.id) <- [Merge Join...,1.467996,True,True
101,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,33b,0.009899,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.664327,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,0.009747,...,movie_info_idx AS mi_idx2,Seq Scan :: movie_info_idx,2367725,Hash Join (mi_idx2.info_type_id = it2.id) <- [...,13726,0.000268,Hash Join (mi_idx1.movie_id = t1.id) <- [Hash ...,0.645468,False,False
102,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,33b,0.009899,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.664327,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,0.009747,...,movie_info_idx AS mi_idx1,Seq Scan :: movie_info_idx,2367725,Hash Join (mi_idx1.info_type_id = it1.id) <- [...,789155,0.000017,Hash Join (t1.kind_id = kt1.id) <- [Merge Join...,0.645448,False,False
103,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,33c,0.010593,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.823239,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,0.010535,...,movie_info_idx AS mi_idx2,Seq Scan :: movie_info_idx,2367725,Hash Join (mi_idx2.info_type_id = it2.id) <- [...,20658,0.141106,Hash Join (mi_idx1.movie_id = t1.id) <- [Hash ...,1.671990,True,True


In [20]:
df_ues_wide["filter_strength_rel"] = df_ues_wide.sq_fk_rows / df_ues_wide.sq_join_rows
df_ues_wide["filter_strength_abs"] = df_ues_wide.sq_fk_rows - df_ues_wide.sq_join_rows
df_ues_wide["sq_speedup_rel"] = df_ues_wide.rt_ues_lin / df_ues_wide.rt_ues
df_ues_wide["sq_speedup_abs"] = df_ues_wide.rt_ues_lin - df_ues_wide.rt_ues
df_ues_wide[["label", "subquery", "sq_fk_rows", "sq_join_rows", "rt_ues", "rt_ues_lin", "filter_strength_rel", "sq_speedup_rel"]]

Unnamed: 0,label,subquery,sq_fk_rows,sq_join_rows,rt_ues,rt_ues_lin,filter_strength_rel,sq_speedup_rel
0,1a,Hash Join (mi_idx.info_type_id = it.id) <- [Se...,2367725,250,1.050499,0.992252,9470.900000,0.944553
1,1c,Hash Join (mi_idx.info_type_id = it.id) <- [Se...,2367725,250,0.737995,0.701607,9470.900000,0.950693
2,2c,Hash Join (mc.company_id = cn.id) <- [Seq Scan...,4958296,7,0.975779,1.028141,708328.000000,1.053662
3,3a,Hash Join (mk.keyword_id = k.id) <- [Seq Scan ...,7480087,14050,2.917895,2.656466,532.390534,0.910405
4,3b,Hash Join (mk.keyword_id = k.id) <- [Seq Scan ...,7480087,14050,1.606580,1.438635,532.390534,0.895464
...,...,...,...,...,...,...,...,...
100,33a,Hash Join (mi_idx1.info_type_id = it1.id) <- [...,2367725,789155,1.629256,1.583707,3.000329,0.972043
101,33b,Hash Join (mi_idx2.info_type_id = it2.id) <- [...,2367725,13726,0.664327,0.589906,172.499271,0.887975
102,33b,Hash Join (mi_idx1.info_type_id = it1.id) <- [...,2367725,789155,0.664327,0.589906,3.000329,0.887975
103,33c,Hash Join (mi_idx2.info_type_id = it2.id) <- [...,2367725,20658,1.823239,1.756614,114.615403,0.963458


In [23]:
df_export = df_ues_wide.drop(
    columns=["query_result_ues_lin",
             "sq_pred",
             "sq_fk_partner",
             "sq_fk_table",
             "sq_join",
             "sq_join_partner",
             "explain_ues_lin"
            ]).rename(
    columns={"query_result_ues": "ues_plan",
             "explain_ues": "ues_explain",
             "sq_fk_rows": "foreign_key_rows",
             "sq_join_rows": "subquery_rows",
             "rt_ues": "runtime_ues",
             "rt_ues_lin": "runtime_ues_linear",
             "sq_speedup_rel": "ues_speedup_rel",
             "sq_speedup_abs": "ues_speedup_abs",
             "query_ues": "query_ues",
             "query_ues_lin": "query_ues_linear",
             "sq_join_duration": "subquery_runtime",
             "sq_join_partner_duration": "subquery_partner_runtime",
             "pruned": "subquery_pruned",
             "node_pruned_ues": "ues_pruned",
             "node_pruned_ues_lin": "ues_linear_pruned"
            })
df_export["tables"] = df_export.query_ues.apply(lambda q: json.dumps([tab.full_name for tab in q.collect_tables()]))
df_export["subquery_tables"] = df_export.subquery.apply(lambda sq: sq.lookup_subquery(sq.join_pred).collect_tables()).apply(lambda sq_tabs: json.dumps([tab.full_name for tab in sq_tabs]))

df_export.ues_explain = df_export.ues_explain.apply(json.dumps)

df_export.to_csv("workloads/job-ues-eval-linearized.csv", index=False)
df_export

Unnamed: 0,query_ues,label,optimization_time_ues,ues_plan,runtime_ues,ues_explain,subquery,subquery_pruned,query_ues_linear,optimization_time_ues_lin,...,subquery_runtime,subquery_partner_runtime,ues_pruned,ues_linear_pruned,filter_strength_rel,filter_strength_abs,ues_speedup_rel,ues_speedup_abs,tables,subquery_tables
0,SELECT COUNT(*) FROM movie_companies AS mc JOI...,1a,0.004274,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,1.050499,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...,False,SELECT COUNT(*) FROM movie_companies AS mc JOI...,0.004410,...,0.119231,0.885482,False,False,9470.900000,2367475,0.944553,-0.058247,"[""movie_companies"", ""company_type"", ""title"", ""...","[""movie_info_idx"", ""info_type""]"
1,SELECT COUNT(*) FROM movie_companies AS mc JOI...,1c,0.005587,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.737995,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...,False,SELECT COUNT(*) FROM movie_companies AS mc JOI...,0.004804,...,0.108347,0.602376,False,False,9470.900000,2367475,0.950693,-0.036388,"[""movie_companies"", ""company_type"", ""title"", ""...","[""movie_info_idx"", ""info_type""]"
2,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,2c,0.003486,Hash Join (mc.movie_id = t.id) <- [Hash Join (...,0.975779,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mc.company_id = cn.id) <- [Seq Scan...,False,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,0.002770,...,0.244117,0.691910,False,False,708328.000000,4958289,1.053662,0.052362,"[""movie_keyword"", ""keyword"", ""title"", ""movie_c...","[""movie_companies"", ""company_name""]"
3,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,3a,0.002976,Hash Join (mk.movie_id = t.id) <- [Hash Join (...,2.917895,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mk.keyword_id = k.id) <- [Seq Scan ...,False,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,0.002648,...,0.362651,2.517675,False,False,532.390534,7466037,0.910405,-0.261429,"[""movie_info"", ""title"", ""movie_keyword"", ""keyw...","[""movie_keyword"", ""keyword""]"
4,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,3b,0.003484,Hash Join (mk.movie_id = t.id) <- [Hash Join (...,1.606580,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mk.keyword_id = k.id) <- [Seq Scan ...,False,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,0.004370,...,0.385987,1.197622,False,False,532.390534,7466037,0.895464,-0.167945,"[""movie_info"", ""title"", ""movie_keyword"", ""keyw...","[""movie_keyword"", ""keyword""]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,33a,0.182369,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.629256,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,0.363256,...,0.135864,1.467996,True,True,3.000329,1578570,0.972043,-0.045549,"[""movie_link"", ""link_type"", ""title"", ""kind_typ...","[""movie_info_idx"", ""info_type""]"
101,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,33b,0.009899,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.664327,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,0.009747,...,0.000268,0.645468,False,False,172.499271,2353999,0.887975,-0.074421,"[""movie_link"", ""link_type"", ""title"", ""kind_typ...","[""movie_info_idx"", ""info_type""]"
102,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,33b,0.009899,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.664327,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,0.009747,...,0.000017,0.645448,False,False,3.000329,1578570,0.887975,-0.074421,"[""movie_link"", ""link_type"", ""title"", ""kind_typ...","[""movie_info_idx"", ""info_type""]"
103,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,33c,0.010593,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.823239,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,0.010535,...,0.141106,1.671990,True,True,114.615403,2347067,0.963458,-0.066625,"[""movie_link"", ""link_type"", ""title"", ""kind_typ...","[""movie_info_idx"", ""info_type""]"


## Some sanity checks for the export data set

In [19]:
df_export.groupby("label")[["label"]].count().rename(columns={"label": "count"}).reset_index().sort_values(by="count", ascending=False)

Unnamed: 0,label,count
58,31c,3
56,31a,3
39,25c,2
61,33a,2
57,31b,2
...,...,...
35,23c,1
36,24b,1
38,25b,1
40,26a,1


In [20]:
df_export.loc[df_export.filter_strength.isna() & ~df_export.subquery_pruned]

Unnamed: 0,label,query_ues,ues_plan,runtime_ues,ues_explain,subquery,subquery_pruned,query_flat,runtime_flat,foreign_key_rows,rows_after_join,subquery_runtime,subquery_partner_runtime,ues_pruned,flat_pruned,filter_strength,ues_speedup,tables,subquery_tables


In [21]:
df_export.loc[df_export.label == "6b"]

Unnamed: 0,label,query_ues,ues_plan,runtime_ues,ues_explain,subquery,subquery_pruned,query_flat,runtime_flat,foreign_key_rows,rows_after_join,subquery_runtime,subquery_partner_runtime,ues_pruned,flat_pruned,filter_strength,ues_speedup,tables,subquery_tables
10,6b,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,Hash Join (ci.movie_id = t.id) <- [Hash Join (...,0.536543,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (ci.person_id = n.id) <- [~Seq Scan~...,True,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,2.252002,0,0,0.181875,0.349454,True,False,,4.197244,"[""movie_keyword"", ""keyword"", ""title"", ""cast_in...","[""cast_info"", ""name""]"


In [22]:
df_export.loc[df_export.label == "20b"].iloc[0].query_ues

SELECT COUNT(*) FROM complete_cast AS cc JOIN comp_cast_type AS cct1 ON cct1.kind = 'cast' AND cct1.id = cc.subject_id JOIN comp_cast_type AS cct2 ON cct2.kind LIKE '%complete%' AND cct2.id = cc.status_id JOIN title AS t ON t.production_year > 2000 AND t.id = cc.movie_id JOIN kind_type AS kt ON kt.kind = 'movie' AND kt.id = t.kind_id JOIN (SELECT movie_id FROM movie_keyword AS mk JOIN keyword AS k ON k.keyword IN ('superhero', 'sequel', 'second-part', 'marvel-comics', 'based-on-comic', 'tv-special', 'fight', 'violence') AND k.id = mk.keyword_id) AS t_mk ON t_mk.movie_id = cc.movie_id JOIN (SELECT person_id, movie_id FROM cast_info AS ci JOIN name AS n ON n.name LIKE '%downey%robert%' AND n.id = ci.person_id JOIN char_name AS chn ON chn.name NOT LIKE '%sherlock%' AND (chn.name LIKE '%tony%stark%' OR chn.name LIKE '%iron%man%') AND chn.id = ci.person_role_id) AS t_ci ON t_ci.movie_id = t_mk.movie_id

In [23]:
print(df_export.loc[df_export.label == "6b"].iloc[0].ues_plan.pretty_print(include_filter=True))

Hash Join (ci.movie_id = t.id)
  <- [SQ] Hash Join (ci.person_id = n.id)
    <- [PRUNED] Seq Scan :: cast_info
    <- Seq Scan :: name (((name)::text ~~ '%downey%robert%'::text))
  <- Hash Join (t.id = mk.movie_id)
    <- Seq Scan :: title ((production_year > 2014))
    <- Hash Join (mk.keyword_id = k.id)
      <- Seq Scan :: movie_keyword
      <- Seq Scan :: keyword (((keyword)::text = ANY ('{superhero,sequel,second-part,marvel-comics,based-on-comic,tv-special,fight,violence}'::text[])))



In [24]:
df_export.loc[df_export.label == "6b"].iloc[0].ues_explain

'[{"Plan": {"Node Type": "Aggregate", "Strategy": "Plain", "Partial Mode": "Simple", "Parallel Aware": false, "Async Capable": false, "Startup Cost": 622730.99, "Total Cost": 622731.0, "Plan Rows": 1, "Plan Width": 8, "Actual Startup Time": 534.419, "Actual Total Time": 535.652, "Actual Rows": 1, "Actual Loops": 1, "Plans": [{"Node Type": "Gather", "Parent Relationship": "Outer", "Parallel Aware": false, "Async Capable": false, "Startup Cost": 179377.81, "Total Cost": 622730.98, "Plan Rows": 1, "Plan Width": 0, "Actual Startup Time": 534.414, "Actual Total Time": 535.647, "Actual Rows": 0, "Actual Loops": 1, "Workers Planned": 2, "Workers Launched": 2, "Single Copy": false, "Plans": [{"Node Type": "Hash Join", "Parent Relationship": "Outer", "Parallel Aware": true, "Async Capable": false, "Join Type": "Inner", "Startup Cost": 178377.81, "Total Cost": 621730.88, "Plan Rows": 1, "Plan Width": 0, "Actual Startup Time": 531.817, "Actual Total Time": 531.823, "Actual Rows": 0, "Actual Loops

In [25]:
df_export.loc[df_export.label == "33c"]

Unnamed: 0,label,query_ues,ues_plan,runtime_ues,ues_explain,subquery,subquery_pruned,query_flat,runtime_flat,foreign_key_rows,rows_after_join,subquery_runtime,subquery_partner_runtime,ues_pruned,flat_pruned,filter_strength,ues_speedup,tables,subquery_tables
110,33c,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [Seq Sc...,1.208609,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,1.189302,351960,18374,0.121368,0.695499,False,False,19.155328,0.984025,"[""movie_link"", ""link_type"", ""title"", ""title"", ...","[""movie_info_idx"", ""info_type""]"
111,33c,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [Seq Sc...,1.208609,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,1.189302,690018,229962,0.114501,0.564965,False,False,3.000574,0.984025,"[""movie_link"", ""link_type"", ""title"", ""title"", ...","[""movie_info_idx"", ""info_type""]"


In [26]:
df_export.loc[df_export.label == "33c"].iloc[0].query_ues

SELECT COUNT(*) FROM movie_link AS ml JOIN link_type AS lt ON lt.link IN ('sequel', 'follows', 'followed by') AND lt.id = ml.link_type_id JOIN title AS t2 ON t2.production_year BETWEEN 2000 AND 2010 AND t2.id = ml.linked_movie_id JOIN title AS t1 ON t1.id = ml.movie_id JOIN kind_type AS kt1 ON kt1.kind IN ('tv series', 'episode') AND kt1.id = t1.kind_id JOIN kind_type AS kt2 ON kt2.kind IN ('tv series', 'episode') AND kt2.id = t2.kind_id JOIN (SELECT movie_id FROM movie_info_idx AS mi_idx1 JOIN info_type AS it1 ON it1.info = 'rating' AND it1.id = mi_idx1.info_type_id) AS t_mi_idx1 ON t_mi_idx1.movie_id = ml.movie_id JOIN (SELECT movie_id FROM movie_info_idx AS mi_idx2 JOIN info_type AS it2 ON it2.info = 'rating' AND it2.id = mi_idx2.info_type_id AND mi_idx2.info < '3.5') AS t_mi_idx2 ON t_mi_idx2.movie_id = ml.linked_movie_id JOIN movie_companies AS mc2 ON mc2.movie_id = ml.linked_movie_id JOIN company_name AS cn2 ON cn2.id = mc2.company_id JOIN movie_companies AS mc1 ON mc1.movie_id =

In [27]:
print(df_export.loc[df_export.label == "33c"].iloc[0].ues_plan.pretty_print(include_filter=True))

Hash Join (cn1.id = mc1.company_id)
  <- Seq Scan :: company_name (((country_code)::text <> '[us]'::text))
  <- Hash Join (mc1.movie_id = t1.id)
    <- Seq Scan :: movie_companies
    <- Hash Join (cn2.id = mc2.company_id)
      <- Seq Scan :: company_name
      <- Hash Join (mc2.movie_id = t2.id)
        <- Seq Scan :: movie_companies
        <- Hash Join (mi_idx2.movie_id = t2.id)
          <- [SQ] Hash Join (mi_idx2.info_type_id = it2.id)
            <- Seq Scan :: movie_info_idx (((info)::text < '3.5'::text))
            <- Seq Scan :: info_type (((info)::text = 'rating'::text))
          <- Hash Join (mi_idx1.movie_id = t1.id)
            <- [SQ] Hash Join (mi_idx1.info_type_id = it1.id)
              <- Seq Scan :: movie_info_idx
              <- Seq Scan :: info_type (((info)::text = 'rating'::text))
            <- Hash Join (t2.kind_id = kt2.id)
              <- Hash Join (t1.kind_id = kt1.id)
                <- Merge Join (t1.id = ml.movie_id)
                  <- Index Scan :