# UES workload analysis

## Data loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import re
import warnings

import numpy as np
import pandas as pd
import seaborn as sns

from transform import db, mosp
from postgres import explain

In [3]:
sns.set_theme(style="whitegrid")

In [4]:
# Raw benchmark results. The preview below shows one row per executed query with its label, SQL text,
# EXPLAIN output (as a JSON string in query_result), total runtime (rt_total), run number and workload.
df_raw = pd.read_csv("workloads/job-ues-results-linearization.csv")

In [5]:
df_raw

Unnamed: 0,label,query,query_result,rt_total,run,workload
0,1a,select count(*) from movie_companies as mc j...,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",3.028541,1,ues
1,1b,select count(*) from movie_info_idx as mi_idx...,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",0.571985,1,ues
2,1c,select count(*) from movie_companies as mc j...,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",0.672483,1,ues
3,1d,select count(*) from movie_info_idx as mi_idx...,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",0.609533,1,ues
4,2a,select count(*) from movie_keyword as mk joi...,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",2.158585,1,ues
...,...,...,...,...,...,...
1125,32a,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",0.010962,5,ues_linearized
1126,32b,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",0.444687,5,ues_linearized
1127,33a,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",1.593583,5,ues_linearized
1128,33b,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",0.683118,5,ues_linearized


## Initial analysis

Get a rough overview of the workload performance:

In [6]:
# Sum the runtimes of all queries for every (workload, run) combination
wrkld_run_grp = (
    df_raw
    .groupby(["workload", "run"])["rt_total"]
    .sum()
)
wrkld_run_grp

workload        run
ues             1      287.526762
                2      270.411454
                3      271.353861
                4      265.445545
                5      267.553320
ues_linearized  1      332.505916
                2      314.632555
                3      313.981810
                4      313.843529
                5      314.405119
Name: rt_total, dtype: float64

The best runs per workload:

In [7]:
# idxmin() returns, per workload, the (workload, run) label of the smallest summed runtime;
# .loc then selects exactly these entries from the per-run totals.
min_runs = wrkld_run_grp.loc[
    wrkld_run_grp.groupby(level="workload").idxmin()
]
min_runs

workload        run
ues             4      265.445545
ues_linearized  4      313.843529
Name: rt_total, dtype: float64

Choose the fastest runs per setting as representatives.

In [8]:
# First, inner-join the (workload, run) columns of df_raw against the (workload, run) index of min_runs.
# Only rows that belong to the best run of their workload survive the join. right_index=True already
# names the join keys on the right-hand side, so no right_on is passed.
df_wrkld_run_idx = df_raw.merge(min_runs, how="inner", left_on=["workload", "run"], right_index=True).index

# Second, use the resulting index to extract just those rows from df_raw, without the superfluous
# columns that the merge added.
df = df_raw.loc[df_wrkld_run_idx].reset_index(drop=True).copy()
df

Unnamed: 0,label,query,query_result,rt_total,run,workload
0,1a,select count(*) from movie_companies as mc j...,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",0.792273,4,ues
1,1b,select count(*) from movie_info_idx as mi_idx...,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",0.553238,4,ues
2,1c,select count(*) from movie_companies as mc j...,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",0.636802,4,ues
3,1d,select count(*) from movie_info_idx as mi_idx...,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",0.619424,4,ues
4,2a,select count(*) from movie_keyword as mk joi...,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",1.095501,4,ues
...,...,...,...,...,...,...
221,32a,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",0.010631,4,ues_linearized
222,32b,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",0.441383,4,ues_linearized
223,33a,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",1.510462,4,ues_linearized
224,33b,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",0.688176,4,ues_linearized


## Feature calculation

In [9]:
def parse_query_plans(sample: pd.Series) -> explain.PlanNode:
    """Parse the EXPLAIN ANALYZE output of a single result row.

    Parameters
    ----------
    sample
        Row providing the entries ``"query"``, ``"query_result"`` and ``"workload"``.

    Returns
    -------
    explain.PlanNode
        The result of ``explain.parse_explain_analyze`` for the row. Only rows of the ``"ues"``
        workload are parsed with the parser's default subquery handling; all other rows are
        parsed with ``with_subqueries=False``.

    Warns
    -----
    UserWarning
        If the workload is neither ``"ues"`` nor ``"ues_linearized"``.
    """
    sql, raw_plan, workload_name = sample["query"], sample["query_result"], sample["workload"]

    if workload_name == "ues":
        return explain.parse_explain_analyze(sql, raw_plan)

    # Everything else is parsed without subqueries; unknown workloads additionally emit a warning.
    if workload_name != "ues_linearized":
        warnings.warn("Unknown workload '{}', assuming no subqueries".format(workload_name))
    return explain.parse_explain_analyze(sql, raw_plan, with_subqueries=False)

In [10]:
# NOTE: this cell rebinds df and expects query_result to still hold JSON strings, so it cannot be
# executed twice without re-running the cell that creates df.
df = (
    df
    .assign(
        # decode the EXPLAIN output and parse the SQL text
        query_result=lambda frame: frame["query_result"].apply(json.loads),
        query=lambda frame: frame["query"].apply(mosp.MospQuery.parse),
        # keep the decoded EXPLAIN output around before query_result is replaced by the parsed plan
        explain=lambda frame: frame["query_result"],
    )
    .assign(query_result=lambda frame: frame.apply(parse_query_plans, axis="columns"))
    .assign(subquery=lambda frame: frame["query_result"].apply(explain.PlanNode.extract_subqueries))
    # one row per extracted subquery; the preview below shows an empty subquery cell for plans without one
    .explode("subquery")
)
df



Unnamed: 0,label,query,query_result,rt_total,run,workload,explain,subquery
0,1a,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.792273,4,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...
1,1b,SELECT COUNT(*) FROM movie_info_idx AS mi_idx ...,Hash Join (mc.company_type_id = ct.id) <- [Has...,0.553238,4,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",
2,1c,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.636802,4,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...
3,1d,SELECT COUNT(*) FROM movie_info_idx AS mi_idx ...,Hash Join (mc.company_type_id = ct.id) <- [Has...,0.619424,4,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",
4,2a,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,Hash Join (cn.id = mc.company_id) <- [Seq Scan...,1.095501,4,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",
...,...,...,...,...,...,...,...,...
221,32a,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mk.keyword_id = k.id) <- [~Merge Jo...,0.010631,4,ues_linearized,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",
222,32b,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mk.keyword_id = k.id) <- [Merge Joi...,0.441383,4,ues_linearized,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",
223,33a,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.510462,4,ues_linearized,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",
224,33b,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.688176,4,ues_linearized,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",


In [11]:
# Split the rows by the workload that produced them (independent copies of each part)
df_ues = df.loc[df["workload"] == "ues"].copy()
df_ues_lin = df.loc[df["workload"] == "ues_linearized"].copy()

In [12]:
# Keep only the UES rows that actually carry a subquery and record whether that subquery reports pruning
df_sqs = df_ues.loc[df_ues["subquery"].notna()].copy()
df_sqs["pruned"] = df_sqs["subquery"].apply(lambda plan_node: plan_node.any_pruned())
len(df_sqs)

112

In [13]:
df_sqs

Unnamed: 0,label,query,query_result,rt_total,run,workload,explain,subquery,pruned
0,1a,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.792273,4,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...,False
2,1c,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.636802,4,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...,False
6,2c,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,Hash Join (mc.movie_id = t.id) <- [Hash Join (...,0.978012,4,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mc.company_id = cn.id) <- [Seq Scan...,False
8,3a,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,Hash Join (mk.movie_id = t.id) <- [~Hash Join~...,2.239086,4,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",~Hash Join~ (mk.keyword_id = k.id) <- [~Seq Sc...,True
9,3b,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,Hash Join (mk.movie_id = t.id) <- [~Hash Join~...,1.081561,4,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",~Hash Join~ (mk.keyword_id = k.id) <- [~Seq Sc...,True
...,...,...,...,...,...,...,...,...,...
110,33a,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.637275,4,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False
111,33b,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.689398,4,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False
111,33b,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.689398,4,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False
112,33c,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.861926,4,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False


In [14]:
# Pair every UES subquery row with the linearized execution of the same query (matched on label).
# Columns present on both sides receive the suffixes _ues / _ues_lin.
df_ues_wide = pd.merge(
    df_sqs.drop(columns=["run", "workload"]),
    df_ues_lin.drop(columns=["run", "workload", "subquery"]),
    on="label", how="inner",
    suffixes=("_ues", "_ues_lin"))

# Join predicate of each subquery node; later cells use it to look the subquery up in the full UES plan.
df_ues_wide["sq_pred"] = df_ues_wide["subquery"].apply(lambda sq: sq.join_pred)
len(df_ues_wide)

112

In [15]:
df_ues_wide

Unnamed: 0,label,query_ues,query_result_ues,rt_total_ues,explain_ues,subquery,pruned,query_ues_lin,query_result_ues_lin,rt_total_ues_lin,explain_ues_lin,sq_pred
0,1a,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.792273,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...,False,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.info_type_id = it.id) <- [Ha...,0.833403,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mi_idx.info_type_id = it.id)
1,1c,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.636802,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...,False,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.info_type_id = it.id) <- [Ha...,0.653331,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mi_idx.info_type_id = it.id)
2,2c,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,Hash Join (mc.movie_id = t.id) <- [Hash Join (...,0.978012,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mc.company_id = cn.id) <- [Seq Scan...,False,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,Hash Join (mc.company_id = cn.id) <- [Hash Joi...,1.061340,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mc.company_id = cn.id)
3,3a,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,Hash Join (mk.movie_id = t.id) <- [~Hash Join~...,2.239086,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",~Hash Join~ (mk.keyword_id = k.id) <- [~Seq Sc...,True,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,Hash Join (mk.keyword_id = k.id) <- [Hash Join...,2.247282,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mk.keyword_id = k.id)
4,3b,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,Hash Join (mk.movie_id = t.id) <- [~Hash Join~...,1.081561,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",~Hash Join~ (mk.keyword_id = k.id) <- [~Seq Sc...,True,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,Hash Join (mk.keyword_id = k.id) <- [Hash Join...,1.099163,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mk.keyword_id = k.id)
...,...,...,...,...,...,...,...,...,...,...,...,...
107,33a,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.637275,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.510462,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mi_idx1.info_type_id = it1.id)
108,33b,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.689398,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.688176,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mi_idx2.info_type_id = it2.id)
109,33b,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.689398,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.688176,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mi_idx1.info_type_id = it1.id)
110,33c,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.861926,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.813502,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mi_idx2.info_type_id = it2.id)


In [16]:
def subquery_join_partner(subquery):
    """Return the sibling of ``subquery`` below their common parent node.

    The children of ``subquery.parent`` are compared by join predicate: if the right child has
    the same ``join_pred`` as ``subquery``, the left child is returned, otherwise the right one.
    """
    join_node = subquery.parent
    first_child, second_child = join_node.left, join_node.right
    if subquery.join_pred == second_child.join_pred:
        return first_child
    return second_child

In [17]:
# Derive per-subquery features from the UES plan stored in query_result_ues. The columns are added
# in the order given below; later entries read columns created by earlier ones.
df_ues_wide = df_ues_wide.assign(
    # --- base table of the subquery, its scan node in the plan, and that scan's proc_rows
    sq_fk_partner=lambda wide: (
        wide.apply(lambda row: row["query_result_ues"].lookup_subquery(row["sq_pred"]), axis="columns")
        .apply(lambda sq: sq.base_table())
    ),
    sq_fk_table=lambda wide: wide.apply(
        lambda row: row["query_result_ues"].lookup_scan(row["sq_fk_partner"]), axis="columns"),
    sq_fk_rows=lambda wide: wide["sq_fk_table"].apply(lambda scan: scan.proc_rows),

    # --- join node matching the subquery predicate, its proc_rows and duration
    sq_join=lambda wide: wide.apply(
        lambda row: row["query_result_ues"].lookup_join(row["sq_pred"]), axis="columns"),
    sq_join_rows=lambda wide: wide["sq_join"].apply(lambda node: node.proc_rows),
    # first entry of exec_time divided by 1000 -- presumably milliseconds to seconds, TODO confirm
    sq_join_duration=lambda wide: wide["sq_join"].apply(lambda node: node.exec_time[0] / 1000),

    # --- sibling of the subquery join below the common parent, and its duration
    sq_join_partner=lambda wide: wide["sq_join"].apply(subquery_join_partner),
    sq_join_partner_duration=lambda wide: wide["sq_join_partner"].apply(lambda node: node.exec_time[0] / 1000),

    # --- pruning flags of both plans, evaluated with exclude_subqueries=True
    node_pruned_ues=lambda wide: wide["query_result_ues"].apply(
        lambda plan: plan.any_pruned(exclude_subqueries=True)),
    node_pruned_ues_lin=lambda wide: wide["query_result_ues_lin"].apply(
        lambda plan: plan.any_pruned(exclude_subqueries=True)),
)

In [18]:
df_ues_wide

Unnamed: 0,label,query_ues,query_result_ues,rt_total_ues,explain_ues,subquery,pruned,query_ues_lin,query_result_ues_lin,rt_total_ues_lin,...,sq_fk_partner,sq_fk_table,sq_fk_rows,sq_join,sq_join_rows,sq_join_duration,sq_join_partner,sq_join_partner_duration,node_pruned_ues,node_pruned_ues_lin
0,1a,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.792273,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...,False,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.info_type_id = it.id) <- [Ha...,0.833403,...,movie_info_idx AS mi_idx,Seq Scan :: movie_info_idx,789242,Hash Join (mi_idx.info_type_id = it.id) <- [Se...,83,0.107220,Hash Join (t.id = mc.movie_id) <- [Seq Scan ::...,0.658890,False,False
1,1c,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.636802,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...,False,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.info_type_id = it.id) <- [Ha...,0.653331,...,movie_info_idx AS mi_idx,Seq Scan :: movie_info_idx,789242,Hash Join (mi_idx.info_type_id = it.id) <- [Se...,83,0.108803,Hash Join (t.id = mc.movie_id) <- [Seq Scan ::...,0.500195,False,False
2,2c,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,Hash Join (mc.movie_id = t.id) <- [Hash Join (...,0.978012,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mc.company_id = cn.id) <- [Seq Scan...,False,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,Hash Join (mc.company_id = cn.id) <- [Hash Joi...,1.061340,...,movie_companies AS mc,Seq Scan :: movie_companies,1652765,Hash Join (mc.company_id = cn.id) <- [Seq Scan...,2,0.247829,Hash Join (t.id = mk.movie_id) <- [Seq Scan ::...,0.691120,False,False
3,3a,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,Hash Join (mk.movie_id = t.id) <- [~Hash Join~...,2.239086,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",~Hash Join~ (mk.keyword_id = k.id) <- [~Seq Sc...,True,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,Hash Join (mk.keyword_id = k.id) <- [Hash Join...,2.247282,...,movie_keyword AS mk,~Seq Scan~ :: movie_keyword,0,~Hash Join~ (mk.keyword_id = k.id) <- [~Seq Sc...,0,0.000000,Hash Join (mi.movie_id = t.id) <- [Seq Scan ::...,2.222914,True,True
4,3b,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,Hash Join (mk.movie_id = t.id) <- [~Hash Join~...,1.081561,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",~Hash Join~ (mk.keyword_id = k.id) <- [~Seq Sc...,True,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,Hash Join (mk.keyword_id = k.id) <- [Hash Join...,1.099163,...,movie_keyword AS mk,~Seq Scan~ :: movie_keyword,0,~Hash Join~ (mk.keyword_id = k.id) <- [~Seq Sc...,0,0.000000,Hash Join (mi.movie_id = t.id) <- [Seq Scan ::...,1.068213,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,33a,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.637275,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.510462,...,movie_info_idx AS mi_idx1,Seq Scan :: movie_info_idx,789242,Hash Join (mi_idx1.info_type_id = it1.id) <- [...,263052,0.133475,Hash Join (t2.kind_id = kt2.id) <- [Hash Join ...,1.467764,True,True
108,33b,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.689398,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.688176,...,movie_info_idx AS mi_idx2,Seq Scan :: movie_info_idx,582,Hash Join (mi_idx2.info_type_id = it2.id) <- [...,1,0.000260,Hash Join (mi_idx1.movie_id = t1.id) <- [Hash ...,0.670848,False,False
109,33b,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.689398,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.688176,...,movie_info_idx AS mi_idx1,Seq Scan :: movie_info_idx,3,Hash Join (mi_idx1.info_type_id = it1.id) <- [...,1,0.000017,Hash Join (t2.kind_id = kt2.id) <- [Hash Join ...,0.670828,False,False
110,33c,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.861926,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.813502,...,movie_info_idx AS mi_idx2,Seq Scan :: movie_info_idx,383765,Hash Join (mi_idx2.info_type_id = it2.id) <- [...,6886,0.140734,Hash Join (mi_idx1.movie_id = t1.id) <- [Hash ...,1.711804,True,True


In [19]:
# Filter strength: ratio / difference between sq_fk_rows and sq_join_rows.
# Rows where both counts are 0 end up as NaN in the relative variant (see 3a/3b in the output).
df_ues_wide["filter_strength_rel"] = df_ues_wide["sq_fk_rows"].div(df_ues_wide["sq_join_rows"])
df_ues_wide["filter_strength_abs"] = df_ues_wide["sq_fk_rows"].sub(df_ues_wide["sq_join_rows"])

# Speedup: linearized runtime relative to / minus the UES runtime, i.e. values above 1 (or above 0
# for the absolute variant) mean that the linearized query took longer.
df_ues_wide["sq_speedup_rel"] = df_ues_wide["rt_total_ues_lin"].div(df_ues_wide["rt_total_ues"])
df_ues_wide["sq_speedup_abs"] = df_ues_wide["rt_total_ues_lin"].sub(df_ues_wide["rt_total_ues"])

df_ues_wide[[
    "label",
    "subquery",
    "sq_fk_rows",
    "sq_join_rows",
    "rt_total_ues",
    "rt_total_ues_lin",
    "filter_strength_rel",
    "sq_speedup_rel",
]]

Unnamed: 0,label,subquery,sq_fk_rows,sq_join_rows,rt_total_ues,rt_total_ues_lin,filter_strength_rel,sq_speedup_rel
0,1a,Hash Join (mi_idx.info_type_id = it.id) <- [Se...,789242,83,0.792273,0.833403,9508.939759,1.051914
1,1c,Hash Join (mi_idx.info_type_id = it.id) <- [Se...,789242,83,0.636802,0.653331,9508.939759,1.025956
2,2c,Hash Join (mc.company_id = cn.id) <- [Seq Scan...,1652765,2,0.978012,1.061340,826382.500000,1.085201
3,3a,~Hash Join~ (mk.keyword_id = k.id) <- [~Seq Sc...,0,0,2.239086,2.247282,,1.003660
4,3b,~Hash Join~ (mk.keyword_id = k.id) <- [~Seq Sc...,0,0,1.081561,1.099163,,1.016275
...,...,...,...,...,...,...,...,...
107,33a,Hash Join (mi_idx1.info_type_id = it1.id) <- [...,789242,263052,1.637275,1.510462,3.000327,0.922546
108,33b,Hash Join (mi_idx2.info_type_id = it2.id) <- [...,582,1,0.689398,0.688176,582.000000,0.998227
109,33b,Hash Join (mi_idx1.info_type_id = it1.id) <- [...,3,1,0.689398,0.688176,3.000000,0.998227
110,33c,Hash Join (mi_idx2.info_type_id = it2.id) <- [...,383765,6886,1.861926,1.813502,55.731194,0.973993


In [28]:
# Build the export data set: drop the helper columns, switch to the export column names and add
# JSON-encoded table lists. filter_strength_rel / filter_strength_abs keep their names.
df_export = (
    df_ues_wide
    .drop(columns=[
        "query_result_ues_lin",
        "sq_pred",
        "sq_fk_partner",
        "sq_fk_table",
        "sq_join",
        "sq_join_partner",
        "explain_ues_lin",
    ])
    .rename(columns={
        "query_result_ues": "ues_plan",
        "explain_ues": "ues_explain",
        "sq_fk_rows": "foreign_key_rows",
        "sq_join_rows": "subquery_rows",
        "rt_total_ues": "runtime_ues",
        "rt_total_ues_lin": "runtime_ues_linear",
        "sq_speedup_rel": "ues_speedup_rel",
        "sq_speedup_abs": "ues_speedup_abs",
        "query_ues_lin": "query_ues_linear",
        "sq_join_duration": "subquery_runtime",
        "sq_join_partner_duration": "subquery_partner_runtime",
        "pruned": "subquery_pruned",
        "node_pruned_ues": "ues_pruned",
        "node_pruned_ues_lin": "ues_linear_pruned",
    })
    .assign(
        # full names of all tables referenced by the query
        tables=lambda export: export["query_ues"].apply(
            lambda q: json.dumps([tab.full_name for tab in q.collect_tables()])),
        # full names of the tables collected from the subquery node
        subquery_tables=lambda export: (
            export["subquery"]
            .apply(lambda sq: sq.lookup_subquery(sq.join_pred).collect_tables())
            .apply(lambda sq_tabs: json.dumps([tab.full_name for tab in sq_tabs]))
        ),
        # serialize the decoded EXPLAIN output back into a JSON string for the CSV file
        ues_explain=lambda export: export["ues_explain"].apply(json.dumps),
    )
)

df_export.to_csv("workloads/job-ues-eval-linearization.csv", index=False)
df_export

Unnamed: 0,label,query_ues,ues_plan,runtime_ues,ues_explain,subquery,subquery_pruned,query_ues_linear,runtime_ues_linear,foreign_key_rows,...,subquery_runtime,subquery_partner_runtime,ues_pruned,ues_linear_pruned,filter_strength_rel,filter_strength_abs,ues_speedup_rel,ues_speedup_abs,tables,subquery_tables
0,1a,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.792273,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...,False,SELECT COUNT(*) FROM movie_companies AS mc JOI...,0.833403,789242,...,0.107220,0.658890,False,False,9508.939759,789159,1.051914,0.041130,"[""movie_companies"", ""company_type"", ""title"", ""...","[""movie_info_idx"", ""info_type""]"
1,1c,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.636802,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...,False,SELECT COUNT(*) FROM movie_companies AS mc JOI...,0.653331,789242,...,0.108803,0.500195,False,False,9508.939759,789159,1.025956,0.016529,"[""movie_companies"", ""company_type"", ""title"", ""...","[""movie_info_idx"", ""info_type""]"
2,2c,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,Hash Join (mc.movie_id = t.id) <- [Hash Join (...,0.978012,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mc.company_id = cn.id) <- [Seq Scan...,False,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,1.061340,1652765,...,0.247829,0.691120,False,False,826382.500000,1652763,1.085201,0.083328,"[""movie_keyword"", ""keyword"", ""title"", ""movie_c...","[""movie_companies"", ""company_name""]"
3,3a,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,Hash Join (mk.movie_id = t.id) <- [~Hash Join~...,2.239086,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",~Hash Join~ (mk.keyword_id = k.id) <- [~Seq Sc...,True,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,2.247282,0,...,0.000000,2.222914,True,True,,0,1.003660,0.008196,"[""movie_info"", ""title"", ""movie_keyword"", ""keyw...","[""movie_keyword"", ""keyword""]"
4,3b,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,Hash Join (mk.movie_id = t.id) <- [~Hash Join~...,1.081561,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",~Hash Join~ (mk.keyword_id = k.id) <- [~Seq Sc...,True,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,1.099163,0,...,0.000000,1.068213,True,True,,0,1.016275,0.017602,"[""movie_info"", ""title"", ""movie_keyword"", ""keyw...","[""movie_keyword"", ""keyword""]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,33a,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.637275,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,1.510462,789242,...,0.133475,1.467764,True,True,3.000327,526190,0.922546,-0.126813,"[""movie_link"", ""link_type"", ""title"", ""title"", ...","[""movie_info_idx"", ""info_type""]"
108,33b,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.689398,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,0.688176,582,...,0.000260,0.670848,False,False,582.000000,581,0.998227,-0.001222,"[""movie_link"", ""link_type"", ""title"", ""title"", ...","[""movie_info_idx"", ""info_type""]"
109,33b,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,0.689398,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,0.688176,3,...,0.000017,0.670828,False,False,3.000000,2,0.998227,-0.001222,"[""movie_link"", ""link_type"", ""title"", ""title"", ...","[""movie_info_idx"", ""info_type""]"
110,33c,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [~Seq S...,1.861926,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,1.813502,383765,...,0.140734,1.711804,True,True,55.731194,376879,0.973993,-0.048424,"[""movie_link"", ""link_type"", ""title"", ""title"", ...","[""movie_info_idx"", ""info_type""]"


## Some sanity checks for the export data set

In [19]:
# Number of exported rows (i.e. subqueries) per query label, most frequent first
(
    df_export
    .groupby("label")
    .size()
    .rename("count")
    .reset_index()
    .sort_values(by="count", ascending=False)
)

Unnamed: 0,label,count
58,31c,3
56,31a,3
39,25c,2
61,33a,2
57,31b,2
...,...,...
35,23c,1
36,24b,1
38,25b,1
40,26a,1


In [20]:
# Sanity check: the relative filter strength is NaN when both row counts are 0, which should only
# happen for pruned subqueries -- this selection is therefore expected to be empty.
# The column is called filter_strength_rel in df_export; a plain "filter_strength" column does not exist.
df_export.loc[df_export["filter_strength_rel"].isna() & ~df_export["subquery_pruned"]]

Unnamed: 0,label,query_ues,ues_plan,runtime_ues,ues_explain,subquery,subquery_pruned,query_flat,runtime_flat,foreign_key_rows,rows_after_join,subquery_runtime,subquery_partner_runtime,ues_pruned,flat_pruned,filter_strength,ues_speedup,tables,subquery_tables


In [21]:
# All exported rows of query 6b
df_export.loc[df_export["label"].eq("6b")]

Unnamed: 0,label,query_ues,ues_plan,runtime_ues,ues_explain,subquery,subquery_pruned,query_flat,runtime_flat,foreign_key_rows,rows_after_join,subquery_runtime,subquery_partner_runtime,ues_pruned,flat_pruned,filter_strength,ues_speedup,tables,subquery_tables
10,6b,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,Hash Join (ci.movie_id = t.id) <- [Hash Join (...,0.536543,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (ci.person_id = n.id) <- [~Seq Scan~...,True,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,2.252002,0,0,0.181875,0.349454,True,False,,4.197244,"[""movie_keyword"", ""keyword"", ""title"", ""cast_in...","[""cast_info"", ""name""]"


In [22]:
# UES query of the first exported row of query 20b
df_export.loc[df_export["label"].eq("20b"), "query_ues"].iloc[0]

SELECT COUNT(*) FROM complete_cast AS cc JOIN comp_cast_type AS cct1 ON cct1.kind = 'cast' AND cct1.id = cc.subject_id JOIN comp_cast_type AS cct2 ON cct2.kind LIKE '%complete%' AND cct2.id = cc.status_id JOIN title AS t ON t.production_year > 2000 AND t.id = cc.movie_id JOIN kind_type AS kt ON kt.kind = 'movie' AND kt.id = t.kind_id JOIN (SELECT movie_id FROM movie_keyword AS mk JOIN keyword AS k ON k.keyword IN ('superhero', 'sequel', 'second-part', 'marvel-comics', 'based-on-comic', 'tv-special', 'fight', 'violence') AND k.id = mk.keyword_id) AS t_mk ON t_mk.movie_id = cc.movie_id JOIN (SELECT person_id, movie_id FROM cast_info AS ci JOIN name AS n ON n.name LIKE '%downey%robert%' AND n.id = ci.person_id JOIN char_name AS chn ON chn.name NOT LIKE '%sherlock%' AND (chn.name LIKE '%tony%stark%' OR chn.name LIKE '%iron%man%') AND chn.id = ci.person_role_id) AS t_ci ON t_ci.movie_id = t_mk.movie_id

In [23]:
# Pretty-printed UES plan (including filter predicates) of the first exported row of query 6b
print(
    df_export.loc[df_export["label"].eq("6b"), "ues_plan"]
    .iloc[0]
    .pretty_print(include_filter=True)
)

Hash Join (ci.movie_id = t.id)
  <- [SQ] Hash Join (ci.person_id = n.id)
    <- [PRUNED] Seq Scan :: cast_info
    <- Seq Scan :: name (((name)::text ~~ '%downey%robert%'::text))
  <- Hash Join (t.id = mk.movie_id)
    <- Seq Scan :: title ((production_year > 2014))
    <- Hash Join (mk.keyword_id = k.id)
      <- Seq Scan :: movie_keyword
      <- Seq Scan :: keyword (((keyword)::text = ANY ('{superhero,sequel,second-part,marvel-comics,based-on-comic,tv-special,fight,violence}'::text[])))



In [24]:
# JSON-encoded EXPLAIN output of the first exported row of query 6b
df_export.loc[df_export["label"].eq("6b"), "ues_explain"].iloc[0]

'[{"Plan": {"Node Type": "Aggregate", "Strategy": "Plain", "Partial Mode": "Simple", "Parallel Aware": false, "Async Capable": false, "Startup Cost": 622730.99, "Total Cost": 622731.0, "Plan Rows": 1, "Plan Width": 8, "Actual Startup Time": 534.419, "Actual Total Time": 535.652, "Actual Rows": 1, "Actual Loops": 1, "Plans": [{"Node Type": "Gather", "Parent Relationship": "Outer", "Parallel Aware": false, "Async Capable": false, "Startup Cost": 179377.81, "Total Cost": 622730.98, "Plan Rows": 1, "Plan Width": 0, "Actual Startup Time": 534.414, "Actual Total Time": 535.647, "Actual Rows": 0, "Actual Loops": 1, "Workers Planned": 2, "Workers Launched": 2, "Single Copy": false, "Plans": [{"Node Type": "Hash Join", "Parent Relationship": "Outer", "Parallel Aware": true, "Async Capable": false, "Join Type": "Inner", "Startup Cost": 178377.81, "Total Cost": 621730.88, "Plan Rows": 1, "Plan Width": 0, "Actual Startup Time": 531.817, "Actual Total Time": 531.823, "Actual Rows": 0, "Actual Loops

In [25]:
# All exported rows of query 33c (one row per subquery)
df_export.loc[df_export["label"].eq("33c")]

Unnamed: 0,label,query_ues,ues_plan,runtime_ues,ues_explain,subquery,subquery_pruned,query_flat,runtime_flat,foreign_key_rows,rows_after_join,subquery_runtime,subquery_partner_runtime,ues_pruned,flat_pruned,filter_strength,ues_speedup,tables,subquery_tables
110,33c,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [Seq Sc...,1.208609,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,1.189302,351960,18374,0.121368,0.695499,False,False,19.155328,0.984025,"[""movie_link"", ""link_type"", ""title"", ""title"", ...","[""movie_info_idx"", ""info_type""]"
111,33c,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [Seq Sc...,1.208609,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,1.189302,690018,229962,0.114501,0.564965,False,False,3.000574,0.984025,"[""movie_link"", ""link_type"", ""title"", ""title"", ...","[""movie_info_idx"", ""info_type""]"


In [26]:
# UES query of the first exported row of query 33c
df_export.loc[df_export["label"].eq("33c"), "query_ues"].iloc[0]

SELECT COUNT(*) FROM movie_link AS ml JOIN link_type AS lt ON lt.link IN ('sequel', 'follows', 'followed by') AND lt.id = ml.link_type_id JOIN title AS t2 ON t2.production_year BETWEEN 2000 AND 2010 AND t2.id = ml.linked_movie_id JOIN title AS t1 ON t1.id = ml.movie_id JOIN kind_type AS kt1 ON kt1.kind IN ('tv series', 'episode') AND kt1.id = t1.kind_id JOIN kind_type AS kt2 ON kt2.kind IN ('tv series', 'episode') AND kt2.id = t2.kind_id JOIN (SELECT movie_id FROM movie_info_idx AS mi_idx1 JOIN info_type AS it1 ON it1.info = 'rating' AND it1.id = mi_idx1.info_type_id) AS t_mi_idx1 ON t_mi_idx1.movie_id = ml.movie_id JOIN (SELECT movie_id FROM movie_info_idx AS mi_idx2 JOIN info_type AS it2 ON it2.info = 'rating' AND it2.id = mi_idx2.info_type_id AND mi_idx2.info < '3.5') AS t_mi_idx2 ON t_mi_idx2.movie_id = ml.linked_movie_id JOIN movie_companies AS mc2 ON mc2.movie_id = ml.linked_movie_id JOIN company_name AS cn2 ON cn2.id = mc2.company_id JOIN movie_companies AS mc1 ON mc1.movie_id =

In [27]:
# Pretty-printed UES plan (including filter predicates) of the first exported row of query 33c
print(
    df_export.loc[df_export["label"].eq("33c"), "ues_plan"]
    .iloc[0]
    .pretty_print(include_filter=True)
)

Hash Join (cn1.id = mc1.company_id)
  <- Seq Scan :: company_name (((country_code)::text <> '[us]'::text))
  <- Hash Join (mc1.movie_id = t1.id)
    <- Seq Scan :: movie_companies
    <- Hash Join (cn2.id = mc2.company_id)
      <- Seq Scan :: company_name
      <- Hash Join (mc2.movie_id = t2.id)
        <- Seq Scan :: movie_companies
        <- Hash Join (mi_idx2.movie_id = t2.id)
          <- [SQ] Hash Join (mi_idx2.info_type_id = it2.id)
            <- Seq Scan :: movie_info_idx (((info)::text < '3.5'::text))
            <- Seq Scan :: info_type (((info)::text = 'rating'::text))
          <- Hash Join (mi_idx1.movie_id = t1.id)
            <- [SQ] Hash Join (mi_idx1.info_type_id = it1.id)
              <- Seq Scan :: movie_info_idx
              <- Seq Scan :: info_type (((info)::text = 'rating'::text))
            <- Hash Join (t2.kind_id = kt2.id)
              <- Hash Join (t1.kind_id = kt1.id)
                <- Merge Join (t1.id = ml.movie_id)
                  <- Index Scan :