# UES workload analysis

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import re
import warnings

import numpy as np
import pandas as pd
import seaborn as sns

from transform import db, mosp
from postgres import explain

In [3]:
sns.set_theme(style="whitegrid")

In [4]:
# Raw benchmark results. Later cells read the columns `run`, `workload`,
# `rt_total`, `label`, `query` and `result` from this frame.
df_raw = pd.read_csv("workloads/job-ues-results-fks-nonlj.csv")

In [5]:
# Summed runtime of every (run, workload) combination.
df_raw.groupby(["run", "workload"])["rt_total"].sum()

run  workload   
1    transformed    181.697190
     ues            149.424562
2    transformed    180.440391
     ues            149.746419
3    transformed    181.162909
     ues            148.810540
Name: rt_total, dtype: float64

For each workload, choose the run with the lowest total runtime as its representative (here: run 3 for `ues` and run 2 for `transformed`).

In [6]:
# Pick the representative run of each workload from the data instead of
# hard-coding run numbers: the run with the smallest summed `rt_total` wins.
# `idxmin` returns the first minimum, so ties resolve to the first run column.
rt_per_run = df_raw.groupby(["workload", "run"])["rt_total"].sum()
fastest_run = rt_per_run.unstack("run").idxmin(axis="columns")

repr_trans = df_raw.loc[(df_raw.workload == "transformed") & (df_raw.run == fastest_run["transformed"])]
repr_ues = df_raw.loc[(df_raw.workload == "ues") & (df_raw.run == fastest_run["ues"])]

# UES rows first, followed by the transformed rows.
df = pd.concat([repr_ues, repr_trans])

In [7]:
def parse_query_plans(sample: pd.Series) -> explain.PlanNode:
    """Parse the plan stored in one result row via `explain.parse_explain_analyze`.

    Parameters
    ----------
    sample
        A row providing the entries ``"query"``, ``"result"`` and ``"workload"``.

    Returns
    -------
    The value returned by `explain.parse_explain_analyze` for the row's query
    and result.

    Notes
    -----
    Rows of the ``"ues"`` workload are parsed with the parser's default
    subquery handling. Rows of the ``"transformed"`` workload are parsed with
    ``with_subqueries=False``. Any other workload is treated like
    ``"transformed"`` after emitting a warning.
    """
    query_text = sample["query"]
    raw_plan = sample["result"]
    workload_name = sample["workload"]

    if workload_name == "ues":
        return explain.parse_explain_analyze(query_text, raw_plan)

    # Everything that is not a known workload falls back to the flat parsing mode.
    if workload_name != "transformed":
        warnings.warn("Unknown workload '{}', assuming no subqueries".format(workload_name))
    return explain.parse_explain_analyze(query_text, raw_plan, with_subqueries=False)

In [8]:
# Decode the JSON-encoded results and parse the query texts. The decoded JSON
# is kept in `explain` before `result` is overwritten with the parsed plan.
df["result"] = df["result"].apply(json.loads)
df["query"] = df["query"].apply(mosp.MospQuery.parse)
df["explain"] = df["result"]
df["result"] = df.apply(parse_query_plans, axis="columns")

# Extract the subqueries of every plan and give each of them its own row.
df["subquery"] = df["result"].apply(explain.PlanNode.extract_subqueries)
df = df.explode(column="subquery")
df



Unnamed: 0,label,query,result,rt_total,run,workload,explain,subquery
226,1a,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.439545,3,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...
227,1b,SELECT COUNT(*) FROM movie_info_idx AS mi_idx ...,Hash Join (mc.company_type_id = ct.id) <- [Has...,0.325870,3,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",
228,1c,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.348425,3,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...
229,1d,SELECT COUNT(*) FROM movie_info_idx AS mi_idx ...,Hash Join (mc.company_type_id = ct.id) <- [Has...,0.328748,3,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",
230,2a,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,Hash Join (cn.id = mc.company_id) <- [Seq Scan...,0.848735,3,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",
...,...,...,...,...,...,...,...,...
560,32a,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mk.keyword_id = k.id) <- [Merge Joi...,0.444229,2,transformed,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",
561,32b,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mk.keyword_id = k.id) <- [Merge Joi...,0.432801,2,transformed,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",
562,33a,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [Seq Sc...,1.064475,2,transformed,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",
563,33b,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,1.292542,2,transformed,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",


In [9]:
# Independent copies per workload, so columns added later do not touch `df`.
df_ues = df.loc[df["workload"] == "ues"].copy()
df_trans = df.loc[df["workload"] == "transformed"].copy()

In [10]:
# Keep only the UES rows that actually carry a subquery and record whether
# the subquery reports any pruned node.
df_sqs = df_ues.loc[df_ues["subquery"].notna()].copy()
df_sqs["pruned"] = df_sqs["subquery"].apply(lambda node: node.any_pruned())
len(df_sqs)

112

In [11]:
df_sqs

Unnamed: 0,label,query,result,rt_total,run,workload,explain,subquery,pruned
226,1a,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.439545,3,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...,False
228,1c,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.348425,3,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...,False
232,2c,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,Hash Join (mc.movie_id = t.id) <- [Hash Join (...,0.561410,3,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mc.company_id = cn.id) <- [Seq Scan...,False
234,3a,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,Hash Join (t.id = mk.movie_id) <- [Hash Join (...,1.539704,3,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mk.keyword_id = k.id) <- [Seq Scan ...,False
235,3b,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,Hash Join (mk.movie_id = t.id) <- [~Hash Join~...,0.572723,3,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",~Hash Join~ (mk.keyword_id = k.id) <- [~Seq Sc...,True
...,...,...,...,...,...,...,...,...,...
336,33a,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [Seq Sc...,1.082079,3,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False
337,33b,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,1.360770,3,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False
337,33b,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,1.360770,3,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False
338,33c,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [Seq Sc...,1.208609,3,ues,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False


In [12]:
# Pair every UES subquery row with the transformed run of the same query.
# Columns that exist in both frames receive the `_orig` (UES) and `_trans`
# (transformed) suffixes, e.g. `rt_total_orig` / `rt_total_trans`.
df_cmp = pd.merge(
    df_sqs.drop(columns=["run", "workload"]),
    df_trans.drop(columns=["run", "workload", "subquery"]),
    on="label", how="inner",
    suffixes=("_orig", "_trans"))

# Join predicate of each subquery; used below to look nodes up in the full plan.
df_cmp["sq_pred"] = df_cmp["subquery"].apply(lambda sq: sq.join_pred)

# The previous `df.rename({"rt_total_orig": ..., "rt_total_trans": ...}, inplace=True)`
# was dropped: it targeted `df`, which has neither column, so it never had any
# effect, and the following cells rely on the un-renamed `rt_total_*` names.
len(df_cmp)

112

In [13]:
df_cmp

Unnamed: 0,label,query_orig,result_orig,rt_total_orig,explain_orig,subquery,pruned,query_trans,result_trans,rt_total_trans,explain_trans,sq_pred
0,1a,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.439545,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...,False,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.info_type_id = it.id) <- [Ha...,0.459900,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mi_idx.info_type_id = it.id)
1,1c,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.348425,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...,False,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.info_type_id = it.id) <- [Ha...,0.448824,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mi_idx.info_type_id = it.id)
2,2c,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,Hash Join (mc.movie_id = t.id) <- [Hash Join (...,0.561410,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mc.company_id = cn.id) <- [Seq Scan...,False,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,Hash Join (mc.company_id = cn.id) <- [Hash Joi...,0.587651,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mc.company_id = cn.id)
3,3a,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,Hash Join (t.id = mk.movie_id) <- [Hash Join (...,1.539704,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mk.keyword_id = k.id) <- [Seq Scan ...,False,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,Hash Join (mk.keyword_id = k.id) <- [Hash Join...,1.292148,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mk.keyword_id = k.id)
4,3b,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,Hash Join (mk.movie_id = t.id) <- [~Hash Join~...,0.572723,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",~Hash Join~ (mk.keyword_id = k.id) <- [~Seq Sc...,True,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,Hash Join (mk.keyword_id = k.id) <- [Hash Join...,0.591094,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mk.keyword_id = k.id)
...,...,...,...,...,...,...,...,...,...,...,...,...
107,33a,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [Seq Sc...,1.082079,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [Seq Sc...,1.064475,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mi_idx1.info_type_id = it1.id)
108,33b,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,1.360770,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,1.292542,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mi_idx2.info_type_id = it2.id)
109,33b,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,1.360770,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,1.292542,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mi_idx1.info_type_id = it1.id)
110,33c,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [Seq Sc...,1.208609,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [Seq Sc...,1.189302,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",(mi_idx2.info_type_id = it2.id)


In [14]:
def subquery_join_partner(subquery):
    """Return the sibling of `subquery` below its parent node.

    The parent's right child is compared to `subquery` by join predicate:
    if they match, the left child is the partner, otherwise the right child.
    """
    parent_node = subquery.parent
    left_child = parent_node.left
    right_child = parent_node.right

    if subquery.join_pred == right_child.join_pred:
        return left_child
    return right_child

In [15]:
# Subquery side: base table of the subquery node found in the UES plan, the
# scan node of that table, and the `proc_rows` value of the scan.
df_cmp["sq_fk_partner"] = (
    df_cmp
    .apply(lambda row: row["result_orig"].lookup_subquery(row["sq_pred"]), axis="columns")
    .apply(lambda sq_node: sq_node.base_table())
)
df_cmp["sq_fk_table"] = df_cmp.apply(
    lambda row: row["result_orig"].lookup_scan(row["sq_fk_partner"]),
    axis="columns")
df_cmp["sq_fk_rows"] = df_cmp["sq_fk_table"].apply(lambda scan: scan.proc_rows)

# The subquery's join node inside the UES plan, with its row count and duration.
# NOTE(review): dividing exec_time[0] by 1000 looks like a ms -> s conversion; confirm.
df_cmp["sq_join"] = df_cmp.apply(
    lambda row: row["result_orig"].lookup_join(row["sq_pred"]),
    axis="columns")
df_cmp["sq_join_rows"] = df_cmp["sq_join"].apply(lambda node: node.proc_rows)
df_cmp["sq_join_duration"] = df_cmp["sq_join"].apply(lambda node: node.exec_time[0] / 1000)

# The node that the subquery join is combined with, and its duration.
df_cmp["sq_join_partner"] = df_cmp["sq_join"].apply(subquery_join_partner)
df_cmp["sq_join_partner_duration"] = df_cmp["sq_join_partner"].apply(lambda node: node.exec_time[0] / 1000)

# Pruning flags of both plans, computed with exclude_subqueries=True.
df_cmp["node_pruned_orig"] = df_cmp["result_orig"].apply(lambda plan: plan.any_pruned(exclude_subqueries=True))
df_cmp["node_pruned_trans"] = df_cmp["result_trans"].apply(lambda plan: plan.any_pruned(exclude_subqueries=True))

In [16]:
df_cmp

Unnamed: 0,label,query_orig,result_orig,rt_total_orig,explain_orig,subquery,pruned,query_trans,result_trans,rt_total_trans,...,sq_fk_partner,sq_fk_table,sq_fk_rows,sq_join,sq_join_rows,sq_join_duration,sq_join_partner,sq_join_partner_duration,node_pruned_orig,node_pruned_trans
0,1a,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.439545,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...,False,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.info_type_id = it.id) <- [Ha...,0.459900,...,movie_info_idx AS mi_idx,Seq Scan :: movie_info_idx,460012,Hash Join (mi_idx.info_type_id = it.id) <- [Se...,83,0.064083,Hash Join (t.id = mc.movie_id) <- [Seq Scan ::...,0.356114,False,False
1,1c,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.348425,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...,False,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.info_type_id = it.id) <- [Ha...,0.448824,...,movie_info_idx AS mi_idx,Seq Scan :: movie_info_idx,460012,Hash Join (mi_idx.info_type_id = it.id) <- [Se...,83,0.063187,Hash Join (t.id = mc.movie_id) <- [Seq Scan ::...,0.243332,False,False
2,2c,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,Hash Join (mc.movie_id = t.id) <- [Hash Join (...,0.561410,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mc.company_id = cn.id) <- [Seq Scan...,False,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,Hash Join (mc.company_id = cn.id) <- [Hash Joi...,0.587651,...,movie_companies AS mc,Seq Scan :: movie_companies,869710,Hash Join (mc.company_id = cn.id) <- [Seq Scan...,1,0.131000,Hash Join (t.id = mk.movie_id) <- [Seq Scan ::...,0.400316,False,False
3,3a,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,Hash Join (t.id = mk.movie_id) <- [Hash Join (...,1.539704,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mk.keyword_id = k.id) <- [Seq Scan ...,False,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,Hash Join (mk.keyword_id = k.id) <- [Hash Join...,1.292148,...,movie_keyword AS mk,Seq Scan :: movie_keyword,1507977,Hash Join (mk.keyword_id = k.id) <- [Seq Scan ...,4317,0.220146,Hash Join (mi.movie_id = t.id) <- [Seq Scan ::...,1.260173,False,True
4,3b,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,Hash Join (mk.movie_id = t.id) <- [~Hash Join~...,0.572723,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",~Hash Join~ (mk.keyword_id = k.id) <- [~Seq Sc...,True,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,Hash Join (mk.keyword_id = k.id) <- [Hash Join...,0.591094,...,movie_keyword AS mk,~Seq Scan~ :: movie_keyword,0,~Hash Join~ (mk.keyword_id = k.id) <- [~Seq Sc...,0,0.000000,Hash Join (mi.movie_id = t.id) <- [Seq Scan ::...,0.566448,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,33a,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [Seq Sc...,1.082079,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [Seq Sc...,1.064475,...,movie_info_idx AS mi_idx1,Seq Scan :: movie_info_idx,690018,Hash Join (mi_idx1.info_type_id = it1.id) <- [...,229962,0.115141,Hash Join (t2.kind_id = kt2.id) <- [Hash Join ...,0.445078,False,False
108,33b,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,1.360770,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,1.292542,...,movie_info_idx AS mi_idx2,Seq Scan :: movie_info_idx,654961,Hash Join (mi_idx2.info_type_id = it2.id) <- [...,25591,0.221860,Hash Join (mi_idx1.movie_id = t1.id) <- [Hash ...,0.444502,False,False
109,33b,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,1.360770,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,1.292542,...,movie_info_idx AS mi_idx1,Seq Scan :: movie_info_idx,1380035,Hash Join (mi_idx1.info_type_id = it1.id) <- [...,459925,0.217341,Hash Join (t2.kind_id = kt2.id) <- [Hash Join ...,0.197167,False,False
110,33c,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [Seq Sc...,1.208609,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [Seq Sc...,1.189302,...,movie_info_idx AS mi_idx2,Seq Scan :: movie_info_idx,351960,Hash Join (mi_idx2.info_type_id = it2.id) <- [...,18374,0.121368,Hash Join (mi_idx1.movie_id = t1.id) <- [Hash ...,0.695499,False,False


In [17]:
# filter_strength: rows of the scanned table per row produced by the subquery
# join (NaN where both counts are 0).
df_cmp["filter_strength"] = df_cmp["sq_fk_rows"] / df_cmp["sq_join_rows"]
# sq_speedup: transformed runtime relative to the UES runtime; values above 1
# mean the UES query was the faster one.
df_cmp["sq_speedup"] = df_cmp["rt_total_trans"] / df_cmp["rt_total_orig"]
df_cmp[[
    "label", "subquery",
    "sq_fk_rows", "sq_join_rows",
    "rt_total_orig", "rt_total_trans",
    "filter_strength", "sq_speedup",
]]

Unnamed: 0,label,subquery,sq_fk_rows,sq_join_rows,rt_total_orig,rt_total_trans,filter_strength,sq_speedup
0,1a,Hash Join (mi_idx.info_type_id = it.id) <- [Se...,460012,83,0.439545,0.459900,5542.313253,1.046309
1,1c,Hash Join (mi_idx.info_type_id = it.id) <- [Se...,460012,83,0.348425,0.448824,5542.313253,1.288151
2,2c,Hash Join (mc.company_id = cn.id) <- [Seq Scan...,869710,1,0.561410,0.587651,869710.000000,1.046741
3,3a,Hash Join (mk.keyword_id = k.id) <- [Seq Scan ...,1507977,4317,1.539704,1.292148,349.311327,0.839218
4,3b,~Hash Join~ (mk.keyword_id = k.id) <- [~Seq Sc...,0,0,0.572723,0.591094,,1.032077
...,...,...,...,...,...,...,...,...
107,33a,Hash Join (mi_idx1.info_type_id = it1.id) <- [...,690018,229962,1.082079,1.064475,3.000574,0.983731
108,33b,Hash Join (mi_idx2.info_type_id = it2.id) <- [...,654961,25591,1.360770,1.292542,25.593412,0.949861
109,33b,Hash Join (mi_idx1.info_type_id = it1.id) <- [...,1380035,459925,1.360770,1.292542,3.000565,0.949861
110,33c,Hash Join (mi_idx2.info_type_id = it2.id) <- [...,351960,18374,1.208609,1.189302,19.155328,0.984025


In [18]:
# Build the export frame: drop the helper columns that only hold intermediate
# plan objects, then switch to the public column names.
df_export = (
    df_cmp
    .drop(columns=[
        "result_trans",
        "explain_trans",
        "sq_pred",
        "sq_fk_partner",
        "sq_fk_table",
        "sq_join",
        "sq_join_partner",
    ])
    .rename(columns={
        "query_orig": "query_ues",
        "query_trans": "query_flat",
        "result_orig": "ues_plan",
        "explain_orig": "ues_explain",
        "rt_total_orig": "runtime_ues",
        "rt_total_trans": "runtime_flat",
        "sq_fk_rows": "foreign_key_rows",
        "sq_join_rows": "rows_after_join",
        "sq_join_duration": "subquery_runtime",
        "sq_join_partner_duration": "subquery_partner_runtime",
        "sq_speedup": "ues_speedup",
        "pruned": "subquery_pruned",
        "node_pruned_orig": "ues_pruned",
        "node_pruned_trans": "flat_pruned",
    })
)

# Table names are stored as JSON-encoded lists so they survive the CSV round trip.
df_export["tables"] = df_export["query_ues"].apply(
    lambda query: json.dumps([table.full_name for table in query.collect_tables()]))
df_export["subquery_tables"] = (
    df_export["subquery"]
    .apply(lambda sq: sq.lookup_subquery(sq.join_pred).collect_tables())
    .apply(lambda tables: json.dumps([table.full_name for table in tables]))
)

# Re-encode the decoded EXPLAIN output as a JSON string for the export.
df_export["ues_explain"] = df_export["ues_explain"].apply(json.dumps)

df_export.to_csv("workloads/job-ues-eval-fks-nonlj.csv", index=False)
df_export

Unnamed: 0,label,query_ues,ues_plan,runtime_ues,ues_explain,subquery,subquery_pruned,query_flat,runtime_flat,foreign_key_rows,rows_after_join,subquery_runtime,subquery_partner_runtime,ues_pruned,flat_pruned,filter_strength,ues_speedup,tables,subquery_tables
0,1a,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.439545,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...,False,SELECT COUNT(*) FROM movie_companies AS mc JOI...,0.459900,460012,83,0.064083,0.356114,False,False,5542.313253,1.046309,"[""movie_companies"", ""company_type"", ""title"", ""...","[""movie_info_idx"", ""info_type""]"
1,1c,SELECT COUNT(*) FROM movie_companies AS mc JOI...,Hash Join (mi_idx.movie_id = t.id) <- [Hash Jo...,0.348425,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mi_idx.info_type_id = it.id) <- [Se...,False,SELECT COUNT(*) FROM movie_companies AS mc JOI...,0.448824,460012,83,0.063187,0.243332,False,False,5542.313253,1.288151,"[""movie_companies"", ""company_type"", ""title"", ""...","[""movie_info_idx"", ""info_type""]"
2,2c,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,Hash Join (mc.movie_id = t.id) <- [Hash Join (...,0.561410,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mc.company_id = cn.id) <- [Seq Scan...,False,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,0.587651,869710,1,0.131000,0.400316,False,False,869710.000000,1.046741,"[""movie_keyword"", ""keyword"", ""title"", ""movie_c...","[""movie_companies"", ""company_name""]"
3,3a,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,Hash Join (t.id = mk.movie_id) <- [Hash Join (...,1.539704,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mk.keyword_id = k.id) <- [Seq Scan ...,False,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,1.292148,1507977,4317,0.220146,1.260173,False,True,349.311327,0.839218,"[""movie_info"", ""title"", ""movie_keyword"", ""keyw...","[""movie_keyword"", ""keyword""]"
4,3b,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,Hash Join (mk.movie_id = t.id) <- [~Hash Join~...,0.572723,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",~Hash Join~ (mk.keyword_id = k.id) <- [~Seq Sc...,True,SELECT COUNT(*) FROM movie_info AS mi JOIN tit...,0.591094,0,0,0.000000,0.566448,True,True,,1.032077,"[""movie_info"", ""title"", ""movie_keyword"", ""keyw...","[""movie_keyword"", ""keyword""]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,33a,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [Seq Sc...,1.082079,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,1.064475,690018,229962,0.115141,0.445078,False,False,3.000574,0.983731,"[""movie_link"", ""link_type"", ""title"", ""title"", ...","[""movie_info_idx"", ""info_type""]"
108,33b,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,1.360770,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,1.292542,654961,25591,0.221860,0.444502,False,False,25.593412,0.949861,"[""movie_link"", ""link_type"", ""title"", ""title"", ...","[""movie_info_idx"", ""info_type""]"
109,33b,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (mc1.company_id = cn1.id) <- [Hash J...,1.360770,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,1.292542,1380035,459925,0.217341,0.197167,False,False,3.000565,0.949861,"[""movie_link"", ""link_type"", ""title"", ""title"", ...","[""movie_info_idx"", ""info_type""]"
110,33c,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [Seq Sc...,1.208609,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,1.189302,351960,18374,0.121368,0.695499,False,False,19.155328,0.984025,"[""movie_link"", ""link_type"", ""title"", ""title"", ...","[""movie_info_idx"", ""info_type""]"


## Some sanity checks for the export data set

In [19]:
# Number of exported rows per query label, most frequent labels first.
(
    df_export
    .groupby("label")
    .size()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)
)

Unnamed: 0,label,count
58,31c,3
56,31a,3
39,25c,2
61,33a,2
57,31b,2
...,...,...
35,23c,1
36,24b,1
38,25b,1
40,26a,1


In [20]:
# Rows with an undefined filter strength although the subquery is not pruned.
df_export[df_export["filter_strength"].isna() & ~df_export["subquery_pruned"]]

Unnamed: 0,label,query_ues,ues_plan,runtime_ues,ues_explain,subquery,subquery_pruned,query_flat,runtime_flat,foreign_key_rows,rows_after_join,subquery_runtime,subquery_partner_runtime,ues_pruned,flat_pruned,filter_strength,ues_speedup,tables,subquery_tables


In [21]:
# Export rows of query 6b.
df_export[df_export["label"] == "6b"]

Unnamed: 0,label,query_ues,ues_plan,runtime_ues,ues_explain,subquery,subquery_pruned,query_flat,runtime_flat,foreign_key_rows,rows_after_join,subquery_runtime,subquery_partner_runtime,ues_pruned,flat_pruned,filter_strength,ues_speedup,tables,subquery_tables
10,6b,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,Hash Join (ci.movie_id = t.id) <- [Hash Join (...,0.536543,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (ci.person_id = n.id) <- [~Seq Scan~...,True,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,2.252002,0,0,0.181875,0.349454,True,False,,4.197244,"[""movie_keyword"", ""keyword"", ""title"", ""cast_in...","[""cast_info"", ""name""]"


In [22]:
# UES query of the first export row of query 20b.
df_export.loc[df_export["label"] == "20b", "query_ues"].iloc[0]

SELECT COUNT(*) FROM complete_cast AS cc JOIN comp_cast_type AS cct1 ON cct1.kind = 'cast' AND cct1.id = cc.subject_id JOIN comp_cast_type AS cct2 ON cct2.kind LIKE '%complete%' AND cct2.id = cc.status_id JOIN title AS t ON t.production_year > 2000 AND t.id = cc.movie_id JOIN kind_type AS kt ON kt.kind = 'movie' AND kt.id = t.kind_id JOIN (SELECT movie_id FROM movie_keyword AS mk JOIN keyword AS k ON k.keyword IN ('superhero', 'sequel', 'second-part', 'marvel-comics', 'based-on-comic', 'tv-special', 'fight', 'violence') AND k.id = mk.keyword_id) AS t_mk ON t_mk.movie_id = cc.movie_id JOIN (SELECT person_id, movie_id FROM cast_info AS ci JOIN name AS n ON n.name LIKE '%downey%robert%' AND n.id = ci.person_id JOIN char_name AS chn ON chn.name NOT LIKE '%sherlock%' AND (chn.name LIKE '%tony%stark%' OR chn.name LIKE '%iron%man%') AND chn.id = ci.person_role_id) AS t_ci ON t_ci.movie_id = t_mk.movie_id

In [23]:
# Pretty-printed UES plan (including filters) of the first export row of query 6b.
print(df_export.loc[df_export["label"] == "6b", "ues_plan"].iloc[0].pretty_print(include_filter=True))

Hash Join (ci.movie_id = t.id)
  <- [SQ] Hash Join (ci.person_id = n.id)
    <- [PRUNED] Seq Scan :: cast_info
    <- Seq Scan :: name (((name)::text ~~ '%downey%robert%'::text))
  <- Hash Join (t.id = mk.movie_id)
    <- Seq Scan :: title ((production_year > 2014))
    <- Hash Join (mk.keyword_id = k.id)
      <- Seq Scan :: movie_keyword
      <- Seq Scan :: keyword (((keyword)::text = ANY ('{superhero,sequel,second-part,marvel-comics,based-on-comic,tv-special,fight,violence}'::text[])))



In [24]:
# JSON-encoded EXPLAIN output of the first export row of query 6b.
df_export.loc[df_export["label"] == "6b", "ues_explain"].iloc[0]

'[{"Plan": {"Node Type": "Aggregate", "Strategy": "Plain", "Partial Mode": "Simple", "Parallel Aware": false, "Async Capable": false, "Startup Cost": 622730.99, "Total Cost": 622731.0, "Plan Rows": 1, "Plan Width": 8, "Actual Startup Time": 534.419, "Actual Total Time": 535.652, "Actual Rows": 1, "Actual Loops": 1, "Plans": [{"Node Type": "Gather", "Parent Relationship": "Outer", "Parallel Aware": false, "Async Capable": false, "Startup Cost": 179377.81, "Total Cost": 622730.98, "Plan Rows": 1, "Plan Width": 0, "Actual Startup Time": 534.414, "Actual Total Time": 535.647, "Actual Rows": 0, "Actual Loops": 1, "Workers Planned": 2, "Workers Launched": 2, "Single Copy": false, "Plans": [{"Node Type": "Hash Join", "Parent Relationship": "Outer", "Parallel Aware": true, "Async Capable": false, "Join Type": "Inner", "Startup Cost": 178377.81, "Total Cost": 621730.88, "Plan Rows": 1, "Plan Width": 0, "Actual Startup Time": 531.817, "Actual Total Time": 531.823, "Actual Rows": 0, "Actual Loops

In [25]:
# Export rows of query 33c.
df_export[df_export["label"] == "33c"]

Unnamed: 0,label,query_ues,ues_plan,runtime_ues,ues_explain,subquery,subquery_pruned,query_flat,runtime_flat,foreign_key_rows,rows_after_join,subquery_runtime,subquery_partner_runtime,ues_pruned,flat_pruned,filter_strength,ues_speedup,tables,subquery_tables
110,33c,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [Seq Sc...,1.208609,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mi_idx2.info_type_id = it2.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,1.189302,351960,18374,0.121368,0.695499,False,False,19.155328,0.984025,"[""movie_link"", ""link_type"", ""title"", ""title"", ...","[""movie_info_idx"", ""info_type""]"
111,33c,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,Hash Join (cn1.id = mc1.company_id) <- [Seq Sc...,1.208609,"[{""Plan"": {""Node Type"": ""Aggregate"", ""Strategy...",Hash Join (mi_idx1.info_type_id = it1.id) <- [...,False,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,1.189302,690018,229962,0.114501,0.564965,False,False,3.000574,0.984025,"[""movie_link"", ""link_type"", ""title"", ""title"", ...","[""movie_info_idx"", ""info_type""]"


In [26]:
# UES query of the first export row of query 33c.
df_export.loc[df_export["label"] == "33c", "query_ues"].iloc[0]

SELECT COUNT(*) FROM movie_link AS ml JOIN link_type AS lt ON lt.link IN ('sequel', 'follows', 'followed by') AND lt.id = ml.link_type_id JOIN title AS t2 ON t2.production_year BETWEEN 2000 AND 2010 AND t2.id = ml.linked_movie_id JOIN title AS t1 ON t1.id = ml.movie_id JOIN kind_type AS kt1 ON kt1.kind IN ('tv series', 'episode') AND kt1.id = t1.kind_id JOIN kind_type AS kt2 ON kt2.kind IN ('tv series', 'episode') AND kt2.id = t2.kind_id JOIN (SELECT movie_id FROM movie_info_idx AS mi_idx1 JOIN info_type AS it1 ON it1.info = 'rating' AND it1.id = mi_idx1.info_type_id) AS t_mi_idx1 ON t_mi_idx1.movie_id = ml.movie_id JOIN (SELECT movie_id FROM movie_info_idx AS mi_idx2 JOIN info_type AS it2 ON it2.info = 'rating' AND it2.id = mi_idx2.info_type_id AND mi_idx2.info < '3.5') AS t_mi_idx2 ON t_mi_idx2.movie_id = ml.linked_movie_id JOIN movie_companies AS mc2 ON mc2.movie_id = ml.linked_movie_id JOIN company_name AS cn2 ON cn2.id = mc2.company_id JOIN movie_companies AS mc1 ON mc1.movie_id =

In [27]:
# Pretty-printed UES plan (including filters) of the first export row of query 33c.
print(df_export.loc[df_export["label"] == "33c", "ues_plan"].iloc[0].pretty_print(include_filter=True))

Hash Join (cn1.id = mc1.company_id)
  <- Seq Scan :: company_name (((country_code)::text <> '[us]'::text))
  <- Hash Join (mc1.movie_id = t1.id)
    <- Seq Scan :: movie_companies
    <- Hash Join (cn2.id = mc2.company_id)
      <- Seq Scan :: company_name
      <- Hash Join (mc2.movie_id = t2.id)
        <- Seq Scan :: movie_companies
        <- Hash Join (mi_idx2.movie_id = t2.id)
          <- [SQ] Hash Join (mi_idx2.info_type_id = it2.id)
            <- Seq Scan :: movie_info_idx (((info)::text < '3.5'::text))
            <- Seq Scan :: info_type (((info)::text = 'rating'::text))
          <- Hash Join (mi_idx1.movie_id = t1.id)
            <- [SQ] Hash Join (mi_idx1.info_type_id = it1.id)
              <- Seq Scan :: movie_info_idx
              <- Seq Scan :: info_type (((info)::text = 'rating'::text))
            <- Hash Join (t2.kind_id = kt2.id)
              <- Hash Join (t1.kind_id = kt1.id)
                <- Merge Join (t1.id = ml.movie_id)
                  <- Index Scan :