# UES workload analysis

In [1]:
import json
import re
import warnings

import numpy as np
import pandas as pd

In [2]:
# Load the results of the original (non-flattened) workload, keep run 1 only,
# and decode the JSON-encoded plan stored in the `query_result` column.
df = pd.read_csv("workloads/job-ues-results-orig-nofk.csv")
df = df.loc[df["run"] == 1].copy()
df["query_result"] = df["query_result"].apply(json.loads)
df

Unnamed: 0,label,query,flattened_query,query_result,query_rt_total,run
0,1a,select count(*) from movie_companies as mc j...,SELECT COUNT(*) FROM movie_companies AS mc JOI...,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",0.297587,1
1,1b,select count(*) from movie_info_idx as mi_idx...,SELECT COUNT(*) FROM movie_info_idx AS mi_idx ...,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",0.225469,1
2,1c,select count(*) from movie_companies as mc j...,SELECT COUNT(*) FROM movie_companies AS mc JOI...,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",0.254706,1
3,1d,select count(*) from movie_info_idx as mi_idx...,SELECT COUNT(*) FROM movie_info_idx AS mi_idx ...,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",0.208007,1
4,2a,select count(*) from movie_keyword as mk joi...,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",0.547452,1
...,...,...,...,...,...,...
108,32a,select count(*) from movie_link as ml join l...,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",0.274688,1
109,32b,select count(*) from movie_link as ml join l...,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",0.289307,1
110,33a,select count(*) from movie_link as ml join l...,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",0.443636,1
111,33b,select count(*) from movie_link as ml join l...,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",0.435970,1


In [3]:
from transform import db, mosp

In [4]:
# Decoded plan of the first workload row; used as the running example below.
p = df.iloc[0]["query_result"]
p

[{'Plan': {'Node Type': 'Aggregate',
   'Strategy': 'Plain',
   'Partial Mode': 'Simple',
   'Parallel Aware': False,
   'Async Capable': False,
   'Startup Cost': 54217.81,
   'Total Cost': 54217.82,
   'Plan Rows': 1,
   'Plan Width': 8,
   'Actual Startup Time': 291.988,
   'Actual Total Time': 294.612,
   'Actual Rows': 1,
   'Actual Loops': 1,
   'Plans': [{'Node Type': 'Gather',
     'Parent Relationship': 'Outer',
     'Parallel Aware': False,
     'Async Capable': False,
     'Startup Cost': 38947.44,
     'Total Cost': 54217.81,
     'Plan Rows': 1,
     'Plan Width': 0,
     'Actual Startup Time': 291.686,
     'Actual Total Time': 294.599,
     'Actual Rows': 143,
     'Actual Loops': 1,
     'Workers Planned': 2,
     'Workers Launched': 2,
     'Single Copy': False,
     'Plans': [{'Node Type': 'Hash Join',
       'Parent Relationship': 'Outer',
       'Parallel Aware': True,
       'Async Capable': False,
       'Join Type': 'Inner',
       'Startup Cost': 37947.44,
     

In [5]:
# Parse the SQL text of the first workload row into a MospQuery object.
q = mosp.MospQuery.parse(df.iloc[0]["query"])
q

SELECT COUNT(*) FROM movie_companies AS mc JOIN company_type AS ct ON ct.kind = 'production companies' AND ct.id = mc.company_type_id AND mc.note NOT LIKE '%(as metro-goldwyn-mayer pictures)%' AND (mc.note LIKE '%(co-production)%' OR mc.note LIKE '%(presents)%') JOIN title AS t ON t.id = mc.movie_id JOIN (SELECT movie_id FROM movie_info_idx AS mi_idx JOIN info_type AS it ON it.info = 'top 250 rank' AND it.id = mi_idx.info_type_id) AS t_mi_idx ON t_mi_idx.movie_id = mc.movie_id

In [6]:
# Combine the parsed query with its decoded plan into a plan tree and print it.
# print() is used because pretty_print() returns a multi-line string.
eap = db.parse_explain_analyze(q, p)
print(eap.pretty_print())

Hash Join (mi_idx.movie_id = t.id)
  <- [SQ] Hash Join (mi_idx.info_type_id = it.id)
    <- Seq Scan :: movie_info_idx
    <- Seq Scan :: info_type
  <- Nested Loop (t.id = mc.movie_id)
    <- Hash Join (mc.company_type_id = ct.id)
      <- Seq Scan :: movie_companies
      <- Seq Scan :: company_type
    <- Index Only Scan :: title



In [24]:
def calculate_subquery_rows(parse_tree, *, target_predicate=""):
    """Locate a join in a plan tree and total the rows of its parent's two inputs.

    The node of interest is selected in one of two ways:

    * ``target_predicate`` empty (default): the first node for which
      ``is_subquery()`` is true.
    * ``target_predicate`` given: the first node whose ``join_pred`` equals the
      target according to ``db.compare_predicate_strs``.

    The search is depth-first and keeps the same selection mode on every level
    (the target predicate is passed down to the children).

    Parameters
    ----------
    parse_tree
        Plan node exposing ``is_subquery()``, ``join_pred``, ``parent``,
        ``left``, ``right``, ``proc_rows`` and ``children``.
    target_predicate : str, keyword-only
        Join predicate to search for; empty string means "search for the
        subquery node instead".

    Returns
    -------
    tuple or None
        ``(node, rows)`` where ``node`` is the parent of the matched node (or
        the matched node itself if it has no parent) and ``rows`` is
        ``node.left.proc_rows + node.right.proc_rows``. If the match is found
        further down, the non-empty child results are combined via
        ``db._simplify_plan_tree``. ``None`` if nothing matches.
    """
    # Nodes without a join predicate can be visited now that the predicate
    # search descends the tree, so read the attribute defensively.
    # NOTE(review): assumes scan nodes have no / an empty join_pred - confirm.
    join_pred = getattr(parse_tree, "join_pred", None)

    if target_predicate:
        is_match = bool(join_pred) and db.compare_predicate_strs(join_pred, target_predicate)
    else:
        is_match = parse_tree.is_subquery()

    if is_match:
        # The figure of interest is the input size of the join that consumes the
        # matched node, hence the step up to the parent when there is one.
        parent = parse_tree.parent if parse_tree.parent else parse_tree
        return parent, parent.left.proc_rows + parent.right.proc_rows

    if target_predicate:
        print(f"{target_predicate} does not match {join_pred}")

    if not parse_tree.children:
        return None

    # Propagate target_predicate: without it the recursion would silently fall
    # back to the subquery search below the root.
    child_rows = [
        calculate_subquery_rows(child, target_predicate=target_predicate)
        for child in parse_tree.children
    ]
    filtered = [c for c in child_rows if c]
    return db._simplify_plan_tree(filtered) if filtered else None

In [8]:
# Subquery search on the original plan (no target predicate given).
# NOTE(review): this cell ran as In [8] while the function definition above is
# In [24]; the recorded output may come from an older definition - re-run in order.
calculate_subquery_rows(eap)

(Hash Join (mi_idx.info_type_id = it.id) <- [Seq Scan :: movie_info_idx, Seq Scan :: info_type],
 9651)

In [9]:
# Load the results of the flattened workload, keep run 1 only, and decode the
# JSON-encoded plan stored in the `flattened_query_result` column.
df_trans = pd.read_csv("workloads/job-ues-results-flattened-nofk.csv")
df_trans = df_trans.loc[df_trans["run"] == 1].copy()
df_trans["flattened_query_result"] = df_trans["flattened_query_result"].apply(json.loads)
df_trans

Unnamed: 0,label,query,flattened_query,flattened_query_result,flattened_query_rt_total,run
0,1a,select count(*) from movie_companies as mc j...,SELECT COUNT(*) FROM movie_companies AS mc JOI...,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",0.323265,1
1,1b,select count(*) from movie_info_idx as mi_idx...,SELECT COUNT(*) FROM movie_info_idx AS mi_idx ...,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",0.211456,1
2,1c,select count(*) from movie_companies as mc j...,SELECT COUNT(*) FROM movie_companies AS mc JOI...,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",0.271575,1
3,1d,select count(*) from movie_info_idx as mi_idx...,SELECT COUNT(*) FROM movie_info_idx AS mi_idx ...,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",0.209004,1
4,2a,select count(*) from movie_keyword as mk joi...,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",0.558684,1
...,...,...,...,...,...,...
108,32a,select count(*) from movie_link as ml join l...,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",0.626275,1
109,32b,select count(*) from movie_link as ml join l...,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",0.612061,1
110,33a,select count(*) from movie_link as ml join l...,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",0.416346,1
111,33b,select count(*) from movie_link as ml join l...,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,"[{'Plan': {'Node Type': 'Aggregate', 'Strategy...",0.403899,1


In [10]:
# Decoded plan of the first row of the flattened workload.
p_trans = df_trans.iloc[0]["flattened_query_result"]
p_trans

[{'Plan': {'Node Type': 'Aggregate',
   'Strategy': 'Plain',
   'Partial Mode': 'Simple',
   'Parallel Aware': False,
   'Async Capable': False,
   'Startup Cost': 54787.57,
   'Total Cost': 54787.58,
   'Plan Rows': 1,
   'Plan Width': 8,
   'Actual Startup Time': 319.468,
   'Actual Total Time': 320.876,
   'Actual Rows': 1,
   'Actual Loops': 1,
   'Plans': [{'Node Type': 'Gather',
     'Parent Relationship': 'Outer',
     'Parallel Aware': False,
     'Async Capable': False,
     'Startup Cost': 38945.17,
     'Total Cost': 54787.57,
     'Plan Rows': 1,
     'Plan Width': 0,
     'Actual Startup Time': 319.167,
     'Actual Total Time': 320.864,
     'Actual Rows': 143,
     'Actual Loops': 1,
     'Workers Planned': 2,
     'Workers Launched': 2,
     'Single Copy': False,
     'Plans': [{'Node Type': 'Nested Loop',
       'Parent Relationship': 'Outer',
       'Parallel Aware': False,
       'Async Capable': False,
       'Join Type': 'Inner',
       'Startup Cost': 37945.17,
  

In [11]:
# Parse the SQL text of the first row of the flattened workload.
# NOTE(review): this reads the `query` column (the text still containing the
# subquery), not `flattened_query`, although the plan in p_trans comes from the
# flattened run - confirm this is intended.
q_trans = mosp.MospQuery.parse(df_trans.iloc[0]["query"])
q_trans

SELECT COUNT(*) FROM movie_companies AS mc JOIN company_type AS ct ON ct.kind = 'production companies' AND ct.id = mc.company_type_id AND mc.note NOT LIKE '%(as metro-goldwyn-mayer pictures)%' AND (mc.note LIKE '%(co-production)%' OR mc.note LIKE '%(presents)%') JOIN title AS t ON t.id = mc.movie_id JOIN (SELECT movie_id FROM movie_info_idx AS mi_idx JOIN info_type AS it ON it.info = 'top 250 rank' AND it.id = mi_idx.info_type_id) AS t_mi_idx ON t_mi_idx.movie_id = mc.movie_id

In [12]:
# Build and print the plan tree for the flattened run.
# with_subqueries=False: presumably disables subquery detection because the
# flattened plan has no subquery node - TODO confirm against db.parse_explain_analyze.
eap_trans = db.parse_explain_analyze(q_trans, p_trans, with_subqueries=False)
print(eap_trans.pretty_print())

Nested Loop (it.id = mi_idx.info_type_id)
  <- Hash Join (mi_idx.movie_id = t.id)
    <- Seq Scan :: movie_info_idx
    <- Nested Loop (t.id = mc.movie_id)
      <- Hash Join (mc.company_type_id = ct.id)
        <- Seq Scan :: movie_companies
        <- Seq Scan :: company_type
      <- Index Only Scan :: title
  <- Index Scan :: info_type



In [13]:
# Re-print the original plan to compare it with the flattened plan printed above.
print(eap.pretty_print())

Hash Join (mi_idx.movie_id = t.id)
  <- [SQ] Hash Join (mi_idx.info_type_id = it.id)
    <- Seq Scan :: movie_info_idx
    <- Seq Scan :: info_type
  <- Nested Loop (t.id = mc.movie_id)
    <- Hash Join (mc.company_type_id = ct.id)
      <- Seq Scan :: movie_companies
      <- Seq Scan :: company_type
    <- Index Only Scan :: title



In [14]:
# Left input of the root join of the flattened plan.
eap_trans.left

Hash Join (mi_idx.movie_id = t.id) <- [Seq Scan :: movie_info_idx, Nested Loop (t.id = mc.movie_id) <- [Hash Join (mc.company_type_id = ct.id) <- [Seq Scan :: movie_companies, Seq Scan :: company_type], Index Only Scan :: title]]

In [15]:
# Right input of the root join of the flattened plan.
eap_trans.right

Index Scan :: info_type

In [16]:
# Manual reference value: summed proc_rows of the root join's two inputs
# (proc_rows presumably means rows processed by the node - confirm in db).
eap_trans.left.proc_rows + eap_trans.right.proc_rows

20718

In [22]:
# Same call as earlier in the notebook, repeated after the function was edited.
# NOTE(review): the recorded output is a 3-tuple, whereas the definition shown
# in this notebook returns 2-tuples - the output is stale; re-run in order.
calculate_subquery_rows(eap)

(Hash Join (mi_idx.movie_id = t.id) <- [Hash Join (mi_idx.info_type_id = it.id) <- [Seq Scan :: movie_info_idx, Seq Scan :: info_type], Nested Loop (t.id = mc.movie_id) <- [Hash Join (mc.company_type_id = ct.id) <- [Seq Scan :: movie_companies, Seq Scan :: company_type], Index Only Scan :: title]],
 9651,
 Hash Join (mi_idx.info_type_id = it.id) <- [Seq Scan :: movie_info_idx, Seq Scan :: info_type])

In [23]:
# The flattened plan was parsed with with_subqueries=False, so the join that
# corresponds to the former subquery is located by its predicate instead.
calculate_subquery_rows(eap_trans, target_predicate="(mi_idx.info_type_id = it.id)")

(Nested Loop (it.id = mi_idx.info_type_id) <- [Hash Join (mi_idx.movie_id = t.id) <- [Seq Scan :: movie_info_idx, Nested Loop (t.id = mc.movie_id) <- [Hash Join (mc.company_type_id = ct.id) <- [Seq Scan :: movie_companies, Seq Scan :: company_type], Index Only Scan :: title]], Index Scan :: info_type],
 20718,
 Nested Loop (it.id = mi_idx.info_type_id) <- [Hash Join (mi_idx.movie_id = t.id) <- [Seq Scan :: movie_info_idx, Nested Loop (t.id = mc.movie_id) <- [Hash Join (mc.company_type_id = ct.id) <- [Seq Scan :: movie_companies, Seq Scan :: company_type], Index Only Scan :: title]], Index Scan :: info_type])