# Query Transformation Tests

In [1]:
import collections
import itertools
import pathlib
import warnings
from dataclasses import dataclass
from typing import Any, Dict, List, Tuple

import mo_sql_parsing as mosp
import natsort
import psycopg2

import numpy as np
import pandas as pd

In [2]:
workloads = ["explicit", "implicit"]
queries_src_dir = pathlib.Path("../workloads/JOB-Queries/")

In [3]:
df_data = collections.defaultdict(list)
for workload in workloads:
    workload_path = queries_src_dir / workload
    query_files = list(workload_path.glob("*.sql"))
    df_data["label"].extend(query_file.stem for query_file in query_files)
    df_data["workload"].extend(itertools.repeat(workload, len(query_files)))
    df_data["query"].extend(query_file.read_text().replace("\n", " ").lower() for query_file in query_files)
df_queries = pd.DataFrame(df_data)
df_queries = df_queries.sort_values(by="label", key=lambda _: np.argsort(natsort.index_natsorted(df_queries["label"]))).reset_index(drop=True)
df_queries["mosp_query"] = df_queries["query"].apply(mosp.parse)

In [4]:
df_queries

Unnamed: 0,label,workload,query,mosp_query
0,1a,explicit,select count(*) from movie_companies as mc j...,"{'select': {'value': {'count': '*'}}, 'from': ..."
1,1a,implicit,"select count(*) from company_type as ct, ...","{'select': {'value': {'count': '*'}}, 'from': ..."
2,1b,explicit,select count(*) from movie_info_idx as mi_idx...,"{'select': {'value': {'count': '*'}}, 'from': ..."
3,1b,implicit,"select count(*) from company_type as ct, ...","{'select': {'value': {'count': '*'}}, 'from': ..."
4,1c,explicit,select count(*) from movie_companies as mc j...,"{'select': {'value': {'count': '*'}}, 'from': ..."
...,...,...,...,...
221,33a,implicit,"select count(*) from company_name as cn1, ...","{'select': {'value': {'count': '*'}}, 'from': ..."
222,33b,explicit,select count(*) from movie_link as ml join l...,"{'select': {'value': {'count': '*'}}, 'from': ..."
223,33b,implicit,"select count(*) from company_name as cn1, ...","{'select': {'value': {'count': '*'}}, 'from': ..."
224,33c,explicit,select count(*) from movie_link as ml join l...,"{'select': {'value': {'count': '*'}}, 'from': ..."


In [5]:
df_expl = df_queries[df_queries.workload == "explicit"].copy()
df_subqueries = df_expl[df_expl["query"].str.rfind("select") > df_expl["query"].str.find("join")].copy()

In [6]:
class QueryUpdate:
    def __init__(self, base_table="", ):
        self.base_table = base_table
        self.table_renamings = collections.defaultdict(int)
        self.table_references = list()
        self.predicates = list()

    def include_subquery(self, subquery):

        # first up, build the rename map
        tables_in_sq = self._collect_tables(subquery)
        for table in tables_in_sq:
            self.table_renamings[table] += 1

    def _collect_tables(self, subquery):
        tables = []
        for clause in subquery["join"]["value"]["from"]:
            if "join" in clause:
                tables.append(clause["join"]["value"])
            else:
                tables.append(clause["value"])
        return tables

    def __str__(self) -> str:
        return f"Tables: {self.table_references}, Predicates: {self.predicates}"

In [7]:
def extract_subqueries(plan):
    from_clause = plan["from"]
    query_update = None
    sq_found = False

    # for each reference in the 'from' clause, check if it constitutes a subquery reference
    for table in from_clause:

        # extract the base table name and proceed with the joined tables
        if not isinstance(table, dict) or "join" not in table:
            query_update = QueryUpdate(table)
            continue

        join_target = table["join"]["value"]
        if isinstance(join_target, dict) and "select" in join_target:
            #print("Found subquery:", join_target)
            sq_found = True
            break
            query_update.include_subquery(table)

    return(sq_found)

In [8]:
df_nosq = df_expl[~df_expl.mosp_query.apply(extract_subqueries)]

In [9]:
conn = psycopg2.connect("dbname=imdb user=strix host=localhost")
cur = conn.cursor()

In [10]:
q_raw = df_subqueries.iloc[0]["query"]
q_raw

"select count(*) from  movie_companies as mc  join company_type as ct on (ct.kind = 'production companies' and ct.id = mc.company_type_id and mc.note not like '%(as metro-goldwyn-mayer pictures)%' and (mc.note like '%(co-production)%'  or mc.note like '%(presents)%')) join title as t on (t.id = mc.movie_id) join  (select movie_id from movie_info_idx as mi_idx  join info_type as it on (it.info = 'top 250 rank' and it.id = mi_idx.info_type_id)) as t_mi_idx  on(t_mi_idx.movie_id = mc.movie_id);"

In [11]:
@dataclass
class TableRef:
    full_name: str
    alias: str
    
    def __repr__(self):
        return str(self)
    
    def __str__(self):
        return f"{self.full_name} AS {self.alias}"


@dataclass
class AttributeRef:
    src_table: TableRef
    attribute: str
        
    def __repr__(self):
        return str(self)
    
    def __str__(self):
        return f"{self.src_table.alias}.{self.attribute}"


class JoinStatement:
    def __init__(self, target: TableRef, on=None):
        self.target = target
        self.on = on
        self.multi_clause = False
    
    def expand(self, on):
        if not self.on:
            self.on = on
        elif not self.multi_clause:
            self.on = {"and": [self.on, on]}
            self.multi_clause = True
        else:
            self.on["and"].append(on)
    
    def __repr__(self):
        return str(self)
    
    def __str__(self):
        return f"JOIN {self.target} ON {self.on}"


def attr_is_from_table(table: TableRef, attr: str):
    if not isinstance(attr, str):
        warnings.warn("Treating non-string attribute as False: " + str(attr))
        return False
    table_qualifier = table.alias + "."
    return attr.startswith(table_qualifier)

In [12]:
class DBSchema:
    def __init__(self, cursor: "psycopg2.cursor"):
        self.cursor = cursor
        
    def lookup_attribute(self, attribute_name: str, candidate_tables: List[TableRef]) -> AttributeRef:
        for table in candidate_tables:
            columns = self._fetch_columns(table.full_name)
            if attribute_name in columns:
                return table
        raise KeyError(f"Attribute not found: {attribute_name} in candidates {candidate_tables}")

    def _fetch_columns(self, table_name):
        base_query = "SELECT column_name FROM information_schema.columns WHERE table_name = %s"
        cur.execute(base_query, (table_name,))
        result_set = cur.fetchall()
        return [col[0] for col in result_set]

In [13]:
def collect_attributes(projection):
    if not isinstance(projection, list):
        return [projection["value"]]
    return [col["value"] for col in projection]

In [14]:
def extract_table_references(from_clause):
    references = []
    
    for table_ref in from_clause:
        table_name, table_alias = "", ""
        
        # base table reference
        if "value" in table_ref:
            table_name = table_ref["value"]
            table_alias = table_ref.get("name", table_name)
        # joined table
        elif "join" in table_ref:
            join_target = table_ref["join"]
            table_name = join_target["value"]
            table_alias = join_target.get("name", table_name)
            
        references.append(TableRef(table_name, table_alias))
    
    return references

In [15]:
def bind_attributes(raw_attributes: List[str], candidate_tables: List[TableRef], *, dbschema) -> List[AttributeRef]:
    bindings = []
    for attribute in raw_attributes:
        src_table = dbschema.lookup_attribute(attribute, candidate_tables)
        bindings.append(AttributeRef(src_table, attribute))
    return bindings

In [16]:
q = df_subqueries.iloc[0].mosp_query
q

{'select': {'value': {'count': '*'}},
 'from': [{'value': 'movie_companies', 'name': 'mc'},
  {'join': {'value': 'company_type', 'name': 'ct'},
   'on': {'and': [{'eq': ['ct.kind', {'literal': 'production companies'}]},
     {'eq': ['ct.id', 'mc.company_type_id']},
     {'not_like': ['mc.note',
       {'literal': '%(as metro-goldwyn-mayer pictures)%'}]},
     {'or': [{'like': ['mc.note', {'literal': '%(co-production)%'}]},
       {'like': ['mc.note', {'literal': '%(presents)%'}]}]}]}},
  {'join': {'value': 'title', 'name': 't'},
   'on': {'eq': ['t.id', 'mc.movie_id']}},
  {'join': {'value': {'select': {'value': 'movie_id'},
     'from': [{'value': 'movie_info_idx', 'name': 'mi_idx'},
      {'join': {'value': 'info_type', 'name': 'it'},
       'on': {'and': [{'eq': ['it.info', {'literal': 'top 250 rank'}]},
         {'eq': ['it.id', 'mi_idx.info_type_id']}]}}]},
    'name': 't_mi_idx'},
   'on': {'eq': ['t_mi_idx.movie_id', 'mc.movie_id']}}]}

In [17]:
def head(lst):
    if not len(lst):
        raise ValueError("List is empty")
    return lst[0]

def unwrap_singular_dict(d, *, target="value"):
    if target == "value":
        vals = list(d.values())
        if len(vals) > 1:
            raise ValueError("Not a singluar dict:", d)
        return head(vals)
    elif target == "key":
        if len(d) > 1:
            raise ValueError("Not a singular dict:", d)
        return head(list(d.keys()))
    else:
        raise KeyError("Unknown target: " + target)

In [18]:
class MospQuery:
    def __init__(self, query):
        self.query = query
        
    def joins(self):
        from_clause = self.query["from"]
        return [clause for clause in from_clause if "join" in clause]

In [19]:
class MospPredicateHandler:
    def __init__(self, predicate):
        self.predicate = predicate
        self.operation = unwrap_singular_dict(predicate, target="key")
        self.arguments = self.predicate[self.operation]
        
    def attributes(self):
        return [arg for arg in self.arguments if isinstance(arg, str)]

In [28]:
def filter_predicates(predicate_tree, base_table, joins_only=False):
    if isinstance(predicate_tree, list):
        filtered = [filter_predicates(subtree, base_table, joins_only) for subtree in predicate_tree]
        return None if all([f is None for f in filtered]) else filtered
    elif isinstance(predicate_tree, dict):
        operation = unwrap_singular_dict(predicate_tree, target="key")
        if operation.lower() in ["and", "or", "not"]:
            subtree = predicate_tree[operation]
            filtered = filter_predicates(subtree, base_table, joins_only)
            return None if filtered is None else {operation: filtered}
        else:
            attributes = MospPredicateHandler(predicate_tree).attributes()
            res = None
            if not any(attr_is_from_table(base_table, attr) for attr in attributes):
                res = None
            elif joins_only and len(attributes) >= 2:
                res = predicate_tree
            elif not joins_only and len(attributes) == 1:
                res = predicate_tree
            else:
                res = None
            return res
    else:
        raise TypeError("Unknown predicate tree type: " + str(predicate_tree))

In [21]:
def extract_filter_predicates(predicate_tree, base_table):
    # check if we moved down the conjunction/disjunction tree as far as possible
    if isinstance(predicate_tree, list):
        # perform the actual filtering
        matching_predicates = []
        for pred in predicate_tree:
            # each predicate is a dict with just a single key: the comparison operation
            # this slightly weird structure needs to be unwrapped
            # e.g. {'eq': ['it.info', {'literal': 'top 250 rank'}]}
            
            if isinstance(pred, dict):
                operation = unwrap_singular_dict(pred, target="key")
                matching_predicates.append({operation: extract_filter_predicates(pred, base_table)})
                continue
                
            pred_source = unwrap_singular_dict(pred)[0]
            if not pred_source.startswith(base_table.alias + "."):
                continue
            pred_target = unwrap_singular_dict(pred)[1]
            
            # predicates such as customer.last_login_date > customer.last_purchase_date
            if isinstance(pred_target, str) and pred_target.startswith(base_table.alias + "."):
                matching_predicates.append(pred)
            # predicates such as customer.last_login_date > 2022-01-01
            elif isinstance(pred_target, dict) and "literal" in pred_target:
                matching_predicates.append(pred)
            # join predicates such as customer.id = buyer.id
            else:
                continue

        return matching_predicates
    elif isinstance(predicate_tree, dict):
        operation = unwrap_singular_dict(predicate_tree, target="key")
        if operation == "eq":
            pred_source, pred_target = predicate_tree["eq"]
            if attr_is_from_table(base_table, pred_source) or attr_is_from_table(base_table, pred_target):
                return predicate_tree
            else:
                return {"eq": []}
        return {operation: extract_filter_predicates(predicate_tree[operation], base_table)}
    else:
        raise ValueError("Unknown predicate tree structure: " + predicate_tree)

In [22]:
def extract_join_predicates(predicate_tree, base_table):
    # check if we moved down the conjunction/disjunction tree as far as possible
    if isinstance(predicate_tree, list):
        matching_predicates = []
        for pred in predicate_tree:
            pred_source = unwrap_singular_dict(pred)[0]
            pred_target = unwrap_singular_dict(pred)[1]
            if not isinstance(pred_target, str):
                continue
            if attr_is_from_table(base_table, pred_source) or attr_is_from_table(base_table, pred_target):
                matching_predicates.append(pred)
        return matching_predicates
    elif isinstance(predicate_tree, dict):
        operation = unwrap_singular_dict(predicate_tree, target="key")
        if operation == "eq":
            pred_source, pred_target = predicate_tree["eq"]
            if attr_is_from_table(base_table, pred_source) or attr_is_from_table(base_table, pred_target):
                return predicate_tree
            else:
                return {"eq": []}
        else:
            extracted_subtree = extract_join_predicates(predicate_tree[operation], base_table)
            return {operation: extracted_subtree}
    else:
        raise ValueError("Unknown predicate tree structure: " + predicate_tree)

In [23]:
def prune_predicate_tree(predicate_tree):
    if predicate_tree is None:
        return None

    if isinstance(predicate_tree, list):
        print("Found nested list:", predicate_tree)
        return None # TODO appropriate reaction
    elif isinstance(predicate_tree, dict):
        operation = unwrap_singular_dict(predicate_tree, target="key")
        subtree = predicate_tree[operation]
        if not subtree:
            return None
        
        if operation.lower() in ["and", "or", "not"]:
            pruned_subtree = [prune_predicate_tree(pred) for pred in subtree]
            pruned_subtree = [pred for pred in pruned_subtree if pred is not None]
            if not pruned_subtree:
                return None
            elif len(pruned_subtree) == 1 and operation.lower() != "not":
                return pruned_subtree
            else:
                return {operation: pruned_subtree}
        return predicate_tree    
    else:
        raise TypeError("Unknown predicate tree type: " + str(predicate_tree))

In [24]:
q["from"][3]["join"]["value"]["from"][1]["on"]

{'and': [{'eq': ['it.info', {'literal': 'top 250 rank'}]},
  {'eq': ['it.id', 'mi_idx.info_type_id']}]}

In [25]:
def rewrite_predicate_to_subquery(pred, base_table, join_table):
    operation = unwrap_singular_dict(pred, target="key")
    predicate_components = pred[operation]
    rewritten_pred = list(predicate_components)
    for elem_idx, elem in enumerate(predicate_components):
        if attr_is_from_table(base_table, elem):
            rewritten_pred[elem_idx] = elem
        else:
            join_tab, attr_name = elem.split(".")
            rewritten_pred[elem_idx] = f"{join_table.alias}.{attr_name}"
    return {operation: rewritten_pred}

In [69]:
q = df_expl.loc[56].mosp_query
q

{'select': {'value': {'count': '*'}},
 'from': [{'value': 'aka_name', 'name': 'a1'},
  {'join': {'value': 'name', 'name': 'n1'},
   'on': {'eq': ['a1.person_id', 'n1.id']}},
  {'join': {'value': {'select': [{'value': 'person_id'},
      {'value': 'movie_id'}],
     'from': [{'value': 'cast_info', 'name': 'ci'},
      {'join': {'value': 'role_type', 'name': 'rt'},
       'on': {'and': [{'eq': ['rt.role', {'literal': 'writer'}]},
         {'eq': ['ci.role_id', 'rt.id']}]}},
      {'join': {'value': 'title', 'name': 't'},
       'on': {'eq': ['ci.movie_id', 't.id']}}]},
    'name': 't_ci'},
   'on': {'eq': ['t_ci.person_id', 'a1.person_id']}},
  {'join': {'value': 'movie_companies', 'name': 'mc'},
   'on': {'eq': ['mc.movie_id', 't_ci.movie_id']}},
  {'join': {'value': 'company_name', 'name': 'cn'},
   'on': {'and': [{'eq': ['cn.country_code', {'literal': '[us]'}]},
     {'eq': ['mc.company_id', 'cn.id']}]}}]}

In [85]:
class FlattenedQuery:
    def __init__(self, base_table: TableRef, dbschema: DBSchema):
        self.base_table = base_table
        self.dbschema = dbschema
        self.joins = list()
        self.virtual_tables = list() # these are the table names that only exist due to being named as a result of a subquery
        
    def absorb_join(self, join):
        """Includes a join statement in the query. Could be either a subquery, or a plain join."""
        join_data = join["join"]
        join_predicate = join["on"]
        print("Found join pred:", join_predicate)
        
        # sanity check
        if isinstance(join_data["value"], dict) and not "select" in join_data["value"]:
            warnings.warn("Unknown query structure:", join)
            return
        
        # just a plain join?
        if not isinstance(join_data["value"], dict):
            joined_table = TableRef(join_data["value"], join_data["name"])
            join_predicate = self._rewrite_join_predicate(join_predicate)
            join_stmt = JoinStatement(joined_table, join_predicate)
            self.joins.append(join_stmt)
            print("==> Just a plain join!")
            return
        
        # at this point, we know we found a subquery
        # we are going to break up its structure first, and gather the relevant data secondly
        subquery = join_data["value"]
        sq_projection = subquery["select"]
        sq_tables = subquery["from"]
        sq_target = join_data["name"]
        
        table_refs = extract_table_references(sq_tables)
        self.virtual_tables[sq_target] = list(table_refs)
        attribute_refs = bind_attributes(collect_attributes(sq_projection), table_refs, dbschema=self.dbschema)
        for table_idx, table in enumerate(table_refs):
            sq_join_predicate = []
            sq_join_filters = []
            for join in [join["on"] for join in sq_tables if "join" in join]:
                extracted_predicates = filter_predicates(join, table, joins_only=True)
                pruned_predicates = prune_predicate_tree(extracted_predicates)
                if pruned_predicates:
                    sq_join_predicate.append(pruned_predicates)
                
                extracted_filters = filter_predicates(join, table)
                pruned_filters = prune_predicate_tree(extracted_filters)
                if pruned_filters:
                    sq_join_filters.append(pruned_filters)
            
            # At this point, we know which filter conditions apply to the join, as well as
            # which predicates are needed to carry out the acual join.
            # Now, we "just" need to determine, whether the join conditions are already
            # sufficient b/c they reference the base table (even if indirectly), or whether we
            # need to introduce a new surrogate predicate source to complete the link
            
            # use the join predicate on the subquery?: if the subquery attribute belongs to
            # this table, we found the candidate!
            if table_idx == 0:
                sq_join_predicate = rewrite_predicate_to_subquery(join_predicate, self.base_table, table)
                print("Rewrote join pred to", sq_join_predicate)
            join_stmt = JoinStatement(table, sq_join_filters)
            join_stmt.expand(sq_join_predicate)
            self.joins.append(join_stmt)
            
    def _rewrite_join_predicate(self):
        pass
    
    def __repr__(self):
        return str(self)
    
    def __str__(self):
        return f"{self.base_table} {self.joins}"
    
def flatten_query(mosp_query):
    from_clause = mosp_query["from"]
    flattened_query = None
    for table in from_clause:
        # extract the base table name and proceed with the joined tables
        if not isinstance(table, dict) or "join" not in table:
            table_ref = TableRef(table["value"], table["name"])
            flattened_query = FlattenedQuery(table_ref, DBSchema(cur))
            continue
        else:
            print(".. Absorbing join", table)
            flattened_query.absorb_join(table)
            
    return flattened_query

In [86]:
q

{'select': {'value': {'count': '*'}},
 'from': [{'value': 'aka_name', 'name': 'a1'},
  {'join': {'value': 'name', 'name': 'n1'},
   'on': {'eq': ['a1.person_id', 'n1.id']}},
  {'join': {'value': {'select': [{'value': 'person_id'},
      {'value': 'movie_id'}],
     'from': [{'value': 'cast_info', 'name': 'ci'},
      {'join': {'value': 'role_type', 'name': 'rt'},
       'on': {'and': [{'eq': ['rt.role', {'literal': 'writer'}]},
         {'eq': ['ci.role_id', 'rt.id']}]}},
      {'join': {'value': 'title', 'name': 't'},
       'on': {'eq': ['ci.movie_id', 't.id']}}]},
    'name': 't_ci'},
   'on': {'eq': ['t_ci.person_id', 'a1.person_id']}},
  {'join': {'value': 'movie_companies', 'name': 'mc'},
   'on': {'eq': ['mc.movie_id', 't_ci.movie_id']}},
  {'join': {'value': 'company_name', 'name': 'cn'},
   'on': {'and': [{'eq': ['cn.country_code', {'literal': '[us]'}]},
     {'eq': ['mc.company_id', 'cn.id']}]}}]}

In [87]:
fq = flatten_query(q)
fq

.. Absorbing join {'join': {'value': 'name', 'name': 'n1'}, 'on': {'eq': ['a1.person_id', 'n1.id']}}
Found join pred: {'eq': ['a1.person_id', 'n1.id']}
==> Just a plain join!
.. Absorbing join {'join': {'value': {'select': [{'value': 'person_id'}, {'value': 'movie_id'}], 'from': [{'value': 'cast_info', 'name': 'ci'}, {'join': {'value': 'role_type', 'name': 'rt'}, 'on': {'and': [{'eq': ['rt.role', {'literal': 'writer'}]}, {'eq': ['ci.role_id', 'rt.id']}]}}, {'join': {'value': 'title', 'name': 't'}, 'on': {'eq': ['ci.movie_id', 't.id']}}]}, 'name': 't_ci'}, 'on': {'eq': ['t_ci.person_id', 'a1.person_id']}}
Found join pred: {'eq': ['t_ci.person_id', 'a1.person_id']}
Rewrote join pred to {'eq': ['ci.person_id', 'a1.person_id']}
.. Absorbing join {'join': {'value': 'movie_companies', 'name': 'mc'}, 'on': {'eq': ['mc.movie_id', 't_ci.movie_id']}}
Found join pred: {'eq': ['mc.movie_id', 't_ci.movie_id']}
==> Just a plain join!
.. Absorbing join {'join': {'value': 'company_name', 'name': 'cn'}

aka_name AS a1 [JOIN name AS n1 ON {'eq': ['a1.person_id', 'n1.id']}, JOIN cast_info AS ci ON {'eq': ['ci.person_id', 'a1.person_id']}, JOIN role_type AS rt ON {'and': [[[{'eq': ['rt.role', {'literal': 'writer'}]}]], [[{'eq': ['ci.role_id', 'rt.id']}]]]}, JOIN title AS t ON [{'eq': ['ci.movie_id', 't.id']}], JOIN movie_companies AS mc ON {'eq': ['mc.movie_id', 't_ci.movie_id']}, JOIN company_name AS cn ON {'and': [{'eq': ['cn.country_code', {'literal': '[us]'}]}, {'eq': ['mc.company_id', 'cn.id']}]}]

In [31]:
q

{'select': {'value': {'count': '*'}},
 'from': [{'value': 'movie_companies', 'name': 'mc'},
  {'join': {'value': 'company_type', 'name': 'ct'},
   'on': {'and': [{'eq': ['ct.kind', {'literal': 'production companies'}]},
     {'eq': ['ct.id', 'mc.company_type_id']},
     {'not_like': ['mc.note',
       {'literal': '%(as metro-goldwyn-mayer pictures)%'}]},
     {'or': [{'like': ['mc.note', {'literal': '%(co-production)%'}]},
       {'like': ['mc.note', {'literal': '%(presents)%'}]}]}]}},
  {'join': {'value': 'title', 'name': 't'},
   'on': {'eq': ['t.id', 'mc.movie_id']}},
  {'join': {'value': {'select': {'value': 'movie_id'},
     'from': [{'value': 'movie_info_idx', 'name': 'mi_idx'},
      {'join': {'value': 'info_type', 'name': 'it'},
       'on': {'and': [{'eq': ['it.info', {'literal': 'top 250 rank'}]},
         {'eq': ['it.id', 'mi_idx.info_type_id']}]}}]},
    'name': 't_mi_idx'},
   'on': {'eq': ['t_mi_idx.movie_id', 'mc.movie_id']}}]}

In [32]:
def tableref_to_mosp(table):
    return {"value": table.full_name, "name": table.alias}

In [33]:
def flattened_query_to_mosp(flattened_query):
    mosp_query = {}
    select_clause = {"value": {"count": "*"}}
    mosp_query["select"] = select_clause
    from_clause = [tableref_to_mosp(flattened_query.base_table)]
    for join in flattened_query.joins:
        mosp_join = {"join": tableref_to_mosp(join.target),
                     "on": join.on}
        from_clause.append(mosp_join)
    mosp_query["from"] = from_clause
    return mosp_query

In [34]:
mosp_fq = flattened_query_to_mosp(fq)
mosp_fq

{'select': {'value': {'count': '*'}},
 'from': [{'value': 'movie_companies', 'name': 'mc'},
  {'join': {'value': 'company_type', 'name': 'ct'},
   'on': {'and': [{'eq': ['ct.kind', {'literal': 'production companies'}]},
     {'eq': ['ct.id', 'mc.company_type_id']},
     {'not_like': ['mc.note',
       {'literal': '%(as metro-goldwyn-mayer pictures)%'}]},
     {'or': [{'like': ['mc.note', {'literal': '%(co-production)%'}]},
       {'like': ['mc.note', {'literal': '%(presents)%'}]}]}]}},
  {'join': {'value': 'title', 'name': 't'},
   'on': {'eq': ['t.id', 'mc.movie_id']}},
  {'join': {'value': 'movie_info_idx', 'name': 'mi_idx'},
   'on': {'eq': ['mi_idx.movie_id', 'mc.movie_id']}},
  {'join': {'value': 'info_type', 'name': 'it'},
   'on': {'and': [[[{'eq': ['it.info', {'literal': 'top 250 rank'}]}]],
     [[{'eq': ['it.id', 'mi_idx.info_type_id']}]]]}}]}

In [35]:
fq_str = mosp.format(mosp_fq)
fq_str

"SELECT COUNT(*) FROM movie_companies AS mc JOIN company_type AS ct ON ct.kind = 'production companies' AND ct.id = mc.company_type_id AND mc.note NOT LIKE '%(as metro-goldwyn-mayer pictures)%' AND (mc.note LIKE '%(co-production)%' OR mc.note LIKE '%(presents)%') JOIN title AS t ON t.id = mc.movie_id JOIN movie_info_idx AS mi_idx ON mi_idx.movie_id = mc.movie_id JOIN info_type AS it ON ((it.info = 'top 250 rank')) AND ((it.id = mi_idx.info_type_id))"

In [36]:
mosp.format(q)

"SELECT COUNT(*) FROM movie_companies AS mc JOIN company_type AS ct ON ct.kind = 'production companies' AND ct.id = mc.company_type_id AND mc.note NOT LIKE '%(as metro-goldwyn-mayer pictures)%' AND (mc.note LIKE '%(co-production)%' OR mc.note LIKE '%(presents)%') JOIN title AS t ON t.id = mc.movie_id JOIN (SELECT movie_id FROM movie_info_idx AS mi_idx JOIN info_type AS it ON it.info = 'top 250 rank' AND it.id = mi_idx.info_type_id) AS t_mi_idx ON t_mi_idx.movie_id = mc.movie_id"

In [37]:
cur.execute(mosp.format(q))
cur.fetchall()

[(143,)]

In [38]:
cur.execute(mosp.format(mosp_fq))
cur.fetchall()

[(143,)]

In [39]:
q_raw

"select count(*) from  movie_companies as mc  join company_type as ct on (ct.kind = 'production companies' and ct.id = mc.company_type_id and mc.note not like '%(as metro-goldwyn-mayer pictures)%' and (mc.note like '%(co-production)%'  or mc.note like '%(presents)%')) join title as t on (t.id = mc.movie_id) join  (select movie_id from movie_info_idx as mi_idx  join info_type as it on (it.info = 'top 250 rank' and it.id = mi_idx.info_type_id)) as t_mi_idx  on(t_mi_idx.movie_id = mc.movie_id);"

In [50]:
df_expl["flattened_query"] = df_expl.mosp_query.apply(flatten_query)
df_expl["flattened_query_str"] = df_expl.flattened_query.apply(flattened_query_to_mosp).apply(mosp.format)

In [51]:
df_expl

Unnamed: 0,label,workload,query,mosp_query,flattened_query,flattened_query_str,query_res
0,1a,explicit,select count(*) from movie_companies as mc j...,"{'select': {'value': {'count': '*'}}, 'from': ...",movie_companies AS mc [JOIN company_type AS ct...,SELECT COUNT(*) FROM movie_companies AS mc JOI...,143
2,1b,explicit,select count(*) from movie_info_idx as mi_idx...,"{'select': {'value': {'count': '*'}}, 'from': ...",movie_info_idx AS mi_idx [JOIN info_type AS it...,SELECT COUNT(*) FROM movie_info_idx AS mi_idx ...,3
4,1c,explicit,select count(*) from movie_companies as mc j...,"{'select': {'value': {'count': '*'}}, 'from': ...",movie_companies AS mc [JOIN company_type AS ct...,SELECT COUNT(*) FROM movie_companies AS mc JOI...,3
6,1d,explicit,select count(*) from movie_info_idx as mi_idx...,"{'select': {'value': {'count': '*'}}, 'from': ...",movie_info_idx AS mi_idx [JOIN info_type AS it...,SELECT COUNT(*) FROM movie_info_idx AS mi_idx ...,4
8,2a,explicit,select count(*) from movie_keyword as mk joi...,"{'select': {'value': {'count': '*'}}, 'from': ...",movie_keyword AS mk [JOIN keyword AS k ON {'an...,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,7834
...,...,...,...,...,...,...,...
216,32a,explicit,select count(*) from movie_link as ml join l...,"{'select': {'value': {'count': '*'}}, 'from': ...",movie_link AS ml [JOIN link_type AS lt ON {'eq...,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,0
218,32b,explicit,select count(*) from movie_link as ml join l...,"{'select': {'value': {'count': '*'}}, 'from': ...",movie_link AS ml [JOIN link_type AS lt ON {'eq...,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,4388
220,33a,explicit,select count(*) from movie_link as ml join l...,"{'select': {'value': {'count': '*'}}, 'from': ...",movie_link AS ml [JOIN link_type AS lt ON {'an...,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,8
222,33b,explicit,select count(*) from movie_link as ml join l...,"{'select': {'value': {'count': '*'}}, 'from': ...",movie_link AS ml [JOIN link_type AS lt ON {'an...,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,4


In [54]:
def execute_query(q):
    cur.execute(q)
    res = cur.fetchone()[0]
    return res

In [44]:
df_expl["query_res"] = df_expl["query"].apply(execute_query)

In [66]:
df_expl.iloc[0]["query"]

"select count(*) from  movie_companies as mc  join company_type as ct on (ct.kind = 'production companies' and ct.id = mc.company_type_id and mc.note not like '%(as metro-goldwyn-mayer pictures)%' and (mc.note like '%(co-production)%'  or mc.note like '%(presents)%')) join title as t on (t.id = mc.movie_id) join  (select movie_id from movie_info_idx as mi_idx  join info_type as it on (it.info = 'top 250 rank' and it.id = mi_idx.info_type_id)) as t_mi_idx  on(t_mi_idx.movie_id = mc.movie_id);"

In [67]:
df_expl[df_expl.flattened_query_str.str.contains("t_ci")]

Unnamed: 0,label,workload,query,mosp_query,flattened_query,flattened_query_str,query_res
56,8c,explicit,select count(*) from aka_name as a1 join nam...,"{'select': {'value': {'count': '*'}}, 'from': ...",aka_name AS a1 [JOIN name AS n1 ON {'eq': ['a1...,SELECT COUNT(*) FROM aka_name AS a1 JOIN name ...,2487611
58,8d,explicit,select count(*) from aka_name as an1 join na...,"{'select': {'value': {'count': '*'}}, 'from': ...",aka_name AS an1 [JOIN name AS n1 ON {'eq': ['a...,SELECT COUNT(*) FROM aka_name AS an1 JOIN name...,323005


In [68]:
df_expl.loc[56]

label                                                                 8c
workload                                                        explicit
query                  select count(*) from  aka_name as a1  join nam...
mosp_query             {'select': {'value': {'count': '*'}}, 'from': ...
flattened_query        aka_name AS a1 [JOIN name AS n1 ON {'eq': ['a1...
flattened_query_str    SELECT COUNT(*) FROM aka_name AS a1 JOIN name ...
query_res                                                        2487611
Name: 56, dtype: object

In [55]:
df_expl["flattened_res"] = df_expl.flattened_query_str.apply(execute_query)

UndefinedTable: missing FROM-clause entry for table "t_ci"
LINE 1: ...t.id) JOIN movie_companies AS mc ON mc.movie_id = t_ci.movie...
                                                             ^


## Regression analysis

First up, based on actual results:

In [None]:
regressions = []
regression_found = False
for q_idx, query in enumerate(df_queries.itertuples()):
    orig_query = query.query
    cur.execute(orig_query)
    orig_card = cur.fetchone()[0]
    mosp_query = mosp.format(query.mosp_query)
    cur.execute(mosp_query)
    mosp_card = cur.fetchone()[0]
    if q_idx % 50 == 0:
        print("Now at query", q_idx+1)
    if orig_card != mosp_card:
        regression_found = True
        regressions.append(query.label)
        print("= Regression found for query", query.label, "Orig:", orig_card, "MOSP:", mosp_card)
        
if not regression_found:
    print("== All tests succeeded! ==")
else:
    print("== Regressions found: ==")
    print(regressions)

Secondly, based on query plans:

In [None]:
def extract_plan_nodes(plan):
    node_type = plan["Node Type"]
    node_filter = ""
    if "Join Filter" in plan:
        node_filter = plan["Join Filter"]
    elif "Hash Cond" in plan:
        node_filter = plan["Hash Cond"]
    elif "Filter" in plan and "Index Cond" in plan:
        node_filter = plan["Index Cond"] + " // " + plan["Filter"]
    elif "Filter" in plan:
        node_filter = plan["Filter"]
    elif "Index Cond" in plan:
        node_filter = plan["Index Cond"]
    
    nodes = [node_type]
    if node_filter:
        nodes = [f"{node_type} :: {node_filter}"]
    
    for subplan in plan.get("Plans", []):
        nodes.extend(extract_plan_nodes(subplan))
    return nodes

In [None]:
regressions = []
regression_found = False
for q_idx, query in enumerate(df_queries.itertuples()):
    orig_query = query.query
    cur.execute("explain (format json) " + orig_query)
    orig_plan = cur.fetchone()[0][0]["Plan"]
    orig_nodes = extract_plan_nodes(orig_plan)
    
    mosp_query = mosp.format(query.mosp_query)
    cur.execute("explain (format json) " + mosp_query)
    mosp_plan = cur.fetchone()[0][0]["Plan"]
    mosp_nodes = extract_plan_nodes(mosp_plan)
    
    if q_idx % 50 == 0:
        print("Now at query", q_idx+1)
    if orig_nodes != mosp_nodes:
        regression_found = True
        regressions.append(query.label)
        print("= Regression found for query", query.label, "Orig:", orig_nodes, "MOSP:", mosp_nodes)
        
if not regression_found:
    print("== All tests succeeded! ==")
else:
    print("== Regressions found: ==")
    print(regressions)

In [8]:
import pandas as pd

In [14]:
df_regressions = pd.read_csv("workloads/job-ues-flattened-regressions.csv")
df_regressions

Unnamed: 0,label,query,flattened_query,card_query,card_flattened_query,passed
0,1a,select count(*) from movie_companies as mc j...,SELECT COUNT(*) FROM movie_companies AS mc JOI...,143,143,True
1,1b,select count(*) from movie_info_idx as mi_idx...,SELECT COUNT(*) FROM movie_info_idx AS mi_idx ...,3,3,True
2,1c,select count(*) from movie_companies as mc j...,SELECT COUNT(*) FROM movie_companies AS mc JOI...,3,3,True
3,1d,select count(*) from movie_info_idx as mi_idx...,SELECT COUNT(*) FROM movie_info_idx AS mi_idx ...,4,4,True
4,2a,select count(*) from movie_keyword as mk joi...,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...,7834,7834,True
...,...,...,...,...,...,...
108,32a,select count(*) from movie_link as ml join l...,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,0,0,True
109,32b,select count(*) from movie_link as ml join l...,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,4388,4388,True
110,33a,select count(*) from movie_link as ml join l...,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,8,8,True
111,33b,select count(*) from movie_link as ml join l...,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...,4,4,True


In [16]:
df_regressions[~df_regressions.passed]

Unnamed: 0,label,query,flattened_query,card_query,card_flattened_query,passed


## Transformation analysis

In [1]:
import random

import pandas as pd

In [17]:
df = pd.read_csv("workloads/job-ues-flattened.csv")

In [18]:
test_queries = df.loc[random.choices(df.index, k=15)]
test_queries

Unnamed: 0,label,query,flattened_query
65,18a,select count(*) from movie_info_idx as mi_idx...,SELECT COUNT(*) FROM movie_info_idx AS mi_idx ...
96,28a,select count(*) from complete_cast as cc joi...,SELECT COUNT(*) FROM complete_cast AS cc JOIN ...
23,7a,select count(*) from person_info as pi join ...,SELECT COUNT(*) FROM person_info AS pi JOIN in...
45,13b,select count(*) from movie_info_idx as miidx ...,SELECT COUNT(*) FROM movie_info_idx AS miidx J...
111,33b,select count(*) from movie_link as ml join l...,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...
44,13a,select count(*) from movie_info_idx as miidx ...,SELECT COUNT(*) FROM movie_info_idx AS miidx J...
111,33b,select count(*) from movie_link as ml join l...,SELECT COUNT(*) FROM movie_link AS ml JOIN lin...
18,6b,select count(*) from movie_keyword as mk joi...,SELECT COUNT(*) FROM movie_keyword AS mk JOIN ...
33,9d,select count(*) from aka_name as an join nam...,SELECT COUNT(*) FROM aka_name AS an JOIN name ...
14,5a,select count(*) from movie_companies as mc j...,SELECT COUNT(*) FROM movie_companies AS mc JOI...


In [19]:
for __, test_query in test_queries.iterrows():
    label = test_query["label"]
    orig_query = test_query["query"]
    transformed_query = test_query["flattened_query"]
    print(" ::", label)
    print(orig_query)
    print("---")
    print(transformed_query)
    print("=====")

 :: 18a
select count(*) from  movie_info_idx as mi_idx  join info_type as it2 on (it2.info = 'votes' and it2.id = mi_idx.info_type_id) join title as t on (t.id = mi_idx.movie_id) join  (select person_id, movie_id from cast_info as ci  join name as n on (n.gender = 'm' and n.name like '%tim%' and n.id = ci.person_id and ci.note in ('(producer)', '(executive producer)'))) as t_ci  on(t_ci.movie_id = mi_idx.movie_id) join  (select movie_id from movie_info as mi  join info_type as it1 on (it1.info = 'budget' and it1.id = mi.info_type_id)) as t_mi  on(t_mi.movie_id = t_ci.movie_id);
---
SELECT COUNT(*) FROM movie_info_idx AS mi_idx JOIN info_type AS it2 ON it2.info = 'votes' AND it2.id = mi_idx.info_type_id JOIN title AS t ON t.id = mi_idx.movie_id JOIN cast_info AS ci ON ci.movie_id = mi_idx.movie_id JOIN name AS n ON n.gender = 'm' AND n.name LIKE '%tim%' AND n.id = ci.person_id AND ci.note IN ('(producer)', '(executive producer)') JOIN movie_info AS mi ON mi.movie_id = ci.movie_id JOIN i