# query optimization

the process of query optimization is very similar to the process of refactoring.

the goal is to make the query more efficient without changing its behavior.

so ideally you should write TDD-style tests for your queries before you start optimizing them, to avoid any unexpected behavior changes.

in this notebook we will go through some common query optimization techniques based on examples.

In [2]:
!chmod +x ./reset.sh && ./reset.sh > /dev/null
!psql postgres -c "reset all;"

Did not find any relations.
psql: error: connection to server on socket "/tmp/.s.PGSQL.5432" failed: FATAL:  database "sueszli" does not exist
ERROR:  cannot drop the currently open database
ERROR:  current user cannot be dropped
RESET


In [70]:
import json
import os
import subprocess
import sys

def equals_query(query1: str, query2: str, prefix: str="") -> bool:
    """Return whether two queries print the same rows, ignoring row order.

    Each query is run as ``psql postgres -c "<prefix> <query>;"`` and the
    captured outputs are compared as sorted lists of lines, so queries that
    differ only in their ``ORDER BY`` still compare as equal.

    Args:
        query1: First SQL statement; a ``;`` is appended automatically.
        query2: Second SQL statement; a ``;`` is appended automatically.
        prefix: Text placed in front of each query inside the same ``-c``
            command string.

    Returns:
        True if both outputs have the same total length and the same lines
        once sorted, False otherwise.

    Raises:
        subprocess.CalledProcessError: If psql exits with a non-zero status
            for either query.
    """
    def run(query: str) -> str:
        # An argument list (no shell) hands the SQL to psql verbatim, so
        # double quotes, `$` or backticks in a query cannot break the command.
        # check=True keeps two failing queries -- which both print nothing on
        # stdout -- from being reported as equal.
        completed = subprocess.run(
            ["psql", "postgres", "-c", f"{prefix} {query};"],
            stdout=subprocess.PIPE,
            text=True,
            check=True,
        )
        return completed.stdout

    output1 = run(query1)
    output2 = run(query2)

    # cheap check first: outputs of different size cannot hold the same rows
    if len(output1) != len(output2):
        return False

    # sorting the lines makes the comparison independent of the row order
    return sorted(output1.splitlines()) == sorted(output2.splitlines())

def print_simple_execution_plan(query: str, prefix: str="") -> None:
    """Print the ``EXPLAIN ANALYZE`` plan of ``query`` with all optional details off.

    The statement is run through ``psql postgres -c`` and whatever psql
    writes to stdout is printed. Because ``analyze`` is requested, the query
    is actually executed.

    Args:
        query: SQL statement to explain; a ``;`` is appended automatically.
        prefix: Text placed in front of the ``explain`` statement inside the
            same ``-c`` command string.
    """
    explain = "explain (analyze, verbose off, costs off, settings off, generic_plan off, buffers off, wal off, timing off, summary off)"
    # An argument list (no shell) hands the SQL to psql verbatim, so double
    # quotes, `$` or backticks in the query cannot break the command.
    completed = subprocess.run(
        ["psql", "postgres", "-c", f"{prefix} {explain} {query};"],
        stdout=subprocess.PIPE,
        text=True,
    )
    print(completed.stdout)

def print_verbose_execution_plan(query: str, prefix: str="") -> None:
    """Print the ``EXPLAIN ANALYZE`` plan of ``query`` with all optional details on.

    The statement is run through ``psql postgres -c`` and whatever psql
    writes to stdout is printed. Because ``analyze`` is requested, the query
    is actually executed.

    Args:
        query: SQL statement to explain; a ``;`` is appended automatically.
        prefix: Text placed in front of the ``explain`` statement inside the
            same ``-c`` command string.
    """
    explain = "explain (analyze, verbose, costs, settings, buffers, wal, timing, summary)"
    # An argument list (no shell) hands the SQL to psql verbatim, so double
    # quotes, `$` or backticks in the query cannot break the command.
    completed = subprocess.run(
        ["psql", "postgres", "-c", f"{prefix} {explain} {query};"],
        stdout=subprocess.PIPE,
        text=True,
    )
    print(completed.stdout)

def print_avg_time(query: str, prefix: str="", iters:int=10) -> None:
    """Run ``query`` under ``EXPLAIN ANALYZE`` several times and print the mean execution time.

    Every iteration executes the query through ``psql postgres -c`` with
    ``format json`` and reads the ``"Execution Time"`` entry of the returned
    plan. A progress counter is written to stdout while the runs are going.

    Args:
        query: SQL statement to measure; a ``;`` is appended automatically.
        prefix: Text placed in front of the ``explain`` statement inside the
            same ``-c`` command string.
        iters: Number of measurements to average; must be at least 1.

    Raises:
        ValueError: If ``iters`` is smaller than 1, or if psql's output does
            not contain a JSON array.
        subprocess.CalledProcessError: If psql exits with a non-zero status.
    """
    # Without this guard iters == 0 ended in a ZeroDivisionError and negative
    # values printed a meaningless average.
    if iters < 1:
        raise ValueError(f"iters must be at least 1, got {iters}")

    def get_exec_time(query: str, prefix: str="") -> float:
        # -q drops command tags, -A the table alignment (and its `+` line
        # continuation markers), -t the header and row-count footer, so
        # stdout carries the JSON document itself instead of a rendered table.
        # The argument list (no shell) hands the SQL to psql verbatim.
        completed = subprocess.run(
            [
                "psql",
                "postgres",
                "-qAt",
                "-c",
                f"{prefix} explain (analyze, verbose, costs, settings, buffers, wal, timing, summary, format json) {query};",
            ],
            stdout=subprocess.PIPE,
            text=True,
            check=True,
        )
        raw = completed.stdout
        # keep only the outermost JSON array in case anything else was printed
        plans = json.loads(raw[raw.index("["):raw.rindex("]") + 1])
        return float(plans[0]["Execution Time"])

    vals = []
    # count down the runs that are still outstanding after each measurement
    for remaining in range(iters - 1, -1, -1):
        vals.append(get_exec_time(query, prefix))
        sys.stdout.write(f"\r{remaining} iterations left")
        # move back to the line start so the next write overwrites the counter
        sys.stdout.write("\r")
        sys.stdout.flush()
    print(f"average execution time in {iters} interations: {sum(vals) / iters:.2f} ms")

# demo: both queries return the same rows, the second one merely sorted,
# so the order-insensitive comparison has to accept them as equal
demo_q1 = "SELECT a,b,c FROM r NATURAL JOIN s NATURAL JOIN t"
demo_q2 = f"{demo_q1} ORDER BY a,b,c"
assert equals_query(demo_q1, demo_q2)
# uncomment to inspect the execution plans:
# print_simple_execution_plan(demo_q1)
# print_verbose_execution_plan(demo_q1)
print_avg_time(demo_q1)


average execution time in 10 interations: 325.58 ms
