# TPCDS: Query Performance Test Script

In [1]:
import sys, os
import psycopg2
import numpy as np
import pandas as pd
from psycopg2 import Error
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
from datetime import datetime
from IPython.display import clear_output

In [2]:
# scale factor being tested
sf = 'sf_1'

# set up connection variables
# Every setting can be overridden with a TPCDS_DB_* environment variable so that
# real credentials do not have to be written into the notebook; when a variable
# is not set, the value below is used as the default.
db_host = os.environ.get("TPCDS_DB_HOST", "localhost")
db_port = os.environ.get("TPCDS_DB_PORT", "5432")
db_user = os.environ.get("TPCDS_DB_USER", "postgres")
db_pass = os.environ.get("TPCDS_DB_PASS", "password")
db_name = os.environ.get("TPCDS_DB_NAME", "tpcds")

# function to connect with postgres
def connect_postgres(db_host, db_port, db_user, db_pass, db_name):
    """Open an autocommit PostgreSQL connection and return a cursor on it.

    Prints the connection's DSN parameters and the result of
    ``SELECT version();`` as a quick connectivity check.

    Parameters
    ----------
    db_host, db_port, db_user, db_pass, db_name
        Passed to ``psycopg2.connect`` as ``host``, ``port``, ``user``,
        ``password`` and ``database`` respectively.

    Returns
    -------
    cursor or None
        A cursor on the new connection, or ``None`` when any step failed
        (the error is printed, not raised).
    """
    connection = None
    try:
        # Connect to an existing database
        connection = psycopg2.connect(host = db_host,
                                      port = db_port,
                                      user = db_user,
                                      password = db_pass,
                                      database = db_name)
        # Set auto-commit
        connection.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
        # Create a cursor to perform database operations
        cur = connection.cursor()
        # Print PostgreSQL details
        print("PostgreSQL server information")
        print(connection.get_dsn_parameters(), "\n")
        # Executing a SQL query
        cur.execute("SELECT version();")
        # Fetch result
        record = cur.fetchone()
        print("You are connected to - ", record, "\n")
    except Exception as error:
        print("Error while connecting to PostgreSQL", error)
        # The connection may already be open when a later step fails; close it
        # here because the caller never receives a handle to it.
        if connection is not None:
            connection.close()
        return None

    return cur

In [3]:
# connect to postgres
cur = connect_postgres(db_host, db_port, db_user, db_pass, db_name)

# connect_postgres() signals failure by returning None after printing the error;
# stop here instead of letting later cells fail on a missing cursor.
if cur is None:
    raise RuntimeError("Could not connect to PostgreSQL - see the error printed above")

PostgreSQL server information
{'user': 'postgres', 'dbname': 'tpcds', 'host': 'localhost', 'port': '5432', 'tty': '', 'options': '', 'sslmode': 'prefer', 'sslcompression': '0', 'gssencmode': 'prefer', 'krbsrvname': 'postgres', 'target_session_attrs': 'any'} 

You are connected to -  ('PostgreSQL 14.5, compiled by Visual C++ build 1914, 64-bit',) 



In [4]:
# get dir path

# Build the query directory with os.path.join so the separator matches the
# platform the notebook runs on (a literal '\\' only works on Windows).
path = os.path.join(os.getcwd(), 'all_queries', 'updated_queries')
files = os.listdir(path)
print(path)

C:\Users\ahmad\Desktop\tpcds_git\tpc-benchmark\all_queries\updated_queries


In [5]:
# function to get the full absolute paths of the files in a directory

def get_absolute_path(d):
    """Return a list with `d` joined onto every entry name found in `d`.

    Entries appear in the order reported by ``os.listdir``.
    """
    joined_paths = []
    for entry_name in os.listdir(d):
        joined_paths.append(os.path.join(d, entry_name))
    return joined_paths

In [6]:
# get the full absolute paths of the files in the directory

# normalise every path returned by get_absolute_path() to forward slashes
files_abs_path = list(
    map(lambda raw_path: raw_path.replace('\\', '/'), get_absolute_path(path))
)

print("Total files:", len(files_abs_path))
print("First few files...")
files_abs_path[:5]

Total files: 99
First few files...


['C:/Users/ahmad/Desktop/tpcds_git/tpc-benchmark/all_queries/updated_queries/query-01.sql',
 'C:/Users/ahmad/Desktop/tpcds_git/tpc-benchmark/all_queries/updated_queries/query-02.sql',
 'C:/Users/ahmad/Desktop/tpcds_git/tpc-benchmark/all_queries/updated_queries/query-03.sql',
 'C:/Users/ahmad/Desktop/tpcds_git/tpc-benchmark/all_queries/updated_queries/query-04.sql',
 'C:/Users/ahmad/Desktop/tpcds_git/tpc-benchmark/all_queries/updated_queries/query-05.sql']

In [7]:
# setup dataframe for recording query execution run times

# Build one "Qnn" label per script from files_abs_path itself, so the label at
# position k always belongs to the script that is executed at position k.
# The label is the last two characters of the file name without its extension
# (e.g. "query-01.sql" -> "Q01").
query_name_list = [
    "Q" + os.path.splitext(os.path.basename(script_path))[0][-2:]
    for script_path in files_abs_path
]

query_name_dict = {'query': query_name_list}
exec_details_df = pd.DataFrame(query_name_dict)

In [8]:
# get the date-time before all 99 queries have run (with iterations if chosen)

# timestamp taken before the benchmark loop starts
run_start_default = datetime.now()
# rendered as dd/mm/YYYY HH:MM:SS
run_start = f"{run_start_default:%d/%m/%Y %H:%M:%S}"
print("Overall Run Start:", run_start)

Overall Run Start: 15/10/2022 01:38:39


In [9]:
# run all 99 queries in sequence, and multiple iterations if chosen
# save query result table output
# save query execution run time (for all iterations)

# Benchmark driver.
# For every iteration each script in files_abs_path is executed once on `cur`,
# its wall-clock time is recorded (as a 2-decimal string) and, on the first
# iteration only, the returned rows are written to a CSV file.
# q_errors counts failed executions across ALL iterations.
q_errors = 0
exec_details = []
# choose number of iterations to run
n_iterations = 5

# make sure the output directory exists before the first CSV is written
os.makedirs(f'performance_test/{sf}', exist_ok = True)

for i in range(1, n_iterations + 1):

    clear_output(wait = True)
    print(f'Iteration {i}\n')
    exec_details = []
    iteration_start = datetime.now()
    # labels and scripts are paired by position
    for query_num, sql_script in zip(query_name_list, files_abs_path):

        exec_start = datetime.now()
        try:
            # the context manager closes the script file straight after reading
            with open(sql_script, "r") as sql_file:
                cur.execute(sql_file.read())
        except Exception as e:
            q_errors += 1
            outcome = "Error"
            error_text = str(e).strip()
        else:
            outcome = "Success"
            error_text = ""

        exec_end = datetime.now()
        exec_run_time = "{:.2f}".format((exec_end - exec_start).total_seconds())
        # report the real outcome instead of always claiming success
        print(f'{query_num}: {outcome}, Execution Time: {exec_run_time}s')
        if error_text:
            print(f'    {error_text}')
        # a time is recorded for failed queries too, so that every iteration
        # yields exactly one value per query for the dataframe column below
        exec_details.append(exec_run_time)

        # load table output to csv file (on first iteration only);
        # skipped when the query failed or returned no result set, because
        # fetchall() would raise in both cases
        if i == 1 and outcome == "Success" and cur.description is not None:
            df = pd.DataFrame(cur.fetchall(), columns = [desc[0] for desc in cur.description])
            df.to_csv(f'performance_test/{sf}/{query_num}.csv', index = False)

    iteration_end = datetime.now()
    iteration_run_time = "{:.2f}".format(((iteration_end - iteration_start).total_seconds()) / 3600)
    print(f'\n{sf.upper()}, Iteration {i}, Total run time for the 99 queries: {iteration_run_time}hr')

    # append iteration execution details to dataframe
    exec_details_df[f'exec_time_iter_{i}'] = np.array(exec_details)

Iteration 5

Q01: Success, Execution Time: 0.19s
Q02: Success, Execution Time: 0.69s
Q03: Success, Execution Time: 1.07s
Q04: Success, Execution Time: 0.14s
Q05: Success, Execution Time: 5.68s
Q06: Success, Execution Time: 0.39s
Q07: Success, Execution Time: 0.13s
Q08: Success, Execution Time: 0.20s
Q09: Success, Execution Time: 2.86s
Q10: Success, Execution Time: 2.36s
Q11: Success, Execution Time: 0.16s
Q12: Success, Execution Time: 1612.18s
Q13: Success, Execution Time: 0.03s
Q14: Success, Execution Time: 0.54s
Q15: Success, Execution Time: 0.54s
Q16: Success, Execution Time: 0.62s
Q17: Success, Execution Time: 0.17s
Q18: Success, Execution Time: 0.15s
Q19: Success, Execution Time: 0.39s
Q20: Success, Execution Time: 0.87s
Q21: Success, Execution Time: 0.56s
Q22: Success, Execution Time: 0.41s
Q23: Success, Execution Time: 0.52s
Q24: Success, Execution Time: 0.17s
Q25: Success, Execution Time: 1.20s
Q26: Success, Execution Time: 4.90s
Q27: Success, Execution Time: 0.48s
Q28: Success

In [10]:
# check total amount of query errors

# report how many query executions raised an error
error_summary = f"We have a total of {q_errors} queries with error"
print(error_summary)

We have a total of 0 queries with error


In [11]:
# get the date-time after all 99 queries have run (with iterations if chosen)

# timestamp taken once the benchmark loop has finished
run_end_default = datetime.now()
# rendered as dd/mm/YYYY HH:MM:SS
run_end = f"{run_end_default:%d/%m/%Y %H:%M:%S}"
print(f"Overall Run End (with {n_iterations} iterations):", run_end)

Overall Run End (with 5 iterations): 15/10/2022 13:37:08


In [12]:
# get the total run time (in hours) for all 99 queries to complete (with iterations if chosen)

# seconds elapsed between the two timestamps, reported in hours with 2 decimals
elapsed_seconds = (run_end_default - run_start_default).total_seconds()
total_run_time = f"{elapsed_seconds / 3600:.2f}"
print(f'Total run time for the 99 queries (with {n_iterations} iterations): {total_run_time}hr')

Total run time for the 99 queries (with 5 iterations): 11.97hr


In [13]:
# full details on query execution times (including iterations & average)
# load execution details to csv

# Average only the per-iteration timing columns. Selecting them by name (rather
# than "every column after the first") keeps this cell idempotent: on a re-run
# the existing avg_exec_time column is not folded into the new average.
iteration_columns = [
    column for column in exec_details_df.columns
    if column.startswith('exec_time_iter_')
]
# timings were stored as formatted strings, so convert them before averaging
exec_details_df['avg_exec_time'] = np.round(
    exec_details_df[iteration_columns].apply(pd.to_numeric).mean(axis = 1), 2
)
exec_details_df.to_csv(f'performance_test/{sf}/exec_time_details_{sf}.csv', index = False)
exec_details_df

Unnamed: 0,query,exec_time_iter_1,exec_time_iter_2,exec_time_iter_3,exec_time_iter_4,exec_time_iter_5,avg_exec_time
0,Q01,0.58,0.19,0.19,0.19,0.19,0.27
1,Q02,1.86,0.70,0.70,0.71,0.69,0.93
2,Q03,1.32,1.16,1.07,1.07,1.07,1.14
3,Q04,0.16,0.14,0.14,0.15,0.14,0.15
4,Q05,6.27,5.76,5.67,5.68,5.68,5.81
...,...,...,...,...,...,...,...
94,Q95,0.35,0.34,0.34,0.35,0.35,0.35
95,Q96,0.14,0.13,0.13,0.14,0.13,0.13
96,Q97,0.06,0.06,0.06,0.06,0.06,0.06
97,Q98,0.74,0.72,0.73,0.85,0.78,0.76


In [14]:
# close connection to db

# Closing the cursor alone leaves the database session open, so the connection
# the cursor belongs to is closed as well.
db_connection = cur.connection
cur.close()
db_connection.close()

#### End of script.