In [1]:
import sys, os, json
import numpy as np
import pandas as pd
from datetime import datetime
from IPython.display import clear_output
from pyArango.connection import *

In [2]:
# Scale-factor labels to benchmark, in run order ('sf_1' .. 'sf_6').
all_sfs = [f'sf_{level}' for level in range(1, 7)]

In [3]:
def create_arango_connection(sf):
    """Return a handle to the ArangoDB database for one scale factor.

    The database is named ``'yelp_' + sf``. If it already exists on the
    server it is opened; otherwise it is created.

    Connection settings default to the local test server used by this
    notebook and can be overridden with the environment variables
    ``ARANGO_URL``, ``ARANGO_USER`` and ``ARANGO_PASSWORD``.

    Parameters
    ----------
    sf : str
        Scale-factor label (e.g. ``'sf_1'``).

    Returns
    -------
    The database object returned by the pyArango connection.

    Raises
    ------
    Exception
        Any error raised while connecting, opening or creating the database
        is printed and then re-raised, so the caller never receives an
        undefined handle.
    """
    # set up connection variables (environment overrides, original defaults)
    db_url = os.environ.get('ARANGO_URL', 'http://127.0.0.1:8529')
    db_user = os.environ.get('ARANGO_USER', "hello")
    db_pass = os.environ.get('ARANGO_PASSWORD', "hello")
    db_name = "yelp_" + sf

    # function to connect with arango
    def connect_arangodb(db_url, db_user, db_pass, db_name):
        try:
            # Creating a connection
            conn = Connection(arangoURL=db_url,
                              username=db_user,
                              password=db_pass)
            # Indexing the connection with an unknown name raises KeyError,
            # so existence has to be checked explicitly before opening.
            if conn.hasDatabase(db_name):
                print('Connected with an existing database')
                return conn[db_name]
            print('Created a new database')
            return conn.createDatabase(name=db_name)
        except Exception as error:
            print("Error while connecting", error)
            # Propagate: there is no usable database handle to return.
            raise

    # connect to arangodb
    return connect_arangodb(db_url, db_user, db_pass, db_name)

In [4]:
def get_queries_from_dir_path():
    """List the query scripts in the ``all_queries`` folder of the working directory.

    The directory is listed once and sorted, so the two returned lists are
    deterministic and index-aligned (``files[i]`` is the basename of
    ``files_abs_path[i]``).

    Returns
    -------
    files : list of str
        Sorted file names found in ``<cwd>/all_queries``.
    files_abs_path : list of str
        Matching absolute paths, using ``/`` as the separator.

    Raises
    ------
    OSError
        If the ``all_queries`` directory does not exist or cannot be read.
    """
    # get dir path (portable join instead of a hard-coded backslash)
    path = os.path.join(os.getcwd(), 'all_queries')
    print(path)

    # single, sorted listing keeps names and paths in the same order
    files = sorted(os.listdir(path))

    # get full absolute path of the files, normalised to forward slashes
    files_abs_path = [os.path.join(path, name).replace(os.sep, '/') for name in files]
    print("Total files:", len(files_abs_path))
    print("First few files...")
    print(files_abs_path[:])
    return files, files_abs_path

In [5]:
def get_query_dataframe(files):
    """Build the frame that will collect per-query execution times.

    Each file name is turned into a query label by dropping its first five
    and last four characters and prefixing ``"Q"`` (so ``"query03.txt"``
    becomes ``"Q03"``).

    Returns
    -------
    query_name_dict : dict
        ``{'query': query_name_list}``.
    exec_details_df : pandas.DataFrame
        One-column frame (``query``) built from that dict.
    query_name_list : list of str
        The query labels, in the order of ``files``.
    """
    # derive one label per script file
    query_name_list = ["Q" + file_name[5:-4] for file_name in files]

    query_name_dict = dict(query=query_name_list)
    exec_details_df = pd.DataFrame(query_name_dict)

    return query_name_dict, exec_details_df, query_name_list

In [6]:
def overall_run_start():
    """Print and return the timestamp taken before the benchmark run starts.

    Returns
    -------
    datetime
        The ``datetime.now()`` value that was printed (as dd/mm/YYYY H:M:S).
    """
    started_at = datetime.now()
    # dd/mm/YYYY H:M:S
    print("Overall Run Start:", f"{started_at:%d/%m/%Y %H:%M:%S}")
    return started_at

In [7]:
def all_queries_execution(sf, db, files_abs_path, n_iterations, query_name_list, exec_details_df, output_dir=None):
    """Run every query script against ``db`` for ``n_iterations`` rounds.

    For each iteration every script in ``files_abs_path`` is read and sent
    through ``db.AQLQuery``; the wall-clock time of that call (in ms,
    formatted with two decimals) is stored in a new column
    ``exec_time_iter_<i>`` of ``exec_details_df``. On the first iteration
    the result of each successful query is also written to
    ``<output_dir>/<query name>.json``.

    Parameters
    ----------
    sf : str
        Scale-factor label, used in log output and in the default output path.
    db : object
        Database handle exposing ``AQLQuery(query=..., rawResults=..., batchSize=...)``.
    files_abs_path : list of str
        Paths of the query scripts, in execution order.
    n_iterations : int
        Number of times the full set of queries is executed.
    query_name_list : list of str
        Query labels, index-aligned with ``files_abs_path``.
    exec_details_df : pandas.DataFrame
        Frame with one row per query; it is extended in place.
    output_dir : str, optional
        Directory for the JSON result files. Defaults to
        ``performance_test/<sf>`` relative to the working directory; it is
        created if missing.

    Returns
    -------
    q_errors : int
        Total number of failed query executions across all iterations.
    exec_details_df : pandas.DataFrame
        The same frame that was passed in, with the timing columns added.
    """
    if output_dir is None:
        output_dir = os.path.join('performance_test', sf)

    q_errors = 0

    for i in range(1, n_iterations + 1):
        clear_output(wait=True)
        print(f'Scale Factor {sf}\n')
        print(f'Iteration {i}\n')
        exec_details = []
        iteration_start = datetime.now()
        for q_index, aql_script in enumerate(files_abs_path):
            query_num = query_name_list[q_index]

            # read the script and release the file handle right away
            with open(aql_script, "r") as script_file:
                query_text = script_file.read()
            print(f'\nReading script {aql_script}')

            # reset per query so a failure can never reuse an earlier result
            q_result = None
            exec_start = datetime.now()
            try:
                q_result = db.AQLQuery(query=query_text, rawResults=True, batchSize=100)
            except Exception as e:
                print(e)
                q_errors += 1
                outcome = "Error"
            else:
                outcome = "Success"
            # stop the clock before any further logging
            exec_end = datetime.now()
            print('Sript reading complete')

            exec_run_time = "{:.2f}".format((exec_end - exec_start).total_seconds() * 1000)
            print(f'{query_num}: {outcome}, Execution Time: {exec_run_time} ms')
            # the time is recorded for failures too, so every column keeps
            # one entry per query
            exec_details.append(exec_run_time)

            # load output to json file (first iteration only, successful queries only)
            if i == 1 and outcome == "Success":
                os.makedirs(output_dir, exist_ok=True)
                output_filename = os.path.join(output_dir, query_num + ".json")
                print(output_filename)
                with open(output_filename, 'w') as out_file:
                    json.dump(list(q_result), out_file)

        iteration_end = datetime.now()
        iteration_run_time = "{:.2f}".format((iteration_end - iteration_start).total_seconds() * 1000)
        print(f'\n{sf.upper()}, Iteration {i}, Total run time for the queries: {iteration_run_time} ms')

        # append iteration execution details to dataframe
        exec_details_df[f'exec_time_iter_{i}'] = np.array(exec_details)

    return q_errors, exec_details_df

In [8]:
def get_query_errors(q_errors):
    """Print how many query executions ended with an error."""
    summary = "We have a total of {} queries with error".format(q_errors)
    print(summary)

In [9]:
def overall_run_end(n_iterations):
    """Print and return the timestamp taken after all queries have run.

    Parameters
    ----------
    n_iterations : int
        Number of iterations that were executed; only used in the message.

    Returns
    -------
    datetime
        The ``datetime.now()`` value that was printed (as dd/mm/YYYY H:M:S).
    """
    finished_at = datetime.now()
    # dd/mm/YYYY H:M:S
    stamp = f"{finished_at:%d/%m/%Y %H:%M:%S}"
    print(f"Overall Run End (with {n_iterations} iterations):", stamp)
    return finished_at

In [10]:
def total_run_time(run_start_default, run_end_default, n_iterations):
    """Print the elapsed time between the two timestamps, in milliseconds.

    Parameters
    ----------
    run_start_default, run_end_default : datetime
        Start and end of the overall run.
    n_iterations : int
        Number of iterations that were executed; only used in the message.
    """
    elapsed_ms = (run_end_default - run_start_default).total_seconds() * 1000
    print(f'Total run time for the queries (with {n_iterations} iterations): {elapsed_ms:.2f} ms')

In [11]:
def get_exec_df(sf, exec_details_df):
    """Add the per-query average execution time and save the details to CSV.

    Every column after the first is treated as a timing column, converted
    to numeric and averaged row-wise (rounded to two decimals) into
    ``avg_exec_time``. An ``avg_exec_time`` column left over from an
    earlier call is excluded from the mean and overwritten. The frame is
    then written to
    ``performance_test/<sf>/exec_time_details_<sf>.csv``; the directory is
    created if it does not exist yet.

    Parameters
    ----------
    sf : str
        Scale-factor label, used in the output path.
    exec_details_df : pandas.DataFrame
        Query labels in the first column, timing columns after it. Modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with ``avg_exec_time`` added or refreshed.
    """
    # Leave out a previous average so re-running the cell does not fold the
    # rounded value back into the mean.
    timing_cols = exec_details_df.iloc[:, 1:].drop(columns=['avg_exec_time'], errors='ignore')
    exec_details_df['avg_exec_time'] = np.round(timing_cols.apply(pd.to_numeric).mean(axis=1), 2)

    # Make sure the target folder exists before writing the results.
    out_dir = os.path.join('performance_test', sf)
    os.makedirs(out_dir, exist_ok=True)
    exec_details_df.to_csv(os.path.join(out_dir, f'exec_time_details_{sf}.csv'), index=False)
    return exec_details_df

In [12]:
# Benchmark driver: for every scale factor, connect to its database, run all
# query scripts for the chosen number of iterations, then report and save
# the timings.
for sf in all_sfs:

    # number of times the full query set is executed per scale factor
    n_iterations = 3

    print(f'{sf} running---------------')

    # database handle and the query scripts to run
    db = create_arango_connection(sf)
    files, files_abs_path = get_queries_from_dir_path()
    query_dict, exec_details_df, query_name_list = get_query_dataframe(files)

    # timed run over all queries and iterations
    startTime = overall_run_start()
    q_errors, exec_df = all_queries_execution(
        sf, db, files_abs_path, n_iterations, query_name_list, exec_details_df
    )
    get_query_errors(q_errors)
    endTime = overall_run_end(n_iterations)
    total_run_time(startTime, endTime, n_iterations)

    # averages + CSV export, then a quick look at the result
    df = get_exec_df(sf, exec_df)
    print(df.head())

    print(f'{sf} done---------------')
    print()

Scale Factor sf_6

Iteration 3


Reading script C:/Users/ahmad/Desktop/adb_project/arango/all_queries/query03.txt
Sript reading complete
Q03: Success, Execution Time: 7.63 ms

Reading script C:/Users/ahmad/Desktop/adb_project/arango/all_queries/query04.txt
Sript reading complete
Q04: Success, Execution Time: 367374.27 ms

Reading script C:/Users/ahmad/Desktop/adb_project/arango/all_queries/query05.txt
Sript reading complete
Q05: Success, Execution Time: 426024.64 ms

Reading script C:/Users/ahmad/Desktop/adb_project/arango/all_queries/query06.txt
Sript reading complete
Q06: Success, Execution Time: 606.35 ms

Reading script C:/Users/ahmad/Desktop/adb_project/arango/all_queries/query07.txt
Sript reading complete
Q07: Success, Execution Time: 760.45 ms

Reading script C:/Users/ahmad/Desktop/adb_project/arango/all_queries/query08.txt
Sript reading complete
Q08: Success, Execution Time: 12004.86 ms

Reading script C:/Users/ahmad/Desktop/adb_project/arango/all_queries/query09.txt
Sript read