Estimated cardinality on each single table from the traditional CardEst methods (psql)

In [None]:
# collect the cardinality estimated by traditional methods
import sqlglot
from pilotscope.DBInteractor.PilotDataInteractor import PilotDataInteractor
from pilotscope.PilotConfig import PostgreSQLConfig

def get_pg_est_card(datasets_name=None):
    '''
    Use PilotScope to collect PostgreSQL's estimated sub-query cardinalities.

    For every dataset the workload file ``{dataset_name}_without_pg_est_card.sql``
    is read (one ``sql||true_card||`` record per line). Each query is executed
    with sub-query cardinality pulling requested, and ``{dataset_name}.sql`` is
    written with, per query, the bare SQL text followed by one
    ``subquery;||true_card||estimated_card`` line for every returned sub-query
    that references a number of tables other than one.

    :param datasets_name: the list of names of the datasets to be preprocessed.
        A single name passed as a plain string is treated as a one-element
        list. ``None`` or an empty list does nothing.
    :raises ValueError: if a non-blank workload line does not contain the two
        ``||`` separators.
    '''
    if not datasets_name:
        # Nothing to process: return before any database configuration is built.
        return
    if isinstance(datasets_name, str):
        # A bare string would otherwise be iterated character by character.
        datasets_name = [datasets_name]

    config = PostgreSQLConfig()
    config.db_host = "localhost"
    config.db_user = "postgres"
    config.db_user_pwd = "postgres"
    config.db_port = 54323

    for dataset_name in datasets_name:
        workload_in_path = f'/opt/hdd/datasets/user/cardbench/binary_join_workload/{dataset_name}_without_pg_est_card.sql'
        workload_out_path = f'/opt/hdd/datasets/user/cardbench/binary_join_workload/{dataset_name}.sql'
        print(f"dataset: {dataset_name}, workload_in_path: {workload_in_path}, workload_out_path: {workload_out_path}")
        config.db = dataset_name
        data_interactor = PilotDataInteractor(config)

        with open(workload_in_path, 'r') as workload_file:
            # Blank lines (e.g. a trailing newline) carry no record; drop them.
            records = [record for record in map(str.strip, workload_file) if record]

        subsqls = []
        for count, record in enumerate(records, start=1):
            # Split from the right so that a `||` operator inside the SQL text
            # is not mistaken for a field separator.
            fields = record.rsplit("||", 2)
            if len(fields) != 3:
                raise ValueError(
                    f"malformed workload line in {workload_in_path}: {record!r}"
                )
            sql, true_card, _ = fields

            # The pull request is registered again before every execution,
            # exactly as the workload was processed before.
            data_interactor.pull_subquery_card()
            result = data_interactor.execute(sql)

            # NOTE(review): the full query is written without any cardinality
            # fields, unlike the sub-query lines below -- confirm this is the
            # format downstream readers expect.
            subsqls.append(sql)
            if count % 100 == 0:
                print(f"{count} sqls has been processed")

            for subquery, est_card in result.subquery_2_card.items():
                try:
                    tables = list(sqlglot.parse_one(subquery).find_all(sqlglot.exp.Table))
                except Exception as e:
                    print(f"sql: {sql}")
                    print(f"subquery: {subquery}")
                    print(f"error: {e}")
                    # NOTE(review): a parse failure abandons the remaining
                    # sub-queries of this query (kept from the original).
                    break
                if len(tables) == 1:  # skip single table estimated cardinality
                    continue
                # The parent query's true cardinality is attached to each
                # retained sub-query line.
                subsqls.append(f'{subquery};||{true_card}||{est_card}')

        with open(workload_out_path, "w") as f:
            f.writelines(f"{entry}\n" for entry in subsqls)

        print(f"dataset: {dataset_name}")

In [45]:
# collect suitable sql from cardbench
import glob
import numpy as np
import tqdm

from sparse_deferred.structs import graph_struct
import sparse_deferred.np as sdnp

# Short module-level aliases for two classes of sparse_deferred's graph_struct module.
# NOTE(review): neither alias is referenced in the cells visible here --
# presumably kept for inspecting the raw graph data; confirm before removing.
GraphStruct = graph_struct.GraphStruct
InMemoryDB = graph_struct.InMemoryDB

def clean_query_string(query):
    '''
    Strip formatting noise from a CardBench query string.

    Removes every newline, every semicolon and every occurrence of the
    ``bq-cost-models-exp.`` qualifier, in that order.

    :param query: the query text
    :return: the cleaned query text
    '''
    # The previous chain started with .replace("'", "'"), which maps a quote
    # to itself and therefore did nothing; it has been dropped.
    # NOTE(review): if a different quote character was meant to be
    # normalised there, re-add it explicitly.
    return query.replace("\n", "").replace(";", "").replace("bq-cost-models-exp.", "")

def get_cardbench_dataset(join_type, dataset_name):
    '''
    Export the queries and true cardinalities of one CardBench training dataset.

    The training datasets are stored sharded (namely split into multiple files);
    glob is used to find all the shards of a training dataset. From every shard
    the arrays stored under the keys ``feat.n.g.cardinality`` and
    ``feat.n.g.query`` are collected. Queries containing ``IS NOT NULL`` are
    dropped, the rest are cleaned with ``clean_query_string`` and written, one
    ``sql;||cardinality||`` record per line, to
    ``{join_type}_workload/{dataset_name}_without_pg_est_card.sql``.

    :param join_type: the workload kind used in the directory and file names
        (the visible caller passes ``'binary_join'``)
    :param dataset_name: the name of the dataset to export
    :return: the list of records written to the workload file
    '''

    filename = f"../CardBench/CardBench_zero_shot_cardinality_training/training_datasets/{join_type}/{dataset_name}_{join_type}.npz"
    filenames = glob.glob(filename + '-*')
    # Order the shards by the integer after the last '-' in the file name.
    filenames.sort(key=lambda f: int(f.split('-')[-1]))
    cardinalities = []
    queries = []

    for file in tqdm.tqdm(filenames):
        # NOTE: allow_pickle=True unpickles object arrays, which can execute
        # arbitrary code -- only load shards from a trusted source.
        # The context manager closes the archive (and its file handle) per shard.
        with np.load(file, allow_pickle=True) as np_data:
            # Walk the key names only, so arrays that are not needed are
            # never read or unpickled.
            for k in np_data.files:
                k_parts = k.split('.')
                if len(k_parts) < 4 or k_parts[:3] != ['feat', 'n', 'g']:
                    continue
                if k_parts[3] == 'cardinality':
                    cardinalities += np_data[k].tolist()
                elif k_parts[3] == 'query':
                    queries += np_data[k].tolist()

    assert len(cardinalities) == len(queries), (
        f"{dataset_name}: {len(cardinalities)} cardinalities vs {len(queries)} queries"
    )
    sqls = []
    for query, card in zip(queries, cardinalities):
        # Queries are decoded from bytes; decode once and reuse the text.
        query_text = query.decode()
        if 'IS NOT NULL' in query_text:
            continue
        sqls.append(f'{clean_query_string(query_text)};||{card}||')
    print(len(sqls))
    with open(f'/opt/hdd/datasets/user/cardbench/{join_type}_workload/{dataset_name}_without_pg_est_card.sql', "w") as f:
        f.writelines(f"{sql}\n" for sql in sqls)

    return sqls

In [46]:
# Export the binary-join workload of every CardBench dataset listed below.
dataset_names = [
    'accidents',
    'airline',
    'cms_synthetic_patient_data_omop',
    'consumer',
    'covid19_weathersource_com',
    'crypto_bitcoin_cash',
    'employee',
    'ethereum_blockchain',
    'geo_openstreetmap',
    'github_repos',
    'human_variant_annotation',
    'idc_v10',
    'movielens',
    'open_targets_genetics',
    'samples',
    'stackoverflow',
    'tpch_10G',
    'usfs_fia',
    'uspto_oce_claims',
    'wikipedia',
]
for dataset_name in dataset_names:
    get_cardbench_dataset(join_type='binary_join', dataset_name=dataset_name)

100%|██████████| 29/29 [00:00<00:00, 37.53it/s]


6826


100%|██████████| 44/44 [00:00<00:00, 46.64it/s]


10749


100%|██████████| 27/27 [00:00<00:00, 40.00it/s]


6361


100%|██████████| 19/19 [00:00<00:00, 35.24it/s]


3186


100%|██████████| 34/34 [00:00<00:00, 35.30it/s]


6808


100%|██████████| 42/42 [00:00<00:00, 47.53it/s]


12230


100%|██████████| 35/35 [00:00<00:00, 46.73it/s]


9711


100%|██████████| 59/59 [00:02<00:00, 23.82it/s]


17029


100%|██████████| 50/50 [00:01<00:00, 37.82it/s]


14943


100%|██████████| 271/271 [00:02<00:00, 93.83it/s] 


5799


100%|██████████| 47/47 [00:01<00:00, 33.01it/s]


11910


100%|██████████| 22/22 [00:00<00:00, 30.79it/s]


5304


100%|██████████| 53/53 [00:01<00:00, 44.57it/s]


13979


100%|██████████| 31/31 [00:01<00:00, 29.28it/s]


6811


100%|██████████| 38/38 [00:01<00:00, 37.81it/s]


8751


100%|██████████| 320/320 [00:02<00:00, 116.30it/s]


12066


100%|██████████| 44/44 [00:01<00:00, 36.62it/s]


9605


100%|██████████| 39/39 [00:01<00:00, 33.32it/s]


9021


100%|██████████| 21/21 [00:00<00:00, 41.40it/s]


4957


100%|██████████| 25/25 [00:00<00:00, 31.26it/s]

5348





In [None]:
# waiting for raw data
dataset_names = ['accidents','airline','cms_synthetic_patient_data_omop','consumer','covid19_weathersource_com','crypto_bitcoin_cash','employee','ethereum_blockchain','geo_openstreetmap','github_repos','human_variant_annotation','idc_v10','movielens','open_targets_genetics','samples','stackoverflow','tpch_10G','usfs_fia','uspto_oce_claims','wikipedia']
# get_pg_est_card iterates over its argument, so each name is wrapped in a
# list; a bare string would be walked character by character.
for dataset_name in dataset_names:
    get_pg_est_card([dataset_name])