In [1]:
from config_paths import *
import sys
sys.path.append(str(SRC_DIR))

from fuzzy_sql.fuzzy_sql import *
import json
import os
from pathlib import Path

# Name of the dataset collection; used below as a sub-folder name and as the database file stem.
DATASET_NAME = 'sdgd'
import glob

In [2]:
# Locations of the processed inputs for this dataset, plus the SQLite database file.
dataset_root = os.path.join(DATA_DIR, DATASET_NAME)
metadata_dir = os.path.join(dataset_root, 'processed/metadata')
real_dir = os.path.join(dataset_root, 'processed/real')
syn_dir = os.path.join(dataset_root, 'processed/synthetic')
db_path = os.path.join(DB_DIR, f'{DATASET_NAME}.db')

In [3]:
# Collect the stem (file name without extension) of every real CSV, in sorted order.
real_fnames = sorted(Path(csv_path).stem for csv_path in glob.glob(real_dir + '/*.csv'))

# Derive the metadata and data file names that belong to each real dataset.
meta_jsons = [stem + '.json' for stem in real_fnames]
real_csvs = [stem + '.csv' for stem in real_fnames]
    


In [4]:
# Load every real dataset into the SQLite database, one table per CSV file.
conn = sqlite3.connect(db_path)
with conn:  # commits on success, rolls back if an exception escapes; does not close conn
    for real_csv, meta_json in zip(real_csvs, meta_jsons):
        real_csv_path = os.path.join(real_dir, real_csv)
        meta_json_path = os.path.join(metadata_dir, meta_json)

        # Note: prep_data_for_db can also be used to quickly generate a metadata
        # template (its second return value is ignored here); that is not shown.
        data, _ = prep_data_for_db(real_csv_path)
        with open(meta_json_path, 'r') as f:
            metadata = json.load(f)
        print(f"Processing {real_csv} and {meta_json} ")

        # The table is named after the CSV stem; the index candidates come from the helper.
        candidate_db_idx = get_vars_to_index(metadata, data)
        table_name = Path(real_csv).stem
        make_table(table_name, data, conn, indx_vars=candidate_db_idx)

Processing C1.csv and C1.json 
Table C1 already exists in the database
Processing C10.csv and C10.json 
Table C10 is created in the database
.... The index: IDX_C10_hsv_total_h_std is created for the table: C10 in the database
.... The index: IDX_C10_cervix_specularities_area is created for the table: C10 in the database
.... The index: IDX_C10_hsv_cervix_s_std is created for the table: C10 in the database
.... The index: IDX_C10_hsv_total_h_mean is created for the table: C10 in the database
.... The index: IDX_C10_fit_ellipse_total is created for the table: C10 in the database
.... The index: IDX_C10_rgb_cervix_r_mean is created for the table: C10 in the database
.... The index: IDX_C10_fit_cervix_bbox_rate is created for the table: C10 in the database
.... The index: IDX_C10_cervix_area is created for the table: C10 in the database
.... The index: IDX_C10_rgb_cervix_b_mean is created for the table: C10 in the database
.... The index: IDX_C10_rgb_total_b_mean_minus_std is created for 

In [5]:
def find_syn_fnames(syn_data_dir: Path, real_names: list) -> dict:
    """Map each real dataset name to the stems of its synthetic CSV files.

    A file in ``syn_data_dir`` belongs to a real dataset when its name starts
    with ``<real_name>_`` and ends with ``.csv``. Other files are ignored,
    because the names returned here are later turned back into ``<stem>.csv``.

    Args:
        syn_data_dir: Directory holding the synthetic files (anything accepted
            by ``os.listdir``, e.g. a ``str`` or ``Path``).
        real_names: Real dataset names (file stems) to look up.

    Returns:
        A dict with one key per entry of ``real_names``. Each value is the
        sorted list of matching synthetic file stems, empty when there are none.

    Raises:
        OSError: If ``syn_data_dir`` cannot be listed.
    """
    # List the directory once (it does not change per real dataset) and keep only CSVs.
    csv_names = [name for name in os.listdir(syn_data_dir) if name.endswith('.csv')]

    syn_dict = {}
    for real_name in real_names:
        # The trailing underscore keeps 'C1' from also matching 'C10_...'.
        prefix = real_name + '_'
        # os.listdir returns entries in arbitrary order; sort for a reproducible result.
        syn_dict[real_name] = sorted(
            Path(name).stem for name in csv_names if name.startswith(prefix)
        )
    print('Extracted the names of all available synthetic datasets corresponding to {} real datasets'.format(str(len(syn_dict))))
    return syn_dict



In [6]:
# Look up which synthetic datasets exist for each real dataset.
syn_dict = find_syn_fnames(syn_dir, real_fnames)

# Keep only the real datasets that have at least one synthetic counterpart.
tabular_test_set = {
    real_name: syn_names
    for real_name, syn_names in syn_dict.items()
    if syn_names
}

Extracted the names of all available synthetic datasets corresponding to 41 real datasets


In [7]:
# Import every synthetic dataset into the database, one table per synthetic CSV,
# using the metadata file of the real dataset it was generated from.
#
# The loop runs inside `with conn:` so the writes are committed when it finishes
# and rolled back if an exception escapes, matching the real-data import.
# The context manager does not close the connection, so `conn` stays usable.
with conn:
    for key, syn_table_names in tabular_test_set.items():
        meta_json = key + '.json'
        meta_json_path = os.path.join(metadata_dir, meta_json)
        for syn_table_name in syn_table_names:
            syn_csv = syn_table_name + '.csv'
            # Note: prep_data_for_db can also be used to quickly generate a metadata
            # template (its second return value is ignored here); that is not shown.
            data, _ = prep_data_for_db(os.path.join(syn_dir, syn_csv))
            # Metadata is re-read for every table so each call gets a fresh object,
            # in case the helpers below modify it - TODO confirm, then hoist.
            with open(meta_json_path, 'r') as f:
                metadata = json.load(f)
            print(f"Processing {syn_csv} and {meta_json} ")
            candidate_db_idx = get_vars_to_index(metadata, data)
            table_name = Path(syn_csv).stem
            make_table(table_name, data, conn, indx_vars=candidate_db_idx)
        

Processing C1_syn_default_14.csv and C1.json 
Table C1_syn_default_14 is created in the database
.... The index: IDX_C1_syn_default_14_relationship is created for the table: C1_syn_default_14 in the database
.... The index: IDX_C1_syn_default_14_income is created for the table: C1_syn_default_14 in the database
.... The index: IDX_C1_syn_default_14_hours_per_week is created for the table: C1_syn_default_14 in the database
.... The index: IDX_C1_syn_default_14_marital_status is created for the table: C1_syn_default_14 in the database
.... The index: IDX_C1_syn_default_14_age is created for the table: C1_syn_default_14 in the database
.... The index: IDX_C1_syn_default_14_fnlwgt is created for the table: C1_syn_default_14 in the database
.... The index: IDX_C1_syn_default_14_race is created for the table: C1_syn_default_14 in the database
.... The index: IDX_C1_syn_default_14_education is created for the table: C1_syn_default_14 in the database
.... The index: IDX_C1_syn_default_14_nativ

In [8]:
# Save the tabular test set (real name -> synthetic table names) for later use
# when testing fuzzy SQL.
test_set_path = 'tabular_test_set.json'
with open(test_set_path, 'w') as out_file:
    json.dump(tabular_test_set, out_file)
