In [1]:
import json
import os
import sqlite3
import sys
from pathlib import Path

sys.path.append('/home/samer/projects/fuzzy_sql/src') #This will enable reading the modules

from fuzzy_sql.rnd_query import RND_QUERY
from fuzzy_sql.fuzzy_sql import *

In [2]:
# Filesystem layout: every location is derived from the single project root.
root_dir = Path('/home/samer/projects/fuzzy_sql')

# Both derived locations are kept as plain strings (os.path.join returns str),
# in order: table metadata folder, then the SQLite database file.
metadata_dir, db_path = [
    os.path.join(root_dir, rel_location)
    for rel_location in ('data/lucy/processed/metadata', 'db/lucy.db')
]

In [3]:
# Input tables and their metadata.
# Each table is expected to have a "<table name>.json" file inside metadata_dir.
tbl_names_lst = ["b_sample", "l_sample"]

# metadata_lst is index-aligned with tbl_names_lst: entry i is the parsed JSON
# document belonging to table i.
metadata_lst = []
for tbl_name in tbl_names_lst:
    metadata_path = os.path.join(metadata_dir, tbl_name + '.json')
    with open(metadata_path, 'r') as f:
        tbl_metadata = json.load(f)
    metadata_lst.append(tbl_metadata)

In [4]:
#setup parameters
# Option weights handed to RND_QUERY. Within every group the weights add up to 1,
# so they read as selection probabilities -- presumably RND_QUERY samples the SQL
# building blocks with them (TODO confirm against fuzzy_sql.rnd_query).
DFLT_PARAMS={
    'AGG_OPS':{'AVG':0.5, 'SUM':0.3, 'MAX':0.1, 'MIN':0.1 },
    'LOGIC_OPS':{'AND':0.9,'OR':0.1},
    'NOT_STATE':{'0':0.8, '1':0.2},
    'CAT_OPS':{'=':0.25, '<>':0.25, 'LIKE':0.15, 'IN':0.15, 'NOT LIKE':0.1, 'NOT IN':0.1},
    'CNT_OPS':{'=':0.2, '>':0.1, '<':0.1, '>=':0.1, '<=':0.1, '<>':0.1, 'BETWEEN':0.2, 'NOT BETWEEN':0.1},
    'DT_OPS':{'=':0.2, '>':0.1, '<':0.1, '>=':0, '<=':0, '<>':0.1, 'BETWEEN':0.2, 'IN':0.1, 'NOT BETWEEN':0.1, 'NOT IN':0.1},
    'FILTER_TYPE':{'WHERE':0.5, 'AND':0.5}, #Use WHERE or AND with JOIN CLAUSE
    'JOIN_TYPE': {'JOIN':0.5, 'LEFT JOIN':0.5}
}

#connect to db
# sqlite3.connect() silently creates a brand-new, empty database when the file
# does not exist; a wrong path would then only show up later as a confusing
# "no such table" error. Fail fast with a clear message instead.
if not os.path.isfile(db_path):
    raise FileNotFoundError(f"SQLite database not found: {db_path}")
conn = sqlite3.connect(db_path)

# The query generator is bound to the name `self`; the following cells call it
# under that name, so the binding is kept unchanged.
self=RND_QUERY(conn, tbl_names_lst, metadata_lst,DFLT_PARAMS)

Metadata fot table b_sample validated.
Metadata fot table l_sample validated.


# Generate random queries

In [5]:
# Generate one random aggregate query and report the size of its result.
rnd_query = self.make_single_agg_query(agg_fntn=True)

# Everything reported below lives under the 'query_desc' entry of the result.
query_desc = rnd_query['query_desc']
n_rows, n_cols = query_desc['n_rows'], query_desc['n_cols']

# "product" is rows x columns, expressed in thousands of cells.
print(f"no_of_rows={n_rows}\nno_of_cols={n_cols}\nproduct={n_rows*n_cols/1000}")
print("\n")
print(query_desc['sql'])

SELECT b_sample.PL_RUCC2, b_sample.DNR, b_sample.FEMALE, b_sample.NEOMAT, b_sample.MEDINCST, COUNT(*), AVG(l_sample.Date)  FROM b_sample  LEFT JOIN l_sample  ON b_sample.PNUM_R=l_sample.PNUM_R  GROUP BY b_sample.PL_RUCC2, b_sample.DNR, b_sample.FEMALE, b_sample.NEOMAT, b_sample.MEDINCST
no_of_rows=269
no_of_cols=7
product=1.883


SELECT b_sample.PL_RUCC2, b_sample.DNR, b_sample.FEMALE, b_sample.NEOMAT, b_sample.MEDINCST, COUNT(*), AVG(l_sample.Date)  FROM b_sample  LEFT JOIN l_sample  ON b_sample.PNUM_R=l_sample.PNUM_R  GROUP BY b_sample.PL_RUCC2, b_sample.DNR, b_sample.FEMALE, b_sample.NEOMAT, b_sample.MEDINCST


In [6]:
# Generate a "twin" aggregate query: the returned description carries one SQL
# statement for the real tables and one for the synthetic tables named below.
syn_tbl_name_lst = ['b_sample_syn_01', 'l_sample_syn_01']
rnd_query = self.make_twin_agg_query(syn_tbl_name_lst, agg_fntn=True)

# Result dimensions are reported separately for the real and synthetic sides.
twin_desc = rnd_query['query_desc']
n_rows_real, n_cols_real = twin_desc['n_rows_real'], twin_desc['n_cols_real']
n_rows_syn, n_cols_syn = twin_desc['n_rows_syn'], twin_desc['n_cols_syn']

# "product" is rows x columns, expressed in thousands of cells.
print(f"REAL:\nno_of_rows={n_rows_real}\nno_of_cols={n_cols_real}\nproduct={n_rows_real*n_cols_real/1000}\n\n")
print(f"SYN:\nno_of_rows={n_rows_syn}\nno_of_cols={n_cols_syn}\nproduct={n_rows_syn*n_cols_syn/1000}\n\n")
print(twin_desc['sql_real'])
print("\n")
print(twin_desc['sql_syn'])

REAL:
no_of_rows=120712
no_of_cols=18
product=2172.816


SYN:
no_of_rows=151906
no_of_cols=18
product=2734.308


SELECT b_sample.NEOMAT, l_sample.ASCHED, b_sample.PL_UR_CA, b_sample.HISPANIC, b_sample.DNR, b_sample.PL_RUCA4, b_sample.Homeless, b_sample.DIED, b_sample.PL_NCHS2, l_sample.DX1, b_sample.PL_CBSA, b_sample.HOSPBRTH, b_sample.FEMALE, b_sample.PL_RUCC2, b_sample.PL_UIC20, b_sample.RACE, COUNT(*), AVG(l_sample.NDX)  FROM b_sample  JOIN l_sample  ON b_sample.PNUM_R=l_sample.PNUM_R  GROUP BY b_sample.NEOMAT, l_sample.ASCHED, b_sample.PL_UR_CA, b_sample.HISPANIC, b_sample.DNR, b_sample.PL_RUCA4, b_sample.Homeless, b_sample.DIED, b_sample.PL_NCHS2, l_sample.DX1, b_sample.PL_CBSA, b_sample.HOSPBRTH, b_sample.FEMALE, b_sample.PL_RUCC2, b_sample.PL_UIC20, b_sample.RACE


SELECT b_sample_syn_01.NEOMAT, l_sample_syn_01.ASCHED, b_sample_syn_01.PL_UR_CA, b_sample_syn_01.HISPANIC, b_sample_syn_01.DNR, b_sample_syn_01.PL_RUCA4, b_sample_syn_01.Homeless, b_sample_syn_01.DIED, b_sample_syn_01

In [7]:
# Optional overrides on the generator, left disabled here. Presumably they cap
# the size of the generated query -- TODO confirm against RND_QUERY.
# self.max_in_terms=2
# self.no_groupby_vars=2
# self.no_where_vars=2
# self.no_join_tables=1

# Generate one random filter query and report the size of its result.
rnd_query = self.make_single_fltr_query()

# Everything reported below lives under the 'query_desc' entry of the result.
fltr_desc = rnd_query['query_desc']
n_rows, n_cols = fltr_desc['n_rows'], fltr_desc['n_cols']

# "product" is rows x columns, expressed in thousands of cells.
print(f"no_of_rows={n_rows}\nno_of_cols={n_cols}\nproduct={n_rows*n_cols/1000}")
print("\n")
print(fltr_desc['sql'])

SELECT *  FROM b_sample  LEFT JOIN l_sample  ON b_sample.PNUM_R=l_sample.PNUM_R  WHERE   b_sample.Homeless <> '0' AND  b_sample.AGE > 57 AND  b_sample.PL_RUCA4 NOT LIKE '1' OR  l_sample.NPR > 0 AND  b_sample.HOSPBRTH = '0' 
no_of_rows=545798
no_of_cols=31
product=16919.738


SELECT *  FROM b_sample  LEFT JOIN l_sample  ON b_sample.PNUM_R=l_sample.PNUM_R  WHERE   b_sample.Homeless <> '0' AND  b_sample.AGE > 57 AND  b_sample.PL_RUCA4 NOT LIKE '1' OR  l_sample.NPR > 0 AND  b_sample.HOSPBRTH = '0' 
