In [1]:
import json
import os
import sqlite3
import sys
from pathlib import Path

sys.path.append('/home/samer/projects/fuzzy_sql/src') #This will enable reading the modules

from fuzzy_sql.rnd_query import RND_QUERY
from fuzzy_sql.fuzzy_sql import *

In [2]:
#set directories
# The project root defaults to the original development checkout, but it can be
# overridden with the FUZZY_SQL_ROOT environment variable so the notebook can
# run on another machine without editing this cell.
root_dir=Path(os.environ.get('FUZZY_SQL_ROOT','/home/samer/projects/fuzzy_sql'))
# os.path.join() returns plain str paths (not Path objects).
metadata_dir=os.path.join(root_dir,'data/cms/processed/metadata')
db_path=os.path.join(root_dir,'db/cms.db')

In [3]:
#define input tables and metadata
tbl_names_lst=[
    's1_ben_sum_2008',
    's1_ben_sum_2009',
    's1_ben_sum_2010',
    's1_carrier_1a',
    's1_carrier_1b',
    's1_inpatient',
    's1_outpatient',
    's1_prescrp',
]


def _read_table_metadata(table_name):
    """Return the parsed content of ``<metadata_dir>/<table_name>.json``."""
    json_path=os.path.join(metadata_dir,table_name+'.json')
    with open(json_path,'r') as handle:
        return json.load(handle)


# One parsed metadata object per table, in the same order as tbl_names_lst.
metadata_lst=[_read_table_metadata(table_name) for table_name in tbl_names_lst]

In [4]:
#setup parameters
# Each nested dict maps an SQL keyword/operator to a numeric weight; the
# weights inside every table add up to 1 (presumably sampling probabilities
# used by RND_QUERY -- confirm against the class). Operators with weight 0
# are listed but effectively switched off.
DFLT_PARAMS={
    'AGG_OPS':{'AVG':0.5, 'SUM':0.3, 'MAX':0.1, 'MIN':0.1 },
    'LOGIC_OPS':{'AND':0.9,'OR':0.1},
    'NOT_STATE':{'0':0.8, '1':0.2},
    'CAT_OPS':{'=':0.25, '<>':0.25, 'LIKE':0.15, 'IN':0.15, 'NOT LIKE':0.1, 'NOT IN':0.1},
    'CNT_OPS':{'=':0.2, '>':0.1, '<':0.1, '>=':0.1, '<=':0.1, '<>':0.1, 'BETWEEN':0.2, 'NOT BETWEEN':0.1},
    'DT_OPS':{'=':0.2, '>':0.1, '<':0.1, '>=':0, '<=':0, '<>':0.1, 'BETWEEN':0.2, 'IN':0.1, 'NOT BETWEEN':0.1, 'NOT IN':0.1},
    'LESS_GRP_VARS': False, # enforce bias in random queries toward smaller number of groupby vars. Default is no bias (i.e. uniform sampling)
    'LESS_CMP_VARS':False, # enforce bias in random queries toward small number of  comparison terms. Default is no bias (i.e. uniform sampling)
    'JOIN_CNDTN':{'WHERE':0.5, 'AND':0.5} #Use WHERE or AND with JOIN CLAUSE
}

#connect to db
# sqlite3.connect() silently creates a new, empty database when the file does
# not exist, which would only surface later as confusing query failures.
# Fail fast with a clear message instead.
if not os.path.isfile(db_path):
    raise FileNotFoundError(f"SQLite database not found: {db_path}")
conn = sqlite3.connect(db_path)

# The generator is bound to the name `self` because the cells below call
# self.make_*_query(...); keep the name unless those cells are updated too.
self=RND_QUERY(conn, tbl_names_lst, metadata_lst,DFLT_PARAMS)

# Generate random queries

In [5]:
# Generate a single random aggregate query, then print the n_rows / n_cols
# recorded in its description, their product scaled by 1/1000, and the SQL text.
rnd_query=self.make_single_agg_query(agg_fntn=True)

query_desc=rnd_query['query_desc']
n_rows,n_cols=query_desc['n_rows'],query_desc['n_cols']

print(f"no_of_rows={n_rows}\nno_of_cols={n_cols}\nproduct={n_rows*n_cols/1000}")
print("\n")
print(query_desc['sql'])

no_of_rows=0
no_of_cols=4
product=0.0


SELECT s1_ben_sum_2009.SP_OSTEOPRS, s1_outpatient.ICD9_DGNS_CD_5, COUNT(*), AVG(s1_inpatient.HCPCS_CD_18)  FROM s1_ben_sum_2009  JOIN s1_inpatient  ON s1_ben_sum_2009.DESYNPUF_ID=s1_inpatient.DESYNPUF_ID  JOIN s1_prescrp  ON s1_ben_sum_2009.DESYNPUF_ID=s1_prescrp.DESYNPUF_ID  JOIN s1_outpatient  ON s1_ben_sum_2009.DESYNPUF_ID=s1_outpatient.DESYNPUF_ID  JOIN s1_carrier_1b  ON s1_ben_sum_2009.DESYNPUF_ID=s1_carrier_1b.DESYNPUF_ID  GROUP BY s1_ben_sum_2009.SP_OSTEOPRS, s1_outpatient.ICD9_DGNS_CD_5


In [6]:
# Table names handed to make_twin_agg_query for the second ("syn") side of the
# twin query; the first ("real") side uses the tables the generator was built with.
syn_tbl_name_lst=[
    's2_ben_sum_2008',
    's2_ben_sum_2009',
    's2_ben_sum_2010',
    's2_carrier_2a',
    's2_carrier_2b',
    's2_inpatient',
    's2_outpatient',
    's2_prescrp',
]
rnd_query=self.make_twin_agg_query(syn_tbl_name_lst,agg_fntn=True)

# Pull the per-side row/column counts out of the query description.
query_desc=rnd_query['query_desc']
n_rows_real,n_cols_real=query_desc['n_rows_real'],query_desc['n_cols_real']
n_rows_syn,n_cols_syn=query_desc['n_rows_syn'],query_desc['n_cols_syn']

# Report the counts for both sides, followed by the two SQL statements.
print(f"REAL:\nno_of_rows={n_rows_real}\nno_of_cols={n_cols_real}\nproduct={n_rows_real*n_cols_real/1000}\n\n")
print(f"SYN:\nno_of_rows={n_rows_syn}\nno_of_cols={n_cols_syn}\nproduct={n_rows_syn*n_cols_syn/1000}\n\n")
print(query_desc['sql_real'])
print("\n")
print(query_desc['sql_syn'])


REAL:
no_of_rows=0
no_of_cols=35
product=0.0


SYN:
no_of_rows=0
no_of_cols=35
product=0.0


SELECT s1_carrier_1b.LINE_PRCSG_IND_CD_12, s1_ben_sum_2009.BENE_DEATH_DT, s1_inpatient.DESYNPUF_ID, s1_ben_sum_2009.SP_DEPRESSN, s1_carrier_1a.LINE_ICD9_DGNS_CD_11, s1_inpatient.ICD9_PRCDR_CD_4, s1_carrier_1a.CLM_FROM_DT, s1_carrier_1b.ICD9_DGNS_CD_6, s1_carrier_1a.LINE_ICD9_DGNS_CD_5, s1_inpatient.ICD9_DGNS_CD_8, s1_carrier_1a.LINE_ICD9_DGNS_CD_6, s1_carrier_1b.LINE_ICD9_DGNS_CD_7, s1_ben_sum_2009.PPPYMT_CAR, s1_inpatient.NCH_BENE_DSCHRG_DT, s1_ben_sum_2009.MEDREIMB_OP, s1_carrier_1b.ICD9_DGNS_CD_5, s1_ben_sum_2009.SP_CNCR, s1_carrier_1a.LINE_ICD9_DGNS_CD_12, s1_ben_sum_2009.BENE_RACE_CD, s1_ben_sum_2009.BENRES_IP, s1_carrier_1a.LINE_ICD9_DGNS_CD_2, s1_carrier_1b.LINE_PRCSG_IND_CD_13, s1_ben_sum_2009.SP_ISCHMCHT, s1_carrier_1b.LINE_PRCSG_IND_CD_9, s1_carrier_1a.LINE_PRCSG_IND_CD_3, s1_carrier_1a.ICD9_DGNS_CD_8, s1_inpatient.ICD9_DGNS_CD_1, s1_inpatient.ICD9_DGNS_CD_2, s1_carrier_1b.HCPCS_CD_1,

In [None]:
# Generate a single random filter query, then print the n_rows / n_cols
# recorded in its description, their product scaled by 1/1000, and the SQL text.
rnd_query=self.make_single_fltr_query()

query_desc=rnd_query['query_desc']
n_rows,n_cols=query_desc['n_rows'],query_desc['n_cols']

print(f"no_of_rows={n_rows}\nno_of_cols={n_cols}\nproduct={n_rows*n_cols/1000}")
print("\n")
print(query_desc['sql'])