In [6]:
import sys
sys.path.append('/home/samer/projects/fuzzy_sql/src') #This will enable reading the modules
from pathlib import Path
from fuzzy_sql.fuzzy_sql import *
from fuzzy_sql.long_query import LONG_QUERY


In [7]:
# Locations of the project inputs and outputs.
# NOTE(review): root_dir is an absolute, machine-specific path — confirm whether it
# should be derived from a configurable project root instead.
root_dir = Path('/home/samer/projects/fuzzy_sql')

# The three processed-data folders share one prefix. They are kept as plain strings
# because later cells build file paths from them with "+".
real_dir, meta_dir, syn_dir = (
    os.path.join(root_dir, f'data/lucy/processed/{leaf}')
    for leaf in ('real', 'metadata', 'synthetic')
)

db_path = os.path.join(root_dir, 'db/lucy.db')  # SQLite database file
report_dir = "test"  # sub-folder name used when the HTML report is written


In [8]:
# Load the real parent/child data and register it in the SQLite database.
# These tables are what the longitudinal query object is later built on.

rp_path = f"{real_dir}/b_sample.csv"    # real parent (baseline) CSV
rc_path = f"{real_dir}/l_sample.csv"    # real child CSV
meta_path = f"{meta_dir}/sample.json"   # metadata JSON

rp, rc = load_csv(rp_path), load_csv(rc_path)

with open(meta_path) as meta_file:
    meta = json.load(meta_file)  # metadata for the data; read below via its 'parent' and 'child' entries

rp, rc = assign_dtype(rp, meta['parent']), assign_dtype(rc, meta['child'])

# The connection is left open on purpose: later cells keep using `conn`.
conn = sqlite3.connect(db_path)
make_table('sample_r_b', rp, conn)  # real baseline (parent) table
make_table('sample_r_l', rc, conn)  # real longitudinal (child) table


# Default parameters handed to LONG_QUERY. Each *_OPS / *_STATE / JOIN_CNDTN entry maps
# an option to a weight; the weights of every entry sum to 1.
DFLT_PARAMS = dict(
    AGG_OPS=dict(AVG=0.5, SUM=0.3, MAX=0.1, MIN=0.1),
    LOGIC_OPS=dict(AND=0.9, OR=0.1),
    NOT_STATE={'0': 0.8, '1': 0.2},
    CAT_OPS={
        '=': 0.25, '<>': 0.25,
        'LIKE': 0.15, 'IN': 0.15,
        'NOT LIKE': 0.1, 'NOT IN': 0.1,
    },
    CNT_OPS={
        '=': 0.2,
        '>': 0.1, '<': 0.1, '>=': 0.1, '<=': 0.1, '<>': 0.1,
        'BETWEEN': 0.2, 'NOT BETWEEN': 0.1,
    },
    DT_OPS={
        '=': 0.2,
        '>': 0.1, '<': 0.1, '>=': 0, '<=': 0, '<>': 0.1,
        'BETWEEN': 0.2, 'IN': 0.1,
        'NOT BETWEEN': 0.1, 'NOT IN': 0.1,
    },
    # Bias random queries toward fewer GROUP BY variables. False = no bias (uniform sampling).
    LESS_GRP_VARS=False,
    # Bias random queries toward fewer comparison terms. False = no bias (uniform sampling).
    LESS_CMP_VARS=False,
    # Whether the filter is attached to the JOIN clause with WHERE or with AND.
    JOIN_CNDTN=dict(WHERE=0.5, AND=0.5),
)
# Longitudinal query object built on the real parent/child tables.
# NOTE(review): `self` is an unusual name for a module-level object, but later cells
# call methods on it under this name, so it is kept.
self = LONG_QUERY(
    conn,
    'sample_r_b',
    'sample_r_l',
    meta,
    DFLT_PARAMS,
)

Table sample_r_b already exists in the database
Table sample_r_l already exists in the database


In [9]:
# # Generate Single queries 
# rnd_query=self.make_single_agg_query()
# rnd_query=self.make_single_agg_query_w_aggfntn()
# rnd_query=self.make_single_fltr_query()
# rnd_query=self.make_single_aggfltr_query()
# rnd_query=self.make_single_aggfltr_query_w_aggfntn()

In [10]:

# Generate random twin queries (real vs synthetic) for every synthetic dataset version.
many_syn = []  # one entry per synthetic version: that version's list of scored random queries
for i in range(1, 4):
    # Table names for this synthetic version, reused by every call below.
    syn_parent_tbl = f'sample_s_b_0{i}'
    syn_child_tbl = f'sample_s_l_0{i}'

    sp_path = f"{syn_dir}/b_sample_syn_0{i}.csv"  # synthetic parent (baseline) CSV
    sc_path = f"{syn_dir}/l_sample_syn_0{i}.csv"  # synthetic child (longitudinal) CSV

    sp, sc = load_csv(sp_path), load_csv(sc_path)
    sp, sc = assign_dtype(sp, meta['parent']), assign_dtype(sc, meta['child'])

    # Import the synthetic data into the database.
    make_table(syn_parent_tbl, sp, conn)
    make_table(syn_child_tbl, sc, conn)

    # Alternative generators (swap one in for the active call below):
    # mltpl_queries = self.make_mltpl_twin_agg_query(3, syn_parent_tbl, syn_child_tbl)
    # mltpl_queries = self.make_mltpl_twin_agg_query_w_aggfntn(3, syn_parent_tbl, syn_child_tbl)
    # mltpl_queries = self.make_mltpl_twin_fltr_query(3, syn_parent_tbl, syn_child_tbl)
    # mltpl_queries = self.make_mltpl_twin_aggfltr_query_w_aggfntn(3, syn_parent_tbl, syn_child_tbl)

    # List of all unmatched random queries.
    mltpl_queries = self.make_mltpl_twin_aggfltr_query(10, syn_parent_tbl, syn_child_tbl)
    # List of all random matched and scored queries.
    mltpl_scored_queries = self.calc_mltpl_dist_scores(mltpl_queries)
    many_syn.append(mltpl_scored_queries)


Table sample_s_b_01 already exists in the database
Table sample_s_l_01 already exists in the database
SELECT sample_r_b.PL_UIC20, sample_r_b.PL_NCHS2, sample_r_b.PL_RUCA4, sample_r_b.FEMALE, sample_r_b.DIED ,COUNT(*) FROM sample_r_b JOIN sample_r_l ON sample_r_b.PNUM_R = sample_r_l.PNUM_R WHERE ( NOT sample_r_b.DIED IN ('0', '0', '0', '0', '0')  AND  sample_r_l.NPR > 2.0  AND  sample_r_b.NEOMAT IN ('0', '0', '0', '0', '0')  AND  sample_r_l.MDC >= 14.0  AND  sample_r_l.PROCTYPE <> 1.0  AND  sample_r_b.PL_NCHS2 LIKE '2'  AND  sample_r_l.AWEEKEND <= 1.0 ) GROUP BY sample_r_b.PL_UIC20, sample_r_b.PL_NCHS2, sample_r_b.PL_RUCA4, sample_r_b.FEMALE, sample_r_b.DIED

Generated Random Aggregate Filter Query - 0 
SELECT sample_r_b.Homeless, sample_r_b.PL_UR_CA, sample_r_b.HISPANIC, sample_r_b.RACE, sample_r_b.FEMALE ,COUNT(*) FROM sample_r_b JOIN sample_r_l ON sample_r_b.PNUM_R = sample_r_l.PNUM_R AND ( sample_r_l.MDC BETWEEN 6.0 AND 8.0  AND  sample_r_b.MEDINCST LIKE '1'  AND  NOT sample_r_b.PL_

In [11]:
# Reporting

# Opening and closing HTML for the report file.
# The stylesheet <link> belongs inside <head>; it was previously emitted after
# </head>, which is invalid HTML.
start_html = (
    "<html>"
    "<head>"
    "<title>Random Queries for Longitudinal Data</title>"
    "<link rel='stylesheet' type='text/css' href='df_style.css'/>"
    "</head>"
    "<body>"
    "<H1>Random Queries for Longitudinal Data</H1>"
)
end_html = "</body></html>"


def make_html_content(query_id, rnd_query):
    """Render one side ('real' or 'syn') of a scored query as an HTML fragment.

    Args:
        query_id: Either 'real' or 'syn'; selects which entries of ``rnd_query`` are read.
        rnd_query: Mapping providing
            ``rnd_query['query_desc'][f'sql_{query_id}']`` (the SQL text),
            ``rnd_query[f'query_{query_id}']`` (the query result; must support ``len()``
            and, when non-empty, ``.head(5).to_html(classes=...)``), and
            ``rnd_query['query_desc'][f'n_rows_{query_id}']`` (read only when the
            result is non-empty).

    Returns:
        str: The SQL statement followed by either the first five result rows and
        the record count, or a "No records returned" heading.

    Raises:
        AssertionError: If ``query_id`` is neither 'real' nor 'syn'.
    """
    import html  # local import so this definition does not rely on the import cell

    assert query_id=='real' or query_id=='syn',("query_id shall be either 'real' or 'syn' ")

    sql_text = rnd_query['query_desc'][f'sql_{query_id}']
    result = rnd_query[f'query_{query_id}']

    html_string = f"<u>SQL statement - {query_id}:</u><br>"
    # The SQL may contain '<', '>' and '<>' operators; unescaped, a browser can
    # treat them as markup and corrupt the displayed statement.
    html_string += html.escape(sql_text, quote=False)
    html_string += "<br><br>"
    if len(result) != 0:
        html_string += f"SQL result - {query_id}:<br>"
        html_string += result.head(5).to_html(classes='mystyle')
        html_string += "Number of returned records: " + str(rnd_query['query_desc'][f'n_rows_{query_id}'])
    else:
        html_string += "<H4>No records returned</H4>"

    html_string += "<br><br>"
    return html_string


# Write the HTML report: one section per synthetic version, and within each section
# the real and synthetic side of every scored query followed by its Hellinger distance.
report_path = Path('../docs/reports') / report_dir / 'long_rnd_queries.html'
report_path.parent.mkdir(parents=True, exist_ok=True)  # open() does not create missing folders

with open(report_path, 'w') as f:
    f.write(start_html)
    # Versions are numbered from 1 in the report headings.
    for version, scored_queries in enumerate(many_syn, start=1):
        f.write(f"<H3>======================= REAL VS SYNTHETIC VERSION {version} STARTS ======================</H3>")
        for scored_query in scored_queries:
            f.write(make_html_content('real', scored_query))
            f.write(make_html_content('syn', scored_query))
            f.write("Hellinger Distance = {:.3f}".format(scored_query['query_hlngr_score']))
            f.write("<H3>************************************************************************************</H3>")
    # The former `if s==len(many_syn): break` was removed: the loop index never
    # reaches len(many_syn), so that branch could not run.
    f.write(end_html)