In [1]:
import sys
sys.path.append('/home/samer/projects/fuzzy_sql/src') #This will enable reading the modules
from pathlib import Path
import os
import json

from fuzzy_sql.rnd_query import RND_QUERY
from fuzzy_sql.fuzzy_sql import *

In [2]:
# Set directories.
# The project root defaults to the original checkout location, but can be overridden with the
# FUZZY_SQL_ROOT environment variable so the notebook is not tied to one machine's home directory.
# NOTE(review): the sys.path entry in the import cell is still hard-coded and is not affected by this.
root_dir=Path(os.environ.get('FUZZY_SQL_ROOT','/home/samer/projects/fuzzy_sql'))
# os.path.join accepts the Path object and returns plain strings for both derived locations.
metadata_dir=os.path.join(root_dir,'data/lucy/processed/metadata')
db_path=os.path.join(root_dir,'db/lucy.db')

In [3]:
# Input tables and their metadata: one "<table>.json" file per table is read from
# metadata_dir, and the parsed documents are kept in the same order as the table names.
tbl_names_lst = ["b_sample", "l_sample"]

metadata_lst = []
for name in tbl_names_lst:
    json_path = os.path.join(metadata_dir, f"{name}.json")
    with open(json_path, 'r') as fh:
        metadata_lst.append(json.load(fh))

In [4]:
# Connect to the SQLite database at db_path and build the random-query generator on top of it.
# NOTE(review): `sqlite3` is not imported explicitly anywhere visible in this notebook; presumably it
# is re-exported by `from fuzzy_sql.fuzzy_sql import *` — an explicit `import sqlite3` would be safer.
conn = sqlite3.connect(db_path)
# The generator receives the connection, the table names and their metadata (same order).
# It is bound to the module-level name `self`, which every later cell relies on.
# NOTE(review): no random seed is set anywhere visible, so generated queries presumably differ between runs.
self=RND_QUERY(conn, tbl_names_lst, metadata_lst)

Metadata for table b_sample is valid.
Metadata for table l_sample is valid.
Parameter input is valid.


# Generate random queries

In [5]:
# Single aggregate: generate one random aggregate query against the real tables,
# then report the size of its result and the SQL text that produced it.
rnd_query = self.make_single_agg_query()

desc = rnd_query['query_desc']
n_rows, n_cols = desc['n_rows'], desc['n_cols']

# "product" is rows x columns of the result, expressed in thousands.
print(f"no_of_rows={n_rows}\nno_of_cols={n_cols}\nproduct={n_rows*n_cols/1000}")
print("\n")
print(desc['sql'])

no_of_rows=28657
no_of_cols=5
product=143.285


SELECT b_sample.HISPANIC, l_sample.DX1, b_sample.PL_UIC20, COUNT(*), AVG(l_sample.NPR)  FROM b_sample  LEFT JOIN l_sample  ON b_sample.PNUM_R=l_sample.PNUM_R  GROUP BY b_sample.HISPANIC, l_sample.DX1, b_sample.PL_UIC20


In [6]:
# Twin aggregate: one random aggregate query is generated for the real tables together
# with its counterpart for the synthetic tables named below.
syn_tbl_name_lst = ['b_sample_syn_01', 'l_sample_syn_01']
rnd_query = self.make_twin_agg_query(syn_tbl_name_lst)

desc = rnd_query['query_desc']
n_rows_real, n_cols_real = desc['n_rows_real'], desc['n_cols_real']
n_rows_syn, n_cols_syn = desc['n_rows_syn'], desc['n_cols_syn']

# Result sizes for both sides (product = rows x columns, in thousands), then both SQL statements.
print(f"REAL:\nno_of_rows={n_rows_real}\nno_of_cols={n_cols_real}\nproduct={n_rows_real*n_cols_real/1000}\n\n")
print(f"SYN:\nno_of_rows={n_rows_syn}\nno_of_cols={n_cols_syn}\nproduct={n_rows_syn*n_cols_syn/1000}\n\n")
print(desc['sql_real'])
print("\n")
print(desc['sql_syn'])

REAL:
no_of_rows=47
no_of_cols=7
product=0.329


SYN:
no_of_rows=47
no_of_cols=7
product=0.329


SELECT b_sample.PL_RUCA4, b_sample.DIED, b_sample.NEOMAT, b_sample.HOSPBRTH, b_sample.FEMALE, COUNT(*), MIN(l_sample.NPR)  FROM b_sample  JOIN l_sample  ON b_sample.PNUM_R=l_sample.PNUM_R  GROUP BY b_sample.PL_RUCA4, b_sample.DIED, b_sample.NEOMAT, b_sample.HOSPBRTH, b_sample.FEMALE


SELECT b_sample_syn_01.PL_RUCA4, b_sample_syn_01.DIED, b_sample_syn_01.NEOMAT, b_sample_syn_01.HOSPBRTH, b_sample_syn_01.FEMALE, COUNT(*), MIN(l_sample_syn_01.NPR)  FROM b_sample_syn_01  JOIN l_sample_syn_01  ON b_sample_syn_01.PNUM_R=l_sample_syn_01.PNUM_R  GROUP BY b_sample_syn_01.PL_RUCA4, b_sample_syn_01.DIED, b_sample_syn_01.NEOMAT, b_sample_syn_01.HOSPBRTH, b_sample_syn_01.FEMALE


In [7]:
# Single filter: generate one random filter query against the real tables.
# Only no_where_vars is overridden on the generator (presumably the number of filter terms);
# max_in_terms, no_groupby_vars and no_join_tables are deliberately not set in this cell.
self.no_where_vars = 2

rnd_query = self.make_single_fltr_query()

desc = rnd_query['query_desc']
n_rows, n_cols = desc['n_rows'], desc['n_cols']

# "product" is rows x columns of the result, expressed in thousands.
print(f"no_of_rows={n_rows}\nno_of_cols={n_cols}\nproduct={n_rows*n_cols/1000}")
print("\n")
print(desc['sql'])

no_of_rows=711928
no_of_cols=31
product=22069.768


SELECT *  FROM b_sample  LEFT JOIN l_sample  ON b_sample.PNUM_R=l_sample.PNUM_R  AND   b_sample.DIED = '0' AND  l_sample.ASCHED = '0' 


In [8]:
# Twin filter: one random filter query for the real tables plus its counterpart for the
# synthetic tables named below. no_where_vars is set again here, so this cell does not
# depend on an earlier cell having set it.
self.no_where_vars = 2

syn_tbl_name_lst = ['b_sample_syn_01', 'l_sample_syn_01']
rnd_query = self.make_twin_fltr_query(syn_tbl_name_lst)

desc = rnd_query['query_desc']
n_rows_real, n_cols_real = desc['n_rows_real'], desc['n_cols_real']
n_rows_syn, n_cols_syn = desc['n_rows_syn'], desc['n_cols_syn']

# Result sizes for both sides (product = rows x columns, in thousands), then both SQL statements.
print(f"REAL:\nno_of_rows={n_rows_real}\nno_of_cols={n_cols_real}\nproduct={n_rows_real*n_cols_real/1000}\n\n")
print(f"SYN:\nno_of_rows={n_rows_syn}\nno_of_cols={n_cols_syn}\nproduct={n_rows_syn*n_cols_syn/1000}\n\n")
print(desc['sql_real'])
print("\n")
print(desc['sql_syn'])

REAL:
no_of_rows=3747
no_of_cols=31
product=116.157


SYN:
no_of_rows=3131
no_of_cols=31
product=97.061


SELECT *  FROM b_sample  JOIN l_sample  ON b_sample.PNUM_R=l_sample.PNUM_R  WHERE  NOT  l_sample.NPR < 4 AND NOT  b_sample.PL_CBSA = '2' 


SELECT *  FROM b_sample_syn_01  JOIN l_sample_syn_01  ON b_sample_syn_01.PNUM_R=l_sample_syn_01.PNUM_R  WHERE  NOT  l_sample_syn_01.NPR < 4 AND NOT  b_sample_syn_01.PL_CBSA = '2' 


rnd_query['query_desc']

In [9]:
# Note cell: records a generated filter query that crashed the kernel when it was executed.
# The triple-quoted string is a bare expression, so Jupyter only echoes it as this cell's output;
# no SQL is run here.
# NOTE(review): likely cause — in SQL, AND binds tighter than OR, so the ON condition below is a
# disjunction of three branches. The branches starting at `OR  b_sample.PL_RUCA4 <> '1'` and
# `OR  b_sample.PL_RUCC2 IN (...)` do not contain the PNUM_R equality, so rows can pair up regardless
# of the join key and the result can grow towards a cross product of the two tables — TODO confirm.
''' This query resulted in kernel crash 
SELECT *  FROM b_sample  JOIN l_sample  ON b_sample.PNUM_R=l_sample.PNUM_R  AND   b_sample.Homeless IN ('0', '0', '0') AND  l_sample.TOTCHG >= 85649 AND  l_sample.HCUP_ED = 0 AND  l_sample.PROCTYPE <= 1 AND  l_sample.DX1 = '515' AND  b_sample.NEOMAT IN ('0', '0', '0') AND  l_sample.Date > 6 AND  b_sample.RACE LIKE '1' AND NOT  l_sample.ASCHED IN ('0', '0', '0') OR  b_sample.PL_RUCA4 <> '1' AND  l_sample.NPR <> 3 AND  l_sample.NECODE < 2 AND  l_sample.AWEEKEND > 0 AND NOT  b_sample.DIED LIKE '0' AND  b_sample.PL_UIC20 LIKE '1' AND  b_sample.PL_CBSA <> '2' AND  b_sample.DNR IN ('0', '0', '0') AND  b_sample.HISPANIC IN ('2', '1', '1') OR  b_sample.PL_RUCC2 IN ('1', '1', '1') 

'''

" This query resulted in kernel crash \nSELECT *  FROM b_sample  JOIN l_sample  ON b_sample.PNUM_R=l_sample.PNUM_R  AND   b_sample.Homeless IN ('0', '0', '0') AND  l_sample.TOTCHG >= 85649 AND  l_sample.HCUP_ED = 0 AND  l_sample.PROCTYPE <= 1 AND  l_sample.DX1 = '515' AND  b_sample.NEOMAT IN ('0', '0', '0') AND  l_sample.Date > 6 AND  b_sample.RACE LIKE '1' AND NOT  l_sample.ASCHED IN ('0', '0', '0') OR  b_sample.PL_RUCA4 <> '1' AND  l_sample.NPR <> 3 AND  l_sample.NECODE < 2 AND  l_sample.AWEEKEND > 0 AND NOT  b_sample.DIED LIKE '0' AND  b_sample.PL_UIC20 LIKE '1' AND  b_sample.PL_CBSA <> '2' AND  b_sample.DNR IN ('0', '0', '0') AND  b_sample.HISPANIC IN ('2', '1', '1') OR  b_sample.PL_RUCC2 IN ('1', '1', '1') \n\n"

In [10]:
# Single aggregate-filter: generate one random query that combines aggregation with
# filtering against the real tables.
# Only no_where_vars is overridden on the generator (presumably the number of filter terms);
# max_in_terms, no_groupby_vars and no_join_tables are deliberately not set in this cell.
self.no_where_vars = 2

rnd_query = self.make_single_aggfltr_query()

desc = rnd_query['query_desc']
n_rows, n_cols = desc['n_rows'], desc['n_cols']

# "product" is rows x columns of the result, expressed in thousands.
print(f"no_of_rows={n_rows}\nno_of_cols={n_cols}\nproduct={n_rows*n_cols/1000}")
print("\n")
print(desc['sql'])

no_of_rows=53
no_of_cols=16
product=0.848


SELECT b_sample.Homeless, b_sample.PL_UR_CA, b_sample.PL_RUCC2, l_sample.DX1, b_sample.DIED, b_sample.MEDINCST, b_sample.DNR, b_sample.FEMALE, b_sample.HOSPBRTH, b_sample.PL_CBSA, b_sample.PL_UIC20, b_sample.PL_RUCA4, b_sample.PL_NCHS2, b_sample.NEOMAT, COUNT(*), AVG(l_sample.HCUP_ED)  FROM b_sample  JOIN l_sample  ON b_sample.PNUM_R=l_sample.PNUM_R  AND   l_sample.HCUP_ED <> 4 AND  l_sample.DX1 = '29534'  GROUP BY b_sample.Homeless, b_sample.PL_UR_CA, b_sample.PL_RUCC2, l_sample.DX1, b_sample.DIED, b_sample.MEDINCST, b_sample.DNR, b_sample.FEMALE, b_sample.HOSPBRTH, b_sample.PL_CBSA, b_sample.PL_UIC20, b_sample.PL_RUCA4, b_sample.PL_NCHS2, b_sample.NEOMAT


In [11]:
# Twin aggregate-filter: one random aggregate+filter query for the real tables plus its
# counterpart for the synthetic tables named below.
# Only no_where_vars is overridden on the generator (presumably the number of filter terms);
# max_in_terms, no_groupby_vars and no_join_tables are deliberately not set in this cell.
self.no_where_vars = 2

syn_tbl_name_lst = ['b_sample_syn_01', 'l_sample_syn_01']
rnd_query = self.make_twin_aggfltr_query(syn_tbl_name_lst)

desc = rnd_query['query_desc']
n_rows_real, n_cols_real = desc['n_rows_real'], desc['n_cols_real']
n_rows_syn, n_cols_syn = desc['n_rows_syn'], desc['n_cols_syn']

# Result sizes for both sides (product = rows x columns, in thousands), then both SQL statements.
print(f"REAL:\nno_of_rows={n_rows_real}\nno_of_cols={n_cols_real}\nproduct={n_rows_real*n_cols_real/1000}\n\n")
print(f"SYN:\nno_of_rows={n_rows_syn}\nno_of_cols={n_cols_syn}\nproduct={n_rows_syn*n_cols_syn/1000}\n\n")
print(desc['sql_real'])
print("\n")
print(desc['sql_syn'])

REAL:
no_of_rows=0
no_of_cols=9
product=0.0


SYN:
no_of_rows=0
no_of_cols=9
product=0.0


SELECT b_sample.PL_UR_CA, b_sample.HISPANIC, b_sample.PL_RUCA4, b_sample.NEOMAT, b_sample.PL_RUCC2, b_sample.RACE, b_sample.PL_CBSA, COUNT(*), AVG(b_sample.AGE)  FROM b_sample  JOIN l_sample  ON b_sample.PNUM_R=l_sample.PNUM_R  WHERE   l_sample.AWEEKEND >= 0 AND  l_sample.DRGVER < 24  GROUP BY b_sample.PL_UR_CA, b_sample.HISPANIC, b_sample.PL_RUCA4, b_sample.NEOMAT, b_sample.PL_RUCC2, b_sample.RACE, b_sample.PL_CBSA


SELECT b_sample_syn_01.PL_UR_CA, b_sample_syn_01.HISPANIC, b_sample_syn_01.PL_RUCA4, b_sample_syn_01.NEOMAT, b_sample_syn_01.PL_RUCC2, b_sample_syn_01.RACE, b_sample_syn_01.PL_CBSA, COUNT(*), AVG(b_sample_syn_01.AGE)  FROM b_sample_syn_01  JOIN l_sample_syn_01  ON b_sample_syn_01.PNUM_R=l_sample_syn_01.PNUM_R  WHERE   l_sample_syn_01.AWEEKEND >= 0 AND  l_sample_syn_01.DRGVER < 24  GROUP BY b_sample_syn_01.PL_UR_CA, b_sample_syn_01.HISPANIC, b_sample_syn_01.PL_RUCA4, b_sample_syn_01.

In [12]:
# Pair up the real and synthetic results of the most recently generated twin query
# (rnd_query), then compute the distance scores on the matched pair.
# Guard: an empty result on either side leaves nothing to compare. In the recorded run of this
# notebook both sides returned 0 rows and both scores came out as nan, so that case is flagged
# explicitly instead of silently producing nan. .get() is used so that a descriptor without
# these keys simply skips the warning.
twin_desc = rnd_query['query_desc']
if twin_desc.get('n_rows_real') == 0 or twin_desc.get('n_rows_syn') == 0:
    print("Warning: the twin query returned no rows for the real and/or synthetic tables; "
          "the distance scores are likely to be nan. Re-run the query generation cell to draw another query.")

# NOTE(review): _match_twin_query is a private method of the generator (leading underscore);
# a public entry point, if one exists, would be preferable here.
matched_query=self._match_twin_query(rnd_query)
scored_query=self.calc_dist_scores(matched_query)

In [13]:
# Show the descriptor of the scored twin query (query type, aggregate function, group-by
# variables, SQL text and result sizes for the real and synthetic sides).
scored_query['query_desc']

{'type': 'twin_agg',
 'agg_fntn': ('AVG', 'b_sample.AGE'),
 'grpby_vars': ['PL_UR_CA',
  'HISPANIC',
  'PL_RUCA4',
  'NEOMAT',
  'PL_RUCC2',
  'RACE',
  'PL_CBSA'],
 'from_tbl_name_real': 'b_sample',
 'join_tbl_name_lst_real': ['l_sample'],
 'sql_real': 'SELECT b_sample.PL_UR_CA, b_sample.HISPANIC, b_sample.PL_RUCA4, b_sample.NEOMAT, b_sample.PL_RUCC2, b_sample.RACE, b_sample.PL_CBSA, COUNT(*), AVG(b_sample.AGE)  FROM b_sample  JOIN l_sample  ON b_sample.PNUM_R=l_sample.PNUM_R  WHERE   l_sample.AWEEKEND >= 0 AND  l_sample.DRGVER < 24  GROUP BY b_sample.PL_UR_CA, b_sample.HISPANIC, b_sample.PL_RUCA4, b_sample.NEOMAT, b_sample.PL_RUCC2, b_sample.RACE, b_sample.PL_CBSA',
 'n_cols_real': 9,
 'n_rows_real': 0,
 'from_tbl_name_syn': 'b_sample_syn_01',
 'join_tbl_name_lst_syn': ['l_sample_syn_01'],
 'sql_syn': 'SELECT b_sample_syn_01.PL_UR_CA, b_sample_syn_01.HISPANIC, b_sample_syn_01.PL_RUCA4, b_sample_syn_01.NEOMAT, b_sample_syn_01.PL_RUCC2, b_sample_syn_01.RACE, b_sample_syn_01.PL_CBSA, CO

In [14]:
# Query-level 'hlngr' score (presumably a Hellinger distance between the real and synthetic
# results — TODO confirm). It is nan in the recorded run, where both result sets had 0 rows.
scored_query['query_hlngr_score']

nan

In [15]:
# Query-level 'ecldn' score (presumably a Euclidean distance between the real and synthetic
# results — TODO confirm). It is nan in the recorded run, where both result sets had 0 rows.
scored_query['query_ecldn_score']

nan