In [1]:
import sys
sys.path.append('/home/samer/projects/fuzzy_sql/src') #This will enable reading the modules
from pathlib import Path
import os
import json

from fuzzy_sql.rnd_query import RND_QUERY
from fuzzy_sql.fuzzy_sql import *

In [2]:
#set directories
# The project root can be overridden with the FUZZY_SQL_ROOT environment
# variable so the notebook is not tied to one user's home directory; when the
# variable is unset the previous hard-coded location is used.
root_dir=Path(os.environ.get('FUZZY_SQL_ROOT','/home/samer/projects/fuzzy_sql'))
# Both derived locations are plain strings built with os.path.join.
metadata_dir=os.path.join(root_dir,'data/sdgd/processed/metadata')
db_path=os.path.join(root_dir,'db/sdgd.db')

In [3]:
#define input tables and metadata

tbl_names_lst=["C1"]

# Load one metadata JSON file per table; each file is named <table>.json and
# lives in metadata_dir. The order of metadata_lst follows tbl_names_lst.
metadata_lst=[]
for tbl_name in tbl_names_lst:
    # Explicit UTF-8 so decoding does not depend on the platform's default
    # locale encoding.
    with open(os.path.join(metadata_dir,tbl_name+'.json'),'r',encoding='utf-8') as f:
        metadata_lst.append(json.load(f))

In [4]:
#connect to db
# Import sqlite3 explicitly rather than relying on the name leaking in through
# the wildcard `from fuzzy_sql.fuzzy_sql import *`; no other import of it is
# visible in this notebook.
import sqlite3

conn = sqlite3.connect(db_path)
# The query generator is bound to the module-level name `self`; the following
# cells call its methods through that name, so it is kept unchanged here.
self=RND_QUERY(conn, tbl_names_lst, metadata_lst)

Metadata for table C1 is valid.
Parameter input is valid.


# Generate random queries

In [5]:
# Single aggregate: compile an aggregate expression, then build one
# single-aggregate query from its parts.
(single_expr,
 groupby_lst,
 from_tbl,
 join_tbl_lst,
 agg_fntn_terms) = self.compile_agg_expr()
rnd_query = self.make_single_agg_query(
    single_expr, groupby_lst, from_tbl, join_tbl_lst, agg_fntn_terms
)

# Report the result size (the "product" line is rows * cols / 1000) followed
# by the SQL text stored in the query description.
query_desc = rnd_query['query_desc']
n_rows, n_cols = query_desc['n_rows'], query_desc['n_cols']

print(f"no_of_rows={n_rows}\nno_of_cols={n_cols}\nproduct={n_rows*n_cols/1000}")
print("\n")
print(query_desc['sql'])

no_of_rows=167
no_of_cols=5
product=0.835


SELECT income, relationship, occupation, COUNT(*), SUM(capital)  FROM C1  GROUP BY income, relationship, occupation


In [6]:
# Display the description dict of the single-aggregate query built in the
# previous cell (query type, aggregate function, group-by variables, SQL, size).
rnd_query['query_desc']

{'type': 'single_agg',
 'agg_fntn': ('SUM', 'capital'),
 'grpby_vars': ['income', 'relationship', 'occupation'],
 'from_tbl_name': 'C1',
 'join_tbl_name_lst': [],
 'sql': 'SELECT income, relationship, occupation, COUNT(*), SUM(capital)  FROM C1  GROUP BY income, relationship, occupation',
 'n_rows': 167,
 'n_cols': 5}

In [7]:
# Twin aggregate: compile one aggregate expression and build a twin query
# from it together with the synthetic table name(s) listed below.
syn_tbl_name_lst = ['C1_syn_default_1']

(real_expr,
 real_groupby_lst,
 real_from_tbl,
 real_join_tbl_lst,
 agg_fntn_terms) = self.compile_agg_expr()
rnd_query = self.make_twin_agg_query(
    syn_tbl_name_lst,
    real_expr,
    real_groupby_lst,
    real_from_tbl,
    real_join_tbl_lst,
    agg_fntn_terms,
)

# Pull the result sizes for the real and the synthetic side.
query_desc = rnd_query['query_desc']
n_rows_real, n_cols_real = query_desc['n_rows_real'], query_desc['n_cols_real']
n_rows_syn, n_cols_syn = query_desc['n_rows_syn'], query_desc['n_cols_syn']

# Each side prints its SQL first, then its size summary.
print(query_desc['sql_real'])
print(f"REAL:\nno_of_rows={n_rows_real}\nno_of_cols={n_cols_real}\nproduct={n_rows_real*n_cols_real/1000}\n\n")

print(query_desc['sql_syn'])
print(f"SYN:\nno_of_rows={n_rows_syn}\nno_of_cols={n_cols_syn}\nproduct={n_rows_syn*n_cols_syn/1000}\n\n")



SELECT sex, native_country, occupation, COUNT(*), SUM(hours_per_week)  FROM C1  GROUP BY sex, native_country, occupation
REAL:
no_of_rows=775
no_of_cols=5
product=3.875


SELECT sex, native_country, occupation, COUNT(*), SUM(hours_per_week)  FROM C1_syn_default_1  GROUP BY sex, native_country, occupation
SYN:
no_of_rows=913
no_of_cols=5
product=4.565




In [8]:
# Display the description dict of the twin-aggregate query built in the
# previous cell; it carries separate *_real and *_syn entries for SQL and size.
rnd_query['query_desc']

{'type': 'twin_agg',
 'agg_fntn': ('SUM', 'hours_per_week'),
 'grpby_vars': ['sex', 'native_country', 'occupation'],
 'from_tbl_name_real': 'C1',
 'join_tbl_name_lst_real': [],
 'sql_real': 'SELECT sex, native_country, occupation, COUNT(*), SUM(hours_per_week)  FROM C1  GROUP BY sex, native_country, occupation',
 'n_cols_real': 5,
 'n_rows_real': 775,
 'from_tbl_name_syn': 'C1_syn_default_1',
 'join_tbl_name_lst_syn': [],
 'sql_syn': 'SELECT sex, native_country, occupation, COUNT(*), SUM(hours_per_week)  FROM C1_syn_default_1  GROUP BY sex, native_country, occupation',
 'n_cols_syn': 5,
 'n_rows_syn': 913}

In [10]:
# Single filter: compile a filter expression, then build one single-filter
# query from its parts.

# Disabled attribute overrides, kept for reference:
# self.no_groupby_vars=2
# self.no_where_vars=2
# self.max_in_terms=2

(single_expr,
 from_tbl,
 join_tbl_lst) = self.compile_fltr_expr()
rnd_query = self.make_single_fltr_query(single_expr, from_tbl, join_tbl_lst)

# Report the result size (the "product" line is rows * cols / 1000) followed
# by the SQL text stored in the query description.
query_desc = rnd_query['query_desc']
n_rows, n_cols = query_desc['n_rows'], query_desc['n_cols']

print(f"no_of_rows={n_rows}\nno_of_cols={n_cols}\nproduct={n_rows*n_cols/1000}")
print("\n")
print(query_desc['sql'])

no_of_rows=8953
no_of_cols=13
product=116.389


SELECT *  FROM C1  WHERE   relationship LIKE 'Husband' AND  occupation NOT LIKE 'Other-service' AND NOT  education <> 'Some-college' AND  race IN ('White') AND  native_country IN ('United-States') AND  marital_status LIKE 'Divorced' AND  fnlwgt = 72338 AND  workclass <> 'State-gov' AND  income = '<=50K' AND NOT  sex NOT LIKE 'Female' AND  capital < 0 AND NOT  age BETWEEN 52 AND 78 OR  hours_per_week BETWEEN 42 AND 50 


In [11]:
# Display the description dict of the single-filter query built in the
# previous cell (query type, table names, SQL, size).
rnd_query['query_desc']

{'type': 'single_fltr',
 'from_tbl_name': 'C1',
 'join_tbl_name_lst': [],
 'sql': "SELECT *  FROM C1  WHERE   relationship LIKE 'Husband' AND  occupation NOT LIKE 'Other-service' AND NOT  education <> 'Some-college' AND  race IN ('White') AND  native_country IN ('United-States') AND  marital_status LIKE 'Divorced' AND  fnlwgt = 72338 AND  workclass <> 'State-gov' AND  income = '<=50K' AND NOT  sex NOT LIKE 'Female' AND  capital < 0 AND NOT  age BETWEEN 52 AND 78 OR  hours_per_week BETWEEN 42 AND 50 ",
 'n_rows': 8953,
 'n_cols': 13}

In [13]:
# Twin filter: compile one filter expression and build a twin query from it
# together with the synthetic table name(s) listed below.
syn_tbl_name_lst = ['C1_syn_default_1']

(real_expr,
 real_from_tbl,
 real_join_tbl_lst) = self.compile_fltr_expr()
rnd_query = self.make_twin_fltr_query(
    syn_tbl_name_lst, real_expr, real_from_tbl, real_join_tbl_lst
)

# Pull the result sizes for the real and the synthetic side.
query_desc = rnd_query['query_desc']
n_rows_real, n_cols_real = query_desc['n_rows_real'], query_desc['n_cols_real']
n_rows_syn, n_cols_syn = query_desc['n_rows_syn'], query_desc['n_cols_syn']

# Size summaries come first here, then the two SQL statements.
print(f"REAL:\nno_of_rows={n_rows_real}\nno_of_cols={n_cols_real}\nproduct={n_rows_real*n_cols_real/1000}\n\n")
print(f"SYN:\nno_of_rows={n_rows_syn}\nno_of_cols={n_cols_syn}\nproduct={n_rows_syn*n_cols_syn/1000}\n\n")
print(query_desc['sql_real'])
print("\n")
print(query_desc['sql_syn'])

REAL:
no_of_rows=205
no_of_cols=13
product=2.665


SYN:
no_of_rows=1381
no_of_cols=13
product=17.953


SELECT *  FROM C1  WHERE   hours_per_week = 40 AND NOT  native_country <> 'United-States' AND  relationship <> 'Husband' AND  age BETWEEN 38 AND 43 AND  occupation NOT LIKE 'Handlers-cleaners' AND NOT  race = 'White' AND  marital_status NOT LIKE 'Divorced' 


SELECT *  FROM C1_syn_default_1  WHERE   hours_per_week = 40 AND NOT  native_country <> 'United-States' AND  relationship <> 'Husband' AND  age BETWEEN 38 AND 43 AND  occupation NOT LIKE 'Handlers-cleaners' AND NOT  race = 'White' AND  marital_status NOT LIKE 'Divorced' 


In [14]:
# Display the description dict of the twin-filter query built in the previous
# cell; it carries separate *_real and *_syn entries for SQL and size.
rnd_query['query_desc']

{'type': 'twin_fltr',
 'from_tbl_name_real': 'C1',
 'join_tbl_name_lst_real': [],
 'sql_real': "SELECT *  FROM C1  WHERE   hours_per_week = 40 AND NOT  native_country <> 'United-States' AND  relationship <> 'Husband' AND  age BETWEEN 38 AND 43 AND  occupation NOT LIKE 'Handlers-cleaners' AND NOT  race = 'White' AND  marital_status NOT LIKE 'Divorced' ",
 'n_cols_real': 13,
 'n_rows_real': 205,
 'from_tbl_name_syn': 'C1_syn_default_1',
 'join_tbl_name_lst_syn': [],
 'sql_syn': "SELECT *  FROM C1_syn_default_1  WHERE   hours_per_week = 40 AND NOT  native_country <> 'United-States' AND  relationship <> 'Husband' AND  age BETWEEN 38 AND 43 AND  occupation NOT LIKE 'Handlers-cleaners' AND NOT  race = 'White' AND  marital_status NOT LIKE 'Divorced' ",
 'n_cols_syn': 13,
 'n_rows_syn': 1381}

In [16]:
# Single aggregate-filter: compile a combined aggregate/filter expression,
# then build one single query from its parts.
(single_expr,
 groupby_lst,
 from_tbl,
 join_tbl_lst,
 agg_fntn_terms) = self.compile_aggfltr_expr()
rnd_query = self.make_single_aggfltr_query(
    single_expr, groupby_lst, from_tbl, join_tbl_lst, agg_fntn_terms
)

# Report the result size (the "product" line is rows * cols / 1000) followed
# by the SQL text stored in the query description.
query_desc = rnd_query['query_desc']
n_rows, n_cols = query_desc['n_rows'], query_desc['n_cols']

print(f"no_of_rows={n_rows}\nno_of_cols={n_cols}\nproduct={n_rows*n_cols/1000}")
print("\n")
print(query_desc['sql'])

no_of_rows=6145
no_of_cols=10
product=61.45


SELECT education, income, native_country, marital_status, occupation, sex, workclass, race, COUNT(*), MAX(fnlwgt)  FROM C1  WHERE   native_country LIKE 'United-States'  GROUP BY education, income, native_country, marital_status, occupation, sex, workclass, race


In [17]:
# Twin aggregate-filter: compile one combined aggregate/filter expression and
# build a twin query from it together with the synthetic table name(s) below.
syn_tbl_name_lst = ['C1_syn_default_1']

(real_expr,
 real_groupby_lst,
 real_from_tbl,
 real_join_tbl_lst,
 agg_fntn_terms) = self.compile_aggfltr_expr()
rnd_query = self.make_twin_aggfltr_query(
    syn_tbl_name_lst,
    real_expr,
    real_groupby_lst,
    real_from_tbl,
    real_join_tbl_lst,
    agg_fntn_terms,
)

# Pull the result sizes for the real and the synthetic side.
query_desc = rnd_query['query_desc']
n_rows_real, n_cols_real = query_desc['n_rows_real'], query_desc['n_cols_real']
n_rows_syn, n_cols_syn = query_desc['n_rows_syn'], query_desc['n_cols_syn']

# Each side prints its SQL first, then its size summary.
print(query_desc['sql_real'])
print(f"REAL:\nno_of_rows={n_rows_real}\nno_of_cols={n_cols_real}\nproduct={n_rows_real*n_cols_real/1000}\n\n")

print(query_desc['sql_syn'])
print(f"SYN:\nno_of_rows={n_rows_syn}\nno_of_cols={n_cols_syn}\nproduct={n_rows_syn*n_cols_syn/1000}\n\n")

SELECT education, income, marital_status, sex, native_country, workclass, occupation, COUNT(*), SUM(fnlwgt)  FROM C1  WHERE   native_country = 'United-States' AND  education = '10th'  GROUP BY education, income, marital_status, sex, native_country, workclass, occupation
REAL:
no_of_rows=231
no_of_cols=9
product=2.079


SELECT education, income, marital_status, sex, native_country, workclass, occupation, COUNT(*), SUM(fnlwgt)  FROM C1_syn_default_1  WHERE   native_country = 'United-States' AND  education = '10th'  GROUP BY education, income, marital_status, sex, native_country, workclass, occupation
SYN:
no_of_rows=196
no_of_cols=9
product=1.764




In [18]:
# Post-process the twin query held in rnd_query in two steps: pass it through
# _match_twin_query, then feed that result to calc_dist_scores.
# NOTE(review): _match_twin_query is underscore-prefixed (private by
# convention) — confirm whether a public entry point should be used instead.
matched_query=self._match_twin_query(rnd_query)
scored_query=self.calc_dist_scores(matched_query)

In [19]:
# Display the description dict carried by the scored query.
# NOTE(review): the recorded output reports type 'twin_agg' although the query
# was built with make_twin_aggfltr_query — confirm this is intended.
scored_query['query_desc']

{'type': 'twin_agg',
 'agg_fntn': ('SUM', 'fnlwgt'),
 'grpby_vars': ['education',
  'income',
  'marital_status',
  'sex',
  'native_country',
  'workclass',
  'occupation'],
 'from_tbl_name_real': 'C1',
 'join_tbl_name_lst_real': [],
 'sql_real': "SELECT education, income, marital_status, sex, native_country, workclass, occupation, COUNT(*), SUM(fnlwgt)  FROM C1  WHERE   native_country = 'United-States' AND  education = '10th'  GROUP BY education, income, marital_status, sex, native_country, workclass, occupation",
 'n_cols_real': 9,
 'n_rows_real': 231,
 'from_tbl_name_syn': 'C1_syn_default_1',
 'join_tbl_name_lst_syn': [],
 'sql_syn': "SELECT education, income, marital_status, sex, native_country, workclass, occupation, COUNT(*), SUM(fnlwgt)  FROM C1_syn_default_1  WHERE   native_country = 'United-States' AND  education = '10th'  GROUP BY education, income, marital_status, sex, native_country, workclass, occupation",
 'n_cols_syn': 9,
 'n_rows_syn': 196}

In [20]:
# Display the 'query_hlngr_score' entry of the scored query
# (presumably a Hellinger distance — TODO confirm against calc_dist_scores).
scored_query['query_hlngr_score']

0.4126832082190124

In [21]:
# Display the 'query_ecldn_score' entry of the scored query
# (presumably a Euclidean distance — TODO confirm against calc_dist_scores).
# NOTE(review): the recorded output for this cell is nan — verify whether that
# is expected for this query or points to a problem in the score computation.
scored_query['query_ecldn_score']

nan