In [363]:
import sys
sys.path.append('/home/samer/projects/fuzzy_sql/src') #This will enable reading the modules
from pathlib import Path
import os
import json

from fuzzy_sql.rnd_query import RND_QUERY
from fuzzy_sql.fuzzy_sql import *

In [364]:
#set directories
# The project root defaults to the original development checkout, but can be
# redirected with the FUZZY_SQL_ROOT environment variable so the notebook is
# not tied to one user's home directory.
root_dir=Path(os.environ.get('FUZZY_SQL_ROOT','/home/samer/projects/fuzzy_sql'))
# Folder holding one <table>.json metadata file per input table
metadata_dir=os.path.join(root_dir,'data/sdgd/processed/metadata')
# Database file that is opened with sqlite3.connect further down
db_path=os.path.join(root_dir,'db/sdgd.db')

In [365]:
#define input tables and metadata

tbl_names_lst=["C1"]

# Parse <metadata_dir>/<table>.json for every table, keeping the table order
metadata_lst=[]
for tbl_name in tbl_names_lst:
    metadata_path=Path(metadata_dir)/(tbl_name+'.json')
    metadata_lst.append(json.loads(metadata_path.read_text()))

In [366]:
#setup parameters
# sqlite3 is imported explicitly: the notebook otherwise only receives the name
# through `from fuzzy_sql.fuzzy_sql import *`, which raises NameError here as
# soon as that module stops re-exporting it.
import sqlite3

# Every group maps an option to a weight, and the weights of each group sum to 1
# (presumably sampling probabilities consumed by RND_QUERY — confirm).
DFLT_PARAMS={
    'AGG_OPS':{'AVG':0.5, 'SUM':0.3, 'MAX':0.1, 'MIN':0.1 },
    'LOGIC_OPS':{'AND':0.9,'OR':0.1},
    'NOT_STATE':{'0':0.8, '1':0.2},
    'CAT_OPS':{'=':0.25, '<>':0.25, 'LIKE':0.15, 'IN':0.15, 'NOT LIKE':0.1, 'NOT IN':0.1},
    'CNT_OPS':{'=':0.2, '>':0.1, '<':0.1, '>=':0.1, '<=':0.1, '<>':0.1, 'BETWEEN':0.2, 'NOT BETWEEN':0.1},
    'DT_OPS':{'=':0.2, '>':0.1, '<':0.1, '>=':0, '<=':0, '<>':0.1, 'BETWEEN':0.2, 'IN':0.1, 'NOT BETWEEN':0.1, 'NOT IN':0.1},
    # 'FILTER_TYPE':{'WHERE':0.5, 'AND':0.5}, #Use WHERE or AND with JOIN CLAUSE
    # 'JOIN_TYPE': {'JOIN':0.5, 'LEFT JOIN':0.5}
}

#connect to db
conn = sqlite3.connect(db_path)

# NOTE(review): the generator is bound to the name `self`; the query cells
# further down call it under that name, so it is kept as-is here.
self=RND_QUERY(conn, tbl_names_lst, metadata_lst,DFLT_PARAMS)

Metadata fot table C1 validated.


# Generate random queries

In [367]:
#Single Aggregate

rnd_query=self.make_single_agg_query(agg_fntn=True)

# Pull the result dimensions out of the query description
query_desc=rnd_query['query_desc']
n_rows,n_cols=query_desc['n_rows'],query_desc['n_cols']

# Size summary (product is rows*cols divided by 1000), a blank gap, then the SQL text
size_summary="\n".join([
    f"no_of_rows={n_rows}",
    f"no_of_cols={n_cols}",
    f"product={n_rows*n_cols/1000}",
])
print(size_summary)
print("\n")
print(query_desc['sql'])

SELECT education, occupation, marital_status, race, sex, workclass, COUNT(*), AVG(capital)  FROM C1  GROUP BY education, occupation, marital_status, race, sex, workclass
no_of_rows=5923
no_of_cols=8
product=47.384


SELECT education, occupation, marital_status, race, sex, workclass, COUNT(*), AVG(capital)  FROM C1  GROUP BY education, occupation, marital_status, race, sex, workclass


In [368]:
# Display the whole 'query_desc' dict of the most recently generated rnd_query
rnd_query['query_desc']

{'type': 'single_agg',
 'agg_fntn': ('AVG', 'capital'),
 'grpby_vars': ['education',
  'occupation',
  'marital_status',
  'race',
  'sex',
  'workclass'],
 'from_tbl_name': 'C1',
 'join_tbl_name_lst': [],
 'sql': 'SELECT education, occupation, marital_status, race, sex, workclass, COUNT(*), AVG(capital)  FROM C1  GROUP BY education, occupation, marital_status, race, sex, workclass',
 'n_rows': 5923,
 'n_cols': 8}

In [369]:
# Twin Aggregate

syn_tbl_name_lst=['C1_syn_default_1']
rnd_query=self.make_twin_agg_query(syn_tbl_name_lst, agg_fntn=True)

# Result dimensions for the real table and for the synthetic table
twin_desc=rnd_query['query_desc']
n_rows_real,n_cols_real=twin_desc['n_rows_real'],twin_desc['n_cols_real']
n_rows_syn,n_cols_syn=twin_desc['n_rows_syn'],twin_desc['n_cols_syn']

# Per side: the SQL text followed by its size summary
for side_label,side_sql,side_rows,side_cols in (
    ("REAL",twin_desc['sql_real'],n_rows_real,n_cols_real),
    ("SYN",twin_desc['sql_syn'],n_rows_syn,n_cols_syn),
):
    print(side_sql)
    print(f"{side_label}:\nno_of_rows={side_rows}\nno_of_cols={side_cols}\nproduct={side_rows*side_cols/1000}\n\n")



SELECT education, sex, race, COUNT(*), SUM(age)  FROM C1  GROUP BY education, sex, race
SELECT education, sex, race, COUNT(*), SUM(age)  FROM C1  GROUP BY education, sex, race
REAL:
no_of_rows=154
no_of_cols=5
product=0.77


SELECT education, sex, race, COUNT(*), SUM(age)  FROM C1_syn_default_1  GROUP BY education, sex, race
SYN:
no_of_rows=152
no_of_cols=5
product=0.76




In [370]:
# Display the whole 'query_desc' dict of the most recently generated rnd_query
rnd_query['query_desc']

{'type': 'twin_agg',
 'agg_fntn': ('SUM', 'age'),
 'grpby_vars': ['education', 'sex', 'race'],
 'from_tbl_name_real': 'C1',
 'join_tbl_name_lst_real': [],
 'sql_real': 'SELECT education, sex, race, COUNT(*), SUM(age)  FROM C1  GROUP BY education, sex, race',
 'n_cols_real': 5,
 'n_rows_real': 154,
 'from_tbl_name_syn': 'C1_syn_default_1',
 'join_tbl_name_lst_syn': [],
 'sql_syn': 'SELECT education, sex, race, COUNT(*), SUM(age)  FROM C1_syn_default_1  GROUP BY education, sex, race',
 'n_cols_syn': 5,
 'n_rows_syn': 152}

In [371]:
#Single Filter

# Optional generator settings, left disabled:
# self.no_groupby_vars=2
# self.no_where_vars=2
# self.max_in_terms=2

rnd_query=self.make_single_fltr_query()

# Pull the result dimensions out of the query description
query_desc=rnd_query['query_desc']
n_rows,n_cols=query_desc['n_rows'],query_desc['n_cols']

# Size summary (product is rows*cols divided by 1000), a blank gap, then the SQL text
size_summary="\n".join([
    f"no_of_rows={n_rows}",
    f"no_of_cols={n_cols}",
    f"product={n_rows*n_cols/1000}",
])
print(size_summary)
print("\n")
print(query_desc['sql'])

SELECT *  FROM C1  WHERE   capital >= 0 AND  sex <> 'Male' AND  fnlwgt = 187203 AND  occupation <> 'Prof-specialty' AND  native_country NOT LIKE 'United-States' AND  hours_per_week <> 50 AND  marital_status NOT IN ('Never-married', 'Married-civ-spouse') 
no_of_rows=0
no_of_cols=13
product=0.0


SELECT *  FROM C1  WHERE   capital >= 0 AND  sex <> 'Male' AND  fnlwgt = 187203 AND  occupation <> 'Prof-specialty' AND  native_country NOT LIKE 'United-States' AND  hours_per_week <> 50 AND  marital_status NOT IN ('Never-married', 'Married-civ-spouse') 


In [372]:
# Display the whole 'query_desc' dict of the most recently generated rnd_query
rnd_query['query_desc']

{'type': 'single_fltr',
 'from_tbl_name': 'C1',
 'join_tbl_name_lst': [],
 'sql': "SELECT *  FROM C1  WHERE   capital >= 0 AND  sex <> 'Male' AND  fnlwgt = 187203 AND  occupation <> 'Prof-specialty' AND  native_country NOT LIKE 'United-States' AND  hours_per_week <> 50 AND  marital_status NOT IN ('Never-married', 'Married-civ-spouse') ",
 'n_rows': 0,
 'n_cols': 13}

In [373]:
# Twin Filter

syn_tbl_name_lst=['C1_syn_default_1']
rnd_query=self.make_twin_fltr_query(syn_tbl_name_lst)

# Result dimensions for the real table and for the synthetic table
twin_desc=rnd_query['query_desc']
n_rows_real,n_cols_real=twin_desc['n_rows_real'],twin_desc['n_cols_real']
n_rows_syn,n_cols_syn=twin_desc['n_rows_syn'],twin_desc['n_cols_syn']

# Both size summaries first ...
for side_label,side_rows,side_cols in (
    ("REAL",n_rows_real,n_cols_real),
    ("SYN",n_rows_syn,n_cols_syn),
):
    print(f"{side_label}:\nno_of_rows={side_rows}\nno_of_cols={side_cols}\nproduct={side_rows*side_cols/1000}\n\n")

# ... then the two SQL statements with a blank gap between them
print(twin_desc['sql_real'])
print("\n")
print(twin_desc['sql_syn'])

SELECT *  FROM C1  WHERE  NOT  race NOT LIKE 'White' AND  capital BETWEEN 0 AND 0 AND  fnlwgt <> 117778 
REAL:
no_of_rows=36163
no_of_cols=13
product=470.119


SYN:
no_of_rows=126
no_of_cols=13
product=1.638


SELECT *  FROM C1  WHERE  NOT  race NOT LIKE 'White' AND  capital BETWEEN 0 AND 0 AND  fnlwgt <> 117778 


SELECT *  FROM C1_syn_default_1  WHERE  NOT  race NOT LIKE 'White' AND  capital BETWEEN 0 AND 0 AND  fnlwgt <> 117778 


In [374]:
# Display the whole 'query_desc' dict of the most recently generated rnd_query
rnd_query['query_desc']

{'type': 'twin_fltr',
 'from_tbl_name_real': 'C1',
 'join_tbl_name_lst_real': [],
 'sql_real': "SELECT *  FROM C1  WHERE  NOT  race NOT LIKE 'White' AND  capital BETWEEN 0 AND 0 AND  fnlwgt <> 117778 ",
 'n_cols_real': 13,
 'n_rows_real': 36163,
 'from_tbl_name_syn': 'C1_syn_default_1',
 'join_tbl_name_lst_syn': [],
 'sql_syn': "SELECT *  FROM C1_syn_default_1  WHERE  NOT  race NOT LIKE 'White' AND  capital BETWEEN 0 AND 0 AND  fnlwgt <> 117778 ",
 'n_cols_syn': 13,
 'n_rows_syn': 126}

In [375]:
#Single Aggregate-Filter

rnd_query=self.make_single_aggfltr_query(agg_fntn=True)

# Pull the result dimensions out of the query description
query_desc=rnd_query['query_desc']
n_rows,n_cols=query_desc['n_rows'],query_desc['n_cols']

# Size summary (product is rows*cols divided by 1000), a blank gap, then the SQL text
size_summary="\n".join([
    f"no_of_rows={n_rows}",
    f"no_of_cols={n_cols}",
    f"product={n_rows*n_cols/1000}",
])
print(size_summary)
print("\n")
print(query_desc['sql'])

SELECT relationship, native_country, occupation, race, COUNT(*), MIN(age)  FROM C1  WHERE  NOT  marital_status = 'Never-married' AND  capital >= 0 AND NOT  fnlwgt NOT BETWEEN 56460 AND 73471 AND  age BETWEEN 22 AND 29 AND  sex = 'Female' AND  occupation NOT IN ('Machine-op-inspct', 'Tech-support', 'Craft-repair') AND NOT  workclass <> 'Private' AND  race = 'White' AND  education NOT LIKE 'Some-college' AND  hours_per_week <= 35  GROUP BY relationship, native_country, occupation, race
no_of_rows=3
no_of_cols=6
product=0.018


SELECT relationship, native_country, occupation, race, COUNT(*), MIN(age)  FROM C1  WHERE  NOT  marital_status = 'Never-married' AND  capital >= 0 AND NOT  fnlwgt NOT BETWEEN 56460 AND 73471 AND  age BETWEEN 22 AND 29 AND  sex = 'Female' AND  occupation NOT IN ('Machine-op-inspct', 'Tech-support', 'Craft-repair') AND NOT  workclass <> 'Private' AND  race = 'White' AND  education NOT LIKE 'Some-college' AND  hours_per_week <= 35  GROUP BY relationship, native_countr

In [376]:
# Twin Aggregate-Filter

syn_tbl_name_lst=['C1_syn_default_1']
rnd_query=self.make_twin_aggfltr_query(syn_tbl_name_lst, agg_fntn=True)

# Result dimensions for the real table and for the synthetic table
twin_desc=rnd_query['query_desc']
n_rows_real,n_cols_real=twin_desc['n_rows_real'],twin_desc['n_cols_real']
n_rows_syn,n_cols_syn=twin_desc['n_rows_syn'],twin_desc['n_cols_syn']

# Per side: the SQL text followed by its size summary
for side_label,side_sql,side_rows,side_cols in (
    ("REAL",twin_desc['sql_real'],n_rows_real,n_cols_real),
    ("SYN",twin_desc['sql_syn'],n_rows_syn,n_cols_syn),
):
    print(side_sql)
    print(f"{side_label}:\nno_of_rows={side_rows}\nno_of_cols={side_cols}\nproduct={side_rows*side_cols/1000}\n\n")

SELECT race, income, marital_status, relationship, native_country, COUNT(*), AVG(fnlwgt)  FROM C1  WHERE  NOT  sex IN ('Female', 'Male') AND  age >= 37 AND NOT  hours_per_week > 45 AND  education <> 'Bachelors' AND NOT  fnlwgt BETWEEN 339956 AND 87052 AND  income = '<=50K' AND NOT  native_country LIKE 'United-States' AND NOT  capital < 0 AND  race IN ('White') AND  workclass IN ('Private') OR  occupation = 'Craft-repair' AND  relationship = 'Not-in-family'  GROUP BY race, income, marital_status, relationship, native_country
SELECT race, income, marital_status, relationship, native_country, COUNT(*), AVG(fnlwgt)  FROM C1  WHERE  NOT  sex IN ('Female', 'Male') AND  age >= 37 AND NOT  hours_per_week > 45 AND  education <> 'Bachelors' AND NOT  fnlwgt BETWEEN 339956 AND 87052 AND  income = '<=50K' AND NOT  native_country LIKE 'United-States' AND NOT  capital < 0 AND  race IN ('White') AND  workclass IN ('Private') OR  occupation = 'Craft-repair' AND  relationship = 'Not-in-family'  GROUP BY

In [377]:
# Run the most recent twin rnd_query through the matching step, then through
# calc_dist_scores; the result carries the score entries displayed below.
# NOTE(review): _match_twin_query is underscore-prefixed (private by convention)
# — confirm that calling it directly from the notebook is intended.
matched_query=self._match_twin_query(rnd_query)
scored_query=self.calc_dist_scores(matched_query)

In [378]:
# Display the 'query_desc' dict carried by the scored query
scored_query['query_desc']

{'type': 'twin_agg',
 'agg_fntn': ('AVG', 'fnlwgt'),
 'grpby_vars': ['race',
  'income',
  'marital_status',
  'relationship',
  'native_country'],
 'from_tbl_name_real': 'C1',
 'join_tbl_name_lst_real': [],
 'sql_real': "SELECT race, income, marital_status, relationship, native_country, COUNT(*), AVG(fnlwgt)  FROM C1  WHERE  NOT  sex IN ('Female', 'Male') AND  age >= 37 AND NOT  hours_per_week > 45 AND  education <> 'Bachelors' AND NOT  fnlwgt BETWEEN 339956 AND 87052 AND  income = '<=50K' AND NOT  native_country LIKE 'United-States' AND NOT  capital < 0 AND  race IN ('White') AND  workclass IN ('Private') OR  occupation = 'Craft-repair' AND  relationship = 'Not-in-family'  GROUP BY race, income, marital_status, relationship, native_country",
 'n_cols_real': 7,
 'n_rows_real': 104,
 'from_tbl_name_syn': 'C1_syn_default_1',
 'join_tbl_name_lst_syn': [],
 'sql_syn': "SELECT race, income, marital_status, relationship, native_country, COUNT(*), AVG(fnlwgt)  FROM C1_syn_default_1  WHERE  N

In [379]:
# Display the 'query_hlngr_score' entry (presumably a Hellinger distance — confirm against calc_dist_scores)
scored_query['query_hlngr_score']

0.2935015880154722

In [380]:
# Display the 'query_ecldn_score' entry (presumably a Euclidean distance — confirm against calc_dist_scores)
scored_query['query_ecldn_score']

0.1386750490563073