### An example of generating *Aggregate-Filter* queries for longitudinal multiple-child data with customized parameters
###### Before executing this notebook, please make sure that the data has already been imported into the database.

In [None]:
! pip install --upgrade pip
! pip install fuzzy-sql

In [None]:
import json
import os
from pathlib import Path

from fuzzy_sql.randomquery import RandomQuery
from fuzzy_sql.report import Report


# Dataset identifier; reused by later cells to build the metadata folder path,
# the database file name and the report/plot file names.
DATASET_NAME = 'cms'


In [None]:
# Resolve the data and database folders relative to the current working directory.
working_dir = os.getcwd()
DATA_DIR = os.path.join(working_dir, 'data')
DB_DIR = os.path.join(working_dir, 'databases')

# Per-dataset locations: the metadata folder and the database file.
db_filename = f'{DATASET_NAME}.db'
metadata_dir = os.path.join(DATA_DIR, DATASET_NAME, 'metadata')
db_path = os.path.join(DB_DIR, db_filename)

### Generate Customized Random Queries

In [None]:
# define input tables and metadata
# The real (s1_*) and synthetic (s2_*) table names are listed in the same order.
real_tbl_lst=['s1_ben_sum_2008','s1_ben_sum_2009','s1_ben_sum_2010','s1_carrier_1a','s1_carrier_1b','s1_inpatient','s1_outpatient','s1_prescrp']
syn_tbl_lst=['s2_ben_sum_2008','s2_ben_sum_2009','s2_ben_sum_2010','s2_carrier_1a','s2_carrier_1b','s2_inpatient','s2_outpatient','s2_prescrp']

# Load one metadata file per real table, named <table>.json inside metadata_dir.
# The encoding is passed explicitly so decoding does not depend on the platform
# locale (JSON files are expected to be UTF-8).
metadata_lst = []
for tbl_name in real_tbl_lst:
    with open(os.path.join(metadata_dir, tbl_name+'.json'), 'r', encoding='utf-8') as f:
        metadata_lst.append(json.load(f))


# Generate queries while customizing class attributes
n_queries=10
queries = []
k = 0  # number of queries accepted so far
while k < n_queries:
    # Every attempt starts from a fresh RandomQuery whose attributes are overridden below.
    query_obj = RandomQuery(db_path, real_tbl_lst, metadata_lst)
    query_obj.no_groupby_vars = 2  # Restrict number of GROUP BY variables to 2
    query_obj.no_where_vars = 2  # Restrict number of WHERE variables to 2
    query_obj.no_join_tables = 2  # Restrict number of JOIN tables to 2
    # Customizing operations probabilities
    query_obj.oprtns = {
        'AGG_OPS': {'AVG': 0, 'SUM': 1, 'MAX': 0, 'MIN': 0},
        'LOGIC_OPS': {'AND': 1, 'OR': 0},
        'NOT_STATE': {'0': 1, '1': 0},
        'CAT_OPS': {
            '=': 0.25, '<>': 0.25, 'LIKE': 0.15,
            'IN': 0.15, 'NOT LIKE': 0.1, 'NOT IN': 0.1,
        },
        'CNT_OPS': {
            '=': 0.2, '>': 0.1, '<': 0.1, '>=': 0.1,
            '<=': 0.1, '<>': 0.1, 'BETWEEN': 0.2, 'NOT BETWEEN': 0.1,
        },
        'DT_OPS': {
            '=': 0.2, '>': 0.1, '<': 0.1, '>=': 0, '<=': 0,
            '<>': 0.1, 'BETWEEN': 0.2, 'IN': 0.1, 'NOT BETWEEN': 0.1, 'NOT IN': 0.1,
        },
        'FILTER_TYPE': {'WHERE': 0.5, 'AND': 0.5},
        'JOIN_TYPE': {'JOIN': 0, 'LEFT JOIN': 1},
    }
    (real_expr, real_groupby_lst, real_from_tbl,
     real_join_tbl_lst, agg_fntn_terms) = query_obj.compile_aggfltr_expr()
    # Only proceed when the timing check passes; otherwise the loop retries with a
    # new random query (skips queries that take a long time to return results).
    if query_obj._test_query_time(db_path, real_expr):
        rnd_query = query_obj.make_twin_aggfltr_query(
            syn_tbl_lst,
            real_expr,
            real_groupby_lst,
            real_from_tbl,
            real_join_tbl_lst,
            agg_fntn_terms,
        )
        matched_query = query_obj._match_queries4agg(rnd_query)
        scored_query = query_obj.gather_metrics4agg(matched_query)
        queries.append(scored_query)
        k += 1
        print('Generated Random Aggregate Filter Query - {} '.format(k))
    

### REPORTING 

In [None]:
# Build the report from the real table names and the scored queries,
# then write the HTML report followed by one violin plot per metric.
rprtr = Report(real_tbl_lst, queries)
rprtr.print_html_mltpl(f'{DATASET_NAME}_customized.html')

violin_plots = (
    ('Hellinger', f'{DATASET_NAME}_customized_hlngr.png'),
    ('Euclidean', f'{DATASET_NAME}_customized_ecldn.png'),
)
for metric_name, png_name in violin_plots:
    rprtr.plot_violin(metric_name, png_name)