### An example for longitudinal multiple child data
##### *(Note: in this example, the database tables are given names that differ from the data CSV filenames but match the JSON metadata filenames.)*

In [1]:
#! pip install --upgrade pip
#! pip install fuzzy_sql-1.1.1b0-py3-none-any.whl
#%matplotlib inline

In [2]:
from config_paths import *
import sys
sys.path.append(str(SRC_DIR))

from fuzzy_sql.fuzzy_sql import *
import json
import os
import sqlite3
from contextlib import closing
from pathlib import Path

# Dataset identifier: selects the data sub-folder and names the database and report files.
DATASET_NAME = 'cms'

# Select the non-interactive 'Agg' backend so figures are rendered to files rather than a GUI window.
import matplotlib
matplotlib.use('Agg')

### PREPROCESSING AND IMPORTING DATA INTO DATABASE
##### (This is typically done only once)

In [3]:
# Resolve the folders holding the metadata, the real sample and the synthetic sample,
# plus the path of the SQLite database file for this dataset.
dataset_root = os.path.join(DATA_DIR, DATASET_NAME)
metadata_dir = os.path.join(dataset_root, 'processed/metadata')
real_dir = os.path.join(dataset_root, 'raw/sample1')
syn_dir = os.path.join(dataset_root, 'raw/sample2')
db_path = os.path.join(DB_DIR, f'{DATASET_NAME}.db')



# Input file names. The three lists below are consumed in lockstep (zipped together),
# so entry i of each list must describe the same table.

# Real-data CSV files (located in real_dir).
real_csvs = [
    "DE1_0_2008_Beneficiary_Summary_File_Sample_1.csv",
    "DE1_0_2009_Beneficiary_Summary_File_Sample_1.csv",
    "DE1_0_2010_Beneficiary_Summary_File_Sample_1.csv",
    "DE1_0_2008_to_2010_Carrier_Claims_Sample_1A.csv",
    "DE1_0_2008_to_2010_Carrier_Claims_Sample_1B.csv",
    "DE1_0_2008_to_2010_Inpatient_Claims_Sample_1.csv",
    "DE1_0_2008_to_2010_Outpatient_Claims_Sample_1.csv",
    "DE1_0_2008_to_2010_Prescription_Drug_Events_Sample_1.csv",
]

# Metadata JSON files (located in metadata_dir); their stems become the real table names.
meta_jsons = [
    's1_ben_sum_2008.json',
    's1_ben_sum_2009.json',
    's1_ben_sum_2010.json',
    's1_carrier_1a.json',
    's1_carrier_1b.json',
    's1_inpatient.json',
    's1_outpatient.json',
    's1_prescrp.json',
]

# Synthetic-data CSV files (located in syn_dir).
syn_csvs = [
    "DE1_0_2008_Beneficiary_Summary_File_Sample_2.csv",
    "DE1_0_2009_Beneficiary_Summary_File_Sample_2.csv",
    "DE1_0_2010_Beneficiary_Summary_File_Sample_2.csv",
    "DE1_0_2008_to_2010_Carrier_Claims_Sample_2A.csv",
    "DE1_0_2008_to_2010_Carrier_Claims_Sample_2B.csv",
    "DE1_0_2008_to_2010_Inpatient_Claims_Sample_2.csv",
    "DE1_0_2008_to_2010_Outpatient_Claims_Sample_2.csv",
    "DE1_0_2008_to_2010_Prescription_Drug_Events_Sample_2.csv",
]

# Number of rows to be extracted from the corresponding tables. This is only added for
# "example" purposes, to reduce query times.
# NOTE(review): `rows` is not referenced by any other cell visible in this notebook - confirm
# whether it is still needed or should be passed to the import step.
rows = [10_000, 10_000, 10_000, 20_000, 20_000, 15_000, 15_000, 15_000]

In [4]:
# Import every real/synthetic CSV pair into the SQLite database at db_path.
# closing() releases the connection when the cell finishes or fails; using the connection
# itself as a context manager only commits (or rolls back) the transaction and would
# otherwise leave the connection open.
with closing(sqlite3.connect(db_path)) as conn, conn:
    for real_csv, meta_json, syn_csv in zip(real_csvs, meta_jsons, syn_csvs):
        # Import real
        # Note: prep_data_for_db can also be used to quickly generate a metadata template
        # (its second return value is discarded here), but this is not shown here.
        data, _ = prep_data_for_db(os.path.join(real_dir, real_csv))
        with open(os.path.join(metadata_dir, meta_json), 'r') as f:
            metadata = json.load(f)
        # Index candidates are derived from the real data and reused for the synthetic table below.
        candidate_db_idx = get_vars_to_index(metadata, data, index_vars_types='cat', cardinality_cutoff=20)
        # Real table names are identical to the json file names (without the extension).
        table_name = Path(meta_json).stem
        make_table(table_name, data, conn, indx_vars=candidate_db_idx)

        # Import syn
        data, _ = prep_data_for_db(os.path.join(syn_dir, syn_csv))
        # Synthetic table names are the real table names with "s1" replaced by "s2".
        table_name = table_name.replace("s1", "s2")
        make_table(table_name, data, conn, indx_vars=candidate_db_idx)


Table s1_ben_sum_2008 is created in the database
.... The index: IDX_s1_ben_sum_2008_PPPYMT_IP is created for the table: s1_ben_sum_2008 in the database
.... The index: IDX_s1_ben_sum_2008_BENE_BIRTH_DT is created for the table: s1_ben_sum_2008 in the database
.... The index: IDX_s1_ben_sum_2008_BENRES_OP is created for the table: s1_ben_sum_2008 in the database
.... The index: IDX_s1_ben_sum_2008_BENRES_CAR is created for the table: s1_ben_sum_2008 in the database
.... The index: IDX_s1_ben_sum_2008_MEDREIMB_CAR is created for the table: s1_ben_sum_2008 in the database
.... The index: IDX_s1_ben_sum_2008_BENRES_IP is created for the table: s1_ben_sum_2008 in the database
.... The index: IDX_s1_ben_sum_2008_PPPYMT_OP is created for the table: s1_ben_sum_2008 in the database
.... The index: IDX_s1_ben_sum_2008_MEDREIMB_IP is created for the table: s1_ben_sum_2008 in the database
.... The index: IDX_s1_ben_sum_2008_PPPYMT_CAR is created for the table: s1_ben_sum_2008 in the database
....

### GENERATING RANDOM QUERIES 

In [None]:
# Build the table-name lists. The names must be identical to those used when the
# tables were created in the database: the json file stems for the real tables,
# and the same names with "s1" replaced by "s2" for the synthetic tables.
real_tbl_lst = [Path(json_name).stem for json_name in meta_jsons]
syn_tbl_lst = [tbl.replace("s1", "s2") for tbl in real_tbl_lst]

In [None]:
# Load the metadata of every table from its json file into a list of dictionaries,
# ordered like real_tbl_lst.
# Note 1: Real and synthetic data share the same metadata file.
# Note 2: Each input table in real_tbl_lst above shall have its own metadata file.
# Note 3: Each json file name shall match the corresponding table name in real_tbl_lst.
metadata_lst = []
for tbl_name in real_tbl_lst:
    meta_path = os.path.join(metadata_dir, f'{tbl_name}.json')
    with open(meta_path, 'r') as f:
        tbl_metadata = json.load(f)
    metadata_lst.append(tbl_metadata)

In [None]:
# Generate random queries against the real tables and their synthetic counterparts.
# NOTE(review): the first argument (3) is presumably the number of queries to generate - confirm
# against the fuzzy_sql documentation.
rnd_queries = gen_aggfltr_queries(
    3,
    db_path,
    real_tbl_lst,
    metadata_lst,
    syn_tbl_lst,
)

### REPORTING 

In [None]:
# Build a reporter over the generated queries, emit the HTML report, then draw one
# violin plot per distance metric. All output file names are prefixed with the dataset name.
rprtr = QryRprt(real_tbl_lst, rnd_queries)
rprtr.print_html_mltpl(f'{DATASET_NAME}.html')

violin_plots = (
    ('Hellinger', f'{DATASET_NAME}_hlngr.png'),
    ('Euclidean', f'{DATASET_NAME}_ecldn.png'),
)
for metric, png_name in violin_plots:
    rprtr.plot_violin(metric, png_name)