### An example of generating *Aggregate-Filter* queries for cross-sectional data
######  Before executing this notebook, please make sure that the data has already been imported into the database.

In [1]:
! pip install --upgrade pip
! pip install fuzzy_sql-2.0.0b0-py3-none-any.whl

Collecting pip
  Using cached pip-23.0.1-py3-none-any.whl (2.1 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.3.1
    Uninstalling pip-22.3.1:
      Successfully uninstalled pip-22.3.1
Successfully installed pip-23.0.1
Processing ./fuzzy_sql-2.0.0b0-py3-none-any.whl
Collecting Jinja2
  Using cached Jinja2-3.1.2-py3-none-any.whl (133 kB)
Collecting jupyter
  Using cached jupyter-1.0.0-py2.py3-none-any.whl (2.7 kB)
Collecting multiprocess
  Using cached multiprocess-0.70.14-py39-none-any.whl (132 kB)
Collecting jsonschema
  Using cached jsonschema-4.17.3-py3-none-any.whl (90 kB)
Collecting pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0
  Using cached pyrsistent-0.19.3-py3-none-any.whl (57 kB)
Collecting qtconsole
  Using cached qtconsole-5.4.0-py3-none-any.whl (121 kB)
Collecting jupyter-console
  Downloading jupyter_console-6.6.2-py3-none-any.whl (24 kB)
Collecting notebook
  Using cached notebook-6.5.2-py3-none-any.whl (439 kB)

In [2]:
import json
import os
from pathlib import Path

from fuzzy_sql.generate import gen_aggfltr_queries
from fuzzy_sql.report import Report

# Dataset identifier: later cells use it to locate the metadata folder and the
# database file, and to name the report output files.
DATASET_NAME = "sdgd"

In [9]:
# Project folders, resolved relative to the current working directory.
_cwd = os.getcwd()
DATA_DIR = os.path.join(_cwd, 'data')
DB_DIR = os.path.join(_cwd, 'databases')

# Per-dataset locations: <DATA_DIR>/<dataset>/metadata and <DB_DIR>/<dataset>.db
metadata_dir = os.path.join(DATA_DIR, DATASET_NAME, 'metadata')
db_path = os.path.join(DB_DIR, DATASET_NAME + '.db')

### GENERATING RANDOM QUERIES 

In [10]:
# Create lists with table names. Table names shall be identical to the names initially created in the database.
real_tbl_lst=['C1']
syn_tbl_lst=['C1_syn_default_1']

# Read metadata from the provided json files into a list of dictionaries.
# Note 1: Both real and synthetic data should have the same metadata file.
# Note 2: Each input table in real_tbl_lst above shall have its own metadata file.
# Note 3: The json file name shall match that of the real data file name in real_tbl_lst.
metadata_lst = []
for tbl_name in real_tbl_lst:
    # JSON is UTF-8 by specification; pass the encoding explicitly so decoding
    # does not depend on the platform's locale default.
    with open(os.path.join(metadata_dir, tbl_name+'.json'), 'r', encoding='utf-8') as f:
        metadata_lst.append(json.load(f))

In [11]:
# Number of random aggregate-filter queries to generate (one progress line is
# printed per query).
# NOTE(review): no random seed is set in this notebook, so the generated
# queries may differ between runs - confirm whether fuzzy_sql supports seeding.
N_QUERIES = 10

rnd_queries = gen_aggfltr_queries(
    N_QUERIES,
    db_path,
    real_tbl_lst,
    metadata_lst,
    syn_tbl_lst,
)

Generated Random Aggregate Filter Query - 1 in 0.5 seconds.
Generated Random Aggregate Filter Query - 2 in 0.6 seconds.
Generated Random Aggregate Filter Query - 3 in 0.5 seconds.
Generated Random Aggregate Filter Query - 4 in 0.5 seconds.
Generated Random Aggregate Filter Query - 5 in 0.5 seconds.
Generated Random Aggregate Filter Query - 6 in 0.5 seconds.
Generated Random Aggregate Filter Query - 7 in 0.5 seconds.
Generated Random Aggregate Filter Query - 8 in 0.7 seconds.
Generated Random Aggregate Filter Query - 9 in 0.7 seconds.
Generated Random Aggregate Filter Query - 10 in 0.5 seconds.


### REPORTING 

In [12]:
# Output file names, all prefixed with the dataset name.
html_fname = f'{DATASET_NAME}.html'
hlngr_fname = f'{DATASET_NAME}_hlngr.png'
ecldn_fname = f'{DATASET_NAME}_ecldn.png'

# Build the report object from the real table names and the generated queries.
rprtr = Report(real_tbl_lst, rnd_queries)

# Presumably writes the query results to an HTML file - confirm against fuzzy_sql.
rprtr.print_html_mltpl(html_fname)

# One violin plot per distance metric; the second argument looks like the
# image file the plot is saved to - confirm against fuzzy_sql.
rprtr.plot_violin('Hellinger', hlngr_fname)
rprtr.plot_violin('Euclidean', ecldn_fname)