In [1]:
import sys
sys.path.append('/home/samer/projects/fuzzy_sql/src') #This will enable reading the modules
from fuzzy_sql.fuzzy_sql import *

In [2]:
def _assign_dtype(df, dict):
    #Correct dtypes of real and syn dataframes before saving in the database 
    #map metaddata into dtype dict with pandas dtypes
    #cols in df shall match keys in in_dict
    assert bool(set(df.columns).intersection(set(dict.keys())))
    out_dict={}
    for key in dict:
        if dict[key] in ['quantitative','continuous','interval','ratio']:
            out_dict[key]='float64'
        elif dict[key] in ['date','time','datetime']:
            out_dict[key]='datetime64'
        else:
            out_dict[key]='category'
    
    for col in df.columns:
        df[col]=df[col].astype(out_dict[col])

    return df


In [3]:
class LONG_QUERY():
    """Generates random queries for baseline-longitudinal datasets.

    On construction the real parent (baseline) and child (longitudinal)
    tables are read from the database and the variables of each table are
    grouped by type (categorical, continuous, date).
    """

    # Metadata type labels, grouped by how the variable is treated.
    _CAT_TYPES = ('qualitative', 'categorical', 'nominal', 'discrete', 'ordinal', 'dichotomous')
    _CNT_TYPES = ('quantitative', 'continuous', 'interval', 'ratio')
    _DT_TYPES = ('date', 'time', 'datetime')

    def __init__(self, db_conn: object, parent_tbl_name: str, child_tbl_name: str, metadata: dict, params: dict):
        """
        Args:
            db_conn: The connection object of the sqlite database where the data exists.
            parent_tbl_name: The name of the parent table (i.e. baseline data) in the database.
            child_tbl_name: The name of the child table (i.e. longitudinal data) in the database.
            metadata: A dictionary with the keys 'parent' and 'child'. Each value is itself a
                dictionary that maps the table's variable names (i.e. column names) to the
                type of the variable. Recognised categorical labels are 'qualitative',
                'categorical', 'nominal', 'discrete', 'ordinal' and 'dichotomous'; continuous
                labels are 'quantitative', 'continuous', 'interval' and 'ratio'; date labels
                are 'date', 'time' and 'datetime'. Variables with any other label are not
                placed in any group.
            params: A dictionary of query-generation parameters (for example the
                operator-weight tables in DFLT_PARAMS). It is stored unchanged.

        Raises:
            KeyError: if ``metadata`` lacks the 'parent' or 'child' entry.
        """
        # Stored on the instance (this used to be assigned to a throw-away local).
        self.QUERY_PARAMS = params

        self.CUR = db_conn.cursor()
        self.RP_NAME = parent_tbl_name  # RP = Real Parent
        self.RC_NAME = child_tbl_name  # RC = Real Child

        # Fetch real data (both parent and child).
        # NOTE: table names are interpolated into the SQL text because identifiers
        # cannot be bound as parameters; only pass trusted table names.
        self.RP_DF = pd.read_sql_query(f'SELECT * FROM {self.RP_NAME}', db_conn)  # Real Parent data frame
        self.RC_DF = pd.read_sql_query(f'SELECT * FROM {self.RC_NAME}', db_conn)  # Real Child data frame

        # Segregate variables into lists based on their types, separately for each
        # table. The ``metadata`` argument is used, not a notebook-level global.
        self.CAT_VARS = {}
        self.CNT_VARS = {}
        self.DT_VARS = {}
        for role in ('parent', 'child'):
            var_types = metadata[role]
            self.CAT_VARS[role] = [name for name, label in var_types.items() if label in self._CAT_TYPES]
            self.CNT_VARS[role] = [name for name, label in var_types.items() if label in self._CNT_TYPES]
            self.DT_VARS[role] = [name for name, label in var_types.items() if label in self._DT_TYPES]

        # TODO: port the remaining single-table setup to the parent/child layout:
        #   - operator bags and weights (AGG_OPS, LOGIC_OPS, NOT_OP_STATE, CAT_OPS,
        #     CNT_OPS) taken from self.QUERY_PARAMS;
        #   - per-variable value bags (categorical, continuous, date) holding the
        #     non-null values of each variable, with ['N/A'] when a variable has none.


In [4]:
# Input files for the sample longitudinal dataset. All of them live under one "ready" folder.
# NOTE(review): this is a machine-specific absolute path; adjust it to your own checkout.
_READY_DIR = "/home/samer/projects/fuzzy_sql/data/longitudinal/ready"

rp_path = _READY_DIR + "/real/b_sample.csv"  # real parent (baseline) data
rc_path = _READY_DIR + "/real/l_sample.csv"  # real child (longitudinal) data
sp_path = _READY_DIR + "/synthetic/b_sample_syn.csv"  # synthetic parent (baseline) data
sc_path = _READY_DIR + "/synthetic/l_sample_syn.csv"  # synthetic child (longitudinal) data
meta_path = _READY_DIR + "/metadata/sample.json"  # metadata describing the variable types

In [5]:
# Load the real/synthetic parent and child CSV files, then the metadata JSON.
# Per the original note, load_csv reads all variables as strings and eliminates
# the apostrophe ' from the data.
rp=load_csv(rp_path) 
rc=load_csv(rc_path) 
sp=load_csv(sp_path)  
sc=load_csv(sc_path) 
# `json` must be imported explicitly: `import _json` only binds the private C
# accelerator module and does not make the name `json` available.
import json
with open(meta_path) as f:
    meta=json.load(f) #metadata for the data 

In [6]:
# Cast the columns of each loaded frame to the dtypes implied by the metadata:
# baseline frames use the 'parent' entry, longitudinal frames the 'child' entry.
rp, sp = (_assign_dtype(frame, meta['parent']) for frame in (rp, sp))
rc, sc = (_assign_dtype(frame, meta['child']) for frame in (rc, sc))

In [7]:
# Default parameters: each table maps an operator (or state) to its weight.
DFLT_PARAMS = {
    # Aggregate functions.
    'AGG_OPS': {
        'AVG': 0.5,
        'SUM': 0.3,
        'MAX': 0.1,
        'MIN': 0.1,
    },
    # Logical connectives.
    'LOGIC_OPS': {
        'AND': 0.5,
        'OR': 0.5,
    },
    # State of the NOT operator, keyed by '0' and '1'.
    'NOT_OP_STATE': {
        '0': 1,
        '1': 0,
    },
    # Comparison operators for categorical variables.
    'CAT_OPS': {
        '=': 0.25,
        '<>': 0.25,
        'LIKE': 0.25,
        'IN': 0.25,
    },
    # Comparison operators for continuous variables.
    'CNT_OPS': {
        '=': 0.2,
        '>': 0.1,
        '<': 0.1,
        '>=': 0.1,
        '<=': 0.1,
        '<>': 0.1,
        'BETWEEN': 0.3,
    },
}

In [8]:
# Open the sqlite database file and store the four frames as tables
# (r = real, s = synthetic; b = baseline/parent, l = longitudinal/child).
conn = sqlite3.connect('fuzzy_sql.db')
for tbl_name, frame in (
    ('sample_r_b', rp),
    ('sample_r_l', rc),
    ('sample_s_b', sp),
    ('sample_s_l', sc),
):
    make_table(tbl_name, frame, conn)

# Query generator over the real parent/child tables, using the default parameters.
test_long = LONG_QUERY(conn, 'sample_r_b', 'sample_r_l', meta, DFLT_PARAMS)

Table sample_r_b is created in the database
Table sample_r_l is created in the database
Table sample_s_b is created in the database
Table sample_s_l is created in the database
