In [1]:
import sys
sys.path.append('/home/samer/projects/fuzzy_sql/src') #This will enable reading the modules
from fuzzy_sql.fuzzy_sql import *

In [2]:
def _assign_dtype(df, dict):
    #Correct dtypes of real and syn dataframes before saving in the database 
    #map metaddata into dtype dict with pandas dtypes
    #cols in df shall match keys in in_dict
    assert bool(set(df.columns).intersection(set(dict.keys())))
    out_dict={}
    for key in dict:
        if dict[key] in ['quantitative','continuous','interval','ratio']:
            out_dict[key]='float64'
        elif dict[key] in ['date','time','datetime']:
            out_dict[key]='datetime64'
        else:
            out_dict[key]='category'
    
    for col in df.columns:
        df[col]=df[col].astype(out_dict[col])

    return df


In [3]:
from logging import exception


class LONG_QUERY():
    """ Generates random queries for baseline-longitudinal datasets. 
    """

    def __init__(self, db_conn: object, parent_tbl_name: str, child_tbl_name: str, metadata: dict,params: dict , seed=False):
        """ 
        Args:
            db_conn: The connection object of the sqlite database where the data exists.
            parent_tbl_name: The name of the parent table (i.e. baseline data) in the database.
            child_tbl_name: The name of the child table (i.e. longitudinal data) in the database.
            metadata: A dictionary that includes table's variable names (i.e. column names) as keys and types of variables as values. THey types shall be restricted to: 'continuous', 'data' and 'nominal'. Any table shall have at least one nominal variable.
            params: A dictionary that includes the set of parameters that are necessary for generating the random queries. 
            seed: If set to True, generated random queries become deterministic. 
        """
        self.SEED=seed
        self.seed_no=141

        self.CUR = db_conn.cursor()
        self.PARENT_NAME = parent_tbl_name #RP = Real Parent
        self.CHILD_NAME = child_tbl_name #RC = Real Child
        self.metadata=copy.deepcopy(metadata)
        
        #Fetch Real data (both parent and child)
        self.PARENT_DF=pd.read_sql_query(f'SELECT * FROM {self.PARENT_NAME}', db_conn) #Real Parent Dataframe
        self.CHILD_DF=pd.read_sql_query(f'SELECT * FROM {self.CHILD_NAME}', db_conn) #Real Child Dataframe


        #Get foreign key name
        self.FKEY_NAME=self.metadata['key']

        #Delete foreign key from child variables to avoid repetition of variable in various expression
        del self.metadata['child'][self.FKEY_NAME]


        #Segregate variables into lists based on their types
        self.CAT_VARS={} #Parent Categorical Variables
        self.CNT_VARS={}
        self.DT_VARS={}
        self.CAT_VARS['parent']=[key for key, value in self.metadata['parent'].items() if value in ['qualitative','categorical','nominal','discrete','ordinal','dichotomous']]
        self.CAT_VARS['child']=[key for key, value in self.metadata['child'].items() if value in ['qualitative','categorical','nominal','discrete','ordinal','dichotomous']]
        self.CNT_VARS['parent']=[key for key, value in self.metadata['parent'].items() if value in ['quantitative','continuous','interval','ratio']]
        self.CNT_VARS['child']=[key for key, value in self.metadata['child'].items() if value in ['quantitative','continuous','interval','ratio']]
        self.DT_VARS['parent']=[key for key, value in self.metadata['parent'].items() if value in ['date','time','datetime']]
        self.DT_VARS['child']=[key for key, value in self.metadata['child'].items() if value in ['date','time','datetime']]



        # Aggregate function applies only when there is at least one continuous variable 
        self.AGG_FNCTN=True if len(self.CNT_VARS['parent'])!=0 or len(self.CNT_VARS['child'])!=0 else False

        # Define random query attributes
        self.ATTRS=params #General attributes that can be set by the user


        # Generate dictionaries of bags for various variables
        self.CAT_VAL_BAGS={}
        self.CNT_VAL_BAGS={}
        self.DT_VAL_BAGS={}
        self.CAT_VAL_BAGS['parent']=self._make_bags(self.PARENT_DF[self.CAT_VARS['parent']])
        self.CNT_VAL_BAGS['parent']=self._make_bags(self.PARENT_DF[self.CNT_VARS['parent']])
        self.DT_VAL_BAGS['parent']=self._make_bags(self.PARENT_DF[self.DT_VARS['parent']])
        self.CAT_VAL_BAGS['child']=self._make_bags(self.CHILD_DF[self.CAT_VARS['child']])
        self.CNT_VAL_BAGS['child']=self._make_bags(self.CHILD_DF[self.CNT_VARS['child']])
        self.DT_VAL_BAGS['child']=self._make_bags(self.CHILD_DF[self.DT_VARS['child']])
        
        self.max_no_in_terms=5 #Maximum number of n terms (set it to 0 if you do not want to impose any limit)

    def _make_bags(self,df:pd.DataFrame)-> dict:
        val_bags={}
        for var in df.columns:
            vals=df[var].values
            vals=[x for x in vals if x==x] #drop nan
            vals=list(filter(None, vals)) #drop None
            val_bags[var]=vals if len(vals)!=0 else ['N/A']
        return val_bags

    def make_query(self,cur: object, query_exp: str)-> pd.DataFrame:
        cur.execute(query_exp)
        query = cur.fetchall()
        query=pd.DataFrame(query, columns=[description[0] for description in cur.description])
        return query

    # def _get_var_idx(self, var_name):
    #     if var_name in self.PARENT_DF.columns: #search in parent
    #         idx=self.PARENT_DF.columns.get_loc(var_name)
    #         return 'parent',idx
    #     elif var_name in self.CHILD_DF.columns: #search in child
    #         idx=self.CHILD_DF.columns.get_loc(var_name)
    #         return 'child', idx
    #     else:
    #         raise Exception(f"{var_name} not found in parent or child tables!")

    def _mix_vars(self,*args):
        #accepts variable length of arguments as tuples where each tuple consists of the table name and some variables that belong to that table
        # returns mixed variables in one list but each variable is concatenated with its respective table name
        mixed_vars=[]
        for arg in args:
            vars=[arg[0]+'.'+x for x in arg[1]]
            mixed_vars+=vars
        return mixed_vars

    def _get_twin_lst(self, in_lst: list,in_parent_name: str, out_parent_name: str, in_child_name: str, out_child_name: str)-> list:
        # replaces the table names of teh real dataset by the table names of the synthetic datasets.
        out_lst=[var.replace(in_parent_name,out_parent_name ) for var in in_lst] 
        out_lst=[var.replace(in_child_name,out_child_name ) for var in out_lst] 
        return out_lst

    def _get_rnd_groupby_lst(self)-> list:
        #returned randomly picked cat vars including the concatenated table name of the real data (ie that is defined in teh class)
        #Note: You can group by CAT_VARS whether from parent or child or both
        if self.SEED:
            np.random.seed(self.seed_no)
            random.seed(self.seed_no)
        all_cat_vars=self._mix_vars((self.PARENT_NAME,self.CAT_VARS['parent']),(self.CHILD_NAME,self.CAT_VARS['child']))
        # parent_cat_vars=[self.PARENT_NAME+'.'+x for x in self.CAT_VARS['parent']]
        # child_cat_vars=[self.CHILD_NAME+'.'+x for x in self.CAT_VARS['child']]
        # all_cat_vars=parent_cat_vars +child_cat_vars
        n_vars_bag=np.arange(1, 1+len(all_cat_vars)) 
        if self.ATTRS['LESS_GRP_VARS']:#define slope-down discrete distribution 
            n_var_probs=n_vars_bag[::-1]/n_vars_bag.sum()
            # n_var_probs= np.zeros_like(n_vars_bag)
            # n_var_probs[0]=1
            n_vars=np.random.choice(n_vars_bag, p=n_var_probs)
        else:
            n_vars=np.random.choice(n_vars_bag)
        picked_vars = random.sample(all_cat_vars, n_vars)
        return picked_vars


    def _get_rnd_aggfntn_tpl(self) -> tuple:
        #returns a random tuple of agg function and continuous OR date variable 
        # Note: continuous variable can be from either parent or child tables 
        if self.SEED:
            np.random.seed(self.seed_no)
            random.seed(self.seed_no)
        all_possible_vars=self._mix_vars((self.PARENT_NAME,self.CNT_VARS['parent']),(self.PARENT_NAME,self.DT_VARS['parent']),(self.CHILD_NAME,self.CNT_VARS['child']),(self.CHILD_NAME,self.DT_VARS['child']))
        picked_var=np.random.choice(all_possible_vars)
        picked_op=np.random.choice(list(self.ATTRS['AGG_OPS'].keys()), p=list(self.ATTRS['AGG_OPS'].values()))
        return (picked_op,picked_var)

    
    def _get_rnd_where_lst(self) -> tuple:
        # use WHERE with mix of CAT, CNT, DT variables from both PARENT and CHILD
        if self.SEED:
            np.random.seed(self.seed_no)
            random.seed(self.seed_no)
        all_possible_vars=self._mix_vars((self.PARENT_NAME,self.CAT_VARS['parent']),(self.CHILD_NAME,self.CAT_VARS['child']),(self.PARENT_NAME,self.CNT_VARS['parent']),(self.CHILD_NAME,self.CNT_VARS['child']),(self.PARENT_NAME,self.DT_VARS['parent']),(self.CHILD_NAME,self.DT_VARS['child']))
        n_vars_bag=np.arange(1, 1+len(all_possible_vars)) #this gives possible number of terms in the where clause
        if self.ATTRS['LESS_CMP_VARS']:#define slope-down discrete distribution 
            n_var_probs=n_vars_bag[::-1]/n_vars_bag.sum()
            # n_var_probs= np.zeros_like(n_vars_bag)
            # n_var_probs[0]=1
            n_vars=np.random.choice(n_vars_bag, p=n_var_probs)
        else:
            n_vars=np.random.choice(n_vars_bag)
        picked_vars = random.sample(all_possible_vars, n_vars)

        all_cat_vars=np.concatenate(list(self.CAT_VARS.values()))
        all_cnt_vars=np.concatenate(list(self.CNT_VARS.values()))
        all_dt_vars=np.concatenate(list(self.DT_VARS.values()))
        terms=[]
        log_ops=[]
        for long_var_name in picked_vars: #This loop will find the a proper random value comparison operation and proper random value for all the picked variables
            #var=long_var_name[long_var_name.find(".")+1:]
            x=long_var_name.split(".") #Note is assumed that variable names do NOT include any "."
            var_tbl=x[0]
            var=x[1]
            var_tbl_rank='parent' if var_tbl==self.PARENT_NAME else 'child'
            
            #adding not to long variable name 
            not_status=np.random.choice(list(self.ATTRS['NOT_STATE'].keys()), p=list(self.ATTRS['NOT_STATE'].values()) )
            selected_long_var_name= 'NOT '+long_var_name if not_status=='1' else long_var_name
            
            if var in all_cat_vars:
                picked_cmp_op=np.random.choice(list(self.ATTRS['CAT_OPS'].keys()),p=list(self.ATTRS['CAT_OPS'].values()))
                if picked_cmp_op=='IN' or picked_cmp_op=='NOT IN' :
                    possible_no_of_in_terms=np.arange(2,len(self.CAT_VAL_BAGS[var_tbl_rank][var]))
                    no_of_in_terms=np.min([np.random.choice(possible_no_of_in_terms),self.max_no_in_terms]) if self.max_no_in_terms != 0 else np.random.choice(possible_no_of_in_terms)
                    vals=np.random.choice(self.CAT_VAL_BAGS[var_tbl_rank][var], size=no_of_in_terms)
                    if picked_cmp_op=='IN':
                        term =f" {selected_long_var_name} IN {tuple(vals)} "
                    else:
                        term=f" {selected_long_var_name} NOT IN {tuple(vals)} "
                else:
                    val=np.random.choice(self.CAT_VAL_BAGS[var_tbl_rank][var])
                    term=f" {selected_long_var_name} {picked_cmp_op} {val} "
            
            elif var in all_cnt_vars:
                picked_cmp_op=np.random.choice(list(self.ATTRS['CNT_OPS'].keys()),p=list(self.ATTRS['CNT_OPS'].values()))
                if picked_cmp_op=='BETWEEN' or picked_cmp_op=='NOT BETWEEN':
                    lower_bound_bag=self.CNT_VAL_BAGS[var_tbl_rank][var]
                    lower_bound=np.random.choice(lower_bound_bag)
                    upper_bound_bag=[x for x in lower_bound_bag if x>=lower_bound]
                    upper_bound=np.random.choice(upper_bound_bag)
                    if picked_cmp_op=='BETWEEN':
                        term=f" {selected_long_var_name} BETWEEN {lower_bound} AND {upper_bound} "
                    else:
                        term=f" {selected_long_var_name} NOT BETWEEN {lower_bound} AND {upper_bound} "
                else:
                    val=np.random.choice(self.CNT_VAL_BAGS[var_tbl_rank][var])
                    term=f" {selected_long_var_name} {picked_cmp_op} {val} "
            
            elif var in all_dt_vars:
                picked_cmp_op=np.random.choice(list(self.ATTRS['DT_OPS'].keys()),p=list(self.ATTRS['DT_OPS'].values()))
                if picked_cmp_op=='BETWEEN':
                    pass
                elif picked_cmp_op=='IN':
                    pass
                else:
                    pass
            else:
                raise Exception(f"Can not find {var} in the lists of all variables!!")
            
            terms.append(term)
        
        selected_logic_ops=np.random.choice(list(self.ATTRS['LOGIC_OPS'].keys()), size=len(terms)-1, p=list(self.ATTRS['LOGIC_OPS'].values()))
        return terms, selected_logic_ops



#########################################################################

    def _build_agg_expr(self,  pname: str, cname: str, fkey: str, groupby_lst: list) -> str:
        expr1=f'SELECT *,COUNT(*) FROM {pname} JOIN {cname} ON {pname}.{fkey} = {cname}.{fkey}'
        expr2_1=' GROUP BY '
        expr2_2=f'{groupby_lst}'
        expr2_2=expr2_2.replace("[","")
        expr2_2=expr2_2.replace("]","")
        expr2_2=expr2_2.replace("'","")
        return expr1+expr2_1+expr2_2

    
    def make_single_agg_query(self) -> dict:
        dic={}
        single_grp_lst=self._get_rnd_groupby_lst()
        single_expr=self._build_agg_expr(self.PARENT_NAME, self.CHILD_NAME,self.FKEY_NAME, single_grp_lst)
        query=self.make_query(self.CUR, single_expr)
        dic['query']=query
        dic['query_desc']={
            "type":"single_agg",
            "sql":single_expr,
            "n_rows":query.shape[0],
            "n_cols":query.shape[1]
        }
        return dic

    def make_twin_agg_query(self, twin_parent_name, twin_child_name):
        dic={}
        real_grp_lst =self._get_rnd_groupby_lst()
        syn_grp_lst=self._get_twin_lst(real_grp_lst, self.PARENT_NAME,twin_parent_name, self.CHILD_NAME,twin_child_name)
        real_expr=self._build_agg_expr(self.PARENT_NAME, self.CHILD_NAME,self.FKEY_NAME, real_grp_lst)
        syn_expr=self._build_agg_expr(twin_parent_name, twin_child_name,self.FKEY_NAME, syn_grp_lst)
        query_real=self.make_query(self.CUR, real_expr)
        query_syn=self.make_query(self.CUR, syn_expr)
        dic['query_real']=query_real
        dic['query_syn']=query_syn
        dic['query_desc']={
            "type":"twin_agg",
            "sql_real":real_expr,
            "n_cols_real":query_real.shape[1],
            "n_rows_real":query_real.shape[0],
            "sql_syn":syn_expr,
            "n_cols_syn":query_syn.shape[1],
            "n_rows_syn":query_syn.shape[0],
        }
        return dic

#-------------------------------------------------------------------------------------------

    def _build_agg_expr_w_aggfntn(self,pname: str, cname: str, fkey: str, agg_fntn_tpl: tuple, groupby_lst: list) -> str:
        expr1=f'SELECT *,COUNT(*), {agg_fntn_tpl[0]}({agg_fntn_tpl[1]}) FROM {pname} JOIN {cname} ON {pname}.{fkey} = {cname}.{fkey}'
        expr2_1=' GROUP BY '
        expr2_2=f'{groupby_lst}'
        expr2_2=expr2_2.replace("[","")
        expr2_2=expr2_2.replace("]","")
        expr2_2=expr2_2.replace("'","")
        expr=expr1+expr2_1+expr2_2
        return expr
    

    def make_single_agg_query_w_aggfntn(self):
        dic={}
        single_grp_lst=self._get_rnd_groupby_lst()
        agg_fntn_tpl=self._get_rnd_aggfntn_tpl()
        expr=self._build_agg_expr_w_aggfntn(self.PARENT_NAME,self.CHILD_NAME, self.FKEY_NAME,agg_fntn_tpl,single_grp_lst)
        query=self.make_query(self.CUR, expr)
        dic['query']=query
        dic['query_desc']={
            "type":"single_agg",
            "sql":expr,
            "n_rows":query.shape[0],
            "n_cols":query.shape[1]
        }
        return dic

    def make_twin_agg_query_w_aggfntn(self,twintbl_parent_name,twintbl_child_name):
        dic={}
        real_groupby_lst=self._get_rnd_groupby_lst()
        syn_groupby_lst=self._get_twin_lst(real_groupby_lst, self.PARENT_NAME,twintbl_parent_name, self.CHILD_NAME, twintbl_child_name)
        real_aggfntn_tpl=self._get_rnd_aggfntn_tpl()
        syn_aggfntn_tpl=self._get_twin_lst(real_aggfntn_tpl, self.PARENT_NAME,twintbl_parent_name, self.CHILD_NAME, twintbl_child_name)
        real_expr=self._build_agg_expr_w_aggfntn(self.PARENT_NAME,self.CHILD_NAME,self.FKEY_NAME,real_aggfntn_tpl, real_groupby_lst)
        syn_expr=self._build_agg_expr_w_aggfntn(twintbl_parent_name,twintbl_child_name,self.FKEY_NAME,syn_aggfntn_tpl, syn_groupby_lst)
        query_real=self.make_query(self.CUR, real_expr)
        query_syn=self.make_query(self.CUR, syn_expr)
        dic['query_real']=query_real
        dic['query_syn']=query_syn
        dic['query_desc']={
            "type":"twin_agg",
            "sql_real":real_expr,
            "n_cols_real":query_real.shape[1],
            "n_rows_real":query_real.shape[0],
            "sql_syn":syn_expr,
            "n_cols_syn":query_syn.shape[1],
            "n_rows_syn":query_syn.shape[0],
            }
        return dic


#######################################################################################

    def _build_fltr_expr(self,  pname: str, cname: str, fkey: str, where_terms: list, log_ops:list ) -> str:
        expr1=f'SELECT * FROM {pname} JOIN {cname} ON {pname}.{fkey} = {cname}.{fkey} WHERE '
        where_expr=[None]*(len(where_terms)+len(log_ops))
        where_expr[::2]=where_terms
        where_expr[1::2]=log_ops
        where_expr=' '.join(x for x in where_expr )
        where_expr=where_expr + ' '
        return expr1+where_expr


    def make_single_fltr_query(self) -> dict:
        dic={}
        where_terms, log_ops=self._get_rnd_where_lst()
        single_expr=self._build_fltr_expr(self.PARENT_NAME,self.CHILD_NAME, self.FKEY_NAME, where_terms, log_ops)
        query=self.make_query(self.CUR, single_expr)
        dic['query']=query
        dic['query_desc']={
            "type":"single_fltr",
            "sql":single_expr,
            "n_rows":query.shape[0],
            "n_cols":query.shape[1]
        }
        return dic

    def make_twin_fltr_query(self, twin_parent_name: str, twin_child_name:str) -> dict:
        dic={}
        real_where_terms, log_ops =self._get_rnd_where_lst()
        syn_where_terms=self._get_twin_lst(real_where_terms, self.PARENT_NAME,twin_parent_name, self.CHILD_NAME,twin_child_name)
        real_expr=self._build_fltr_expr(self.PARENT_NAME, self.CHILD_NAME,self.FKEY_NAME, real_where_terms, log_ops)
        syn_expr=self._build_fltr_expr(twin_parent_name, twin_child_name,self.FKEY_NAME, syn_where_terms,log_ops)
        query_real=self.make_query(self.CUR, real_expr)
        query_syn=self.make_query(self.CUR, syn_expr)
        dic['query_real']=query_real
        dic['query_syn']=query_syn
        dic['query_desc']={
            "type":"twin_fltr",
            "sql_real":real_expr,
            "n_cols_real":query_real.shape[1],
            "n_rows_real":query_real.shape[0],
            "sql_syn":syn_expr,
            "n_cols_syn":query_syn.shape[1],
            "n_rows_syn":query_syn.shape[0],
        }
        return dic

##########################################################################################

    def _build_aggfltr_expr(self,  pname: str, cname: str, fkey: str, groupby_lst: list, where_terms: list, log_ops: list) -> str:
        expr1=f'SELECT *,COUNT(*) FROM {pname} JOIN {cname} ON {pname}.{fkey} = {cname}.{fkey} '
        expr2=np.random.choice(list(self.ATTRS['JOIN_CNDTN'].keys()), p=list(self.ATTRS['JOIN_CNDTN'].values()))+' '
        expr2_1=[None]*(len(where_terms)+len(log_ops))
        expr2_1[::2]=where_terms
        expr2_1[1::2]=log_ops
        expr2_1=' '.join(x for x in expr2_1)
        expr2_1='('+expr2_1+')' 
        expr3_1=' GROUP BY '
        expr3_2=f'{groupby_lst}'
        expr3_2=expr3_2.replace("[","")
        expr3_2=expr3_2.replace("]","")
        expr3_2=expr3_2.replace("'","")
        return expr1+expr2+expr2_1+expr3_1+expr3_2


    def make_single_aggfltr_query(self) -> dict:
        dic={}
        grp_lst=self._get_rnd_groupby_lst()
        where_terms, log_ops=self._get_rnd_where_lst()
        single_expr=self._build_aggfltr_expr(self.PARENT_NAME, self.CHILD_NAME, self.FKEY_NAME,grp_lst, where_terms,log_ops )
        query=self.make_query(self.CUR, single_expr)
        dic['query']=query
        dic['query_desc']={
            "type":"single_aggfltr",
            "sql":single_expr,
            "n_rows":query.shape[0],
            "n_cols":query.shape[1]
        }
        return dic


    def make_twin_aggfltr_query(self, twin_parent_name: str, twin_child_name:str) -> dict:
        dic={}
        
        real_grp_lst =self._get_rnd_groupby_lst()
        syn_grp_lst=self._get_twin_lst(real_grp_lst, self.PARENT_NAME,twin_parent_name, self.CHILD_NAME,twin_child_name)

        real_where_terms, log_ops =self._get_rnd_where_lst()
        syn_where_terms=self._get_twin_lst(real_where_terms, self.PARENT_NAME,twin_parent_name, self.CHILD_NAME,twin_child_name)
        
        real_expr=self._build_aggfltr_expr(self.PARENT_NAME, self.CHILD_NAME, self.FKEY_NAME,real_grp_lst, real_where_terms,log_ops)
        syn_expr=self._build_aggfltr_expr(twin_parent_name, twin_child_name, self.FKEY_NAME,syn_grp_lst, syn_where_terms,log_ops)

        query_real=self.make_query(self.CUR, real_expr)
        query_syn=self.make_query(self.CUR, syn_expr)
        dic['query_real']=query_real
        dic['query_syn']=query_syn
        dic['query_desc']={
            "type":"twin_aggfltr",
            "sql_real":real_expr,
            "n_cols_real":query_real.shape[1],
            "n_rows_real":query_real.shape[0],
            "sql_syn":syn_expr,
            "n_cols_syn":query_syn.shape[1],
            "n_rows_syn":query_syn.shape[0],
        }
        return dic


#-----------------------------------------------------------------------------------------------------

    def _build_aggfltr_expr_w_aggfntn(self,pname: str, cname: str, fkey: str, agg_fntn_tpl: tuple, groupby_lst: list, where_terms: list, log_ops: list) -> str:
        expr1=f'SELECT *,COUNT(*), {agg_fntn_tpl[0]}({agg_fntn_tpl[1]}) FROM {pname} JOIN {cname} ON {pname}.{fkey} = {cname}.{fkey} '
        expr2=np.random.choice(list(self.ATTRS['JOIN_CNDTN'].keys()), p=list(self.ATTRS['JOIN_CNDTN'].values()))+' '
        expr2_1=[None]*(len(where_terms)+len(log_ops))
        expr2_1[::2]=where_terms
        expr2_1[1::2]=log_ops
        expr2_1=' '.join(x for x in expr2_1)
        expr2_1='('+expr2_1+')' 
        expr3_1=' GROUP BY '
        expr3_2=f'{groupby_lst}'
        expr3_2=expr3_2.replace("[","")
        expr3_2=expr3_2.replace("]","")
        expr3_2=expr3_2.replace("'","")
        return expr1+expr2+expr2_1+expr3_1+expr3_2



    def make_single_aggfltr_query_w_aggfntn(self) -> dict:
        dic={}
        agg_fntn_tpl=self._get_rnd_aggfntn_tpl()
        grp_lst=self._get_rnd_groupby_lst()
        where_terms, log_ops=self._get_rnd_where_lst()
        single_expr=self._build_aggfltr_expr_w_aggfntn(self.PARENT_NAME, self.CHILD_NAME, self.FKEY_NAME,agg_fntn_tpl,grp_lst, where_terms,log_ops )
        query=self.make_query(self.CUR, single_expr)
        dic['query']=query
        dic['query_desc']={
            "type":"single_aggfltr",
            "sql":single_expr,
            "n_rows":query.shape[0],
            "n_cols":query.shape[1]
        }
        return dic



    def make_twin_aggfltr_query_w_aggfntn(self, twin_parent_name: str, twin_child_name:str) -> dict:
        dic={}
        real_agg_fntn_tpl=self._get_rnd_aggfntn_tpl()
        syn_agg_fntn_tpl=tuple(self._get_twin_lst(real_agg_fntn_tpl, self.PARENT_NAME,twin_parent_name, self.CHILD_NAME,twin_child_name))

        real_grp_lst =self._get_rnd_groupby_lst()
        syn_grp_lst=self._get_twin_lst(real_grp_lst, self.PARENT_NAME,twin_parent_name, self.CHILD_NAME,twin_child_name)

        real_where_terms, log_ops =self._get_rnd_where_lst()
        syn_where_terms=self._get_twin_lst(real_where_terms, self.PARENT_NAME,twin_parent_name, self.CHILD_NAME,twin_child_name)

        real_expr=self._build_aggfltr_expr_w_aggfntn(self.PARENT_NAME, self.CHILD_NAME, self.FKEY_NAME,real_agg_fntn_tpl,real_grp_lst, real_where_terms,log_ops )
        syn_expr=self._build_aggfltr_expr_w_aggfntn(twin_parent_name, twin_child_name, self.FKEY_NAME,syn_agg_fntn_tpl,syn_grp_lst, syn_where_terms,log_ops )

        query_real=self.make_query(self.CUR, real_expr)
        query_syn=self.make_query(self.CUR, syn_expr)
        dic['query_real']=query_real
        dic['query_syn']=query_syn
        dic['query_desc']={
            "type":"twin_aggfltr",
            "sql_real":real_expr,
            "n_cols_real":query_real.shape[1],
            "n_rows_real":query_real.shape[0],
            "sql_syn":syn_expr,
            "n_cols_syn":query_syn.shape[1],
            "n_rows_syn":query_syn.shape[0],
        }
        return dic





In [4]:
# Folder that holds the prepared longitudinal sample data
_READY_DIR = "/home/samer/projects/fuzzy_sql/data/longitudinal/ready"

rp_path = _READY_DIR + "/real/b_sample.csv"  # real parent (baseline) path
rc_path = _READY_DIR + "/real/l_sample.csv"  # real child (longitudinal) path
sp_path = _READY_DIR + "/synthetic/b_sample_syn.csv"  # synthetic parent (baseline) path
sc_path = _READY_DIR + "/synthetic/l_sample_syn.csv"  # synthetic child (longitudinal) path
meta_path = _READY_DIR + "/metadata/sample.json"  # metadata path

In [5]:
# Load the four tables (real/synthetic x parent/child).
# Per the original note, load_csv reads all variables as strings and eliminates
# the apostrophe ' -- TODO confirm against fuzzy_sql.load_csv.
rp=load_csv(rp_path)
rc=load_csv(rc_path)
sp=load_csv(sp_path)
sc=load_csv(sc_path)

# Replace dots in variable names by _ to avoid conflicts in sql naming.
# This is applied to every table (previously only rp was normalised, twice), so that
# real and synthetic tables end up with the same column names.
for _df in (rp, rc, sp, sc):
    _df.columns=[col.replace(".","_") for col in _df.columns]

# json.load is used below, so the public json module is what must be imported
# (the previous `import _json` is the private C accelerator and does not provide `json`).
import json
with open(meta_path) as f:
    meta=json.load(f) #metadata for the data

In [6]:
# Cast every loaded table to the dtypes given by its metadata section:
# parent tables use meta['parent'], child tables use meta['child'].
rp, rc, sp, sc = (
    _assign_dtype(table, meta[section])
    for table, section in ((rp, 'parent'), (rc, 'child'), (sp, 'parent'), (sc, 'child'))
)

In [7]:
# Default parameters of the random query generator.
# In every nested dictionary the keys are the candidate choices and the values are
# the probabilities with which they are drawn.
DFLT_PARAMS = {
    # aggregate functions
    'AGG_OPS': {
        'AVG': 0.5,
        'SUM': 0.3,
        'MAX': 0.1,
        'MIN': 0.1,
    },
    # logical operators placed between comparison terms
    'LOGIC_OPS': {
        'AND': 0.9,
        'OR': 0.1,
    },
    # '1' means the comparison term is prefixed with NOT
    'NOT_STATE': {
        '0': 0.8,
        '1': 0.2,
    },
    # comparison operators for categorical variables
    'CAT_OPS': {
        '=': 0.25,
        '<>': 0.25,
        'LIKE': 0.15,
        'IN': 0.15,
        'NOT LIKE': 0.1,
        'NOT IN': 0.1,
    },
    # comparison operators for continuous variables
    'CNT_OPS': {
        '=': 0.2,
        '>': 0.1,
        '<': 0.1,
        '>=': 0.1,
        '<=': 0.1,
        '<>': 0.1,
        'BETWEEN': 0.2,
        'NOT BETWEEN': 0.1,
    },
    # comparison operators for date variables
    'DT_OPS': {
        '=': 0.2,
        '>': 0.1,
        '<': 0.1,
        '>=': 0,
        '<=': 0,
        '<>': 0.1,
        'BETWEEN': 0.2,
        'IN': 0.1,
        'NOT BETWEEN': 0.1,
        'NOT IN': 0.1,
    },
    # enforce bias in random queries toward smaller number of groupby vars.
    # Default is no bias (i.e. uniform sampling)
    'LESS_GRP_VARS': False,
    # enforce bias in random queries toward small number of comparison terms.
    # Default is no bias (i.e. uniform sampling)
    'LESS_CMP_VARS': False,
    # Use WHERE or AND with JOIN CLAUSE
    'JOIN_CNDTN': {
        'WHERE': 0.5,
        'AND': 0.5,
    },
}

In [8]:
# Import the real and synthetic tables into the sqlite database
conn = sqlite3.connect('fuzzy_sql.db')
for tbl_name, tbl_data in (
    ('sample_r_b', rp),  # real parent (baseline)
    ('sample_r_l', rc),  # real child (longitudinal)
    ('sample_s_b', sp),  # synthetic parent (baseline)
    ('sample_s_l', sc),  # synthetic child (longitudinal)
):
    make_table(tbl_name, tbl_data, conn)



Table sample_r_b already exists in the database
Table sample_r_l already exists in the database
Table sample_s_b already exists in the database
Table sample_s_l already exists in the database


In [9]:
# Smoke run: one single-table-pair query and one twin (real vs synthetic) query
# for each of the five query flavours.
twin_tables=('sample_s_b','sample_s_l')  # synthetic parent and child table names
self=LONG_QUERY(conn,'sample_r_b','sample_r_l', meta,DFLT_PARAMS)

# 1) aggregate
smk1=self.make_single_agg_query()
T_smk1=self.make_twin_agg_query(*twin_tables)
# 2) aggregate with an aggregate function
smk2=self.make_single_agg_query_w_aggfntn()
T_smk2=self.make_twin_agg_query_w_aggfntn(*twin_tables)
# 3) filter
smk3=self.make_single_fltr_query()
T_smk3=self.make_twin_fltr_query(*twin_tables)
# 4) aggregate + filter
smk4=self.make_single_aggfltr_query()
T_smk4=self.make_twin_aggfltr_query(*twin_tables)
# 5) aggregate + filter with an aggregate function
smk5=self.make_single_aggfltr_query_w_aggfntn()
T_smk5=self.make_twin_aggfltr_query_w_aggfntn(*twin_tables)


In [16]:
# Show the description (query type, SQL texts and result shapes) of the last twin query
T_smk5['query_desc']

{'type': 'twin_aggfltr',
 'sql_real': "SELECT *,COUNT(*), AVG(sample_r_l.NECODE) FROM sample_r_b JOIN sample_r_l ON sample_r_b.PNUM_R = sample_r_l.PNUM_R AND ( sample_r_b.DNR <> 0  AND  sample_r_b.FEMALE <> 0  AND  sample_r_b.RACE = 3  AND  sample_r_b.PL_NCHS2 IN ('2', '1', '1', '1', '3')  AND  sample_r_b.PL_CBSA <> 2  AND  sample_r_l.TOTCHG BETWEEN 25324.0 AND 75821.0  AND  sample_r_b.NEOMAT LIKE 0  OR  sample_r_l.ASCHED <> 0  AND  NOT sample_r_l.DX1 <> 2761  AND  sample_r_b.PL_UR_CA <> 1  AND  sample_r_b.PNUM_R = 3865962  AND  NOT sample_r_l.PROCTYPE BETWEEN 1.0 AND 1.0  OR  sample_r_b.PL_RUCC2 <> 1  AND  sample_r_b.HOSPBRTH IN ('0', '0', '0', '0', '0')  AND  NOT sample_r_l.NECODE > 2.0  AND  NOT sample_r_b.PL_UIC20 NOT IN ('1', '2', '1', '1', '1')  AND  sample_r_b.AGE > 17.0  OR  sample_r_l.Date NOT BETWEEN 11.0 AND 12.0  AND  sample_r_l.MDC BETWEEN 23.0 AND 23.0  OR  sample_r_b.Homeless IN ('0', '0', '0', '0', '0')  AND  sample_r_l.LOS >= 9.0  AND  sample_r_b.DIED <> 0  OR  sample_

In [17]:
# Keep the two result dataframes of the last twin query under short names
real=T_smk5['query_real']
syn=T_smk5['query_syn']

In [18]:
# Count how often each distinct row occurs in the real-data result
# NOTE(review): the output stored under this cell looks like a plain dataframe display -- re-run to confirm
real.value_counts()

Unnamed: 0,PNUM_R,AGE,DIED,DNR,FEMALE,HOSPBRTH,Homeless,HISPANIC,NEOMAT,PL_CBSA,...,NPR,TOTCHG,PROCTYPE,DX1,ASCHED,AWEEKEND,DRGVER,HCUP_ED,COUNT(*),AVG(sample_r_l.NECODE)
0,12051248,62.0,,0,0,0,0,1,0,2,...,21.0,882732.0,1.0,85221,0,1.0,24.0,4.0,4,2.000000
1,13066132,52.0,,0,1,0,0,1,0,2,...,3.0,49071.0,1.0,V5789,0,0.0,24.0,0.0,2,2.000000
2,14088785,22.0,,0,1,0,0,1,0,2,...,2.0,8861.0,1.0,9678,0,1.0,24.0,4.0,1,3.000000
3,1367754,58.0,,0,1,0,0,2,0,2,...,0.0,19280.0,0.0,30001,0,0.0,24.0,4.0,6,0.000000
4,14183510,54.0,,0,0,0,0,2,0,2,...,20.0,568188.0,1.0,44101,0,1.0,24.0,4.0,3,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288,2923783,72.0,1,0,0,0,0,2,0,2,...,5.0,,1.0,5770,0,0.0,24.0,4.0,6,0.666667
289,10244187,45.0,1,0,0,0,0,2,0,2,...,0.0,147870.0,0.0,V598,0,0.0,25.0,0.0,2,1.000000
290,13982134,50.0,1,0,1,0,0,2,0,2,...,0.0,154384.0,0.0,V594,1,0.0,25.0,0.0,2,0.000000
291,4244792,74.0,1,0,0,0,0,2,0,2,...,0.0,,0.0,V598,1,1.0,24.0,0.0,6,0.333333


In [21]:
# Look at the AWEEKEND column of the real-data result
# NOTE(review): the output stored under this cell looks like value counts rather than the column itself -- re-run to confirm
real['AWEEKEND']

0.0    238
1.0     55
Name: AWEEKEND, dtype: int64

In [None]:
        # Match records: give the real and synthetic results the same set of classes.
        # NOTE(review): this is an indented fragment; real_idx, syn_idx, real_var and syn_var
        # are not defined anywhere in this notebook, so the cell cannot run as it stands.
        # NOTE(review): the reconciliation only runs when the two index lengths differ;
        # indexes of equal length but different content are left unmatched and still
        # pass the final assert -- confirm whether that case can occur.
        if len(real_idx)!=len(syn_idx):
            missed_in_syn=real_idx.difference(syn_idx) # find missing classes in syn results
            missed_in_real=syn_idx.difference(real_idx) # find missing classes in real results (very unlikely; it may indicate that a continuous variable is mistakenly defined as nominal in the metadata)
            for missed_idx in list(missed_in_syn):
                syn_var[missed_idx]=0 # a class missing from the synthetic results is added with a zero value
            for missed_idx in list(missed_in_real):
                real_var[missed_idx]=0 # a class missing from the real results is added with a zero value
        # if len(real_idx)>len(syn_idx):
        #     missing=real_idx.difference(syn_idx) #get missing index in syn
        #     for idx in list(missing): #insert missing indices in syn 
        #         syn_var[idx]=0 #penalize it by adding zero count
        # elif len(real_idx)<len(syn_idx):
        #     missing=syn_idx.difference(real_idx) #do same thing with missing index in real
        #     for idx in list(missing):  
        #         real_var[idx]=0
        # After matching, both sides must hold the same number of entries.
        assert len(real_var)==len(syn_var)