In [1]:
import sys
sys.path.append('/home/samer/projects/fuzzy_sql/src') #This will enable reading the modules
from fuzzy_sql.fuzzy_sql import *

In [2]:
def _assign_dtype(df, dict):
    #Correct dtypes of real and syn dataframes before saving in the database 
    #map metaddata into dtype dict with pandas dtypes
    #cols in df shall match keys in in_dict
    assert bool(set(df.columns).intersection(set(dict.keys())))
    out_dict={}
    for key in dict:
        if dict[key] in ['quantitative','continuous','interval','ratio']:
            out_dict[key]='float64'
        elif dict[key] in ['date','time','datetime']:
            out_dict[key]='datetime64'
        else:
            out_dict[key]='category'
    
    for col in df.columns:
        df[col]=df[col].astype(out_dict[col])

    return df


In [84]:
class LONG_QUERY():
    """ Generates random queries for baseline-longitudinal datasets.

    The parent (baseline) and child (longitudinal) tables are read from the
    database once, at construction time. Every variable is then referred to by
    its table-qualified name ``<table>.<column>``.
    """

    # Metadata type labels recognised for each family of variables.
    _CAT_TYPES=('qualitative','categorical','nominal','discrete','ordinal','dichotomous')
    _CNT_TYPES=('quantitative','continuous','interval','ratio')
    _DT_TYPES=('date','time','datetime')

    def __init__(self, db_conn: object, parent_tbl_name: str, child_tbl_name: str, metadata: dict,params: dict , seed=False):
        """
        Args:
            db_conn: The connection object of the sqlite database where the data exists.
            parent_tbl_name: The name of the parent table (i.e. baseline data) in the database.
            child_tbl_name: The name of the child table (i.e. longitudinal data) in the database.
            metadata: A dictionary with three entries: 'key' holds the name of the foreign key variable, while 'parent' and 'child' each map the table's variable names (i.e. column names) to the types of the variables. Recognised types are the categorical labels ('nominal', 'categorical', ...), the continuous labels ('continuous', 'quantitative', ...) and the date labels ('date', 'time', 'datetime'). The dictionary is deep-copied, so the caller's object is left untouched.
            params: A dictionary that includes the set of parameters that are necessary for generating the random queries. It is stored by reference (not copied) and must contain 'LESS_GRP_VARS'.
            seed: If set to True, generated random queries become deterministic.
        """
        self.SEED=seed

        self.CUR = db_conn.cursor()
        self.RP_NAME = parent_tbl_name #RP = Real Parent
        self.RC_NAME = child_tbl_name #RC = Real Child
        self.metadata=copy.deepcopy(metadata)

        #Fetch Real data (both parent and child)
        #NOTE: table names are interpolated into the SQL text (identifiers cannot be bound as parameters), so they must come from a trusted source.
        self.RP_DF=pd.read_sql_query(f'SELECT * FROM {self.RP_NAME}', db_conn) #Real Parent Dataframe
        self.RC_DF=pd.read_sql_query(f'SELECT * FROM {self.RC_NAME}', db_conn) #Real Child Dataframe
        #rename variables by concatenating table names
        self.RP_DF.columns=[self.RP_NAME+'.'+x for x in self.RP_DF.columns]
        self.RC_DF.columns=[self.RC_NAME+'.'+x for x in self.RC_DF.columns]

        #Get foreign key name
        self.FKEY_NAME=self.metadata['key']

        #Delete foreign key from child variables to avoid repetition of variable in various expression.
        #pop() tolerates child metadata that does not list the key at all.
        self.metadata['child'].pop(self.FKEY_NAME, None)

        #Segregate variables into lists based on their types
        self.CAT_VARS={} #Categorical variables, keyed by 'parent' / 'child'
        self.CNT_VARS={} #Continuous variables
        self.DT_VARS={} #Date variables
        for role, tbl_name in (('parent', self.RP_NAME), ('child', self.RC_NAME)):
            self.CAT_VARS[role]=self._vars_of_type(tbl_name, role, self._CAT_TYPES)
            self.CNT_VARS[role]=self._vars_of_type(tbl_name, role, self._CNT_TYPES)
            self.DT_VARS[role]=self._vars_of_type(tbl_name, role, self._DT_TYPES)

        # Aggregate function applies only when there is at least one continuous variable
        self.AGG_FNCTN=bool(self.CNT_VARS['parent'] or self.CNT_VARS['child'])

        # Define random query attributes
        self.ATTRS=params #General attributes that can be set by the user

        # Generate dictionaries of bags for various variables
        self.CAT_VAL_BAGS={}
        self.CNT_VAL_BAGS={}
        self.DT_VAL_BAGS={}
        for role, frame in (('parent', self.RP_DF), ('child', self.RC_DF)):
            self.CAT_VAL_BAGS[role]=self._make_bags(frame[self.CAT_VARS[role]])
            self.CNT_VAL_BAGS[role]=self._make_bags(frame[self.CNT_VARS[role]])
            self.DT_VAL_BAGS[role]=self._make_bags(frame[self.DT_VARS[role]])

    def _vars_of_type(self, tbl_name: str, role: str, type_labels) -> list:
        """Return the table-qualified names of the ``role`` ('parent' or 'child') variables whose metadata type is in ``type_labels``."""
        return [tbl_name+'.'+key for key, value in self.metadata[role].items() if value in type_labels]

    def _make_bags(self,df:pd.DataFrame)-> dict:
        """Collect the non-missing values of every column of ``df``.

        Returns:
            A dictionary mapping each column name to the list of its values,
            with missing entries (None / NaN) removed. Falsy but valid values
            such as 0 are kept. A column with no values left gets ['N/A'].
        """
        val_bags={}
        for var in df.columns:
            # dropna() removes only missing entries; filtering on truthiness would also discard zeros.
            vals=list(df[var].dropna().values)
            val_bags[var]=vals if len(vals)!=0 else ['N/A']
        return val_bags

    def _get_var_idx(self, var_name):
        """Locate a table-qualified variable.

        Returns:
            A tuple ('parent' or 'child', column position in that table's dataframe).

        Raises:
            Exception: If the variable is in neither table.
        """
        if var_name in self.RP_DF.columns: #search in parent
            idx=self.RP_DF.columns.get_loc(var_name)
            return 'parent',idx
        elif var_name in self.RC_DF.columns: #search in child
            idx=self.RC_DF.columns.get_loc(var_name)
            return 'child', idx
        else:
            raise Exception(f"{var_name} not found in parent or child tables!")

    def _get_val_cmp_term(self) -> str:
        """Placeholder: not implemented yet (returns None)."""
        pass

    def _get_agg_fntn_term(self) -> str:
        """Placeholder: not implemented yet (returns None)."""
        pass

    def _get_groupby_term(self)-> str:
        """Draw a random, comma-separated list of categorical variables to group by.

        Variables are drawn without repetition from the parent and child
        categorical variables together. When ATTRS['LESS_GRP_VARS'] is true,
        the number of variables follows a linearly decreasing distribution
        that favours fewer variables; otherwise it is uniform. When SEED is
        true, the global numpy random state is reset first, so every call
        returns the same term.

        Raises:
            ValueError: If there is no categorical variable to pick from.
        """
        #You can group by CAT_VARS whether from parent or child or both
        if self.SEED:
            np.random.seed(141)
        vars_select_bag=self.CAT_VARS['parent'] +self.CAT_VARS['child']
        if not vars_select_bag:
            raise ValueError("No categorical variables are available to build a GROUP BY term.")
        n_vars_bag=np.arange(1, 1+len(vars_select_bag))
        if self.ATTRS['LESS_GRP_VARS']:#define slope-down discrete distribution
            n_var_probs=n_vars_bag[::-1]/n_vars_bag.sum()
            n_vars=np.random.choice(n_vars_bag, p=n_var_probs)
        else:
            n_vars=np.random.choice(n_vars_bag)
        # Sample with numpy as well, so the seed above governs the whole draw
        # (the stdlib `random` module is not affected by np.random.seed).
        picked_vars=np.random.choice(vars_select_bag, size=int(n_vars), replace=False)
        return ", ".join(str(var) for var in picked_vars)

    def _build_flter_expr(self, val_cmp_term: str) -> str:
        """Placeholder: not implemented yet (returns None)."""
        pass

    def _build_agg_expr(self,groupby_term: str) -> str:
        """Placeholder: not implemented yet (returns None)."""
        pass

    def _build_agg_w_fntn_expr(self, agg_fnt_term: str, groupby_term: str):
        """Placeholder: not implemented yet (returns None)."""
        pass

    def _build_aggfltr_expr(self, val_cmp_term: str, groupby_term: str):
        """Placeholder: not implemented yet (returns None)."""
        pass

    def _build_aggfltr_w_fntn_expr(self, agg_fnt_term: str, val_cmp_term, groupby_term: str):
        """Placeholder: not implemented yet (returns None)."""
        pass



        


        




In [4]:
# Root folder of the prepared longitudinal sample; edit this single line to relocate the inputs.
DATA_ROOT="/home/samer/projects/fuzzy_sql/data/longitudinal/ready"

rp_path=f"{DATA_ROOT}/real/b_sample.csv" #real parent (baseline) path
rc_path=f"{DATA_ROOT}/real/l_sample.csv" #real child (longitudinal) path
sp_path=f"{DATA_ROOT}/synthetic/b_sample_syn.csv" #synthetic parent (baseline) path
sc_path=f"{DATA_ROOT}/synthetic/l_sample_syn.csv" #synthetic child (longitudinal) path
meta_path=f"{DATA_ROOT}/metadata/sample.json" #metadata path

In [5]:
# read data frames with all variables read as string and eliminate the apostrophe  '
rp=load_csv(rp_path) 
rc=load_csv(rc_path) 
sp=load_csv(sp_path)  
sc=load_csv(sc_path) 
# `json` is the public module that provides json.load; `_json` is only its private C accelerator.
import json
with open(meta_path) as f:
    meta=json.load(f) #metadata for the data 

In [6]:
# Cast each loaded frame to the dtypes given by its table's metadata
# (parent metadata for the baseline frames, child metadata for the longitudinal ones).
rp, rc, sp, sc = (
    _assign_dtype(frame, meta[role])
    for frame, role in ((rp, 'parent'), (rc, 'child'), (sp, 'parent'), (sc, 'child'))
)

In [7]:
# Default parameters for random query generation.
# Each *_OPS / *_STATE entry maps an option to a weight; the weights of every entry add up to 1.
DFLT_PARAMS=dict(
    AGG_OPS={'AVG':0.5, 'SUM':0.3, 'MAX':0.1, 'MIN':0.1 },
    LOGIC_OPS={'AND':0.5,'OR':0.5},
    NOT_STATE={'0':1, '1':0},
    CAT_OPS={'=':0.25, '<>':0.25, 'LIKE':0.25, 'IN':0.25},
    CNT_OPS={'=':0.2, '>':0.1, '<':0.1, '>=':0.1, '<=':0.1, '<>':0.1, 'BETWEEN':0.3},
    DT_OPS={'=':0.2, '>':0.1, '<':0.1, '>=':0, '<=':0, '<>':0.1, 'BETWEEN':0.3, 'IN':0.2},
    # Bias random queries toward a smaller number of groupby vars; False means uniform sampling.
    LESS_GRP_VARS=False,
    # Bias random queries toward a smaller number of comparison terms; False means uniform sampling.
    LESS_CMP_VARS=False,
)

In [8]:
# Import the real and synthetic tables (baseline and longitudinal) into the database.
conn = sqlite3.connect('fuzzy_sql.db')
for tbl_name, tbl_frame in (
    ('sample_r_b', rp),
    ('sample_r_l', rc),
    ('sample_s_b', sp),
    ('sample_s_l', sc),
):
    make_table(tbl_name, tbl_frame, conn)



Table sample_r_b already exists in the database
Table sample_r_l already exists in the database
Table sample_s_b already exists in the database
Table sample_s_l already exists in the database


In [85]:
# Query generator over the real baseline/longitudinal tables, using the default parameters.
self=LONG_QUERY(
    db_conn=conn,
    parent_tbl_name='sample_r_b',
    child_tbl_name='sample_r_l',
    metadata=meta,
    params=DFLT_PARAMS,
)

In [113]:
# Sample the number of GROUP BY variables uniformly (no bias toward fewer variables).
# NOTE: ATTRS is the very dict passed to the constructor (DFLT_PARAMS here), so this changes that dict too.
self.ATTRS['LESS_GRP_VARS']=False

In [122]:
# Draw a random comma-separated list of categorical variables to group by.
smk=self._get_groupby_term()

In [126]:
# Prefix the drawn variable list with the SQL keyword to preview the clause.
f"GROUP BY {smk}"

'GROUP BY sample_r_b.PL_RUCC2, sample_r_b.PL_RUCA4, sample_r_b.PL_NCHS2, sample_r_b.RACE, sample_r_b.FEMALE, sample_r_b.PL_UR_CA'