In [None]:
from collections import defaultdict
import pandas as pd, numpy as np

import os
import pickle
import sys
import json
import boto3
import io
import s3fs
from smart_open import open as s_open

import xlsxwriter
from itertools import product
import featuretools as ft

import re
from featuretools.primitives import make_agg_primitive,make_trans_primitive
from featuretools.variable_types import DateOfBirth,Text,Numeric,Categorical,Boolean,Index,DatetimeTimeIndex,Datetime,TimeIndex,Discrete,Ordinal
from scipy.stats import mode
from featuretools.primitives import TimeSinceLast,TimeSinceFirst,AvgTimeBetween,TimeSince,TimeSincePrevious
from v_ft.custom_primitives import *
from v_ft import time_unit as tu
import time
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from functools import reduce

In [None]:
# Default S3 bucket and folder (key prefix) names.
# NOTE(review): presumably these are the values passed as `bucket_name` /
# `folder_name` to get_features() / prepare_config() -- confirm against callers.
bucket_name ='automating-travis'
folder_name = 'data'

In [None]:
# Sentinel stored in config fields meaning "nothing selected": it is compared
# against pvt['reference'] in prepare_config() and entity['time_idx'] in
# ft_agg.get_entityset(). Only element 0 is ever read.
COL_PLACEHOLDER = ["Don't have"]

# 2. prepare config for feature engineering

In [None]:
'''
def pd_read(file_name,src='s3',sample=None,used_cols=None):

    csvFile = os.path.join('s3://',bucket_name,folder_name,file_name)
    table=None
    if csvFile.endswith('.csv'):
        table = pd.read_csv(csvFile, nrows=sample,usecols=used_cols)
    if csvFile.endswith('.xlsx'):
        s3_c = boto3.client('s3')
        file_path = os.path.join(folder_name,file_name)
        obj = s3_c.get_object(Bucket=bucket_name, Key=file_path)
        data = obj['Body'].read()
        table = pd.read_excel(io.BytesIO(data),nrows=sample)
            
    if csvFile.endswith('.parquet'):
        table = pd.read_parquet(csvFile,engine='pyarrow',columns=used_cols)
        if sample:
            table = table.head(sample)
    return table

def create_bin(df,col):
    bins = []

    for val in df[col]:
        if "<=" in val:
            val = int(val.strip("<="))
        elif ">=" in val:
            continue
            #val = int(val.strip(">="))
        else:
            val = int(val.split("-")[-1])
        bins.append(val)
    return bins

def my_cut(col,x, bins,
            lower_infinite=True, upper_infinite=True,
            **kwargs):
    r"""Wrapper around pandas cut() to create infinite lower/upper bounds with proper labeling.

    Takes all the same arguments as pandas cut(), plus two more.

    Args :
        lower_infinite (bool, optional) : set whether the lower bound is infinite
            Default is True. If true, and your first bin element is something like 20, the
            first bin label will be '<= 20' (depending on other cut() parameters)
        upper_infinite (bool, optional) : set whether the upper bound is infinite
            Default is True. If true, and your last bin element is something like 20, the
            first bin label will be '> 20' (depending on other cut() parameters)
        **kwargs : any standard pandas cut() labeled parameters

    Returns :
        out : same as pandas cut() return value
        bins : same as pandas cut() return value
    """

    # Quick passthru if no infinite bounds
    if not lower_infinite and not upper_infinite:
        return pd.cut(x, bins, **kwargs)

    # Setup
    num_labels      = len(bins) - 1
    include_lowest  = kwargs.get("include_lowest", False)
    right           = kwargs.get("right", True)

    # Prepend/Append infinities where indiciated
    bins_final = bins.copy()
    if upper_infinite:
        bins_final.insert(len(bins),float("inf"))
        num_labels += 1
    if lower_infinite:
        bins_final.insert(0,float("-inf"))
        num_labels += 1

    # Decide all boundary symbols based on traditional cut() parameters
    symbol_lower  = "<=" if include_lowest and right else "<"
    left_bracket  = "(" if right else "["
    right_bracket = "]" if right else ")"
    symbol_upper  = ">" if right else ">="

    # Inner function reused in multiple clauses for labeling
    def make_label(i, lb=left_bracket, rb=right_bracket):
        return "{0}{1}, {2}{3}".format(lb, bins_final[i], bins_final[i+1], rb)

    # Create custom labels
    labels=[]
    for i in range(0,num_labels):
        new_label = None

        if i == 0:
            if lower_infinite:
                new_label = "{0} {1}".format(symbol_lower, bins_final[i+1])
            elif include_lowest:
                new_label = make_label(i, lb="[")
            else:
                new_label = make_label(i)
        elif upper_infinite and i == (num_labels - 1):
            new_label = "{0} {1}".format(symbol_upper, bins_final[i])
        else:
            new_label = make_label(i)

        labels.append(new_label)
    ref_map[col]=labels
    # Pass thru to pandas cut()
    return pd.cut(x, bins_final, labels=labels, **kwargs)


def single_itr(fts:list,df):
    # fts like ['age','sp']
    itr_cols=[]
    itr_vals=[]
    for ft in fts:
        itr_cols.append(ft)
        if ft in ref_map:
            itr_vals.append(ref_map[ft])
        else:
            itr_vals.append(df[ft].unique().tolist())

    return itr_cols,itr_vals

def prod_itr(fts:list,df):
    # fts is like ['age|covergage','age|SP']
    # df should be the data in config['entities']
    # ref_map below is like {'SP':['Y','N']}
    itr_cols=[]
    itr_vals=[]
    for ft in fts:
        ft_used = ft.split('|')
        new_ft = '_'.join(ft_used)
        itr_cols.append(new_ft)
        if new_ft not in df:
            df[new_ft]=df[ft_used[0]].astype(str)+"_"+df[ft_used[1]].astype(str) # generate new column concat with two others
        unique_vals=[ref_map[f] if f in ref_map else df[f].unique().tolist() for f in ft_used] #get unique column values, check if in the reference table list
        print(unique_vals)
        itr_val = [x[0]+'_'+x[1] for x in list(product(unique_vals[0],unique_vals[1]))] #get the product values for new feature values
        itr_vals.append(itr_val)
    return itr_cols,itr_vals
def prepare_config(config):#path,config_name):
    #with s_open(os.path.join(path,'{}.pickle'.format(config_name)), 'rb') as handle:
    #    config = pickle.load(handle)


    #get data
    for i,entity in enumerate(config['entities']):
        data_name = config['all_data'][entity['entity']]
        print('data to read:',data_name)
        data = pd_read(data_name,src='s3',sample=1000)
        #config[entity['entity']]=eval(entity['entity'])


    #map reference if have and interesting values


    # deal with reference first
    for pvt in config['pivotings']:
        if pvt['reference']!=COL_PLACEHOLDER[0]: #only accept for reference for one column; which means we only have 1 single or 1 prod feature for this time, i.e. len(fts)==0
            # get data from entities
            df_name = config['all_data'][pvt['entity']]
            df = config[df_name]
            # get features
            if 'column2' in pvt:
                print('has prod')
                fts = pvt['column1'][0]+'_'+pvt['column2'][0] #get column name like [age,coverage]
                df[fts]=df[pvt['column1'][0]].astype(str)+"_"+df[pvt['column2'][0]].astype(str) #get new column
            else:
                fts = pvt['column1'][0]
            print('fts',fts)
            grp = pd_read(pvt['reference'])
            ref_type = pvt['ref_type']
            if ref_type=='range':
                #df_used = config['datas'][data_id]
                fe = grp.columns[0]
                bins = create_bin(grp,fe)
                print(bins)
                df[fts]=my_cut(fe,df[fts],bins,right=True,include_lowest=True)
                #reference.append(pvt
            else:
                maps = pd.Series(grp.iloc[:,1].values,index=grp.iloc[:,0]).rename(fts).to_dict()
                df.map(maps)
            
            # get interesting values
            if pvt['kind']=='two':
                itr_cols,itr_vals =prod_itr([fts],df)
                pvt['columns']=itr_cols
                pvt['values']=itr_vals
            else:
                itr_cols,itr_vals =single_itr([fts],df)
                pvt['columns']=itr_cols
                pvt['values']=itr_vals
            
    # get intereting values for others with no reference
    #reference={}
    for pvt in config['pivotings']:
        if pvt['reference']==COL_PLACEHOLDER[0]: #only accept for reference for one column; which means we only have 1 single or 1 prod feature for this time, i.e. len(fts)==0
            df_name = config['all_data'][pvt['entity']]
            #get data
            df = config[df_name]
            #get pivot feature list and get interesting values
            if pvt['kind']=='two':
                fts = [x[0]+'|'+x[1] for x in list(product(pvt['column1'],pvt['column2']))] #fts is like ['age|covergage','age|SP']
                print(fts)
                itr_cols,itr_vals =prod_itr(fts,df)
                df = prod_itr(fts,df)
                pvt['columns']=itr_cols
                pvt['values']=itr_vals
            
            else:

                fts = pvt['column1'].split('|') #like ['age','sp']
                print('iam in single',fts)
                itr_cols,itr_vals =single_itr(fts,df)
                pvt['columns']=itr_cols
                pvt['values']=itr_vals
                print('in single',(itr_cols,itr_vals))
            
    print('after itr, config is',config['pivotings'])

    return config
'''

In [None]:

def create_bin(df,col):
    """Collect the upper edge of every bounded range label in ``df[col]``.

    Each label is a string such as ``"<=18"``, ``"19-30"`` or ``">=51"``:

    * ``"<=N"`` contributes ``N``;
    * ``"A-B"`` contributes ``B`` (the text after the last ``-``);
    * ``">=N"`` is open-ended upwards and contributes nothing.

    Args :
        df : mapping/DataFrame indexable by ``col`` that yields the labels
        col : name of the column holding the range labels

    Returns :
        list of int edges, in the order the labels appear
    """
    def upper_edge(label):
        # "<=" wins over any other marker, exactly like the first branch of an if/elif chain.
        if "<=" in label:
            return int(label.strip("<="))
        return int(label.split("-")[-1])

    # A label is skipped only when it is a pure ">=" (open upper bound) label.
    return [upper_edge(label) for label in df[col]
            if "<=" in label or ">=" not in label]

def my_cut(col,x, bins,
            lower_infinite=True, upper_infinite=True,
            **kwargs):
    r"""Wrapper around pandas cut() to create infinite lower/upper bounds with proper labeling.

    Takes the same keyword arguments as pandas cut() (except ``labels``, which
    this wrapper generates itself), plus ``col`` and the two ``*_infinite`` flags.

    Side effect: when at least one bound is infinite, the generated labels are
    stored in the module-level ``ref_map`` under the key ``col``.

    Args :
        col : key under which the generated labels are recorded in ``ref_map``
        x : 1-d array-like of values to bin (passed straight to pandas cut())
        bins : sequence of finite bin edges; it is copied, never modified
        lower_infinite (bool, optional) : set whether the lower bound is infinite
            Default is True. If true, and your first bin element is something like 20, the
            first bin label will be '<= 20' (or '< 20' when right=False)
        upper_infinite (bool, optional) : set whether the upper bound is infinite
            Default is True. If true, and your last bin element is something like 20, the
            last bin label will be '> 20' (or '>= 20' when right=False)
        **kwargs : any standard pandas cut() keyword parameters

    Returns :
        whatever pandas cut() returns for the given arguments
    """

    # Quick passthru if no infinite bounds (nothing is recorded in ref_map here)
    if not lower_infinite and not upper_infinite:
        return pd.cut(x, bins, **kwargs)

    # Setup
    include_lowest  = kwargs.get("include_lowest", False)
    right           = kwargs.get("right", True)

    # Prepend/append infinities where indicated, working on a copy of the caller's edges
    bins_final = list(bins)
    if upper_infinite:
        bins_final.append(float("inf"))
    if lower_infinite:
        bins_final.insert(0,float("-inf"))
    num_labels = len(bins_final) - 1

    # Decide all boundary symbols based on traditional cut() parameters.
    # With right=True the first bin is (-inf, b0], so b0 itself belongs to it and the
    # label must read "<="; include_lowest is irrelevant when the lowest edge is -inf.
    symbol_lower  = "<=" if right else "<"
    left_bracket  = "(" if right else "["
    right_bracket = "]" if right else ")"
    symbol_upper  = ">" if right else ">="

    # Inner function reused in multiple clauses for labeling
    def make_label(i, lb=left_bracket, rb=right_bracket):
        return "{0}{1}, {2}{3}".format(lb, bins_final[i], bins_final[i+1], rb)

    # Create custom labels
    labels=[]
    last = num_labels - 1
    for i in range(num_labels):
        if i == 0:
            if lower_infinite:
                new_label = "{0} {1}".format(symbol_lower, bins_final[i+1])
            elif include_lowest:
                new_label = make_label(i, lb="[")
            else:
                new_label = make_label(i)
        elif upper_infinite and i == last:
            new_label = "{0} {1}".format(symbol_upper, bins_final[i])
        else:
            new_label = make_label(i)

        labels.append(new_label)

    # Remember the full label set so callers can enumerate every bin, observed or not
    ref_map[col]=labels
    # Pass thru to pandas cut()
    return pd.cut(x, bins_final, labels=labels, **kwargs)


def single_itr(fts:list,df):
    """Return the pivot columns and, per column, its list of interesting values.

    For every name in ``fts`` (e.g. ``['age','sp']``) the values come from the
    module-level ``ref_map`` when the name is registered there; otherwise they
    are the distinct values found in ``df[name]``.

    Returns :
        (itr_cols, itr_vals) : the column names, and a parallel list of value lists
    """
    itr_cols = list(fts)
    itr_vals = [
        ref_map[name] if name in ref_map else df[name].unique().tolist()
        for name in fts
    ]
    return itr_cols,itr_vals

def prod_itr(fts:list,df):
    """Build combined ("product") pivot columns and their interesting values.

    Each element of ``fts`` names two columns joined by ``|`` (e.g. ``'age|SP'``).
    For every pair this:

    * adds the column ``'<first>_<second>'`` to ``df`` (in place) when it is not
      there yet, holding the two source columns joined with ``_`` as strings;
    * collects the candidate values of each source column -- from the
      module-level ``ref_map`` (e.g. ``{'SP': ['Y','N']}``) when registered
      there, otherwise the distinct values in ``df``;
    * emits every ``first_second`` combination of those candidates.

    Args :
        fts : list of ``'colA|colB'`` strings
        df : DataFrame holding the source columns; mutated in place

    Returns :
        (itr_cols, itr_vals) : the combined column names, and a parallel list
        holding the combined value strings for each of them
    """
    itr_cols=[]
    itr_vals=[]
    for pair in fts:
        ft_used = pair.split('|')
        new_ft = '_'.join(ft_used)
        itr_cols.append(new_ft)
        if new_ft not in df:
            # generate the new column by concatenating the two source columns
            df[new_ft]=df[ft_used[0]].astype(str)+"_"+df[ft_used[1]].astype(str)
        # candidate values per source column, preferring the reference labels
        unique_vals=[ref_map[f] if f in ref_map else df[f].unique().tolist() for f in ft_used]
        print(unique_vals)
        # str() mirrors the astype(str) used for the column itself, so numeric
        # source columns yield matching values instead of raising TypeError
        itr_val = [str(a)+'_'+str(b) for a,b in product(unique_vals[0],unique_vals[1])]
        itr_vals.append(itr_val)
    return itr_cols,itr_vals


def prepare_config(config,bucket_name,folder_name):
    """Load the entity data and fill in the pivoting columns / interesting values.

    The config is updated in place and also returned:

    * ``config[data_name]`` receives a sample DataFrame for every entity, where
      ``data_name = config['all_data'][entity['entity']]``;
    * every item of ``config['pivotings']`` receives ``'columns'`` and ``'values'``.

    Pivotings that name a reference table are handled first, so that the labels
    recorded by my_cut() are available when the remaining pivotings are processed.

    Args :
        config : dict with at least 'entities', 'all_data' and 'pivotings'
        bucket_name : passed to pd_read() when reading reference tables
        folder_name : passed to pd_read() (as ``pre``) when reading reference tables

    Returns :
        the same ``config`` object
    """
    no_reference = COL_PLACEHOLDER[0]

    # get data
    # NOTE(review): this pd_read call does not forward bucket_name/folder_name while
    # the reference read below does -- confirm against pd_read's signature.
    for entity in config['entities']:
        data_name = config['all_data'][entity['entity']]
        print('data to read:',data_name)
        config[data_name] = pd_read(data_name,src='s3',sample=1000)

    # deal with reference first
    # (a reference applies to exactly one single or one product feature)
    for pvt in config['pivotings']:
        if pvt['reference']==no_reference:
            continue
        # get data from entities
        df = config[config['all_data'][pvt['entity']]]
        # get features
        if 'column2' in pvt:
            print('has prod')
            fts = pvt['column1']+'|'+pvt['column2'] #column name like 'age|coverage'
            df[fts]=df[pvt['column1']].astype(str)+"_"+df[pvt['column2']].astype(str)
        else:
            print('in refereence not prod with pvt',pvt)
            fts = pvt['column1']
        print('fts',fts)
        grp = pd_read(config['all_data'][pvt['reference']],bucket_name=bucket_name,pre=folder_name,src='s3')
        ref_type = pvt['ref_type']
        if ref_type=='range':
            # first reference column holds range labels such as '<=18', '19-30'
            fe = grp.columns[0]
            bins = create_bin(grp,fe)
            print(bins)
            # NOTE(review): labels are recorded in ref_map under `fe` (the reference
            # column name) but looked up later by `fts` -- confirm these coincide.
            df[fts]=my_cut(fe,df[fts],bins,right=True,include_lowest=True)
        else:
            # first reference column -> second reference column
            maps = pd.Series(grp.iloc[:,1].values,index=grp.iloc[:,0]).to_dict()
            # Assign the mapped column back; previously the result of the mapping was
            # discarded. Values absent from the reference become NaN (Series.map).
            df[fts] = df[fts].map(maps)

        # get interesting values
        if pvt['kind']=='two':
            itr_cols,itr_vals =prod_itr([fts],df)
        else:
            itr_cols,itr_vals =single_itr([fts],df)
        pvt['columns']=itr_cols
        pvt['values']=itr_vals

    # get interesting values for the others, which have no reference
    for pvt in config['pivotings']:
        if pvt['reference']!=no_reference:
            continue
        #get data
        df = config[config['all_data'][pvt['entity']]]
        #get pivot feature list and get interesting values
        if pvt['kind']=='two':
            #fts is like ['age|coverage','age|SP']
            fts = [a+'|'+b for a,b in product(pvt['column1'].split('|'),pvt['column2'].split('|'))]
            itr_cols,itr_vals =prod_itr(fts,df)
            pvt['columns']=itr_cols
            pvt['values']=itr_vals
        else:
            fts = pvt['column1'].split('|') #like ['age','sp']
            print('iam in single',fts)
            itr_cols,itr_vals =single_itr(fts,df)
            pvt['columns']=itr_cols
            pvt['values']=itr_vals
            print('in single',(itr_cols,itr_vals))
    print('after itr, config is',config['pivotings'])

    return config

def get_features(bucket_name,folder_name,config_name,schema_name):
    """Generate the feature schema for a pickled config and save it next to it.

    Reads ``s3://<bucket_name>/<folder_name>/<config_name>.pickle``, prepares the
    config, runs DFS in features-only mode and writes the feature definitions to
    ``<schema_name>.json`` in the same location.

    Returns :
        list of output column names: the target entity's index columns followed
        by the name of every generated feature
    """
    path = os.path.join('s3://',bucket_name,folder_name)
    config_file = os.path.join(path,'{}.pickle'.format(config_name))
    schema_file = os.path.join(path,"{}.json".format(schema_name))

    # NOTE(review): pickle.load can execute arbitrary code; only point this at
    # config files from a trusted location.
    with s_open(config_file, 'rb') as handle:
        config = pickle.load(handle)

    ent = ft_agg(prepare_config(config,bucket_name,folder_name))
    en_set,cutoff = ent.get_entityset()
    # result unused here; call kept because it prints the selected primitives
    ent.get_prmts()
    features = ent.run_dfs(es=en_set,cutoff=cutoff,featureonly=True)

    print('save to:',schema_file)
    ft.save_features(features, schema_file)

    feature_names = [f.get_name() for f in features]
    return ent.target_idx_columns+feature_names

# 3. feature engineering

In [None]:


# # featuretools class


class ft_agg(object):
    """Build a featuretools EntitySet from a prepared config and run DFS on it.

    The config dict is expected to hold 'entities', 'relations', 'pivotings',
    'pmt_agg_kpt', 'pmt_tfm_kpt', 'primitive_options' (and 'where_primitives'
    when there are pivotings), plus one DataFrame per entity stored under the
    entity's ``entity['entity']`` key.

    NOTE(review): the EntitySet calls used here (entity_from_dataframe,
    add_relationship(Relationship(...)), dfs(target_entity=...)) look like the
    pre-1.0 featuretools API -- confirm the pinned version.
    NOTE(review): get_entityset() rewrites self.entities / self.relations in
    place (keys, aliases), so it appears safe to call only once per instance.
    """

    # single-pass character substitutions applied to column names by deal_colName()
    _COL_NAME_TABLE = str.maketrans({'(': '<', ')': '>', '=': '@', '.': '/', ',': '/', ' ': None})

    def __init__(self,config):
        """Read the settings this class needs out of ``config``."""
        self.config = config
        self.entities = config['entities']
        self.relations = config['relations']
        self.pivotings = config['pivotings']

        # names of the aggregation / transform primitives to keep
        self.agg_kpt = config['pmt_agg_kpt']
        self.tfm_kpt = config['pmt_tfm_kpt']
        # unit for the time-related primitives; currently fixed rather than read from config
        self.time_unit = 'year'

        # "where" primitives are only meaningful when there are pivotings
        if self.pivotings:
            self.whr_pmt = config['where_primitives']
        else:
            self.whr_pmt = []

        # cutoff handling is currently switched off (see get_entityset)
        self.if_cut = False
        self.cutoff = None

        # primitive options (stored; not passed to dfs in this class)
        self.options = config['primitive_options']
        # variables to ignore during dfs, keyed by entity
        self.ig_vars = {}

    def get_prmts(self):
        """Resolve the configured primitive names to the primitives to use.

        Names known to the custom/time-related tables are replaced by the
        corresponding primitive object; any other name is passed through as is.

        Returns :
            (agg_used, tfm_used, whr_pmts) : de-duplicated aggregation primitives
            (including the "where" ones), de-duplicated transform primitives, and
            the "where" primitives in configured order
        """
        # time-related primitives bound to the configured time unit (from v_ft.time_unit)
        t = tu.time_unit(self.time_unit)
        time_since_last = t.time_since_last
        time_since_first = t.time_since_first
        avg_time_between = t.avg_time_between
        time_since = t.time_since
        time_since_previous = t.time_since_previous

        whr_dic = {'count':'count','if_exist':enco,'time_since_first':time_since_first,'time_since_last':time_since_last}
        whr_pmts = [whr_dic[n] for n in self.whr_pmt]
        time_agg = {'if_exist':enco,'max_age':max_age,'min_age':min_age,'max_bol':max_bol,'sum_bol':sum_bol,'time_since_first':time_since_first,'time_since_last':time_since_last,'avg_time_between':avg_time_between,'min_time':min_d,'max_time':max_d}
        time_tfm = {'time_since':time_since,'time_since_previous':time_since_previous,'seasons':seasons,'partofday':PartDay,'week_day':week_day}

        agg_not_whr = [time_agg[p] if p in time_agg else p for p in self.agg_kpt]
        # set() removes duplicates; the resulting order is therefore not guaranteed
        agg_used = list(set(agg_not_whr + whr_pmts))
        print('agg_used',agg_used)
        print('agg_not_whr',agg_not_whr)
        print('agg wher',whr_pmts)
        if self.tfm_kpt:
            tfm_used = list(set([time_tfm[p] if p in time_tfm else p for p in self.tfm_kpt]))
        else:
            tfm_used = []
        return agg_used,tfm_used,whr_pmts

    def get_info(self,df_name,info_name):
        """Return ``entity[info_name]`` of the entity whose 'entity' equals ``df_name``.

        Returns None when no entity matches.
        """
        for entity in self.entities:
            if entity['entity']==df_name:
                return entity[info_name]
        return None

    def add_r(self,es,one_df,one_col,many_df,many_col):
        """Add a one-to-many relationship (one_df.one_col -> many_df.many_col) to ``es``."""
        es = es.add_relationship(ft.Relationship(es[one_df][one_col],es[many_df][many_col]))
        return es

    def gen_idx(self,df,cols_k=None,name_key=None,keep_cols=None):
        """Add a concatenated key column to ``df`` (in place) when it is missing.

        The new column ``name_key`` holds the values of ``cols_k`` joined with
        ``_`` as strings. Nothing is added when ``name_key`` already exists or
        when ``name_key`` / ``cols_k`` is empty. The source columns are kept.

        Args :
            df : DataFrame to extend
            cols_k : list of column names making up the key
            name_key : name of the key column to create
            keep_cols : unused; kept for backward compatibility

        Returns :
            the same ``df`` object
        """
        print('col are',df.columns)
        print('key is',cols_k)
        print('name key is',name_key)
        if name_key and cols_k and name_key not in df.columns:
            print('generate key')
            df[name_key]=df[cols_k].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
        return df

    def dup_chk(self,datasets):
        """Drop duplicate keys in place, keeping the first occurrence.

        Each item of ``datasets`` is indexed as ``data[1]`` (key subset) and
        ``data[2]`` (the DataFrame).
        """
        for data in datasets:
            data[2].drop_duplicates(subset=data[1], keep='first', inplace=True)

    def get_entityset(self):
        """Build the EntitySet described by the config.

        Steps: generate concatenated index / foreign-key columns, register every
        entity and relationship, then attach the pivotings' interesting values.
        Also sets ``self.target``, ``self.target_idx_columns`` and
        ``self.target_key`` from the entity flagged ``if_target == 'yes'``.

        Returns :
            (es, cut_off_df) : the EntitySet and the cutoff DataFrame (None while
            cutoff handling is switched off)
        """
        ###############################################
        #### generate keys and frn keys for each table #
        ###############################################
        for entity in self.entities:
            # deal with time_idx
            if entity['time_idx']==COL_PLACEHOLDER[0]:
                entity['time_idx']=None
            # get target alias
            if entity['if_target']=='yes':
                self.target=entity['alias']
                # used to recover the concatenated index columns from the output index
                self.target_idx_columns=entity['idx'].split('|')
                self.target_key = entity['alias']+'_key'
            if '|' not in entity['idx']:
                # single-column index: use it directly
                entity['key']=entity['idx']
                continue
            cols_key = entity['idx'].split('|') #key columns
            data_cur = self.config[entity['entity']] #data
            idx_key = entity['alias']+'_key' #new key name
            entity['key']=idx_key
            # gen_idx mutates the DataFrame in place, so nothing needs saving back
            self.gen_idx(data_cur,cols_key,idx_key)

        for relation in self.relations:
            # get one and many alias
            one_alias = self.get_info(relation['one'],'alias')
            many_alias = self.get_info(relation['many'],'alias')
            # get key name
            frn_idx_key = many_alias+'_frn_'+one_alias
            # get many side data (looked up before relation['many'] is replaced by its alias)
            frn_data_cur = self.config[relation['many']]
            # get frn key columns
            frn_cols_key = relation['foreign_key'].split('|')
            # rewrite the relation in terms of aliases and generated key names
            relation['foreign_key'] = frn_idx_key
            relation['one']=one_alias
            relation['many']=many_alias
            relation['one_key']=relation['one']+'_key'
            self.gen_idx(frn_data_cur,frn_cols_key,frn_idx_key)

        es = ft.EntitySet()

        ##################################
        # get cutoff date for processing#
        ##################################
        # NOTE(review): unreachable while self.if_cut is hard-coded to False. The branch
        # refers to `new_datasets` and `self.cutoff_idx`, neither of which is defined in
        # this class, so it needs rework before cutoff handling is re-enabled.
        if self.if_cut:
            df_4_cutoff = new_datasets[-1][2]
            if not self.cutoff:
                self.cutoff = 'cutoff'
                df_4_cutoff[self.cutoff] =  pd.to_datetime('today')

            if self.config['use_cutoff_df']:
                print('i am in use cutoff dataset')
                cut_off_df=self.gen_idx(self.cutoff,self.cutoff_idx,new_datasets[-1][1])
            else:
                if is_numeric_dtype(df_4_cutoff[self.cutoff]) or is_string_dtype(df_4_cutoff[self.cutoff]):
                    df_4_cutoff[self.cutoff]=pd.to_datetime(df_4_cutoff[self.cutoff],errors='coerce')
                # make cutoff time at target entity level and no duplicates
                cut_off_df = df_4_cutoff[[new_datasets[-1][1],self.cutoff]].drop_duplicates()
        else:
            cut_off_df=None

        ##################################
        # add entities and relations
        ##################################
        for entity in self.entities:
            idx_used = entity['key']
            # no index configured: let featuretools create '<alias>_key'
            mk_idx = not idx_used
            if mk_idx:
                idx_used=entity['alias']+'_key'

            es.entity_from_dataframe(entity_id = entity['alias'],
                                     dataframe=self.config[entity['entity']],
                                     make_index=mk_idx,
                                     index=idx_used,
                                     time_index=entity['time_idx'])

        # add relations (one to many: one_df, one_key, many_df, many_frn)
        for relation in self.relations:
            es = self.add_r(es,relation['one'],relation['one_key'],relation['many'],relation['foreign_key'])

        print('es is',es)

        ##################################
          # get interesting values #
        ##################################
        if self.pivotings:
            for pivoting in self.pivotings:
                alias = self.get_info(pivoting['entity'],'alias')
                for i,col in enumerate(pivoting['columns']):
                    es[alias][col].interesting_values = pivoting['values'][i]
                    print('interesting finished')

        print('get entityset done')
        return es,cut_off_df

    def run_dfs(self,es=None,cutoff = None,saved_feature = None, featureonly = False, mat_name = 'all_fm', fet_name ='Whole_schema'):
        """Run Deep Feature Synthesis on the target entity.

        Args :
            es : EntitySet to use; built with get_entityset() when not given
            cutoff : cutoff time(s) passed to dfs; when ``es`` is built here and no
                cutoff was given, the cutoff returned by get_entityset() is used
            saved_feature, mat_name, fet_name : unused; kept for backward compatibility
            featureonly : passed to dfs as ``features_only``

        Returns :
            whatever ft.dfs returns for the given ``features_only`` setting
        """
        if not es:
            es,built_cutoff = self.get_entityset()
            # previously the freshly built cutoff was stored under a different name
            # and never reached dfs; an explicitly passed cutoff still wins
            if cutoff is None:
                cutoff = built_cutoff
        agg_used,tfm_used,whr_pmts = self.get_prmts()
        dfs_res =  ft.dfs(target_entity=self.target,
                                entityset=es,
                                agg_primitives=agg_used,
                                trans_primitives=tfm_used,
                                where_primitives=whr_pmts,
                                ignore_variables=self.ig_vars,
                                cutoff_time = cutoff,
                                features_only = featureonly
                               )
        return dfs_res

    def calculate_matrix(self,es,features,cutoff):
        """Compute the feature matrix for already-defined ``features`` on ``es``."""
        mtrx = ft.calculate_feature_matrix(features=features, entityset=es,cutoff_time=cutoff)
        return mtrx

    def recover_col(self,df,cols,key):
        """Split a concatenated key back into its original columns.

        The index of ``df`` is reset, the ``key`` column is split on ``_`` into
        ``cols`` and the ``key`` column is then dropped. Only the last column may
        itself contain ``_``, because at most ``len(cols)-1`` splits are made.

        Args :
            df : DataFrame whose index (or a column after reset) is named ``key``
            cols : names of the columns to recover, in concatenation order
            key : name of the concatenated key column

        Returns :
            a new DataFrame with the recovered columns
        """
        df = df.reset_index()
        if len(cols) == 1:
            # str.split treats n=0 as "split everywhere", which would break a
            # single-column key containing '_'; the key already is the column
            df[cols[0]] = df[key]
        else:
            df[cols] = df[key].str.split("_",n=len(cols)-1,expand=True)
        # keep the column when the key doubles as one of the recovered columns
        if key not in cols:
            df.drop(columns=key,inplace=True)
        return df

    def deal_colName(self,df):
        """Rewrite the column names of ``df`` in place and return ``df``.

        Replacements: ``(`` -> ``<``, ``)`` -> ``>``, ``=`` -> ``@``, ``.`` and
        ``,`` -> ``/``; spaces are removed.
        """
        df.columns = [name.translate(self._COL_NAME_TABLE) for name in df.columns]
        return df



# 4. process after schema

In [None]:
def filter_cols(features,keyword=None,feature_lst=None,status='check'):
    """Select feature names, or drop the selected features.

    Args :
        features : iterable of objects exposing ``get_name()``
        keyword : list of literal text segments that must all occur in the
            feature name, in this order, e.g. ``['SP','_N']`` matches
            ``IF(claimant.idx WHERE age_SP = <= 18_N)``. Segments are matched
            literally, so characters such as ``(`` or ``.`` need no escaping.
        feature_lst : explicit list of feature names; when given it takes
            precedence over ``keyword``
        status : ``'check'`` returns the selected names; any other value returns
            the features whose name is NOT selected

    Returns :
        list of names (status 'check') or list of feature objects (otherwise).
        With neither ``keyword`` nor ``feature_lst`` nothing is selected.
    """
    selected = []
    if keyword:
        # segments in order, with anything in between; escaped because feature
        # names are full of regex metacharacters
        pattern = re.compile('.*'.join(re.escape(segment) for segment in keyword))
        selected = [f.get_name() for f in features if pattern.search(f.get_name())]
    if feature_lst:
        selected = feature_lst

    if status=='check':
        return selected
    excluded = set(selected)
    return [f for f in features if f.get_name() not in excluded]