In [5]:
import pandas as pd
import numpy as np
import itertools
import warnings
from itertools import chain
from collections import Counter


# copy the events of one row into a plain list
def get_event_list(x):
    """Return the items of the iterable ``x`` as a new list, in iteration order."""
    return list(x)

# clean the data (for raw)
def clean_data(raw, top_n=100):
    """Clean the raw frame and expand its per-row events into count columns.

    Steps, in order:

    1. Normalise the text columns (``pic_age``, ``business_age``,
       ``surrounding_area_v2``, ``edc_count``, ``mbanking_count``) and coerce
       the numeric / date columns to proper dtypes.
    2. Apply ``log1p`` to the numeric columns. ``count_trf`` values outside
       ``[0, 5000]`` and ``F3D_wallet_share`` values above 1 become NaN.
    3. Turn every column whose name contains ``'flag'`` into a boolean
       (``True`` where the value equals 1).
    4. Replace ``event`` by one count column per frequent event: the cell holds
       how many times the event occurs in that row, or NaN if it never occurs.

    Parameters
    ----------
    raw : pandas.DataFrame
        Must contain every column referenced below plus ``event``, whose cells
        are iterables of hashable event names. ``raw`` itself is not modified.
    top_n : int, default 100
        Number of most frequent events (by total occurrences over all rows)
        considered before the payment-related ones are filtered out.

    Returns
    -------
    pandas.DataFrame
        The kept event count columns (most frequent first) followed by the
        cleaned non-event columns in their original order.
    """
    # Events whose name contains one of these substrings are dropped.
    excluded_keywords = ['payment', 'pembayaran', 'Payment', 'nulltest', 'ppob']

    # Computed once; the original re-ran this filter for every single column.
    flag_cols = list(raw.filter(like='flag').columns)

    def _range_label(s):
        # '25-34' -> '25to34', '55+' -> '55plus'; only the first value of a
        # comma-separated list is kept. regex=False makes '+' and '-' literal
        # regardless of the pandas version's default.
        return (
            s.str.replace('-', 'to', regex=False)
            .str.replace('+', 'plus', regex=False)
            .str.split(',').str[0]
        )

    df = (
        raw
        .assign(
            pic_age = lambda x: _range_label(x.pic_age),
            business_age = lambda x: _range_label(x.business_age),
            surrounding_area_v2 = lambda x: x.surrounding_area_v2.str.replace('/', 'or', regex=False).str.replace(' ', '', regex=False),
            edc_count = lambda x: x.edc_count.str.replace('-', 'to', regex=False),
            mbanking_count = lambda x: x.mbanking_count.str.replace('-', 'to', regex=False),
            count_employee = lambda x: pd.to_numeric(x.count_employee, errors='coerce'),
            est_daily_customer = lambda x: pd.to_numeric(x.est_daily_customer, errors='coerce'),
            count_trf = lambda x: pd.to_numeric(x.count_trf),
            interest_to_loan_flag = lambda x: x.interest_to_loan_flag.str.replace('Ya', 'Tertarik', regex=False) == "Tertarik",  # as bool
            cumulative_W3_core_NetRevenue = lambda x: pd.to_numeric(x.cumulative_W3_core_NetRevenue, errors='coerce'),
            W3_retained_flag = lambda x: x.W3_retained_flag.fillna(0).astype(bool),
            offline_acquired_date = lambda x: pd.to_datetime(x.offline_acquired_date),
            # -- core
            F3D_core_TPU_wd = lambda x: pd.to_numeric(x.F3D_core_TPU_wd, errors='coerce'),
            F3D_core_TPV_wd = lambda x: pd.to_numeric(x.F3D_core_TPV_wd, errors='coerce'),
            F3D_core_NetRevenue_wd = lambda x: pd.to_numeric(x.F3D_core_NetRevenue_wd, errors='coerce'),
            # shares above 1 are nulled, the rest is rescaled by 100
            F3D_wallet_share = lambda x: pd.to_numeric(x.F3D_wallet_share.mul(100).mask(x.F3D_wallet_share > 1)),
            # -- ppob
            F3D_ppob_TPU_wd = lambda x: pd.to_numeric(x.F3D_ppob_TPU_wd, errors='coerce'),
        )
        # normalization
        .assign(
            count_employee = lambda x: x.count_employee.apply(np.log1p),
            est_daily_customer = lambda x: x.est_daily_customer.apply(np.log1p),
            # Null out-of-range counts before the log transform. The original
            # combined the two bounds with `&`, which can never be true, so no
            # outlier was ever removed.
            count_trf = lambda x: x.count_trf.mask((x.count_trf > 5000) | (x.count_trf < 0)).apply(np.log1p),
            # -- core
            cumulative_W3_core_NetRevenue = lambda x: x.cumulative_W3_core_NetRevenue.apply(np.log1p),
            F3D_core_TPU_wd = lambda x: x.F3D_core_TPU_wd.apply(np.log1p).astype(float),
            F3D_core_TPV_wd = lambda x: x.F3D_core_TPV_wd.apply(np.log1p).astype(float),
            F3D_core_NetRevenue_wd = lambda x: x.F3D_core_NetRevenue_wd.apply(np.log1p).astype(float),
            F3D_wallet_share = lambda x: x.F3D_wallet_share.apply(np.log1p),
            # -- ppob
            F3D_ppob_TPU_wd = lambda x: x.F3D_ppob_TPU_wd.apply(np.log1p).astype(float),
        )
    )

    # change all "*flag*" columns to boolean (True where the value equals 1)
    for flag in flag_cols:
        df[flag] = (df[flag] == 1).fillna(False).astype(bool)

    # materialise each row's events as a list so .count() is available
    df = (
        df
        .assign(event_list = lambda x: x.event.apply(list))
        .drop(columns=['event'])
    )

    # all columns except for event_list
    other_col = list(df.drop(columns='event_list').columns)

    # keep only the `top_n` most frequent events ...
    count = pd.Series(Counter(chain.from_iterable(df.event_list)))
    event_col = count.sort_values(ascending=False).iloc[:top_n].index.tolist()

    # ... and remove any event that is related to payment
    event_col = [name for name in event_col if not any(word in name for word in excluded_keywords)]

    # One count column per kept event. Only the kept events are expanded,
    # since every other event column would be discarded by the selection below.
    for name in event_col:
        df[name] = df['event_list'].apply(
            lambda events, name=name: events.count(name) if name in events else np.nan
        )

    return df[event_col + other_col]


# turn numeric columns into categorical bins
def cat_num(x):
    """Bin a numeric Series by the quartiles of its strictly positive values.

    The bin edges are ``0``, the 25th / 50th / 75th percentiles and the maximum
    of the values greater than zero. Zero falls in the first bin
    (``include_lowest=True``); repeated edges are collapsed
    (``duplicates='drop'``); values outside the edges become NaN.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to bin.

    Returns
    -------
    pandas.Series
        Categorical Series of intervals, aligned with ``x``.
    """
    # Summarise once and look the statistics up by label: positional lookups
    # such as describe()[4] on a label-indexed Series are deprecated in pandas.
    stats = x[x > 0].describe()
    edges = [0, stats['25%'], stats['50%'], stats['75%'], stats['max']]
    return pd.cut(x, bins=edges, include_lowest=True, duplicates='drop')

# create pivot table
def create_pivot(df, target_col, univariate=True):
    """Display pivot tables of ``target_col`` against the other columns of ``df``.

    Every float / int column other than the target is first binned with
    ``cat_num``. Then one pivot table (mean, sum and count of the target) is
    displayed per column when ``univariate`` is true, or per pair of columns
    otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is left untouched: binning happens on a copy.
    target_col : label or list of labels
        Column(s) aggregated in each pivot table.
    univariate : bool, default True
        ``True`` for one table per column, ``False`` for one table per
        2-column combination.

    Returns
    -------
    None
        The tables are rendered with the notebook's ``display``.
    """
    # Bin on a copy so the caller's numeric columns are not overwritten.
    binned = df.copy()

    # The target must stay numeric/boolean, otherwise mean/sum cannot be computed.
    targets = set(target_col) if pd.api.types.is_list_like(target_col) else {target_col}
    for name in binned.select_dtypes(include=['float', 'int']).columns:
        if name not in targets:
            binned[name] = cat_num(binned[name])

    features = binned.drop(columns=target_col).columns
    if univariate:
        for name in features:
            pivot = binned.pivot_table(index=name, values=target_col, aggfunc=['mean', 'sum', 'count'])
            display(pivot)
    else:
        for first, second in itertools.combinations(features, 2):
            pivot = binned.pivot_table(index=[first, second], values=target_col, aggfunc=['mean', 'sum', 'count'])
            display(pivot)


In [6]:
# NOTE: this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Stack the train and test inference results. DataFrame.append was removed in
# pandas 2.0; pd.concat gives the same result and, like append's default,
# keeps each file's own row labels.
res = pd.concat([
    pd.read_csv('result/train_inference.csv'),
    pd.read_csv('result/test_inference.csv'),
])

