In [1]:
import matplotlib.pyplot as plt
import math
import pandas as pd
import numpy as np

def create_table(df, area=None, ind='score_bucket', match=False):
    """Summarise user counts per bucket with shares and cumulative totals.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``phone_number_clean``, ``prediction_label`` and the
        grouping column. An ``area`` column is only needed when ``area`` is given.
    area : optional
        When given, only rows whose ``area`` column equals this value are used.
    ind : str, default 'score_bucket'
        Grouping column. Only honoured when ``match`` is not equal to False.
    match : bool, default False
        When equal to False the table is always grouped by ``'score_bucket'``
        and ``ind`` is ignored.

    Returns
    -------
    pandas.DataFrame
        Indexed by the grouping column in descending order, with columns
        ``all_user_count`` (rows per group), ``interested_user_count`` (rows
        with ``prediction_label == 1``), ``pct`` (share of all rows),
        ``cml_count`` and ``cml_pct`` (running totals from the top group down).
    """
    if area is not None:
        # A boolean mask instead of a string-built query: area names that
        # contain quotes would otherwise produce an invalid query expression.
        df = df[df['area'] == area]

    # Equality test kept on purpose so that only values equal to False
    # (False, 0) force the default grouping.
    if match == False:  # noqa: E712
        ind = 'score_bucket'

    counts = (
        df
        .pivot_table(
            index=ind,
            values=['phone_number_clean', 'prediction_label'],
            aggfunc={
                'phone_number_clean': len,
                'prediction_label': lambda labels: (labels == 1).sum(),
            },
        )
        .rename(
            columns={
                'phone_number_clean': 'all_user_count',
                'prediction_label': 'interested_user_count',
            }
        )
        # `ind` is the index name here; sort_values accepts index level names.
        .sort_values(by=ind, ascending=False)
        .assign(
            # Series.sum() instead of agg(sum): passing the builtin is deprecated.
            pct=lambda t: t.all_user_count / t.all_user_count.sum(),
            cml_count=lambda t: t.all_user_count.cumsum(),
            cml_pct=lambda t: t.pct.cumsum(),
        )
    )

    # Put the total count first, keep the remaining columns in their order.
    trailing = [name for name in counts.columns if name != 'all_user_count']
    return counts[['all_user_count'] + trailing]


# default group by score_buckets
def create_plot(df, col, ind='score_bucket', area=None, num_func='sum', bool_func='sum', show_table=True, save=True):
    """Aggregate the selected columns per group, plot them and build a summary matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; must contain ``ind`` and every column in ``col``
        (plus ``area`` when an area filter is requested).
    col : iterable of str
        Columns to profile.
    ind : str, default 'score_bucket'
        Grouping column.
    area : optional
        When given, only rows whose ``area`` column equals this value are used;
        it is also used as the figure title.
    num_func, bool_func : str or callable
        Aggregations applied to numeric and boolean columns respectively.
        Object (string) columns are always counted.
    show_table : bool
        Currently unused; kept so existing calls keep working.
    save : bool, default True
        When true, writes ``result/bucket_profile.csv`` and
        ``result/bucket_matrix.csv``.

    Returns
    -------
    tuple
        ``(res, graph, mat)``: the aggregated frame sorted by group in
        descending order, the object returned by ``DataFrame.plot``, and the
        transposed matrix in which columns sharing a known prefix are collapsed
        into one ``main_<prefix>`` text row.
    """
    columns = list(col)

    if area is not None:
        # Boolean mask rather than a string-built query so quotes in the
        # area name cannot break the expression.
        df = df[df['area'] == area]
    df = df[[ind] + columns]

    # Pick one aggregation per column according to its dtype.
    features = df.drop(columns=ind)
    aggfunc = {}
    aggfunc.update(dict.fromkeys(features.select_dtypes(include=['object']).columns, 'count'))
    # 'number' covers every integer/float width, not only the 16/32/64-bit ones.
    aggfunc.update(dict.fromkeys(features.select_dtypes(include='number').columns, num_func))
    aggfunc.update(dict.fromkeys(features.select_dtypes(include=['bool']).columns, bool_func))

    res = (
        df
        .pivot_table(index=ind, values=columns, aggfunc=aggfunc)
        .reset_index()
        .melt(id_vars=ind, value_vars=columns, value_name='value')
        .pivot(index=ind, columns='variable', values='value')
        # Sort on the actual grouping column; a hard-coded 'score_bucket'
        # raised KeyError for any other `ind`.
        .sort_values(by=ind, ascending=False)
    )

    # Collapse each family of columns into a single "label (xx.xx%)" string.
    prefixes = ['kyc', 'lyl', 'acq', 'bizage', 'edc_count', 'edc_type', 'ms', 'is_core', 'is_ppob', 'is_qris']
    as_pct = "{0:.2f}".format
    mat = res.copy()
    for prefix in prefixes:
        members = [name for name in mat.columns if name.startswith(prefix)]
        if not members:
            # Nothing to collapse for this family in the selected columns.
            continue
        mat[f'main_{prefix}'] = (
            mat[members]
            # Label is the part of the column name after the first underscore.
            .apply(lambda s: s.name.split('_', maxsplit=1)[1] + ' (' + s.map(lambda v: as_pct(v * 100)) + '%)')
            .agg(', '.join, axis=1)
        )
        mat = mat.drop(columns=members)
    mat = mat.T

    by_bucket = ind == 'score_bucket'
    graph = res.plot(
        kind='bar',
        subplots=True,
        layout=(2, math.ceil(len(columns) / 2)),
        xlabel='',
        rot=(0 if by_bucket else 90),
        sharex=(False if by_bucket else True),
        legend=None,
        figsize=(5 * len(columns), 10),
        title=area,
    )

    if save:
        res.to_csv('result/bucket_profile.csv')
        mat.to_csv('result/bucket_matrix.csv')

    return res, graph, mat


# show both table and plot
def create_pd(df, col, ind='score_bucket', area=None, num_func='sum', bool_func='sum', match=False):
    """Save and display the probability table, bucket profiles and bucket matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame passed to both ``create_table`` and ``create_plot``.
    col : iterable of str
        Columns profiled by ``create_plot``.
    ind : str, default 'score_bucket'
        Grouping column.
    area : optional
        Area filter; when given it is also appended to the probability-table
        file name.
    num_func, bool_func : str or callable
        Aggregations forwarded to ``create_plot``.
    match : bool, default False
        Forwarded to ``create_table`` only; it does not affect ``create_plot``.

    The probability table is written to ``result/probability_table.csv`` (or
    ``result/probability_table_{area}.csv``); ``create_plot`` is called with
    its default ``save`` setting. Nothing is returned.
    """
    # Build the table once and reuse it for both the CSV and the display.
    table = create_table(df, area, ind, match)
    suffix = '' if area is None else f'_{area}'
    table.to_csv(f'result/probability_table{suffix}.csv')

    print('Displaying proability table...')
    display(table)

    # The plot object itself is not needed here; the figure is rendered by the call.
    res, _, mat = create_plot(df, col, ind, area, num_func, bool_func)
    print('Displaying bucket profiles...')
    display(res)
    print('Displaying bucket matrix...')
    display(mat)
    


In [None]:
# Load the result file to profile.
# NOTE(review): the earlier comment referred to a deploy_result folder, but this
# path is relative to the working directory — confirm the intended location.
res = pd.read_csv('edc_result_1307.csv')

# Profile every column except the first 2 and the last 7.
col = list(res.columns[2:-7])

# Optional arguments that were toggled by hand before: area='Sumatera 2'
# to restrict to one area, match=True to let `ind` drive the probability table.
create_pd(
    df=res,
    col=col,
    ind='score_bucket',
    num_func='mean',  # alternatives: sum, median, count
    bool_func='mean',
)

In [2]:
# Load the two frames whose missing values are inspected below.
# NOTE(review): read_pickle can execute arbitrary code while loading;
# only use it on files from a trusted source.
train, ops = (
    pd.read_pickle(path)
    for path in ('data/raw_data.pkl', 'data/deploy_raw.pkl')
)

In [47]:
def create_null_table(train, name):
    """Display and save the per-column null share and interquartile range.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame to profile (any frame can be passed, not only the training set).
    name : str
        Label used in the output file name and shown next to the table.

    The summary (rows ``nulls`` and ``iqr``, one column per input column) is
    written to ``result/null_table_{name}.csv`` and displayed. Nothing is
    returned.
    """
    # Share of missing values per column.
    null_share = 1 - (train.count() / len(train))

    # Evaluate describe() once and pick the quartiles by label rather than by
    # row position; reindex yields NaN rows if the quartiles are absent.
    quartiles = train.describe().reindex(['25%', '75%'])
    iqr = quartiles.loc['75%'] - quartiles.loc['25%']

    # Columns without quartiles simply get NaN in the `iqr` row.
    null_table = pd.DataFrame({'nulls': null_share, 'iqr': iqr}).T

    # Save the summary itself (not the raw input frame) and label it with `name`.
    null_table.to_csv(f'result/null_table_{name}.csv')
    display(null_table, name)

In [48]:
# Produce the null/IQR summary for each frame, announcing it first.
for banner, frame, tag in (
    ('Displaying null tables for training set...', train, 'train'),
    ('Displaying null tables for ops set...', ops, 'ops'),
):
    print(banner)
    create_null_table(frame, tag)

Displaying null tables for training set...


Unnamed: 0,LM0_core_count,LM0_core_tpv,LM0_ppob_count,LM0_ppob_tpv,LM0_qris_count,LM0_qris_tpv,LM1_core_count,LM1_core_tpv,LM1_ppob_count,LM1_ppob_tpv,...,m0_transaxi_cnt,m2_Utang_cnt,m2_transaxi_cnt,m3_Utang_cnt,m3_transaxi_cnt,ms_area,phone_number_clean,ppob_before_shutdown_flag,referee_count,user_age
nulls,0.596276,0.875854,0.596276,0.844817,0.596276,0.99193,0.596276,0.854997,0.596276,0.815394,...,0.147983,0.147983,0.147983,0.147983,0.147983,0.078833,0.0,0.0,0.0,0.456238
iqr,4.0,,9.0,,0.0,,14.0,,27.0,,...,34.0,64.0,42.0,69.0,43.0,,,0.0,0.0,10.0


'train'

Displaying null tables for ops set...


Unnamed: 0,LM0_core_count,LM0_core_tpv,LM0_ppob_count,LM0_ppob_tpv,LM0_qris_count,LM0_qris_tpv,LM1_core_count,LM1_core_tpv,LM1_ppob_count,LM1_ppob_tpv,...,m0_transaxi_cnt,m2_Utang_cnt,m2_transaxi_cnt,m3_Utang_cnt,m3_transaxi_cnt,ms_area,phone_number_clean,ppob_before_shutdown_flag,referee_count,user_age
nulls,0.700908,0.976267,0.700908,0.960579,0.700908,0.998721,0.700908,0.96538,0.700908,0.949087,...,0.131937,0.131937,0.131937,0.131937,0.131937,0.162341,0.0,0.0,0.979413,0.603532
iqr,0.0,,0.0,,0.0,,0.0,,0.0,,...,0.0,1.0,0.0,2.0,1.0,,,0.0,0.0,11.0


'train'

In [31]:
# Interquartile range (75th minus 25th percentile) per column of `ops`.
# describe() is evaluated once and the quartile rows are selected by label,
# so the result does not depend on the row order of the summary.
ops_quartiles = ops.describe().loc[['25%', '75%']]
ops_iqr = ops_quartiles.loc['75%'] - ops_quartiles.loc['25%']
ops_iqr.to_frame('iqr')

Unnamed: 0,iqr
user_age,11.0
age_on_core_days,428.0
age_ncore_days,74.0
age_on_accounting_days,643.0
referee_count,0.0
core_before_shutdown_flag,0.0
ppob_before_shutdown_flag,0.0
m0_transaxi_cnt,0.0
m0_Utang_cnt,0.0
lm_transaxi_cnt,0.0


In [23]:
# NOTE(review): `a` is not defined in any cell visible here, so this cell fails
# on a fresh kernel. Judging by the output column name it presumably held the
# '25%' row of a describe() result — confirm, then define it here or drop the cell.
a.to_frame()

Unnamed: 0,25%
LM2_qris_count,0.0
LM3_core_count,0.0
LM3_ppob_count,0.0
LM3_qris_count,0.0
LQ1_core_count,0.0
LQ1_ppob_count,0.0
LQ1_qris_count,0.0
LQ2_core_count,0.0
LQ2_ppob_count,0.0
LQ2_qris_count,0.0


In [37]:
# Print column names, non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8055 entries, 0 to 8054
Data columns (total 50 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   phone_number_clean         8055 non-null   object
 1   interested_to_EDC_flag     8055 non-null   object
 2   ms_area                    7420 non-null   object
 3   acquisition_channel        8004 non-null   object
 4   user_age                   4380 non-null   Int64 
 5   kyc_tier                   3590 non-null   object
 6   loyalty_tier               4322 non-null   object
 7   age_on_core_days           2052 non-null   Int64 
 8   age_on_ncore_days          804 non-null    Int64 
 9   age_on_accounting_days     7174 non-null   Int64 
 10  referee_count              8055 non-null   Int64 
 11  est_daily_customer         416 non-null    object
 12  count_trf                  554 non-null    object
 13  edc_count                  422 non-null    object
 14  edc_type

In [35]:
# Print column names, non-null counts and dtypes of the ops frame.
# NOTE(review): the printed schemas differ from the training frame (e.g.
# `age_ncore_days` and `interest` here vs `age_on_ncore_days` and
# `interested_to_EDC_flag` there) — confirm this is intended.
ops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 321373 entries, 0 to 321372
Data columns (total 50 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   phone_number_clean         321373 non-null  object
 1   ms_area                    269201 non-null  object
 2   interest                   321373 non-null  object
 3   acquisition_channel        321373 non-null  object
 4   user_age                   127414 non-null  Int64 
 5   kyc_tier                   321373 non-null  object
 6   loyalty_tier               321373 non-null  object
 7   age_on_core_days           52169 non-null   Int64 
 8   age_ncore_days             23635 non-null   Int64 
 9   age_on_accounting_days     278972 non-null  Int64 
 10  referee_count              6616 non-null    Int64 
 11  est_daily_customer         8564 non-null    object
 12  count_trf                  11001 non-null   object
 13  edc_count                  8645 non-null    