# Decision Tree Model with TensorFlow Decision Forests

## 1. Load data into python
Change the Okta user ID (`okta_username`) in the cell below to your own before running.

In [1]:
#Vega Setup
import logging
import pandas as pd
import numpy as np
from afterpay_gdp_interfaces import RedshiftHook

import datetime
import pytz
# Timezone used when printing run timestamps.
CST = pytz.timezone('Asia/Shanghai')

# Widen pandas display limits so large pivot tables / wide frames are shown in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
# Configure the root logger so INFO-level messages (e.g. query timings) are emitted.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Connection hook for the Vega cluster.
# NOTE(review): the Okta user is hard-coded; replace it with your own user id before running.
vega = RedshiftHook(cluster='vega', okta_username='songli@squareup.com') # for vega connection
def vega_execute(query):
    """
    Execute a SQL statement on Vega and commit it every time.

    Opens a connection from the module-level ``vega`` hook, runs ``query`` on a
    cursor, commits the transaction and logs the elapsed wall-clock time at
    INFO level.

    Parameters
    ----------
    query : SQL text, passed unchanged to ``cursor.execute``.

    Returns
    -------
    None
    """
    # The docstring must be the first statement of the function; the import
    # therefore comes after it (previously it preceded the string, which left
    # the function without a docstring).
    import datetime

    t_start = datetime.datetime.now()
    with vega.get_conn() as vega_conn:
        with vega_conn.cursor() as cur:
            cur.execute(query)
        vega_conn.commit()
        t_end = datetime.datetime.now()
        logging.info("Vega Query Finished. Time used: {}".format(str(t_end - t_start)))
        # NOTE(review): the connection is not closed explicitly here; confirm
        # that the context manager returned by get_conn() releases it.

Apply some initial filtering in SQL when pulling the sample data.

In [2]:
# Sample-pull query against sandbox_analytics_au.feature_base_sl. Rows are kept when:
#   - the region partition is AU;
#   - the in-flight shipping address line 1 and city both differ from the consumer's
#     (compared case-insensitively with spaces removed) and the shipping line 1 is non-empty;
#   - days_since_first_order_date is above 42;
#   - the latest successful 2FA login (epoch seconds) is no earlier than 30 days
#     before the checkout date;
#   - the in-flight order amount (NULL / empty treated as 0) is above 200.
q="""
select
*
from sandbox_analytics_au.feature_base_sl drv
where
    par_region='AU'
    and lower(replace(in_flight_order_shipping_address_address_1,' ',''))!=lower(replace(consumer_address_1,' ',''))
    and lower(replace(in_flight_order_shipping_address_city,' ',''))!=lower(replace(consumer_city,' ',''))
    and in_flight_order_shipping_address_address_1!=''
    and days_since_first_order_date>42
    and ((TIMESTAMP 'epoch' + c_latest_login_2fa_success_timestamp::FLOAT *INTERVAL '1 second')>=cast(checkout_time as date)-30)
    and (case when nvl(in_flight_order_amount,'0')='' then '0' else in_flight_order_amount end)::float>200
;"""

In [3]:
%%time
# Print the start time (in the CST timezone defined above) for reference.
print(datetime.datetime.now(CST))
# Run the sample query through the Vega hook and keep the result as a DataFrame.
rule_perf=vega.get_pandas_df(q)

2023-04-26 13:26:55.924972+08:00
CPU times: user 12.4 s, sys: 2.23 s, total: 14.7 s
Wall time: 26.3 s


In [4]:
# Calendar date of each checkout (stored as datetime.date objects).
rule_perf['order_date'] = pd.to_datetime(rule_perf['checkout_time']).dt.date
# Monday of the week each order falls in. Computed with vectorized date
# arithmetic on naive midnights instead of a row-wise apply, so it is faster
# and a missing checkout time yields NaT instead of raising.
order_day = pd.to_datetime(rule_perf['order_date'])
rule_perf['order_week'] = (order_day - pd.to_timedelta(order_day.dt.weekday, unit='D')).dt.date



### Check the initial fraud rate by week

In [5]:
# Count orders per order_week, split by loss_ind (one column per label value).
rule_perf.pivot_table(values='in_flight_order_amount',index='order_week', columns='loss_ind', aggfunc='count')

loss_ind,0.0,1.0
order_week,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-02-20,7312.0,3.0
2023-02-27,18707.0,7.0
2023-03-06,18015.0,8.0
2023-03-13,21514.0,9.0
2023-03-20,19305.0,5.0
2023-03-27,19850.0,7.0
2023-04-03,19418.0,4.0
2023-04-10,21143.0,9.0
2023-04-17,20836.0,2.0
2023-04-24,2183.0,


## 2. Import related packages
If TensorFlow and TensorFlow Decision Forests are not installed yet, install them with:
```shell
!pip install --user tensorflow  
!pip install --user tensorflow_decision_forests
```

In [6]:
import tensorflow_decision_forests as tfdf
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import math



## 3. Preprocessing
Apply some filtering to the sample data.  
Apply a feature exclusion list, since some features must not (or you may not want them to) be used as predictors in the model.

In [7]:
# Apply some filter criteria: recent orders that were approved, are not in a
# control group, and come from consumers whose first order is >300 days old.
is_recent = pd.to_datetime(rule_perf['order_date']) >= '2022-11-24'
is_approved = rule_perf['rule_ids'] == '["Approved"]'
# na=False treats a missing control_group as "not control"; without it a NaN
# in the mask makes the `~` negation below fail.
is_control = rule_perf['control_group'].str.lower().str.contains('control', na=False)
is_tenured = rule_perf['days_since_first_order_date'] > 300
dataset_df = rule_perf[is_recent & is_approved & ~is_control & is_tenured].copy()

# Fill missing labels with 0. Assign the result back instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form warns
# in recent pandas and does not modify the frame under Copy-on-Write.
dataset_df['loss_ind'] = dataset_df['loss_ind'].fillna(0)

label = "loss_ind"

classes = dataset_df[label].unique().tolist()
print(f"Label classes: {classes}")

# Exclude some features from being predictors: an explicit list, every column
# whose name contains 'time', and every '_address' column that is not a 'geo' one.
excluded_columns = set(
    ['checkout_time','checkout_time_f','ato_ind','sf_id', 'gwr_ind','par_process_date'
     ,'in_flight_order_shipping_address_hash','rule_ids','consumer_city'
     ,'gmv_local','gmv_aud','order_week', 'sf_ind','consumer_given_names'
     ,'c_latest_email_login_2fa_success_timestamp','session_user_agent'
     ,'merchant_id', 'merchant_id_f','consumer_id','consumer_email','order_date','consumer_id_f', 'in_flight_order_consumer_email', 'consumer_mobile'
     ,'in_flight_order_consumer_name','consumer_name','consumer_postcode','consumer_birth_date_epoch_millis','highest_delphi_score_last_12_hours'
    ]
    + dataset_df.columns[dataset_df.columns.str.contains('time')].to_list()
    + dataset_df.columns[(dataset_df.columns.str.contains('_address'))&~(dataset_df.columns.str.contains('geo'))].to_list()
)
# Keep the frame's own column order (a plain set difference would reorder the
# features differently on every kernel start).
feature_columns = [col for col in dataset_df.columns if col not in excluded_columns]
dataset_df2 = dataset_df[feature_columns].copy()
assert label in dataset_df2.columns, "label column was removed by the exclusion list"

# Format conversion: boolean columns are converted to strings.
bool_columns = dataset_df2.dtypes[dataset_df2.dtypes=='bool'].index
dataset_df2[bool_columns] = dataset_df2[bool_columns].astype(str)

Label classes: [0.0, 1.0]


## 4. Train Test split
`test_ratio` controls the (approximate) fraction of rows assigned to the test sample.

In [8]:
def split_dataset(dataset, test_ratio=0.1, seed=None):
    """Split a pandas DataFrame into a train part and a test part.

    Each row is assigned to the test part independently with probability
    ``test_ratio``, so the resulting sizes are only approximately
    ``(1 - test_ratio)`` and ``test_ratio`` of the input.

    Parameters
    ----------
    dataset : the DataFrame to split.
    test_ratio : probability of a row going to the test part.
    seed : optional integer. When given, the split is drawn from a dedicated
        generator seeded with it and is therefore reproducible. When None
        (the default), the global NumPy random state is used, as before.

    Returns
    -------
    (train, test) : two DataFrames that together contain every input row once.
    """
    if seed is None:
        draws = np.random.rand(len(dataset))
    else:
        # A private generator keeps the split reproducible without touching
        # the global NumPy random state.
        draws = np.random.RandomState(seed).rand(len(dataset))
    test_indices = draws < test_ratio
    return dataset[~test_indices], dataset[test_indices]

# Seed NumPy's global generator right before splitting so the train/test
# membership (and everything trained on it) is the same on every run.
np.random.seed(42)
train_ds_pd, test_ds_pd = split_dataset(dataset_df2)
# Every row must land in exactly one of the two parts.
assert len(train_ds_pd) + len(test_ds_pd) == len(dataset_df2)
print("{} examples in training, {} examples for testing.".format(
    len(train_ds_pd), len(test_ds_pd)))

# Convert both parts to TensorFlow datasets, using `label` as the target column.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_ds_pd, label=label)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_ds_pd, label=label)

160542 examples in training, 17792 examples for testing.


  features_dataframe = dataframe.drop(label, 1)


## 5. Build Model
Some key parameters:
- min_examples: controls the minimum size of a node. You need to balance between overfitting and fraud rate.
- max_depth: the maximum depth of each tree.

In [11]:
%%time
# build model
# A deliberately small forest (3 trees, depth <= 5, at least 30 examples per node).
# The trailing comment lists alternative settings that are currently disabled.
model = tfdf.keras.RandomForestModel(min_examples=30,max_depth=5, num_trees=3)#,sampling_with_replacement=False,bootstrap_size_ratio=1.0, num_candidate_attributes=-1, num_trees=3)

# train model
# The held-out test dataset is passed as validation data.
model.fit(x=train_ds, validation_data=test_ds)

Use /tmp/tmpv27v65lc as temporary training directory
Starting reading the dataset
Dataset read in 0:00:07.972073
Training model
Model trained in 0:00:01.194876
Compiling model




CPU times: user 14 s, sys: 481 ms, total: 14.5 s
Wall time: 11.2 s


<keras.callbacks.History at 0x7f7ce00d6160>

In [None]:
# Print the summary of the trained model.
model.summary()

## 6. Visualize the tree 

In [12]:
# Render tree 0 into its own HTML file. Using one shared file name for every
# tree would make all embedded frames show whichever tree was written last.
tree_idx = 0
plot_path = f"plot_tree_{tree_idx}.html"
with open(plot_path, "w") as f:
    f.write(tfdf.model_plotter.plot_model(model, tree_idx=tree_idx, max_depth=8))

from IPython.display import IFrame
IFrame(src=f"./{plot_path}", width=2000, height=600)

In [13]:
# Render tree 1 into its own HTML file. Using one shared file name for every
# tree would make all embedded frames show whichever tree was written last.
tree_idx = 1
plot_path = f"plot_tree_{tree_idx}.html"
with open(plot_path, "w") as f:
    f.write(tfdf.model_plotter.plot_model(model, tree_idx=tree_idx, max_depth=8))

from IPython.display import IFrame
IFrame(src=f"./{plot_path}", width=2000, height=600)

In [14]:
# Render tree 2 into its own HTML file. Using one shared file name for every
# tree would make all embedded frames show whichever tree was written last.
tree_idx = 2
plot_path = f"plot_tree_{tree_idx}.html"
with open(plot_path, "w") as f:
    f.write(tfdf.model_plotter.plot_model(model, tree_idx=tree_idx, max_depth=8))

from IPython.display import IFrame
IFrame(src=f"./{plot_path}", width=2000, height=600)

In [19]:
# Get an inspector for the trained model and extract the first tree (index 0)
# so its split conditions and leaf values can be read as text.
inspector=model.make_inspector()
inspector.extract_tree(tree_idx=0)

Tree(NonLeafNode(condition=(sp_c_online_order_amount_h12_0 >= 324.30999755859375; miss=False), pos_child=NonLeafNode(condition=(sp_c_fraud_decline_attempt_h1_0 >= 0.5; miss=False), pos_child=LeafNode(value=ProbabilityValue([0.9491525423728814, 0.05084745762711865],n=59.0), idx=11), neg_child=NonLeafNode(condition=(model_online_cb_global_july_2022_score >= 120.59197998046875; miss=False), pos_child=LeafNode(value=ProbabilityValue([0.8787878787878788, 0.12121212121212122],n=33.0), idx=10), neg_child=NonLeafNode(condition=(bp_m_mcc_code in ['5732', '5691']; miss=False), pos_child=LeafNode(value=ProbabilityValue([0.9947089947089947, 0.005291005291005291],n=378.0), idx=9), neg_child=LeafNode(value=ProbabilityValue([1.0, 0.0],n=4051.0), idx=8), value=ProbabilityValue([0.9995484307970196, 0.0004515692029803567],n=4429.0)), value=ProbabilityValue([0.9986553115194979, 0.001344688480502017],n=4462.0)), value=ProbabilityValue([0.9980092899800929, 0.0019907100199071004],n=4521.0)), neg_child=NonLe

### Select desired (and reasonable) nodes and check their performance

In [18]:
# Weekly order counts by loss_ind (with totals) for the rows matched by a
# candidate rule built from three node conditions.
(
    dataset_df
    .loc[
        (dataset_df['sp_c_fraud_decline_attempt_d7_0'] > 0)
        & (dataset_df['model_online_cb_global_july_2022_score'] > 4.8)
        & (dataset_df['bp_c_merch_side_email_age_days'] <= 5)
    ]
    .pivot_table(
        values='in_flight_order_amount',
        index='order_week',
        columns='loss_ind',
        aggfunc='count',
        margins=True,
    )
)

loss_ind,0.0,1.0,All
order_week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-02-20,1.0,,1
2023-02-27,15.0,,15
2023-03-06,13.0,2.0,15
2023-03-13,24.0,,24
2023-03-20,11.0,,11
2023-03-27,13.0,1.0,14
2023-04-03,12.0,,12
2023-04-10,15.0,3.0,18
2023-04-17,7.0,,7
2023-04-24,1.0,,1
