# Decision Tree Model with TensorFlow Decision Forests

## 1. Load data into Python

In [5]:
#Vega Setup
import logging
import pandas as pd
import numpy as np
from afterpay_gdp_interfaces import RedshiftHook

import datetime
import pytz
# Timezone used when printing wall-clock timestamps in this notebook.
CST = pytz.timezone('Asia/Shanghai')

# Raise the pandas display limits so wide/long frames are shown without truncation.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 500)

# Root logger at INFO so the query-timing messages emitted below are visible.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Shared Redshift hook for the "vega" cluster; used by every query in this notebook.
vega = RedshiftHook(okta_username='songli@squareup.com', cluster='vega')
def vega_execute(query):
    """Execute a SQL statement on Vega and commit it.

    Opens a connection through the module-level ``vega`` hook, runs ``query``
    on a fresh cursor, commits, and logs the elapsed wall-clock time at INFO
    level.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``.

    Returns:
        None. Any result set produced by the statement is not fetched.
    """
    t_start = datetime.datetime.now()
    with vega.get_conn() as vega_conn:
        with vega_conn.cursor() as cur:
            cur.execute(query)
        vega_conn.commit()
        t_end = datetime.datetime.now()
        logging.info("Vega Query Finished. Time used: {}".format(str(t_end - t_start)))
        # NOTE(review): the connection is not closed explicitly here; whether
        # leaving the ``with`` block closes it depends on what get_conn()
        # returns -- confirm before relying on it.
        # vega_conn.close()

Apply some initial filtering when pulling the sample data.

In [6]:
# Sample-pull query against sandbox_analytics_au.feature_base_sl.
# The WHERE clause keeps only rows where:
#   - par_region is 'AU';
#   - the in-flight shipping address line 1 AND city both differ from the
#     consumer's address line 1 / city (compared lower-cased, spaces removed);
#   - the in-flight shipping address line 1 is not an empty string;
#   - days_since_first_order_date is greater than 42;
#   - the latest successful 2FA login (stored as epoch seconds) falls on or
#     after the checkout date minus 30 days;
#   - the in-flight order amount, with NULL/empty treated as 0, exceeds 200.
q="""
select
*
from sandbox_analytics_au.feature_base_sl drv
where
    par_region='AU'
    and lower(replace(in_flight_order_shipping_address_address_1,' ',''))!=lower(replace(consumer_address_1,' ',''))
    and lower(replace(in_flight_order_shipping_address_city,' ',''))!=lower(replace(consumer_city,' ',''))
    and in_flight_order_shipping_address_address_1!=''
    and days_since_first_order_date>42
    and ((TIMESTAMP 'epoch' + c_latest_login_2fa_success_timestamp::FLOAT *INTERVAL '1 second')>=cast(checkout_time as date)-30)
    and (case when nvl(in_flight_order_amount,'0')='' then '0' else in_flight_order_amount end)::float>200
;"""

In [7]:
%%time
print(datetime.datetime.now(CST))
rule_perf=vega.get_pandas_df(q)

2022-12-28 10:13:09.030139+08:00
CPU times: user 17.3 s, sys: 4.39 s, total: 21.7 s
Wall time: 45.7 s


In [8]:
# Calendar date of each checkout (stored as datetime.date objects).
rule_perf['order_date'] = pd.to_datetime(rule_perf['checkout_time']).dt.date

# Monday that starts each order's week: subtract the weekday offset
# (Monday=0 ... Sunday=6) from the order date. Computed column-wise rather than
# with a per-row Python lambda; rows with a missing date yield NaT instead of
# raising.
order_day = pd.to_datetime(rule_perf['order_date'])
rule_perf['order_week'] = (
    order_day - pd.to_timedelta(order_day.dt.weekday, unit='D')
).dt.date



### Check initial fraud rate by week

In [9]:
# Orders per week, split by loss indicator (non-null order amounts are counted).
rule_perf.pivot_table(
    index='order_week',
    columns='loss_ind',
    values='in_flight_order_amount',
    aggfunc='count',
)

loss_ind,0.0,1.0
order_week,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-10-24,9371.0,9.0
2022-10-31,25444.0,23.0
2022-11-07,29454.0,9.0
2022-11-14,31146.0,26.0
2022-11-21,57657.0,37.0
2022-11-28,33077.0,13.0
2022-12-05,32103.0,12.0
2022-12-12,33020.0,5.0
2022-12-19,25828.0,
2022-12-26,6108.0,1.0


## 2. Import related packages
If TensorFlow is not installed yet, run:
```shell
!pip install --user tensorflow  
!pip install --user tensorflow_decision_forests
```

In [10]:
import tensorflow_decision_forests as tfdf
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import math

## 3. Preprocessing
Apply some filtering to the sample data.  
Apply a feature exclusion list, because some features should not be used (or you may not want to use them) in the model.

In [11]:
# --- Row filter -------------------------------------------------------------
# Keep orders dated 2022-11-24 (inclusive) to 2022-12-12 (exclusive) whose
# rule_ids equal '["Approved"]' and whose control_group does not mention
# "control" (case-insensitive). The dates are parsed once and reused.
order_dates = pd.to_datetime(rule_perf['order_date'])
in_window = (order_dates >= '2022-11-24') & (order_dates < '2022-12-12')
is_approved = rule_perf['rule_ids'] == '["Approved"]'
# na=False: a missing control_group counts as "not control" instead of
# producing NaN, which would make the boolean negation below fail.
is_control = rule_perf['control_group'].str.lower().str.contains('control', na=False)
dataset_df = rule_perf[in_window & is_approved & ~is_control].copy()

label = "loss_ind"

# Fill missing labels with 0. Assign the result back instead of calling
# fillna(inplace=True) on a column selection, which is deprecated and has no
# effect under pandas Copy-on-Write.
dataset_df[label] = dataset_df[label].fillna(0)

classes = dataset_df[label].unique().tolist()
print(f"Label classes: {classes}")

# --- Feature exclusion ------------------------------------------------------
# Columns that must not be used as predictors, listed by name ...
excluded_by_name = [
    'checkout_time', 'checkout_time_f', 'ato_ind', 'sf_id', 'gwr_ind', 'par_process_date',
    'in_flight_order_shipping_address_hash', 'rule_ids', 'consumer_city',
    'gmv_local', 'gmv_aud', 'order_week', 'sf_ind', 'consumer_given_names',
    'c_latest_email_login_2fa_success_timestamp', 'session_user_agent',
    'merchant_id', 'merchant_id_f', 'consumer_id', 'consumer_email', 'order_date',
    'consumer_id_f', 'in_flight_order_consumer_email', 'consumer_mobile',
    'in_flight_order_consumer_name', 'consumer_name', 'consumer_postcode',
    'consumer_birth_date_epoch_millis', 'highest_delphi_score_last_12_hours',
]
# ... plus every column whose name contains "time", and every "_address"
# column that is not a "geo" column.
all_columns = dataset_df.columns
time_columns = all_columns[all_columns.str.contains('time')].to_list()
address_columns = all_columns[
    all_columns.str.contains('_address') & ~all_columns.str.contains('geo')
].to_list()
excluded_columns = set(excluded_by_name) | set(time_columns) | set(address_columns)

# Select in the original column order so the feature layout is identical on
# every run (a set difference would give an arbitrary, per-kernel order).
feature_columns = [col for col in all_columns if col not in excluded_columns]
dataset_df2 = dataset_df[feature_columns].copy()

# --- Format conversion ------------------------------------------------------
# Boolean columns are converted to their string form ("True"/"False").
bool_columns = dataset_df2.columns[dataset_df2.dtypes == 'bool']
if len(bool_columns) > 0:
    dataset_df2[bool_columns] = dataset_df2[bool_columns].astype(str)

Label classes: [0.0, 1.0]


## 4. Train Test split
Test ratio controls the test sample size.

In [12]:
def split_dataset(dataset, test_ratio=0.1, seed=None):
    """Randomly split a pandas DataFrame into train and test parts.

    Each row is independently assigned to the test part with probability
    ``test_ratio``, so the resulting sizes are approximate, not exact.

    Args:
        dataset: DataFrame to split.
        test_ratio: Probability that a row goes to the test part.
        seed: Optional seed for a reproducible split. When ``None`` (default)
            the draw uses NumPy's global random state, as before.

    Returns:
        A ``(train, test)`` tuple of DataFrames that together contain every
        row of ``dataset`` exactly once.
    """
    n_rows = len(dataset)
    if seed is None:
        draws = np.random.rand(n_rows)
    else:
        # Dedicated generator: reproducible and leaves the global state untouched.
        draws = np.random.default_rng(seed).random(n_rows)
    # Boolean mask (not positional indices): True marks a test row.
    test_mask = draws < test_ratio
    return dataset[~test_mask], dataset[test_mask]

# Hold out a random test sample (default ratio) from the prepared feature frame.
train_ds_pd, test_ds_pd = split_dataset(dataset_df2)
print("{} examples in training, {} examples for testing.".format(
    len(train_ds_pd), len(test_ds_pd)))

# Wrap both parts as TensorFlow datasets, with `label` as the target column.
train_ds, test_ds = (
    tfdf.keras.pd_dataframe_to_tf_dataset(part, label=label)
    for part in (train_ds_pd, test_ds_pd)
)

98057 examples in training, 10922 examples for testing.


  features_dataframe = dataframe.drop(label, 1)


## 5. Build Model
Some key parameters: 
- min_examples: controls the minimum number of examples in a node. You need to balance between overfitting and fraud rate. 
- max_depth: depth of the tree

In [None]:
%%time
# build model
model = tfdf.keras.RandomForestModel(min_examples=30,max_depth=5,sampling_with_replacement=False,bootstrap_size_ratio=1.0,num_trees=1)

# train model
model.fit(x=train_ds, validation_data=test_ds)

# evaluate model
# model.compile(metrics=["accuracy"])
# evaluation = model.evaluate(test_ds, return_dict=True)
# print()

# for name, value in evaluation.items():
#   print(f"{name}: {value:.4f}")

## 6. Visualize the tree 

In [24]:
# Display a previously saved tree rendering from disk. The import is explicit
# because the bare name `IPython` is not otherwise bound in this notebook, so
# the original call raised NameError on a fresh kernel.
# NOTE(review): the absolute path is environment-specific and the file must
# already exist -- confirm before re-running elsewhere.
from IPython.display import HTML

HTML(filename='/mnt/efs/jupyter-songli/Unauthorized_Fraud_Risk/SurgicalAutomation/plot.html')

In [14]:
from IPython.display import IFrame

# Render tree 0 of the fitted model to HTML and save it next to the notebook.
# The encoding is explicit so the file does not depend on the platform's
# default text encoding.
with open("plot.html", "w", encoding="utf-8") as f:
    f.write(tfdf.model_plotter.plot_model(model, tree_idx=0, max_depth=8))

# Embed the saved page; last expression so the notebook displays it.
IFrame(src='./plot.html', width=2000, height=600)

### Select desired (and reasonable) nodes and check their performance

In [16]:
# Weekly order counts by loss indicator (with totals) for the rows matching
# the selected node: all three conditions must hold.
(
    dataset_df
    .loc[
        (dataset_df['sp_c_fraud_decline_attempt_d7_0'] > 0)
        & (dataset_df['model_online_cb_global_july_2022_score'] > 4.8)
        & (dataset_df['bp_c_merch_side_email_age_days'] <= 5)
    ]
    .pivot_table(
        index='order_week',
        columns='loss_ind',
        values='in_flight_order_amount',
        aggfunc='count',
        margins=True,
    )
)

loss_ind,0.0,1.0,All
order_week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-11-21,18,9,27
2022-11-28,20,3,23
2022-12-05,14,5,19
All,52,17,69
