### Build ACH model - iter2 - final
---

In [1]:
import os, sys
import pickle as pkl
import pandas as pd
import autogluon as ag
from autogluon import TabularPrediction as task

from rdsutils import datagen
import rdsutils.plot as rdsplot
from rdsutils.lightgbm_helpers import train_lgb_baseline_grouped

sys.path.insert(1, '../../')
from src.utils import preprocess

%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Load the modeling frame and the test frame from the `20201005` artifacts
# directory. The modeling file name suggests baseline-model predictions are
# already attached as columns.
modeling_df = pd.read_parquet('../../artifacts/20201005/modeling_df_w_baseline_preds.parquet')
test_df = pd.read_parquet('../../artifacts/20201005/test_df.parquet')
# Show (rows, columns) of both frames as a quick sanity check.
modeling_df.shape, test_df.shape

In [3]:
# Run the project preprocessing (src.utils.preprocess) on the modeling frame.
# NOTE(review): the result overwrites `modeling_df`, so re-running this cell
# feeds already-preprocessed data back into preprocess(). `test_df` is not
# preprocessed anywhere in this section — confirm that is intended.
modeling_df = preprocess(modeling_df)

In [4]:
# Class balance of the raw `is_returned` label in the modeling frame.
modeling_df['is_returned'].value_counts()

False    108825
True       3774
Name: is_returned, dtype: int64

In [5]:
# Class balance of the raw `is_returned` label in the test frame.
test_df['is_returned'].value_counts()

False    84689
True      4201
Name: is_returned, dtype: int64

In [6]:
# Copy existing columns under additional names (alias -> source). The source
# columns are kept; `ach_target` is the training target used further down.
for alias_col, source_col in (
    ('account_ending_balance', 'real_ending_balance'),
    ('days_since_first_transaction', 'days_since_first_deposit'),
    ('ach_target', 'is_returned'),
):
    modeling_df[alias_col] = modeling_df[source_col]

### V2 Model Fitting
---

In [7]:
# Load the list of model features; the file name suggests highly correlated
# features were already removed upstream.
# A context manager closes the file handle as soon as unpickling finishes
# (the bare open(...) left it open until garbage collection).
# NOTE: pickle can execute arbitrary code on load — only open trusted artifacts.
with open('../../artifacts/20201005/features_corr_removed.pkl', 'rb') as features_file:
    features = pkl.load(features_file)

In [8]:
import matplotlib.pyplot as plt
import seaborn as sns

# Optional visual sanity check: one 100-bin histogram per model feature.
# Disabled by default; set to True to render the plots.
check_features = False
if check_features:
    for f in features:
        try:
            modeling_df[f].hist(bins=100)
            plt.title(f)
            plt.show()
        except Exception as exc:
            # Catch ordinary errors only, so a kernel interrupt still stops the
            # loop, and report why the feature could not be plotted instead of
            # printing just its name.
            print(f, '-', repr(exc))

In [None]:
import json

# Load the saved LightGBM hyper-parameters; they are overlaid on the default
# `params` dict in the next cell.
# NOTE(review): the variable name suggests these came from an AutoGluon
# search — confirm the provenance of this JSON file.
with open('../../artifacts/20201005/final_lgbm_params.json', 'r') as f:
    params_autogluon = json.load(f)

In [28]:
seed = 12345

target_col = 'ach_target'
count_pos = modeling_df[target_col].sum()
# Count negatives with an explicit comparison instead of `~`: on an integer
# 0/1 target `~` is a bitwise NOT (0 -> -1, 1 -> -2) and would yield a
# negative count. `== 0` gives the same result for boolean labels.
count_neg = (modeling_df[target_col] == 0).sum()
if count_pos == 0:
    # Without positives the class-weight ratio below is undefined.
    raise ValueError("no positive rows in '%s'; cannot compute scale_pos_weight" % target_col)
# Weight positives by the negative/positive ratio to offset class imbalance.
pos_wgt_scaling_factor = count_neg / count_pos

# Default LightGBM settings; any key also present in the loaded JSON is
# overridden below.
params = {
    "objective" : "binary",
    "metric" : "auc",
    "boosting": 'gbdt',
    "max_depth" : 3,
    "num_leaves" : 10,
    "learning_rate" : 0.02,
    "feature_fraction" : 0.6,
    "lambda_l1": 10,
    "lambda_l2": 10,
    "min_data_in_leaf": 50,
    "scale_pos_weight": pos_wgt_scaling_factor,
    "tree_learner": "serial",
    "boost_from_average": "false",
    "bagging_seed" : seed,
    "verbosity" : -1,
    "seed": seed
}

# Overlay the tuned parameters loaded from JSON. Loaded values win, including
# `scale_pos_weight` / `seed` if the file defines them.
params.update(params_autogluon)

In [None]:
# Fit LightGBM through the rdsutils grouped-training helper. It returns four
# values, unpacked here as predictions, feature importances, fitted models
# and an AUC value (going by the variable names).
# NOTE(review): presumably `pred` holds out-of-fold predictions aligned with
# the rows of modeling_df — confirm in rdsutils.lightgbm_helpers.
pred, fimp, models, auc = train_lgb_baseline_grouped(modeling_df, features, 
                                                     params, target_col, seed=seed)
# Store the new model's scores next to the existing baseline score columns.
modeling_df['boruta_pred_1005_clean_features'] = pred


In [None]:
# Plot feature importances from the fit above (`fimp.feature` / `fimp.importance`).
# NOTE(review): max_n_features=-1 presumably means "show every feature" —
# confirm in rdsutils.plot.
rdsplot.display_feature_importance(fimp.feature, 
                                   fimp.importance,
                                   max_n_features=-1)

In [None]:
# Distribution of the new model's scores over the modeling frame (100 bins).
modeling_df['boruta_pred_1005_clean_features'].hist(bins=100)


### Get Stats
---

In [None]:
import scikitplot as skplt
import matplotlib.pyplot as plt
from rdsutils.plot import plot_auc_curve_mult, plot_pr_curve_mult, plot_feature_over_time

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases removed the old names. Use whichever name this
# installation provides so the cell runs on both old and new Matplotlib.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

def get_binary_metrics(y_true, y_pred):
    """Score one set of predictions against binary labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted scores; passed as ``y_score`` to the sklearn metrics and
        as ``y_probas`` to scikit-plot's ``binary_ks_curve``.

    Returns
    -------
    dict
        ``'auc'`` (ROC AUC), ``'ap'`` (average precision) and ``'ks'`` (the
        fourth value returned by ``binary_ks_curve``), each multiplied by
        100 and rounded to two decimals.
    """
    from sklearn.metrics import roc_auc_score, average_precision_score
    from scikitplot.helpers import binary_ks_curve

    def as_pct(value):
        # Express a 0-1 metric as a percentage with two decimals.
        return round(value*100, 2)

    auc_pct = as_pct(roc_auc_score(y_true=y_true, y_score=y_pred))
    ap_pct = as_pct(average_precision_score(y_true=y_true, y_score=y_pred))

    # Only the fourth of the six returned values is used here.
    _thr, _pct1, _pct2, ks_stat, _max_at, _classes = binary_ks_curve(
        y_true=y_true, y_probas=y_pred)

    return {'auc': auc_pct, 'ap': ap_pct, 'ks': as_pct(ks_stat)}


def get_pred_reports(df, target_col, pred_cols):
    """Tabulate binary metrics for several prediction columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target column and every prediction column.
    target_col : str
        Name of the label column.
    pred_cols : iterable of str
        Names of the score columns to evaluate.

    Returns
    -------
    pandas.DataFrame
        One row per prediction column, one column per metric key returned
        by ``get_binary_metrics``.
    """
    import pandas as pd

    per_model = {
        name: get_binary_metrics(df[target_col], df[name])
        for name in pred_cols
    }
    # Transpose so models are rows and metrics are columns.
    return pd.DataFrame(per_model).T

In [None]:
# AUC / AP / KS (in %) on the modeling frame for the existing score columns
# and the newly fitted model.
# NOTE(review): every column except `boruta_pred_1005_clean_features` is
# presumably loaded with the parquet file — confirm. The comparison is only
# fair if the new model's scores are out-of-fold.
metrics = get_pred_reports(modeling_df, target_col, 
                           ['deposit_v1_pred', 
                            'deposit_v1_updated_pred', 
                            'customer_pred',
                            'boruta_pred_1005', 
                            'boruta_pred_1005_clean_features'])

metrics

In [None]:
# (scores, legend label) pairs for the models compared in the curves; the
# ROC cell that follows reuses this `preds` list.
preds = [
    (modeling_df[score_col], score_col)
    for score_col in ('deposit_v1_pred',
                      'customer_pred',
                      'boruta_pred_1005_clean_features')
]

title = 'Precision-Recall curve: Baseline Comparison'
plot_pr_curve_mult(modeling_df[target_col], preds, title=title, colors=['r', 'g', 'b'])

In [None]:
# ROC curves for the same (scores, label) pairs; relies on `preds` from the
# precision-recall cell, so that cell must run first.
title = 'AUC-ROC curve: Baseline Comparison'
plot_auc_curve_mult(modeling_df[target_col], preds,
                   title=title, colors = ['r', 'g', 'b'])

### Compare with valid FICO
---

In [None]:
# Keep only rows with a usable FICO score: present and no greater than 850.
# Take an explicit copy: `fico_pred` is added below, and assigning a column
# on a filtered slice of modeling_df goes through pandas' SettingWithCopy
# path (the warning is silenced at the top of the notebook).
df_w_fico = modeling_df[
    modeling_df.fico_score.notna() & (modeling_df.fico_score <= 850)
].copy()

# A good FICO score should mean a lower fraud probability, so negate it:
# higher `fico_pred` then points the same way as the target.
df_w_fico['fico_pred'] = -df_w_fico['fico_score']
metrics = get_pred_reports(df_w_fico, target_col,
                           ['fico_pred',
                            'deposit_v1_pred',
                            'customer_pred',
                            'boruta_pred_1005_clean_features'])

metrics

In [None]:
# (scores, legend label) pairs on the FICO subset; the ROC cell that follows
# reuses this `preds` list.
preds = [
    (df_w_fico[score_col], score_col)
    for score_col in ('fico_pred',
                      'deposit_v1_pred',
                      'customer_pred',
                      'boruta_pred_1005_clean_features')
]

title = 'Precision-Recall curve: Baseline Comparison'
plot_pr_curve_mult(df_w_fico[target_col], preds, title=title, colors=['r', 'g', 'b', 'orange'])

In [None]:
# ROC curves on the FICO subset; relies on `preds` from the FICO
# precision-recall cell, so that cell must run first.
title = 'AUC-ROC curve: Baseline Comparison'
plot_auc_curve_mult(df_w_fico[target_col], preds,
                   title=title, colors = ['r', 'g', 'b', 'orange'])

### Study Performance on Segments
---

In [None]:
# Score column analysed in the segment study below (the model fitted in this notebook).
pred_col = 'boruta_pred_1005_clean_features'


In [None]:
# Overall score distribution for `pred_col` (100 bins).
modeling_df[pred_col].hist(bins=100)


In [None]:
# NOTE(review): presumably plots the score histogram separately for each
# target class — confirm in rdsutils.plot.
rdsplot.hist_by_target(pred_col, target_col, modeling_df)

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score

def print_metric_by_time_bin(modeling_df, target_col, pred_col, metric_fn=roc_auc_score):
    """Print ``metric_fn`` separately for May, June and July 2020.

    Rows are assigned to a month by ``transaction_datetime`` using half-open
    intervals ``[first day of month, first day of next month)``. The previous
    inclusive upper bound at midnight of the last calendar day dropped every
    transaction made later that day.

    Parameters
    ----------
    modeling_df : pandas.DataFrame
        Must contain ``transaction_datetime``, ``target_col`` and ``pred_col``.
    target_col : str
        Name of the label column.
    pred_col : str
        Name of the score column.
    metric_fn : callable, default ``roc_auc_score``
        Called as ``metric_fn(y_true=..., y_score=...)``.

    Returns
    -------
    None
        Results are printed, one line per month.
    """
    # (printed label, inclusive start, exclusive end)
    month_bins = (
        ("May 2020:", '2020-05-01', '2020-06-01'),
        ("June 2020:", '2020-06-01', '2020-07-01'),
        ("July 2020:", '2020-07-01', '2020-08-01'),
    )
    txn_time = modeling_df.transaction_datetime

    for label, start, end in month_bins:
        in_month = (txn_time >= pd.to_datetime(start)) & (txn_time < pd.to_datetime(end))
        dtmp = modeling_df[in_month]
        print(label, metric_fn(y_true=dtmp[target_col], y_score=dtmp[pred_col]))


In [None]:
# Month-by-month ROC AUC for the selected score column.
print('--- AUC ---')
print_metric_by_time_bin(modeling_df, target_col, pred_col, roc_auc_score)

# Same breakdown using average precision.
print('--- AP ---')
print_metric_by_time_bin(modeling_df, target_col, pred_col, average_precision_score)

In [None]:
# Segment by transaction history: 10 or more past transactions counts as an
# "older" account, fewer than 10 as a "younger" one.
modeling_df_older_accs = modeling_df.loc[modeling_df['nr_past_transactions'] >= 10]
modeling_df_younger_accs = modeling_df.loc[modeling_df['nr_past_transactions'] < 10]

print('Older account: ',
      roc_auc_score(modeling_df_older_accs[target_col],
                    modeling_df_older_accs[pred_col]))
print('Younger account: ',
      roc_auc_score(modeling_df_younger_accs[target_col],
                    modeling_df_younger_accs[pred_col]))

In [None]:
# Segment by credit data availability: rows with vs. without a FICO score.
modeling_df_fico = modeling_df.loc[modeling_df['fico_score'].notna()]
modeling_df_no_fico = modeling_df.loc[modeling_df['fico_score'].isna()]

print('w FICO account: ',
      roc_auc_score(modeling_df_fico[target_col],
                    modeling_df_fico[pred_col]))
print('no FICO account: ',
      roc_auc_score(modeling_df_no_fico[target_col],
                    modeling_df_no_fico[pred_col]))

#### Takeaway

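Challenges (segments flagged by the comparisons above):
- Older accounts (`nr_past_transactions >= 10`)
- Accounts without credit data (missing `fico_score`)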