In [1]:
import sys, warnings, os
sys.path.insert(1, '..')
import pandas as pd
import numpy as np
import pickle as pkl
import lightgbm as lgb
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

plt.style.use('seaborn')

%load_ext autoreload
%autoreload 2

In [2]:
# Feather file holding the modeling frame used throughout this notebook.
# NOTE(review): absolute, machine-specific path -- this cell only runs on a
# host with the same directory layout; consider a configurable data directory.
data_path = (
    '/home/ec2-user/SageMaker/projects-model/money-risk-models/money-customer-risk/artifacts/modeling_df_final_with_oof_preds_replication.feather'
)
modeling_df = pd.read_feather(path=data_path)

In [3]:
# Load the pickled model whose `feature_name_` drives the feature selection below.
# The file is opened in a context manager so the handle is closed as soon as
# unpickling finishes (the previous one-liner left it open until garbage collection).
# NOTE(review): unpickling executes arbitrary code embedded in the file -- only
# load artifacts from a trusted location.
with open('/home/ec2-user/SageMaker/projects-model/money-risk-models/money-customer-risk/models/customer_risk_target_no_giact_time_since_last_link.pkl', 'rb') as model_file:
    model = pkl.load(model_file)

### Get WOE (Weight of Evidence)

In [4]:
from rdsutils.woe import WOE_Transform

In [5]:
%%time
woe = WOE_Transform(min_iv=-np.inf)
woe.fit(modeling_df[model.feature_name_], modeling_df["target"].astype(int), display=-1)

processed  41  num attributes

CPU times: user 1min 7s, sys: 728 ms, total: 1min 7s
Wall time: 1min 7s


In [6]:
# Lift pandas' column-display cap (a session-wide setting) so the wide bin
# table is not truncated, then show the fitted bins for vantage_score.
pd.options.display.max_columns = None
woe.display_bin_results(["vantage_score"], simple=0)

                  attr    min    max    #accts  %accts     #good     #bad  \
0        vantage_score  300.0  380.0     759.0   0.07%     400.0    359.0   
1        vantage_score  381.0  420.0    2295.0   0.21%    1621.0    674.0   
2        vantage_score  421.0  527.0   60243.0   5.55%   43454.0  16789.0   
3        vantage_score  528.0  537.0   10992.0   1.01%    8881.0   2111.0   
4        vantage_score  538.0  542.0    5723.0   0.53%    4632.0   1091.0   
5        vantage_score  543.0  548.0    7637.0   0.70%    6385.0   1252.0   
6        vantage_score  549.0  561.0   20728.0   1.91%   17534.0   3194.0   
7        vantage_score  562.0  583.0   35928.0   3.31%   32473.0   3455.0   
8        vantage_score  584.0  606.0   40436.0   3.72%   37336.0   3100.0   
9        vantage_score  607.0  662.0  115710.0  10.65%  111007.0   4703.0   
10       vantage_score  663.0  746.0  238269.0  21.94%  234775.0   3494.0   
11       vantage_score  747.0  839.0  362663.0  33.39%  360685.0   1978.0   

In [7]:
# Count missing values per model input feature in the raw modeling frame.
modeling_df.loc[:, model.feature_name_].isnull().sum()

first_deposit_amount                   0
vantage_score                     184800
bcc7120                           293534
email_risk_score                   26148
fraud_score_2                     246891
name_email_correlation             26148
transaction_as_pct_of_balance          0
mean_account_balance_30d               0
phone_risk_score                   26148
name_address_correlation           27111
all8220                           198019
lag_acc_open_first_transaction         0
dollar_val_dd                          0
all7120                           242424
sum_deposits_10d                       0
nr_past_transactions                   0
total_tradelines_open             163895
education_loan_amount             163895
address_risk_score                 27111
iqt9415                           178034
max_withdrawals_30d                    0
iln5520                           542546
max_deposits_30d                       0
pct_returned_deposits                  0
giact_nr_decline

### Code Transform: apply the fitted WOE encoding

In [13]:
# Apply the fitted WOE encoding to the model's input features. With the default
# arguments the result holds only the `*_woe` columns (see the shape and the
# column listing in the following cells).
df = woe.transform(modeling_df[model.feature_name_])

transformed num 20
transformed num 40



In [14]:
# Dimensions of the WOE-only frame produced by the transform above.
df.shape

(1086183, 41)

In [16]:
# Per-column missing-value count of the WOE-encoded frame; the output below
# shows zero for every `*_woe` column even though several raw features had NaNs.
df.isna().sum()

first_deposit_amount_woe              0
email_risk_score_woe                  0
transaction_code_encoded_woe          0
giact_nr_decline_woe                  0
dollar_val_dd_woe                     0
nr_past_transactions_woe              0
credit_card_loan_amount_woe           0
giact_nr_other_woe                    0
fraud_score_1_woe                     0
all7120_default_encoded_woe           0
max_deposits_30d_woe                  0
all7120_woe                           0
name_address_correlation_woe          0
mean_account_balance_30d_woe          0
nr_trans_ratio_woe                    0
iqt9413_woe                           0
dollar_val_returns_3d_woe             0
lag_acc_open_first_transaction_woe    0
nr_returns_30d_woe                    0
name_email_correlation_woe            0
iln5520_woe                           0
bal_ratio_woe                         0
bcc7120_default_encoded_woe           0
dollar_val_returns_woe                0
transaction_as_pct_of_balance_woe     0


In [17]:
# Re-run the transform with keep=True, which retains the original feature
# columns alongside their `*_woe` counterparts (the column count doubles below).
# NOTE: this rebinds `df`, replacing the WOE-only frame created earlier.
df = woe.transform(modeling_df[model.feature_name_], keep=True)

transformed num 20
transformed num 40



In [18]:
# Dimensions after keep=True: raw and WOE columns side by side.
df.shape

(1086183, 82)

In [19]:
# Missing-value counts sorted by column name, so each raw feature sits next to
# its `*_woe` column for a direct comparison.
df.isna().sum().sort_index()

address_risk_score                    27111
address_risk_score_woe                    0
age_money_account                         0
age_money_account_woe                     0
all7120                              242424
                                      ...  
transaction_as_pct_of_balance_woe         0
transaction_code_encoded                  0
transaction_code_encoded_woe              0
vantage_score                        184800
vantage_score_woe                         0
Length: 82, dtype: int64

#### Check some features

In [20]:
# Same bin table as displayed right after fitting, repeated here for reference
# next to the transformed values. The display option is session-wide.
pd.options.display.max_columns = None
woe.display_bin_results(["vantage_score"], simple=0)

                  attr    min    max    #accts  %accts     #good     #bad  \
0        vantage_score  300.0  380.0     759.0   0.07%     400.0    359.0   
1        vantage_score  381.0  420.0    2295.0   0.21%    1621.0    674.0   
2        vantage_score  421.0  527.0   60243.0   5.55%   43454.0  16789.0   
3        vantage_score  528.0  537.0   10992.0   1.01%    8881.0   2111.0   
4        vantage_score  538.0  542.0    5723.0   0.53%    4632.0   1091.0   
5        vantage_score  543.0  548.0    7637.0   0.70%    6385.0   1252.0   
6        vantage_score  549.0  561.0   20728.0   1.91%   17534.0   3194.0   
7        vantage_score  562.0  583.0   35928.0   3.31%   32473.0   3455.0   
8        vantage_score  584.0  606.0   40436.0   3.72%   37336.0   3100.0   
9        vantage_score  607.0  662.0  115710.0  10.65%  111007.0   4703.0   
10       vantage_score  663.0  746.0  238269.0  21.94%  234775.0   3494.0   
11       vantage_score  747.0  839.0  362663.0  33.39%  360685.0   1978.0   

In [21]:
# Spot-check the first ten rows: raw vantage_score beside its WOE encoding
# (rows with a missing score still receive a WOE value).
df.loc[:, ["vantage_score", "vantage_score_woe"]].iloc[:10]

Unnamed: 0,vantage_score,vantage_score_woe
0,,-0.2629
1,734.0,-1.1471
2,,-0.2629
3,780.0,-2.1455
4,737.0,-1.1471
5,,-0.2629
6,790.0,-2.1455
7,,-0.2629
8,482.0,2.1095
9,540.0,1.6146
