## Build Alternative Model With WOE transform
---

Build models
* with WOE applied after feature clipping
* log reg
* rf

In [1]:
# try:
#     import autogluon
# except:
#     !python3 -m pip install --upgrade pip
#     !python3 -m pip install --upgrade "mxnet<2.0.0"
#     !python3 -m pip install autogluon
#     !pip install bokeh==2.0.1

In [2]:
import os, sys
import json
import pickle as pkl
import pandas as pd
# import autogluon as ag
# from autogluon import TabularPrediction as task

from rdsutils import datagen
from rdsutils import plot
from utils import preprocess

%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings(action='ignore')

In [3]:
modeling_df = pd.read_parquet('../../artifacts/final/modeling_df_w_preds.parquet')
test_df = pd.read_parquet('../../artifacts/final/test_df_w_preds.parquet')
modeling_df.shape, test_df.shape

((112599, 241), (152951, 241))

In [4]:
# # should have been processed in previous iterations
modeling_df = preprocess(modeling_df)
test_df = preprocess(test_df)

In [5]:
seed = 12345
target_col = 'ach_target'

In [6]:
display(modeling_df[target_col].value_counts(), 
        test_df[target_col].value_counts())

False    108825
True       3774
Name: ach_target, dtype: int64

False    145410
True       7541
Name: ach_target, dtype: int64

#### Load params and features

In [7]:
with open('../../models/ach-model.pkl', 'rb') as f:
    model = pkl.load(f)

In [8]:
id_col = 'business_account_number'
params = model.get_params()
features = model.feature_name_

#### WOE Transform

In [9]:
import numpy as np
from rdsutils.woe import WOE_Transform

In [10]:
%%time 

# fit woe
woe = WOE_Transform(min_iv=-np.inf)
woe.fit(modeling_df[features], modeling_df[target_col].astype(int), display=-1)
df = woe.transform(modeling_df[features], train_data=1, keep=False)

# merge transformed data and record features
modeling_df = modeling_df.merge(df, how='inner', left_index=True, right_index=True)
features_woe = modeling_df.columns[modeling_df.columns.str.contains("woe")]

processed  26  num attributes

transformed num 20

CPU times: user 8.07 s, sys: 687 ms, total: 8.76 s
Wall time: 8.67 s


In [11]:
df = woe.transform(test_df[features], train_data=0, keep=False)
test_df = test_df.merge(df, how='inner', left_index=True, right_index=True)

transformed num 20



In [12]:
features = [f+'_woe' for f in features]
modeling_df[features].head()

Unnamed: 0,vantage_score_woe,first_deposit_amount_woe,bcc7120_woe,credit_card_loan_amount_woe,plaid_max_avail_bal_woe,total_tradelines_open_woe,plaid_days_since_first_link_woe,nr_transactions_30d_div_nr_past_transactions_woe,plaid_min_avail_bal_woe,nr_transactions_per_day_woe,...,phone_risk_score_woe,quovo_available_bal_woe,email_risk_score_woe,deposits_ratio_woe,fraud_score_1_woe,mean_deposits_10d_div_mean_deposits_woe,fraud_score_2_woe,nr_past_deposits_woe,quovo_min_avail_bal_woe,address_risk_score_woe
6013,-0.619,-0.7765,1.0849,0.9454,-0.2405,1.0071,-0.9554,-1.4264,-0.2405,-0.1264,...,-1.2295,0.5332,-0.6591,-0.9281,-1.2511,0.3065,-0.5257,-1.6798,0.5332,-0.8118
6452,-0.619,-0.7765,1.0849,0.9454,-0.2405,1.0071,-0.9554,-1.4264,-0.2405,-0.1264,...,-1.2295,0.1177,-0.6591,-0.8447,0.3229,-0.6188,-0.5257,-1.3427,-0.2041,0.0362
6457,-0.619,-0.7765,1.0849,0.9454,-0.2405,1.0071,-0.9554,-1.4264,-0.2405,-0.1264,...,-1.2295,0.1177,-0.6591,-0.9281,0.3229,-0.6188,-0.5257,-1.3427,-0.2041,0.0362
6458,-0.619,-0.7765,1.0849,0.9454,-0.2405,1.0071,-0.9554,-1.4264,-0.2405,-0.1264,...,-1.2295,0.1177,-0.6591,-0.9281,0.3229,-0.6188,-0.5257,-1.3427,-0.2041,0.0362
6460,-0.619,-0.7765,1.0849,0.9454,-0.2405,1.0071,-0.9554,-1.4264,-0.2405,-0.1264,...,-1.2295,0.1177,-0.6591,-0.8447,0.3229,-0.6188,-0.5257,-1.3427,-0.2041,0.0362


### Train-test-split
---

In [13]:
modeling_dfs = datagen.GroupKFoldGenerator(modeling_df, 5, seed=seed,
                              strategize_by='ach_target', 
                              groupby='business_account_number')
train, test = next(modeling_dfs)

In [14]:
modeling_dfs_ = datagen.GroupKFoldGenerator(train, 4, seed=seed,
                              strategize_by='ach_target', 
                              groupby='business_account_number')
train, valid = next(modeling_dfs_)

In [15]:
train.shape, valid.shape, test.shape, modeling_df.shape

((67559, 269), (22520, 269), (22520, 269), (112599, 269))

### AutoGluon Data
---

In [16]:
pos_wgt_scaling_factor = datagen.get_positive_label_weight(modeling_df[target_col])

# train_data = task.Dataset(df=train[features+[target_col]+[id_col]])
# valid_data = task.Dataset(df=valid[features+[target_col]+[id_col]])
# test_data = task.Dataset(df=test[features+[target_col]+[id_col]])

In [17]:
# train_data.head()

### LR Train Models
---

In [18]:
# Just use sklearn

from scipy.stats import uniform
from sklearn import linear_model, datasets
from sklearn.model_selection import RandomizedSearchCV

In [19]:
logistic = linear_model.LogisticRegression(class_weight={True: 28.835453100158983, False: 1})

# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter distribution using uniform distribution
C = uniform(loc=0, scale=10)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

In [20]:
# Create randomized search 5-fold cross validation and 100 iterations
clf = RandomizedSearchCV(logistic, hyperparameters, random_state=1, n_iter=20, cv=5, verbose=0, n_jobs=-1)

In [21]:
%%time

# Fit randomized search
X = train[features_woe]
y = train[target_col]
best_model = clf.fit(X, y, groups=train[id_col])

CPU times: user 3.73 s, sys: 3.13 s, total: 6.86 s
Wall time: 4.2 s


In [22]:
best_model.best_estimator_.get_params()

{'C': 9.325573593386588,
 'class_weight': {True: 28.835453100158983, False: 1},
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [23]:
best_model.best_estimator_

LogisticRegression(C=9.325573593386588,
                   class_weight={False: 1, True: 28.835453100158983})

In [24]:
import pickle as pkl

with open("../../artifacts/final/log-reg.pkl", "wb") as f:
    pkl.dump(best_model.best_estimator_, f)

### RF
---

In [25]:
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import uniform, truncnorm, randint

rf = RandomForestClassifier(random_state=42, 
                            class_weight={True: 28.835453100158983, False: 1})
from pprint import pprint

# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())


Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': {False: 1, True: 28.835453100158983},
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [26]:
model_params = {
    'n_estimators': randint(4,500),
    'max_features': truncnorm(a=0, b=1, loc=0.25, scale=0.1),
    'min_samples_split': uniform(0.01, 0.199),
    'max_depth': uniform(3, 3)
}

In [27]:
# Create randomized search 5-fold cross validation and 100 iterations
clf = RandomizedSearchCV(rf, model_params, random_state=1, n_iter=20, cv=5, verbose=0, n_jobs=-1)

In [28]:
%%time

# Fit randomized search
X = train[features_woe]
y = train[target_col]
best_model = clf.fit(X, y, groups=train[id_col])

CPU times: user 7.54 s, sys: 52.4 ms, total: 7.59 s
Wall time: 59 s


In [29]:
best_model.best_estimator_.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': {True: 28.835453100158983, False: 1},
 'criterion': 'gini',
 'max_depth': 4.251066014107722,
 'max_features': 0.3161577490440254,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 0.010022760588651633,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 207,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [30]:
import pickle as pkl

with open("../../artifacts/final/random-forest.pkl", "wb") as f:
    pkl.dump(best_model.best_estimator_, f)