In [1]:
import sys, os, json, copy, sklearn, shap, gc, joblib, copy, time, ast
import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm import LGBMClassifier
import pickle as pkl
from tqdm import tqdm
from smart_open import open
import seaborn as sns
from sklearn.metrics import roc_curve,roc_auc_score, precision_score, recall_score, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from rdsutils.woe import WOE_Transform                                            
from rdsutils.metrics import get_pred_reports


sys.path.insert(1, "./to_MRM/utility")
from performance_eval_plot import score_gain, plot_efficient_frontier, plot_percentile_bad_rate
import performance_eval_v3 as p_eval
import data_summary, woe, data_eda, psi
from governance import get_risk_plot,get_feature_by_importance, get_pdp, get_shap_dependence, pickle_obj, save_shap_dependence,\
    save_pdp, plot_hist, get_woe_plots

# Notebook-wide pandas display options: show at most 20 columns, allow
# 500 characters per printed row, and never truncate cell contents.
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', None)

# Re-import modified modules automatically before each cell runs, so edits to
# the local utility modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Record the LightGBM version used for this run.
print(lgb.__version__)

3.2.1


In [2]:
# S3 locations of the project artifacts. The JSON config loaded here supplies
# the data and dictionary paths used by the cells below.
artifact_path = "s3://sofi-data-science/hpeng/pl-gen4/artifact"
config_path = "s3://sofi-data-science/hpeng/pl-gen4/artifact/config.json"

# `open` is the smart_open version imported at the top of the notebook, so the
# s3:// URI can be opened directly.
with open(config_path, "r") as f:
    config = json.load(f)

In [3]:
# Load the two attribute dictionaries referenced by the config's "meta" section.
# `exp_dict` is used below for its `field_name` and `categorical` columns.
meta_paths = config["meta"]
exp_dict = pd.read_csv(meta_paths["exp_dict_path"])
af_dict = pd.read_csv(meta_paths["af_member_cust_dict_path"])

In [5]:
def _load_final_model(segment):
    """Unpickle the final model for one segment from ./to_MRM/artifact/.

    Parameters
    ----------
    segment : str
        File-name prefix of the model ('member', 'non_member' or 'prescreen').

    Returns
    -------
    object
        The unpickled model object.
    """
    # The context manager closes the file handle once loading is done; the
    # previous `pkl.load(open(...))` form left every handle open.
    with open('./to_MRM/artifact/' + segment + '_final_model.pkl', 'rb') as model_file:
        # NOTE(review): pickle executes arbitrary code on load, so only load
        # artifacts from a trusted location. The captured output also shows a
        # scikit-learn version mismatch warning (pickled with 0.24.1, loaded
        # with 1.0.1); confirm that predictions are unaffected.
        return pkl.load(model_file)


model_member = _load_final_model('member')
model_non_member = _load_final_model('non_member')
model_prescreen = _load_final_model('prescreen')

Trying to unpickle estimator LabelEncoder from version 0.24.1 when using version 1.0.1. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [6]:
# Every column needed downstream: the inputs of the three models plus the
# weight, target, benchmark and meta columns named in the config, and three
# extra columns that are listed explicitly.
data_columns = config["data_columns"]
feature_list = (
    model_prescreen.feature_name_
    + model_member.feature_name_
    + model_non_member.feature_name_
    + data_columns["weights"]
    + data_columns["target_cols"]
    + data_columns["benchmark_cols"]
    + data_columns["meta_cols"]
    + ['all_time_lending_member_flag']
    + ['funds_use_encode_v2', 'income_update']
)
# De-duplicate while keeping first-seen order. `dict.fromkeys` gives the same
# result as `pd.unique(...).tolist()` on a list of names, without passing a
# plain Python list to `pd.unique`, which newer pandas versions deprecate.
feature_list = list(dict.fromkeys(feature_list))

In [7]:
# Read the two cleaned feature files named in the config, keeping only the
# columns in `feature_list`, and stack them into one frame.
# NOTE(review): the frames are named "oot" but are read from the config's
# "all_features_dev*" entries; confirm which sample this is meant to be.
clean_paths = config["data"]["clean"]
df_oot1_full = pd.read_parquet(clean_paths["all_features_dev1"], columns=feature_list)
df_oot2_full = pd.read_parquet(clean_paths["all_features_dev2"], columns=feature_list)
df_oot_full = pd.concat([df_oot1_full, df_oot2_full], axis=0, ignore_index=True)

# Segment label: 'member' when the lending-member flag equals 1, otherwise
# 'non_member' (a missing flag compares unequal to 1 and so lands here too).
is_member = df_oot_full['all_time_lending_member_flag'] == 1
df_oot_full['seg'] = np.where(is_member, 'member', 'non_member')

In [8]:
def categorical_to_nan(series, categorical_list):
    """Replace every listed special value in ``series`` with NaN.

    Parameters
    ----------
    series : pandas.Series
        Column whose special codes should be nulled out.
    categorical_list : sequence
        Values to be treated as missing.

    Returns
    -------
    pandas.Series
        ``series`` itself when ``categorical_list`` is empty, otherwise a new
        series with each listed value replaced by NaN.
    """
    if len(categorical_list) != 0:
        # Map each special value to NaN and let pandas do the substitution.
        to_nan = dict.fromkeys(categorical_list, np.nan)
        return series.replace(to_nan)
    return series

# Columns for which no special-value list could be applied. This happens when
# the column is missing from `exp_dict`, has no parsable `categorical` entry,
# or fails during replacement.
no_special_list = []

for col in tqdm(feature_list):
    try:
        # `categorical` is parsed as a Python literal holding the special
        # codes for the column; the codes are cast to int before matching.
        special_val = ast.literal_eval(exp_dict[exp_dict.field_name == col].categorical.iloc[0])
        special_val = [int(i) for i in special_val]
        df_oot_full[col] = categorical_to_nan(df_oot_full[col], special_val)
    except Exception:
        # Deliberately best-effort: any failure just records the column.
        # Catching `Exception` instead of a bare `except:` keeps that
        # behaviour but no longer swallows KeyboardInterrupt / SystemExit,
        # so this loop over the full frame can be interrupted.
        no_special_list.append(col)

len(no_special_list)

100%|██████████| 88/88 [00:12<00:00,  6.87it/s]


30

In [10]:
def _load_imputed_values(segment):
    """Unpickle the per-feature imputation values saved for one segment."""
    # The context manager closes the file handle after loading.
    # NOTE(review): only unpickle artifacts from a trusted location.
    with open('./to_MRM/artifact/' + segment + '_final_imputed_val.pkl', 'rb') as impute_file:
        return pkl.load(impute_file)


def _impute_model_features(frame, model, impute_vals):
    """Fill missing values of each model input in ``frame`` with its stored value.

    Parameters
    ----------
    frame : pandas.DataFrame
        Segment frame, modified in place column by column.
    model : object
        Fitted model exposing ``feature_name_``.
    impute_vals : mapping
        Lookup from feature name to fill value.
    """
    for f in tqdm(model.feature_name_):
        try:
            # Assign the filled column back instead of using the chained
            # `frame[f].fillna(..., inplace=True)`. The chained form works on
            # a temporary object and leaves the frame untouched when pandas
            # Copy-on-Write is active.
            frame[f] = frame[f].fillna(impute_vals[f])
        except Exception:
            # Best-effort, as before: a feature with no usable imputation
            # value is reported and skipped. Unlike a bare `except:`, this
            # does not swallow KeyboardInterrupt / SystemExit.
            print("no special value for " + f)


# Independent copies per segment, so imputing one does not affect the others.
# The prescreen frame covers every row.
df_oot_full_member_ = df_oot_full[df_oot_full.seg=='member'].copy()
df_oot_full_non_member_ = df_oot_full[df_oot_full.seg=='non_member'].copy()
df_oot_full_prescreen_ = df_oot_full.copy()

impute_vals_member = _load_imputed_values('member')
impute_vals_non_member = _load_imputed_values('non_member')
impute_vals_prescreen = _load_imputed_values('prescreen')

_impute_model_features(df_oot_full_member_, model_member, impute_vals_member)
_impute_model_features(df_oot_full_non_member_, model_non_member, impute_vals_non_member)
_impute_model_features(df_oot_full_prescreen_, model_prescreen, impute_vals_prescreen)

100%|██████████| 31/31 [00:00<00:00, 1041.66it/s]


no special value for p13_iqz9420
no special value for p13_iqz9426
no special value for sofi_num_inq_12month


 35%|███▌      | 11/31 [00:00<00:00, 98.33it/s]

no special value for p13_iqz9427


100%|██████████| 31/31 [00:00<00:00, 99.02it/s] 


no special value for sofi_num_inq_1month
no special value for sofi_num_inq_3month
no special value for sofi_num_inq_personal_finance


  0%|          | 0/31 [00:00<?, ?it/s]

no special value for p13_iqz9420


 71%|███████   | 22/31 [00:00<00:00, 86.23it/s] 

no special value for p13_iqz9425


100%|██████████| 31/31 [00:00<00:00, 79.20it/s]


In [11]:
# Underwriting probability: each segment frame is scored by its own model,
# taking the second column of `predict_proba`.
for seg_frame, seg_model in (
    (df_oot_full_member_, model_member),
    (df_oot_full_non_member_, model_non_member),
):
    seg_inputs = seg_frame[seg_model.feature_name_]
    seg_frame['gen4_underwriting_prob'] = seg_model.predict_proba(seg_inputs)[:, 1]

# Stack the two scored segments and build the evaluation weight as the
# product of the two weight columns.
df_oot_ = pd.concat([df_oot_full_member_, df_oot_full_non_member_], axis=0, ignore_index=True)
df_oot_["weight_eval"] = df_oot_["weight_cob"] * df_oot_["weight_ri_v2"]

# Prescreen probability: one model scored on the full (unsegmented) frame,
# with the same evaluation weight.
prescreen_inputs = df_oot_full_prescreen_[model_prescreen.feature_name_]
df_oot_full_prescreen_['gen4_prescreen_prob'] = model_prescreen.predict_proba(prescreen_inputs)[:, 1]
df_oot_full_prescreen_["weight_eval"] = df_oot_full_prescreen_["weight_cob"] * df_oot_full_prescreen_["weight_ri_v2"]

In [12]:
# Row count per segment after scoring.
df_oot_['seg'].value_counts()

non_member    4080905
member         323859
Name: seg, dtype: int64

In [13]:
# Sanity check: the prescreen frame should have as many rows as the two
# underwriting segments combined.
df_oot_full_prescreen_.shape

(4404764, 91)

In [16]:
# Attach the prescreen probability to the underwriting-scored rows.
# NOTE(review): the join key includes the float column `weight_eval`, and the
# cell rebinds `df_oot_`, so re-running it merges a second time. Confirm the
# key columns identify rows uniquely.
prescreen_join_keys = ['id', 'applicant_type', 'date_start', 'weight_eval']
df_oot_ = df_oot_.merge(
    df_oot_full_prescreen_[['gen4_prescreen_prob'] + prescreen_join_keys],
    how='inner',
    on=prescreen_join_keys,
)

In [17]:
# Scorecard scaling: score = offset + factor * ln(odds), where
# odds = (1 - p) / p. With these constants the score is 650 at odds of 5
# and rises by 40 each time the odds double.
factor = 40/np.log(2)
offset = 650 - factor * np.log(5)


def _odds_and_score(prob):
    """Return (log-odds, odds, score) for a series of model probabilities.

    The score is rounded to the nearest integer value and capped to the
    range [300, 850]; missing probabilities stay missing.
    """
    odds = (1 - prob) / prob
    log_odds = np.log(odds)
    score = (offset + factor * log_odds).round().clip(lower=300, upper=850)
    return log_odds, odds, score


# Underwriting model: log-odds, odds, then the scaled score.
(
    df_oot_['uw_log_odds'],
    df_oot_['uw_odds'],
    df_oot_['gen4_underwriting_score'],
) = _odds_and_score(df_oot_['gen4_underwriting_prob'])

# Prescreen model: same transformation.
(
    df_oot_['pr_log_odds'],
    df_oot_['pr_odds'],
    df_oot_['gen4_prescreen_score'],
) = _odds_and_score(df_oot_['gen4_prescreen_prob'])

In [24]:
# Share of distinct ids that have at least one row with a FICO value above 850.
fico_above_850 = df_oot_['applicant_fico_score'] > 850
df_oot_.loc[fico_above_850, 'id'].nunique() / df_oot_['id'].nunique()

0.007957322852943635

In [27]:
# Distribution of the underwriting score among rows with a FICO value above 850.
df_oot_.loc[df_oot_['applicant_fico_score'] > 850, 'gen4_underwriting_score'].describe()

count    38884.000000
mean       556.440284
std         29.990566
min        419.000000
25%        542.000000
50%        554.000000
75%        564.000000
max        850.000000
Name: gen4_underwriting_score, dtype: float64

In [26]:
# Analysis sample: keep only rows with a FICO value of at most 850. Rows with
# a missing FICO value fail the comparison and are dropped as well.
# NOTE(review): values above 850 are presumably special codes; confirm.
fico_in_range = df_oot_['applicant_fico_score'] <= 850
df_analysis = df_oot_.loc[fico_in_range].copy()

In [42]:
# Bucket FICO and both Gen4 scores on one shared grid: 10-point bins with
# edges 300, 310, ..., 850, right-closed, with 300 included in the first bin.
score_bin_edges = np.arange(300, 860, 10)
for bin_col, score_col in (
    ('fico_bin', 'applicant_fico_score'),
    ('gen4_prescreen_bin', 'gen4_prescreen_score'),
    ('gen4_underwriting_bin', 'gen4_underwriting_score'),
):
    df_analysis[bin_col] = pd.cut(df_analysis[score_col], score_bin_edges, right=True, include_lowest=True)

In [57]:
# Weighted summary of `target_v2` by FICO bin x prescreen-score bin over the
# whole analysis sample. Output columns get a `pre_` prefix so they stay
# distinct when the reports are merged.
ds = data_summary.DataSummary(
    df_analysis,
    ['target_v2'],
    segment=['fico_bin', 'gen4_prescreen_bin'],
    weight='weight_eval',
)
df_cnt_prescreen = ds.get_report().rename(
    columns={'total cnt': 'pre_cnt', 'target_v2': 'pre_target_v2'}
)

In [58]:
# Same summary restricted to the member segment, by FICO bin x
# underwriting-score bin; output columns get a `member_` prefix.
member_rows = df_analysis[df_analysis.seg == 'member']
ds = data_summary.DataSummary(
    member_rows,
    ['target_v2'],
    segment=['fico_bin', 'gen4_underwriting_bin'],
    weight='weight_eval',
)
df_cnt_member = ds.get_report().rename(
    columns={'total cnt': 'member_cnt', 'target_v2': 'member_target_v2'}
)

In [59]:
# Same summary for every row that is not in the member segment, by FICO bin x
# underwriting-score bin; output columns get a `non_member_` prefix.
non_member_rows = df_analysis[df_analysis.seg != 'member']
ds = data_summary.DataSummary(
    non_member_rows,
    ['target_v2'],
    segment=['fico_bin', 'gen4_underwriting_bin'],
    weight='weight_eval',
)
df_cnt_non_member = ds.get_report().rename(
    columns={'total cnt': 'non_member_cnt', 'target_v2': 'non_member_target_v2'}
)

In [61]:
# Combine the three reports into one table. Outer joins keep every
# (FICO bin, score bin) cell that appears in any report.
underwriting_keys = ['fico_bin', 'gen4_underwriting_bin']
prescreen_keys = ['fico_bin', 'gen4_prescreen_bin']

df_cnt = df_cnt_non_member.merge(df_cnt_member, on=underwriting_keys, how='outer')
# The prescreen report is matched where its prescreen-score bin equals the
# underwriting-score bin, in addition to the FICO bin.
df_cnt = df_cnt.merge(
    df_cnt_prescreen,
    how='outer',
    left_on=underwriting_keys,
    right_on=prescreen_keys,
)

In [65]:
# Save the combined count table; the DataFrame index is written as the first column.
# NOTE(review): this writes to ./artifacts/, while the models and imputation
# values are read from ./to_MRM/artifact/ — confirm the output directory is intended.
df_cnt.to_csv('./artifacts/df_cnt_dev.csv')