In [1]:
import sys, os, json
import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm import LGBMClassifier
import pickle as pkl
from tqdm import tqdm
from smart_open import open

import seaborn as sns
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import shap

import gc

import shap
import joblib
from sklearn.model_selection import RandomizedSearchCV
import time 
import warnings

import copy
from rdsutils.woe import WOE_Transform, get_monotone_dir 

#import rdsutils.plot as rdsplot
from rdsutils.feature_selection import mrmr

import ast
from data import data_summary

from data import woe
from data import data_eda
from data import psi
from performance_eval import performance_eval_v3 as p_eval
from model_trainer import model_trainer
from feature_builder import feature_encoder
from feature_selection import feature_selection as fs
from data.preprocess import Preprocess
from data.WeightedCorr import WeightedCorr
from rdsutils.metrics import get_pred_reports
from rdsutils.plot import plot_auc_curve_mult, plot_pr_curve_mult

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas and other
# libraries — consider narrowing the filter to specific categories.
warnings.simplefilter(action='ignore')
# Notebook display settings: show up to 20 columns, wrap output at 500 characters.
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 500)

# Load the autoreload extension in mode 2 so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Record the LightGBM version this run was executed with.
print(lgb.__version__)

3.2.1


In [2]:
# Load the development-sample score table (df_gen4_score_dev) from S3.
# The path has no interpolated fields, so a plain string literal is used
# instead of an f-string.
df_dev = pd.read_parquet('s3://sofi-data-science/hpeng/pl-gen4/artifact/df_gen4_score_dev.parquet')

In [3]:
# Preview the first rows of the development score table.
df_dev.head()

Unnamed: 0,id,applicant_type,date_start,seg,ri_source,target_v2,weight_eval,gen4_prescreen_score,gen4_prescreen_prob,period,gen4_underwriting_prob,gen4_underwriting_score,gen3_score,fico_adj,applicant_vantage_score,funds_use_encode_v2,income_update,all_time_lending_member_flag
0,4937215,PRIMARY,2017-10-20,member,others,0.0,0.9996,678.0,0.108869,dev17,0.107678,679.0,660.0,686.0,653.0,1,205000.0,1
1,4937215,PRIMARY,2017-10-20,member,others,1.0,0.0004,678.0,0.108869,dev17,0.107678,679.0,660.0,686.0,653.0,1,205000.0,1
2,4820420,PRIMARY,2017-10-08,member,proxy,0.0,1.0,683.0,0.100897,dev17,0.055987,720.0,713.0,785.0,762.0,2,60000.0,1
3,4996279,PRIMARY,2017-10-27,member,proxy,0.0,1.0,683.0,0.100897,dev17,0.055105,721.0,713.0,785.0,762.0,2,60000.0,1
4,4820856,PRIMARY,2017-10-08,member,proxy,0.0,1.0,683.0,0.100897,dev17,0.055987,720.0,713.0,785.0,762.0,2,60000.0,1


In [4]:
# Load the recent-period score table (file name: gen4_score_202104_202201) from S3.
# The path has no interpolated fields, so a plain string literal is used
# instead of an f-string.
df_recent = pd.read_parquet('s3://sofi-data-science/hpeng/pl-gen4/data_dump/data_to_strategy/gen4_score_202104_202201.parquet')

In [5]:
# Load the base table that carries `all_time_lending_member_flag`; it is joined
# onto the recent scores in the next cell.
df_member_flag = pd.read_parquet('s3://sofi-data-science/hpeng/pl-gen4/data_dump/raw_tables/df_base_21.parquet')

In [6]:
# Attach `all_time_lending_member_flag` to the recent scores.
# The shapes before and after the join are displayed so that dropped or
# duplicated rows are visible at a glance.
display(df_recent.shape)
df_recent = pd.merge(
    # Drop a flag column left over from a previous run of this cell so that
    # re-running it does not produce `_x` / `_y` suffixed duplicates.
    df_recent.drop(columns='all_time_lending_member_flag', errors='ignore'),
    df_member_flag[['id', 'applicant_type', 'all_time_lending_member_flag']],
    # Spell out the join keys and join type instead of relying on pandas'
    # implicit "every shared column" / inner-join defaults.
    on=['id', 'applicant_type'],
    how='inner',
)
display(df_recent.shape)

(1799053, 5)

(1799053, 6)

In [7]:
# Preview the recent score table after the member flag has been joined on.
df_recent.head()

Unnamed: 0,id,applicant_type,date_start,gen4_prescreen_score,gen4_underwriting_score,all_time_lending_member_flag
0,12486316,PRIMARY,2021-07-30,850.0,850.0,1
1,12598180,PRIMARY,2021-09-08,772.0,850.0,1
2,12498789,PRIMARY,2021-05-26,727.0,841.0,1
3,12601674,PRIMARY,2021-05-03,746.0,784.0,1
4,12661144,PRIMARY,2021-06-06,727.0,730.0,1


In [8]:
# Prescreen model PSI: development sample vs. applications started on or after 2021-10-01.
# Development rows with ri_source == 'others' and target_v2 == 0 are excluded from the baseline.
dev_baseline_rows = ~((df_dev.ri_source == 'others') & (df_dev.target_v2 == 0))
recent_window_rows = df_recent.date_start >= '2021-10-01'
psi.calculate_psi(
    df_dev.loc[dev_baseline_rows, 'gen4_prescreen_score'],
    df_recent.loc[recent_window_rows, 'gen4_prescreen_score'],
    buckettype='quantiles',
    buckets=10,
    axis=1,
)

0.15956289514487537

In [9]:
# Underwriting model PSI: development sample vs. applications started on or after 2021-10-01.
# Development rows with ri_source == 'others' and target_v2 == 0 are excluded from the baseline.
dev_baseline_rows = ~((df_dev.ri_source == 'others') & (df_dev.target_v2 == 0))
recent_window_rows = df_recent.date_start >= '2021-10-01'
psi.calculate_psi(
    df_dev.loc[dev_baseline_rows, 'gen4_underwriting_score'],
    df_recent.loc[recent_window_rows, 'gen4_underwriting_score'],
    buckettype='quantiles',
    buckets=10,
    axis=1,
)

0.1525966493290794

In [10]:
# Member model PSI on the underwriting score.
# Baseline: development rows in the 'member' segment, excluding rows with
# ri_source == 'others' and target_v2 == 0.
# Comparison: recent rows started on or after 2021-10-01 with all_time_lending_member_flag == 1.
dev_member_rows = (~((df_dev.ri_source == 'others') & (df_dev.target_v2 == 0))) & (df_dev.seg == 'member')
recent_member_rows = (df_recent.date_start >= '2021-10-01') & (df_recent.all_time_lending_member_flag == 1)
psi.calculate_psi(
    df_dev.loc[dev_member_rows, 'gen4_underwriting_score'],
    df_recent.loc[recent_member_rows, 'gen4_underwriting_score'],
    buckettype='quantiles',
    buckets=10,
    axis=1,
)

0.14760181075344056

In [11]:
# Non-member model PSI on the underwriting score.
# Baseline: development rows outside the 'member' segment, excluding rows with
# ri_source == 'others' and target_v2 == 0.
# Comparison: recent rows started on or after 2021-10-01 with all_time_lending_member_flag != 1.
dev_nonmember_rows = (~((df_dev.ri_source == 'others') & (df_dev.target_v2 == 0))) & (df_dev.seg != 'member')
recent_nonmember_rows = (df_recent.date_start >= '2021-10-01') & (df_recent.all_time_lending_member_flag != 1)
psi.calculate_psi(
    df_dev.loc[dev_nonmember_rows, 'gen4_underwriting_score'],
    df_recent.loc[recent_nonmember_rows, 'gen4_underwriting_score'],
    buckettype='quantiles',
    buckets=10,
    axis=1,
)

0.16416058673306333

In [12]:
# Development-period feature snapshots (member / non-member / prescreen),
# read in that order and used as the baseline side of the feature PSI below.
df_feature_dev_member, df_feature_dev_nonmember, df_feature_dev_prescreen = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        's3://sofi-data-science/hpeng/pl-gen4/data_dump/data_to_mrm/df_member_psi_dev.parquet',
        's3://sofi-data-science/hpeng/pl-gen4/data_dump/data_to_mrm/df_nonmember_psi_dev.parquet',
        's3://sofi-data-science/hpeng/pl-gen4/data_dump/data_to_mrm/df_prescreen_psi_dev.parquet',
    )
)

In [13]:
# Recent-period feature snapshots (member / non-member / prescreen), read in
# that order and used as the comparison side of the feature PSI below.
df_feature_21_member, df_feature_21_nonmember, df_feature_21_prescreen = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        's3://sofi-data-science/hpeng/pl-gen4/data_dump/data_to_mrm/df_member_psi_21q4.parquet',
        's3://sofi-data-science/hpeng/pl-gen4/data_dump/data_to_mrm/df_nonmember_psi_21q4.parquet',
        's3://sofi-data-science/hpeng/pl-gen4/data_dump/data_to_mrm/df_prescreen_psi_21q4.parquet',
    )
)

In [14]:
# (rows, columns) of the member, non-member and prescreen development snapshots.
tuple(
    snapshot.shape
    for snapshot in (df_feature_dev_member, df_feature_dev_nonmember, df_feature_dev_prescreen)
)

((206316, 89), (2346519, 89), (2552835, 89))

In [15]:
# (rows, columns) of the member, non-member and prescreen recent-period snapshots.
tuple(
    snapshot.shape
    for snapshot in (df_feature_21_member, df_feature_21_nonmember, df_feature_21_prescreen)
)

((235427, 198), (1563626, 198), (1799053, 198))

In [16]:
# Features whose PSI is computed for the prescreen model.
prescreen_list = [
    'p13_bcc5520',
    't11_tmti2752',
    't11_tstu2752',
    'p13_iqz9420',
    't11_tall3205',
    'p13_bcc8322',
    't11_tbca4504',
    'p13_upl8132',
    't11_tbca3530',
    't11_trev0722',
]

# Features whose PSI is computed for the member model.
member_list = [
    't11_tbca2526',
    't11_tstu1752',
    'sofi_all5840_mtf5838_to_income',
    't11_tall02q3',
    't11_tall3205',
    'current_pl_trade_bal_ratio',
    'p13_bcc8322',
    't11_tpil01q1',
    'sofi_num_month_recent_list_psl',
    't11_tupl01q1',
]

# Features whose PSI is computed for the non-member model.
nonmember_list = [
    'p13_bcc5520',
    'sofi_all5840_mtf5838_to_income',
    't11_tall2205',
    'p13_bca8370',
    't11_tstu2752',
    'p13_iqz9427',
    'p13_upl8132',
    't11_tmti2752',
    't11_tiln2755',
    'sofi_num_inq_3month',
]

In [17]:
# Feature-level PSI for the member model: development snapshot vs. recent rows
# started on or after 2021-10-01.
# The recent frame is filtered once, and only to the columns needed, instead of
# re-filtering the full frame on every loop iteration.
member_recent_window = df_feature_21_member.loc[
    df_feature_21_member.date_start >= '2021-10-01', member_list
]

# Collect one record per feature and build the frame once: DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and growing a frame row by row
# is quadratic.
member_psi_records = []
for feature in member_list:
    psi_val = psi.calculate_psi(df_feature_dev_member[feature], member_recent_window[feature], buckettype='quantiles', buckets=10, axis=1)
    member_psi_records.append({'field_name': feature, 'psi': psi_val})

df_psi_member = pd.DataFrame(member_psi_records, columns=['field_name', 'psi'])

df_psi_member

Unnamed: 0,field_name,psi
0,t11_tbca2526,0.163666
1,t11_tstu1752,0.144657
2,sofi_all5840_mtf5838_to_income,0.212683
3,t11_tall02q3,0.041064
4,t11_tall3205,0.093769
5,current_pl_trade_bal_ratio,0.317128
6,p13_bcc8322,0.130488
7,t11_tpil01q1,0.035017
8,sofi_num_month_recent_list_psl,0.053495
9,t11_tupl01q1,0.047536


In [18]:
# Feature-level PSI for the non-member model: development snapshot vs. recent
# rows started on or after 2021-10-01.
# The recent frame is filtered once, and only to the columns needed, instead of
# re-filtering the full frame on every loop iteration.
nonmember_recent_window = df_feature_21_nonmember.loc[
    df_feature_21_nonmember.date_start >= '2021-10-01', nonmember_list
]

# Collect one record per feature and build the frame once: DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and growing a frame row by row
# is quadratic.
nonmember_psi_records = []
for feature in nonmember_list:
    psi_val = psi.calculate_psi(df_feature_dev_nonmember[feature], nonmember_recent_window[feature], buckettype='quantiles', buckets=10, axis=1)
    nonmember_psi_records.append({'field_name': feature, 'psi': psi_val})

df_psi_nonmember = pd.DataFrame(nonmember_psi_records, columns=['field_name', 'psi'])

df_psi_nonmember

Unnamed: 0,field_name,psi
0,p13_bcc5520,0.214359
1,sofi_all5840_mtf5838_to_income,0.254559
2,t11_tall2205,0.095473
3,p13_bca8370,0.159675
4,t11_tstu2752,0.166148
5,p13_iqz9427,0.021624
6,p13_upl8132,0.030139
7,t11_tmti2752,0.045117
8,t11_tiln2755,0.114093
9,sofi_num_inq_3month,0.016152


In [19]:
# Feature-level PSI for the prescreen model: development snapshot vs. recent
# rows started on or after 2021-10-01.
# The recent frame is filtered once, and only to the columns needed, instead of
# re-filtering the full frame on every loop iteration.
prescreen_recent_window = df_feature_21_prescreen.loc[
    df_feature_21_prescreen.date_start >= '2021-10-01', prescreen_list
]

# Collect one record per feature and build the frame once: DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and growing a frame row by row
# is quadratic.
prescreen_psi_records = []
for feature in prescreen_list:
    psi_val = psi.calculate_psi(df_feature_dev_prescreen[feature], prescreen_recent_window[feature], buckettype='quantiles', buckets=10, axis=1)
    prescreen_psi_records.append({'field_name': feature, 'psi': psi_val})

df_psi_prescreen = pd.DataFrame(prescreen_psi_records, columns=['field_name', 'psi'])

df_psi_prescreen

Unnamed: 0,field_name,psi
0,p13_bcc5520,0.184612
1,t11_tmti2752,0.027551
2,t11_tstu2752,0.122279
3,p13_iqz9420,0.092925
4,t11_tall3205,0.083484
5,p13_bcc8322,0.148204
6,t11_tbca4504,0.047024
7,p13_upl8132,0.02072
8,t11_tbca3530,0.080691
9,t11_trev0722,0.186737


### PSI for NCAP analysis: early (before 2017-07-01) vs. late (after 2018-04-01) development windows

In [27]:
# PSI within the prescreen development snapshot: rows started before 2017-07-01
# vs. rows started after 2018-04-01 (both bounds are strict).
# NOTE(review): this cell rebinds `df_psi_prescreen`, replacing any earlier
# result held under that name — consider a distinct name for the NCAP table.
ncap_prescreen_features = ['p13_all8352', 'p13_all7936']

# Slice each window once, keeping only the columns needed, instead of
# re-filtering the full frame twice per feature.
prescreen_early_window = df_feature_dev_prescreen.loc[
    df_feature_dev_prescreen.date_start < '2017-07-01', ncap_prescreen_features
]
prescreen_late_window = df_feature_dev_prescreen.loc[
    df_feature_dev_prescreen.date_start > '2018-04-01', ncap_prescreen_features
]

# Collect one record per feature and build the frame once: DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0.
ncap_prescreen_records = []
for feature in ncap_prescreen_features:
    psi_val = psi.calculate_psi(prescreen_early_window[feature],
                                prescreen_late_window[feature], buckettype='quantiles', buckets=10, axis=1)
    ncap_prescreen_records.append({'field_name': feature, 'psi': psi_val})

df_psi_prescreen = pd.DataFrame(ncap_prescreen_records, columns=['field_name', 'psi'])

df_psi_prescreen

Unnamed: 0,field_name,psi
0,p13_all8352,0.000863
1,p13_all7936,0.003553


In [28]:
# PSI within the member development snapshot: rows started before 2017-07-01
# vs. rows started after 2018-04-01 (both bounds are strict).
# NOTE(review): this cell rebinds `df_psi_member`, replacing any earlier
# result held under that name — consider a distinct name for the NCAP table.
ncap_member_features = ['p13_all8352']

# Slice each window once, keeping only the columns needed, instead of
# re-filtering the full frame twice per feature.
member_early_window = df_feature_dev_member.loc[
    df_feature_dev_member.date_start < '2017-07-01', ncap_member_features
]
member_late_window = df_feature_dev_member.loc[
    df_feature_dev_member.date_start > '2018-04-01', ncap_member_features
]

# Collect one record per feature and build the frame once: DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0.
ncap_member_records = []
for feature in ncap_member_features:
    psi_val = psi.calculate_psi(member_early_window[feature],
                                member_late_window[feature], buckettype='quantiles', buckets=10, axis=1)
    ncap_member_records.append({'field_name': feature, 'psi': psi_val})

df_psi_member = pd.DataFrame(ncap_member_records, columns=['field_name', 'psi'])

df_psi_member

Unnamed: 0,field_name,psi
0,p13_all8352,0.00479


In [30]:
# PSI within the non-member development snapshot: rows started before
# 2017-07-01 vs. rows started after 2018-04-01 (both bounds are strict).
# NOTE(review): this cell rebinds `df_psi_nonmember`, replacing any earlier
# result held under that name — consider a distinct name for the NCAP table.
ncap_nonmember_features = ['p13_all8352', 'p13_all7936']

# Slice each window once, keeping only the columns needed, instead of
# re-filtering the full frame twice per feature.
nonmember_early_window = df_feature_dev_nonmember.loc[
    df_feature_dev_nonmember.date_start < '2017-07-01', ncap_nonmember_features
]
nonmember_late_window = df_feature_dev_nonmember.loc[
    df_feature_dev_nonmember.date_start > '2018-04-01', ncap_nonmember_features
]

# Collect one record per feature and build the frame once: DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0.
ncap_nonmember_records = []
for feature in ncap_nonmember_features:
    psi_val = psi.calculate_psi(nonmember_early_window[feature],
                                nonmember_late_window[feature], buckettype='quantiles', buckets=10, axis=1)
    ncap_nonmember_records.append({'field_name': feature, 'psi': psi_val})

df_psi_nonmember = pd.DataFrame(ncap_nonmember_records, columns=['field_name', 'psi'])

df_psi_nonmember

Unnamed: 0,field_name,psi
0,p13_all8352,0.000423
1,p13_all7936,0.002293
