In [1]:
import os 
# Step from the notebook's folder up to the project root and then into the
# code folder, presumably so the project modules imported in the next cell
# can be found.
# NOTE: both chdir calls are relative, so re-running this cell without a
# kernel restart moves one level further up; run it once per session.
os.chdir('../')
cwd = os.getcwd()
print(cwd)

# The folder is spelled `Code` on disk (see the printed path); using the exact
# case keeps this working on case-sensitive filesystems too.
os.chdir('./Code/')
cwd = os.getcwd()
print(cwd)

import pandas as pd 
import numpy as np 
import seaborn as sns

import pickle 
import logging


# from config.config import SQLQuery
# querySno = SQLQuery('snowflake')

/Users/shashankgupta/Documents/code/git_project/plaid_credit
/Users/shashankgupta/Documents/code/git_project/plaid_credit/Code


In [2]:
from preprocess import Convert,MissingValues,Outlier,FeatureSelection
from feature_transformation import Scaler,Transform,Selection
from model_building import split_test_train, feature_encoding, classification_models
from model_evaluations import model_metrics, feature_importance, probability_bins, cross_validation
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, Normalizer

from statsmodels.stats.outliers_influence import variance_inflation_factor

# object initiation 
# One instance of each helper class imported from the project modules above.
# Only `sel` (correlation filter) and `ft` (backward feature selection) are
# called in the cells visible below.
tf = Transform()
sel = Selection()
ft = FeatureSelection()
cv = Convert()
mv = MissingValues()
ot = Outlier()

# set seed
# Shared random seed: passed to the train/test split, the model parameters
# and the cross-validation helper.
seed = 9

In [3]:
corr_arr = 0.4 # correlation threshold handed to the correlation filter (author note: change it to 0.5)
vif_arr = 5 # VIF threshold used by the iterative VIF elimination
features_arr = 10 # total number of features to be selected from backward feature selection
iv_upper_limit = 0.5 # upper IV threshold, exclusive (author note: change it to 0.6)
iv_lower_limit = 0.02 # lower IV threshold, exclusive

In [4]:
# Load the modelling dataset from the project's `data` folder. The path is
# relative to the working directory set in the first cell (the code folder),
# so the notebook no longer depends on one user's home directory.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
data_path = os.path.join(os.pardir, 'data', 'final_dataset.pkl')
df_raw = pd.read_pickle(data_path)
df_raw.shape

(646, 243)

In [5]:
# List the column labels of the raw dataset.
df_raw.columns

Index(['business_id', 'lending_business_id', 'decision_date', 'drawn_flag',
       'everDPD_15', 'fico_score', 'target', 'loans_flag', 'payroll_flag',
       'pos_flag',
       ...
       'sum_credits_grt_1500_6M', 'ratio_credits_lessthan_100_1M_3M',
       'ratio_credits_lessthan_100_1M_6M', 'ratio_credits_grt_500_1M_3M',
       'ratio_credits_grt_500_1M_6M', 'ratio_credits_grt_1500_1M_3M',
       'ratio_credits_grt_1500_1M_6M', 'txn_each_mth_flag',
       'txn_grt_100_each_mth_flag', 'txn_flag'],
      dtype='object', length=243)

In [6]:
# Frequency of each txn_flag value; the next cell keeps only txn_flag == 1.
df_raw['txn_flag'].value_counts()

txn_flag
1.0    637
0.0      9
Name: count, dtype: int64

In [7]:
# Keep only the rows whose txn_flag equals 1, then show the resulting shape.
df_raw = df_raw.loc[df_raw['txn_flag'].eq(1)]
df_raw.shape

(637, 243)

In [8]:
# Remove the columns that are not used as model inputs; `target` is kept.
df_raw = df_raw.drop(
    columns=[
        'business_id',
        'lending_business_id',
        'decision_date',
        'drawn_flag',
        'everDPD_15',
        'fico_score',
        'txn_flag',
    ]
)
df_raw.shape

(637, 236)

In [9]:
## train-test split

# train test split
# 70/30 split driven by the shared seed. Note the return order of the project
# helper: train features, train target, test features, test target.
x_train, y_train, x_test, y_test = split_test_train(df_raw, target_column='target', test_size=0.3, random_state=seed)
print(f'{x_train.shape = }', '|' ,f'{y_train.shape = }', '|' ,f'{x_test.shape = }', '|' ,f'{y_test.shape = }')


# copy to df
# All feature filtering below works on this deep copy, leaving x_train untouched.
df = x_train.copy(deep=True)

x_train.shape = (445, 235) | y_train.shape = (445,) | x_test.shape = (192, 235) | y_test.shape = (192,)


In [10]:
# get constant features
def get_const_features(df):
    """Return the columns of ``df`` that hold exactly one distinct value.

    NaN counts as a value here, so an all-NaN column is reported as constant
    while a column mixing one value with NaN is not.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    list
        Column labels, in their original order.
    """
    return [name for name in df.columns if len(df[name].unique()) == 1]

# remove constant features
# The recorded output shows an unchanged shape, i.e. no constant column was found.
const_list = get_const_features(df)
df = df.drop(columns=const_list)
df.shape

(445, 235)

In [11]:
# get quasi-constant features
def get_quasi_const_features(df, threshold=0.01):
    """Return the columns of ``df`` whose variance is at most ``threshold``.

    The variance is taken per column with ``Series.var()``. A column whose
    variance is NaN is never selected, because the comparison is False.

    Parameters
    ----------
    df : pandas.DataFrame
    threshold : float, default 0.01
        Inclusive upper bound on the variance.

    Returns
    -------
    list
        Column labels, in their original order.
    """
    return [name for name in df.columns if df[name].var() <= threshold]

# remove quasi-constant features (variance <= 0.01)
qconst_list = get_quasi_const_features(df, threshold=0.01)
df = df.drop(columns=qconst_list)
df.shape

(445, 226)

In [12]:
# view missing values
def missing_value(df):
    """Return the percentage of missing values per column, highest first.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        Indexed by column label, with a single ``percent_missing`` column
        sorted in descending order.
    """
    percent_missing = df.isnull().sum() * 100 / len(df)
    return (
        percent_missing
        .to_frame('percent_missing')
        .sort_values(by='percent_missing', ascending=False)
    )

# Show the share of missing values per remaining column, worst first.
missing_value(df)

Unnamed: 0,percent_missing
stddev_loans_amt_1M,96.179775
stddev_ecom_amt_1M,93.033708
stddev_loans_amt_3M,91.235955
ratio_stddev_loan_amt_3M_6M,91.235955
loans_amt_1M,91.011236
...,...
third_party_flag,0.000000
ecom_flag,0.000000
pos_flag,0.000000
txn_each_mth_flag,0.000000


In [13]:
# Drop every column with more than 80% missing values.
t = missing_value(df)
drop_cols = t.index[t['percent_missing'] > 80].tolist()
df = df.drop(columns=drop_cols)
df.shape

(445, 187)

In [14]:
# Treating missing values
# Every remaining NaN becomes 0; the binning cells further down are fitted on
# this zero-filled frame.
df = df.fillna(0)

In [15]:
# get boolean columns
def findbool(df):
    """Return the columns of ``df`` that hold at most two distinct values.

    The check is on cardinality only, not dtype: constant columns qualify as
    well, and NaN counts as a distinct value.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    list
        Column labels, in their original order.
    """
    return [name for name in df.columns if len(df[name].unique()) <= 2]

# get datatypes frequency
def get_datatypes_freq(df):
    """Group the column labels of ``df`` by dtype name.

    Implemented with a plain loop over ``df.dtypes`` instead of
    ``df.groupby(df.dtypes, axis=1)``: column-wise groupby is deprecated in
    recent pandas, and dtypes sharing the same string name no longer
    overwrite one another.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    type_dct : dict[str, list]
        Dtype name -> column labels of that dtype, in column order.
    type_dct_info : dict[str, int]
        Dtype name -> number of columns of that dtype.
    """
    type_dct = {}
    for column, dtype in df.dtypes.items():
        type_dct.setdefault(str(dtype), []).append(column)
    type_dct_info = {name: len(columns) for name, columns in type_dct.items()}
    return type_dct, type_dct_info

# Summarise the dtypes once; the result is reused below instead of being
# recomputed by a second identical call.
type_dct, type_dct_info = get_datatypes_freq(df)
print(type_dct_info)

# Columns with at most two distinct values are treated as flags.
bool_col_list = findbool(df)
print(len(bool_col_list))

# float64 columns that are not flags: the candidates for IV screening.
col_list = (type_dct['float64'])
bool_cols = set(bool_col_list)  # set membership instead of a list scan per column
col_list_excpt_bool = [column for column in col_list if column not in bool_cols]
print(len(col_list_excpt_bool))

{'float64': 187}
10
177


In [16]:
## iv_woe

def iv_woe(data, target, bins=10, show_woe=False):
    """Compute Weight of Evidence and Information Value for every feature.

    Numeric features with more than 10 distinct values are first cut into
    ``bins`` quantile bins (duplicate edges dropped); every other feature is
    grouped on its raw values. For each group::

        % of Events     = max(events, 0.5)     / total events
        % of Non-Events = max(non-events, 0.5) / total non-events
        WoE = ln(% of Events / % of Non-Events)
        IV  = WoE * (% of Events - % of Non-Events)

    The 0.5 floor keeps the logarithm finite for groups with no events or no
    non-events. The IV of each feature is printed as it is computed.

    Parameters
    ----------
    data : pandas.DataFrame
        Features plus the target column.
    target : str
        Name of the binary target column (1 = event).
    bins : int, default 10
        Number of quantile bins for high-cardinality numeric features.
    show_woe : bool, default False
        If truthy, also print the per-group WoE table of each feature.

    Returns
    -------
    newDF : pandas.DataFrame
        One row per feature with columns ``Variable`` and ``IV``.
    woeDF : pandas.DataFrame
        The per-group WoE tables of all features stacked vertically.
        Both frames are empty when ``data`` has no feature columns.
    """
    # Collect the per-feature pieces and concatenate once at the end; growing
    # a DataFrame with concat inside the loop is quadratic.
    iv_frames, woe_frames = [], []

    cols = data.columns
    for ivars in cols[~cols.isin([target])]:
        if (data[ivars].dtype.kind in 'bifc') and (len(np.unique(data[ivars])) > 10):
            binned_x = pd.qcut(data[ivars], bins, duplicates='drop')
            d0 = pd.DataFrame({'x': binned_x, 'y': data[target]})
        else:
            d0 = pd.DataFrame({'x': data[ivars], 'y': data[target]})

        # Number of rows and of events per group. `observed=False` spells out
        # the behaviour this code has always relied on for categorical bins.
        d = d0.groupby("x", as_index=False, observed=False).agg({"y": ["count", "sum"]})
        d.columns = ['Cutoff', 'N', 'Events']

        d['% of Events'] = np.maximum(d['Events'], 0.5) / d['Events'].sum()
        d['Non-Events'] = d['N'] - d['Events']
        d['% of Non-Events'] = np.maximum(d['Non-Events'], 0.5) / d['Non-Events'].sum()

        # WoE is ln(% events / % non-events); IV is symmetric in that choice.
        d['WoE'] = np.log(d['% of Events'] / d['% of Non-Events'])
        d['IV'] = d['WoE'] * (d['% of Events'] - d['% of Non-Events'])
        d.insert(loc=0, column='Variable', value=ivars)

        feature_iv = d['IV'].sum()
        print("Information value of " + ivars + " is " + str(round(feature_iv, 6)))
        iv_frames.append(
            pd.DataFrame({"Variable": [ivars], "IV": [feature_iv]}, columns=["Variable", "IV"])
        )
        woe_frames.append(d)

        if show_woe:
            print(d)

    # pd.concat rejects an empty list, so keep the empty-frame result explicit.
    if not iv_frames:
        return pd.DataFrame(), pd.DataFrame()
    return pd.concat(iv_frames, axis=0), pd.concat(woe_frames, axis=0)

In [17]:
# remove features on basis of IV
# y_train.reset_index(drop=True, inplace=True)
# Attach the training target to the feature frame. Assuming y_train is a
# Series, the assignment aligns on the index, which df still shares with it.
df['target'] = y_train
df['target'] = df['target'].astype(float)
temp = df.copy()

# IV is computed for the non-flag float columns only (5 quantile bins);
# t2, the stacked WoE tables, is not used again in the visible cells.
t1, t2 = iv_woe(temp[np.append(col_list_excpt_bool,['target'])], 'target', bins=5, show_woe=False)
# Keep features whose IV lies strictly between the lower and upper limits.
feature_list = list(t1[ (t1['IV']<iv_upper_limit) & (t1['IV']>iv_lower_limit) ]['Variable'].values)
len(feature_list)

Information value of total_credit_count_1M is 0.046417
Information value of total_credit_count_3M is 0.123842
Information value of total_credit_count_6M is 0.106814
Information value of total_credit_amt_1M is 0.039476
Information value of total_credit_amt_3M is 0.087181
Information value of total_credit_amt_6M is 0.046794
Information value of credit_size_1M is 0.117904
Information value of credit_size_3M is 0.286985
Information value of credit_size_6M is 0.244407
Information value of stddev_credit_amt_1M is 0.142908
Information value of median_credit_amt_1M is 0.14747
Information value of stddev_credit_amt_3M is 0.151783
Information value of median_credit_amt_3M is 0.195674
Information value of stddev_credit_amt_6M is 0.090132
Information value of median_credit_amt_6M is 0.254997
Information value of ratio_stddev_credit_amt_1M_6M is 0.042563
Information value of ratio_stddev_credit_amt_1M_3M is 0.016118
Information value of ratio_median_credit_amt_1M_6M is 0.056231
Information value of

128

In [18]:
# view correlation
corr_df, subset_df = sel.get_correlated_features(df, feature_list, thresh=corr_arr)
# NOTE(review): a bare expression in the middle of a cell displays nothing;
# only the final `len(feature_list)` is rendered.
corr_df


# remove correlated features
# Project helper; presumably drops features iteratively until no pair exceeds
# `corr_arr` (implementation not visible here).
feature_list = sel.corr_iter(df, np.array(feature_list), thresh=corr_arr)
feature_list = list(feature_list)
len(feature_list)

28

In [19]:
# get feature list after iterative VIF elimination
def _vif_table(df, iv):
    """Return a frame with one row per feature in ``iv`` and its VIF."""
    exog = df[iv].values
    return pd.DataFrame({
        "feature": iv,
        "VIF": [variance_inflation_factor(exog, i) for i in range(len(iv))],
    })


def vif_iter(df, iv, threshold=10):
    """Iteratively drop features until every VIF is at or below ``threshold``.

    On each pass the VIF of every remaining feature is recomputed. If any VIF
    is infinite, the first such feature is dropped; otherwise, if any VIF
    exceeds ``threshold``, the feature with the largest VIF is dropped. The
    loop ends when neither applies.

    Written as a loop rather than recursion, so the final VIF table is
    computed once instead of once per unwinding recursion level.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns named in ``iv``.
    iv : list
        Candidate feature names. NOTE: the list is modified in place (dropped
        features are removed from it), as in the original implementation.
    threshold : float, default 10
        Largest VIF a retained feature may have.

    Returns
    -------
    iv : list
        The same list object, holding only the retained features.
    vif_data : pandas.DataFrame
        Columns ``feature`` and ``VIF`` for the retained features.
    """
    while True:
        vif_data = _vif_table(df, iv)
        infinite = vif_data[vif_data['VIF'] == np.inf]
        if len(infinite) > 0:
            # Perfect collinearity: drop the first offender and re-evaluate.
            iv.remove(infinite['feature'].iloc[0])
        elif len(vif_data[vif_data['VIF'] > threshold]) > 0:
            worst = vif_data.sort_values(by='VIF', ascending=False)['feature'].iloc[0]
            iv.remove(worst)
        else:
            return iv, vif_data

# Apply the VIF filter with the configured threshold. The recorded output
# (28, unchanged) shows that no feature was dropped by this step.
feature_list, vif_df = vif_iter(df, feature_list, threshold=vif_arr)
len(feature_list)

28

In [20]:
# Backward feature elimination
# Project helper; reduces the surviving features to `features_arr` (10) names.
feat_list = ft.backward_feature_selection(df[feature_list], y_train, num_features=features_arr)
feat_list

['stddev_credit_amt_3M',
 'median_shops_amt_6M',
 'stddev_payroll_amt_3M',
 'shops_count_3M',
 'pos_amt_1M',
 'ratio_stddev_third_party_amt_3M_6M',
 'ratio_pos_size_credit_size_1M',
 'count_credits_lessthan_100_6M',
 'count_credits_grt_500_6M',
 'ratio_credits_grt_1500_1M_3M']

In [21]:
## optimal binning woe

import optbinning as optb
from optbinning import Scorecard, BinningProcess, OptimalBinning
from optbinning.binning.binning_statistics import BinningTable

# Working frame for the binning cells: the selected features (zero-filled
# training values) plus the target.
df_temp = df[feat_list].copy()
df_temp['target'] = df['target']

(CVXPY) Jun 29 04:14:24 PM: Encountered unexpected exception importing solver GLOP:
RuntimeError('Unrecognized new version of ortools (9.6.2534). Expected < 9.5.0.Please open a feature request on cvxpy to enable support for this version.')
(CVXPY) Jun 29 04:14:24 PM: Encountered unexpected exception importing solver PDLP:
RuntimeError('Unrecognized new version of ortools (9.6.2534). Expected < 9.5.0.Please open a feature request on cvxpy to enable support for this version.')


In [None]:
# 1
# Fit an optimal binning of `stddev_credit_amt_3M` against the target with a
# descending monotonic constraint; 0 is passed as a special code.
# NOTE(review): rebinding `optb` shadows the `optbinning` module alias
# imported in the previous cell.
Xt= df_temp['stddev_credit_amt_3M']
yt = df_temp['target'].astype(int)

optb = OptimalBinning(name='stddev_credit_amt_3M', dtype="numerical", max_n_prebins=4, monotonic_trend='descending',special_codes=[0])
optb.fit(Xt, yt)  

# Transformed training values; not referenced again in the visible cells.
Xt_binned = optb.transform(Xt)

# Binning table kept under its own name so the next cell can display it.
ob_stddev_credit_amt_3M = optb.binning_table.build()

optb.binning_table.plot(metric="event_rate")

In [None]:
# Display the binning table built in the previous cell.
ob_stddev_credit_amt_3M

In [None]:
# 1
# WoE-encode `stddev_credit_amt_3M` with hard-coded cut points.
# NOTE(review): cut points and WoE values are presumably copied by hand from
# `ob_stddev_credit_amt_3M`; re-check them whenever the binning is refit.
col         = 'stddev_credit_amt_3M'

# First encoded feature: start from an explicit copy of the selected training
# columns so the assignment below never writes into a slice of x_train.
transformed_vars = x_train[feat_list].copy()

# Zero-fill like the frame the binning was fitted on; a NaN would match no
# condition and fall through to the NaN default.
raw_values  = x_train[col].fillna(0)
conditions  = [ raw_values <= 0,
                (raw_values > 0) & (raw_values < 578),
                (raw_values >= 578) & (raw_values < 1274),
                (raw_values >= 1274) & (raw_values < 7646),
                raw_values >= 7646 ]

choices     = [0.090405, -0.551449, -0.013445, 0.40452, 1.951157]

transformed_vars[col] = np.select(conditions, choices, default=np.nan)

In [None]:
# 2
# Fit an optimal binning of `median_shops_amt_6M` against the target with a
# descending monotonic constraint; 0 is passed as a special code.
# NOTE(review): rebinding `optb` shadows the `optbinning` module alias.
Xt= df_temp['median_shops_amt_6M']
yt = df_temp['target'].astype(int)

optb = OptimalBinning(name='median_shops_amt_6M', dtype="numerical", max_n_prebins=4, monotonic_trend='descending',special_codes = [0])
optb.fit(Xt, yt)  

# Transformed training values; not referenced again in the visible cells.
Xt_binned = optb.transform(Xt)

# Binning table kept under its own name so the next cell can display it.
ob_median_shops_amt_6M = optb.binning_table.build()

optb.binning_table.plot(metric="event_rate")

In [None]:
# Display the binning table built in the previous cell.
ob_median_shops_amt_6M

In [None]:
# 2
# WoE-encode `median_shops_amt_6M` with hard-coded cut points.
# NOTE(review): cut points and WoE values are presumably copied by hand from
# `ob_median_shops_amt_6M`; re-check them whenever the binning is refit.
col         = 'median_shops_amt_6M'

# Keep the columns encoded by earlier cells: only create the working frame
# (as an explicit copy) when it does not exist yet.
if 'transformed_vars' not in globals():
    transformed_vars = x_train[feat_list].copy()

# Zero-fill like the frame the binning was fitted on; a NaN would match no
# condition and fall through to the NaN default.
raw_values  = x_train[col].fillna(0)
conditions  = [ raw_values <= 0,
                (raw_values > 0) & (raw_values < 109),
                (raw_values >= 109) & (raw_values < 954),
                raw_values >= 954 ]

choices     = [0.175146, -0.548348, -0.31506, 0.734762]

transformed_vars[col] = np.select(conditions, choices, default=np.nan)

In [None]:
# 3
# Fit an optimal binning of `stddev_payroll_amt_3M` against the target with a
# descending monotonic constraint; 0 is passed as a special code.
# NOTE(review): rebinding `optb` shadows the `optbinning` module alias.
Xt= df_temp['stddev_payroll_amt_3M']
yt = df_temp['target'].astype(int)

optb = OptimalBinning(name='stddev_payroll_amt_3M', dtype="numerical", max_n_prebins=4, monotonic_trend='descending',special_codes = [0])
optb.fit(Xt, yt)  

# Transformed training values; not referenced again in the visible cells.
Xt_binned = optb.transform(Xt)

# Binning table kept under its own name so the next cell can display it.
ob_stddev_payroll_amt_3M = optb.binning_table.build()

optb.binning_table.plot(metric="event_rate")

In [None]:
# Display the binning table built in the previous cell.
ob_stddev_payroll_amt_3M

In [None]:
# 3
# WoE-encode `stddev_payroll_amt_3M` with hard-coded cut points.
# NOTE(review): cut points and WoE values are presumably copied by hand from
# `ob_stddev_payroll_amt_3M`; re-check them whenever the binning is refit.
col         = 'stddev_payroll_amt_3M'

# Keep the columns encoded by earlier cells: only create the working frame
# (as an explicit copy) when it does not exist yet.
if 'transformed_vars' not in globals():
    transformed_vars = x_train[feat_list].copy()

# Zero-fill like the frame the binning was fitted on; a NaN would match no
# condition and fall through to the NaN default.
raw_values  = x_train[col].fillna(0)
conditions  = [ raw_values <= 0,
                (raw_values > 0) & (raw_values < 901),
                raw_values >= 901 ]

choices     = [0.028132, -0.36385, 0.814805]

transformed_vars[col] = np.select(conditions, choices, default=np.nan)

In [None]:
# 4
# Fit an optimal binning of `shops_count_3M` against the target with a
# descending monotonic constraint; 0 is passed as a special code.
# NOTE(review): rebinding `optb` shadows the `optbinning` module alias.
Xt= df_temp['shops_count_3M']
yt = df_temp['target'].astype(int)

optb = OptimalBinning(name='shops_count_3M', dtype="numerical", max_n_prebins=3, monotonic_trend='descending',special_codes = [0])
optb.fit(Xt, yt)  

# Transformed training values; not referenced again in the visible cells.
Xt_binned = optb.transform(Xt)

# Binning table kept under its own name so the next cell can display it.
ob_shops_count_3M = optb.binning_table.build()

optb.binning_table.plot(metric="event_rate")

In [None]:
# Display the binning table built in the previous cell.
ob_shops_count_3M

In [None]:
# 4
# WoE-encode `shops_count_3M` with hard-coded cut points.
# NOTE(review): cut points and WoE values are presumably copied by hand from
# `ob_shops_count_3M`; re-check them whenever the binning is refit.
col         = 'shops_count_3M'

# Keep the columns encoded by earlier cells: only create the working frame
# (as an explicit copy) when it does not exist yet.
if 'transformed_vars' not in globals():
    transformed_vars = x_train[feat_list].copy()

# Zero-fill like the frame the binning was fitted on; a NaN would match no
# condition and fall through to the NaN default.
raw_values  = x_train[col].fillna(0)
conditions  = [ raw_values <= 0,
                (raw_values > 0) & (raw_values < 4.5),
                raw_values >= 4.5 ]

choices     = [0.226839, -0.347537, 0.018625]

transformed_vars[col] = np.select(conditions, choices, default=np.nan)

In [None]:
# 5
# Fit an optimal binning of `pos_amt_1M` against the target with a descending
# monotonic constraint (no special codes for this feature).
# NOTE(review): rebinding `optb` shadows the `optbinning` module alias.
Xt= df_temp['pos_amt_1M']
yt = df_temp['target'].astype(int)

optb = OptimalBinning(name='pos_amt_1M', dtype="numerical", max_n_prebins=4, monotonic_trend='descending')
optb.fit(Xt, yt)  

# Transformed training values; not referenced again in the visible cells.
Xt_binned = optb.transform(Xt)

# Binning table kept under its own name so the next cell can display it.
ob_pos_amt_1M = optb.binning_table.build()

optb.binning_table.plot(metric="event_rate")

In [None]:
# Display the binning table built in the previous cell.
ob_pos_amt_1M

In [None]:
# 5
# WoE-encode `pos_amt_1M` with a hard-coded cut point.
# NOTE(review): the cut point and WoE values are presumably copied by hand
# from `ob_pos_amt_1M`; re-check them whenever the binning is refit.
col         = 'pos_amt_1M'

# Keep the columns encoded by earlier cells: only create the working frame
# (as an explicit copy) when it does not exist yet.
if 'transformed_vars' not in globals():
    transformed_vars = x_train[feat_list].copy()

# Zero-fill like the frame the binning was fitted on; a NaN would match no
# condition and fall through to the NaN default.
raw_values  = x_train[col].fillna(0)
conditions  = [ raw_values < 1780,
                raw_values >= 1780 ]

choices     = [-0.102424, 0.586842]

transformed_vars[col] = np.select(conditions, choices, default=np.nan)

In [None]:
# 6
# Fit an optimal binning of `ratio_stddev_third_party_amt_3M_6M` against the
# target with a descending monotonic constraint; 0 is passed as a special code.
# NOTE(review): rebinding `optb` shadows the `optbinning` module alias.
Xt= df_temp['ratio_stddev_third_party_amt_3M_6M']
yt = df_temp['target'].astype(int)

optb = OptimalBinning(name='ratio_stddev_third_party_amt_3M_6M', dtype="numerical", max_n_prebins=4, monotonic_trend='descending',special_codes = [0])
optb.fit(Xt, yt)  

# Transformed training values; not referenced again in the visible cells.
Xt_binned = optb.transform(Xt)

# Binning table kept under its own name so the next cell can display it.
ob_ratio_stddev_third_party_amt_3M_6M = optb.binning_table.build()

optb.binning_table.plot(metric="event_rate")

In [None]:
# Display the binning table built in the previous cell.
ob_ratio_stddev_third_party_amt_3M_6M

In [None]:
# 6
# WoE-encode `ratio_stddev_third_party_amt_3M_6M` (zero / non-zero split).
# NOTE(review): the WoE values are presumably copied by hand from
# `ob_ratio_stddev_third_party_amt_3M_6M`; re-check them on every refit.
col         = 'ratio_stddev_third_party_amt_3M_6M'

# Keep the columns encoded by earlier cells: only create the working frame
# (as an explicit copy) when it does not exist yet.
if 'transformed_vars' not in globals():
    transformed_vars = x_train[feat_list].copy()

# Zero-fill like the frame the binning was fitted on; a NaN would match no
# condition and fall through to the NaN default.
raw_values  = x_train[col].fillna(0)
conditions  = [ raw_values <= 0,
                raw_values > 0 ]

choices     = [0.290734, -0.271068]

transformed_vars[col] = np.select(conditions, choices, default=np.nan)

In [None]:
# 7
# Fit an optimal binning of `ratio_pos_size_credit_size_1M` against the target
# with a descending monotonic constraint (no special codes for this feature).
# NOTE(review): rebinding `optb` shadows the `optbinning` module alias.
Xt= df_temp['ratio_pos_size_credit_size_1M']
yt = df_temp['target'].astype(int)

optb = OptimalBinning(name='ratio_pos_size_credit_size_1M', dtype="numerical", max_n_prebins=2, monotonic_trend='descending')
optb.fit(Xt, yt)  

# Transformed training values; not referenced again in the visible cells.
Xt_binned = optb.transform(Xt)

# Binning table kept under its own name so the next cell can display it.
ob_ratio_pos_size_credit_size_1M = optb.binning_table.build()

optb.binning_table.plot(metric="event_rate")

In [None]:
# Display the binning table built in the previous cell.
ob_ratio_pos_size_credit_size_1M

In [None]:
# 7
# WoE-encode `ratio_pos_size_credit_size_1M` with a hard-coded cut point.
# NOTE(review): the cut point and WoE values are presumably copied by hand
# from `ob_ratio_pos_size_credit_size_1M`; re-check them on every refit.
col         = 'ratio_pos_size_credit_size_1M'

# Keep the columns encoded by earlier cells: only create the working frame
# (as an explicit copy) when it does not exist yet.
if 'transformed_vars' not in globals():
    transformed_vars = x_train[feat_list].copy()

# Zero-fill like the frame the binning was fitted on; a NaN would match no
# condition and fall through to the NaN default.
raw_values  = x_train[col].fillna(0)
conditions  = [ raw_values < 0.27,
                raw_values >= 0.27 ]

choices     = [-0.129343, 0.326452]

transformed_vars[col] = np.select(conditions, choices, default=np.nan)

In [None]:
# 8
# Fit an optimal binning of `count_credits_lessthan_100_6M` against the target
# with a descending monotonic constraint; 0 is passed as a special code.
# NOTE(review): rebinding `optb` shadows the `optbinning` module alias.
Xt= df_temp['count_credits_lessthan_100_6M']
yt = df_temp['target'].astype(int)

optb = OptimalBinning(name='count_credits_lessthan_100_6M', dtype="numerical", max_n_prebins=2, monotonic_trend='descending',special_codes=[0])
optb.fit(Xt, yt)  

# Transformed training values; not referenced again in the visible cells.
Xt_binned = optb.transform(Xt)

# Binning table kept under its own name so the next cell can display it.
ob_count_credits_lessthan_100_6M = optb.binning_table.build()

optb.binning_table.plot(metric="event_rate")

In [None]:
# Display the binning table built in the previous cell.
ob_count_credits_lessthan_100_6M

In [None]:
# 8
# WoE-encode `count_credits_lessthan_100_6M` (zero / non-zero split).
# NOTE(review): the WoE values are presumably copied by hand from
# `ob_count_credits_lessthan_100_6M`; re-check them on every refit.
col         = 'count_credits_lessthan_100_6M'

# Keep the columns encoded by earlier cells: only create the working frame
# (as an explicit copy) when it does not exist yet.
if 'transformed_vars' not in globals():
    transformed_vars = x_train[feat_list].copy()

# Zero-fill like the frame the binning was fitted on; a NaN would match no
# condition and fall through to the NaN default.
raw_values  = x_train[col].fillna(0)
conditions  = [ raw_values <= 0,
                raw_values > 0 ]

choices     = [0.656801, -0.054845]

transformed_vars[col] = np.select(conditions, choices, default=np.nan)

In [None]:
# 9
# Fit an optimal binning of `count_credits_grt_500_6M` against the target with
# a descending monotonic constraint; 0 is passed as a special code.
# NOTE(review): rebinding `optb` shadows the `optbinning` module alias.
Xt= df_temp['count_credits_grt_500_6M']
yt = df_temp['target'].astype(int)

optb = OptimalBinning(name='count_credits_grt_500_6M', dtype="numerical", max_n_prebins=4, monotonic_trend='descending',special_codes=[0])
optb.fit(Xt, yt)  

# Transformed training values; not referenced again in the visible cells.
Xt_binned = optb.transform(Xt)

# Binning table kept under its own name so the next cell can display it.
ob_count_credits_grt_500_6M = optb.binning_table.build()

optb.binning_table.plot(metric="event_rate")

In [None]:
# Display the binning table built in the previous cell.
ob_count_credits_grt_500_6M

In [None]:
# 9
# WoE-encode `count_credits_grt_500_6M` with hard-coded cut points.
# NOTE(review): cut points and WoE values are presumably copied by hand from
# `ob_count_credits_grt_500_6M`; re-check them whenever the binning is refit.
col         = 'count_credits_grt_500_6M'

# Keep the columns encoded by earlier cells: only create the working frame
# (as an explicit copy) when it does not exist yet.
if 'transformed_vars' not in globals():
    transformed_vars = x_train[feat_list].copy()

# Zero-fill like the frame the binning was fitted on; a NaN would match no
# condition and fall through to the NaN default.
raw_values  = x_train[col].fillna(0)
conditions  = [ raw_values <= 0,
                (raw_values > 0) & (raw_values < 22.5),
                raw_values >= 22.5 ]

choices     = [-0.57149, -0.129657, 0.25218]

transformed_vars[col] = np.select(conditions, choices, default=np.nan)

In [None]:
# 10
# Fit an optimal binning of `ratio_credits_grt_1500_1M_3M` against the target
# with a descending monotonic constraint; 0 is passed as a special code.
# NOTE(review): rebinding `optb` shadows the `optbinning` module alias.
Xt= df_temp['ratio_credits_grt_1500_1M_3M']
yt = df_temp['target'].astype(int)

optb = OptimalBinning(name='ratio_credits_grt_1500_1M_3M', dtype="numerical", max_n_prebins=5, monotonic_trend='descending',special_codes = [0])
optb.fit(Xt, yt)  

# Transformed training values; not referenced again in the visible cells.
Xt_binned = optb.transform(Xt)

# Binning table kept under its own name so the next cell can display it.
ob_ratio_credits_grt_1500_1M_3M = optb.binning_table.build()

optb.binning_table.plot(metric="event_rate")

In [None]:
# Display the binning table built in the previous cell.
ob_ratio_credits_grt_1500_1M_3M

In [None]:
# 10
# WoE-encode `ratio_credits_grt_1500_1M_3M`.
# The hand-written cut points (22.5) and WoE values that used to live here
# were a verbatim copy of the `count_credits_grt_500_6M` cell and do not
# belong to this ratio feature. The correct values are not recorded in the
# notebook, so the encoding is taken from the binning fitted two cells above.
col         = 'ratio_credits_grt_1500_1M_3M'

# Keep the columns encoded by earlier cells: only create the working frame
# (as an explicit copy) when it does not exist yet.
if 'transformed_vars' not in globals():
    transformed_vars = x_train[feat_list].copy()

# Guard against out-of-order execution: `optb` must be the binning fitted for
# this very feature.
if getattr(optb, 'name', None) != col:
    raise RuntimeError("Fit the OptimalBinning cell for " + col + " before running this cell")

# Zero-fill like the frame the binning was fitted on. 0 is a special code for
# this feature, so request its empirical WoE rather than the default of 0.
raw_values  = x_train[col].fillna(0)
transformed_vars[col] = optb.transform(raw_values, metric="woe", metric_special="empirical")

In [None]:
# hyperparameters
# L2-penalised logistic regression with balanced class weights, seeded for
# reproducibility.
params_log_reg = {'penalty': 'l2',
                  'random_state': seed,
                  'solver': 'liblinear',
                  'class_weight': 'balanced'}

# model fit
# NOTE(review): the model is trained on df[feat_list] (zero-filled raw
# values); the WoE-encoded `transformed_vars` built above is not used here.
logreg_model = classification_models(df[feat_list], y_train, params_log_reg, models=['log_reg'])

In [None]:
# train cv scores
# 3-fold ROC-AUC on the training features via the project helper.
cv_scores = cross_validation(logreg_model, df[feat_list], y_train, scoring='roc_auc', folds=3, seed=seed)
print('CV Scores -',np.round(cv_scores, 2))
print('Mean of CV Scores -',np.round(np.mean(cv_scores),2))

In [None]:
# Feature importance
# Project helper; the frame it returns carries an `importance` column, which
# is used for the sort below.
feat_imp = feature_importance(logreg_model, df[feat_list], show_plot=True)

feat_imp.sort_values(by='importance', ascending=False)

In [None]:
## Test model


# reset index
# x_test.reset_index(drop=True, inplace=True)

# Restrict the test split to the selected features and zero-fill missing
# values, mirroring the treatment of the training frame. Chaining the two
# steps avoids an in-place fillna on a column subset, which raises
# SettingWithCopyWarning.
x_test = x_test[feat_list].fillna(0)
x_test.shape

In [None]:
# Confirm that the test frame now holds exactly the selected features.
x_test.columns

In [None]:
# test cv scores
# NOTE(review): this passes the *test* split to the cross-validation helper.
# If that helper refits the model per fold (not visible here), these scores
# describe models trained on test folds, not the model fitted above; confirm.
cv_scores = cross_validation(logreg_model, x_test[feat_list], y_test, scoring='roc_auc', folds=3, seed=seed)
print('CV Scores -',np.round(cv_scores, 2))
print('Mean of CV Scores -',np.round(np.mean(cv_scores),2))

In [None]:
## Model Evaluation - KS & ROC AUC

def ks(target=None, prob=None, n_bins=5):
    """Build a KS table from binary outcomes and predicted probabilities.

    Observations are cut into ``n_bins`` quantile buckets of ``prob`` and the
    buckets are listed from the highest probabilities to the lowest. The table
    and the maximum KS are printed and the table is returned.

    Parameters
    ----------
    target : array-like
        Binary outcomes (1 = event). Paired with ``prob`` by position.
    prob : array-like
        Predicted event probabilities, same length as ``target``.
    n_bins : int, default 5
        Number of quantile buckets.

    Returns
    -------
    pandas.DataFrame
        One row per bucket, indexed from 1 (index name ``Decile``), with
        columns ``min_prob``, ``max_prob``, ``events``, ``nonevents``,
        ``event_rate``, ``nonevent_rate``, ``cum_eventrate``,
        ``cum_noneventrate`` (percentage strings), ``KS`` and ``bad_rate``
        (both in percent).

    Raises
    ------
    ValueError
        From ``pandas.qcut`` when the quantile edges are not unique.
    """
    data = pd.DataFrame({
        'y': np.asarray(target, dtype=float),
        'p': np.asarray(prob, dtype=float),
    })
    data['y0'] = 1 - data['y']
    data['bucket'] = pd.qcut(data['p'], n_bins)

    # One pass over the buckets instead of re-aggregating for every column.
    kstable = (
        data.groupby('bucket', observed=False)
        .agg(min_prob=('p', 'min'), max_prob=('p', 'max'),
             events=('y', 'sum'), nonevents=('y0', 'sum'))
        .sort_values(by='min_prob', ascending=False)
        .reset_index(drop=True)
    )

    event_share = kstable['events'] / data['y'].sum()
    nonevent_share = kstable['nonevents'] / data['y0'].sum()
    cum_event = event_share.cumsum()
    cum_nonevent = nonevent_share.cumsum()

    # All four rate columns share the same two-decimal percentage format.
    kstable['event_rate'] = event_share.apply('{0:.2%}'.format)
    kstable['nonevent_rate'] = nonevent_share.apply('{0:.2%}'.format)
    kstable['cum_eventrate'] = cum_event.apply('{0:.2%}'.format)
    kstable['cum_noneventrate'] = cum_nonevent.apply('{0:.2%}'.format)
    kstable['KS'] = np.round(cum_event - cum_nonevent, 3) * 100
    kstable['bad_rate'] = (kstable['events'] / (kstable['events'] + kstable['nonevents'])) * 100

    # Number the rows from the table itself so any bucket count works.
    kstable.index = range(1, len(kstable) + 1)
    kstable.index.rename('Decile', inplace=True)

    # Show every column without changing the notebook-wide display settings.
    with pd.option_context('display.max_columns', None):
        print(kstable)

    # Display KS
    best_ks = kstable['KS'].max()
    print("KS is " + str(best_ks) + "%" + " at decile " + str(kstable.index[kstable['KS'] == best_ks][0]))
    return kstable

In [None]:
# predicted probability of the positive class (column 1 of predict_proba)
train_pred = logreg_model.predict_proba(df[feat_list])[:,1]

test_pred = logreg_model.predict_proba(x_test[feat_list])[:,1]


# KS tables for both splits (each call also prints its table).
train_ks = ks(y_train, train_pred)
test_ks = ks(y_test, test_pred)

from sklearn.metrics import  roc_auc_score

# ROC AUC on the training split, then on the test split.
print(roc_auc_score(y_train, train_pred))   
  
print(roc_auc_score(y_test, test_pred))    

In [None]:
import plotly.express as px
# copy df
# df_raw is the frame the train/test split was drawn from, so this
# distribution mixes training and test rows.
df_all = df_raw.copy()

# reset index
df_all.reset_index(drop=True, inplace=True)

# treat missing values
df_all = df_all.fillna(0)

# Score every row and plot the distribution of the positive-class probability.
predicted_probas = logreg_model.predict_proba(df_all[feat_list])
df_all['proba'] = predicted_probas[:,1:].flatten()
px.histogram(df_all['proba'], nbins=100)

In [None]:
# train
# NOTE: from here on x_train is rebound to the filtered, zero-filled frame
# `df` (which also carries the target column), and y_train is re-indexed in
# place.
x_train = df.copy()
x_train.reset_index(drop=True,inplace=True)
y_train.reset_index(drop=True,inplace=True)

df_train = x_train.copy()
df_train['target'] = y_train
df_train['proba'] = logreg_model.predict_proba(x_train[feat_list])[:,1:].flatten()

# Scores are rounded to 3 decimals before bucketing. q=4 produces quartiles,
# despite the `DecileRank` column name.
df_train['proba'] = np.round(df_train['proba'], 3)
df_train['DecileRank']= pd.qcut(df_train['proba'], q = 4)
# Mean target (rounded to 3 decimals) and row count per score bucket.
df_stats = pd.DataFrame(np.round(df_train.groupby(by='DecileRank')['target'].mean(),3))
df_stats['volume'] = df_train.groupby(by='DecileRank')['target'].count()
df_stats

In [None]:
# test
# Both test objects are re-indexed in place so the target assignment below
# aligns with the feature rows.
x_test.reset_index(drop=True,inplace=True)
y_test.reset_index(drop=True,inplace=True)

df_test = x_test.copy()
df_test['target'] = y_test
df_test['proba'] = logreg_model.predict_proba(x_test[feat_list])[:,1:].flatten()

# Scores are rounded to 3 decimals before bucketing. q=4 produces quartiles,
# despite the `DecileRank` column name.
df_test['proba'] = np.round(df_test['proba'], 3)
df_test['DecileRank']= pd.qcut(df_test['proba'], q = 4)
# Mean target and row count per score bucket. NOTE(review): unlike the
# training table, the mean is not rounded here.
df_stats = pd.DataFrame(df_test.groupby(by='DecileRank')['target'].mean())
df_stats['volume'] = df_test.groupby(by='DecileRank')['target'].count()
df_stats