## One hot encoding

In [1]:
# sample code
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model, metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import scorecardpy as sc
import warnings

In [None]:
## One-hot encode the categorical columns; edit `columns` to match your data.
# drop_first=False is passed, so no category level is dropped from the dummies.
merged_working = pd.get_dummies(
    merged_working,
    drop_first=False,
    columns=['REASON', "JOB"],
)

## Fine Classing

In [None]:
# sample code
bin_count = 20
variable = 'CLAGE'

# Work on an explicit copy restricted to the variable and the target.
# Copying the *subset* (rather than subsetting a copy) means the column
# assignments below write to an independent frame, not to a slice.
hmeq_data_working = hmeq_data[[variable, 'BAD']].copy()

# create equal frequency bin ranges using qcut
hmeq_data_working['Bin_Range'] = pd.qcut(hmeq_data_working[variable], q=bin_count, precision=0)

# pd.qcut leaves rows with a missing value as NaN.
# Cast to object so a string label can be stored, then put those rows in
# their own 'Missing' bin. The result is assigned back instead of using
# `df[col].fillna(..., inplace=True)`, which is a chained in-place update
# and does not modify the frame when pandas Copy-on-Write is active.
hmeq_data_working['Bin_Range'] = (
    hmeq_data_working['Bin_Range'].astype('object').fillna('Missing')
)

# check the counts in each bin
hmeq_data_working.groupby('Bin_Range').count()

### Calculate WOE and IV for a classed (binned) variable
This function is required by the `woe_iv_plot` function defined in the next section.
- `woe_iv(data, variable_name)` --> calculates the IV and WOE of the variable

In [None]:
def woe_iv(data, variable_name):
    """Compute the per-bin Weight of Evidence (WOE) and the total Information Value (IV).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Bin_Range' column (the bin each row belongs to) and a
        'BAD' column. NaN entries in 'Bin_Range' are grouped into a bin
        labelled 'Missing'. 'BAD' is summed to count events, so it is assumed
        to be a 0/1 flag -- TODO confirm against the dataset.
    variable_name : str
        Name of the binned variable. It is not used in the calculation.

    Returns
    -------
    tuple
        ``(IV, variable_data)`` where ``IV`` is the sum of the per-bin IV
        contributions and ``variable_data`` is a DataFrame with one row per
        bin and the columns 'Bin_Range', 'Count', 'Events', 'Non_Events',
        '%_of_Events', '%_of_Non_Events', 'WOE' and 'IV'.

    Notes
    -----
    The input frame is not modified; all work happens on a copy.
    A bin with zero events or zero non-events produces an infinite WOE
    (log of a division by zero / log of zero), which propagates into IV.
    """
    working_data = data.copy()

    # Rows that fell outside every bin (NaN from pd.qcut) get their own
    # 'Missing' bin. The result is assigned back to the column: calling
    # fillna(..., inplace=True) on `frame[col]` is a chained in-place update
    # that leaves the frame untouched under pandas Copy-on-Write, which would
    # make groupby silently drop the missing rows.
    working_data['Bin_Range'] = working_data['Bin_Range'].astype('object').fillna('Missing')

    # One pass over the groups: number of non-null targets and number of events.
    variable_data = (
        working_data.groupby('Bin_Range')
        .agg(Count=('BAD', 'count'), Events=('BAD', 'sum'))
        .reset_index()
    )

    variable_data['Non_Events'] = variable_data['Count'] - variable_data['Events']

    # Share of all events / non-events that land in each bin.
    variable_data['%_of_Events'] = variable_data['Events'] / variable_data['Events'].sum()
    variable_data['%_of_Non_Events'] = variable_data['Non_Events'] / variable_data['Non_Events'].sum()

    variable_data['WOE'] = np.log(variable_data['%_of_Non_Events'] / variable_data['%_of_Events'])
    variable_data['IV'] = (
        variable_data['%_of_Non_Events'] - variable_data['%_of_Events']
    ) * variable_data['WOE']

    # skipna=False keeps a NaN contribution visible in the total instead of hiding it.
    IV = variable_data['IV'].sum(skipna=False)
    return (IV, variable_data)

**Create function to calculate WOE, IV and plot IV**

Combine the code above into a parameterised function that you can use going forward to print IV rounded to 4 decimal places and plot WOE.

In [None]:
# sample code
# create IV calc and WOE plotting function
def woe_iv_plot(data, variable_name):
    """Print the IV of a binned variable and plot its WOE over the bin counts.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame passed straight to ``woe_iv``; see that function for the
        columns it reads ('Bin_Range' and 'BAD').
    variable_name : str
        Used in the printed message and in the plot title.

    Returns
    -------
    None
        The function prints one line and shows a matplotlib figure.

    Notes
    -----
    WOE is drawn as a line on the left axis and the bin counts as bars on a
    twin right axis. The 'Missing' bin, if any, is left out of the line and
    drawn as a separate point. It is located by its label rather than
    assumed to be the last row, so the plot stays correct when the bin labels
    sort 'Missing' somewhere other than last.
    """
    IV, variable_data = woe_iv(data, variable_name)
    print('IV for', variable_name, 'with', variable_data.shape[0], 'bins:', round(IV,4))
    fig, ax1 = plt.subplots(figsize=(20,6))

    # Rows of the WOE table that belong to the 'Missing' bin.
    is_missing = variable_data['Bin_Range'].isin(['Missing'])

    # Line through the regular bins only. The Series keeps its original index,
    # which is used as the x position and lines up with the bar positions
    # (this relies on woe_iv returning a 0..n-1 index).
    sns.lineplot(data = variable_data.loc[~is_missing, 'WOE'], marker='o', sort = False, ax=ax1)
    plt.xticks(rotation=45)

    # Separate, unconnected point for the 'Missing' bin at its own position.
    if is_missing.any():
        missing_woe = variable_data.loc[is_missing, 'WOE']
        sns.scatterplot(x=missing_woe.index.tolist(), y=missing_woe.tolist(), color='b', s=30, ax=ax1)

    ax2 = ax1.twinx()

    # plot the bin counts
    sns.barplot(x = variable_data['Bin_Range'] , y = variable_data['Count'], alpha=0.5, ax = ax2, color = 'deepskyblue')
    plt.title("WOE plot for " + variable_name)
    plt.show()

### Use the function to screen variables and drop those with low IV

In [None]:
## EXAMPLE OF HOW TO USE THE FUNCTION

# your code here
large_val_numeric = ['LOAN', 'MORTDUE', 'CLAGE', 'DEBTINC']

# Obtain the IV (and WOE plot) for each of the variables.
for val in large_val_numeric:
    # Independent copy of just this variable and the target, selected with a
    # list of column names, so the assignment below does not write to a slice
    # of hmeq_data.
    woe_bin_data = hmeq_data[[val, 'BAD']].copy()

    # 20 equal-frequency bins; rows with a missing value come back as NaN and
    # are relabelled 'Missing'. The result is assigned back rather than
    # filled with inplace=True on a column selection, which is a chained
    # update that has no effect under pandas Copy-on-Write.
    woe_bin_data['Bin_Range'] = (
        pd.qcut(woe_bin_data[val], q=20).astype('object').fillna('Missing')
    )
    woe_iv_plot(woe_bin_data, val)


## Coarse Classing

### To calculate WOE and IV for class variable
- woe_iv(data, variable_name) --> To calculate IV and WOE of the variable

In [None]:
def woe_iv(data, variable_name):
    """Compute the per-bin Weight of Evidence (WOE) and the total Information Value (IV).

    NOTE(review): a function with this name is also defined in the Fine
    Classing section of this notebook; this later definition replaces it when
    the cell runs. Consider keeping a single definition.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Bin_Range' column (the bin each row belongs to) and a
        'BAD' column. NaN entries in 'Bin_Range' are grouped into a bin
        labelled 'Missing'. 'BAD' is summed to count events, so it is assumed
        to be a 0/1 flag -- TODO confirm against the dataset.
    variable_name : str
        Name of the binned variable. It is not used in the calculation.

    Returns
    -------
    tuple
        ``(IV, variable_data)`` where ``IV`` is the sum of the per-bin IV
        contributions and ``variable_data`` is a DataFrame with one row per
        bin and the columns 'Bin_Range', 'Count', 'Events', 'Non_Events',
        '%_of_Events', '%_of_Non_Events', 'WOE' and 'IV'.

    Notes
    -----
    The input frame is not modified; all work happens on a copy.
    A bin with zero events or zero non-events produces an infinite WOE
    (log of a division by zero / log of zero), which propagates into IV.
    """
    working_data = data.copy()

    # Rows that fell outside every bin (NaN from pd.qcut) get their own
    # 'Missing' bin. The result is assigned back to the column: calling
    # fillna(..., inplace=True) on `frame[col]` is a chained in-place update
    # that leaves the frame untouched under pandas Copy-on-Write, which would
    # make groupby silently drop the missing rows.
    working_data['Bin_Range'] = working_data['Bin_Range'].astype('object').fillna('Missing')

    # One pass over the groups: number of non-null targets and number of events.
    variable_data = (
        working_data.groupby('Bin_Range')
        .agg(Count=('BAD', 'count'), Events=('BAD', 'sum'))
        .reset_index()
    )

    variable_data['Non_Events'] = variable_data['Count'] - variable_data['Events']

    # Share of all events / non-events that land in each bin.
    variable_data['%_of_Events'] = variable_data['Events'] / variable_data['Events'].sum()
    variable_data['%_of_Non_Events'] = variable_data['Non_Events'] / variable_data['Non_Events'].sum()

    variable_data['WOE'] = np.log(variable_data['%_of_Non_Events'] / variable_data['%_of_Events'])
    variable_data['IV'] = (
        variable_data['%_of_Non_Events'] - variable_data['%_of_Events']
    ) * variable_data['WOE']

    # skipna=False keeps a NaN contribution visible in the total instead of hiding it.
    IV = variable_data['IV'].sum(skipna=False)
    return (IV, variable_data)

## Train test split

In [None]:
# Target is the 'BAD' column; every other column is a feature.
y = merged_working['BAD']
X = merged_working.drop(columns='BAD')

# 70/30 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=100,
    test_size=0.3,
)

## Logistic Regression without WOE
- Included here in case you find it relevant.

In [None]:
# sample code

# create a logistic regression model and fit the training data
logreg = LogisticRegression(solver = 'liblinear', class_weight = 'balanced')
logreg.fit(X_train, y_train)

# Intercept followed by the per-feature coefficients, built once as a single array.
coeff = np.concatenate([logreg.intercept_, logreg.coef_.reshape(-1)])

# Show the intercept and coefficients as a dataframe.
# display() is required here: a bare expression that is not the last line of
# a cell produces no output, so the table was previously never shown.
df_coeff = pd.DataFrame({'Variable': ['Intercept'] + X_train.columns.tolist(), 'coefficient': coeff})
display(df_coeff)

# check the accuracy
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Testing accuracy is {accuracy:.4%}')

# share of test rows predicted as class 1 (sum of predictions over the row count)
p_bad = y_pred.sum() / y_test.shape[0]
print(f'Percentage of bads predicted in the test data is {p_bad:.2%}')

## Logistic Regression with WOE (using scorecardpy)

In [None]:

# sample code
# 70% train / 30% test partition of the raw data
train, test = sc.split_df(hmeq_data, y='BAD', ratio=0.7).values()
for partition in (train, test):
    print(partition.shape)

## Generate WOE bins
# sc.woebin works out the bin ranges automatically; `bins` is a dictionary.
# Library warnings are silenced only for the duration of the call.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    bins = sc.woebin(train, y='BAD')

# Show the binning table stored under each key, with a separator line after each.
for name in bins:
    print(name, " : ")
    display(bins[name])
    print("--"*50)



In [None]:
## Generate LR with WOE encoding
# Apply the bins to the train and test partitions (train first, then test),
# silencing library warnings while doing so.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    train_woe, test_woe = (sc.woebin_ply(frame, bins) for frame in (train, test))
train_woe

In [None]:
from sklearn import linear_model, metrics
# sample code

# create the X, y parts of data for train and test
y_train = train_woe['BAD']
X_train = train_woe.drop(columns='BAD')
y_test = test_woe['BAD']
# Select the test features by the training feature names. The previous
# version masked test_woe with a boolean array built from train_woe.columns,
# which is only correct when both frames happen to share the same column order.
X_test = test_woe[X_train.columns]

# create a logistic regression model object and fit it on the WOE-encoded training data
lr = linear_model.LogisticRegression()
lr.fit(X_train, y_train)
print(lr.coef_)
print(lr.intercept_)

# intercept and coefficients as one labelled Series (displayed as the cell output)
pd.Series(np.concatenate([lr.intercept_, lr.coef_[0]]), index = np.concatenate([['intercept'], lr.feature_names_in_]) )

## Generate scorecard

In [None]:
import pprint
# sample code

# Build the scorecard from:
#   bins - the bins created by sc.woebin
#   lr   - the fitted logistic regression model
# The scores will be based on probability of default from the model.
# This variant passes basepoints_eq0=True.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    card = sc.scorecard(
        bins,
        lr,
        X_train.columns,
        basepoints_eq0=True,
        pdo=20,
        odds0=1/20,
        points0=600,
    )

pprint.pprint(card)

## Generate scorecard with basepoints

In [None]:
# sample code

# Build the scorecard from:
#   bins - the bins created by sc.woebin
#   lr   - the fitted logistic regression model
# The scores will be based on probability of default from the model.
# This variant passes basepoints_eq0=False.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    card = sc.scorecard(
        bins,
        lr,
        X_train.columns,
        basepoints_eq0=False,
        pdo=20,
        odds0=1/20,
        points0=600,
    )

pprint.pprint(card)

## Score a new application

In [None]:
# sample code

# Calculate the credit score for a new application (a single-row frame).
# NOTE(review): MORTDUE is used as an hmeq_data column elsewhere in this
# notebook but is not supplied here -- confirm the scorecard does not need it.
col = ['LOAN','VALUE','REASON','JOB','YOJ','DEROG','DELINQ','CLAGE','NINQ','CLNO','DEBTINC']
val = [[88900,57264,'DebtCon','Other',16.0,0.0,0.0,221.8,0.0,16.0,36.1]]
new_appl = pd.DataFrame(val, columns = col)

# Per-variable points, transposed so that each variable becomes a row.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    new_appl_score = sc.scorecard_ply(new_appl, card, only_total_score = False).T

# Strip the '_points' suffix so the row labels line up with the input column names.
new_appl_score.index = new_appl_score.index.str.replace('_points', '')

# Application values and awarded points side by side, aligned on the row labels.
summary = pd.concat([new_appl.T, new_appl_score], axis=1)
summary.columns = ['App Value', 'Points']
print(summary)

## To score train and test data
Examine the distribution of the scores

In [None]:
# sample code

# Credit scores for every sample in the train and test partitions.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    train_score = sc.scorecard_ply(train, card)
    test_score = sc.scorecard_ply(test, card)

# Two stacked histograms sharing the x axis: train on top, test below.
fig, ax = plt.subplots(2, 1, figsize = (7, 8), sharex = True)
panels = (
    (ax[0], train_score, 'train data scores'),
    (ax[1], test_score, 'test data scores'),
)
for axis, scores, title in panels:
    scores.hist(figsize = (7, 4), bins = 60, ax = axis)
    axis.set_title(title, fontsize = 9)
plt.tight_layout()

## Evaluate model performance
**Calculate Percentage Correctly Classified measures on LR model**

Using `predict` function to generate predictions based on 50% probability threshold

In [None]:
# sample code
# Class predictions for the test set from the fitted model `lr`.
y_pred = lr.predict(X_test)

# Print each evaluation under its heading: confusion matrix first, then the
# classification report.
evaluations = (
    ('Confusion matrix:', metrics.confusion_matrix),
    ('PCC measures:', metrics.classification_report),
)
for heading, evaluate in evaluations:
    print(heading)
    print(evaluate(y_test, y_pred))

## Performance of LR with ROC 

In [None]:
# sample code

# Predicted probability of the second class (column 1 of predict_proba)
# for the train and test features.
train_pred, test_pred = (
    lr.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

# ROC performance on each partition
train_perf = sc.perf_eva(y_train, train_pred, title = 'train', plot_type = ['roc'])
test_perf = sc.perf_eva(y_test, test_pred, title = 'test', plot_type = ['roc'])

In [2]:
## TODO: add code for the Population Stability Index (PSI) -- none could be found