In [419]:
#import libraries for analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
sns.set()

In [420]:
# Read the training and test CSV files from the working directory.
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ('cs-training.csv', 'cs-test.csv')
)

In [421]:
# Remove the first column of the training frame (selected by position) and
# the 'Unnamed: 0' column of the test frame (selected by label).
# Caution: the positional drop is not idempotent -- running this cell again
# removes one more column from df_train each time.
df_train.drop(columns=df_train.columns[0], inplace=True)
df_test.drop(columns='Unnamed: 0', inplace=True)

In [422]:
# Column names of the training frame, in order, as a plain list.
features = list(df_train.columns)

In [423]:
# Number of missing values per column of the training frame.
df_train.isna().sum()

SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64

In [424]:
# Number of missing values per column of the test frame.
df_test.isna().sum()

SeriousDlqin2yrs                        101503
RevolvingUtilizationOfUnsecuredLines         0
age                                          0
NumberOfTime30-59DaysPastDueNotWorse         0
DebtRatio                                    0
MonthlyIncome                            20103
NumberOfOpenCreditLinesAndLoans              0
NumberOfTimes90DaysLate                      0
NumberRealEstateLoansOrLines                 0
NumberOfTime60-89DaysPastDueNotWorse         0
NumberOfDependents                        2626
dtype: int64

In [425]:
# Impute missing values, each frame from its own statistics:
#   - MonthlyIncome      -> mean of the observed values
#   - NumberOfDependents -> most frequent observed value
#
# The filled column is assigned back instead of calling
# `df[col].fillna(..., inplace=True)`: that chained form operates on a
# temporary object under pandas Copy-on-Write and leaves the frame unchanged.
#
# `idxmax()` returns the most frequent *value*. `argmax()` returns its
# position inside the counts table on current pandas, which is always 0
# because `value_counts()` sorts by descending frequency.
for frame in (df_train, df_test):
    frame['MonthlyIncome'] = frame['MonthlyIncome'].fillna(frame['MonthlyIncome'].mean())
    most_common_dependents = frame['NumberOfDependents'].value_counts().idxmax()
    frame['NumberOfDependents'] = frame['NumberOfDependents'].fillna(most_common_dependents)

In [426]:
def univariate_analysis(data, feature, rotation=0):
    """Plot the distribution of one feature and return its summary statistics.

    Draws two panels side by side: a histogram with a rug plot (left) and a
    horizontal box-plot (right).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the feature.
    feature : str
        Name of the column to analyse.
    rotation : float, default 0
        Rotation, in degrees, applied to the x tick labels of both panels.

    Returns
    -------
    pandas.DataFrame
        One-row frame holding the output of ``describe()`` for the feature.
    """
    _, axes = plt.subplots(figsize=(10, 4), ncols=2, nrows=1)
    # NOTE(review): distplot is deprecated since seaborn 0.11 (histplot and
    # rugplot are its replacements); kept so older installs keep working.
    sns.distplot(data[feature], kde=False, rug=True, ax=axes[0])
    # The column is passed by keyword: recent seaborn releases take `data` as
    # the first positional parameter, so a positional column name would clash
    # with the explicit `data=` argument.
    sns.boxplot(x=feature, orient='h', data=data, ax=axes[1])

    # A distinct loop name keeps the array of axes from being rebound.
    for axis in axes:
        for tick in axis.get_xticklabels():
            tick.set_rotation(rotation)

    return data[feature].describe().to_frame().T

In [427]:
def woe(data, feature, target, bins=10):
    """Build the weight-of-evidence / information-value table of one feature.

    A numeric feature with more than 10 distinct values is first cut into
    ``bins`` quantile bins (duplicate edges are dropped); any other feature is
    grouped on its raw values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the feature and the target.
    feature : str
        Column to analyse.
    target : str
        Column whose sum per group is counted as the number of events.
    bins : int, default 10
        Number of quantile bins requested for a binned feature.

    Returns
    -------
    pandas.DataFrame
        One row per group with the columns ``Variable``, ``Cutoff``, ``N``,
        ``Events``, ``% of Events``, ``Non-Events``, ``% of Non-Events``,
        ``WoE`` and ``IV``.
    """
    column = data[feature]
    # The distinct-value count is only evaluated for numeric dtypes.
    if column.dtype.kind in 'bifc' and len(np.unique(column)) > 10:
        x_values = pd.qcut(column, bins, duplicates='drop')
    else:
        x_values = column

    pairs = pd.DataFrame({'x': x_values, 'y': data[target]})
    table = pairs.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
    table.columns = ['Cutoff', 'N', 'Events']

    events = table['Events']
    non_events = table['N'] - events

    # A floor of 0.5 keeps empty groups from producing log(0) or a division by zero.
    table['% of Events'] = np.maximum(events, 0.5) / events.sum()
    table['Non-Events'] = non_events
    table['% of Non-Events'] = np.maximum(non_events, 0.5) / non_events.sum()

    share_gap = table['% of Non-Events'] - table['% of Events']
    table['WoE'] = np.log(table['% of Non-Events'] / table['% of Events'])
    table['IV'] = table['WoE'] * share_gap

    table.insert(loc=0, column='Variable', value=feature)
    return table

In [428]:
# WoE/IV tables computed with woe() for the features at positions 1, 2, 4, 5
# and 6 of `features`, each against the target stored at position 0.
woe_1, woe_2, woe_4, woe_5, woe_6 = (
    woe(df_train, features[col], features[0], 10) for col in (1, 2, 4, 5, 6)
)

In [429]:
#weight of evidence for feature 3
# Hand-picked right-closed bins. The second bin starts at 0 (it used to start
# at 0.1) so the bins are contiguous: a value in (0, 0.1] previously matched
# no bin and was dropped from the table by the groupby.
bins = pd.IntervalIndex.from_tuples(
    [(-0.001, 0), (0, 1), (1, 2), (2, 3), (3, 5), (5, 9), (9, 13), (13, 96), (96, 98)]
)
binned_x = pd.cut(df_train[features[3]], bins = bins, precision = 0, right = True)
d0 = pd.DataFrame({'x': binned_x, 'y': df_train[features[0]]})

# Per bin: number of rows (N) and sum of the target (Events).
past_due = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
past_due.columns = ['Cutoff', 'N', 'Events']

# Shares are floored at 0.5 counts so an empty bin cannot yield log(0).
past_due['% of Events'] = np.maximum(past_due['Events'], 0.5) / past_due['Events'].sum()
past_due['Non-Events'] = past_due['N'] - past_due['Events']
past_due['% of Non-Events'] = np.maximum(past_due['Non-Events'], 0.5) / past_due['Non-Events'].sum()
past_due['WoE'] = np.log(past_due['% of Non-Events'] / past_due['% of Events'])
past_due['IV'] = past_due['WoE'] * (past_due['% of Non-Events'] - past_due['% of Events'])
past_due.insert(loc=0, column='Variable', value=features[3])

In [430]:
#weight of evidence for feature 7
# Hand-picked right-closed bins. The second bin starts at 0 (it used to start
# at 0.1) so the bins are contiguous: a value in (0, 0.1] previously matched
# no bin and was dropped from the table by the groupby.
bins = pd.IntervalIndex.from_tuples(
    [(-0.001, 0), (0, 2), (2, 5), (5, 9), (9, 17), (17, 96), (96, 98)]
)
binned_x = pd.cut(df_train[features[7]], bins = bins, precision = 0, right = True)
d0 = pd.DataFrame({'x': binned_x, 'y': df_train[features[0]]})

# Per bin: number of rows (N) and sum of the target (Events).
d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
d.columns = ['Cutoff', 'N', 'Events']

# Shares are floored at 0.5 counts so an empty bin cannot yield log(0).
d['% of Events'] = np.maximum(d['Events'], 0.5) / d['Events'].sum()
d['Non-Events'] = d['N'] - d['Events']
d['% of Non-Events'] = np.maximum(d['Non-Events'], 0.5) / d['Non-Events'].sum()
d['WoE'] = np.log(d['% of Non-Events'] / d['% of Events'])
d['IV'] = d['WoE'] * (d['% of Non-Events'] - d['% of Events'])
d.insert(loc=0, column='Variable', value=features[7])

In [431]:
#weight of evidence for feature 8
# Hand-picked bins for the feature at position 8 of `features`.
bins = pd.IntervalIndex.from_tuples(
    [(-0.001, 0), (0, 1), (1, 2), (2, 4), (4, 7), (7, 9), (9, 12), (12, 54)]
)
binned_x = pd.cut(df_train[features[8]], bins=bins, precision=0, right=True)
d0 = pd.DataFrame({'x': binned_x, 'y': df_train[features[0]]})

# Per bin: number of rows (N) and sum of the target (Events).
real_estate = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
real_estate.columns = ['Cutoff', 'N', 'Events']

# Derived columns are added in order; each one may read the ones before it.
real_estate = real_estate.assign(**{
    '% of Events': lambda t: np.maximum(t['Events'], 0.5) / t['Events'].sum(),
    'Non-Events': lambda t: t['N'] - t['Events'],
    '% of Non-Events': lambda t: np.maximum(t['Non-Events'], 0.5) / t['Non-Events'].sum(),
    'WoE': lambda t: np.log(t['% of Non-Events'] / t['% of Events']),
    'IV': lambda t: t['WoE'] * (t['% of Non-Events'] - t['% of Events']),
})
real_estate.insert(loc=0, column='Variable', value=features[8])

In [432]:
#weight of evidence for feature 9
# Hand-picked bins for the feature at position 9 of `features`.
groups = pd.IntervalIndex.from_tuples(
    [(-0.001, 0), (0, 1), (1, 2), (2, 4), (4, 6), (6, 9), (9, 12), (12, 96), (96, 99)]
)
binned_x = pd.cut(df_train[features[9]], bins=groups, precision=0, right=True)
d0 = pd.DataFrame({'x': binned_x, 'y': df_train[features[0]]})

# Per bin: number of rows (N) and sum of the target (Events).
past_due_90 = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
past_due_90.columns = ['Cutoff', 'N', 'Events']

# Derived columns are added in order; each one may read the ones before it.
past_due_90 = past_due_90.assign(**{
    '% of Events': lambda t: np.maximum(t['Events'], 0.5) / t['Events'].sum(),
    'Non-Events': lambda t: t['N'] - t['Events'],
    '% of Non-Events': lambda t: np.maximum(t['Non-Events'], 0.5) / t['Non-Events'].sum(),
    'WoE': lambda t: np.log(t['% of Non-Events'] / t['% of Events']),
    'IV': lambda t: t['WoE'] * (t['% of Non-Events'] - t['% of Events']),
})
past_due_90.insert(loc=0, column='Variable', value=features[9])

In [433]:
#weight of evidence for feature 10
# Hand-picked bins for the feature at position 10 of `features`.
cats = pd.IntervalIndex.from_tuples(
    [(-0.001, 0), (0, 2), (2, 3), (3, 4), (4, 5), (5, 8), (8, 43)]
)
binned_x = pd.cut(df_train[features[10]], bins=cats, precision=0, right=True)
d0 = pd.DataFrame({'x': binned_x, 'y': df_train[features[0]]})

# Per bin: number of rows (N) and sum of the target (Events).
dep = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
dep.columns = ['Cutoff', 'N', 'Events']

# Derived columns are added in order; each one may read the ones before it.
dep = dep.assign(**{
    '% of Events': lambda t: np.maximum(t['Events'], 0.5) / t['Events'].sum(),
    'Non-Events': lambda t: t['N'] - t['Events'],
    '% of Non-Events': lambda t: np.maximum(t['Non-Events'], 0.5) / t['Non-Events'].sum(),
    'WoE': lambda t: np.log(t['% of Non-Events'] / t['% of Events']),
    'IV': lambda t: t['WoE'] * (t['% of Non-Events'] - t['% of Events']),
})
dep.insert(loc=0, column='Variable', value=features[10])

In [434]:
# Every entry of `features` except the first one.
var = features[1:]

In [441]:
# One column of bin intervals ('Cutoff') per entry of `var`, in the same order.
cutoff_tables = [woe_1, woe_2, past_due, woe_4, woe_5, woe_6, d, real_estate, past_due_90, dep]

df_bins = pd.DataFrame()
for predictor, table in zip(var, cutoff_tables):
    # Assignment aligns on the row index, so a table with fewer bins than the
    # first one leaves NaN in its trailing rows.
    df_bins[predictor] = table['Cutoff']

# Pad those trailing gaps with the last real bin of each column. A forward
# fill replaces the hard-coded row labels and the chained
# `fillna(..., inplace=True)` calls, which modify a temporary object (and so
# do nothing) under pandas Copy-on-Write.
df_bins = df_bins.ffill()

In [442]:
# One column of weight-of-evidence values ('WoE') per entry of `var`, in the same order.
woe_tables = [woe_1, woe_2, past_due, woe_4, woe_5, woe_6, d, real_estate, past_due_90, dep]

df_woe = pd.DataFrame()
for predictor, table in zip(var, woe_tables):
    # Assignment aligns on the row index, so a table with fewer bins than the
    # first one leaves NaN in its trailing rows.
    df_woe[predictor] = table['WoE']

# Pad those trailing gaps with the last real WoE of each column. A forward
# fill replaces the hard-coded row labels and the `iloc[...].fillna(...,
# inplace=True)` calls, which modify a temporary object (and so do nothing)
# under pandas Copy-on-Write.
df_woe = df_woe.ffill()

In [403]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [404]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier, limited to 1000 solver iterations.
clf = LogisticRegression(max_iter = 1000)

In [405]:
# Fit with every column after the first as predictors and the first column as the target.
# NOTE(review): as far as this notebook shows, df_train still holds the raw
# (imputed) feature values here, while score() multiplies the resulting
# coefficients by WoE values -- confirm whether the model should be trained on
# WoE-encoded features instead.
clf.fit(np.array(df_train.iloc[:, 1:]), np.array(df_train.iloc[:, 0]))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [406]:
# First row of the fitted coefficient matrix.
coef = clf.coef_[0]

In [407]:
# First entry of the fitted intercept array.
intercept = clf.intercept_[0]

In [408]:
# Scaling constants of the scorecard, named instead of inlined.
PDO = 20           # factor * ln(2) == PDO: doubling the odds moves the score by 20 points
BASE_SCORE = 600   # offset + factor * ln(BASE_ODDS) == BASE_SCORE
BASE_ODDS = 50

factor = PDO / np.log(2)
offset = BASE_SCORE - factor * np.log(BASE_ODDS)
print('factor:', round(factor, 2),'| offset', round(offset, 2))

factor: 28.85 | offset 487.12


In [409]:
def score(data, feature, position, groups, woe, coef, intercept, factor, offset):
    """Return the scorecard points one feature contributes for one row.

    The row's value is mapped to the first bin of ``groups[feature]`` that
    contains it, the weight of evidence at the same row of ``woe[feature]``
    is looked up, and the points are computed as::

        (coef_i * WoE + intercept / n) * factor + offset / n

    where ``n = len(coef)`` and ``coef_i`` is the coefficient of ``feature``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to score. Its first column is skipped when the coefficient of
        ``feature`` is located, so ``coef`` must follow the order of the
        remaining columns.
    feature : str
        Column of ``data`` (and of ``groups`` / ``woe``) to score.
    position : hashable
        Index label of the row of ``data`` to score.
    groups : pandas.DataFrame
        Bin intervals per feature (objects supporting ``value in bin`` and
        exposing ``left`` / ``right``, such as ``pandas.Interval``).
    woe : pandas.DataFrame
        WoE values per feature, aligned row for row with ``groups``.
    coef : sequence of float
        Model coefficients, one per predictor.
    intercept, factor, offset : float
        Model intercept and scorecard scaling constants.

    Returns
    -------
    float
        Points contributed by ``feature`` for the selected row.

    Raises
    ------
    IndexError
        If the value lies in no bin and is not beyond the outer bin edges
        (for example a missing value or a value inside a gap between bins).
    """
    coef_index = data.columns.to_list()[1:].index(feature)
    value = data[feature][position]
    intervals = groups[feature].to_list()

    # First bin that contains the value; padded duplicate bins share one WoE.
    bin_index = next(
        (i for i, interval in enumerate(intervals) if value in interval),
        None,
    )

    if bin_index is None:
        # Values outside the range covered by the bins are assigned to the
        # outermost bin on their side instead of failing on an empty match.
        lowest = min(range(len(intervals)), key=lambda i: intervals[i].left)
        highest = max(range(len(intervals)), key=lambda i: intervals[i].right)
        if value <= intervals[lowest].left:
            bin_index = lowest
        elif value > intervals[highest].right:
            bin_index = highest
        else:
            raise IndexError(
                "value {!r} of feature {!r} at row {!r} falls in no bin".format(
                    value, feature, position
                )
            )

    weight = woe[feature][bin_index]
    n_coefficients = len(coef)
    points = (coef[coef_index] * weight + intercept / n_coefficients) * factor
    return points + offset / n_coefficients

In [410]:
# Points contributed by the first entry of `var` for the test-frame row labelled 0.
score(df_test, var[0], 0, df_bins, df_woe, coef, intercept, factor, offset)

47.01115929410097

In [447]:
# Points contributed by every entry of `var` for the test-frame row labelled 0.
lst = [
    score(df_test, feature, 0, df_bins, df_woe, coef, intercept, factor, offset)
    for feature in var
]