In [603]:
#import libraries for analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
sns.set()

In [604]:
# Read the training and test CSVs (paths are relative to the working directory).
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ('cs-training.csv', 'cs-test.csv')
)

In [605]:
# Remove the first column of the training frame (selected by position) and the
# 'Unnamed: 0' column of the test frame (selected by name).
# NOTE(review): the positional drop is not idempotent - re-running this cell
# removes one more column from df_train each time.
df_train = df_train.drop(columns=df_train.columns[0])
df_test = df_test.drop(columns=['Unnamed: 0'])

In [606]:
# Column names of the training frame, in file order. Later cells pass
# features[0] as the target and features[1:] as the predictors.
features = list(df_train.columns)

In [607]:
# Impute missing values: MonthlyIncome with the column mean, NumberOfDependents
# with the column's most frequent value.
# value_counts() is ordered by frequency, so idxmax() yields the most common
# *value*; argmax() would yield its integer position (always 0) instead.
# Results are assigned back to the frame rather than using
# fillna(..., inplace=True) on a column selection, which only works when that
# selection happens to be a view of the frame.
df_train['MonthlyIncome'] = df_train['MonthlyIncome'].fillna(df_train['MonthlyIncome'].mean())
df_train['NumberOfDependents'] = df_train['NumberOfDependents'].fillna(
    df_train['NumberOfDependents'].value_counts().idxmax()
)
# NOTE(review): the test frame is imputed from its own statistics (unchanged
# from before) - confirm whether the training statistics should be reused.
df_test['MonthlyIncome'] = df_test['MonthlyIncome'].fillna(df_test['MonthlyIncome'].mean())
df_test['NumberOfDependents'] = df_test['NumberOfDependents'].fillna(
    df_test['NumberOfDependents'].value_counts().idxmax()
)

In [608]:
def woe(data, feature, target, bins=10):
    """Build a Weight-of-Evidence / Information-Value table for one feature.

    Numeric features (dtype kind in 'bifc') with more than 10 distinct values
    are first cut into `bins` quantile bins with ``pd.qcut`` (duplicate edges
    dropped); any other feature is grouped on its raw values.

    Parameters
    ----------
    data : DataFrame holding both `feature` and `target` columns.
    feature : name of the column to analyse.
    target : name of the column that is counted and summed per group
        (its sum is reported as 'Events').
    bins : number of quantile bins requested from ``pd.qcut``.

    Returns
    -------
    DataFrame with one row per group and the columns 'Variable', 'Cutoff',
    'N', 'Events', '% of Events', 'Non-Events', '% of Non-Events', 'WoE', 'IV'.
    """
    column = data[feature]
    is_numeric = column.dtype.kind in 'bifc'
    if is_numeric and len(np.unique(column)) > 10:
        group_key = pd.qcut(column, bins, duplicates='drop')
    else:
        group_key = column

    paired = pd.DataFrame({'x': group_key, 'y': data[target]})
    table = (
        paired.groupby("x", as_index=False)
        .agg(N=("y", "count"), Events=("y", "sum"))
        .rename(columns={"x": "Cutoff"})
    )

    # A floor of 0.5 keeps groups without events (or without non-events) from
    # producing a zero share and an infinite logarithm.
    total_events = table['Events'].sum()
    table['% of Events'] = np.maximum(table['Events'], 0.5) / total_events
    table['Non-Events'] = table['N'] - table['Events']
    total_non_events = table['Non-Events'].sum()
    table['% of Non-Events'] = np.maximum(table['Non-Events'], 0.5) / total_non_events

    share_ratio = table['% of Non-Events'] / table['% of Events']
    table['WoE'] = np.log(share_ratio)
    share_gap = table['% of Non-Events'] - table['% of Events']
    table['IV'] = table['WoE'] * share_gap

    table.insert(loc=0, column='Variable', value=feature)
    return table

In [609]:
def woe_x(data, feature, target, bins):
    """Build a Weight-of-Evidence / Information-Value table using explicit bins.

    Unlike quantile binning, the caller supplies the bins (passed straight to
    ``pd.cut``). Every supplied bin gets a row in the result, including bins
    that contain no records; code that lines tables up by row position can
    therefore rely on a stable row count.

    Parameters
    ----------
    data : DataFrame holding both `feature` and `target` columns.
    feature : name of the column to bin.
    target : name of the column that is counted and summed per bin
        (its sum is reported as 'Events').
    bins : bin specification accepted by ``pd.cut`` (e.g. an IntervalIndex).

    Returns
    -------
    DataFrame with one row per bin and the columns 'Variable', 'Cutoff',
    'N', 'Events', '% of Events', 'Non-Events', '% of Non-Events', 'WoE', 'IV'.

    Notes
    -----
    Values that fall in none of the bins become NaN in ``pd.cut`` and are
    left out of the counts.
    """
    binned_x = pd.cut(data[feature], bins = bins, precision = 0, right = True)
    d0 = pd.DataFrame({'x': binned_x, 'y': data[target]})
    # observed=False is pinned so that empty bins keep their row regardless of
    # the library default; grouping with the bins as index and resetting it
    # afterwards yields the flat (bin, count, sum) layout.
    d = d0.groupby("x", observed=False)["y"].agg(["count", "sum"]).reset_index()
    d.columns = ['Cutoff', 'N', 'Events']
    # A floor of 0.5 keeps empty or one-sided bins from producing a zero share
    # and an infinite logarithm.
    d['% of Events'] = np.maximum(d['Events'], 0.5) / d['Events'].sum()
    d['Non-Events'] = d['N'] - d['Events']
    d['% of Non-Events'] = np.maximum(d['Non-Events'], 0.5) / d['Non-Events'].sum()
    d['WoE'] = np.log(d['% of Non-Events']/d['% of Events'])
    d['IV'] = d['WoE'] * (d['% of Non-Events'] - d['% of Events'])
    d.insert(loc=0, column='Variable', value=feature)
    return d

In [610]:
# One WoE/IV table per predictor; features[0] is the target column.
# Predictors 1, 2, 4, 5 and 6 use quantile binning via woe(); the others use
# hand-specified right-closed intervals via woe_x().
# The second interval of every hand-specified list starts at 0 so the bins are
# contiguous: a lower edge of 0.1 would leave values in (0, 0.1] without a bin.
woe_1 = woe(df_train, features[1], features[0], 10)
woe_2 = woe(df_train, features[2], features[0], 10)
woe_3 = woe_x(
    df_train, features[3], features[0],
    pd.IntervalIndex.from_tuples([(-0.001, 0), (0, 1), (1, 2), (2, 3), (3, 5), (5, 9), (9, 13), (13, 96), (96, 98)]),
)
woe_4 = woe(df_train, features[4], features[0], 10)
woe_5 = woe(df_train, features[5], features[0], 10)
woe_6 = woe(df_train, features[6], features[0], 10)
woe_7 = woe_x(
    df_train, features[7], features[0],
    pd.IntervalIndex.from_tuples([(-0.001, 0), (0, 2), (2, 5), (5, 9), (9, 17), (17, 96), (96, 98)]),
)
woe_8 = woe_x(
    df_train, features[8], features[0],
    pd.IntervalIndex.from_tuples([(-0.001, 0), (0, 1), (1, 2), (2, 4), (4, 7), (7, 9), (9, 12), (12, 54)]),
)
woe_9 = woe_x(
    df_train, features[9], features[0],
    pd.IntervalIndex.from_tuples([(-0.001, 0), (0, 1), (1, 2), (2, 4), (4, 6), (6, 9), (9, 12), (12, 96), (96, 99)]),
)
woe_10 = woe_x(
    df_train, features[10], features[0],
    pd.IntervalIndex.from_tuples([(-0.001, 0), (0, 2), (2, 3), (3, 4), (4, 5), (5, 8), (8, 43)]),
)

In [611]:
# Predictor names: every entry of `features` after the first one.
var = features[1:]

In [612]:
# Bin lookup table: one column per predictor (named as in `var`) holding that
# predictor's 'Cutoff' bins, one bin per row.
woe_tables = [woe_1, woe_2, woe_3, woe_4, woe_5, woe_6, woe_7, woe_8, woe_9, woe_10]
# Building from a dict aligns all columns on the union of their row labels, so
# no table is truncated to the length of the first one.
df_bins = pd.DataFrame({name: table['Cutoff'] for name, table in zip(var, woe_tables)})
# Predictors with fewer bins end in NaN rows; repeat their last bin there so
# iterating over a column only ever yields bins. Forward-filling does this
# without hard-coding how many bins each predictor has.
df_bins = df_bins.ffill()

In [613]:
# WoE lookup table: one column per predictor (named as in `var`) holding the
# 'WoE' value of each of that predictor's bins, one bin per row.
woe_tables = [woe_1, woe_2, woe_3, woe_4, woe_5, woe_6, woe_7, woe_8, woe_9, woe_10]
# Building from a dict aligns all columns on the union of their row labels, so
# no table is truncated to the length of the first one. Predictors with fewer
# bins keep NaN in their trailing rows.
df_woe = pd.DataFrame({name: table['WoE'] for name, table in zip(var, woe_tables)})

In [614]:
from sklearn.linear_model import LogisticRegression
# Logistic regression classifier; only the iteration cap (max_iter=1000) is set explicitly.
clf = LogisticRegression(max_iter = 1000)

In [615]:
# Fit on the raw predictor columns (everything after column 0), with column 0
# as the label.
# NOTE(review): the score functions further down multiply these coefficients by
# WoE values taken from df_woe, yet the model is fitted on raw feature values
# rather than WoE-encoded ones - confirm this is intended.
clf.fit(np.array(df_train.iloc[:, 1:]), np.array(df_train.iloc[:, 0]))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [616]:
# Row 0 of the fitted coefficient matrix, in the column order used for fitting.
coef = clf.coef_[0]

In [617]:
# Element 0 of the fitted model's intercept array.
intercept = clf.intercept_[0]

In [618]:
# Scaling constants for turning model terms into score points.
# Read as: 20 points double the odds, and a score of 600 corresponds to odds
# of 50 - NOTE(review): confirm the intended odds direction.
PDO = 20
ANCHOR_SCORE = 600
ANCHOR_ODDS = 50

factor = PDO / np.log(2)
offset = ANCHOR_SCORE - factor * np.log(ANCHOR_ODDS)
print('factor:', round(factor, 2),'| offset', round(offset, 2))

factor: 28.85 | offset 487.12


In [619]:
def score(data, feature, groups, woe, coef, intercept, factor, offset):
    """Return the total score of one record.

    Each element of `data` is looked up in the bins of the matching feature;
    the WoE value of the first bin that contains it contributes
    ``(coef * woe + intercept / n) * factor + offset / n`` points, where ``n``
    is ``len(coef)``. The contributions are summed.

    Parameters
    ----------
    data : sequence of feature values, ordered like `feature`.
    feature : sequence of feature names; ``feature[i]`` names ``data[i]``.
    groups : DataFrame with one column of bins per feature name.
    woe : DataFrame with one column of WoE values per feature name, row-aligned
        with `groups`.
    coef : sequence of model coefficients, ordered like `feature`.
    intercept : model intercept, spread evenly over the features.
    factor, offset : scaling constants; `offset` is spread evenly as well.

    Returns
    -------
    Sum of the per-feature points. An element that falls in none of its
    feature's bins contributes nothing.
    """
    n_terms = len(coef)
    points = []
    for pos, element in enumerate(data):
        name = feature[pos]
        for row, interval in enumerate(groups[name]):
            if element in interval:
                weight = woe[name].iloc[row]
                points.append((coef[pos] * weight + intercept / n_terms) * factor + offset / n_terms)
                # Stop at the first hit: a bin column may repeat its last bin
                # as padding, and each feature must be counted only once.
                break
    return sum(points)

In [620]:
# Predictor values of test row 80 (column 0 is skipped), scored with the
# list-based score() that takes the whole record at once.
info = df_test.iloc[80, 1:].to_list()
score(info, var, df_bins, df_woe, coef, intercept, factor, offset)

475.84873440906875

In [602]:
# Display the feature values held in `info`.
info

[0.085647252, 47.0, 0.0, 577.0, 6855.0355896805895, 9.0, 0.0, 0.0, 0.0, 0.0]

In [544]:
def score(data, feature, position, groups, woe, coef, intercept, factor, offset):
    """Return the score contribution of a single feature for a single row.

    NOTE(review): this reuses the name `score` of the list-based function with
    a different signature; whichever definition ran last is the one in effect.

    Parameters
    ----------
    data : DataFrame whose columns after the first are the predictors.
    feature : name of the predictor to score.
    position : row label used to read the value from ``data[feature]``.
    groups : DataFrame with one column of bins per feature name.
    woe : DataFrame with one column of WoE values per feature name.
    coef : sequence of model coefficients, ordered like the predictors.
    intercept : model intercept, spread evenly over ``len(coef)`` terms.
    factor, offset : scaling constants; `offset` is spread evenly as well.

    Returns
    -------
    ``(coef * woe + intercept / n) * factor + offset / n`` for the first bin
    that contains the value.

    Raises
    ------
    ValueError : if `feature` is not among the predictor columns.
    IndexError : if no bin contains the value.
    """
    predictor_names = data.columns.to_list()[1:]
    coef_idx = predictor_names.index(feature)
    value = data[feature][position]
    intervals = groups[feature].to_list()
    # WoE of every bin containing the value; only the first hit is used.
    matched = [woe[feature][intervals.index(iv)] for iv in intervals if value in iv]
    n_terms = len(coef)
    return (coef[coef_idx] * matched[0] + intercept/n_terms) * factor + offset/n_terms

In [549]:
# Score contribution of predictor var[9] for test row 0, using the per-feature
# score() signature (data, feature, position, ...).
score(df_test, var[9], 0, df_bins, df_woe, coef, intercept, factor, offset)

47.28870671702511

In [550]:
# Per-feature score contributions for test row 80, one entry per predictor in
# `var`, using the per-feature score() signature.
lst = [
    score(df_test, name, 80, df_bins, df_woe, coef, intercept, factor, offset)
    for name in var
]
lst

[47.00828179255837,
 47.25963136600174,
 55.140154434369485,
 47.00972462037358,
 47.00948262866533,
 46.96014833685812,
 52.703734347789116,
 46.75346159775704,
 38.71540856767086,
 47.28870671702511]

In [623]:
# Display every column of the test row at position 1.
df_test.iloc[1,:]

SeriousDlqin2yrs                                NaN
RevolvingUtilizationOfUnsecuredLines       0.463295
age                                       57.000000
NumberOfTime30-59DaysPastDueNotWorse       0.000000
DebtRatio                                  0.527237
MonthlyIncome                           9141.000000
NumberOfOpenCreditLinesAndLoans           15.000000
NumberOfTimes90DaysLate                    0.000000
NumberRealEstateLoansOrLines               4.000000
NumberOfTime60-89DaysPastDueNotWorse       0.000000
NumberOfDependents                         2.000000
Name: 1, dtype: float64

In [None]:
# Scratch cell: a hand-typed feature vector that is neither assigned nor used.
# NOTE(review): the first value (0.046) differs from the 0.463295 displayed for
# df_test row 1, while the remaining values match that row - confirm or delete.
[0.046, 57.0, 0.0, 0.527, 9141.0, 15.0, 0.0, 4.0, 0.0, 2.0]