In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the loan-default CSV into a DataFrame and preview its first five rows.
csv_path = '/kaggle/input/loan-default-prediction/Default_Fin.csv'
data = pd.read_csv(csv_path)
data.head(5)

## Understanding Data Distribution

In [None]:
# Summary statistics of the two numeric columns for every
# (Defaulted?, Employed) combination.
numeric_cols = ['Bank Balance', 'Annual Salary']
grouped_stats = data.groupby(['Defaulted?', 'Employed'])[numeric_cols].describe()
print(grouped_stats)

# Row-normalised crosstab: within each 'Employed' value, the fraction of rows
# per 'Defaulted?' value, rounded to two decimals.
default_share = pd.crosstab(data['Employed'], data['Defaulted?'], normalize='index')
default_share.round(2)

3% of employed customers are defaulters.<br>
4.5% of unemployed customers are defaulters.

In [None]:
# Kernel density of 'Bank Balance', drawn once for defaulters
# (Defaulted? == 1) and once for non-defaulters (Defaulted? == 0).
fig = plt.figure(figsize=(10, 6))

for default_flag in (1, 0):
    in_group = data['Defaulted?'] == default_flag
    sns.kdeplot(data.loc[in_group, 'Bank Balance'])

# Legend entries are matched to the curves in the order they were drawn.
fig.legend(labels=['Defaulted', 'Not Defaulted'])
plt.title('Based on Bank Balance')
plt.show()

On average, defaulters have a higher bank balance than non-defaulters.

In [None]:
# Kernel density of 'Annual Salary', drawn once for defaulters
# (Defaulted? == 1) and once for non-defaulters (Defaulted? == 0).
fig = plt.figure(figsize=(10, 6))

salary_by_flag = [
    data.loc[data['Defaulted?'] == default_flag, 'Annual Salary']
    for default_flag in (1, 0)
]
for salary in salary_by_flag:
    sns.kdeplot(salary)

# Legend entries are matched to the curves in the order they were drawn.
fig.legend(labels=['Defaulted', 'Not Defaulted'])
plt.title('Based on Annual Salary')
plt.show()

On average, there are fewer defaulters among customers with a higher annual salary than among those with a lower annual salary.

In [None]:
# Share of each 'Defaulted?' value across the whole dataset.
data.groupby('Defaulted?').size().plot.pie(autopct='%1.1f%%', title='Defaulted?');
plt.show()

# Share of each 'Employed' value.
# value_counts() orders its result by frequency, not by value, so the wedge
# labels are read from the counts' own index instead of being hard-coded.
# This keeps every label attached to the count it actually describes.
employed_counts = data['Employed'].value_counts()
label = employed_counts.index.tolist()
plt.pie(employed_counts, autopct='%1.1f%%', labels=label);
plt.title('Employed?');

# Correlation between features or with target

In [None]:
import scipy.stats as st

# A two-sample Kolmogorov-Smirnov test (st.ks_2samp) only asks whether two
# samples were drawn from the same distribution, so its p-value cannot show
# that two features are correlated. Measure the association directly instead:
#   Pearson r    -> strength of the linear relationship
#   Spearman rho -> strength of the monotonic (rank-based) relationship
salary = data['Annual Salary']
balance = data['Bank Balance']

pearson_r, pearson_p = st.pearsonr(salary, balance)
spearman_rho, spearman_p = st.spearmanr(salary, balance)

print('Pearson r    = {:.3f} (p-value = {:.3g})'.format(pearson_r, pearson_p))
print('Spearman rho = {:.3f} (p-value = {:.3g})'.format(spearman_rho, spearman_p))
# Judge the strength of the relationship from the coefficients (close to +/-1
# is strong, close to 0 is weak), not from the p-values.
# As the number of features is already small, both features are kept either way.

# Training Model

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics, preprocessing
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

In [None]:
feature_cols = ['Employed', 'Bank Balance', 'Annual Salary']

''' Rescaling to [0,1] '''
# Fit the scaler on the training rows only, so that the min/max of the
# held-out rows never leaks into preprocessing. Held-out values may therefore
# fall slightly outside [0, 1], which is expected.
#
# The split arguments below must mirror the train_test_split call in the next
# cell (test_size=0.20, random_state=4, stratified on 'Defaulted?'): with
# identical arguments both calls select the same rows.
row_positions = np.arange(len(data))
train_positions, _ = train_test_split(
    row_positions, test_size=0.20, random_state=4, stratify=data['Defaulted?'])

scaler = MinMaxScaler()
scaler.fit(data.iloc[train_positions][feature_cols])

# Apply the train-fitted scaling to every row; later cells read the scaled
# columns straight from `data`.
data[feature_cols] = scaler.transform(data[feature_cols])

In [None]:
# Separate the target column from the feature columns.
y = data['Defaulted?']
X = data[feature_cols]

# Hold out 20% of the rows, stratified on the target, with a fixed seed so the
# split is reproducible.
validation_size = 0.20
split_options = dict(test_size=validation_size, random_state=4, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Model 1: Logistic Regression

In [None]:
# Fit a class-weight-balanced logistic regression on the training split
# (fit() returns the estimator itself) and show the learned coefficients.
model = LogisticRegression(class_weight='balanced').fit(X_train, y_train)
print(model.coef_)

In [None]:
# Report the confusion matrix and classification report for the training
# split first, then for the held-out split.
evaluation_sets = (
    ('Train metrics...', X_train, y_train),
    ('Test metrics...', X_test, y_test),
)
for heading, features, target in evaluation_sets:
    y_pred = model.predict(features)

    print(heading)
    print(confusion_matrix(target, y_pred))
    print(classification_report(target, y_pred))

In [None]:
''' metrics on original data '''
# Predict for every row of `data` (training and held-out rows together),
# using the feature columns as they are currently stored in `data`.
all_rows = data[feature_cols]
y_pred = model.predict(all_rows)

def make_cm(matrix, columns):
    """Wrap a confusion matrix in a DataFrame with two-level axis labels.

    Parameters
    ----------
    matrix : 2-D array-like
        Confusion-matrix values.
    columns : list
        Class labels, used as the inner level of both the row index and the
        column index.

    Returns
    -------
    pandas.DataFrame
        `matrix` with rows labelled ('actual defaulted?', <class>) and
        columns labelled ('prediction defaulted ?', <class>).
    """
    size = len(columns)
    # Outer level repeats a fixed caption; inner level carries the class labels.
    row_labels = [['actual defaulted?'] * size, columns]
    col_labels = [['prediction defaulted ?'] * size, columns]
    return pd.DataFrame(matrix, index=row_labels, columns=col_labels)

# Confusion matrix over all rows, shown as a labelled table, followed by the
# per-class precision/recall report.
overall_cm = confusion_matrix(data['Defaulted?'], y_pred)
df_matrix = make_cm(overall_cm, ['No', 'Yes'])

display(df_matrix)
print(classification_report(data['Defaulted?'], y_pred))

# Model 2: SVM

In [None]:
# Support-vector classifier with class 1 weighted 100x relative to class 0.
class_weights = {0: 1, 1: 100}
model = SVC(class_weight=class_weights)
model.fit(X_train, y_train)

In [None]:
# Report the confusion matrix and classification report for the training
# split first, then for the held-out split.
for heading, features, target in [('Train metrics...', X_train, y_train),
                                  ('Test metrics...', X_test, y_test)]:
    y_pred = model.predict(features)

    print(heading)
    print(confusion_matrix(target, y_pred))
    print(classification_report(target, y_pred))

In [None]:
''' metrics on original data '''
# Predict for every row of `data` (training and held-out rows together). The
# feature columns were rescaled in place earlier in the notebook, so these are
# the scaled values.
y_pred = model.predict(data[feature_cols])

# make_cm is defined once, in the Logistic Regression evaluation cell above;
# it is reused here rather than redefined, so the notebook has a single
# definition of the helper.
df_matrix = make_cm(
    confusion_matrix(data['Defaulted?'], y_pred), ['No', 'Yes'])

display(df_matrix)
print(classification_report(data['Defaulted?'], y_pred))