In [33]:
# Import basic libraries 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt        
%matplotlib inline

import sklearn.preprocessing as skp
import sklearn.model_selection as skm

#import classification modules
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
#new modules added
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score, confusion_matrix,precision_score, recall_score, roc_auc_score,roc_curve, auc, f1_score

import warnings
warnings.filterwarnings('ignore')

# Importing Data

In [34]:
datadf = pd.read_csv("creditriskmodeling.csv")
print("Shape is: ", datadf.shape)
print("\n Columns with Types: \n", datadf.dtypes)

Shape is:  (424, 20)

 Columns with Types: 
 Default                                    object
Long Term Financing of Working Capital    float64
Working Capital Requirement               float64
Debt Cash Flow Coverage Ratio             float64
Liability to Equity                       float64
Net Debt to Equity Ratio                  float64
Debt to Capital Ratio                     float64
Long Term Debt to Asset                   float64
Long Term Debt to Tangible Asset          float64
Interest Coverage Ratio                   float64
Net Profit Margin                         float64
Gross Profit Margin                       float64
Return on Invested Capital                float64
Return on Equity                          float64
Fixed Asset to Debt Ratio                 float64
Short Term Debt to Sales Ratio            float64
Expense to Revenue Ratio                  float64
Fixed Asset Turnover                      float64
Collateral                                float64
Firm 

## Checking For Null Values

In [35]:
datadf.isnull().sum()

Default                                   34
Long Term Financing of Working Capital     0
Working Capital Requirement                0
Debt Cash Flow Coverage Ratio              0
Liability to Equity                        0
Net Debt to Equity Ratio                   0
Debt to Capital Ratio                      0
Long Term Debt to Asset                    0
Long Term Debt to Tangible Asset           0
Interest Coverage Ratio                    0
Net Profit Margin                          0
Gross Profit Margin                        0
Return on Invested Capital                 0
Return on Equity                           0
Fixed Asset to Debt Ratio                  0
Short Term Debt to Sales Ratio             0
Expense to Revenue Ratio                   0
Fixed Asset Turnover                       0
Collateral                                 0
Firm Size                                  0
dtype: int64

## Removing Null Values 

In [36]:
datadf = datadf.dropna(axis=0)
datadf.isnull().sum()

Default                                   0
Long Term Financing of Working Capital    0
Working Capital Requirement               0
Debt Cash Flow Coverage Ratio             0
Liability to Equity                       0
Net Debt to Equity Ratio                  0
Debt to Capital Ratio                     0
Long Term Debt to Asset                   0
Long Term Debt to Tangible Asset          0
Interest Coverage Ratio                   0
Net Profit Margin                         0
Gross Profit Margin                       0
Return on Invested Capital                0
Return on Equity                          0
Fixed Asset to Debt Ratio                 0
Short Term Debt to Sales Ratio            0
Expense to Revenue Ratio                  0
Fixed Asset Turnover                      0
Collateral                                0
Firm Size                                 0
dtype: int64

## Preprocessing Data

In [40]:
y = datadf['Default'].copy()
X = datadf.drop('Default',axis=1)
print("Shape of X: ", X.shape)
print("Shape of y: ", y.shape)

Shape of X:  (390, 19)
Shape of y:  (390,)


In [41]:
X = skp.StandardScaler().fit(X).transform(X)   
X[10]

array([-0.98641683, -0.13399376,  1.69312385, -0.52589396, -0.9955322 ,
       -0.7693831 , -0.59663551, -0.44572108, -1.16854263,  0.55572255,
        1.53728781, -0.34193101,  0.70173341, -0.33827082,  0.84224927,
       -0.40105185, -1.1052961 ,  2.69069014, -0.50084072])

In [42]:
level_map = {'Yes': 1, 'No': 0}
y = y.map(level_map)

## Dividing data into Training and Testing Data

In [43]:
trainX, testX, trainy, testy= skm.train_test_split(X, y, test_size=0.25, random_state=99)   

# Models

# 1. Logistic Regression

In [48]:
#Logistic Regression
clf = LogisticRegression()
clf.fit(trainX,trainy)
predictions = clf.predict(testX)
print("Accuracy: \n", accuracy_score(testy, predictions)*100)#get FPR (specificity) and TPR (sensitivity)
fpr , tpr, _ = roc_curve(testy, predictions)
print("AUC: \n",auc(fpr, tpr))

Accuracy: 
 93.87755102040816
AUC: 
 0.9194520547945205


# 2. K-NN

In [49]:
#K-NN
clf = KNeighborsClassifier()
clf.fit(trainX,trainy)
predictions = clf.predict(testX)
print("Accuracy: \n", accuracy_score(testy, predictions)*100)#get FPR (specificity) and TPR (sensitivity)
fpr , tpr, _ = roc_curve(testy, predictions)
print("AUC: \n",auc(fpr, tpr))

Accuracy: 
 89.79591836734694
AUC: 
 0.8394520547945206


# Support Vector Machines (SVM)


In [50]:
#Support Vector machines
clf = SVC()
clf.fit(trainX,trainy)
predictions = clf.predict(testX)
print("Accuracy: \n", accuracy_score(testy, predictions)*100)#get FPR (specificity) and TPR (sensitivity)
fpr , tpr, _ = roc_curve(testy, predictions)
print("AUC: \n",auc(fpr, tpr))

Accuracy: 
 92.85714285714286
AUC: 
 0.8731506849315067


# Random Forest Classifier

In [51]:
#Random Forest Classifier
clf = RandomForestClassifier()
clf.fit(trainX,trainy)
predictions = clf.predict(testX)
print("Accuracy: \n", accuracy_score(testy, predictions)*100)#get FPR (specificity) and TPR (sensitivity)
fpr , tpr, _ = roc_curve(testy, predictions)
print("AUC: \n",auc(fpr, tpr))

Accuracy: 
 92.85714285714286
AUC: 
 0.8863013698630137


# Decision Tree

In [52]:
#Decision Tree
clf = DecisionTreeClassifier()
clf.fit(trainX,trainy)
predictions = clf.predict(testX)
print("Accuracy: \n", accuracy_score(testy, predictions)*100)#get FPR (specificity) and TPR (sensitivity)
fpr , tpr, _ = roc_curve(testy, predictions)
print("AUC: \n",auc(fpr, tpr))

Accuracy: 
 87.75510204081633
AUC: 
 0.852054794520548


# Multilayer Perceptron (MLP)

In [53]:
#MLP
clf = MLPClassifier()
clf.fit(trainX,trainy)
predictions = clf.predict(testX)
print("Accuracy: \n", accuracy_score(testy, predictions)*100)#get FPR (specificity) and TPR (sensitivity)
fpr , tpr, _ = roc_curve(testy, predictions)
print("AUC: \n",auc(fpr, tpr))

Accuracy: 
 91.83673469387756
AUC: 
 0.8794520547945207


# Naive Baye's 

In [54]:
#NB
clf = GaussianNB()
clf.fit(trainX,trainy)
predictions = clf.predict(testX)
print("Accuracy: \n", accuracy_score(testy, predictions)*100)#get FPR (specificity) and TPR (sensitivity)
fpr , tpr, _ = roc_curve(testy, predictions)
print("AUC: \n",auc(fpr, tpr))

Accuracy: 
 95.91836734693877
AUC: 
 0.9463013698630137


# Gradient Boosting Classifier

In [55]:
#Gradient Boosting Classifier
clf = GradientBoostingClassifier()
clf.fit(trainX,trainy)
predictions = clf.predict(testX)
print("Accuracy: \n", accuracy_score(testy, predictions)*100)#get FPR (specificity) and TPR (sensitivity)
fpr , tpr, _ = roc_curve(testy, predictions)
print("AUC: \n",auc(fpr, tpr))

Accuracy: 
 93.87755102040816
AUC: 
 0.9063013698630137


# Adaboost Classifier

In [56]:
#Adaboost Classifier
clf = AdaBoostClassifier()
clf.fit(trainX,trainy)
predictions = clf.predict(testX)
print("Accuracy: \n", accuracy_score(testy, predictions)*100)#get FPR (specificity) and TPR (sensitivity)
fpr , tpr, _ = roc_curve(testy, predictions)
print("AUC: \n",auc(fpr, tpr))

Accuracy: 
 94.89795918367348
AUC: 
 0.9263013698630136


# Conclusion

Naive Bayes gives a score of approx 96%.