## Import libraries

In [1]:
# Importing required Packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(style='white')

## Load data from the final CSV

In [2]:
# Load the final training CSV; the path is relative to the notebook's working directory
train = pd.read_csv("final_train.csv")

In [3]:
# Summarise column dtypes and non-null counts before modelling
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          614 non-null    object 
 1   Married         614 non-null    object 
 2   Dependents      614 non-null    int64  
 3   Education       614 non-null    object 
 4   SelfEmployed    614 non-null    object 
 5   LoanAmountTerm  614 non-null    float64
 6   CreditHistory   614 non-null    float64
 7   PropertyArea    614 non-null    object 
 8   LoanStatus      614 non-null    int64  
 9   LoanAmount_log  614 non-null    float64
 10  Income_log      614 non-null    float64
dtypes: float64(4), int64(2), object(5)
memory usage: 52.9+ KB


In [4]:
# Separate the target column (LoanStatus) from the predictor columns
y = train["LoanStatus"]
X = train.drop(columns=["LoanStatus"])

In [5]:
# One-hot encode the non-numeric columns so every feature is numeric for scikit-learn
X = pd.get_dummies(X)

### Split data into train and test sets

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [7]:
from sklearn.metrics import confusion_matrix

In [8]:
def print_scores(y_true, y_pred):
    """Print accuracy, precision, recall and specificity for binary labels.

    The four counts are read from the confusion matrix of ``y_true`` versus
    ``y_pred``: position ``[0, 0]`` is taken as the true negatives,
    ``[1, 1]`` as the true positives, ``[0, 1]`` as the false positives and
    ``[1, 0]`` as the false negatives.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_true``.

    Returns
    -------
    None
        The four metrics are written to standard output.

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2, i.e. the labels seen in
        ``y_true`` and ``y_pred`` do not form exactly two classes.

    Notes
    -----
    A metric whose denominator is zero (for example precision when nothing
    is predicted positive) is printed as ``nan`` instead of going through a
    NumPy 0/0 division and its RuntimeWarning.
    """
    cm = confusion_matrix(y_true, y_pred)
    if cm.shape != (2, 2):
        # Indexing a 1x1 matrix would fail with an opaque IndexError, and a
        # larger matrix would silently yield metrics for two classes only.
        raise ValueError(
            "print_scores expects exactly two classes, "
            "got a confusion matrix of shape {}".format(cm.shape)
        )

    # Plain Python ints keep the arithmetic free of NumPy scalar warnings.
    tn, fp = int(cm[0, 0]), int(cm[0, 1])
    fn, tp = int(cm[1, 0]), int(cm[1, 1])

    def ratio(numerator, denominator):
        # An undefined metric is reported as nan rather than raising.
        return numerator / denominator if denominator else float("nan")

    print("Overall Accuracy             : ", ratio(tp + tn, tp + fp + tn + fn))
    print("Precision                    : ", ratio(tp, tp + fp))
    print("Recall or TPR or Sensitivity : ", ratio(tp, tp + fn))
    print("Specificity or TNR           : ", ratio(tn, tn + fp))
    

## Logistic Regression

In [9]:
# Importing packages logistic regression and evaluation 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [10]:
# Fit a logistic regression classifier with scikit-learn's default settings
model = LogisticRegression()
model.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
# Mean accuracy on the training split, for comparison with the test-set metrics
model.score(X_train,y_train)

0.8044806517311609

In [12]:
# Predict labels for the held-out test split
y_pred = model.predict(X_test)

In [13]:
# Raw confusion matrix: rows are true classes, columns are predicted classes
confusion_matrix(y_test,y_pred)

array([[14, 19],
       [ 2, 88]], dtype=int64)

In [14]:
# Accuracy, precision, recall and specificity on the test split
print_scores(y_test,y_pred)

Overall Accuracy             :  0.8292682926829268
Precision                    :  0.822429906542056
Recall or TPR or Sensitivity :  0.9777777777777777
Specificity or TNR           :  0.42424242424242425


### Display classification report

In [15]:
# Per-class precision, recall and F1 on the test split
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.88      0.42      0.57        33
           1       0.82      0.98      0.89        90

    accuracy                           0.83       123
   macro avg       0.85      0.70      0.73       123
weighted avg       0.84      0.83      0.81       123



## Decision Tree

In [16]:
from sklearn.tree import DecisionTreeClassifier

In [17]:
# Fit a decision tree whose depth is capped at 5
# NOTE(review): random_state is not set, so ties between equally good splits
# may be broken differently from run to run — consider fixing a seed
model = DecisionTreeClassifier( max_depth=5)
model.fit(X_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [18]:
# Mean accuracy of the decision tree on the training split
model.score(X_train,y_train)

0.845213849287169

In [19]:
# Predict test-split labels with the decision tree
y_pred = model.predict(X_test)

In [20]:
# Raw confusion matrix: rows are true classes, columns are predicted classes
confusion_matrix(y_test,y_pred)

array([[14, 19],
       [ 4, 86]], dtype=int64)

In [21]:
# Accuracy, precision, recall and specificity of the decision tree on the test split
print_scores(y_test,y_pred)

Overall Accuracy             :  0.8130081300813008
Precision                    :  0.819047619047619
Recall or TPR or Sensitivity :  0.9555555555555556
Specificity or TNR           :  0.42424242424242425


In [22]:
# Per-class precision, recall and F1 of the decision tree on the test split
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.78      0.42      0.55        33
           1       0.82      0.96      0.88        90

    accuracy                           0.81       123
   macro avg       0.80      0.69      0.72       123
weighted avg       0.81      0.81      0.79       123



### Decision Tree Created by Classifier

In [23]:
# Dump the fitted tree's split rules as indented text
from sklearn.tree import export_text

tree_rules = export_text(model, feature_names=X_train.columns.tolist())
print(tree_rules)

|--- CreditHistory <= 0.50
|   |--- LoanAmount_log <= 6.30
|   |   |--- LoanAmount_log <= 5.07
|   |   |   |--- LoanAmount_log <= 4.86
|   |   |   |   |--- class: 0
|   |   |   |--- LoanAmount_log >  4.86
|   |   |   |   |--- LoanAmount_log <= 4.88
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- LoanAmount_log >  4.88
|   |   |   |   |   |--- class: 0
|   |   |--- LoanAmount_log >  5.07
|   |   |   |--- LoanAmount_log <= 5.15
|   |   |   |   |--- PropertyArea_Urban <= 0.50
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- PropertyArea_Urban >  0.50
|   |   |   |   |   |--- class: 0
|   |   |   |--- LoanAmount_log >  5.15
|   |   |   |   |--- class: 0
|   |--- LoanAmount_log >  6.30
|   |   |--- class: 1
|--- CreditHistory >  0.50
|   |--- Income_log <= 7.78
|   |   |--- PropertyArea_Semiurban <= 0.50
|   |   |   |--- Gender_Male <= 0.50
|   |   |   |   |--- Education_Not Graduate <= 0.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- Education_Not Graduate >  0.50
|

## Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
# Fit a small random forest made of 5 trees
# NOTE(review): random_state is not set, so the fitted forest and the scores
# reported for it can change on every run — consider fixing a seed
model = RandomForestClassifier(n_estimators=5)
model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=5,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [26]:
# Mean accuracy of the forest on the training split; a large gap to the test
# accuracy would point to overfitting
model.score(X_train,y_train)

0.9572301425661914

In [27]:
# Predict test-split labels with the random forest
y_pred = model.predict(X_test)

In [28]:
# Accuracy, precision, recall and specificity of the random forest on the test split
print_scores(y_test,y_pred)

Overall Accuracy             :  0.7642276422764228
Precision                    :  0.8080808080808081
Recall or TPR or Sensitivity :  0.8888888888888888
Specificity or TNR           :  0.42424242424242425


In [29]:
# Show the configuration of each fitted tree inside the forest
for t in model.estimators_:
    print(t)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1841109041, splitter='best')
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=352167164, splitter='best')
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features='aut

## K-Nearest Neighbors (KNN)

In [30]:
from sklearn.neighbors import KNeighborsClassifier

In [31]:
# Fit a k-nearest-neighbours classifier that uses the 5 closest training rows
# NOTE(review): no feature scaling is visible in this notebook, and distance-based
# models are sensitive to column scale — confirm whether the CSV is pre-scaled
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [32]:
# Mean accuracy of KNN on the training split
model.score(X_train,y_train)

0.790224032586558

In [33]:
# Predict test-split labels with KNN
y_pred = model.predict(X_test)

In [34]:
# Accuracy, precision, recall and specificity of KNN on the test split
print_scores(y_test,y_pred)

Overall Accuracy             :  0.7723577235772358
Precision                    :  0.7870370370370371
Recall or TPR or Sensitivity :  0.9444444444444444
Specificity or TNR           :  0.30303030303030304


## Naive Bayes 

In [35]:
from sklearn.naive_bayes import GaussianNB

In [36]:
# Fit a Gaussian naive Bayes classifier with default settings
# NOTE(review): the one-hot dummy columns are passed in alongside the continuous
# ones — confirm that a Gaussian likelihood is acceptable for them
model = GaussianNB()
model.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [37]:
# Mean accuracy of naive Bayes on the training split
model.score(X_train,y_train)

0.8065173116089613

In [38]:
# Predict test-split labels with naive Bayes
y_pred = model.predict(X_test)

In [39]:
# Accuracy, precision, recall and specificity of naive Bayes on the test split
print_scores(y_test,y_pred)

Overall Accuracy             :  0.8292682926829268
Precision                    :  0.822429906542056
Recall or TPR or Sensitivity :  0.9777777777777777
Specificity or TNR           :  0.42424242424242425


## Support Vector Machines

In [40]:
from sklearn.svm import SVC

In [41]:
# Fit a support vector classifier with default settings
# NOTE(review): no feature scaling is visible in this notebook, and SVMs are
# sensitive to column scale — confirm whether the CSV is pre-scaled
model = SVC()
model.fit(X_train,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [42]:
# Mean accuracy of the SVC on the training split
model.score(X_train,y_train)

0.6761710794297352

In [43]:
# Predict test-split labels with the SVC
y_pred = model.predict(X_test)

In [44]:
# Accuracy, precision, recall and specificity of the SVC on the test split
print_scores(y_test,y_pred)

Overall Accuracy             :  0.7317073170731707
Precision                    :  0.7317073170731707
Recall or TPR or Sensitivity :  1.0
Specificity or TNR           :  0.0


## Gradient Boosting Classifier

In [45]:
from sklearn.ensemble import GradientBoostingClassifier

In [46]:
# Fit a gradient boosting classifier with default settings
# NOTE(review): random_state is not set — consider fixing a seed so the
# reported scores are repeatable
model = GradientBoostingClassifier()
model.fit(X_train,y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [47]:
# Mean accuracy of gradient boosting on the training split
model.score(X_train,y_train)

0.890020366598778

In [48]:
# Predict test-split labels with gradient boosting
y_pred = model.predict(X_test)

In [49]:
# Accuracy, precision, recall and specificity of gradient boosting on the test split
print_scores(y_test,y_pred)

Overall Accuracy             :  0.7967479674796748
Precision                    :  0.8282828282828283
Recall or TPR or Sensitivity :  0.9111111111111111
Specificity or TNR           :  0.48484848484848486
