# Classification Case Study

### Import libraries 

In [15]:
# Importing required packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style='white')

## Load data from the final CSV

In [16]:
train = pd.read_csv("final_train.csv")

In [17]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564 entries, 0 to 563
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          564 non-null    object 
 1   Married         564 non-null    object 
 2   Dependents      564 non-null    int64  
 3   Education       564 non-null    object 
 4   SelfEmployed    564 non-null    object 
 5   LoanAmountTerm  564 non-null    float64
 6   CreditHistory   564 non-null    float64
 7   PropertyArea    564 non-null    object 
 8   LoanStatus      564 non-null    object 
 9   LoanAmountLog   564 non-null    float64
 10  IncomeLog       564 non-null    float64
dtypes: float64(4), int64(1), object(6)
memory usage: 48.6+ KB


In [18]:
# Cast the integer Dependents column to strings so that pd.get_dummies (used
# below) one-hot encodes it as a categorical feature instead of passing it
# through as a numeric column.
train = train.astype( {'Dependents' : 'str'})

In [19]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564 entries, 0 to 563
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          564 non-null    object 
 1   Married         564 non-null    object 
 2   Dependents      564 non-null    object 
 3   Education       564 non-null    object 
 4   SelfEmployed    564 non-null    object 
 5   LoanAmountTerm  564 non-null    float64
 6   CreditHistory   564 non-null    float64
 7   PropertyArea    564 non-null    object 
 8   LoanStatus      564 non-null    object 
 9   LoanAmountLog   564 non-null    float64
 10  IncomeLog       564 non-null    float64
dtypes: float64(4), object(7)
memory usage: 48.6+ KB


In [20]:
# Separate the target column (LoanStatus) from the predictor columns.
y = train['LoanStatus']
X = train.drop('LoanStatus', axis=1)

In [21]:
X = pd.get_dummies(X)  # One Hot Encoding 

In [22]:
X.columns

Index(['LoanAmountTerm', 'CreditHistory', 'LoanAmountLog', 'IncomeLog',
       'Gender_Female', 'Gender_Male', 'Married_No', 'Married_Yes',
       'Dependents_0', 'Dependents_1', 'Dependents_2', 'Dependents_3',
       'Education_Graduate', 'Education_Not Graduate', 'SelfEmployed_No',
       'SelfEmployed_Yes', 'PropertyArea_Rural', 'PropertyArea_Semiurban',
       'PropertyArea_Urban'],
      dtype='object')

In [23]:
X.sample(5)

Unnamed: 0,LoanAmountTerm,CreditHistory,LoanAmountLog,IncomeLog,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3,Education_Graduate,Education_Not Graduate,SelfEmployed_No,SelfEmployed_Yes,PropertyArea_Rural,PropertyArea_Semiurban,PropertyArea_Urban
206,360.0,1.0,5.164786,9.058121,False,True,False,True,True,False,False,False,True,False,True,False,False,True,False
16,360.0,0.0,4.330733,8.163371,True,False,True,False,True,False,False,False,True,False,True,False,False,False,True
125,360.0,1.0,4.477337,8.827028,False,True,False,True,False,False,True,False,True,False,False,True,False,False,True
344,180.0,0.0,4.70953,8.526945,False,True,True,False,False,True,False,False,True,False,True,False,False,False,True
429,180.0,0.0,4.248495,8.4362,False,True,False,True,False,False,False,True,False,True,True,False,False,False,True


In [24]:
X.shape

(564, 19)

### Split data into training and test sets

In [25]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; random_state=0 fixes the shuffle
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

In [26]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [27]:
def print_scores(y_true, y_pred, labels=None):
    """Print accuracy, per-class precision and per-class recall for a binary task.

    The confusion matrix is built with rows = true class and columns =
    predicted class. The FIRST label is treated as the negative class and the
    SECOND label as the positive class. When ``labels`` is None, scikit-learn
    orders the labels that occur in ``y_true`` / ``y_pred`` by sorted order
    (e.g. 'N' before 'Y').

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, same length as ``y_true``.
    labels : sequence of two labels, optional
        Explicit ``[negative, positive]`` label order. Pass this when one of
        the classes might be absent from the data, or to choose which class
        counts as positive.

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2, i.e. the data (or ``labels``) does
        not describe exactly two classes. Without this check a single-class
        input fails with an opaque IndexError and a multi-class input would
        silently report scores for the first two classes only.
    """
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    if cm.shape != (2, 2):
        raise ValueError(
            f"print_scores expects exactly two classes, got a confusion matrix "
            f"of shape {cm.shape}; pass labels=[negative, positive] explicitly."
        )

    # Row-major flattening of [[tn, fp], [fn, tp]]; plain ints avoid NumPy
    # divide-by-zero RuntimeWarnings below.
    tn, fp, fn, tp = (int(count) for count in cm.ravel())

    def ratio(numerator, denominator):
        # An undefined score (e.g. precision when a class is never predicted)
        # is reported as nan rather than raising or warning.
        return numerator / denominator if denominator else float("nan")

    print(f"Overall Accuracy                      : {ratio(tp + tn, tp + fp + tn + fn):.2f}")
    print(f"Precision of Positive cases           : {ratio(tp, tp + fp):.2f}")
    print(f"Precision of Negative cases           : {ratio(tn, tn + fn):.2f}")
    print(f"Positive Recall or TPR or Sensitivity : {ratio(tp, tp + fn):.2f}")
    print(f"Negative Recall or TNR or Specificity : {ratio(tn, tn + fp):.2f}")

## Logistic Regression

In [28]:
# Import the logistic regression estimator (the evaluation metrics were imported above)
from sklearn.linear_model import LogisticRegression

In [29]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only. The same fitted scaler is later
# applied to both X_train and X_test, so no scaling statistics are learned
# from the test rows.
ss = StandardScaler()
ss.fit(X_train)

In [30]:
X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(X_test)

In [31]:
# Train a logistic regression classifier with default hyperparameters on the
# standardized training features.
model = LogisticRegression()
model.fit(X_train_scaled,y_train)

In [32]:
# Check model's performance with train data 
model.score(X_train_scaled,y_train)

0.8248337028824834

In [33]:
y_train_pred = model.predict(X_train_scaled)

In [34]:
accuracy_score(y_train,y_train_pred)

0.8248337028824834

In [35]:
y_pred = model.predict(X_test_scaled)

In [36]:
accuracy_score(y_test,y_pred)

0.7787610619469026

In [37]:
confusion_matrix(y_test,y_pred)

array([[13, 22],
       [ 3, 75]], dtype=int64)

In [24]:
print_scores(y_test,y_pred)

Overall Accuracy                      : 0.78
Precision of Positive cases           : 0.77
Precision of Negative cases           : 0.81
Positive Recall or TPR or Sensitivity : 0.96
Negative Recall or TNR or Specificity : 0.37


### Display classification report

In [25]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           N       0.81      0.37      0.51        35
           Y       0.77      0.96      0.86        78

    accuracy                           0.78       113
   macro avg       0.79      0.67      0.68       113
weighted avg       0.79      0.78      0.75       113



In [26]:
model.coef_

array([[-0.06979934,  1.53244325, -0.0685536 ,  0.02285822, -0.03793511,
         0.03793511, -0.0905659 ,  0.0905659 , -0.05276279, -0.06274928,
         0.08283854,  0.06011098,  0.04866868, -0.04866868, -0.0189709 ,
         0.0189709 , -0.14771433,  0.22034042, -0.08670964]])

In [27]:
y_pred_prob = model.predict_proba(X_test_scaled)

In [28]:
y_pred_prob[5:10], y_pred[5:10]

(array([[0.21602546, 0.78397454],
        [0.93865142, 0.06134858],
        [0.14906319, 0.85093681],
        [0.18778176, 0.81221824],
        [0.34574458, 0.65425542]]),
 array(['Y', 'N', 'Y', 'Y', 'Y'], dtype=object))

## Decision Tree

In [38]:
from sklearn.tree import DecisionTreeClassifier

In [39]:
# Shallow tree (depth <= 3, at least 5 samples to split a node), fit on the
# unscaled features. random_state is fixed because scikit-learn permutes the
# features at every split, so ties between equally good splits could otherwise
# be resolved differently from run to run.
model = DecisionTreeClassifier(max_depth=3, min_samples_split=5, random_state=0)
model.fit(X_train,y_train)

In [40]:
model.score(X_train,y_train)

0.8403547671840355

In [33]:
y_train_pred = model.predict(X_train)

In [34]:
print_scores(y_train,y_train_pred)

Overall Accuracy                      : 0.84
Precision of Positive cases           : 0.82
Precision of Negative cases           : 0.91
Positive Recall or TPR or Sensitivity : 0.97
Negative Recall or TNR or Specificity : 0.56


In [35]:
y_pred = model.predict(X_test)

In [36]:
print_scores(y_test,y_pred)

Overall Accuracy                      : 0.79
Precision of Positive cases           : 0.79
Precision of Negative cases           : 0.79
Positive Recall or TPR or Sensitivity : 0.95
Negative Recall or TNR or Specificity : 0.43


In [37]:
confusion_matrix(y_test,y_pred)

array([[15, 20],
       [ 4, 74]], dtype=int64)

In [123]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           N       0.79      0.43      0.56        35
           Y       0.79      0.95      0.86        78

    accuracy                           0.79       113
   macro avg       0.79      0.69      0.71       113
weighted avg       0.79      0.79      0.77       113



### Decision Tree Created by Classifier

In [41]:
# Render the fitted decision tree as indented text rules, labelling each split
# with the corresponding training column name.
from sklearn.tree import export_text
tree_rules = export_text(model, feature_names=X_train.columns.tolist())
print(tree_rules)

|--- CreditHistory <= 0.50
|   |--- LoanAmountLog <= 6.30
|   |   |--- LoanAmountLog <= 4.86
|   |   |   |--- class: N
|   |   |--- LoanAmountLog >  4.86
|   |   |   |--- class: N
|   |--- LoanAmountLog >  6.30
|   |   |--- class: Y
|--- CreditHistory >  0.50
|   |--- IncomeLog <= 9.90
|   |   |--- IncomeLog <= 7.78
|   |   |   |--- class: N
|   |   |--- IncomeLog >  7.78
|   |   |   |--- class: Y
|   |--- IncomeLog >  9.90
|   |   |--- IncomeLog <= 10.59
|   |   |   |--- class: N
|   |   |--- IncomeLog >  10.59
|   |   |   |--- class: Y



## KNN 

In [42]:
from sklearn.neighbors import KNeighborsClassifier

In [44]:
# k-nearest-neighbours classifier with k=3, fit on the standardized training
# features (the same scaled matrix used for logistic regression).
model = KNeighborsClassifier(n_neighbors=3, n_jobs = 1)
model.fit(X_train_scaled,y_train)

In [45]:
y_train_pred = model.predict(X_train_scaled)

In [46]:
print_scores(y_train,y_train_pred)

Overall Accuracy                      : 0.84
Precision of Positive cases           : 0.83
Precision of Negative cases           : 0.86
Positive Recall or TPR or Sensitivity : 0.95
Negative Recall or TNR or Specificity : 0.60


In [47]:
y_pred = model.predict(X_test_scaled)

In [48]:
print_scores(y_test,y_pred)

Overall Accuracy                      : 0.67
Precision of Positive cases           : 0.71
Precision of Negative cases           : 0.43
Positive Recall or TPR or Sensitivity : 0.90
Negative Recall or TNR or Specificity : 0.17


## Naive Bayes 

In [49]:
from sklearn.naive_bayes import GaussianNB

In [50]:
# Gaussian Naive Bayes with default settings, fit on the unscaled training
# features (X_train, not X_train_scaled).
model = GaussianNB()
model.fit(X_train,y_train)

In [51]:
y_train_pred = model.predict(X_train)

In [52]:
print_scores(y_train,y_train_pred)

Overall Accuracy                      : 0.82
Precision of Positive cases           : 0.80
Precision of Negative cases           : 0.88
Positive Recall or TPR or Sensitivity : 0.97
Negative Recall or TNR or Specificity : 0.50


In [53]:
y_pred = model.predict(X_test)

In [55]:
print_scores(y_test,y_pred)

Overall Accuracy                      : 0.79
Precision of Positive cases           : 0.79
Precision of Negative cases           : 0.79
Positive Recall or TPR or Sensitivity : 0.95
Negative Recall or TNR or Specificity : 0.43


## Support Vector Machines

In [54]:
from sklearn.svm import SVC

In [55]:
# Support vector classifier with default hyperparameters, fit on the
# standardized training features.
model = SVC()
model.fit(X_train_scaled,y_train)

In [56]:
y_train_pred = model.predict(X_train_scaled)

In [57]:
print_scores(y_train,y_train_pred)

Overall Accuracy                      : 0.83
Precision of Positive cases           : 0.80
Precision of Negative cases           : 0.96
Positive Recall or TPR or Sensitivity : 0.99
Negative Recall or TNR or Specificity : 0.49


In [58]:
y_pred = model.predict(X_test_scaled)

In [62]:
print_scores(y_test,y_pred)

Overall Accuracy                      : 0.78
Precision of Positive cases           : 0.77
Precision of Negative cases           : 0.81
Positive Recall or TPR or Sensitivity : 0.96
Negative Recall or TNR or Specificity : 0.37
