# CLASSIFICATION CASE STUDY

### IMPORT LIBRARIES

In [1]:
# IMPORT REQUIRED PACKAGES
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style = "white")

# LOAD DATA FROM FINAL TRAIN CSV

In [2]:
train = pd.read_csv("final_train.csv")

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564 entries, 0 to 563
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          564 non-null    object 
 1   Married         564 non-null    object 
 2   Dependents      564 non-null    int64  
 3   Education       564 non-null    object 
 4   SelfEmployed    564 non-null    object 
 5   LoanAmountTerm  564 non-null    float64
 6   CreditHistory   564 non-null    float64
 7   PropertyArea    564 non-null    object 
 8   LoanStatus      564 non-null    int64  
 9   LoanAmountLog   564 non-null    float64
 10  IncomeLog       564 non-null    float64
dtypes: float64(4), int64(2), object(5)
memory usage: 48.6+ KB


In [4]:
X = train.drop(columns = ["LoanStatus"])
y = train.LoanStatus

In [5]:
X = pd.get_dummies(X)  # ONE HOT ENCODING

In [6]:
X.columns

Index(['Dependents', 'LoanAmountTerm', 'CreditHistory', 'LoanAmountLog',
       'IncomeLog', 'Gender_Female', 'Gender_Male', 'Married_No',
       'Married_Yes', 'Education_Graduate', 'Education_Not Graduate',
       'SelfEmployed_No', 'SelfEmployed_Yes', 'PropertyArea_Rural',
       'PropertyArea_Semiurban', 'PropertyArea_Urban'],
      dtype='object')

In [7]:
X.sample(5)

Unnamed: 0,Dependents,LoanAmountTerm,CreditHistory,LoanAmountLog,IncomeLog,Gender_Female,Gender_Male,Married_No,Married_Yes,Education_Graduate,Education_Not Graduate,SelfEmployed_No,SelfEmployed_Yes,PropertyArea_Rural,PropertyArea_Semiurban,PropertyArea_Urban
558,0,360.0,1.0,4.682131,8.552946,0,1,0,1,1,0,1,0,1,0,0
246,3,360.0,0.0,5.075174,8.723231,0,1,0,1,1,0,1,0,0,0,1
314,0,360.0,1.0,3.828641,7.774015,1,0,1,0,1,0,1,0,1,0,0
162,0,360.0,1.0,4.75359,8.605387,0,1,0,1,1,0,1,0,1,0,0
471,0,360.0,1.0,5.739793,9.157045,0,1,1,0,1,0,1,0,1,0,0


In [8]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564 entries, 0 to 563
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Dependents              564 non-null    int64  
 1   LoanAmountTerm          564 non-null    float64
 2   CreditHistory           564 non-null    float64
 3   LoanAmountLog           564 non-null    float64
 4   IncomeLog               564 non-null    float64
 5   Gender_Female           564 non-null    uint8  
 6   Gender_Male             564 non-null    uint8  
 7   Married_No              564 non-null    uint8  
 8   Married_Yes             564 non-null    uint8  
 9   Education_Graduate      564 non-null    uint8  
 10  Education_Not Graduate  564 non-null    uint8  
 11  SelfEmployed_No         564 non-null    uint8  
 12  SelfEmployed_Yes        564 non-null    uint8  
 13  PropertyArea_Rural      564 non-null    uint8  
 14  PropertyArea_Semiurban  564 non-null    ui

In [9]:
X.shape

(564, 16)

# SPLIT DATA INTO TRAIN AND TEST

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [11]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [12]:
def print_scores(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    tn = cm[0, 0]
    tp = cm[1, 1]
    fp = cm[0, 1]
    fn = cm[1, 0]
    print(f"Overall Accuracy                           :{(tp + tn) / (tp + tn + fp + fn):.2f}")
    print(f"Precision of Positive cases                :{tp / (tp + fp):.2f}")
    print(f"Precision of Negative cases                :{tn / (tn + fn):.2f}")
    print(f"Positive Recall or TPR or Sensitivity      :{tp / (tp + fn):.2f}")
    print(f"Negative Recall or FPR or Specificity      :{tn / (tn + fp):.2f}")

# LOGISTIC REGRESSION

In [13]:
# IMPORTING PACKAGES LOGISTIC REGRESSION AND EVALUATION
from sklearn.linear_model import LogisticRegression

In [14]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss.fit(X_train)

StandardScaler()

In [15]:
X_train_scaled = ss.transform(X_train)
X_test_scaled  = ss.transform(X_test)

In [16]:
# LOGISTIC REGRESSION - TRAIN MODEL
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

LogisticRegression()

In [17]:
# CHECK MODEL'S PERFORMANCE WITH TRAIN DATA
model.score(X_train_scaled, y_train)

0.8248337028824834

In [18]:
y_pred = model.predict(X_test_scaled)

In [19]:
accuracy_score(y_test, y_pred)

0.7787610619469026

In [20]:
confusion_matrix(y_test, y_pred)

array([[13, 22],
       [ 3, 75]], dtype=int64)

In [21]:
print_scores(y_test, y_pred)

Overall Accuracy                           :0.78
Precision of Positive cases                :0.77
Precision of Negative cases                :0.81
Positive Recall or TPR or Sensitivity      :0.96
Negative Recall or FPR or Specificity      :0.37


### DISPLAY CLASSIFICATION REPORT

In [22]:
classification_report(y_test, y_pred)

'              precision    recall  f1-score   support\n\n           0       0.81      0.37      0.51        35\n           1       0.77      0.96      0.86        78\n\n    accuracy                           0.78       113\n   macro avg       0.79      0.67      0.68       113\nweighted avg       0.79      0.78      0.75       113\n'

In [23]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.37      0.51        35
           1       0.77      0.96      0.86        78

    accuracy                           0.78       113
   macro avg       0.79      0.67      0.68       113
weighted avg       0.79      0.78      0.75       113



In [24]:
model.coef_

array([[ 0.12504115, -0.06339225,  1.52887566, -0.07144193,  0.01978037,
        -0.04273509,  0.04273509, -0.08791328,  0.08791328,  0.04988399,
        -0.04988399, -0.01659069,  0.01659069, -0.14173176,  0.21747696,
        -0.08953919]])

In [25]:
model.intercept_

array([0.76037075])

In [26]:
y_pred_prob = model.predict_proba(X_test_scaled)

In [27]:
y_pred_prob[5:10], y_pred[5:10]

(array([[0.18926033, 0.81073967],
        [0.93918012, 0.06081988],
        [0.16626366, 0.83373634],
        [0.19272632, 0.80727368],
        [0.35005484, 0.64994516]]),
 array([1, 0, 1, 1, 1], dtype=int64))

In [28]:
y_pred_prob[5:10]

array([[0.18926033, 0.81073967],
       [0.93918012, 0.06081988],
       [0.16626366, 0.83373634],
       [0.19272632, 0.80727368],
       [0.35005484, 0.64994516]])

In [29]:
y_pred_prob[0:5], y_pred[5:10]

(array([[0.24778533, 0.75221467],
        [0.10699829, 0.89300171],
        [0.12604105, 0.87395895],
        [0.12853865, 0.87146135],
        [0.23119301, 0.76880699]]),
 array([1, 0, 1, 1, 1], dtype=int64))

### DECISION TREE

In [30]:
from sklearn.tree import DecisionTreeClassifier

In [31]:
model = DecisionTreeClassifier(max_depth = 3)
model.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=3)

In [32]:
model.score(X_train, y_train)

0.8403547671840355

In [33]:
y_train_pred = model.predict(X_train)

In [34]:
print_scores(y_train, y_train_pred)

Overall Accuracy                           :0.84
Precision of Positive cases                :0.82
Precision of Negative cases                :0.91
Positive Recall or TPR or Sensitivity      :0.97
Negative Recall or FPR or Specificity      :0.56


In [35]:
y_pred = model.predict(X_test)

In [36]:
print_scores(y_test, y_pred)

Overall Accuracy                           :0.78
Precision of Positive cases                :0.78
Precision of Negative cases                :0.75
Positive Recall or TPR or Sensitivity      :0.94
Negative Recall or FPR or Specificity      :0.43


In [37]:
confusion_matrix(y_test, y_pred)

array([[15, 20],
       [ 5, 73]], dtype=int64)

In [38]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.43      0.55        35
           1       0.78      0.94      0.85        78

    accuracy                           0.78       113
   macro avg       0.77      0.68      0.70       113
weighted avg       0.77      0.78      0.76       113



### DECISION TREE BY CLASSIFIER

In [39]:
# PRINT TREE GENERATED BY DECISIONTREECLASSIFIER
from sklearn.tree import export_text
tree_rules = export_text(model, feature_names = list(X_train))
print(tree_rules)

|--- CreditHistory <= 0.50
|   |--- LoanAmountLog <= 6.30
|   |   |--- LoanAmountLog <= 4.86
|   |   |   |--- class: 0
|   |   |--- LoanAmountLog >  4.86
|   |   |   |--- class: 0
|   |--- LoanAmountLog >  6.30
|   |   |--- class: 1
|--- CreditHistory >  0.50
|   |--- IncomeLog <= 9.90
|   |   |--- IncomeLog <= 7.78
|   |   |   |--- class: 0
|   |   |--- IncomeLog >  7.78
|   |   |   |--- class: 1
|   |--- IncomeLog >  9.90
|   |   |--- Dependents <= 2.00
|   |   |   |--- class: 0
|   |   |--- Dependents >  2.00
|   |   |   |--- class: 1



### KNN

In [40]:
from sklearn.neighbors import KNeighborsClassifier

In [41]:
model = KNeighborsClassifier(n_neighbors = 5)
model.fit(X_train_scaled, y_train)

KNeighborsClassifier()

In [42]:
y_train_pred = model.predict(X_train_scaled)

In [43]:
print_scores(y_train, y_train_pred)

Overall Accuracy                           :0.82
Precision of Positive cases                :0.80
Precision of Negative cases                :0.89
Positive Recall or TPR or Sensitivity      :0.97
Negative Recall or FPR or Specificity      :0.49


In [44]:
y_pred = model.predict(X_test_scaled)

In [45]:
print_scores(y_test, y_pred)

Overall Accuracy                           :0.75
Precision of Positive cases                :0.76
Precision of Negative cases                :0.71
Positive Recall or TPR or Sensitivity      :0.94
Negative Recall or FPR or Specificity      :0.34


### NAIVE BAYES

In [46]:
from sklearn.naive_bayes import GaussianNB

In [47]:
model = GaussianNB()
model.fit(X_train, y_train)

GaussianNB()

In [48]:
y_train_pred = model.predict(X_train)

In [49]:
print_scores(y_train, y_train_pred)

Overall Accuracy                           :0.82
Precision of Positive cases                :0.80
Precision of Negative cases                :0.89
Positive Recall or TPR or Sensitivity      :0.97
Negative Recall or FPR or Specificity      :0.49


In [50]:
y_pred = model.predict(X_test)

In [51]:
print_scores(y_test, y_pred)

Overall Accuracy                           :0.79
Precision of Positive cases                :0.79
Precision of Negative cases                :0.79
Positive Recall or TPR or Sensitivity      :0.95
Negative Recall or FPR or Specificity      :0.43


### SUPPORT VECTOR MACHINES

In [53]:
from sklearn.svm import SVC

In [54]:
model = SVC()
model.fit(X_train_scaled, y_train)

SVC()

In [55]:
y_train_pred = model.predict(X_train_scaled)

In [57]:
print_scores(y_train, y_train_pred)

Overall Accuracy                           :0.83
Precision of Positive cases                :0.81
Precision of Negative cases                :0.96
Positive Recall or TPR or Sensitivity      :0.99
Negative Recall or FPR or Specificity      :0.50


In [58]:
y_pred = model.predict(X_test_scaled)

In [59]:
print_scores(y_test, y_pred)

Overall Accuracy                           :0.78
Precision of Positive cases                :0.77
Precision of Negative cases                :0.81
Positive Recall or TPR or Sensitivity      :0.96
Negative Recall or FPR or Specificity      :0.37
