# We will try to solve this problem using SVM and Naive Bias Algorithm

In [1]:
import pandas as pd

In [17]:
#Step 1: Read the data
bank = pd.read_csv(r"E:/Dataset/McGill Bank.csv")
bank.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0.0,Graduate,No,5849,0.0,128,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1.0,Graduate,No,4583,1508.0,128,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0.0,Graduate,Yes,3000,0.0,66,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0.0,Not Graduate,No,2583,2358.0,120,360.0,1.0,Urban,Y
4,LP001008,Male,No,0.0,Graduate,No,6000,0.0,141,360.0,1.0,Urban,Y


In [18]:
# Data Pre-processing : Lets check for the missing values

bank.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [20]:
bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    float64
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         614 non-null    int64  
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(2), object(7)
memory usage: 62.5+ KB


In [5]:
# Missing values columns:

# Categorical : ['Gender','Married','Dependents','Self_Employed','Credit_History']
        # we can use Mode here 
# Numerical : ['Loan_Amount_Term']
        # we can use Mean/Median, depending upon skewness value

In [19]:
bank['Gender'].value_counts()
# To find Mode
bank['Gender'].mode()[0]

'Male'

In [21]:
cat_col = ['Gender', 'Married', 'Dependents','Self_Employed','Credit_History']

for i in cat_col:
    bank[i].fillna(bank[i].mode()[0],inplace=True)

In [22]:
bank.isnull().sum() #We can see the values has been replaced by the mode value

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     14
Credit_History        0
Property_Area         0
Loan_Status           0
dtype: int64

In [23]:
bank['Loan_Amount_Term'].skew()

-2.362414124216269

Skew value is less than 1 so we should replace the col with median value.

In [25]:
bank['Loan_Amount_Term'].fillna(bank['Loan_Amount_Term'].median(), inplace=True)

In [26]:
# Lets check our data again
bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             614 non-null    object 
 2   Married            614 non-null    object 
 3   Dependents         614 non-null    float64
 4   Education          614 non-null    object 
 5   Self_Employed      614 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         614 non-null    int64  
 9   Loan_Amount_Term   614 non-null    float64
 10  Credit_History     614 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(2), object(7)
memory usage: 62.5+ KB


 We need to convert text data into numerical form as per dummy variable concept

In [27]:
# For changing the Loan_Status col into numerical col
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

#whatever comes first it will convert it to 0 next it will convert to 1 as N-0 and Y-1 
bank['Loan_Status'] = le.fit_transform(bank['Loan_Status'])

In [28]:
bank.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0.0,Graduate,No,5849,0.0,128,360.0,1.0,Urban,1
1,LP001003,Male,Yes,1.0,Graduate,No,4583,1508.0,128,360.0,1.0,Rural,0
2,LP001005,Male,Yes,0.0,Graduate,Yes,3000,0.0,66,360.0,1.0,Urban,1
3,LP001006,Male,Yes,0.0,Not Graduate,No,2583,2358.0,120,360.0,1.0,Urban,1
4,LP001008,Male,No,0.0,Graduate,No,6000,0.0,141,360.0,1.0,Urban,1


Gender,Married, Education, Self_Employed, Property_Area 
We will convert them as per dummy variable concept 


Machine learning algo can not understand text values so we need to convert them into respective numerical values 

In [29]:
bank_final = pd.get_dummies(bank, columns = ['Gender','Married','Education','Self_Employed','Property_Area'], drop_first = True) #To reduce number of col
bank_final.head()

Unnamed: 0,Loan_ID,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,LP001002,0.0,5849,0.0,128,360.0,1.0,1,1,0,0,0,0,1
1,LP001003,1.0,4583,1508.0,128,360.0,1.0,0,1,1,0,0,0,0
2,LP001005,0.0,3000,0.0,66,360.0,1.0,1,1,1,0,1,0,1
3,LP001006,0.0,2583,2358.0,120,360.0,1.0,1,1,1,1,0,0,1
4,LP001008,0.0,6000,0.0,141,360.0,1.0,1,1,0,0,0,0,1


We ca use either LabelEncoder or Dummy variable

In [30]:
# Step 2: Defining X and Y

Y = bank_final[['Loan_Status']]
X = bank_final.drop(columns=['Loan_ID','Loan_Status'])

Since there are some col where max value and min value is in very high range, we can standardize/scale
our data using min max scaler

Formula = (X1-avg of col) / (max of col - min of col)

In [31]:
# Snce there are some columns where the maximum value and minimum value is in very High range. 
# we can standardize / scale our data using min-max scaler

# (X1 - average of column) / (maximum of column - minimum of column)

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

X_scal = sc.fit_transform(X)

In [32]:
pd.DataFrame(X_scal)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-0.737806,0.072991,-0.554487,-0.211241,0.273231,0.411733,0.472343,-1.372089,-0.528362,-0.392601,-0.782016,1.428147
1,0.253470,-0.134412,-0.038732,-0.211241,0.273231,0.411733,0.472343,0.728816,-0.528362,-0.392601,-0.782016,-0.700208
2,-0.737806,-0.393747,-0.554487,-0.948996,0.273231,0.411733,0.472343,0.728816,-0.528362,2.547117,-0.782016,1.428147
3,-0.737806,-0.462062,0.251980,-0.306435,0.273231,0.411733,0.472343,0.728816,1.892641,-0.392601,-0.782016,1.428147
4,-0.737806,0.097728,-0.554487,-0.056551,0.273231,0.411733,0.472343,-1.372089,-0.528362,-0.392601,-0.782016,1.428147
...,...,...,...,...,...,...,...,...,...,...,...,...
609,-0.737806,-0.410130,-0.554487,-0.889500,0.273231,0.411733,-2.117107,-1.372089,-0.528362,-0.392601,-0.782016,-0.700208
610,2.236021,-0.212557,-0.554487,-1.258378,-2.522836,0.411733,0.472343,0.728816,-0.528362,-0.392601,-0.782016,-0.700208
611,0.253470,0.437174,-0.472404,1.276168,0.273231,0.411733,0.472343,0.728816,-0.528362,-0.392601,-0.782016,1.428147
612,1.244745,0.357064,-0.554487,0.490816,0.273231,0.411733,0.472343,0.728816,-0.528362,-0.392601,-0.782016,1.428147


In [33]:
# STep 3:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X_scal, Y, train_size=0.8, random_state=10)

In [34]:
# Step 4: building the model:

# create model object
from sklearn.svm import SVC
svm = SVC()

# fit the object on training data
model = svm.fit(X_train,Y_train)
model

  y = column_or_1d(y, warn=True)


SVC()

In [35]:
# Predict the test cases

Y_test['predicted_svm'] = model.predict(X_test)
Y_test

Unnamed: 0,Loan_Status,predicted_svm
285,1,1
323,1,1
482,1,1
173,1,1
518,0,1
...,...,...
554,0,0
6,1,1
242,1,1
192,0,1


In [36]:
# Check the accuracy of the model
from sklearn.metrics import classification_report

print(classification_report(Y_test['Loan_Status'], Y_test['predicted_svm']))

              precision    recall  f1-score   support

           0       0.92      0.33      0.49        36
           1       0.78      0.99      0.87        87

    accuracy                           0.80       123
   macro avg       0.85      0.66      0.68       123
weighted avg       0.82      0.80      0.76       123



In [37]:
# Step 4: building the model:

# create model object
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()

# fit the object on training data
model_nb = nb.fit(X_train,Y_train)
model_nb

  y = column_or_1d(y, warn=True)


GaussianNB()

In [38]:
Y_test['predicted_nb'] = model_nb.predict(X_test)
Y_test

Unnamed: 0,Loan_Status,predicted_svm,predicted_nb
285,1,1,1
323,1,1,1
482,1,1,1
173,1,1,1
518,0,1,1
...,...,...,...
554,0,0,0
6,1,1,1
242,1,1,0
192,0,1,1


In [39]:
# Check the accuracy of the model
from sklearn.metrics import classification_report

print(classification_report(Y_test['Loan_Status'], Y_test['predicted_nb']))

              precision    recall  f1-score   support

           0       0.75      0.33      0.46        36
           1       0.78      0.95      0.86        87

    accuracy                           0.77       123
   macro avg       0.76      0.64      0.66       123
weighted avg       0.77      0.77      0.74       123

