# Fine-Tuned SVM for Predicting Loans

In [1]:
#importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split  # Correct import for train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV # Import GridSearchCV from sklearn.model_selection


In [2]:
from google.colab import drive


### Loading the data and Preprocessing

In [3]:
# Mount the Google Drive at /content/drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [4]:
df = pd.read_csv('/content/loan_data.csv')

In [5]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [7]:
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [8]:
df.isnull().sum()

Unnamed: 0,0
Loan_ID,0
Gender,13
Married,3
Dependents,15
Education,0
Self_Employed,32
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,22
Loan_Amount_Term,14


In [9]:
#Filling null values for numerical columns with mean
df['LoanAmount'].fillna(df['LoanAmount'].mean(), inplace=True)
df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['LoanAmount'].fillna(df['LoanAmount'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mean(), inplace=True)


In [10]:
#Filling null values for categorical columns with mode
df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace=True)
df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)
df['Married'].fillna(df['Married'].mode()[0], inplace=True)
df['Dependents'].fillna(df['Dependents'].mode()[0], inplace=True)
df['Self_Employed'].fillna(df['Self_Employed'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate

#### Dropping irrelevent columns

In [11]:
df.drop('Loan_ID', axis=1, inplace=True)

### Preparing the model for SVM

#### Determinnign the target variable

In [12]:
#Load status is the target variable
df['Loan_Status'].value_counts()

Unnamed: 0_level_0,count
Loan_Status,Unnamed: 1_level_1
Y,422
N,192


There is imbalance in load status values (Y is more)

#### Label encoding for categorical columns

In [13]:
#(Y :1 , N:0)
df.replace({"Loan_Status": {"Y": 1, "N": 0}}, inplace=True)

  df.replace({"Loan_Status": {"Y": 1, "N": 0}}, inplace=True)


In [14]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,1
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1


In [15]:
#Encoding other columns
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
df['Married'] = df['Married'].map({'Yes': 1, 'No': 0})
df['Self_Employed'] = df['Self_Employed'].map({'Yes': 1, 'No': 0})

In [16]:
#One-hot encoding for education and property area
df = pd.get_dummies(df, columns=['Education', 'Property_Area'], drop_first=True)

In [17]:
# checking out the value counts of the dependents variable
df['Dependents'].value_counts()

Unnamed: 0_level_0,count
Dependents,Unnamed: 1_level_1
0,360
1,102
2,101
3+,51


In [18]:
# replacing all the 3+ in the Dependents variable with 3
df = df.replace(to_replace='3+', value=3)

we replaced 3+ with 3 for Dependents column

### Training SVM model

In [19]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [20]:
# Separating the features X and the target variable y
X = df[['Gender', 'Married', 'Dependents', 'Self_Employed', 'ApplicantIncome',
         'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History',
         'Education_Not Graduate', 'Property_Area_Semiurban', 'Property_Area_Urban']]
y = df['Loan_Status']

In [21]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

#### Kernel selection

In [22]:
# training the SVM model base on sinlge kernel linear
model = svm.SVC(kernel='linear')
model.fit(X_train, y_train)

In [23]:
# Evaluating the Model
print('Kernel linear')
X_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, X_train_pred)
print(f"Train Accuracy: {train_accuracy}")

X_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, X_test_pred)
print(f"Test Accuracy: {test_accuracy}")

# Predicting a Loan Status for a New Input
user_input = [0, 1, 0, 0, 0, 5849, 0.0, 146.412162, 360.0, 1.0, 1, 0]
user_input = np.array(user_input).reshape(1, -1)

if model.predict(user_input) == 1:
    print("Loan Status: Approved")
else:
    print("Loan Status: Not Approved")

Train Accuracy: 0.7983706720977597
Test Accuracy: 0.7723577235772358
Loan Status: Approved


In [24]:
# training the SVM model base on sinlge kernel sigmoid
model = svm.SVC(kernel='sigmoid')
model.fit(X_train, y_train)

In [25]:
print('Kernel sigmoid')

# Evaluating the Model
X_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, X_train_pred)
print(f"Train Accuracy: {train_accuracy}")

X_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, X_test_pred)
print(f"Test Accuracy: {test_accuracy}")

# Predicting a Loan Status for a New Input
user_input = [0, 1, 0, 0, 0, 5849, 0.0, 146.412162, 360.0, 1.0, 1, 0]
user_input = np.array(user_input).reshape(1, -1)

if model.predict(user_input) == 1:
    print("Loan Status: Approved")
else:
    print("Loan Status: Not Approved")

Train Accuracy: 0.5845213849287169
Test Accuracy: 0.6097560975609756
Loan Status: Not Approved


In [26]:
# training the SVM model base on sinlge kernel sigmoid
model = svm.SVC(kernel='poly')
model.fit(X_train, y_train)

In [27]:
print('Kernel poly')

# Evaluating the Model
X_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, X_train_pred)
print(f"Train Accuracy: {train_accuracy}")

X_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, X_test_pred)
print(f"Test Accuracy: {test_accuracy}")

# Predicting a Loan Status for a New Input
user_input = [0, 1, 0, 0, 0, 5849, 0.0, 146.412162, 360.0, 1.0, 1, 0]
user_input = np.array(user_input).reshape(1, -1)

if model.predict(user_input) == 1:
    print("Loan Status: Approved")
else:
    print("Loan Status: Not Approved")

Train Accuracy: 0.7026476578411406
Test Accuracy: 0.6504065040650406
Loan Status: Approved


In [28]:
# training the SVM model base on sinlge kernel sigmoid
model = svm.SVC(kernel='rbf')
model.fit(X_train, y_train)

In [29]:
print('kernel rbf')
# Evaluating the Model
X_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, X_train_pred)
print(f"Train Accuracy: {train_accuracy}")

X_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, X_test_pred)
print(f"Test Accuracy: {test_accuracy}")

# Predicting a Loan Status for a New Input
user_input = [0, 1, 0, 0, 0, 5849, 0.0, 146.412162, 360.0, 1.0, 1, 0]
user_input = np.array(user_input).reshape(1, -1)

if model.predict(user_input) == 1:
    print("Loan Status: Approved")
else:
    print("Loan Status: Not Approved")

Train Accuracy: 0.7026476578411406
Test Accuracy: 0.6504065040650406
Loan Status: Approved


### Integrating grid search for fine-tuning the SVM model
- using best parameters found for evaluation

In [None]:
# Defining the parameter grid for fine-tuning
param_grid = {
    'C': [0.1, 1, 10],                  # Regularization parameter
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Kernels to experiment with
    'gamma': ['scale', 'auto'],         # Kernel coefficient
    'degree': [2, 3, 4]                 # Degree for polynomial kernel
}

# Initialize GridSearchCV
grid_search = GridSearchCV(svm.SVC(), param_grid, cv=5, scoring='accuracy', verbose=1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Extract the best model and parameters
best_model = grid_search.best_estimator_

print("Best Parameters:", grid_search.best_params_)
print("Best Training Accuracy:", grid_search.best_score_)

# Evaluate the best model on the test data
X_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, X_test_pred)
print("Test Accuracy of Best Model:", test_accuracy)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [None]:
# Predicting a Loan Status for a New Input
user_input = [0, 1, 0, 0, 0, 5849, 0.0, 146.412162, 360.0, 1.0, 1, 0]
user_input = np.array(user_input).reshape(1, -1)

if best_model.predict(user_input) == 1:
    print("Loan Status: Approved")
else:
    print("Loan Status: Not Approved")