In [1]:
# Importing the librairies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Link to the dataset
url = './datasets/Loan_Train.csv'

In [3]:
# Important the dataset
loan_data = pd.read_csv(url, sep = ',')

# Checking the size of the dataset
loan_data.shape

(614, 13)

In [4]:
# Display the first 5 rows
loan_data.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
# Drop the column “Load_ID.”
loan_data.drop(columns=['Loan_ID'], inplace = True) 

# Display the first 5 rows
loan_data.head(5)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [6]:
# Drop any rows with missing data.
loan_data.dropna(inplace=True)

# Size of the dataset
loan_data.shape

(480, 12)

In [7]:
# Convert the categorical features into dummy variables.
loan_data = pd.get_dummies(loan_data, drop_first=True)

# View the dataset
loan_data.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y
1,4583,1508.0,128.0,360.0,1.0,True,True,True,False,False,False,False,False,False,False
2,3000,0.0,66.0,360.0,1.0,True,True,False,False,False,False,True,False,True,True
3,2583,2358.0,120.0,360.0,1.0,True,True,False,False,False,True,False,False,True,True
4,6000,0.0,141.0,360.0,1.0,True,False,False,False,False,False,False,False,True,True
5,5417,4196.0,267.0,360.0,1.0,True,True,False,True,False,False,True,False,True,True


In [8]:
# Splitting the data into features and target
features = loan_data.drop(columns=['Loan_Status_Y'])
target = loan_data['Loan_Status_Y']

# Splitting the data into training and test sets
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=0)

In [9]:
# Creating the min-max scaler
scaler = MinMaxScaler()

# Creating a knn classifier
knn_classifier = KNeighborsClassifier()

# Creating the pipeline
pipe = Pipeline([('scaler', scaler),('knn', knn_classifier)])

In [10]:
# Fitting the pipeline to the training data
pipe.fit(features_train, target_train)

In [11]:
# Predictions on the test set
pred = pipe.predict(features_test)

# Calculating the accuracy
accuracy = accuracy_score(target_test, pred)

# Printing the accuracy
print('The accuracy of the model is ', accuracy)

The accuracy of the model is  0.6979166666666666


In [12]:
# creating a space of candidate values
search_space = [{'knn__n_neighbors': [1,2,3,4,5,6,7,8,9,10]}]

In [13]:
# Creating a grid search
classifier = GridSearchCV(pipe, search_space, cv=5, verbose=0)

# Fitting the grid search
classifier.fit(features_train, target_train)

# Finding the best value for the “n_neighbors” parameter
best_estimator = classifier.best_estimator_.get_params()['knn__n_neighbors']

# Printing the best estimator
print('The best value for the n_neighbors parameter is', best_estimator)

The best value for the n_neighbors parameter is 8


In [14]:
# Grid search best model
best_model = classifier.best_estimator_

# Making predictions on the test set
pred = best_model.predict(features_test)

# Calculating the accuracy
accuracy = accuracy_score(target_test, pred)

# Printing the best model
print('Best model: ',best_model)

# Printing the accuracy
print('The accuracy of the model is ', accuracy)

Best model:  Pipeline(steps=[('scaler', MinMaxScaler()),
                ('knn', KNeighborsClassifier(n_neighbors=8))])
The accuracy of the model is  0.7604166666666666


In [15]:
# Hyperparameter values in section 12.3
search_space = [{'classifier': [LogisticRegression(max_iter=500, 
        solver='liblinear')],
                 'classifier__penalty': ['l1', 'l2'],
        'classifier__C': np.logspace(0, 4, 10)}, 
        {'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [10, 100, 1000],
        'classifier__max_features': [1, 2, 3]}]

# Pipeline with logistic regression
pipe = Pipeline([('scaler', scaler), ('classifier', LogisticRegression())])

# Creating a grid search
gridsearch = GridSearchCV(pipe, search_space, cv=5, verbose=0)

# Fitting the grip search
gridsearch.fit(features_train, target_train)

In [16]:
# Best model
best_model2 = gridsearch.best_estimator_

# Predictions
pred2 = best_model2.predict(features_test)

# Accuracy
accuracy2 = accuracy_score(target_test, pred2)

# Printing tge results
print('The best model and hyperparameters are ', best_model2)
print('The accuracy is ', accuracy2)

The best model and hyperparameters are  Pipeline(steps=[('scaler', MinMaxScaler()),
                ('classifier',
                 RandomForestClassifier(max_features=3, n_estimators=1000))])
The accuracy is  0.7604166666666666
