In [1]:
# Load the libraries
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support, roc_curve, auc

## 1. Import the dataset and ensure that it loaded properly.

In [2]:
# read a csv file and create a dataframe
df = pd.read_csv("data/Loan_Train.csv")

In [3]:
# Display the top 10 rows of a dataframe
print("First ten rows of the dataset:")
df.head(10)

First ten rows of the dataset:


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [4]:
# Display the bottom 5 rows of a dataframe
df.tail()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y
613,LP002990,Female,No,0,Graduate,Yes,4583,0.0,133.0,360.0,0.0,Semiurban,N


In [5]:
print("Dimensions of the data frame:", df.shape)

Dimensions of the data frame: (614, 13)


In [6]:
df['Loan_Status'].value_counts()

Y    422
N    192
Name: Loan_Status, dtype: int64

##  2. Prepare the data for modeling by performing the following steps:
- Drop the column “Load_ID.”
- Drop any rows with missing data.
- Convert the categorical features into dummy variables.

### Drop the column “Load_ID.”

In [7]:
# Drop the column 'Loan_ID'
df = df.drop(columns=['Loan_ID'])

###  Drop any rows with missing data.

In [8]:
# check for missing data and dispaly the count
df.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [9]:
# drop rows with missing data
df = df.dropna()

In [10]:
df.shape

(480, 12)

### Convert the categorical features into dummy variables.

In [11]:
# Create dummy vaiables and handle dummy variable trap 
df = pd.get_dummies(df, drop_first=True)

In [12]:
df.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y
1,4583,1508.0,128.0,360.0,1.0,1,1,1,0,0,0,0,0,0,0
2,3000,0.0,66.0,360.0,1.0,1,1,0,0,0,0,1,0,1,1
3,2583,2358.0,120.0,360.0,1.0,1,1,0,0,0,1,0,0,1,1
4,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,0,0,0,1,1
5,5417,4196.0,267.0,360.0,1.0,1,1,0,1,0,0,1,0,1,1


In [13]:
df.shape

(480, 15)

## 3. Split the data into a training and test set, where the “Loan_Status” column is the target.

In [14]:
# Store input features in X and target feature in y
X = df.drop(columns=['Loan_Status_Y'])  # 'Loan_Status_Y' is the target column after get_dummies
y = df['Loan_Status_Y']

In [15]:
# Split data into train and test data with the ratio 80:20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Display the train and test data dimensions
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((384, 14), (96, 14), (384,), (96,))

## 4. Create a pipeline with a min-max scaler and a KNN classifier (see section 15.3 in the Machine Learning with Python Cookbook).

In [17]:
# create a pipeline with knn classifier and minmax scaler
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('knn', KNeighborsClassifier())
])

## 5. Fit a default KNN classifier to the data with this pipeline. Report the model accuracy on the test set. Note: Fitting a pipeline model works just like fitting a regular model.

In [18]:
# fit a KNN classifer to the train data with the pipeline
pipeline.fit(X_train, y_train)

In [19]:
# run predict function on test data
y_pred = pipeline.predict(X_test)

In [20]:
# calcualate accuracy score
knn_accuracy = accuracy_score(y_test, y_pred)

In [21]:
# display KNN accuracy score
print(f"Base KNN Model Accuracy : {knn_accuracy:.4f}")

Base KNN Model Accuracy : 0.7812


## 6. Create a search space for your KNN classifier where your “n_neighbors” parameter varies from 1 to 10. (see section 15.3 in the Machine Learning with Python Cookbook).


In [22]:
# create grid search space with n_neighbors from 1 to 10
param_grid_knn = {'knn__n_neighbors': range(1, 11)}
param_grid_knn

{'knn__n_neighbors': range(1, 11)}

## 7. Fit a grid search with your pipeline, search space, and 5-fold cross-validation to find the best value for the “n_neighbors” parameter.


In [23]:
# set a grid search with pipeline with 5-fold cross-validation
grid_search_knn = GridSearchCV(pipeline, 
                               param_grid_knn, 
                               cv=5, 
                               scoring='accuracy')

In [24]:
# fit a grid search to train data
grid_search_knn.fit(X_train, y_train)

## 8. Find the accuracy of the grid search best model on the test set. Note: It is possible that this will not be an improvement over the default model, but likely it will be.


In [25]:
# Find the best knn model parameters
best_knn_model = grid_search_knn.best_estimator_

In [26]:
# predict on test data and get the best knn accuracy
best_knn_accuracy = accuracy_score(y_test, best_knn_model.predict(X_test))

In [27]:
print(f"Best KNN Model Accuracy with  grid search : {best_knn_accuracy:.4f} with n_neighbors={grid_search_knn.best_params_['knn__n_neighbors']}")

Best KNN Model Accuracy with  grid search : 0.7917 with n_neighbors=3


## 9. Now, repeat steps 6 and 7 with the same pipeline, but expand your search space to include logistic regression and random forest models with the hyperparameter values in section 12.3 of the Machine Learning with Python Cookbook.


In [28]:
# expanding search space to include logistic regression and random forest moodels with hyper parameter values
param_grid_expanded = [
    {'knn__n_neighbors': range(1, 11)},  # KNN parameter range
    {
        'knn': [LogisticRegression(max_iter=200)],
        'knn__C': [0.1, 1.0, 10.0],
        'knn__solver': ['liblinear', 'lbfgs']
    },
    {
        'knn': [RandomForestClassifier()],
        'knn__n_estimators': [50, 100, 200],
        'knn__max_depth': [None, 10, 20],
        'knn__min_samples_split': [2, 5, 10]
    }
]

## 10. What are the best model and hyperparameters found in the grid search? Find the accuracy of this model on the test set.

In [29]:
# initialize expanded gridsearchcv parameters with the GridSearchCV pipeline
grid_search_expanded = GridSearchCV(pipeline, 
                                    param_grid_expanded, 
                                    cv=5, 
                                    scoring='accuracy')

In [30]:
# fit the expanded grid search pipeline with train data
grid_search_expanded.fit(X_train, y_train)

In [31]:
# get the best model with the best estimators
best_model = grid_search_expanded.best_estimator_

In [32]:
# calculate the best model accuracy
best_model_accuracy = accuracy_score(y_test, 
                                     best_model.predict(X_test))

In [33]:
print(f"Best Accuracy with expanded grid search : {best_model_accuracy:.4f}")

Best Accuracy with expanded grid search : 0.8229


In [34]:
print(f"Best Model and Hyperparameters: {grid_search_expanded.best_params_}")

Best Model and Hyperparameters: {'knn': RandomForestClassifier(), 'knn__max_depth': None, 'knn__min_samples_split': 5, 'knn__n_estimators': 50}


## 11. Summarize your results.

###  Base KNN Model :
- Model performance using KNN with default settings. 78.12 %.
- The model can classify reasonably well, it’s not optimal.

### Best KNN  Model with Grid Search CV :
- Grid search was used to find the best n_neighbors value by testing values from 1 to 10.
- Found n_neighbors=3 as the best setting, slightly improving accuracy to 79.17 %. 
- This shows that tuning the n_neighbors parameter positively impacts performance.

### Expanded Grid Search CV - Best Model :
- Included two other classifiers: Logistic Regression and Random Forest as part of expanded grid serch cv with various hyper parameters.
- The search found Logistic Regression as the best model, which outperformed KNN with an accuracy of 82.29 %.
- It demonstrates that selecting the right model architecture and hyperparameters is critical to achieving the best results.