# 9.2 Exercise: Best Model Selection and Hyperparameter Tuning

### 1. Import the dataset and ensure that it loaded properly

In [1]:
# importing libraries
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('Datasets/Loan_Train.csv')

In [3]:
df.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [5]:
df.shape

(614, 13)

### 2. Prepare the data for modeling by performing the following steps:
- Drop the column “Load_ID.”.
- Drop any rows with missing data.
- Convert the categorical features into dummy variables.

In [6]:
# dropping the 'Loan_ID' column
df.drop(['Loan_ID'], axis = 1, inplace = True)

In [7]:
# getting a count of NaN values per column
df.isna().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [8]:
# dropping rows with NaN values
df.dropna(inplace = True)

In [9]:
# getting a count of NaN values per column after dropping rows
df.isna().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [10]:
# Viewing which columns have categorical (object) values to convert to dummies
df.dtypes

Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [11]:
# using get dummies to convert object data types to dummies of int variable type 
df = pd.get_dummies(df)

In [12]:
df.drop(['Loan_Status_N'], axis=1, inplace = True)

In [13]:
df.shape

(480, 21)

In [14]:
df.columns.tolist()

['ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History',
 'Gender_Female',
 'Gender_Male',
 'Married_No',
 'Married_Yes',
 'Dependents_0',
 'Dependents_1',
 'Dependents_2',
 'Dependents_3+',
 'Education_Graduate',
 'Education_Not Graduate',
 'Self_Employed_No',
 'Self_Employed_Yes',
 'Property_Area_Rural',
 'Property_Area_Semiurban',
 'Property_Area_Urban',
 'Loan_Status_Y']

### 3. Split the data into a training and test set, where the “Loan_Status” column is the target.

In [15]:
# import additional libraries
from sklearn.model_selection import train_test_split

In [16]:
X = df.loc[:, df.columns != 'Loan_Status_Y']
y = df['Loan_Status_Y']

In [17]:
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

### 4. Create a pipeline with a min-max scaler and a KNN classifier (see section 15.3 in the Machine Learning with Python Cookbook).

In [18]:
from sklearn.neighbors import KNeighborsClassifier # The k-nearest neighbor classifier
from sklearn.feature_selection import VarianceThreshold # Feature selector
from sklearn.pipeline import Pipeline # For setting up pipeline
# Various pre-processing steps
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, PowerTransformer, MaxAbsScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV # For optimization

In [19]:
# creating standarizer
standardizer = MinMaxScaler()

#Create KNN classifier
knn = KNeighborsClassifier(n_neighbors = 5, n_jobs = -1)

In [20]:
# creating the pipe object with minmax scaler and the knn classifier
pipe = Pipeline([
('standardizer', standardizer),
('knn', knn)
])

The pipe object is simple to understand. It says, scale first, then run the classifier

### 5. Fit a default KNN classifier to the data with this pipeline. Report the model accuracy on the test set. Note: Fitting a pipeline model works just like fitting a regular model.

In [21]:
# using the pipe object on the train and test data
pipe.fit(X_train, y_train)
 
print('Training set score: ' + str(pipe.score(X_train,y_train)))
print('Test set score: ' + str(pipe.score(X_test,y_test)))

Training set score: 0.8151041666666666
Test set score: 0.6666666666666666


### 6. Create a search space for your KNN classifier where your “n_neighbors” parameter varies from 1 to 10. (see section 15.3 in the Machine Learning with Python Cookbook).

In [22]:
# creating a search space for the knn classifier
search_space = {'knn__n_neighbors' : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}

### 7. Fit a grid search with your pipeline, search space, and 5-fold cross-validation to find the best value for the “n_neighbors” parameter.

In [23]:
# creating the grid search
gridsearch = GridSearchCV(
    pipe, search_space, cv = 5, verbose = 0, scoring='accuracy'
)

In [24]:
gridsearch.fit(X_train,y_train)

### 8. Find the accuracy of the grid search best model on the test set. Note: It is possible that this will not be an improvement over the default model, but likely it will be.

In [25]:
# reviewing the resuls of the grid search 
gridsearch.cv_results_

{'mean_fit_time': array([0.00094342, 0.00414085, 0.00657487, 0.0076242 , 0.00557418,
        0.00412579, 0.00312705, 0.01280122, 0.00725312, 0.0049077 ]),
 'std_fit_time': array([0.00125114, 0.00607277, 0.00745379, 0.00722963, 0.00594314,
        0.00606678, 0.0062541 , 0.00793941, 0.00707799, 0.00576933]),
 'mean_score_time': array([0.04716196, 0.04901614, 0.04662013, 0.04301047, 0.05122609,
        0.04631171, 0.04712043, 0.04857769, 0.05211616, 0.04501038]),
 'std_score_time': array([0.00538945, 0.00810753, 0.00237086, 0.00664471, 0.00848173,
        0.00203814, 0.00031137, 0.01538578, 0.01139976, 0.01028932]),
 'param_knn__n_neighbors': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'params': [{'knn__n_neighbors': 1},
  {'knn__n_neighbors': 2},
  {'knn__n_neighbors': 3},
  {'knn__n_neighbors': 4},
  {'knn__n_neighbor

In [26]:
# reviewing best esitmator
gridsearch.best_estimator_

In [27]:
# retrieving the best score/ accuracy
gridsearch.best_score_

0.7343814080656187

### 9. Now, repeat steps 6 and 7 with the same pipeline, but expand your search space to include logistic regression and random forest models with the hyperparameter values in section 12.3 of the Machine Learning with Python Cookbook.

In [28]:
# importing additonal models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [29]:
# set random seed
np.random.seed(0)

In [30]:
new_pipe = Pipeline([('standardizer', standardizer), ('classifier',knn)])


In [31]:
# creating a dictionary with candidate learning algorithms are their hyperparameters
new_search_space = [{'classifier': [LogisticRegression(max_iter=500,
                                                  solver='liblinear')],
                'classifier__penalty': ['l1','l2'],
                'classifier__C': np.logspace(0, 4, 10)},
               {'classifier': [RandomForestClassifier()],
               'classifier__n_estimators': [10, 100, 1000],
               'classifier__max_features': [1 ,2, 3]}]

In [32]:
# creating the grid search
gridsearch = GridSearchCV(
    new_pipe, new_search_space, cv = 5, verbose = 0, scoring='accuracy'
)

In [33]:
# fitting grid search to training data
best_model = gridsearch.fit(X_train, y_train)

### 10. What are the best model and hyperparameters found in the grid search? Find the accuracy of this model on the test set.

In [34]:
# reviewing the best model 
best_model.best_estimator_

In [35]:
# using the gridsearch on the test data to verify the model is the same
best_model = gridsearch.fit(X_test, y_test)

In [36]:
# reviewing the results of the gridsearch
best_model.best_estimator_

In [37]:
# reviewing the accuracy of the model 
best_model.best_score_

0.771578947368421

### 11. Summarize your results.

Using the GridSearchCV we are able to determine the best model to use on various sets of data while also using hyperparameters for different models. When using the hyperparameters found in the text with the models on the training data we find that the Linear Regression model is the model with the highest accuracy. The accuracy of the returned model is 0.77. 