# Week 9: Best Model Extraction and Hyperparameter Tuning
# Rahul Rajeev

In [53]:
# libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
# 1. import the dataset
loan_df = pd.read_csv('loan_train.csv')
loan_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [19]:
# 2. drop load_id column, remove any rows with NaN, and convert the categorical columns to dummy variables

# drop column
dropped = loan_df.drop('Loan_ID', axis=1)

# remove any rows wih NaN
cleaned = dropped.dropna()
print('Before:', dropped.shape, 'After:', cleaned.shape)

# convert categorical to dummy

# making every categorical value except loan_status dummy variables
categorical = cleaned.select_dtypes(exclude=['int64','float64']).columns[:-1].to_list()
final = pd.get_dummies(cleaned, columns = [col for col in categorical])

Before: (614, 12) After: (480, 12)


In [22]:
# preview
final.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Gender_Female,Gender_Male,Married_No,Married_Yes,...,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
1,4583,1508.0,128.0,360.0,1.0,N,0,1,0,1,...,1,0,0,1,0,1,0,1,0,0
2,3000,0.0,66.0,360.0,1.0,Y,0,1,0,1,...,0,0,0,1,0,0,1,0,0,1
3,2583,2358.0,120.0,360.0,1.0,Y,0,1,0,1,...,0,0,0,0,1,1,0,0,0,1
4,6000,0.0,141.0,360.0,1.0,Y,0,1,1,0,...,0,0,0,1,0,1,0,0,0,1
5,5417,4196.0,267.0,360.0,1.0,Y,0,1,0,1,...,0,1,0,1,0,0,1,0,0,1


In [36]:
# 3. Split the data into a training and test set, where the “Loan_Status” column is the target.
X = final.drop('Loan_Status', axis=1)
Y = final['Loan_Status']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [61]:
# 4. Create a pipeline with a min-max scaler and a KNN classifier
mmscaler = MinMaxScaler()

# create a knn classifier
knn = KNeighborsClassifier(n_neighbors = 5, n_jobs=-1)

# create a pipeline
pipe = Pipeline([('mmscaler', mmscaler), ('knn', knn)])

In [62]:
# 5. fit classifier to data
pipe.fit(X_train, Y_train)

# model accuracy
pipe.score(X_test,Y_test)

0.75

In [63]:
# 6. Create a search space for your KNN classifier where your “n_neighbors” parameter varies from 1 to 10.
search_space = [{'knn__n_neighbors':list(range(1,11))}]

In [64]:
# 7. Fit a grid search with your pipeline, search space, and 5-fold cross-validation 
# to find the best value for the “n_neighbors” parameter.
grid_classifier = GridSearchCV(pipe, search_space, cv=5).fit(X_train, Y_train)

In [65]:
# 8. Find the accuracy of the grid search best model on the test set.
grid_classifier.score(X_test,Y_test)

0.7604166666666666

In [74]:
# 9. Now, repeat steps 6 and 7 with the same pipeline, but expand your search space to include logistic regression 
# and random forest models with the hyperparameter values

pipe2 = Pipeline([('standardization', mmscaler), ('classifier', KNeighborsClassifier(n_neighbors = 5))])

search_space2 = [{'classifier__n_neighbors':list(range(1,11))},
               {"classifier": [LogisticRegression()],
                "classifier__penalty": ['l1', 'l2'],
                "classifier__C": np.logspace(0, 4, 10)},
                {"classifier": [RandomForestClassifier()],
                 "classifier__n_estimators": [10, 100, 1000],
                 "classifier__max_features": [1, 2, 3]}]
grid_classifier2 = GridSearchCV(pipe2, search_space2, cv=5).fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
50 fits failed out of a total of 195.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the fa

In [79]:
# 10. What are the best model and hyperparameters found in the grid search? 
grid_classifier2.best_estimator_.get_params()

{'memory': None,
 'steps': [('standardization', MinMaxScaler()),
  ('classifier', LogisticRegression(C=7.742636826811269))],
 'verbose': False,
 'standardization': MinMaxScaler(),
 'classifier': LogisticRegression(C=7.742636826811269),
 'standardization__clip': False,
 'standardization__copy': True,
 'standardization__feature_range': (0, 1),
 'classifier__C': 7.742636826811269,
 'classifier__class_weight': None,
 'classifier__dual': False,
 'classifier__fit_intercept': True,
 'classifier__intercept_scaling': 1,
 'classifier__l1_ratio': None,
 'classifier__max_iter': 100,
 'classifier__multi_class': 'auto',
 'classifier__n_jobs': None,
 'classifier__penalty': 'l2',
 'classifier__random_state': None,
 'classifier__solver': 'lbfgs',
 'classifier__tol': 0.0001,
 'classifier__verbose': 0,
 'classifier__warm_start': False}

In [77]:
# Find the accuracy of this model on the test set. 
grid_classifier2.score(X_test,Y_test)

0.8229166666666666

**11. Summarize Results.**

With the search space expanded to other classifiers, logistic and random forest, the best model appears to be logistic regression, with the best hyperparameters shown above. C = 7.74. The minmaxscaler is also chosen for scaling. The resulting score has improved from using the knn model from 0.74 to 0.82. There were some warnings above for when the search returned non-finite test scores, and therefore can't return an accuracy. I'm hoping to understand a bit more about these warnings in the next few weeks.