# Week 9 Exercises

In [127]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix 
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [153]:
# import loan data
loans = pd.read_csv("Loan_Train.csv")
loans

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [31]:
# drop load_id col
loans.drop("Loan_ID", axis=1, inplace=True)

In [32]:
# drop missing values
loans.dropna(inplace=True)

In [33]:
# create dummy variables for cat feats
loans2 = pd.get_dummies(loans, 
                         columns=["Married", "Dependents", "Gender", "Education", 
                                  "Self_Employed", "Property_Area"])
loans2

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Married_No,Married_Yes,Dependents_0,Dependents_1,...,Dependents_3+,Gender_Female,Gender_Male,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
1,4583,1508.0,128.0,360.0,1.0,N,0,1,0,1,...,0,0,1,1,0,1,0,1,0,0
2,3000,0.0,66.0,360.0,1.0,Y,0,1,1,0,...,0,0,1,1,0,0,1,0,0,1
3,2583,2358.0,120.0,360.0,1.0,Y,0,1,1,0,...,0,0,1,0,1,1,0,0,0,1
4,6000,0.0,141.0,360.0,1.0,Y,1,0,1,0,...,0,0,1,1,0,1,0,0,0,1
5,5417,4196.0,267.0,360.0,1.0,Y,0,1,0,0,...,0,0,1,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,2900,0.0,71.0,360.0,1.0,Y,1,0,1,0,...,0,1,0,1,0,1,0,1,0,0
610,4106,0.0,40.0,180.0,1.0,Y,0,1,0,0,...,1,0,1,1,0,1,0,1,0,0
611,8072,240.0,253.0,360.0,1.0,Y,0,1,0,1,...,0,0,1,1,0,1,0,0,0,1
612,7583,0.0,187.0,360.0,1.0,Y,0,1,0,0,...,0,0,1,1,0,1,0,0,0,1


In [35]:
# features and target
features = loans2.loc[:, loans2.columns!= "Loan_Status"]
target = loans2.Loan_Status

In [87]:
# split set
feat_train, feat_test, target_train, target_test = train_test_split(
    features, target, test_size=.2, random_state=1)

In [88]:
# create minmax scaler
minmax = MinMaxScaler()

In [89]:
# create knn
knn = KNeighborsClassifier(n_neighbors = 5, n_jobs=-1)

In [90]:
# create pipeline
pipe = Pipeline([("scaler", minmax), ("knn", knn)])

In [91]:
# k nearest neighbor
pipe.fit(feat_train, target_train)

Pipeline(steps=[('scaler', MinMaxScaler()),
                ('knn', KNeighborsClassifier(n_jobs=-1))])

In [92]:
# predictions
predictions = pipe.predict(feat_test)

In [93]:
# accuracy score for default model
acc = accuracy_score(target_test, predictions)
acc

0.6666666666666666

In [94]:
# space for candidate values
search_space = [{"knn__n_neighbors": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}]

In [121]:
# create grid search
classifier = GridSearchCV(
    pipe, search_space, cv=5, n_jobs=-1, verbose=1).fit(feat_train, target_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [122]:
# see best model parameter
classifier.best_estimator_.get_params()["knn__n_neighbors"]

9

In [145]:
# find accuracy of best model
predictions2 = classifier.predict(feat_test)
acc2 = accuracy_score(target_test, predictions2)
acc2

0.65625

In [140]:
# grid search with log regression and random forest
search_space2 = [{"classifier": [LogisticRegression()],
                 "classifier__penalty": ["l1","l2"],
                 "classifier__C": np.logspace(0, 4, 10)},
                {"classifier": [RandomForestClassifier()],
                 "classifier__n_estimators": [10, 100, 1000],
                 "classifier__max_features": [1, 2, 3]}]

In [150]:
# create grid search
gridsearch = GridSearchCV(pipe2, search_space2, cv=5, n_jobs=-1, verbose=1)

In [151]:
# fit grid search
best_model = gridsearch.fit(feat_train, target_train)

Fitting 5 folds for each of 29 candidates, totalling 145 fits


        nan 0.81760082        nan 0.81760082        nan 0.81760082
        nan 0.81760082        nan 0.81760082        nan 0.81760082
        nan 0.81760082 0.71093643 0.74736842 0.75252905 0.70844156
 0.78120301 0.78116883 0.74757348 0.78892686 0.79155844]


In [149]:
# find best estimator
best_model.best_estimator_.get_params(deep=True)["classifier"]

LogisticRegression()

In [152]:
# find accuracy of best model
predictions3 = best_model.predict(feat_test)
acc3 = accuracy_score(target_test, predictions3)
acc3

0.7395833333333334

### Summary

In the resulting analysis, we can see the result of selecting various models and testing them via accuracy. K Nearest Neighbor gave an accuracy of 66.67% with 5 folds and 65.62% for 9 folds which is the optimal number according to our grid search. The discrepancy and accuracy may be in the relation between the cross fold evaluation and the split done by train_test_split.

When selecting the best model out of the 3 classifier options, logistic regression, KNN, and random forest classifier, the logistic regression came out on top, with an accuracy of 73.96%.