In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [46]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score, roc_auc_score
from sklearn.metrics import accuracy_score

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate


from sklearn.preprocessing import StandardScaler

from catboost import CatBoostClassifier

In [4]:
import pickle

First I am going to try out a Random Forest Classifier. In my first iterations, I'll use just intake data as that will likely be my final product. With any luck I'll be able to use these models.

I'll use feature importance as a method to trim the number of features for the other models, so I'll fit the random forest with all of my features and see how it does.

Also, I will use gridsearchCV for my hyper paramater tuning and cross validation.

Since my models will be predicting whether or not a given cat will be transferred to a different facility, it will be important to have a high precision rate - we wouldn't want to tell someone to send the cat to a different facility if they would in fact be adopted (in this case a false negative would be saying the cat will be adopted, but it's actually transferred. The more of these we have, the lower the recall, but lower recall is preferable over low precision, which would be transferring adoptable cats ahead of time).

In [5]:
with open('/Users/robertpagano/src/metis_project_mcnulty/data/final_for_modeling/final_features_intake_only.pickle', 'rb') as f:
    df_X = pickle.load(f)

In [6]:
df_X.head()

Unnamed: 0_level_0,outcome_type_Transfer,intakes_last_week,is_male,is_fixed,intake_temp,is_rain_intake,real_age_intake_days,is_kitten,is_cfa_breed,is_domestic_breed,...,intake_month_3,intake_month_4,intake_month_5,intake_month_6,intake_month_7,intake_month_8,intake_month_9,intake_month_10,intake_month_11,intake_month_12
date_intake,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-10-01 08:33:00,1,1.0,0,1,75.4,0,7,1,0,1,...,0,0,0,0,0,0,0,1,0,0
2013-10-01 08:33:00,1,2.0,0,1,75.4,0,7,1,0,1,...,0,0,0,0,0,0,0,1,0,0
2013-10-01 08:33:00,1,3.0,0,1,75.4,0,7,1,0,1,...,0,0,0,0,0,0,0,1,0,0
2013-10-01 10:59:00,1,4.0,1,1,75.4,0,6209,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2013-10-01 11:21:00,1,5.0,0,0,75.4,0,21,1,0,1,...,0,0,0,0,0,0,0,1,0,0


In [7]:
df_X = df_X.reset_index()
df_X.head()

Unnamed: 0,date_intake,outcome_type_Transfer,intakes_last_week,is_male,is_fixed,intake_temp,is_rain_intake,real_age_intake_days,is_kitten,is_cfa_breed,...,intake_month_3,intake_month_4,intake_month_5,intake_month_6,intake_month_7,intake_month_8,intake_month_9,intake_month_10,intake_month_11,intake_month_12
0,2013-10-01 08:33:00,1,1.0,0,1,75.4,0,7,1,0,...,0,0,0,0,0,0,0,1,0,0
1,2013-10-01 08:33:00,1,2.0,0,1,75.4,0,7,1,0,...,0,0,0,0,0,0,0,1,0,0
2,2013-10-01 08:33:00,1,3.0,0,1,75.4,0,7,1,0,...,0,0,0,0,0,0,0,1,0,0
3,2013-10-01 10:59:00,1,4.0,1,1,75.4,0,6209,0,0,...,0,0,0,0,0,0,0,1,0,0
4,2013-10-01 11:21:00,1,5.0,0,0,75.4,0,21,1,0,...,0,0,0,0,0,0,0,1,0,0


In [8]:
df_X.columns

Index(['date_intake', 'outcome_type_Transfer', 'intakes_last_week', 'is_male',
       'is_fixed', 'intake_temp', 'is_rain_intake', 'real_age_intake_days',
       'is_kitten', 'is_cfa_breed', 'is_domestic_breed', 'is_weekend_intake',
       'intake_type_Euthanasia Request', 'intake_type_Owner Surrender',
       'intake_type_Stray', 'intake_type_Public Assist',
       'intake_condition_Normal', 'intake_condition_Nursing',
       'intake_condition_Injured', 'intake_condition_Sick',
       'intake_condition_Aged', 'intake_condition_Feral',
       'intake_condition_Other', 'intake_condition_Pregnant',
       'weekday_intake_Friday', 'weekday_intake_Monday',
       'weekday_intake_Saturday', 'weekday_intake_Sunday',
       'weekday_intake_Thursday', 'weekday_intake_Tuesday',
       'weekday_intake_Wednesday', 'intake_month_1', 'intake_month_2',
       'intake_month_3', 'intake_month_4', 'intake_month_5', 'intake_month_6',
       'intake_month_7', 'intake_month_8', 'intake_month_9', 'intake_m

In [10]:
df_X.shape

(28517, 43)

In [18]:
df_X_rf = df_X.loc[:, ['outcome_type_Transfer', 'intakes_last_week', 'is_male', 'is_fixed', 'intake_temp', 'is_rain_intake',
       'is_kitten', 'is_cfa_breed', 'is_domestic_breed', 'is_weekend_intake',
       'intake_type_Euthanasia Request', 'intake_type_Owner Surrender',
       'intake_type_Stray', 'intake_type_Public Assist',
       'intake_condition_Normal', 'intake_condition_Nursing',
       'intake_condition_Injured', 'intake_condition_Sick',
       'intake_condition_Aged', 'intake_condition_Feral',
       'intake_condition_Other', 'intake_condition_Pregnant',
       'weekday_intake_Friday', 'weekday_intake_Monday',
       'weekday_intake_Saturday', 'weekday_intake_Sunday',
       'weekday_intake_Thursday', 'weekday_intake_Tuesday',
       'weekday_intake_Wednesday', 'intake_month_1', 'intake_month_2',
       'intake_month_3', 'intake_month_4', 'intake_month_5', 'intake_month_6',
       'intake_month_7', 'intake_month_8', 'intake_month_9', 'intake_month_10',
       'intake_month_11', 'intake_month_12']]

In [20]:
pd.options.display.max_columns = None


In [21]:
df_X_rf.head()

Unnamed: 0,outcome_type_Transfer,intakes_last_week,is_male,is_fixed,intake_temp,is_rain_intake,is_kitten,is_cfa_breed,is_domestic_breed,is_weekend_intake,intake_type_Euthanasia Request,intake_type_Owner Surrender,intake_type_Stray,intake_type_Public Assist,intake_condition_Normal,intake_condition_Nursing,intake_condition_Injured,intake_condition_Sick,intake_condition_Aged,intake_condition_Feral,intake_condition_Other,intake_condition_Pregnant,weekday_intake_Friday,weekday_intake_Monday,weekday_intake_Saturday,weekday_intake_Sunday,weekday_intake_Thursday,weekday_intake_Tuesday,weekday_intake_Wednesday,intake_month_1,intake_month_2,intake_month_3,intake_month_4,intake_month_5,intake_month_6,intake_month_7,intake_month_8,intake_month_9,intake_month_10,intake_month_11,intake_month_12
0,1,1.0,0,1,75.4,0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
1,1,2.0,0,1,75.4,0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
2,1,3.0,0,1,75.4,0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
3,1,4.0,1,1,75.4,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
4,1,5.0,0,0,75.4,0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0


In [137]:
X = df_X_rf.iloc[:, 1:]
y = df_X_rf.iloc[:, 0]

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
rf_clf = RandomForestClassifier(random_state=42)

In [25]:
from pprint import pprint

In [26]:
print('Parameters currently in use:\n')
pprint(rf_clf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 'warn',
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [27]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [28]:
# Random search of parameters, using 5 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf_clf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 34.5min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 72.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 95.2min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In case I lose this output:

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)
          
          [Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 95.2min finished

In [29]:
rf_random.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 20,
 'bootstrap': True}

so best params are:
{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 20,
 'bootstrap': True}

In [138]:
feature_list = list(X.columns)

In [144]:
rf_classifier_hold = rf_random.best_estimator_.steps[1][1]['rf_clf']

AttributeError: 'RandomForestClassifier' object has no attribute 'steps'

In [147]:
importances = list(rf_random.best_estimator_.feature_importances_)

**BELOW I AM SAVING MY FINAL MODEL**

In [149]:
final_model_rf_clf = rf_random.best_estimator_

In [151]:
type(final_model_rf_clf)

sklearn.ensemble.forest.RandomForestClassifier

In [150]:
with open('/Users/robertpagano/src/metis_project_mcnulty/final_model/final_model.pickle', 'wb') as to_write:
    pickle.dump(final_model_rf_clf, to_write)

In [1]:
with open('/Users/robertpagano/src/metis_project_mcnulty/final_model/final_model_small.pickle', 'wb') as to_write:
    pickle.dump(final_model_rf_clf, to_write, -1)

NameError: name 'pickle' is not defined

In [139]:
rf_random_feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]

In [140]:
rf_random_feature_importances

[('intakes_last_week', 0.27),
 ('is_male', 0.03),
 ('is_fixed', 0.09),
 ('intake_temp', 0.2),
 ('is_rain_intake', 0.02),
 ('is_kitten', 0.09),
 ('is_cfa_breed', 0.01),
 ('is_domestic_breed', 0.01),
 ('is_weekend_intake', 0.01),
 ('intake_type_Euthanasia Request', 0.0),
 ('intake_type_Owner Surrender', 0.04),
 ('intake_type_Stray', 0.03),
 ('intake_type_Public Assist', 0.0),
 ('intake_condition_Normal', 0.02),
 ('intake_condition_Nursing', 0.01),
 ('intake_condition_Injured', 0.01),
 ('intake_condition_Sick', 0.01),
 ('intake_condition_Aged', 0.0),
 ('intake_condition_Feral', 0.0),
 ('intake_condition_Other', 0.0),
 ('intake_condition_Pregnant', 0.0),
 ('weekday_intake_Friday', 0.01),
 ('weekday_intake_Monday', 0.01),
 ('weekday_intake_Saturday', 0.01),
 ('weekday_intake_Sunday', 0.01),
 ('weekday_intake_Thursday', 0.01),
 ('weekday_intake_Tuesday', 0.01),
 ('weekday_intake_Wednesday', 0.01),
 ('intake_month_1', 0.01),
 ('intake_month_2', 0.0),
 ('intake_month_3', 0.01),
 ('intake_month

In [30]:
rf_random.best_score_

0.7162144391355806

In [49]:
rf_random.best_score_

0.7162144391355806

In [48]:
f1_random_rf_test = f1_score(rf_random.predict(X_test), y_test)
accuracy_random_rf_test = accuracy_score(rf_random.predict(X_test), y_test)
precision_random_rf_test = precision_score(rf_random.predict(X_test), y_test)
recall_random_rf_test = recall_score(rf_random.predict(X_test), y_test)
AUC_random_rf_test = roc_auc_score(rf_random.predict(X_test), y_test)

print(f'F1 test score is {f1_random_rf_test}')
print(f'Accuracy test score is {accuracy_random_rf_test}')
print(f'precision test score is {precision_random_rf_test}')
print(f'recall test score is {recall_random_rf_test}')
print(f'AUC test score is {AUC_random_rf_test}')

F1 test score is 0.7424137931034483
Accuracy test score is 0.738078541374474
precision test score is 0.7300779925398441
recall test score is 0.7551736232900736
AUC test score is 0.7380845333414967


In [35]:
scores



{'fit_time': array([0.28459597, 0.23566008, 0.21082187, 0.20573902, 0.21250224]),
 'score_time': array([0.04025602, 0.01811481, 0.02119899, 0.02164102, 0.01942492]),
 'test_score': array([0.70092025, 0.69077361, 0.69048663, 0.69158264, 0.69530907]),
 'train_score': array([0.96175133, 0.96378082, 0.96230344, 0.96257739, 0.96361843])}

So running random search improved my validation score by around 4% in accuracy

## Below I will try a logistic model using gridsearch. My hop is to improve my precision score, as on my mvp, it was very high

First I need new matrices, because I need to drop dummies

In [52]:
with open('/Users/robertpagano/src/metis_project_mcnulty/data/final_for_modeling/final_features_intake_only_drop_one.pickle', 'rb') as f:
    df_X_log = pickle.load(f)

In [54]:
df_X_log = df_X_log.reset_index()
df_X_log.head()

Unnamed: 0,date_intake,outcome_type_Transfer,is_male,intakes_last_week,is_fixed,intake_temp,is_rain_intake,real_age_intake_days,is_kitten,is_cfa_breed,is_weekend_intake,intake_type_Owner Surrender,intake_type_Stray,intake_type_Public Assist,intake_condition_Normal,intake_condition_Nursing,intake_condition_Injured,intake_condition_Sick,intake_condition_Feral,intake_condition_Other,intake_condition_Pregnant,weekday_intake_Monday,weekday_intake_Saturday,weekday_intake_Sunday,weekday_intake_Thursday,weekday_intake_Tuesday,weekday_intake_Wednesday,intake_month_2,intake_month_3,intake_month_4,intake_month_5,intake_month_6,intake_month_7,intake_month_8,intake_month_9,intake_month_10,intake_month_11,intake_month_12
0,2013-10-01 08:33:00,1,0,1.0,1,75.4,0,7,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
1,2013-10-01 08:33:00,1,0,2.0,1,75.4,0,7,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
2,2013-10-01 08:33:00,1,0,3.0,1,75.4,0,7,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
3,2013-10-01 10:59:00,1,1,4.0,1,75.4,0,6209,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
4,2013-10-01 11:21:00,1,0,5.0,0,75.4,0,21,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0


In [55]:
df_X_log = df_X_log.loc[:, ['outcome_type_Transfer', 'intakes_last_week', 'is_male', 'is_fixed', 'intake_temp', 'is_rain_intake',
       'is_kitten', 'is_cfa_breed', 'is_weekend_intake',
       'intake_type_Owner Surrender',
       'intake_type_Stray', 'intake_type_Public Assist',
       'intake_condition_Normal', 'intake_condition_Nursing',
       'intake_condition_Injured', 'intake_condition_Sick',
       'intake_condition_Feral',
       'intake_condition_Other', 'intake_condition_Pregnant',
       'intake_month_2',
       'intake_month_3', 'intake_month_4', 'intake_month_5', 'intake_month_6',
       'intake_month_7', 'intake_month_8', 'intake_month_9', 'intake_month_10',
       'intake_month_11', 'intake_month_12']]

In [56]:
X_log = df_X_log.iloc[:, 1:]
y_log = df_X_log.iloc[:, 0]

In [60]:
logreg = LogisticRegression()

In [61]:
print('Parameters currently in use:\n')
pprint(logreg.get_params())

Parameters currently in use:

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'warn',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'warn',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}


In [62]:
from sklearn.preprocessing import StandardScaler

In [85]:
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(X_log, y_log, test_size=0.2, random_state=42)

In [84]:
scaler = StandardScaler()

In [86]:
scaler.fit(X_train.values)
X_tr = scaler.transform(X_train_log.values)
X_te = scaler.transform(X_test_log.values)

In [87]:
# Create regularization penalty space
penalty = ['l2']

# Create regularization hyperparameter space
C = np.logspace(0, 4, 10)

# pick a solver
solver = ['lbfgs', 'liblinear']

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty, solver=solver)

In [88]:
# Create grid search using 5-fold cross validation
logreg_clf = GridSearchCV(logreg, hyperparameters, cv=5, verbose=0)

# Fit grid search with scaled x values and y tain
best_logreg_model = logreg_clf.fit(X_tr, y_train_log)

In [89]:
print('Best C:', best_logreg_model.best_estimator_.get_params()['C'])

Best C: 2.7825594022071245


In [90]:
best_logreg_model.best_score_

0.6549335904966467

I'm going to take a step back here and pair down my model to MVP and iteratively add features

In [124]:
df_X_log_small = df_X_log.loc[:, ['outcome_type_Transfer', 'is_fixed',
       'is_kitten']]

In [125]:
df_X_log_small.head()

Unnamed: 0,outcome_type_Transfer,is_fixed,is_kitten
0,1,1,1
1,1,1,1
2,1,1,1
3,1,1,0
4,1,0,1


In [126]:
X_log_small = df_X_log_small.iloc[:, 1:]
y_log_small = df_X_log_small.iloc[:, 0]

In [127]:
X_train_log_small, X_test_log_small, y_train_log_small, y_test_log_small = train_test_split(X_log_small, y_log_small, test_size=0.2, random_state=42)

In [128]:
logreg_small = LogisticRegression()

In [129]:
# Create grid search using 5-fold cross validation
logreg_small_clf = GridSearchCV(logreg_small, hyperparameters, cv=5, verbose=0)

# Fit grid search with scaled x values and y tain
best_logreg_small_model = logreg_small_clf.fit(X_train_log_small, y_train_log_small)

In [130]:
print('Best C:', best_logreg_small_model.best_estimator_.get_params()['C'])

Best C: 1.0


In [131]:
best_logreg_small_model.best_score_

0.6330162626572569