In [None]:
# Tuning Multiple Hyperparameters
# We could tune them one by one. 
# If we are tuning a random forest model, we could find the best max_depth, then the best min_samples_leaf, then the best n_estimators.
# But, what if a specific combination of values works best? 
# For instance maybe max_depth=5 works best with n_estimators=100, but max_depth=10 works even better when n_estimators=200?
# How can we try all possible combinations of multiple hyperparameters?

In [5]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV

In [6]:
# Load the iris dataset and separate the feature matrix from the class labels.
iris = load_iris()
X, y = iris.data, iris.target

# Split into training and test portions; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [15]:
# Create and visualize our parameter grid.
# The keys are spelled exactly like the DecisionTreeClassifier keyword arguments,
# so the same dictionary could also be passed to GridSearchCV.

param_grid = {'max_depth': [1, 2, 3],
              'min_samples_leaf': [3, 15, 20],
              'min_samples_split': [2, 10, 100]}

# display() is needed here: a bare expression is only rendered when it is the
# last statement of a cell.
display(pd.DataFrame(param_grid).T)

# One way we can try every combination is with nested for loops.
# NOTE: the candidates below are scored on the test set purely for illustration.
# Picking hyperparameters this way tunes them to the test data; for real model
# selection use cross-validation on the training data (e.g. GridSearchCV).

scores = {}

for depth in param_grid['max_depth']:
    for leaf in param_grid['min_samples_leaf']:
        for split in param_grid['min_samples_split']:

            # fit a model for each combination of hyperparameter values;
            # random_state fixes the tree's internal randomness so re-running
            # the cell reproduces the same scores
            model = DecisionTreeClassifier(max_depth=depth,
                                           min_samples_leaf=leaf,
                                           min_samples_split=split,
                                           random_state=42)

            model.fit(X_train, y_train)
            score = model.score(X_test, y_test)

            # add the model accuracy (as a percentage) to a dictionary with the
            # parameter settings as the keys and the accuracies as the values.
            # The built-in round() works whether score() returns a plain Python
            # float or a NumPy scalar, unlike the .round() method.
            scores[f'depth {depth}, min_samples_leaf {leaf}, min_samples_split {split} accuracy'] = round(score * 100, 4)

# display dictionary of scores
scores

{'depth 1, min_samples_leaf 3, min_samples_split 2 accuracy': 68.4211,
 'depth 1, min_samples_leaf 3, min_samples_split 10 accuracy': 68.4211,
 'depth 1, min_samples_leaf 3, min_samples_split 100 accuracy': 68.4211,
 'depth 1, min_samples_leaf 15, min_samples_split 2 accuracy': 68.4211,
 'depth 1, min_samples_leaf 15, min_samples_split 10 accuracy': 68.4211,
 'depth 1, min_samples_leaf 15, min_samples_split 100 accuracy': 68.4211,
 'depth 1, min_samples_leaf 20, min_samples_split 2 accuracy': 68.4211,
 'depth 1, min_samples_leaf 20, min_samples_split 10 accuracy': 68.4211,
 'depth 1, min_samples_leaf 20, min_samples_split 100 accuracy': 68.4211,
 'depth 2, min_samples_leaf 3, min_samples_split 2 accuracy': 97.3684,
 'depth 2, min_samples_leaf 3, min_samples_split 10 accuracy': 97.3684,
 'depth 2, min_samples_leaf 3, min_samples_split 100 accuracy': 68.4211,
 'depth 2, min_samples_leaf 15, min_samples_split 2 accuracy': 97.3684,
 'depth 2, min_samples_leaf 15, min_samples_split 10 accur

In [None]:
# GridSearchCV

# GridSearchCV is a model wrapper that will fit many versions of a model with different combinations of hyperparameter settings. 
# It will even perform cross-validation to more thoroughly check the performance of each model variant.

# Just like the code above, GridSearchCV takes a model type (or pipeline!) and a dictionary of hyperparameters and ranges of values for each
# hyperparameter. 
# By default, it will perform a 5-fold cross-validation on a model using each possible combination of hyperparameter values specified in
# the parameter grid dictionary.
# Then GridSearchCV can automatically select the best model, based on a given scoring function, to return.


In [None]:
# The first step in using GridSearchCV in Python is to instantiate a model or pipeline and create a parameter grid.
# The parameter grid will be a dictionary
# with the name of the hyperparameter as the key and a list or range of values for the GridSearchCV to try as the values.

# random_state fixes the tree's internal randomness so the search results are reproducible.
model = DecisionTreeClassifier(random_state=42)

param_grid = {'max_depth': [1, 2, 3],
              'min_samples_leaf': [3, 15, 20],
              'min_samples_split': [2, 10, 100]}


# If no scoring function is specified, the GridSearchCV object will use the model default .score() function.
# For classification models the default is accuracy, and for regression models, the default is R2 score.
dt_grid_search = GridSearchCV(model, param_grid)


# Fit the GridSearchCV on the training data.
# When the GridSearchCV is fit on the training data, it fits many times. It fits and evaluates a new model for every possible
# combination of hyperparameters, once per fold of the cross-validation. By default, this is 5 folds, but it can be adjusted
# using the 'cv=' argument. It records the score of each combination on each fold, as well as the mean score across folds for
# each combination, in the cv_results_ dictionary attribute. You can extract and inspect it later, or you can just retrieve
# the best model.

# While the below is only one line of code, many models are being fit and it can sometimes take a long time to complete.
dt_grid_search.fit(X_train, y_train)

# Examine the best parameters found by the search.
# display() is needed because this is not the last statement of the cell.
display(dt_grid_search.best_params_)


# Even though we can extract the best model directly, it's useful to examine the best parameters that the GridSearchCV object found. For instance, if the
# search found that the maximum value for depth and minimum values for the samples per split and leaf yielded the best performance, we may want to run
# the search again and explore a higher range of values for the max_depth, a lower range for min_samples_leaf, and since 2 is the lowest allowed value for
# min_samples_split, we can explore other values nearer to 2 to see if values in these ranges perform even better. For example, we might explore the grid:

param_grid2 = {'max_depth': [3, 5, 10],
               'min_samples_leaf': [1, 2, 3],
               'min_samples_split': [2, 4, 7]}


dt_grid_search2 = GridSearchCV(model, param_grid2)
dt_grid_search2.fit(X_train, y_train)

# display() is needed because this is not the last statement of the cell.
display(dt_grid_search2.best_params_)

# We are zeroing in on the best hyperparameters by adjusting the ranges according to what the previous best hyperparameters were.

# If one of the best hyperparameter values were the highest value in a range, we might explore a higher range of values for that hyperparameter. If the best
# value for a hyperparameter were in the middle, we might tighten our search and explore values close to the best one, above and below.
# Notice that the best parameters from the first search are included in the second search in case those were indeed the best values.

# Retrieve the best model and evaluate.

# retrieve the best version of the model.
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refit on the whole training set passed to .fit(); no additional fit is needed.
best_model = dt_grid_search2.best_estimator_

# score the model on the test set
best_model.score(X_test, y_test)


In [None]:
# GridSearchCV and Pipelines

# GridSearchCV uses cross-validation to choose the best values for hyperparameters, so we need to be aware of data leakage when doing any kind of data
# preprocessing. GridSearchCV makes it easy to use a pipeline with preprocessing. To do this you would replace the model in the GridSearchCV
# constructor with the pipeline.
# The difference is in the keys you will need to use for your parameter grid. Since a pipeline contains multiple steps, both the step and the hyperparameter
# need to be specified in the parameter grid. The format is: <step>__<hyperparameter>. Notice the double underscore, also called a 'dunder', that
# separates the step name and the hyperparameter.
# For example: if we were going to use a KNeighborsClassifier and needed to scale our data before fitting, we could use a pipeline to prevent data leakage
# during cross-validation. The pipeline.get_params() function will list all available parameters of all steps that can be tuned.


knn_pipe = make_pipeline(StandardScaler(), KNeighborsClassifier())

# display() is needed because this is not the last statement of the cell.
display(knn_pipe.get_params())


# We can reference these to use for the key values in our parameter grid dictionary.
# Also notice that we are using range() objects to create our set of values for n_neighbors and p.
# The valid string options for KNeighborsClassifier's weights are 'uniform' and 'distance'.
pipe_param_grid = {'kneighborsclassifier__n_neighbors': range(1, 10),
                   'kneighborsclassifier__p': range(1, 5),
                   'kneighborsclassifier__weights': ['distance', 'uniform']}


# From here the process is the same as previously, but we substitute the pipe as the estimator in the first argument of the GridSearchCV constructor.
knn_pipe_gs = GridSearchCV(knn_pipe, pipe_param_grid)

knn_pipe_gs.fit(X_train, y_train)
print('Best KNN Parameters:')

print(knn_pipe_gs.best_params_)
best_pipe = knn_pipe_gs.best_estimator_
print(f'Accuracy of best KNN model is: {best_pipe.score(X_test, y_test)}')


# With the above code, we were able to use a pipeline to scale data and tune the hyperparameters of a KNN model using GridSearchCV while avoiding
# data leakage during the cross-validation. This can, of course, be done using more complex preprocessing transformers like ColumnTransformer which
# includes one-hot encoding, imputation, etc.

In [None]:
# The key here is:
# The pipeline goes INSIDE the GridSearchCV, in place of the model.
# Do NOT put the GridSearchCV inside of the pipeline


# Notes on Hyperparameter Searches
# Searching for just the right hyperparameters can take time, sometimes a lot of time! While some hyperparameters should always be tuned for each model
# you try, such as C for logistic regression, n_neighbors for KNN models, and max_depth for decision tree models, other hyperparameters will make only
# small differences. Your data and your base model choice will usually be more impactful than your hyperparameter tuning.

#  1. First, work on data.
# The choices you make in preparing your dataset, such as how you choose to clean the data, which columns you include, and any feature engineering you
# perform will usually have the greatest impact and will generally be where most machine learning engineers spend most of their time.

# 2. Second, pick the right model type
# Your choice of model type, linear models, clustering models, tree models, ensemble models, etc., will generally have the next biggest impact and it’s a
# good idea to try default versions of many model types before starting a hyperparameter search. Again, the exception would be to do some tuning on the
# few hyperparameters mentioned above when trying those models. Generally, the models that perform the best with their default settings will also perform
# the best after tuning.

# 3. Finally, tune hyperparameters.
# Hyperparameter tuning, while often very time-intensive, will also often only improve performance a small amount. This should be the last step in your
# modeling process and should be reserved for your best 1-3 default models with your best versions of your data.


# Additional Options for Tuning
# Another tool that is popular among many data scientists is sklearn.model_selection.RandomizedSearchCV. Where GridSearchCV explores all possible
# combinations of hyperparameter values in your parameter grid, RandomizedSearchCV explores a random sampling of combinations. Many data
# scientists feel that the EXACT right combination is less important than an approximately best set of hyperparameters. You might choose to experiment
# with RandomizedSearchCV, especially if you expect an exhaustive search over hyperparameter values will take too long.



# Summary
# Tuning hyperparameters of a model can improve performance. sklearn.model_selection.GridSearchCV can speed up and simplify the search for optimal
# hyperparameter settings. While tuning hyperparameters can improve your model performance, other work can be a better use of limited time, such as
# exploring data cleaning and preparation options and testing more model types. Hyperparameter tuning should be the last step in your process after your
# data preparation strategy and your base model are chosen.