#### Hyperparameter Tuning Using Grid Search and Random Search in Python

#### Grid Search

Define a parameter space or parameter grid, where we include a set of possible hyperparameter values that can be used to build the model.

Grid search looks at every possible combination of hyperparameters to find the best model

The grid search is used to place these hyperparameters in a matrix-like structure, and the model is trained on every combination of hyperparameter values.

The model with the best performance is then selected.

#### Random Search

Random search only selects and tests a random combination of hyperparameters.

Random search randomly samples from a grid of hyperparameters.

The number of total runs can be specified before returning the best model.

In [190]:
import pandas as pd
import joblib
from sklearn.exceptions import ConvergenceWarning
from sklearn.exceptions import DataConversionWarning
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=ConvergenceWarning)
warnings.filterwarnings('ignore', category=DataConversionWarning)

data_df = pd.read_csv('wine_quality.csv')
# data_df.head()
data_df


Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,white,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,white,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,red,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
6493,red,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,,11.2,6
6494,red,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
6495,red,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [211]:
data_df.describe()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,target_quality
count,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0
mean,0.246114,7.216579,0.339691,0.318722,5.444326,0.056042,30.525319,115.744574,0.994697,3.218395,0.531215,10.491801,5.818378,0.633061
std,0.430779,1.295751,0.164548,0.145231,4.757392,0.035031,17.7494,56.521855,0.002999,0.160637,0.148768,1.192712,0.873255,0.482007
min,0.0,3.8,0.08,0.0,0.6,0.009,1.0,6.0,0.98711,2.72,0.22,8.0,3.0,0.0
25%,0.0,6.4,0.23,0.25,1.8,0.038,17.0,77.0,0.99234,3.11,0.43,9.5,5.0,0.0
50%,0.0,7.0,0.29,0.31,3.0,0.047,29.0,118.0,0.99489,3.21,0.51,10.3,6.0,1.0
75%,0.0,7.7,0.4,0.39,8.1,0.065,41.0,156.0,0.99699,3.32,0.6,11.3,6.0,1.0
max,1.0,15.9,1.58,1.66,65.8,0.611,289.0,440.0,1.03898,4.01,2.0,14.9,9.0,1.0


#### Converting type to numeric

In [191]:
import numpy as np
# Creating type dictionary
# Replacing type data with numerical data by using type dictionary

# type = {'white' : 0, 'red' :1}
# data_df['type'] = data_df['type'].map(type)
# data_df.head(10)

data_df['type'] = np.where(data_df['type'] == 'white', 0, 1)

data_df

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,0,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,0,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,0,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,0,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,1,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
6493,1,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,,11.2,6
6494,1,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
6495,1,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


#### Checking missing data

In [192]:
data_df.isnull().sum()

type                     0
fixed acidity           10
volatile acidity         8
citric acid              3
residual sugar           2
chlorides                2
free sulfur dioxide      0
total sulfur dioxide     0
density                  0
pH                       9
sulphates                4
alcohol                  0
quality                  0
dtype: int64

#### Replacing missing data with the mean of the existing data

In [193]:
# fixed acidity
data_df['fixed acidity'].fillna(data_df['fixed acidity'].mean(), inplace=True)

# volatile acidity
data_df['volatile acidity'].fillna(data_df['volatile acidity'].mean(), inplace=True)

# citric acid
data_df['citric acid'].fillna(data_df['citric acid'].mean(), inplace=True)

# residual sugar
data_df['residual sugar'].fillna(data_df['residual sugar'].mean(), inplace=True)

# chlorides
data_df['chlorides'].fillna(data_df['chlorides'].mean(), inplace=True)

# pH
data_df['pH'].fillna(data_df['pH'].mean(), inplace=True)

# sulphates
data_df['sulphates'].fillna(data_df['sulphates'].mean(), inplace=True)



data_df

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.450000,8.8,6
1,0,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.490000,9.5,6
2,0,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.440000,10.1,6
3,0,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.400000,9.9,6
4,0,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.400000,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,1,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.580000,10.5,5
6493,1,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.531215,11.2,6
6494,1,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.750000,11.0,6
6495,1,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.710000,10.2,5


#### Turn this into a binary classification task

Assigning a value of 0 to all data points with a quality value of less than or equal to 5, 

And a value of 1 to a quality value of more than 5

If value <= 5, quality == 0

if value > 5, quality == 1

In [194]:
import numpy as np

# where(condition, [x, y])
# Return elements chosen from x or y depending on condition

data_df['target_quality'] = np.where(data_df['quality']>5, 1, 0)

Split the dependent (target, label) and independent (feature) variables in this dataframe:

In [195]:
# Drop specified labels from rows or columns.
# Remove rows or columns by specifying label names and corresponding axis, or by specifying directly index or column names.
# When using a multi-index, labels on different levels can be removed by specifying the level. 

# axis 0 represents rows and axis 1 represents columns

data_df2 = data_df.drop(['quality'],axis=1)
x_feature = data_df2.drop(['target_quality'],axis=1)
y_label = data_df2[['target_quality']]

#### Building the Model

Instantiating a random forest classifier

Tuning the hyperparameters of this model to create the best algorithm for the dataset

In [196]:
from sklearn.ensemble import RandomForestClassifier

# A random forest classifier.
# A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset
# and uses averaging to improve the predictive accuracy and control over-fitting.
# The sub-sample size is controlled with the max_samples parameter if bootstrap=True (default), otherwise the whole dataset is used to build each tree.

rf = RandomForestClassifier()

#### Grid Search

#### Defining the Hyperparameter Space

“Max_depth”: It represents the maximum level of each tree in the random forest model.

“Max_features”: The maximum number of features that the random forest model is allowed to try at each split.

“N_estimators”: The number of decision trees in the forest. 

“Min_samples_leaf”: The minimum number of samples required to be at the leaf node of each tree.

“Min_samples_split”: The minimum number of samples required to split an internal node of each tree.

In [197]:
grid_space={'max_depth':[3,5,10,None],
              'n_estimators':[10,100,200],
            #   'max_features':[1,3,5,7],
            #   'min_samples_leaf':[1,2,3],
            #   'min_samples_split':[1,2,3]
           }

#### Hyperparameter tuning

In [198]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

In [199]:
from sklearn.model_selection import GridSearchCV

cv_grid = GridSearchCV(rf,param_grid=grid_space,cv=3,scoring='accuracy')

cv_grid.fit(x_feature, y_label.values.ravel())

print_results(cv_grid)

BEST PARAMS: {'max_depth': 3, 'n_estimators': 100}

0.706 (+/-0.031) for {'max_depth': 3, 'n_estimators': 10}
0.706 (+/-0.042) for {'max_depth': 3, 'n_estimators': 100}
0.702 (+/-0.044) for {'max_depth': 3, 'n_estimators': 200}
0.695 (+/-0.054) for {'max_depth': 5, 'n_estimators': 10}
0.693 (+/-0.04) for {'max_depth': 5, 'n_estimators': 100}
0.705 (+/-0.041) for {'max_depth': 5, 'n_estimators': 200}
0.681 (+/-0.085) for {'max_depth': 10, 'n_estimators': 10}
0.7 (+/-0.088) for {'max_depth': 10, 'n_estimators': 100}
0.697 (+/-0.099) for {'max_depth': 10, 'n_estimators': 200}
0.686 (+/-0.087) for {'max_depth': None, 'n_estimators': 10}
0.705 (+/-0.082) for {'max_depth': None, 'n_estimators': 100}
0.701 (+/-0.096) for {'max_depth': None, 'n_estimators': 200}


In [200]:
cv_grid.best_estimator_

In [201]:
joblib.dump(cv_grid.best_estimator_, 'GS_model.pkl')

['GS_model.pkl']

#### Random Search 

#### Defining the Hyperparameter Space

In [202]:
from scipy.stats import randint

rs_space={'max_depth':list(np.arange(10, 100, step=10)) + [None],
              'n_estimators':np.arange(10, 500, step=50),
            #   'max_features':randint(1,7),
            #   'criterion':['gini','entropy'],
            #   'min_samples_leaf':randint(1,4),
            #   'min_samples_split':np.arange(2, 10, step=2)
         }

In [203]:
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestClassifier()
cv_random = RandomizedSearchCV(rf, rs_space, n_iter=100, scoring='accuracy', n_jobs=-1, cv=3)

cv_random.fit(x_feature, y_label.values.ravel())

print_results(cv_random)


BEST PARAMS: {'n_estimators': 60, 'max_depth': 10}

0.674 (+/-0.101) for {'n_estimators': 10, 'max_depth': 10}
0.716 (+/-0.046) for {'n_estimators': 60, 'max_depth': 10}
0.7 (+/-0.081) for {'n_estimators': 110, 'max_depth': 10}
0.703 (+/-0.075) for {'n_estimators': 160, 'max_depth': 10}
0.7 (+/-0.091) for {'n_estimators': 210, 'max_depth': 10}
0.692 (+/-0.096) for {'n_estimators': 260, 'max_depth': 10}
0.704 (+/-0.087) for {'n_estimators': 310, 'max_depth': 10}
0.702 (+/-0.074) for {'n_estimators': 360, 'max_depth': 10}
0.701 (+/-0.083) for {'n_estimators': 410, 'max_depth': 10}
0.696 (+/-0.088) for {'n_estimators': 460, 'max_depth': 10}
0.666 (+/-0.127) for {'n_estimators': 10, 'max_depth': 20}
0.7 (+/-0.072) for {'n_estimators': 60, 'max_depth': 20}
0.701 (+/-0.078) for {'n_estimators': 110, 'max_depth': 20}
0.699 (+/-0.093) for {'n_estimators': 160, 'max_depth': 20}
0.698 (+/-0.096) for {'n_estimators': 210, 'max_depth': 20}
0.697 (+/-0.103) for {'n_estimators': 260, 'max_depth': 20

In [204]:
cv_random.best_estimator_

In [205]:
joblib.dump(cv_random.best_estimator_, 'RS_model.pkl')

['RS_model.pkl']

In [206]:
search_models = {}

for s_mdl in ['GS', 'RS']:
    search_models[s_mdl] = joblib.load('{}_model.pkl'.format(s_mdl))

In [207]:
search_models

{'GS': RandomForestClassifier(max_depth=3),
 'RS': RandomForestClassifier(max_depth=10, n_estimators=60)}

In [208]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from time import time

def evaluate_model(name, model, features, labels):
    start = time()
    pred = model.predict(features)
    end = time()
    accuracy = round(accuracy_score(labels, pred), 3)
    precision = round(precision_score(labels, pred, average='weighted'), 3)
    recall = round(recall_score(labels, pred, average='weighted'), 3)
    print('{} -- Accuracy: {} / Precision: {} / Recall: {} / Latency: {}ms'.format(name,
                                                                                   accuracy,
                                                                                   precision,
                                                                                   recall,
                                                                                   round((end - start)*1000, 1)))

In [209]:
for name, mdl in search_models.items():
    evaluate_model(name, mdl, x_feature, y_label)

GS -- Accuracy: 0.748 / Precision: 0.745 / Recall: 0.748 / Latency: 158.9ms
RS -- Accuracy: 0.905 / Precision: 0.904 / Recall: 0.905 / Latency: 114.5ms


In [210]:
evaluate_model('Random Search', search_models['RS'], x_feature, y_label)

Random Search -- Accuracy: 0.905 / Precision: 0.904 / Recall: 0.905 / Latency: 75.6ms
