In [1]:
# What is a hyperparameter?
#
# A parameter that we set on the model ourselves, rather than one the model learns from the data, is called a hyperparameter.
#
# Every algorithm (model) has its own set of hyperparameters.
# Our goal is to identify the hyperparameter values that give the best-quality model.
# This is achieved through hyperparameter tuning.
#
#
# Is Hyperparameter Tuning Mandatory?
#
# If you achieve the best quality model with default config, then this step is not required.

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Read the Iris CSV from a GitHub gist into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/639388c2cbc2120a14dcf466e85730eb8be498bb/iris.csv'
)

In [4]:
# Positional split: every column except the last becomes the feature matrix,
# and the last column becomes the label vector (both as NumPy arrays).
features, label = data.iloc[:, :-1].values, data.iloc[:, -1].values

In [5]:
# Let's get the goal clear: start from a baseline model.
# Algorithm: k-nearest-neighbours classifier, constructed with no arguments,
# so every hyperparameter keeps its default value.
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()

In [6]:
from sklearn.model_selection import cross_val_score

# Score the untuned model with 10-fold cross-validation; this is the baseline
# that the tuned models are compared against.
scores = cross_val_score(estimator=model, X=features, y=label, cv=10)

# Display the per-fold scores.
scores

array([1.        , 0.93333333, 1.        , 1.        , 0.86666667,
       0.93333333, 0.93333333, 1.        , 1.        , 1.        ])

In [7]:
# Average the per-fold scores into a single baseline figure.
scores.mean()

0.9666666666666668

In [8]:
# How do we find out which hyperparameters a scikit-learn estimator exposes?
# IPython's `?` help syntax displays the object's signature and docstring.
?KNeighborsClassifier

In [9]:
# Hyperparameters of KNeighborsClassifier and their default values:
#n_neighbors=5,
#weights='uniform',
#algorithm='auto',
#leaf_size=30,
#p=2,
#metric='minkowski',
#metric_params=None,
#n_jobs=None,
#**kwargs,

# Goal is to find the optimal Hyperparameter values

In [10]:
# Method 1: hyperparameter tuning with GridSearchCV

# Step 1 - describe the search space. It is expressed as a dictionary that
# maps each hyperparameter name to the candidate values to try.
weightParameter = ['uniform', 'distance']
kvalues = np.arange(1, 31)  # k = 1 .. 30
algoParams = ['auto', 'ball_tree', 'kd_tree', 'brute']

paramGrid = {
    'n_neighbors': kvalues,
    'weights': weightParameter,
    'algorithm': algoParams,
}

# Step 2 - wrap the baseline estimator in GridSearchCV, which will score the
# candidate combinations with 10-fold cross-validation once it is fitted.
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(estimator=model, param_grid=paramGrid, cv=10)

In [11]:
# Step 3 - run the grid search on the full feature matrix and labels.
grid.fit(X=features, y=label)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]),
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [12]:
# Check results: the best cross-validated score found by the grid search.

grid.best_score_

0.98

In [21]:
# The hyperparameter combination that achieved the best score.
grid.best_params_

{'algorithm': 'auto', 'n_neighbors': 13, 'weights': 'uniform'}

In [22]:
# The estimator object configured with those best hyperparameters.
grid.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=13, p=2,
           weights='uniform')

In [16]:
# Build the final (still unfitted) model from the winning grid-search settings
# instead of re-typing them by hand, so it cannot drift out of sync with the
# search result when the data, the grid or the library version changes.
# Hyperparameters that were not part of the grid keep their default values.
finalModel = KNeighborsClassifier(**grid.best_params_)

In [23]:
# Drawback of this method:
# GridSearchCV evaluates every combination in the grid, so it is the slowest method, but it is guaranteed to find the best combination within that grid.
# If you want to speed up the search for good hyperparameters, you can use Method 2.

In [24]:
# Method 2: RandomizedSearchCV

# 1. Design the parameter grid. It is expressed as a dictionary that maps each
#    hyperparameter name to the candidate values to draw from.

weightParameter = ['uniform','distance']
kvalues = np.arange(1,31)
algoParams = ['auto', 'ball_tree', 'kd_tree', 'brute']

paramGrid = dict(n_neighbors = kvalues,
                 weights = weightParameter,
                 algorithm=algoParams)

# 2. Apply RandomizedSearchCV to identify good values for each hyperparameter.
#    The grid holds 30 * 2 * 4 = 240 combinations; n_iter=60 limits the search
#    to 60 sampled candidates instead of all of them.
#    NOTE: no random_state is passed, so the sampled candidates (and therefore
#    the reported best parameters) can differ from run to run.

# The import must start at column 0: an indented top-level statement raises
# "IndentationError: unexpected indent" and the cell would not run.
from sklearn.model_selection import RandomizedSearchCV
rGrid = RandomizedSearchCV(model,
                           param_distributions=paramGrid,
                           cv=10,
                           n_iter=60)

In [25]:
# Run the randomized search on the full feature matrix and labels.
rGrid.fit(X=features, y=label)

RandomizedSearchCV(cv=10, error_score='raise-deprecating',
          estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
          fit_params=None, iid='warn', n_iter=60, n_jobs=None,
          param_distributions={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]), 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [26]:
# Check results: the best cross-validated score found by the randomized search.

rGrid.best_score_

0.98

In [27]:
# The sampled hyperparameter combination that achieved the best score.
rGrid.best_params_

{'weights': 'distance', 'n_neighbors': 27, 'algorithm': 'ball_tree'}

In [28]:
# The estimator object configured with those best hyperparameters.
rGrid.best_estimator_

KNeighborsClassifier(algorithm='ball_tree', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=27, p=2,
           weights='distance')