In [18]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV
from skopt.space import Integer


import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


In [2]:
iris = datasets.load_iris()


In [3]:
X = iris.data
Y = iris.target

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)


In [5]:
model = KNeighborsClassifier()
model.fit(X_train, Y_train)


KNeighborsClassifier()

In [6]:
print(model.predict(X_test))


[2 2 1 1 0 1 1 2 0 2 2 0 2 0 1 2 1 0 1 2 0 1 0 0 1 0 2 1 1 2]


In [7]:
print(Y_test)


[2 2 1 1 0 1 2 2 0 2 2 0 2 0 1 1 1 0 1 2 0 1 0 0 1 0 2 2 1 2]


In [8]:
print(model.score(X_test, Y_test))


0.9


In [24]:
#  define parameters
param_grid = {
            'n_neighbors': (1, 21),  # integer valued parameter
            'weights': ['uniform', 'distance'],  # categorical parameter
            'metric': ['euclidean', 'manhattan', 'minkowski'] # categorical parameter
        }

# define grid search
cv = RepeatedStratifiedKFold(n_splits=5)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv, scoring='accuracy')
grid = grid_search.fit(X_train, Y_train)

# summarize results
print("The best parameters are %s with a score of %0.2f"
    % (grid.best_params_, grid.best_score_))

# grid search parameter and accuracy score 
grid_results = pd.concat([pd.DataFrame(grid.cv_results_["params"]),pd.DataFrame(grid.cv_results_["mean_test_score"], columns=["Accuracy"])],axis=1)
grid_results.sort_values(by=['Accuracy'], inplace=True, ascending=False)
print(grid_results)

The best parameters are {'metric': 'euclidean', 'n_neighbors': 21, 'weights': 'distance'} with a score of 0.97
       metric  n_neighbors   weights  Accuracy
3   euclidean           21  distance  0.971667
11  minkowski           21  distance  0.971667
7   manhattan           21  distance  0.965000
0   euclidean            1   uniform  0.958333
1   euclidean            1  distance  0.958333
8   minkowski            1   uniform  0.958333
9   minkowski            1  distance  0.958333
4   manhattan            1   uniform  0.957500
5   manhattan            1  distance  0.957500
6   manhattan           21   uniform  0.956667
2   euclidean           21   uniform  0.951667
10  minkowski           21   uniform  0.951667


In [22]:
# using the bayesian optimisation model - part of scikit-learn library
# define search space - each parameter must be a probabiliy model
params = {
        'n_neighbors': (1, 100),  # integer valued parameter
        'weights': ['uniform', 'distance'],  # categorical parameter
        'metric': ['euclidean', 'manhattan', 'minkowski'] # categorical parameter
    }

# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define the search - refer to: https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html
search = BayesSearchCV(estimator=model, search_spaces=params, n_jobs=-1, cv=cv)
# perform the search
search_result = search.fit(X_train, Y_train)
# Prints the results - optimal hyper-parameters and the accuracy score
print("The best parameters are %s with a score of %0.2f" % (search_result.best_params_, search_result.best_score_))



The best parameters are OrderedDict([('metric', 'euclidean'), ('n_neighbors', 14), ('weights', 'distance')]) with a score of 0.98


## OTHER


In [2]:
# example of grid searching key hyperparametres for KNeighborsClassifier
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# define dataset
X, y = make_blobs(n_samples=1000, centers=2, n_features=100, cluster_std=20)

# define models and parameters
model = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']

# define grid search
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.985000 using {'metric': 'euclidean', 'n_neighbors': 19, 'weights': 'uniform'}
0.910667 (0.026949) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'uniform'}
0.910667 (0.026949) with: {'metric': 'euclidean', 'n_neighbors': 1, 'weights': 'distance'}
0.949333 (0.024757) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
0.949333 (0.024757) with: {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
0.963333 (0.024944) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
0.963333 (0.024944) with: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
0.974667 (0.018927) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}
0.974667 (0.018927) with: {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'}
0.979000 (0.015990) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'uniform'}
0.979000 (0.015990) with: {'metric': 'euclidean', 'n_neighbors': 9, 'weights': 'distance'}
0.979667 

In [3]:
from pprint import pprint


# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)


NameError: name 'np' is not defined

In [None]:
# example of bayesian optimization with scikit-optimize
from numpy import mean
from sklearn.datasets import make_blobs
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from skopt.space import Integer
from skopt.utils import use_named_args
from skopt import gp_minimize

# generate 2d classification dataset
X, y = make_blobs(n_samples=500, centers=3, n_features=2)

# define the model
model = KNeighborsClassifier()
# define the space of hyperparameters to search
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']

search_space = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)

# define the function used to evaluate a given configuration
@use_named_args(search_space)
def evaluate_model(**params):
	# something
	model.set_params(**params)
	# calculate 5-fold cross validation
	result = cross_val_score(model, X, y, cv=5, n_jobs=-1, scoring='accuracy')
	# calculate the mean of the scores
	estimate = mean(result)
	return 1.0 - estimate

# perform optimization
result = gp_minimize(evaluate_model, search_space)
# summarizing finding:
print('Best Accuracy: %.3f' % (1.0 - result.fun))
print('Best Parameters: n_neighbors=%d, p=%d' % (result.x[0], result.x[1]))

In [None]:
# example of grid searching key hyperparametres for KNeighborsClassifier
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
# define dataset
X, y = make_blobs(n_samples=1000, centers=2, n_features=100, cluster_std=20)

# define models
model = KNeighborsClassifier()

#  define parameters
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']

param_grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)

# define grid search
cv = RepeatedStratifiedKFold(n_splits=5)

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv)
grid = grid_search.fit(X, y)

# summarize results
print("The best parameters are %s with a score of %0.2f"
    % (grid.best_params_, grid.best_score_))


