In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the Boston housing table from a public GitHub mirror (needs network access).
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv',
)

In [3]:
# Peek at the first five rows (five is the default row count for head()).
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [4]:
# Features are every column except the last; the last column (medv) is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [5]:
from sklearn.model_selection import cross_val_score,KFold
from sklearn.neighbors import KNeighborsRegressor

In [6]:
# KNN regressor with scikit-learn's default hyperparameters. It serves as the
# baseline model and as the estimator whose hyperparameters the searches tune.
# NOTE(review): X is fed in unscaled and KNN is distance-based, so features with
# large ranges may dominate — consider standardising (e.g. via a Pipeline); confirm intent.
knn = KNeighborsRegressor()

In [7]:
# Shuffled 5-fold splitter with a fixed seed, so fold membership is the same on
# every run. The same splitter object is reused by the hyperparameter searches.
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=1,
)

# Baseline: per-fold R^2 of the default KNN regressor.
scores = cross_val_score(estimator=knn, X=X, y=y, scoring='r2', cv=kfold)

In [8]:
# Average R^2 across the five folds — the baseline to beat.
np.mean(scores)

np.float64(0.4761976351913221)

# Grid Search CV

In [9]:
from sklearn.model_selection import GridSearchCV

In [10]:
# Search space for the KNN regressor: 9 * 2 * 3 * 2 = 108 combinations.
param_grid = dict(
    # Number of neighbours averaged for each prediction.
    n_neighbors=[1, 3, 5, 7, 10, 12, 15, 17, 20],
    # 'uniform': all neighbours count equally; 'distance': closer neighbours weigh more.
    weights=['uniform', 'distance'],
    # Neighbour-lookup strategy. In the recorded results the three options score
    # identically, so this axis mostly multiplies the amount of fitting work.
    algorithm=['ball_tree', 'kd_tree', 'brute'],
    # Minkowski power: 1 = Manhattan distance, 2 = Euclidean distance.
    p=[1, 2],
)

In [11]:
# Exhaustive search over every combination in param_grid, scored by R^2 on the
# shared 5-fold splitter. refit=True retrains the best candidate on all of the data.
gcv = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='r2',
    refit=True,
    cv=kfold,
    verbose=2,  # logs one line per individual fit
)

In [12]:
# Run the grid search: 108 candidates x 5 folds = 540 fits.
gcv.fit(X=X, y=y)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=1, weights=distance; total time=   0.0s
[CV] END algorithm=ball_tree, n_neighbors=1, p=2, weights=uniform; total

In [13]:
# Hyperparameter combination that achieved the highest mean cross-validated R^2.
gcv.best_params_

{'algorithm': 'ball_tree', 'n_neighbors': 5, 'p': 1, 'weights': 'distance'}

In [14]:
# Mean cross-validated R^2 of the best combination; compare with the baseline mean score.
gcv.best_score_

np.float64(0.6117139367845081)

In [15]:
# Raw per-candidate results as a dict of arrays (timings, parameters, per-split
# and mean scores). The dump is long; wrapping it in a DataFrame is easier to read.
gcv.cv_results_

{'mean_fit_time': array([0.00282302, 0.00272489, 0.00311885, 0.0021502 , 0.00215139,
        0.00213799, 0.0023026 , 0.00374904, 0.00376291, 0.00392337,
        0.00526652, 0.00402699, 0.00521441, 0.0034132 , 0.00418763,
        0.00420098, 0.00288944, 0.00216336, 0.00213742, 0.0020957 ,
        0.0021121 , 0.00251179, 0.00206609, 0.00204611, 0.00203228,
        0.00207152, 0.002142  , 0.002143  , 0.00210934, 0.00243936,
        0.00204296, 0.00326018, 0.00235691, 0.00226045, 0.00234609,
        0.00214639, 0.00262003, 0.0028018 , 0.00232234, 0.00242662,
        0.00220056, 0.00246854, 0.00221171, 0.00246458, 0.00221844,
        0.00227308, 0.00230265, 0.00219779, 0.00272541, 0.00251822,
        0.00240335, 0.00230222, 0.00224428, 0.00229621, 0.00222273,
        0.00214863, 0.00241613, 0.00232468, 0.00250087, 0.0026278 ,
        0.00245066, 0.0022037 , 0.0024188 , 0.00236783, 0.00302968,
        0.00238614, 0.00349255, 0.00255437, 0.00238848, 0.00248828,
        0.00263972, 0.00268817,

In [16]:
# Tabular view of the grid-search results: one row per candidate.
pd.DataFrame(data=gcv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_n_neighbors,param_p,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002823,0.001180,0.002817,0.000189,ball_tree,1,1,uniform,"{'algorithm': 'ball_tree', 'n_neighbors': 1, '...",0.573689,0.286948,0.522222,0.308922,0.440975,0.426551,0.113434,88
1,0.002725,0.000646,0.003478,0.000728,ball_tree,1,1,distance,"{'algorithm': 'ball_tree', 'n_neighbors': 1, '...",0.573689,0.286948,0.522222,0.308922,0.440975,0.426551,0.113434,91
2,0.003119,0.000944,0.003714,0.001269,ball_tree,1,2,uniform,"{'algorithm': 'ball_tree', 'n_neighbors': 1, '...",0.360926,0.206504,0.519969,0.272539,0.297671,0.331522,0.106425,103
3,0.002150,0.000056,0.002538,0.000019,ball_tree,1,2,distance,"{'algorithm': 'ball_tree', 'n_neighbors': 1, '...",0.360926,0.206504,0.519969,0.272539,0.297671,0.331522,0.106425,107
4,0.002151,0.000059,0.002788,0.000079,ball_tree,3,1,uniform,"{'algorithm': 'ball_tree', 'n_neighbors': 3, '...",0.630596,0.487312,0.566011,0.503420,0.622761,0.562020,0.059031,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,0.002760,0.000369,0.003824,0.000331,brute,17,2,distance,"{'algorithm': 'brute', 'n_neighbors': 17, 'p':...",0.500516,0.420652,0.566862,0.382102,0.531344,0.480295,0.068859,60
104,0.002054,0.000524,0.003297,0.000551,brute,20,1,uniform,"{'algorithm': 'brute', 'n_neighbors': 20, 'p':...",0.435499,0.391049,0.573618,0.331686,0.470302,0.440431,0.081141,85
105,0.001724,0.000030,0.003005,0.000046,brute,20,1,distance,"{'algorithm': 'brute', 'n_neighbors': 20, 'p':...",0.557940,0.495954,0.656642,0.425541,0.577024,0.542620,0.077852,31
106,0.001845,0.000195,0.002552,0.000065,brute,20,2,uniform,"{'algorithm': 'brute', 'n_neighbors': 20, 'p':...",0.369724,0.318137,0.464698,0.297535,0.398565,0.369732,0.059539,100


In [17]:
# Candidates ranked from best to worst mean R^2, keeping only the hyperparameter
# columns and the mean test score.
(
    pd.DataFrame(gcv.cv_results_)
    .loc[:, [
        'param_algorithm',
        'param_n_neighbors',
        'param_p',
        'param_weights',
        'mean_test_score',
    ]]
    .sort_values(by='mean_test_score', ascending=False)
)

Unnamed: 0,param_algorithm,param_n_neighbors,param_p,param_weights,mean_test_score
9,ball_tree,5,1,distance,0.611714
45,kd_tree,5,1,distance,0.611714
81,brute,5,1,distance,0.611714
13,ball_tree,7,1,distance,0.605716
49,kd_tree,7,1,distance,0.605716
...,...,...,...,...,...
38,kd_tree,1,2,uniform,0.331522
75,brute,1,2,distance,0.331522
74,brute,1,2,uniform,0.331522
39,kd_tree,1,2,distance,0.331522


# Randomized Search CV

In [18]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search: instead of trying all 108 combinations in param_grid, it
# samples n_iter of them and cross-validates only those.
#   * n_iter=10 is scikit-learn's default, spelled out so the sampling budget is visible.
#   * random_state pins which candidates are sampled. Without it every run draws
#     a different subset, so best_params_ / best_score_ change between runs and
#     the notebook is not reproducible under Restart & Run All.
rcv = RandomizedSearchCV(
    knn,
    param_grid,
    n_iter=10,
    scoring='r2',
    refit=True,
    cv=kfold,
    verbose=2,  # logs one line per individual fit
    random_state=1,
)
rcv.fit(X, y)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END algorithm=kd_tree, n_neighbors=15, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=kd_tree, n_neighbors=15, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=kd_tree, n_neighbors=15, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=kd_tree, n_neighbors=15, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=kd_tree, n_neighbors=15, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=brute, n_neighbors=20, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=brute, n_neighbors=20, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=brute, n_neighbors=20, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=brute, n_neighbors=20, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=brute, n_neighbors=20, p=1, weights=uniform; total time=   0.0s
[CV] END algorithm=brute, n_neighbors=17, p=1, weights=uniform; total time=   0.0s
[CV] END algorit

In [19]:
# Best combination among the randomly sampled candidates only — it may differ
# from the grid-search optimum because most of the grid was never evaluated.
rcv.best_params_

{'weights': 'uniform', 'p': 1, 'n_neighbors': 5, 'algorithm': 'ball_tree'}

In [20]:
# Mean cross-validated R^2 of the best sampled candidate; compare with gcv.best_score_.
rcv.best_score_

np.float64(0.589758956010885)