In [3]:
from sklearn.datasets import load_boston
boston = load_boston()

In [4]:
print(boston['DESCR'])

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [5]:
X, y = boston['data'], boston['target']

In [6]:
from matplotlib import pyplot as plt

<matplotlib.collections.PathCollection at 0x7f5d39169438>

In [8]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 

In [13]:
from sklearn.neighbors import KNeighborsRegressor

In [14]:
knn = KNeighborsRegressor(n_neighbors=5, weights='uniform', p=2)

In [15]:
knn.fit(X_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [16]:
knn.predict(X_test)

array([32.78,  9.02, 32.72, 19.18,  9.86, 21.18, 20.56, 19.04, 21.92,
       13.2 , 28.38, 17.94, 19.18, 12.66, 33.78, 17.42, 42.88, 13.14,
       29.82, 21.92, 24.64, 13.18, 25.3 , 21.96,  8.42, 21.38, 16.68,
       29.18, 14.  , 22.7 , 42.88, 30.24, 34.78, 12.62, 33.82, 17.  ,
       22.98, 17.42, 29.46, 17.94, 32.78, 14.46, 17.88, 20.04, 23.14,
       33.  , 19.62, 22.22, 20.92, 33.92, 13.3 , 35.62, 26.08, 22.12,
       17.86, 17.04, 22.92, 11.4 , 31.74, 23.34, 22.12, 28.6 , 11.44,
       19.28, 32.32, 29.46, 23.4 , 28.14, 24.92, 39.02, 17.94, 23.46,
       17.96, 12.22,  9.06, 23.78, 20.84,  8.42, 26.32, 22.44, 36.16,
       21.54, 24.78, 16.74, 19.28, 13.  , 30.26, 19.02, 14.58, 22.18,
       13.86, 22.96, 26.1 , 35.72, 31.74, 21.26, 38.84, 19.82, 17.48,
       38.74, 20.34,  9.36])

In [17]:
from sklearn.metrics import mean_squared_error

In [18]:
mean_squared_error(y_test, knn.predict(X_test))

49.483203921568624

In [48]:
from sklearn.model_selection import GridSearchCV

grid_searcher = GridSearchCV(
    KNeighborsRegressor(),
    param_grid={
        'n_neighbors': [1, 2, 3, 4, 5, 10, 20],
        'weights': ['uniform', 'distance'],
        'p': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    },
    cv=5
)
grid_searcher.fit(X_train, y_train)


GridSearchCV(cv=5, error_score=nan,
             estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                           metric='minkowski',
                                           metric_params=None, n_jobs=None,
                                           n_neighbors=5, p=2,
                                           weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 10, 20],
                         'p': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [49]:
grid_searcher.best_params_

{'n_neighbors': 4, 'p': 1, 'weights': 'distance'}

In [33]:
grid_searcher.predict(X_test)

array([32.69770099,  8.80022062, 37.06668516, 19.40375479,  9.94168022,
       21.13875308, 20.44704954, 19.81820761, 24.19267358, 12.84816356,
       28.36410369, 17.99336673, 20.37913638, 12.91741653, 34.20770644,
       17.8166296 , 41.76781939, 13.50065643, 27.24928339, 20.39843356,
       22.36759805, 14.77017352, 19.95141878, 22.26416486,  8.4277294 ,
       21.08122549, 17.8527587 , 31.45570798, 14.40609621, 22.50024726,
       41.74434265, 30.66108984, 33.63863344, 11.85683063, 33.13291357,
       17.32781592, 21.37662472, 17.14229107, 31.9831241 , 18.24905403,
       38.34932628, 15.35837233, 21.3566536 , 21.05370966, 23.42097857,
       37.08874448, 19.59778425, 21.59199119, 20.50939207, 33.13602211,
       13.37724639, 37.26227219, 19.6135673 , 22.57010047, 17.23519367,
       15.99702788, 21.15785185, 13.43008582, 31.65226003, 24.58296853,
       21.71605328, 32.09467746, 11.85318621, 19.2602091 , 32.80030139,
       31.60382186, 22.69183619, 24.24164788, 25.05597224, 28.51

In [35]:
mean_squared_error(y_test, grid_searcher.predict(X_test))

35.42759497086916

In [43]:
errors = []
for n in range (1, 30, 1):
    knn = KNeighborsRegressor(n_neighbors=n)
    knn.fit(X_train, y_train)
    errors.append(mean_squared_error(y_test, knn.predict(X_test)))


In [None]:
plt.plot(range(1, 30, 1), errors)
plt.show()