In [1]:
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global (legacy) random generator so that anything drawing from
# it without its own explicit seed is repeatable on a fresh Restart & Run All.
np.random.seed(0)

In [2]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing dataset bundle and unpack its feature matrix,
# target vector and column names into notebook-level variables.
data = fetch_california_housing()
feature_names = data.feature_names
X = data.data
y = data.target

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set. random_state is pinned so that
# re-running this cell on its own always yields the same split, instead of
# depending on how much of NumPy's global random stream has been consumed.
# NOTE(review): with the global seed of 0 set at the top and nothing drawing
# random numbers in between, this should pick the same rows as the unseeded
# call did on a fresh run — confirm by re-running the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

# Two-step model: standardise the features, then k-nearest-neighbours
# regression with default hyperparameters. The step names matter: the
# parameter grid addresses the regressor through the 'reg__' prefix.
pipeline = Pipeline(
    steps=[
        ("std_scaler", StandardScaler()),
        ("reg", KNeighborsRegressor()),
    ]
)

In [5]:
# Fit the whole pipeline (scaler, then regressor) on the training split only.
# The bare call is the cell's last expression, so the fitted pipeline's repr
# is what the cell displays.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('std_scaler', StandardScaler()),
                ('reg', KNeighborsRegressor())])

In [6]:
# Baseline: evaluate the untuned pipeline on both splits first, then report.
# The gap between the two numbers indicates how well the model generalises.
train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)

# Rounded to 4 decimals for display only; the variables keep full precision.
print('R2 score on the training set:', np.round(train_score, 4))
print('R2 score on the test set:', np.round(test_score, 4))

R2 score on the training set: 0.8073
R2 score on the test set: 0.6898


In [7]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter candidates for the 'reg' step: 20 neighbourhood sizes
# (5, 10, ..., 100) x 5 Minkowski exponents (1..5) = 100 combinations.
# Plain Python ints (rather than NumPy scalars) keep the printed best_params_
# free of np.int64(...) wrappers on NumPy >= 2, so the output reads the same
# regardless of the installed NumPy version.
param_grid = {
    'reg__n_neighbors': list(range(5, 101, 5)),
    'reg__p': list(range(1, 6)),
}

# Exhaustive 3-fold cross-validated search over the grid (300 fits in total),
# parallelised across all available cores; only the training split is used.
grid = GridSearchCV(pipeline, param_grid, cv=3, verbose=1, n_jobs=-1)
grid.fit(X_train, y_train)

print(grid.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
{'reg__n_neighbors': 10, 'reg__p': 1}


In [8]:
# Tuned model: score the fitted search object on both splits, then report.
# Note that this rebinds train_score / test_score from the baseline cell.
train_score = grid.score(X_train, y_train)
test_score = grid.score(X_test, y_test)

# Rounded to 4 decimals for display only; the variables keep full precision.
print('R2 score on the training set:', np.round(train_score, 4))
print('R2 score on the test set:', np.round(test_score, 4))

R2 score on the training set: 0.7922
R2 score on the test set: 0.7366
