In [1]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Load the California housing dataset; its 'data', 'feature_names' and
# 'target' entries are used to build the working DataFrame.
housing_data = fetch_california_housing()

In [2]:
# Combine the feature matrix and the target into a single DataFrame;
# the target is appended as the last column under the name 'MedianPrice'.
data = (
    pd.DataFrame(housing_data['data'], columns=housing_data['feature_names'])
    .assign(MedianPrice=housing_data['target'])
)
data.head(10)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianPrice
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
5,4.0368,52.0,4.761658,1.103627,413.0,2.139896,37.85,-122.25,2.697
6,3.6591,52.0,4.931907,0.951362,1094.0,2.128405,37.84,-122.25,2.992
7,3.12,52.0,4.797527,1.061824,1157.0,1.788253,37.84,-122.25,2.414
8,2.0804,42.0,4.294118,1.117647,1206.0,2.026891,37.84,-122.26,2.267
9,3.6912,52.0,4.970588,0.990196,1551.0,2.172269,37.84,-122.25,2.611


Let's introduce missing values (NaN) into some of the columns: every 3rd row of `MedInc` and every 4th row of `AveOccup`.

In [3]:
import numpy as np

# Simulate missing data: blank out every 3rd MedInc value and every 4th
# AveOccup value (by row position).
# The write goes through a single .iloc call on the frame itself. The chained
# form `data.MedInc.iloc[...] = ...` assigns into a temporary Series, which
# pandas warns about and which leaves `data` unchanged under Copy-on-Write.
data.iloc[::3, data.columns.get_loc('MedInc')] = np.nan
data.iloc[::4, data.columns.get_loc('AveOccup')] = np.nan
data.head(10)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianPrice
0,,41.0,6.984127,1.02381,322.0,,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,,37.85,-122.25,3.422
5,4.0368,52.0,4.761658,1.103627,413.0,2.139896,37.85,-122.25,2.697
6,,52.0,4.931907,0.951362,1094.0,2.128405,37.84,-122.25,2.992
7,3.12,52.0,4.797527,1.061824,1157.0,1.788253,37.84,-122.25,2.414
8,2.0804,42.0,4.294118,1.117647,1206.0,,37.84,-122.26,2.267
9,,52.0,4.970588,0.990196,1551.0,2.172269,37.84,-122.25,2.611


Split the data into train and test sets

In [4]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the target column.
y = data.MedianPrice
X = data.drop(columns='MedianPrice')

# Fraction of rows held out for testing; the fixed seed keeps the split reproducible.
train_test_ratio = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=train_test_ratio,
    random_state=0,
    shuffle=True,
)
tuple(split.shape for split in (X_train, y_train, X_test, y_test))

((16512, 8), (16512,), (4128, 8), (4128,))

# KNN - Filling N/A with 0s for now

In [5]:
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor

# Naive baseline: replace every missing value with 0 in both splits.
X_train_zeros, X_test_zeros = X_train.fillna(0), X_test.fillna(0)

# Number of neighbours, reused by the later KNN cells.
k = 15
knn = KNeighborsRegressor(n_neighbors=k).fit(X_train_zeros, y_train)
y_pred = knn.predict(X_test_zeros)
mean_squared_error(y_test, y_pred)

1.2216141058416101

# Scaling Data

In [6]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train_zeros)
X_train_transformed, X_test_transformed = (
    scaler.transform(X_train_zeros),
    scaler.transform(X_test_zeros),
)

In [7]:
# Same KNN setup as before, now trained on the standardised features.
knn = KNeighborsRegressor(n_neighbors=k).fit(X_train_transformed, y_train)
y_pred = knn.predict(X_test_transformed)
mean_squared_error(y_test, y_pred)

0.4898770908862796

# Imputation + Scaling

In [8]:
from sklearn.impute import SimpleImputer

# Learn the per-column means on the training split only, then use them to
# fill the missing values of both splits.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)

# Wrap BOTH splits back into DataFrames, keeping column names and the original
# row index. Leaving the test split as a bare array would make a scaler that
# was fit on the named training frame complain about missing feature names
# when transforming it, and dropping the index would break row alignment with
# y_train / y_test.
X_train_imputed = pd.DataFrame(
    imputer.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_imputed = pd.DataFrame(
    imputer.transform(X_test), columns=X_test.columns, index=X_test.index
)

# Sanity check: fraction of missing values per column should now be 0.
X_train_imputed.isna().mean()

MedInc        0.0
HouseAge      0.0
AveRooms      0.0
AveBedrms     0.0
Population    0.0
AveOccup      0.0
Latitude      0.0
Longitude     0.0
dtype: float64

In [9]:
# Standardise the imputed features; the statistics come from the training split only.
scaler = StandardScaler()
X_train_transformed = scaler.fit_transform(X_train_imputed)
X_test_transformed = scaler.transform(X_test_imputed)

In [10]:
# Evaluate KNN on the mean-imputed, standardised features.
knn = KNeighborsRegressor(n_neighbors=k)
y_pred = knn.fit(X_train_transformed, y_train).predict(X_test_transformed)
mean_squared_error(y_test, y_pred)

0.48139798192076955

# Pipeline

In [11]:
from sklearn.pipeline import Pipeline

# The same three stages as the manual cells, chained into one estimator
# that is fit directly on the raw (NaN-containing) training split.
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
knn = KNeighborsRegressor(n_neighbors=k)
pipeline = Pipeline(steps=[
    ('imputer', imputer),
    ('scaler', scaler),
    ('regressor', knn),
])

y_pred = pipeline.fit(X_train, y_train).predict(X_test)
mean_squared_error(y_test, y_pred)

0.48139798192076955

# Hyperparameters Search

In [12]:
from sklearn.model_selection import GridSearchCV

# Search over the neighbour count and the Minkowski power `p` with 3-fold CV.
# NOTE: X_train_transformed was imputed and scaled using statistics from all
# of X_train, so every validation fold has already contributed to the
# preprocessing it is evaluated with.
knn = KNeighborsRegressor(n_neighbors=k)
parameters = {
    'n_neighbors': [1, 5, 10, 15, 20, 25, 30],
    'p': [1, 2],
}
model = GridSearchCV(
    estimator=knn,
    param_grid=parameters,
    cv=3,
    scoring='neg_mean_squared_error',
)
model.fit(X_train_transformed, y_train)
model.best_params_

{'n_neighbors': 10, 'p': 1}

In [13]:
# Refit KNN on the full pre-processed training split with the parameters the
# grid search (`model`) selected. Reading them from `best_params_` instead of
# re-typing the values keeps this cell correct if the grid or data changes.
knn = KNeighborsRegressor(**model.best_params_)
knn.fit(X_train_transformed, y_train)
y_pred = knn.predict(X_test_transformed)
mean_squared_error(y_test, y_pred)

0.4327261890480249

# Hyperparameters Search + Pipeline

In [14]:
# Tune the imputation strategy together with the KNN settings by handing the
# whole pipeline to the grid search; keys follow the `<step>__<param>` form.
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
knn = KNeighborsRegressor(n_neighbors=10, p=1)
pipeline = Pipeline(steps=[
    ('imputer', imputer),
    ('scaler', scaler),
    ('regressor', knn),
])

parameters = {
    'imputer__strategy': ['mean', 'median'],
    'regressor__n_neighbors': [5, 10, 15],
    'regressor__p': [1, 2],
}
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=3,
    scoring='neg_mean_squared_error',
)
model.fit(X_train, y_train)
model.best_params_

{'imputer__strategy': 'median',
 'regressor__n_neighbors': 10,
 'regressor__p': 1}

In [15]:
# Rebuild the pipeline and configure it with the parameters the grid search
# (`model`) selected. `best_params_` uses the same `<step>__<param>` keys that
# Pipeline.set_params accepts, so nothing has to be copied by hand from the
# search output.
imputer = SimpleImputer()
scaler = StandardScaler()
knn = KNeighborsRegressor()
pipeline = Pipeline([('imputer', imputer), ('scaler', scaler), ('regressor', knn)])
pipeline.set_params(**model.best_params_)

# Fit on the full training split and score on the held-out test split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
mean_squared_error(y_test, y_pred)

0.4306832095234469