In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression
import joblib
import warnings
# Silence every warning category for the rest of the session (presumably to keep
# cell output uncluttered). NOTE(review): this also hides deprecation and
# convergence warnings that may matter when debugging.
warnings.filterwarnings('ignore')

In [2]:
# Fixed seed, reused by the train/test split further down so runs are repeatable.
RANDOM_STATE = 42

# Notebook banner: the title framed by two 60-character rules.
print("=" * 60, "k-NN REGRESSOR - COMPLETE IMPLEMENTATION", "=" * 60, sep="\n")

# Step 1 heading, underlined with a 30-character rule.
print("\n1. LOADING DATA", "-" * 30, sep="\n")

# Fetch scikit-learn's California Housing dataset and wrap it in pandas objects:
# a feature frame labelled with the dataset's feature names, and a target
# Series named 'target'.
data = fetch_california_housing()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target, name='target')

k-NN REGRESSOR - COMPLETE IMPLEMENTATION

1. LOADING DATA
------------------------------


In [3]:
# Dimensions of the feature frame as (rows, columns).
X.shape

(20640, 8)

In [4]:
# Length of the target Series; it should match the row count of X.
y.shape

(20640,)

In [5]:
# Preview the first five feature rows.
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [6]:
# Preview the first five target values.
y.head()

0    4.526
1    3.585
2    3.521
3    3.413
4    3.422
Name: target, dtype: float64

In [7]:
# Count missing values per feature column.
X.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

In [8]:
# Count rows that exactly repeat an earlier row across all feature columns.
X.duplicated().sum()

np.int64(0)

In [9]:
# Column dtypes, to confirm every feature is numeric before scaling.
X.dtypes

MedInc        float64
HouseAge      float64
AveRooms      float64
AveBedrms     float64
Population    float64
AveOccup      float64
Latitude      float64
Longitude     float64
dtype: object

In [10]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
X.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


In [11]:
# Hold out 20% of the rows as a test set. The fixed seed makes the split
# repeatable; no stratification is requested.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=None,
)

In [12]:
# Check the shapes of the four pieces produced by the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape 

((16512, 8), (4128, 8), (16512,), (4128,))

In [13]:
# Standardizer for the features; it is created unfitted here and fitted on the
# training split in the following cell.
scaler = StandardScaler()

In [14]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so no test-set statistics enter the fit.
x_train_scl = scaler.fit(X_train).transform(X_train)
x_test_scl = scaler.transform(X_test)

In [15]:
# Shape of the scaled training matrix; scaling should leave it unchanged from X_train.
x_train_scl.shape

(16512, 8)

In [16]:
# Display the scaled training array (the repr elides the middle of a large array).
x_train_scl

array([[-0.326196  ,  0.34849025, -0.17491646, ...,  0.05137609,
        -1.3728112 ,  1.27258656],
       [-0.03584338,  1.61811813, -0.40283542, ..., -0.11736222,
        -0.87669601,  0.70916212],
       [ 0.14470145, -1.95271028,  0.08821601, ..., -0.03227969,
        -0.46014647, -0.44760309],
       ...,
       [-0.49697313,  0.58654547, -0.60675918, ...,  0.02030568,
        -0.75500738,  0.59946887],
       [ 0.96545045, -1.07984112,  0.40217517, ...,  0.00707608,
         0.90651045, -1.18553953],
       [-0.68544764,  1.85617335, -0.85144571, ..., -0.08535429,
         0.99543676, -1.41489815]])

In [17]:
# k-nearest-neighbours regressor using the 3 closest training points.
# NOTE(review): k=3 is hard-coded here and not tuned in the cells shown.
knn = KNeighborsRegressor(n_neighbors=3)

In [18]:
# Fit the regressor on the scaled training features and their targets.
knn.fit(X=x_train_scl, y=y_train)

In [19]:
# Score the fitted model on the held-out split (the same value r2_score reports below).
knn.score(X=x_test_scl, y=y_test)

0.6438795499720962

In [20]:
# Predict targets for the scaled held-out features.
y_pred = knn.predict(X=x_test_scl)

In [21]:
# Display the predicted values (the repr elides the middle of a large array).
y_pred

array([0.487     , 0.67366667, 4.58334   , ..., 5.00001   , 0.69666667,
       1.89133333])

In [22]:
# Display the true held-out targets; the index keeps the original row labels from X/y.
y_test

20046    0.47700
3024     0.45800
15663    5.00001
20484    2.18600
9814     2.78000
          ...   
15362    2.63300
16623    2.66800
18086    5.00001
2144     0.72300
3665     1.51500
Name: target, Length: 4128, dtype: float64

In [23]:
# Mean squared error between the held-out targets and the predictions.
mean_squared_error(y_true=y_test, y_pred=y_pred)

0.4666634350517549

In [24]:
# Mean absolute error between the held-out targets and the predictions.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

0.4599198611111111

In [25]:
# Coefficient of determination on the held-out split; it matches knn.score above.
r2_score(y_true=y_test, y_pred=y_pred)

0.6438795499720962

In [26]:
# Side-by-side table of the true held-out targets and the k-NN predictions.
# Rows keep y_test's index labels; the prediction column is labelled "y_pred".
pd.DataFrame({"y_true": y_test, "y_pred": y_pred})

Unnamed: 0,y_true,y_prde
20046,0.47700,0.487000
3024,0.45800,0.673667
15663,5.00001,4.583340
20484,2.18600,2.866333
9814,2.78000,2.958667
...,...,...
15362,2.63300,2.600667
16623,2.66800,2.137000
18086,5.00001,5.000010
2144,0.72300,0.696667
