In [108]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing, load_iris
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split
from sklearn.neighbors import (
    KNeighborsClassifier, KNeighborsRegressor, 
    RadiusNeighborsClassifier, RadiusNeighborsRegressor
)
from sklearn.preprocessing import StandardScaler

In [124]:
# Load the iris dataset and pull out the feature matrix (x) and labels (y).
iris = load_iris()
x, y = iris.data, iris.target

In [8]:
type(x)

numpy.ndarray

In [9]:
x.shape

(150, 4)

In [10]:
type(y)

numpy.ndarray

In [11]:
y.shape

(150,)

In [12]:
# split into train and test sets (8:2)

In [125]:
# Hold out 20% of the samples for testing.
# random_state pins the shuffle so the split (and every score computed from
# it) is reproducible after a kernel restart; stratify=y keeps the class
# proportions the same in the train and test subsets.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [14]:
x_train.shape

(120, 4)

In [15]:
x_test.shape

(30, 4)

In [16]:
y_train.shape

(120,)

In [17]:
y_test.shape

(30,)

In [18]:
# k-nearest-neighbors classifier: 100 neighbors, Euclidean distance,
# votes weighted by distance. Construct and fit in one step.
knn_clf = KNeighborsClassifier(
    n_neighbors=100,
    metric="euclidean",
    weights="distance",
).fit(x_train, y_train)
# Show the fitted estimator as the cell output.
knn_clf

In [19]:
# predict for training and test set.

In [20]:
# Class probabilities and hard labels for the training samples.
y_train_prob = knn_clf.predict_proba(x_train)
y_train_pred = knn_clf.predict(x_train)

In [21]:
# Class probabilities and hard labels for the held-out test samples.
y_test_prob = knn_clf.predict_proba(x_test)
y_test_pred = knn_clf.predict(x_test)

In [22]:
# Score of the classifier on the data it was fitted on.
knn_clf.score(X=x_train, y=y_train)

1.0

In [23]:
# Score of the classifier on the held-out test samples.
knn_clf.score(X=x_test, y=y_test)

0.8666666666666667

In [24]:
# to see neighbors

In [25]:
# (distances, indexes) of the fitted neighbors for every training sample.
nn_train = knn_clf.kneighbors(X=x_train, return_distance=True)

In [26]:
# (distances, indexes) of the fitted neighbors for every test sample.
nn_test = knn_clf.kneighbors(X=x_test, return_distance=True)

In [28]:
# kneighbors returned a (distances, indexes) pair; give each part a name.
nn_test_distances = nn_test[0]
nn_test_indexes = nn_test[1]

In [29]:
nn_test_distances

array([[0.14142136, 0.26457513, 0.34641016, ..., 2.78926514, 2.82311884,
        2.83196045],
       [0.33166248, 0.34641016, 0.37416574, ..., 3.04138127, 3.04138127,
        3.07083051],
       [0.24494897, 0.24494897, 0.31622777, ..., 3.64965752, 3.67151195,
        3.67287353],
       ...,
       [0.26457513, 0.42426407, 0.45825757, ..., 3.66742416, 3.69052842,
        3.69459064],
       [0.36055513, 0.36055513, 0.41231056, ..., 3.93700394, 3.93700394,
        3.94588393],
       [0.17320508, 0.17320508, 0.24494897, ..., 4.88364618, 4.9010203 ,
        4.92036584]])

In [31]:
nn_test_distances.shape

(30, 100)

In [30]:
nn_test_indexes

array([[ 86,  91,  43, ...,  69,  90,  79],
       [ 30,  81,  67, ...,  16,  96,  68],
       [ 76,  48,  93, ..., 106,  35,  34],
       ...,
       [108,   1,  93, ...,  54,  96, 106],
       [  9,  80, 110, ...,  98, 106,   7],
       [ 84,  16, 106, ...,  40,  45,  83]])

In [32]:
nn_test_indexes.shape

(30, 100)

In [None]:
# how to get neighbors manually
# 1. calculate distances with all training samples
# 2. find the k-samples with smallest distances

In [36]:
# Step 1 of the manual neighbor search: distance from every test sample (rows)
# to every training sample (columns). The metric is spelled out to match the
# Euclidean metric used by knn_clf.
dist_mat = pairwise_distances(x_test, x_train, metric="euclidean")

In [37]:
dist_mat.shape

(30, 120)

In [41]:
# Step 2: training-sample indexes ordered from nearest to farthest
# for the first test sample.
dist_mat[0].argsort()

array([ 86,  91,  43,  27,  38,  81,  67,  95, 119,  63, 100,   3,  89,
        28,  71,  78,  64,  14,  75,  66,  10, 117,  30,  97,  36,  20,
       115,  74,   4,  72,   1,  77,  76,  48, 108,  21, 113,  50,  51,
        25,   9,  42,  59,  80, 110,  93,  13,  57, 104,  23,  19, 107,
        33,  46,   8,  44,   6,  12,   5,  15,  22, 102,  47,  73,  40,
        94,  11, 103,  34,  17,  32,   0,  82,   7,  52,  49,  45, 101,
        99,  92, 106,  54, 111,  53,  68,  16,  98,  65,  83,  18,  61,
        39,  96,  87,  58,  26,  84,  69,  90,  79,  60,   2,  31,  62,
        88,  85,  35,  29,  37, 114,  55, 112, 105,  56, 109,  41, 118,
       116,  70,  24])

In [43]:
# Indexes of the three closest training samples to the first test sample.
dist_mat[0].argsort()[:3]

array([86, 91, 43])

In [134]:
# declare and fit radius neighbor (with radius=0.9, metric=manhattan, weights=uniform)
# A query point can have no training sample within the radius (the
# radius_neighbors output for the test set contains an empty neighbor list).
# With the default outlier_label=None, predict/predict_proba raise a
# ValueError for such points, so outliers are assigned the most frequent
# training class instead.
rnn_clf = RadiusNeighborsClassifier(
    radius=0.9,
    metric="manhattan",
    weights="uniform",
    outlier_label="most_frequent",
)
rnn_clf.fit(x_train, y_train)

In [132]:
# Radius-neighbors class probabilities and labels for the training samples.
y_train_prob_rnn = rnn_clf.predict_proba(x_train)
y_train_pred_rnn = rnn_clf.predict(x_train)

In [133]:
# Radius-neighbors class probabilities and labels for the test samples.
y_test_prob_rnn = rnn_clf.predict_proba(x_test)
y_test_pred_rnn = rnn_clf.predict(x_test)



In [53]:
# Distances and indexes of all training samples inside the radius of each
# training sample; the number of neighbors varies per query.
nn_train_distances_rnn, nn_train_indexes_rnn = rnn_clf.radius_neighbors(
    X=x_train, return_distance=True
)

In [54]:
# Distances and indexes of all training samples inside the radius of each
# test sample; the number of neighbors varies per query.
nn_test_distances_rnn, nn_test_indexes_rnn = rnn_clf.radius_neighbors(
    X=x_test, return_distance=True
)

In [55]:
nn_test_indexes_rnn

array([array([43, 27, 86, 91, 38]),
       array([ 27, 117,  81,  95,  14,   4,  67,  66,  64,  30,   3,  28,  10,
              115,   1,  72])                                                 ,
       array([ 76,   4,  30, 113, 115, 108,   1,  48,  72,  93,  77,  13]),
       array([ 32,  57, 102,  73,  33,  46,  44,  12]),
       array([89, 75, 71]), array([88, 83, 31]),
       array([  0,   2, 106,  98,  96,  92,  15,  16,  84,  68,  62,  54,  52,
               47])                                                           ,
       array([  0, 106,   5,   6,   7, 103, 101,  98,  11,  96,  15,  16,  84,
               82,  22,  69,  68,  26,  61,  54,  53,  34])                   ,
       array([106,   6, 103, 101,  98,  11,  96,  92,  15,  16,  84,  22,  68,
               62,  54,  52,  47])                                            ,
       array([], dtype=int64),
       array([111, 106,   7, 101,  90,  16,  84,  82,  22,  69,  68,  26,  61,
               53,  34,  39])        

In [70]:
# load housing dataset

In [71]:
# Fetch the California housing dataset; its .data and .target attributes
# are used below as the regression features and targets.
housing = fetch_california_housing()

In [73]:
# Rebind x / y to the housing features and targets
# (this replaces the iris arrays used earlier).
x, y = housing.data, housing.target

In [82]:
# randomly select 1000 samples (subsampling only to keep this in-class practice fast)

In [83]:
# Number of rows (samples) in the feature matrix.
n_sample = len(x)

In [85]:
# Draw 1000 distinct row indexes.
# choice() samples WITH replacement by default, which can pick the same row
# more than once; duplicated rows could then land in both the train and the
# test split. replace=False guarantees unique rows, and the seeded generator
# makes the subsample reproducible across kernel restarts.
sample_indexes = np.random.default_rng(42).choice(n_sample, size=1000, replace=False)
sample_indexes.shape

(1000,)

In [86]:
sample_indexes

array([13076,  7124, 10045,  1408,  6177, 18821,  7314, 14333,  5076,
       13475,  2526,  3920, 13098,  7712,  1721,  2229,  2076, 16848,
       14488,  1661, 14237, 19155, 13096,  7580, 12522, 10061,  2508,
       15118,  7204, 14633, 12684,  9499,  7970, 20118, 13907, 12836,
        1714, 14446, 20310, 11482, 11676, 12727,  4190, 19396, 17165,
       16214, 12838,  2112,  9965, 15387, 14288, 15976, 13333,  6472,
        4249,  6231, 12534, 10527, 10814, 13718,   941,  7814, 14615,
        1123,  1855, 19584, 16112,  6157, 11522,  6133, 13278,  8915,
         526, 15157,  4005,  4549,  1092, 19363,  1281,  7643, 11270,
       13376, 15290,  9948, 13990,  7797,  5337, 12782,  9338, 14181,
       15546, 15905, 10108, 15891, 14695, 15255, 11064, 18429, 17816,
       11278,  1183,  2436,  8876, 14583, 18644, 16649, 13202, 14359,
        2223, 13091,   690,  2316, 10236,  1623,  2867,    55,  8112,
       17575, 18884, 10420, 19477, 20304, 11998,  6517,  4125,  1099,
       10186,  9637,

In [87]:
# Keep only the sampled rows.
# Index the original dataset arrays rather than x / y themselves so this cell
# is idempotent: re-running `x = x[sample_indexes]` would index the already
# reduced 1000-row array with indexes drawn from the full dataset.
x = housing.data[sample_indexes]
y = housing.target[sample_indexes]

In [88]:
x.shape

(1000, 8)

In [89]:
y.shape

(1000,)

In [74]:
# split train test set (8:2)

In [90]:
# Hold out 20% of the sampled rows for testing.
# random_state pins the shuffle so the split, and all results derived from
# it, are reproducible after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [91]:
x_train.shape

(800, 8)

In [92]:
x_test.shape

(200, 8)

In [93]:
y_train.shape

(800,)

In [94]:
y_test.shape

(200,)

In [95]:
# declare and fit k nearest neighbor regressor (n_neighbors=10, metric=euclidean, weights=uniform)

In [99]:
# k-nearest-neighbors regressor: 10 neighbors, Euclidean distance, uniform
# weights, fitted on the unscaled training features.
knn_reg = KNeighborsRegressor(
    n_neighbors=10,
    metric="euclidean",
    weights="uniform",
).fit(x_train, y_train)
# Show the fitted estimator as the cell output.
knn_reg

In [100]:
# Regression predictions for the test and training samples.
# These rebind the y_*_pred names used by the classifier earlier.
y_test_pred = knn_reg.predict(x_test)
y_train_pred = knn_reg.predict(x_train)

In [103]:
# Neighbor distances and indexes for every training sample (unscaled features).
nn_dist_train, nn_idx_train = knn_reg.kneighbors(X=x_train, return_distance=True)

In [104]:
# Neighbor distances and indexes for every test sample (unscaled features).
nn_dist_test, nn_idx_test = knn_reg.kneighbors(X=x_test, return_distance=True)

In [110]:
# Fit the standardizer on the training features only, so no statistics
# from the test set are used.
scaler = StandardScaler().fit(x_train)
# Show the fitted scaler as the cell output.
scaler

In [111]:
# Apply the training-set scaling to both subsets.
x_train_scaled, x_test_scaled = (
    scaler.transform(features) for features in (x_train, x_test)
)

In [112]:
# Per-feature mean of the scaled training set.
x_train_scaled.mean(axis=0)

array([ 2.90878432e-16,  7.55645546e-17, -1.25906230e-15,  2.83788618e-15,
       -8.52096171e-17, -7.32469640e-16, -6.56905086e-15, -7.72382158e-15])

In [113]:
# Per-feature standard deviation of the scaled training set.
x_train_scaled.std(axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1.])

In [114]:
# Per-feature mean of the scaled test set (scaled with training statistics).
x_test_scaled.mean(axis=0)

array([ 0.05432788, -0.05951009,  0.10388935,  0.06266112,  0.0963757 ,
        0.61294295,  0.0081619 ,  0.01930721])

In [115]:
# Per-feature standard deviation of the scaled test set
# (scaled with training statistics).
x_test_scaled.std(axis=0)

array([1.07208641, 0.98960742, 1.85372475, 2.20160899, 1.28631757,
       7.36927594, 1.02441622, 1.03173272])

In [117]:
# Refit the same knn_reg estimator on the standardized training features.
# NOTE: this replaces the earlier fit on the unscaled features, so from here
# on knn_reg must be queried with scaled inputs (x_train_scaled / x_test_scaled);
# predictions and neighbors computed before this cell came from the unscaled fit.
knn_reg.fit(x_train_scaled, y_train)

In [118]:
# Neighbor distances and indexes for every test sample in the scaled feature space.
nn_dist_test_scaled, nn_idx_test_scaled = knn_reg.kneighbors(
    X=x_test_scaled, return_distance=True
)