## 1. Nearest Neighbours with user-defined distances

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
# Columns 0-33 of each row are read as the features, column 34 as the integer label.
X = np.genfromtxt("ionosphere.txt", usecols=np.arange(34), delimiter=",")
y = np.genfromtxt("ionosphere.txt", usecols=34, dtype='int', delimiter=",")
# A fixed random_state keeps the train/test split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [2]:
from sklearn.neighbors import KNeighborsClassifier
# Baseline: 1-nearest-neighbour with the default metric (Minkowski, p=2).
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform')

In [8]:
# Accuracy of the baseline classifier on the held-out test split.
knn.score(X_test, y_test)

0.8522727272727273

In [9]:
def my_dist(x, y):
    """Return the squared Euclidean distance between the vectors x and y."""
    difference = x - y
    return np.sum(np.square(difference))
# Same 1-NN classifier, but with the user-defined callable my_dist as its metric.
knn = KNeighborsClassifier(n_neighbors=1, metric=my_dist)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30,
           metric=<function my_dist at 0x000001F4FA3F6620>,
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform')

In [10]:
# Test accuracy computed by hand: fraction of test samples whose prediction equals the label.
np.mean(knn.predict(X_test)==y_test)

0.8522727272727273

In [11]:
# 1-NN with Minkowski exponent p=1 instead of the default p=2.
knn = KNeighborsClassifier(n_neighbors=1, p=1)
knn.fit(X_train, y_train)
# Fraction of test samples predicted correctly.
np.mean(knn.predict(X_test)==y_test)

0.9204545454545454

#### Exercise 1

In [12]:
from sklearn.model_selection import GridSearchCV
# Candidate Minkowski exponents p = 1, ..., 10 for the 1-NN classifier.
parameters = {'p': list(range(1, 11))}
grid_search = GridSearchCV(estimator=KNeighborsClassifier(n_neighbors=1),
                           param_grid=parameters)
# Cross-validate every candidate on the training split only.
grid_search.fit(X_train, y_train)
# Test-set score of the model selected by the search.
grid_search.score(X_test, y_test)



0.9204545454545454

In [26]:
# Dump the raw cross-validation results (timings and per-split scores for each p).
print(grid_search.cv_results_)

{'mean_fit_time': array([0.00050433, 0.00072424, 0.00034841, 0.00202743, 0.00134341,
       0.        , 0.00135199, 0.00033379, 0.00067687, 0.00171367]), 'std_fit_time': array([0.00071324, 0.00051384, 0.00049272, 0.00143884, 0.00189987,
       0.        , 0.00191201, 0.00047204, 0.00047883, 0.0017646 ]), 'mean_score_time': array([0.00228707, 0.00233316, 0.03407001, 0.02933534, 0.03182308,
       0.03311706, 0.03091351, 0.026395  , 0.02900887, 0.03073215]), 'std_score_time': array([0.00042957, 0.00123186, 0.00216936, 0.00205618, 0.00112705,
       0.00108569, 0.00226308, 0.00483952, 0.00266355, 0.00124018]), 'param_p': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
             mask=[False, False, False, False, False, False, False, False,
                   False, False],
       fill_value='?',
            dtype=object), 'params': [{'p': 1}, {'p': 2}, {'p': 3}, {'p': 4}, {'p': 5}, {'p': 6}, {'p': 7}, {'p': 8}, {'p': 9}, {'p': 10}], 'split0_test_score': array([0.86516854, 0.83146067,

In [23]:
# Parameter setting selected by the search, followed by its best_score_.
print(grid_search.best_params_)
print(grid_search.best_score_)

{'p': 1}
0.8859315589353612


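The best value of p for Nearest Neighbour is 1. The mean cross-validation accuracy for p = 1 is 0.8859315589353612 (i.e. an error rate of about 0.114), and the accuracy on the test set is 0.9204545454545454.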

## 2. Kernel methods

In [13]:
def poly_kernel(x, y, d):
  """Polynomial kernel of degree d: (1 + <x, y>) raised to the power d."""
  inner_product = np.dot(x, y)
  return (1 + inner_product) ** d
d = 2  # degree of the polynomial kernel being tried
def poly_dist(x, y):
  """Squared distance between x and y in the polynomial-kernel feature space."""
  k_xx = poly_kernel(x, x, d)
  k_yy = poly_kernel(y, y, d)
  k_xy = poly_kernel(x, y, d)
  return k_xx + k_yy - 2*k_xy
knn = KNeighborsClassifier(metric=poly_dist, n_neighbors=1)
knn.fit(X_train, y_train)
# Test accuracy: share of test samples whose predicted label equals the true one.
np.mean(knn.predict(X_test) == y_test)

0.8863636363636364

In [14]:
def rbf_kernel(x, y, gamma):
  """RBF kernel: exp(-gamma * squared Euclidean distance between x and y)."""
  squared_norm = np.sum(np.square(x - y))
  return np.exp(-gamma * squared_norm)
gamma = 10  # width parameter of the rbf kernel
def rbf_dist(x, y):
  """Squared distance between x and y in the RBF-kernel feature space."""
  k_xx = rbf_kernel(x, x, gamma)
  k_yy = rbf_kernel(y, y, gamma)
  k_xy = rbf_kernel(x, y, gamma)
  return k_xx + k_yy - 2*k_xy
knn = KNeighborsClassifier(metric=rbf_dist, n_neighbors=1)
knn.fit(X_train, y_train)
# Test accuracy: share of test samples whose predicted label equals the true one.
np.mean(knn.predict(X_test) == y_test)

0.9659090909090909

In [16]:
from sklearn.model_selection import cross_val_score
def make_rbf_dist(gamma):
  """Return the squared RBF-kernel distance function for a fixed gamma.

  gamma is bound by value when the factory is called, so the returned
  function does not change if a global of the same name is rebound later.
  """
  def rbf_dist(x, y):  # squared distance
    return rbf_kernel(x,x,gamma) + rbf_kernel(y,y,gamma)\
      - 2*rbf_kernel(x,y,gamma)
  return rbf_dist

# Start below every attainable accuracy so that a candidate is recorded even
# if all cross-validation accuracies are 0 (best_gamma would otherwise be unbound).
best_score = -np.inf
for gamma in [0.01, 0.1, 1, 10, 100]:
  # for each parameter, train a model
  knn = KNeighborsClassifier(n_neighbors=1, metric=make_rbf_dist(gamma))
  # perform cross-validation
  scores = cross_val_score(knn, X_train, y_train, cv=5)
  # compute mean cross-validation accuracy
  score = np.mean(scores)
  # if we got a better score, store the score and parameters
  if score > best_score:
    best_score = score
    best_gamma = gamma
# rebuild a model on the full training set
rbf_dist = make_rbf_dist(best_gamma)
knn = KNeighborsClassifier(n_neighbors=1, metric=rbf_dist)
knn.fit(X_train, y_train)
test_score = knn.score(X_test, y_test)
print("Best CV score:", best_score)
print("Best parameter gamma:", best_gamma)
print("Test set score with best parameters:", test_score)

Best CV score: 0.8741654571843253
Best parameter gamma: 10
Test set score with best parameters: 0.9659090909090909


## 3. Creating your own estimator

In [17]:
class My_Classifier(KNeighborsClassifier):
  """Minimal custom estimator: a thin pass-through wrapper around KNeighborsClassifier."""
  def __init__(self, n_neighbors=1):
    super().__init__(n_neighbors=n_neighbors)
  def fit(self, X, y):
    """Fit the underlying nearest-neighbours model on (X, y) and return self."""
    super().fit(X, y)
    return self
  def predict(self, X, y=None):
    """Predict labels for X; the y argument is accepted but not used."""
    return super().predict(X)
  def score(self, X, y):
    """Return the parent class's score for predictions on X against y."""
    return super().score(X, y)

In [18]:
# Use the custom estimator with its default n_neighbors=1 and score it on the test split.
knn = My_Classifier()
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

0.8522727272727273

In [19]:
class rbfClassifier(KNeighborsClassifier):
  """Kernel K Nearest Neighbours classifier based on the RBF kernel.

  Neighbours are ranked by the distance that the RBF kernel induces in its
  feature space, sqrt(k(x,x) + k(y,y) - 2*k(x,y)).

  Parameters
  ----------
  n_neighbors : int, default=1
      Number of neighbours used for prediction.
  gamma : float, default=1
      Parameter passed to rbf_kernel.
  """
  def __init__(self, n_neighbors=1, gamma=1):
    # The metric is a bound method that reads self.gamma on every call, so a
    # later set_params(gamma=...) (e.g. from GridSearchCV) takes effect instead
    # of being ignored by a closure frozen at construction time.
    KNeighborsClassifier.__init__(self, n_neighbors=n_neighbors,
      metric=self._rbf_dist)
    self.gamma = gamma
    self.n_neighbors=n_neighbors
  def _rbf_dist(self, x, y):
    """Distance between x and y in the RBF-kernel feature space."""
    gamma = self.gamma
    squared = rbf_kernel(x,x,gamma) + rbf_kernel(y,y,gamma)\
      - 2*rbf_kernel(x,y,gamma)
    # The square root turns the squared distance into a true metric (it then
    # satisfies the triangle inequality), which tree-based neighbour searches
    # require; the ordering of neighbours is the same as for the squared value.
    return np.sqrt(squared)
  def fit(self, X, y):
    """Fit the nearest-neighbours model on (X, y) and return self."""
    KNeighborsClassifier.fit(self, X, y)
    return self
  def predict(self, X, y=None):
    """Predict labels for X; the y argument is accepted but not used."""
    return KNeighborsClassifier.predict(self, X)
  def score(self, X, y):
    """Return the parent class's score for predictions on X against y."""
    return KNeighborsClassifier.score(self, X, y)

In [20]:
# Kernel 1-NN with gamma=10, scored on the test split.
knn = rbfClassifier(n_neighbors=1, gamma=10)
knn.fit(X_train, y_train)
knn.score(X_test,y_test)

0.9659090909090909

## 4. Uncertainty estimates for Nearest Neighbours

In [43]:
from sklearn.datasets import load_iris
iris = load_iris()
# Rebinds X_train/X_test/y_train/y_test: from here on they hold the iris split.
X_train, X_test, y_train, y_test = train_test_split(iris.data,
  iris.target, random_state=0)
# Classifier with default parameters (no n_neighbors given).
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
# Predicted class label for every test sample.
knn.predict(X_test)

array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 2])

In [44]:
# Per-class probability estimates: one row per test sample, one column per class.
knn.predict_proba(X_test)

array([[0. , 0. , 1. ],
       [0. , 1. , 0. ],
       [1. , 0. , 0. ],
       [0. , 0. , 1. ],
       [1. , 0. , 0. ],
       [0. , 0. , 1. ],
       [1. , 0. , 0. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [0. , 0. , 1. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [0. , 0.6, 0.4],
       [1. , 0. , 0. ],
       [0. , 0.8, 0.2],
       [0. , 1. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0. , 0. , 1. ],
       [0. , 1. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0. , 0.2, 0.8],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [1. , 0. , 0. ],
       [0. , 0. , 1. ],
       [0. , 1. , 0. ],
       [1. , 0. , 0. ],
       [0. , 0.2, 0.8],
       [0. , 0. , 1. ],
       [0. , 1. , 0. ],
       [1. , 0. , 0. ],
       [0. , 0. , 1. ]])

#### Exercise 2

In [46]:
# Keep the probability array so the predictions can be derived from it.
A = knn.predict_proba(X_test)

In [49]:
# argmax gives the column index of the largest probability in each row;
# indexing classes_ turns that column index into the actual class label.
# A dedicated name is used so the feature matrix X loaded earlier is not overwritten.
y_pred_from_proba = knn.classes_[np.argmax(A, axis=1)]
y_pred_from_proba

array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 2], dtype=int64)

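The array of predictions can be recovered from the probability array with NumPy's `argmax` function: for each row it returns the index of the column with the largest probability, and for iris that column index coincides with the class label.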

#### Exercise 3

In [51]:
class rbfClassifier(KNeighborsClassifier):
  """Kernel K Nearest Neighbours classifier based on the RBF kernel.

  Neighbours are ranked by the distance that the RBF kernel induces in its
  feature space, sqrt(k(x,x) + k(y,y) - 2*k(x,y)).

  Parameters
  ----------
  n_neighbors : int, default=1
      Number of neighbours used for prediction.
  gamma : float, default=1
      Parameter passed to rbf_kernel.
  """
  def __init__(self, n_neighbors=1, gamma=1):
    # The metric is a bound method that reads self.gamma on every call, so a
    # later set_params(gamma=...) (e.g. from GridSearchCV) takes effect instead
    # of being ignored by a closure frozen at construction time.
    KNeighborsClassifier.__init__(self, n_neighbors=n_neighbors,
      metric=self._rbf_dist)
    self.gamma = gamma
    self.n_neighbors=n_neighbors
  def _rbf_dist(self, x, y):
    """Distance between x and y in the RBF-kernel feature space."""
    gamma = self.gamma
    squared = rbf_kernel(x,x,gamma) + rbf_kernel(y,y,gamma)\
      - 2*rbf_kernel(x,y,gamma)
    # The square root turns the squared distance into a true metric (it then
    # satisfies the triangle inequality), which tree-based neighbour searches
    # require; the ordering of neighbours is the same as for the squared value.
    return np.sqrt(squared)
  def fit(self, X, y):
    """Fit the nearest-neighbours model on (X, y) and return self."""
    KNeighborsClassifier.fit(self, X, y)
    return self
  def predict(self, X, y=None):
    """Predict labels for X; the y argument is accepted but not used."""
    return KNeighborsClassifier.predict(self, X)
  def predict_proba(self, X, y=None):
    """Return per-class probability estimates for X; y is accepted but not used."""
    return KNeighborsClassifier.predict_proba(self, X)
  def score(self, X, y):
    """Return the parent class's score for predictions on X against y."""
    return KNeighborsClassifier.score(self, X, y)

#### Exercise 4

In [53]:
# Kernel 1-NN (gamma=10) on the iris split; with a single neighbour every
# probability row is one-hot, as the output shows.
knn = rbfClassifier(n_neighbors=1, gamma=10)
knn.fit(X_train, y_train)
knn.predict_proba(X_test)

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])