In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_moons
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from numpy import linalg
from scipy.spatial import distance


In [None]:
# Read the diabetes dataset from the notebook's input directory.
df = pd.read_csv(filepath_or_buffer='../input/diabetes.csv')

# Preview the first five rows of the dataframe.
df.head(n=5)

In [None]:
# Display the dimensions of the dataframe as a (rows, columns) tuple.
df.shape

In [None]:
# Feature matrix: every column except the 'Outcome' target, as a NumPy array.
X = df.drop(columns='Outcome').to_numpy()
# Target vector: the 'Outcome' column, as a NumPy array.
y = df['Outcome'].to_numpy()

Let's split the data randomly into a training set and a test set.

We will fit/train a classifier on the training set and make predictions on the test set. Then we will compare the predictions with the known labels.

Scikit-learn provides the `train_test_split` function for splitting data into training and test sets.

In [None]:
#importing train_test_split
from sklearn.model_selection import train_test_split

It is best practice to split the data so that the split reflects the distribution of labels in the data. In other words, we want the labels to be distributed in the training and test sets as they are in the original dataset, so we use the `stratify` argument.

We also hold out about 33% of the dataset as the test set (`test_size=0.33`).

In [None]:
# Stratified hold-out split: 33% of the rows go to the test set, class
# proportions follow `y`, and the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y,
)

In [None]:
class MyKNeighborsClassifier:
    """Brute-force k-nearest-neighbours classifier.

    Every prediction measures the distance from the query row to all stored
    training rows, keeps the ``k`` closest ones and lets them vote.

    Parameters
    ----------
    k : int
        Number of neighbours that vote; must be a positive integer.
    metric : {'euclidean', 'manhattan'}, default 'euclidean'
        Distance used to rank the training rows.
    weights : {'uniform', 'distance'}, default 'uniform'
        'uniform' gives every neighbour one vote.  'distance' weights each
        vote by the inverse of its distance; if any neighbour lies at
        distance zero, only the zero-distance neighbours vote.

    Raises
    ------
    ValueError
        If ``k`` is not a positive integer, or ``metric`` / ``weights`` is
        not one of the supported names.

    Notes
    -----
    Neighbours are ranked by ``(distance, label)``, so equidistant rows are
    ordered by label.  When several labels collect the same vote total, the
    label whose first supporter is closest to the query wins.  Other
    implementations may resolve such ties differently, which matters for
    even ``k`` on two-class data.
    """

    _METRICS = ('euclidean', 'manhattan')
    _WEIGHTS = ('uniform', 'distance')

    def __init__(self, k, metric='euclidean', weights='uniform'):
        # np.integer is accepted so that values taken from np.arange work.
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        # Reject unknown names instead of silently falling back to another
        # metric, which would hide typos such as 'euclidian'.
        if metric not in self._METRICS:
            raise ValueError(
                f"metric must be one of {self._METRICS}, got {metric!r}")
        if weights not in self._WEIGHTS:
            raise ValueError(
                f"weights must be one of {self._WEIGHTS}, got {weights!r}")
        self.k = int(k)
        self.metric = metric
        self.weights = weights

    def euclidean_dist(self, array1, array2):
        """Return the Euclidean (L2) distance between two vectors."""
        return linalg.norm(np.asarray(array1) - np.asarray(array2))

    def manhattan_dist(self, array1, array2):
        """Return the Manhattan (L1, city-block) distance between two vectors."""
        return distance.cityblock(np.asarray(array1), np.asarray(array2))

    def _distances_to(self, test_row):
        """Return the distances from ``test_row`` to every training row."""
        train = np.asarray(self.X_train, dtype=float)
        # Flatten the per-sample axes so that 1-D feature arrays (one scalar
        # per sample) are handled the same way as 2-D feature matrices.
        diffs = (train - np.asarray(test_row, dtype=float)).reshape(len(train), -1)
        if self.metric == 'euclidean':
            return linalg.norm(diffs, axis=1)
        return np.abs(diffs).sum(axis=1)

    def k_neighbors(self, test_row):
        """Return the ``k`` nearest training rows as ``(distance, label)`` tuples.

        The list is ordered by distance, with ties ordered by label.  If
        fewer than ``k`` training rows are stored, all of them are returned.
        """
        dists = self._distances_to(test_row)
        labels = np.asarray(self.y_train)
        # lexsort uses its last key as the primary one: distance first,
        # label second - the same order as sorting (distance, label) tuples.
        nearest = np.lexsort((labels, dists))[:self.k]
        return list(zip(dists[nearest], labels[nearest]))

    def get_nn(self):
        """Return the neighbour list of every row in ``self.X_test``.

        Reads ``self.X_test`` (set by ``predict``) and the training data
        stored by ``fit``.

        Raises
        ------
        AttributeError
            If the training or query data has not been set yet.
        ValueError
            If query rows and training rows have different shapes.
        """
        missing = [name for name in ('X_train', 'y_train', 'X_test')
                   if getattr(self, name, None) is None]
        if missing:
            raise AttributeError(
                "call fit() and predict() first; missing: " + ", ".join(missing))

        self.X_train = np.asarray(self.X_train, dtype=float)
        self.X_test = np.asarray(self.X_test, dtype=float)
        self.y_train = np.asarray(self.y_train)

        # Without this check a query row of the wrong width could be
        # broadcast against the training rows and give meaningless distances.
        if len(self.X_test) and self.X_test.shape[1:] != self.X_train.shape[1:]:
            raise ValueError(
                f"X_test rows have shape {self.X_test.shape[1:]}, "
                f"but training rows have shape {self.X_train.shape[1:]}")

        return [self.k_neighbors(row) for row in self.X_test]

    def vote_count(self, lst):
        """Count occurrences of each element of ``lst``.

        Returns a dict whose keys appear in order of first occurrence.
        """
        lst_count = {}
        for element in lst:
            lst_count[element] = lst_count.get(element, 0) + 1
        return lst_count

    def _weighted_votes(self, dists, labels):
        """Sum inverse-distance weights per label, in order of first occurrence."""
        dists = np.asarray(dists, dtype=float)
        at_zero = dists == 0
        if at_zero.any():
            # An exact match would get infinite weight; let only the exact
            # matches vote instead of dividing by zero.
            vote_weights = at_zero.astype(float)
        else:
            vote_weights = 1.0 / dists
        totals = {}
        for label, weight in zip(labels, vote_weights):
            totals[label] = totals.get(label, 0.0) + weight
        return totals

    def fit(self, X_train, y_train):
        """Store the training data.

        Raises
        ------
        ValueError
            If the training set is empty or ``X_train`` and ``y_train``
            have different lengths.
        """
        X_train = np.asarray(X_train, dtype=float)
        y_train = np.asarray(y_train)
        if len(X_train) == 0:
            raise ValueError("X_train must contain at least one sample")
        if len(X_train) != len(y_train):
            raise ValueError(
                f"X_train has {len(X_train)} samples but y_train has "
                f"{len(y_train)} labels")
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """Return a list with the predicted label of every row of ``X_test``."""
        self.X_test = X_test

        predictions = []
        for row in self.get_nn():
            dists, labels = zip(*row)
            if self.weights == 'distance':
                votes = self._weighted_votes(dists, labels)
            else:
                votes = self.vote_count(labels)
            # max() returns the first maximal key, i.e. on a tie the label
            # that was seen first among the distance-ordered neighbours.
            predictions.append(max(votes, key=votes.get))

        return predictions

    def evaluate(self, y_pred, y_test):
        """Return the fraction of predictions that equal the true labels.

        Raises
        ------
        ValueError
            If the inputs are empty or differ in length.
        """
        if len(y_pred) != len(y_test):
            raise ValueError(
                f"y_pred has {len(y_pred)} entries but y_test has {len(y_test)}")
        if len(y_test) == 0:
            raise ValueError("cannot compute accuracy of an empty test set")

        hits = sum(1 for pred, act in zip(y_pred, y_test) if pred == act)
        return hits / len(y_test)

### Проверьте решение на датасете и сравните с kNeighborsClassifier из sklearn (4 балла)
1. (1 балл) Выведите accuracy_score для вашего решения и для решения из sklearn, посчитайте в 4 вариациях (Должно получиться 8 чисел)
  1. Параметры по умолчанию
  2. `weights='distance'`
  3. `metric='manhattan'`
  4. `weights='distance'`, `metric='manhattan'`
2. (2 балла) Переберите параметр k от 1 до 10 для каждой модели из пункта выше (получится 4 графика по две линии на каждом)
  1. Переберите параметр, каждый раз обучайте модель
  2. Выведите график зависимости `accuracy` от `k`
  3. На этом же графике выведите пунктирной линией такую же зависимость для модели из sklearn
3. (3 балла) Напишите вывод, сравнение всего, что получилось – получилось ли у вас достичь таких же результатов, как в sklearn, как на результат влияют параметры, какая модель и с какими параметрами оказалась лучшей.

In [None]:
# Number of neighbours shared by the single-k comparison cells.
k = 4

# Own implementation with the default (euclidean) metric.
knn = MyKNeighborsClassifier(k=k)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
knn.evaluate(y_pred=y_pred, y_test=y_test)

In [None]:
# Reference result: scikit-learn's classifier with the same number of
# neighbours and its default settings (fit() returns the estimator itself).
knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
knn.score(X_test, y_test)

In [None]:
# Own implementation again, this time ranking neighbours by Manhattan distance.
knn = MyKNeighborsClassifier(k=k, metric='manhattan')
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
knn.evaluate(y_pred=y_pred, y_test=y_test)

In [None]:
# Reference result: scikit-learn's classifier with the Manhattan metric
# (fit() returns the estimator itself).
knn = KNeighborsClassifier(n_neighbors=k, metric='manhattan').fit(X_train, y_train)
knn.score(X_test, y_test)

In [None]:


# Neighbourhood sizes to try and the test accuracy measured for each one.
neighbors = np.arange(1, 11)
test_accuracy = np.empty(len(neighbors))

# The loop variable is deliberately not named `k`, so the notebook-level `k`
# used by the single-k comparison cells is not overwritten when this cell runs.
for i, n_neighbors in enumerate(neighbors):
    # Own classifier with the default (euclidean) metric.
    knn = MyKNeighborsClassifier(n_neighbors)

    # Fit on the training split and predict the test split.
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    # Accuracy on the test set.
    test_accuracy[i] = knn.evaluate(y_pred, y_test)

In [None]:
#import KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

# Neighbourhood sizes to try and scikit-learn's test accuracy for each one.
neighbors = np.arange(1, 11)
test_accuracy_sklearn = np.empty(len(neighbors))

# The loop variable is deliberately not named `k`, so the notebook-level `k`
# used by the single-k comparison cells is not overwritten when this cell runs.
for i, n_neighbors in enumerate(neighbors):
    # scikit-learn classifier with its default metric.
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)

    # Fit on the training split.
    knn.fit(X_train, y_train)

    # Accuracy on the test set.
    test_accuracy_sklearn[i] = knn.score(X_test, y_test)

In [None]:
# Accuracy versus number of neighbours for the euclidean metric.
plt.title('euclidean k-NN  Varying number of neighbors')
plt.plot(neighbors, test_accuracy, label='My Accuracy')
# The scikit-learn curve is dashed so it stays distinguishable where the two
# curves coincide.
plt.plot(neighbors, test_accuracy_sklearn, linestyle='--', label='Sklearn accuracy')

plt.legend()
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')
plt.show()

In [None]:


# Neighbourhood sizes to try and the test accuracy measured for each one.
neighbors = np.arange(1, 11)
test_accuracy = np.empty(len(neighbors))

# The loop variable is deliberately not named `k`, so the notebook-level `k`
# used by the single-k comparison cells is not overwritten when this cell runs.
for i, n_neighbors in enumerate(neighbors):
    # Own classifier ranking neighbours by Manhattan distance.
    knn = MyKNeighborsClassifier(n_neighbors, 'manhattan')

    # Fit on the training split and predict the test split.
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    # Accuracy on the test set.
    test_accuracy[i] = knn.evaluate(y_pred, y_test)

In [None]:
#import KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

# Neighbourhood sizes to try and scikit-learn's test accuracy for each one.
neighbors = np.arange(1, 11)
test_accuracy_sklearn = np.empty(len(neighbors))

# The loop variable is deliberately not named `k`, so the notebook-level `k`
# used by the single-k comparison cells is not overwritten when this cell runs.
for i, n_neighbors in enumerate(neighbors):
    # scikit-learn classifier with the Manhattan metric.
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, metric='manhattan')

    # Fit on the training split.
    knn.fit(X_train, y_train)

    # Accuracy on the test set.
    test_accuracy_sklearn[i] = knn.score(X_test, y_test)

In [None]:
# Accuracy versus number of neighbours for the manhattan metric.
plt.title('manhattan k-NN  Varying number of neighbors')
plt.plot(neighbors, test_accuracy, label='My Accuracy')
# The scikit-learn curve is dashed so it stays distinguishable where the two
# curves coincide.
plt.plot(neighbors, test_accuracy_sklearn, linestyle='--', label='Sklearn accuracy')

plt.legend()
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')
plt.show()

# Вывод 
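Sklearn работает явно лучше, хотя при metric = 'manhattan' и k = 4, 6 и 8 моя модель оказалась лучше, и я не понимаю, почему. Вероятная причина — при чётном k голоса двух классов могут разделиться поровну, а моя реализация и sklearn, по-видимому, по-разному разрешают такие ничьи.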
Евклидово расстояние выводит результат лучше - 0.758, чем второе - 0.74.