In [1]:
from sklearn import datasets
from sklearn.model_selection import StratifiedShuffleSplit
import random
import numpy as np

In [2]:
iris = datasets.load_iris()
X  = iris.data
Y = iris.target
random.seed(a=0)
rand_sample = random.sample(range(0, 149), 30)
rand_dimension = []
for sample in rand_sample:
    rand_dimension.append(random.sample(range(0,3),1))
for sample, dimension in zip(rand_sample,rand_dimension):
    X[sample][dimension] = np.nan
sss  = StratifiedShuffleSplit(n_splits=3,test_size=0.33, random_state =0)
for train_index, test_index in sss.split(X,Y):
    x_train,x_test = X[train_index], X[test_index]
    y_train,y_test = Y[train_index], Y[test_index]

In [3]:
#deal with nans
x_train = np.nan_to_num(x_train)
x_test = np.nan_to_num(x_test)

In [4]:
class KNN(object):
    '''
    This is is a K-Nearest-Neighbor Classifier(KNN) written by hand.
    It only depends on numpy. 
    This will be simple KNN algo that weights the classification 
    by euclidiean distiance 
    
    METHODS
    Initializer: requires k, which is how many neighbors to use in the classification
    train: stores training data (training the model)
    predict: makes a prediction for a data point
    
    '''
    def __init__(self,k):
    # k = number of neighbours 
    # train_ = training set
    # predictions = predictions of classification 
        self.k = k
        self.train_data = 0
        self.predictions = []
        
    def train(self,x_data,y_data):
    # this trains the KNN model
    # training for KNN is just storing the training data
        self.train_data = [x_data,y_data]
    
    def predict(self,test_data):
        for vect in test_data:
            distance = np.linalg.norm(self.train_data[0] - vect,axis=1)
            
            #this is a sorting algo so I can keep the labels and the features together
            distance_target = np.rec.fromarrays([distance, self.train_data[1]])
            distance_target.sort()
            
            #take the k nearest
            k_distances = distance_target.f0[0:self.k]
            k_labels = distance_target.f1[0:self.k]
            
            #distance algo
            total_dist = np.sum(k_distances)
            k_dist_weight = np.divide(k_distances,float(total_dist))
            unique_labels = np.unique(k_labels)
            label_dict = {i:0 for i in unique_labels}
            label_dict_count = {i:0 for i in unique_labels}
            
            #add up the total distance away for each label, find average distance of each label type
            #and select the label with the smallest average distance
            for label,weight in zip(k_labels,k_dist_weight):
                label_dict[label] += weight
                label_dict_count[label] +=1
            for label in label_dict:
                label_dict[label] = label_dict[label]/float(label_dict_count[label])
            labels = label_dict.keys()
            values = label_dict.values()
            labels_values = np.rec.fromarrays([values,labels])
            labels_values.sort()
            
            #predict the lowest overall distance
            self.predictions.append(labels_values.f1[0])            

In [5]:
KNN = KNN(5)

In [6]:
KNN.train(x_train,y_train)

In [7]:
KNN.predict(x_test)

In [8]:
#accuracy
sum(y_test == KNN.predictions)/float(len(y_test))

0.92000000000000004