In [17]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [18]:
wisconsin_data = pd.read_csv('breast-cancer-wisconsin.csv', names=['Sample_code_number','Clump_Thickness','Uniformity_of_Cell_Size','Uniformity_of_Cell_Shape','Marginal_Adhesion','Single_Epithelial_Cell_Size','Bare_Nuclei','Bland_Chromatin','Normal_Nucleoli','Mitoses','Class'])
wisconsin_data = wisconsin_data.drop(columns=['Sample_code_number'])

In [19]:
cleaned_wisconsin_data = wisconsin_data[wisconsin_data.get('Bare_Nuclei') != '?']
cleaned_wisconsin_data = cleaned_wisconsin_data.assign(Bare_Nuclei=cleaned_wisconsin_data.get('Bare_Nuclei').astype(int))
sample = cleaned_wisconsin_data.sample(1000, replace=True)
cleaned_wisconsin_data = pd.concat([cleaned_wisconsin_data, sample])
cleaned_wisconsin_data

Unnamed: 0,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
656,5,1,1,1,2,1,2,1,1,2
422,4,3,3,1,2,1,3,3,1,2
301,1,1,1,1,2,1,3,1,1,2
654,3,1,1,1,2,1,3,1,1,2


In [20]:
shuffled = cleaned_wisconsin_data.sample(cleaned_wisconsin_data.shape[0], replace=False)

In [35]:
training = shuffled.iloc[:1000].reset_index().drop(columns=['index'])
training

Unnamed: 0,Clump_Thickness,Uniformity_of_Cell_Size,Class
0,1,1,2
1,7,3,4
2,4,8,4
3,1,1,2
4,5,1,2
...,...,...,...
995,10,10,4
996,9,6,4
997,3,1,2
998,6,3,2


In [34]:
test = shuffled.iloc[1000:].reset_index().drop(columns=['index'])
test

Unnamed: 0,Clump_Thickness,Uniformity_of_Cell_Size,Class
0,2,1,2
1,1,1,2
2,6,1,2
3,10,4,4
4,3,4,4
...,...,...,...
678,3,1,2
679,5,8,4
680,3,1,2
681,10,7,4


## Distances

In [23]:
def distance(row1, row2):
    return np.sqrt(np.sum((np.array(row1.astype(int)) - np.array(row2.astype(int)))**2))
def manhattan(row1, row2):
    return np.sum(abs(np.array(row1.astype(int) - np.array(row2.astype(int)))))

In [24]:
def nearest_neighbors(ex, training):
    distances = np.array([])
    for index, row in training.iterrows():
        distances = np.append(distances, distance(ex, row))
    return distances
def nearest_neighbors_man(ex, training):
    manhattans = np.array([])
    for index, row in training.iterrows():
        manhattans = np.append(manhattans, manhattan(ex, row))
    return manhattans


In [25]:
distance(test.iloc[0],test.iloc[1])

1.0

In [26]:
manhattan(test.iloc[0],test.iloc[1])

1

In [27]:
nearest_neighbors_man(test.iloc[0] , training)

array([ 1.,  7.,  4.,  1.,  3.,  2.,  1.,  1.,  4., 10.,  7.,  8.,  1.,
        2.,  1.,  3.,  0.,  1.,  5.,  6.,  1.,  3.,  7.,  9.,  5.,  1.,
        8.,  8.,  1.,  1.,  1.,  1., 10., 10.,  6.,  1.,  1.,  4.,  0.,
        1.,  3.,  1.,  3.,  2.,  6., 10.,  1.,  1.,  1.,  9.,  1.,  1.,
        1.,  9.,  3.,  2.,  1.,  1., 10.,  2.,  2.,  1., 10.,  0.,  1.,
        5.,  3.,  4.,  6.,  1.,  9.,  0.,  2.,  3.,  4.,  3.,  7.,  2.,
        3.,  2., 10.,  2.,  1.,  3.,  7.,  2.,  3.,  4.,  2.,  3.,  1.,
        1.,  0., 10.,  2.,  7.,  7.,  1.,  1.,  1.,  2.,  1.,  2.,  4.,
       10.,  1.,  1.,  4., 10.,  2.,  1., 10.,  2., 10.,  6.,  8.,  3.,
        3.,  3.,  5.,  0.,  4.,  0.,  5.,  1.,  3.,  1., 10.,  6.,  8.,
       10.,  7.,  2.,  3.,  1.,  3.,  1.,  1.,  3.,  3.,  1.,  2.,  2.,
        8.,  1.,  3.,  1.,  2.,  1., 10., 10.,  0.,  1.,  1.,  8.,  4.,
       10.,  1., 10.,  6.,  0.,  9.,  3., 10., 10.,  3.,  1.,  8.,  5.,
        3.,  1.,  1.,  2.,  2.,  5.,  1.,  1.,  8.,  1.,  0.,  1

In [28]:
test_unknown = test.drop(columns=['Class'])
test_unknown

Unnamed: 0,Clump_Thickness
0,2
1,1
2,6
3,10
4,3
...,...
678,3
679,5
680,3
681,10


In [29]:
def classify(ex, training, k):
    temp_training = training.drop(columns=['Class'])
    distances = nearest_neighbors(ex,temp_training)
    temp = temp_training.assign(distances=distances).assign(Class=training['Class']).sort_values(by='distances')
    most = temp.head(k).groupby('Class').count().sort_values(by='Clump_Thickness').index[-1]
    return most
def classify_man(ex, training, k):
    temp_training = training.drop(columns=['Class'])
    distances = nearest_neighbors_man(ex,temp_training)
    temp = temp_training.assign(distances=distances).assign(Class=training['Class']).sort_values(by='distances')
    most = temp.head(k).groupby('Class').count().sort_values(by='Clump_Thickness').index[-1]
    return most
classify(test_unknown.iloc[0], training, 3)

2

In [30]:
def accuracy(k):
    correct = 0
    for index, row in test_unknown.iterrows():
        if classify(row, training, k) == test['Class'].iloc[index]:
            correct = correct + 1
    return correct / test.shape[0]
def accuracy_man(k):
    correct = 0
    for index, row in test_unknown.iterrows():
        if classify_man(row, training, k) == test['Class'].iloc[index]:
            correct = correct + 1
    return correct / test.shape[0]

In [38]:
k_5 = accuracy(3)
k_5

0.7979502196193266

In [39]:
k_5_man = accuracy_man(3)
k_5_man

ValueError: Length of passed values is 2, index implies 1.