In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [2]:
wisconsin_data = pd.read_csv('breast-cancer-wisconsin.csv', names=['Sample_code_number','Clump_Thickness','Uniformity_of_Cell_Size','Uniformity_of_Cell_Shape','Marginal_Adhesion','Single_Epithelial_Cell_Size','Bare_Nuclei','Bland_Chromatin','Normal_Nucleoli','Mitoses','Class'])
wisconsin_data = wisconsin_data.drop(columns=['Sample_code_number'])

In [3]:
cleaned_wisconsin_data = wisconsin_data[wisconsin_data.get('Bare_Nuclei') != '?']
cleaned_wisconsin_data = cleaned_wisconsin_data.assign(Bare_Nuclei=cleaned_wisconsin_data.get('Bare_Nuclei').astype(int))
sample = cleaned_wisconsin_data.sample(1000, replace=True)
cleaned_wisconsin_data = pd.concat([cleaned_wisconsin_data, sample])
cleaned_wisconsin_data

Unnamed: 0,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
656,5,1,1,1,2,1,2,1,1,2
168,3,1,1,1,2,1,3,1,1,2
244,1,1,1,1,2,1,3,1,1,2
614,2,1,1,1,1,1,2,1,1,2


In [4]:
shuffled = cleaned_wisconsin_data.sample(cleaned_wisconsin_data.shape[0], replace=False)

In [5]:
training = shuffled.iloc[:1000].reset_index().drop(columns=['index'])
training

Unnamed: 0,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,5,1,1,3,2,1,1,1,1,2
1,8,5,6,2,3,10,6,6,1,4
2,3,1,1,1,2,1,2,1,1,2
3,10,7,7,3,8,5,7,4,3,4
4,8,8,7,4,10,10,7,8,7,4
...,...,...,...,...,...,...,...,...,...,...
995,4,1,4,1,2,1,1,1,1,2
996,2,1,1,1,2,1,1,1,5,2
997,5,1,1,1,2,1,1,1,1,2
998,6,10,10,10,8,10,10,10,7,4


In [6]:
test = shuffled.iloc[1000:].reset_index().drop(columns=['index'])
test

Unnamed: 0,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,6,1,1,1,2,1,3,1,1,2
1,7,4,6,4,6,1,4,3,1,4
2,1,1,1,1,1,1,3,1,1,2
3,4,10,4,7,3,10,9,10,1,4
4,4,1,1,1,2,1,1,1,1,2
...,...,...,...,...,...,...,...,...,...,...
678,2,5,7,6,4,10,7,6,1,4
679,10,7,7,3,8,5,7,4,3,4
680,4,1,1,1,2,1,2,1,1,2
681,10,6,6,2,4,10,9,7,1,4


## Distances

In [7]:
def distance(row1, row2):
    return np.sqrt(np.sum((np.array(row1.astype(int)) - np.array(row2.astype(int)))**2))
def manhattan(row1, row2):
    return np.sum(abs(np.array(row1.astype(int) - np.array(row2.astype(int)))))

In [8]:
def nearest_neighbors(ex, training):
    distances = np.array([])
    for index, row in training.iterrows():
        distances = np.append(distances, distance(ex, row))
    return distances
def nearest_neighbors_man(ex, training):
    manhattans = np.array([])
    for index, row in training.iterrows():
        manhattans = np.append(manhattans, manhattan(ex, row))
    return manhattans


In [9]:
distance(test.iloc[0],test.iloc[1])

8.306623862918075

In [10]:
manhattan(test.iloc[0],test.iloc[1])

21

In [11]:
nearest_neighbors_man(test.iloc[0] , training)

array([ 5., 32.,  4., 39., 54.,  4., 10., 44.,  2.,  4., 36., 43., 15.,
        4., 38., 17., 42., 27., 11., 40.,  4., 47., 51.,  8.,  8.,  8.,
        7.,  4.,  1., 44.,  6.,  4.,  4., 40.,  3.,  2.,  5.,  4.,  7.,
        5., 47.,  4., 60.,  4.,  3.,  5.,  4.,  9., 11., 55., 47., 44.,
        4.,  6., 40.,  7., 63.,  4.,  5., 13., 21., 41.,  3., 29.,  5.,
       29.,  4.,  2.,  6.,  8., 26.,  6., 26.,  7., 32.,  4.,  5., 26.,
        4.,  2., 26., 50.,  6., 15.,  5.,  6.,  6.,  7., 42., 45.,  5.,
        7.,  5., 54.,  8.,  7.,  3.,  3.,  5.,  9.,  6.,  4., 36.,  5.,
        5.,  4.,  6., 29., 54., 36., 33.,  3.,  5.,  6., 33., 30.,  7.,
        4.,  5.,  9.,  6., 17., 34.,  0.,  7.,  9., 27., 48.,  5., 47.,
       48.,  7.,  5., 26.,  2., 16.,  8., 33., 44.,  2.,  4.,  6.,  5.,
       55., 22.,  5.,  8.,  4., 69.,  4., 41.,  5.,  2.,  5., 36.,  5.,
        6.,  6., 30., 61.,  3., 45.,  2.,  9., 47., 10., 47.,  4.,  2.,
        7.,  3.,  3.,  5., 17.,  3., 34.,  4.,  2., 28., 32.,  3

In [12]:
test_unknown = test.drop(columns=['Class'])
test_unknown

Unnamed: 0,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses
0,6,1,1,1,2,1,3,1,1
1,7,4,6,4,6,1,4,3,1
2,1,1,1,1,1,1,3,1,1
3,4,10,4,7,3,10,9,10,1
4,4,1,1,1,2,1,1,1,1
...,...,...,...,...,...,...,...,...,...
678,2,5,7,6,4,10,7,6,1
679,10,7,7,3,8,5,7,4,3
680,4,1,1,1,2,1,2,1,1
681,10,6,6,2,4,10,9,7,1


In [13]:
def classify(ex, training, k):
    temp_training = training.drop(columns=['Class'])
    distances = nearest_neighbors(ex,temp_training)
    temp = temp_training.assign(distances=distances).assign(Class=training['Class']).sort_values(by='distances')
    most = temp.head(k).groupby('Class').count().sort_values(by='Clump_Thickness').index[-1]
    return most
def classify_man(ex, training, k):
    temp_training = training.drop(columns=['Class'])
    distances = nearest_neighbors_man(ex,temp_training)
    temp = temp_training.assign(distances=distances).assign(Class=training['Class']).sort_values(by='distances')
    most = temp.head(k).groupby('Class').count().sort_values(by='Clump_Thickness').index[-1]
    return most
classify(test_unknown.iloc[0], training, 3)

2

In [14]:
def accuracy(k):
    correct = 0
    for index, row in test_unknown.iterrows():
        if classify(row, training, k) == test['Class'].iloc[index]:
            correct = correct + 1
    return correct / test.shape[0]
def accuracy_man(k):
    correct = 0
    for index, row in test_unknown.iterrows():
        if classify_man(row, training, k) == test['Class'].iloc[index]:
            correct = correct + 1
    return correct / test.shape[0]

In [15]:
k_5 = accuracy(5)
k_5

0.986822840409956

In [16]:
k_5_man = accuracy_man(5)
k_5_man

0.9780380673499268