In [1]:
import numpy as np
from collections import Counter
import warnings
import pandas as pd
import random

In [2]:
# Load the Wisconsin breast-cancer data set (path is relative to the notebook).
df = pd.read_csv('../Datasets/breast-cancer-wisconsin.data', sep=',')
# Missing values are encoded as '?' in the file; replace them with a large
# negative sentinel so the column can still be cast to float below.
df = df.replace('?', -9999)
# The 'id' column carries no predictive information. `columns=` is used because
# passing the axis positionally (`df.drop(['id'], 1)`) is rejected by pandas >= 2.0.
df = df.drop(columns=['id'])
# Convert every column (some were read as strings because of '?') to float and
# turn the frame into a plain list of rows.
full_data = df.astype(float).values.tolist()
# Shuffle with a dedicated, seeded generator so a fresh "Restart & Run All"
# reproduces the same split without touching the global `random` state.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(full_data)

In [3]:
test_size = 0.2
# The last `test_size` fraction of the shuffled rows becomes the test data,
# everything before it the train data (80% / 20% with the value above).
# Slicing on an explicit, non-negative index avoids the `[:-0]` / `[-0:]`
# pitfall: when the test count rounds down to 0, `full_data[:-0]` would be
# empty and `full_data[-0:]` would be the whole list.
n_test = int(len(full_data) * test_size)
split_index = len(full_data) - n_test
train_data = full_data[:split_index]
test_data = full_data[split_index:]
# The KNN function takes a dictionary mapping class label -> list of feature
# vectors, so create one per split (labels in this data set are 2 and 4).
train_set = {2: [], 4: []}
test_set = {2: [], 4: []}

In [4]:
# Populate train_set and test_set: the last value of every row is its class
# label, the values before it are the feature vector stored under that label.
for rows, buckets in ((train_data, train_set), (test_data, test_set)):
    for row in rows:
        label, features = row[-1], row[:-1]
        buckets[label].append(features)

In [5]:
def k_nearest_neighbour(data, predict, k=5):
    """Classify ``predict`` by majority vote among its ``k`` nearest neighbours.

    Parameters
    ----------
    data : dict
        Maps each group label to a list of feature vectors of that group.
    predict : sequence of numbers
        Feature vector to classify; it is subtracted element-wise from every
        vector in ``data``, so the lengths have to match.
    k : int, default 5
        Number of nearest neighbours that are allowed to vote.

    Returns
    -------
    tuple
        ``(vote_result, confidence)`` where ``vote_result`` is the label that
        received the most votes and ``confidence`` is the fraction of the cast
        votes that went to that label.

    Warns
    -----
    UserWarning
        If the number of groups in ``data`` is greater than or equal to ``k``.

    Raises
    ------
    IndexError
        If no votes can be cast (``data`` holds no feature vectors).
    """
    # Warn when k is not larger than the number of voting groups.
    if len(data) >= k:
        warnings.warn('K value is less than total voting groups!')

    # Convert the query point once instead of once per training sample.
    target = np.array(predict)

    # Collect [distance, group] for every known sample of every group.
    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(np.array(features) - target)
            distances.append([euclidean_distance, group])

    # Keep only the group labels of the k closest samples.
    votes = [group for _, group in sorted(distances)[:k]]

    # most_common(1) returns a one-element list holding (label, vote count).
    vote_result, vote_count = Counter(votes).most_common(1)[0]

    # Confidence is the winner's share of the votes actually cast; dividing by
    # k would understate it whenever fewer than k samples are available.
    confidence = vote_count / len(votes)
    return (vote_result, confidence)

In [6]:
correct = 0
total = 0
# Walk through the test set one true label at a time ...
for true_label, samples in test_set.items():
    # ... and classify every feature vector of that label against the train set.
    for sample in samples:
        vote, confidence = k_nearest_neighbour(train_set, sample, k=5)
        # A prediction counts as correct when the voted label matches the true
        # label; for a miss, show how confident the classifier was.
        if vote != true_label:
            print(confidence)
        else:
            correct += 1
        total += 1

0.8
0.6


In [7]:
# Accuracy is the percentage of test samples classified correctly. Guard against
# an empty test set, which would otherwise raise ZeroDivisionError.
if total:
    accuracy = (correct / total) * 100
    print('Accuracy = ', accuracy)
else:
    print('Accuracy is undefined: no test samples were evaluated.')

Accuracy =  98.56115107913669
