In [22]:
'''
Implementing kNN (k-Nearest Neighbors Algorithm) using only Python.
dataset: https://archive.ics.uci.edu/ml/datasets/Haberman%27s+Survival

Dataset Description: The dataset contains cases from a study that was conducted
                     between 1958 and 1970 at the University of Chicago's Billings
                     Hospital on the survival of patients who had undergone surgery
                     for breast cancer.

Dataset Attributes Description: 1. Age of patient at time of operation (numerical)
                                2. Patient's year of operation (year - 1900, numerical)
                                3. Number of positive axillary nodes detected (numerical)
                                4. Survival status (class attribute)
                                     1 = the patient survived 5 years or longer
                                     2 = the patient died within 5 year
'''

#sample list
samples = []

In [48]:
# open data file
with open('/Users/rohithebbar/Desktop/Machine_learning_projects/k-nearest_neighbour/data/haberman/haberman.data','r')as f:
    for line in f.readlines():
        attributes = line.replace('\n', '').split(',')
        # converting items from the list of attributes (string to integer)
        samples.append([int(attribute) for attribute in attributes])
        

In [49]:
# displaying the data and returning the information
def dataset_info(samples, verbose = True):
    # displaying number of samples
    if verbose:
        print(f'Number of samples : {len(samples)}')
    
    # Initialising the counting variables for each label
    label_1, label_2 = 0, 0
    for sample in samples:
        if sample[-1] == 1:
            label_1 += 1
        else:
            label_2 += 1
        
    # Displaying number of samples  of each label
    if verbose:
        print(f'Number of samples of label_1 : {(label_1)}')
   
        print(f'Number of samples of label_2 : {(label_2)}')
    # return a tuple with the number of samples and the number 
    # of samples of each label
        
    return len(samples), label_1, label_2
        

In [50]:
# unpacking return tuple of dataset_info function
_ , label_1, label_2 = dataset_info(samples, verbose = True)

Number of samples : 612
Number of samples of label_1 : 450
Number of samples of label_2 : 162


In [51]:
# proportion of the dataset to include in the train split (60%)
p = 0.6


In [52]:
# Initializing the test and training samples list
train, test = [], []

# Calculating the maximum amount of training samples per label
max_label_1, max_label_2 = int(p * label_1), int(p * label_2)

# Total amount of training samples
total_of_train_samples = max_label_1 + max_label_2

# Initializing labels counters
count_label_1, count_label_2 = 0, 0

for sample in samples:
    # If the sum of the counters is less than the total amount of training samples
    if (count_label_1 + count_label_2) < (total_of_train_samples):
        # Adding sample to training set
        train.append(sample)
        if (sample[-1] == 1) and (count_label_1 < max_label_1):
            count_label_1 += 1
        else:
            count_label_2 += 1
    else:
        # Adding sample to list of test set
        test.append(sample)

In [53]:
# Displaying information about test and training samples
print('----------------------------------')
print('Train Samples')
dataset_info(train)
print('----------------------------------')
print('Test Samples')
dataset_info(test)
print('----------------------------------')


----------------------------------
Train Samples
Number of samples : 367
Number of samples of label_1 : 277
Number of samples of label_2 : 90
----------------------------------
Test Samples
Number of samples : 245
Number of samples of label_1 : 173
Number of samples of label_2 : 72
----------------------------------


In [54]:
# Euclidean Distance
import math
def euclidean_distance(v1, v2):
    # Getting vector 1 size and initializing summing variable 
    length, summation  =  len(v1), 0
    
    # Adding the square of the difference of the values of the two vectors
    for i in range(length - 1):
        # Adding the square of the difference of the values of the two vectors
        summation += math.pow(v1[i] - v2[i], 2)
        
        # Returning the square root of the sum
    return math.sqrt(summation)

In [55]:
# testing euclidian_distance function
v1 = [1, 2, 3]
v2 = [2, 1, 5]
euclidean_distance(v1,v2)

1.4142135623730951

In [56]:
# Implementing KNN
def KNN(train , new_sample, K):
    # Initializing dict of distances and variable with size of training set
    distances, train_length = {}, len(train)
    # Calculating the Euclidean distance between the new
    # sample and the values of the training sample
    for i in range(train_length):
        d = euclidean_distance(train[i], new_sample)
        distances[i] = d
        
    # Selecting the K nearest neighbors
    k_neigh = sorted(distances, key = distances.get)[:]
    
    # Initializing labels counters
    label_1, label_2 = 0, 0
    for index in k_neigh:
        if train[index][-1] == 1:
            label_1 += 1
        else:
            label_2 += 1
    if label_1 > label_2:
        return 1
    else:
        return 2

In [57]:
# Testing kNN function
print("New sample \n{}".format(test[12]))
print("Label: %d" %(test[12][3]))
print('---------------------------')
print("kNN return ")
print('Label: {}'.format(KNN(train, test[12], K=7)))

New sample 
[44, 64, 6, 2]
Label: 2
---------------------------
kNN return 
Label: 1


In [58]:
# Testing KNN and displaying results
# Initialising the hit counter
hit_counter = 0

# performing knn on all the test samples
for sample in test:
    label = KNN(train, sample, K = 7)
    # Comparing method result with actual sample result
    if sample[-1] == label:
        hit_counter += 1
print('Number of train samples: %d' % len(train))
print('Number of test samples: %d' % len(test))
print('Total of hits: %d' % hit_counter)
print('Number of hits (Percentage): %.2f%%' % (100 * hit_counter / len(test)))

Number of train samples: 367
Number of test samples: 245
Total of hits: 173
Number of hits (Percentage): 70.61%
