# <center>ML lab Homework 1

### Task 1
**Method Implementation:** Write your own code to implement the k-nearest neighbors classifier.

This k-Nearest Neighbors implementation is broken down into 3 parts:

   * Step 1: Calculate Euclidean Distance.
   * Step 2: Get Nearest Neighbors.
   * Step 3: Make Predictions.

In [27]:
from math import sqrt

# function to calculate euclidean distance between two vectors
def euclidean_distance(row1, row2):
    distance = 0.0 
    for i in range(len(row1)-1):
        distance+=(row1[i]-row2[i])**2
    return sqrt(distance)

In [28]:
# function to locate the most similar neighbors
def get_neighbors(train, test_row, k):
    distances = list()
    for train_row in train:
        dist = euclidean_distance(train_row,test_row)
        distances.append((train_row,dist))
    distances.sort(key=lambda tup:tup[1])
    neighbors = list()
    for i in range(k):
        neighbors.append(distances[i][0])
    return neighbors

In [29]:
# Make a classification prediction with neighbors
def make_prediction(train, test_row, k):
    neighbors = get_neighbors(train, test_row, k)
    output_values = [row[-1] for row in neighbors]
    prediction = max(set(output_values),key=output_values.count)
    return prediction

In [30]:
# kNN Algorithm
def k_nearest_neighbors(train, test, num_neighbors):
    predictions = list()
    for row in test:
        output = make_prediction(train, row, num_neighbors)
        predictions.append(output)
    return(predictions)

In [31]:
from random import randrange
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split
 
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0
 
# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        for x in train_set:
            for row in fold:
                if((x==row).all()):
                    train_set.remove(row)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

In [32]:
import pandas as pd
# importing data
data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data",header=None)
data.columns = ["Age of patient","Year of operation(1900+)","Positive axillary nodes detected","label"]
data.head()

Unnamed: 0,Age of patient,Year of operation(1900+),Positive axillary nodes detected,label
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1


In [33]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(data,test_size=0.08,random_state=42)

In [34]:
test_data

Unnamed: 0,Age of patient,Year of operation(1900+),Positive axillary nodes detected,label
182,55,58,1,1
154,52,65,0,1
111,48,62,2,1
203,57,62,0,1
60,42,65,0,1
9,34,58,30,1
119,49,60,1,1
157,53,65,1,2
167,54,60,11,2
33,38,67,5,1


In [35]:
# removing headers from train and test data
train_data = train_data[:].values
test_data = test_data[:].values


In [36]:
predictions = k_nearest_neighbors(train_data,test_data,5)
predictions

[1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2]

In [37]:
actual = list()
for i in range(len(test_data)):
    actual.append(test_data[i][-1])
actual

[1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1]

In [38]:
import numpy as np
np.random.seed(42)
# evaluate algorithm
dataset = data[:].values
n_folds = 5
num_neighbors = 5
scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Scores: [86.88524590163934, 78.68852459016394, 78.68852459016394, 73.77049180327869, 83.60655737704919]
Mean Accuracy: 80.328%
