<a href="https://colab.research.google.com/github/rubencg195/Pytorch-Tutorials/blob/master/KNN_From_Scratch_Assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# KNN From Scratch

**Group Members:**

1. Ruben Chevez
2. Kratika Naskulwar

*Code inspired in the following blog post*

https://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/

In [0]:
#@title Hyperparameters
k_distance = 3 #@param {type:"number"}

traning_filename = 'train.csv' #@param {type:"string"}

test_filename = 'test.csv' #@param {type:"string"}

standalone_file = False #@param ["False", "True"] {type:"raw"}

debug = True #@param ["False", "True"] {type:"raw"}



# Upload Traing Set


In [0]:
if not standalone_file:
  from google.colab import files
  print("\nUpload Traning Set")
  training_data = files.upload()

  print("\nUpload Test Set")
  test_data = files.upload()

  for fn in training_data.keys():
    traning_filename = fn
    if debug:
      print('\n\nUser uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(training_data[fn])))

  for fn in test_data.keys():
    test_filename = fn
    if debug:
      print('\n\nUser uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(test_data[fn])))
else:
  import sys
  #Argument List: ['KNN.py', 'train.tsv', 'test.tsv', 'K']
  python_filename    = sys.argv[0]
  traning_filename   = sys.argv[1]
  test_filename      = sys.argv[2]
  k_distance         = sys.argv[3]
  if debug:
    print("Python Filename: {} \nTraining Data Filename: {}\nTest Data Filename {} \n K: {} ".format( 
      python_filename,
      traning_filename, 
      test_filename, 
      k_distance ))




Upload Traning Set


Saving TrainingData_A1.tsv to TrainingData_A1 (3).tsv

Upload Test Set


Saving TestData_A1.tsv to TestData_A1 (3).tsv


User uploaded file "TrainingData_A1.tsv" with length 9404 bytes


User uploaded file "TestData_A1.tsv" with length 479 bytes


# Importing the Data

This section is to load the training and the test data files using read_csv function.

In [0]:
import pandas as pd

train_data = pd.read_csv(traning_filename, sep='\t')
test_data  = pd.read_csv(test_filename, sep='\t')

if debug: 
  print("Traning Set Head \n\n", train_data.head(5), "\n")
  print("Test Set Head \n\n", test_data.head(5))
  print("\nType of Classes: \n\n", pd.Series(train_data.as_matrix()[:,  -1], name='A').unique() )


Traning Set Head 

         RI     Na    Mg    Al     Si     K    Ca   Ba   Fe  Class
0  1.52101  13.64  4.49  1.10  71.78  0.06  8.75  0.0  0.0      1
1  1.51761  13.89  3.60  1.36  72.73  0.48  7.83  0.0  0.0      1
2  1.51618  13.53  3.55  1.54  72.99  0.39  7.78  0.0  0.0      1
3  1.51766  13.21  3.69  1.29  72.61  0.57  8.22  0.0  0.0      1
4  1.51742  13.27  3.62  1.24  73.08  0.55  8.07  0.0  0.0      1 

Test Set Head 

         RI     Na    Mg    Al     Si     K    Ca    Ba    Fe
0  1.51847  13.10  3.97  1.19  72.44  0.60  8.43  0.00  0.00
1  1.51567  13.29  3.45  1.21  72.74  0.56  8.57  0.00  0.00
2  1.51918  14.04  3.58  1.37  72.08  0.56  8.30  0.00  0.00
3  1.51811  12.96  2.96  1.43  72.92  0.60  8.79  0.14  0.00
4  1.51660  12.99  3.18  1.23  72.97  0.58  8.81  0.00  0.24

Type of Classes: 

 [1. 2. 3. 5. 6. 7.]


# Training on Data

In [0]:
import math
def euclideanDistance(instance1, instance2, length):
	distance = 0
	for x in range(length):
		distance += pow((instance1[x] - instance2[x]), 2)
	return math.sqrt(distance)

import operator 
def getNeighbors(trainingSet, testInstance, k):
	distances = []
	length = len(testInstance)-1
	for x in range(len(trainingSet)):
		dist = euclideanDistance(testInstance, trainingSet[x], length)
		distances.append((trainingSet[x], dist))
	distances.sort(key=operator.itemgetter(1))
	neighbors = []
	for x in range(k):
		neighbors.append(distances[x][0])
	return neighbors

In [0]:
data1 = train_data.as_matrix(columns=None)[:][0]
data2 = train_data.as_matrix(columns=None)[:][1]
number_attr_cols = 9
distance = euclideanDistance(data1, data2, number_attr_cols)


if debug:
  print("Calculating Distance for single instance")
  print("Data 1 : ", data1, "\n")
  print("Data 2 : ", data2, "\n")
  print( 'Distance: ' + repr(distance) )

Calculating Distance for single instance
Data 1 :  [1.52101e+00 1.36400e+01 4.49000e+00 1.10000e+00 7.17800e+01 6.00000e-02
 8.75000e+00 0.00000e+00 0.00000e+00 1.00000e+00] 

Data 2 :  [ 1.51761 13.89     3.6      1.36    72.73     0.48     7.83     0.
  0.       1.     ] 

Distance: 1.687457128344304


In [0]:
# trainSet = [[2, 2, 2, 'a'], [4, 4, 4, 'b']]
# testInstance = [5, 5, 5]

trainSet = train_data.as_matrix(columns=None)
testInstance = test_data.as_matrix(columns=None)[:][0]
neighbors = getNeighbors(trainSet, testInstance, k_distance)

if debug:
  print("Test the getNeighbors function \n\n")
  print("Shape train set:     ", trainSet.shape  )
  print("Shape test instance: ", testInstance.shape  )
  print("\nGet The Neighbors for one test instance based on the train set: \n\n")
  print(pd.DataFrame(neighbors, columns=["RI","Na","Mg","Al","Si","K","Ca","Ba","Fe","Class"]))

Test the getNeighbors function 


Shape train set:      (204, 10)
Shape test instance:  (9,)

Get The Neighbors for one test instance based on the train set: 


        RI     Na    Mg    Al     Si     K    Ca   Ba    Fe  Class
0  1.51789  13.19  3.90  1.30  72.33  0.55  8.44  0.0  0.28    2.0
1  1.51844  13.25  3.76  1.32  72.40  0.58  8.42  0.0  0.00    2.0
2  1.51829  13.24  3.90  1.41  72.33  0.55  8.31  0.0  0.10    2.0


# Response

In [0]:
def getResponse(neighbors):
	classVotes = {}
	for x in range(len(neighbors)):
		response = neighbors[x][-1]
		if response in classVotes:
			classVotes[response] += 1
		else:
			classVotes[response] = 1
	sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
	return sortedVotes[0][0]

In [0]:
response = getResponse(neighbors)

if debug: 
  print("{} \n\n Belongs to Class: {}".format(
      pd.DataFrame(neighbors, columns=["RI","Na","Mg","Al","Si","K","Ca","Ba","Fe","Class"]), 
      str(int(response))
  ))

        RI     Na    Mg    Al     Si     K    Ca   Ba    Fe  Class
0  1.51789  13.19  3.90  1.30  72.33  0.55  8.44  0.0  0.28    2.0
1  1.51844  13.25  3.76  1.32  72.40  0.58  8.42  0.0  0.00    2.0
2  1.51829  13.24  3.90  1.41  72.33  0.55  8.31  0.0  0.10    2.0 

 Belongs to Class: 2


#Accuracy

In [0]:
def getAccuracy(testSet, predictions):
	correct = 0
	for x in range(len(testSet)):
		if testSet[x][-1] is predictions[x]:
			correct += 1
	return (correct/float(len(testSet))) * 100.0

In [0]:
trainingSet  = train_data.as_matrix(columns=None)
testSet      = test_data.as_matrix(columns=None)


# testSet    = [[1,1,1,'a'], [2,2,2,'a'], [3,3,3,'b']]
# predictions = ['a', 'a', 'a']

predictions=[]

for x in range(len(testSet)):
  neighbors = getNeighbors(trainingSet, testSet[x], k_distance)
  result = getResponse(neighbors)
  predictions.append(result)
  print('{}\t{}'.format( 
      str(int(result)),  
      0
      #', actual=' + repr(testSet[x][-1])
  ))

if debug:
  print("\nShape test set:       ",   testSet.shape )
  print("\nShape predictions set: ", len(predictions)  )

accuracy = getAccuracy(testSet, predictions)
#print('\nAccuracy: ' + repr(accuracy) + '%')

2	0
1	0
2	0
1	0
1	0
1	0
3	0
5	0
7	0
1	0

Shape test set:        (10, 9)

Shape predictions set:  10
