In [1]:
# knn implementation
# dataset: Haberman's survival

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [6]:
# read dataset from file
# the full dataset is read as a list of lists: one row of four integers per line
samples = []

with open('../data/haberman.data') as f:
    for line in f:
        # strip() removes the real trailing newline and surrounding whitespace
        line = line.strip()
        # skip blank lines (e.g. an empty last line) instead of crashing in int()
        if not line:
            continue
        atrib = line.split(',')
        # the first four comma-separated fields are kept, all converted to int
        samples.append([int(atrib[i]) for i in range(4)])

In [7]:
# show the parsed rows; each row is a list of four integers
samples

[[30, 64, 1, 1],
 [30, 62, 3, 1],
 [30, 65, 0, 1],
 [31, 59, 2, 1],
 [31, 65, 4, 1],
 [33, 58, 10, 1],
 [33, 60, 0, 1],
 [34, 59, 0, 2],
 [34, 66, 9, 2],
 [34, 58, 30, 1],
 [34, 60, 1, 1],
 [34, 61, 10, 1],
 [34, 67, 7, 1],
 [34, 60, 0, 1],
 [35, 64, 13, 1],
 [35, 63, 0, 1],
 [36, 60, 1, 1],
 [36, 69, 0, 1],
 [37, 60, 0, 1],
 [37, 63, 0, 1],
 [37, 58, 0, 1],
 [37, 59, 6, 1],
 [37, 60, 15, 1],
 [37, 63, 0, 1],
 [38, 69, 21, 2],
 [38, 59, 2, 1],
 [38, 60, 0, 1],
 [38, 60, 0, 1],
 [38, 62, 3, 1],
 [38, 64, 1, 1],
 [38, 66, 0, 1],
 [38, 66, 11, 1],
 [38, 60, 1, 1],
 [38, 67, 5, 1],
 [39, 66, 0, 2],
 [39, 63, 0, 1],
 [39, 67, 0, 1],
 [39, 58, 0, 1],
 [39, 59, 2, 1],
 [39, 63, 4, 1],
 [40, 58, 2, 1],
 [40, 58, 0, 1],
 [40, 65, 0, 1],
 [41, 60, 23, 2],
 [41, 64, 0, 2],
 [41, 67, 0, 2],
 [41, 58, 0, 1],
 [41, 59, 8, 1],
 [41, 59, 0, 1],
 [41, 64, 0, 1],
 [41, 69, 8, 1],
 [41, 65, 0, 1],
 [41, 65, 0, 1],
 [42, 69, 1, 2],
 [42, 59, 0, 2],
 [42, 58, 0, 1],
 [42, 60, 1, 1],
 [42, 59, 2, 1],
 [42, 

In [20]:
# hold-out split: the first p-fraction of the rows trains, the remainder tests
p = 0.6

# index of the first test row
ltrain = int(len(samples) * p)

# rows are taken in file order, without shuffling
# NOTE(review): the rows displayed for `samples` look sorted by their first
# column, so train and test may cover different value ranges - confirm whether
# a shuffled split is wanted.
sample_train, sample_test = samples[:ltrain], samples[ltrain:]

In [22]:
# create a Euclidean distance function
import math

def eucl(v1, v2):
    """Return the Euclidean distance between v1 and v2, ignoring the last element.

    Only positions 0 .. len(v1) - 2 contribute to the distance; the final
    position is skipped (the rows passed in by knn keep their class label
    there). v2 must be indexable at every position that is used.

    Returns a float; 0.0 when v1 has fewer than two elements.
    """
    n_features = len(v1) - 1
    squared_diffs = (math.pow(v1[pos] - v2[pos], 2) for pos in range(n_features))
    return math.sqrt(sum(squared_diffs))

In [23]:
# quick check of eucl: the last element of each vector is ignored,
# so only the first two components contribute to the distance
v1 = [1,2,3]
v2 = [2,1,3]
eucl(v1,v2)

1.4142135623730951

In [25]:
def knn(train, new_sample, k):
    """Classify new_sample by majority vote among its k nearest rows of train.

    Distances are computed with eucl. The last element of each train row is
    read as its label.

    Returns 1 when label 1 has strictly more votes than everything else
    combined, otherwise 2. Any label value other than 1 is counted as a vote
    for 2, and a tie (or an empty neighbourhood) also yields 2.
    """
    # distance from the new sample to every training row, in row order
    distances = [eucl(train[pos], new_sample) for pos in range(len(train))]

    # positions of the k closest rows; the sort is stable, so equal
    # distances keep their original row order
    nearest = sorted(range(len(distances)), key=distances.__getitem__)[:k]

    # tally the votes
    votes_for_1 = sum(1 for pos in nearest if train[pos][-1] == 1)
    votes_for_2 = len(nearest) - votes_for_1

    return 1 if votes_for_1 > votes_for_2 else 2

In [32]:
# classify one test row and print it next to the predicted label
j = 14
query = sample_test[j]
print(query)
print(knn(sample_train, query, k = 13))

[57, 61, 5, 2]
1


In [42]:
# evaluating knn: count the test rows whose predicted label equals
# the label stored in their last element
K = 15
n_test = len(sample_test)
matches = sum(1 for samp in sample_test if samp[-1] == knn(sample_train, samp, K))

print ('Total train: %d' % len(sample_train))
print ('Total test:  %d' % n_test)
print ('Total match: %d' % matches)
print ('Pct match:   %.2f%%' % (100 * matches / n_test))

Total train: 183
Total test:  123
Total match: 93
Pct match:   75.61%


In [43]:
# KNN with sklearn
from sklearn.neighbors import KNeighborsClassifier

In [44]:
# feature rows and their class labels, filled by the file-reading cell that follows
inputs, outputs = [], []

In [45]:
# start from empty lists so that re-running this cell does not append
# the whole file a second time
inputs, outputs = [], []

with open('../data/haberman.data') as f:
    for line in f:
        # strip() removes the real trailing newline and surrounding whitespace
        line = line.strip()
        # skip blank lines (e.g. an empty last line) instead of crashing in int()
        if not line:
            continue
        atrib = line.split(',')
        # features: columns 0 and 2 only (column 1 is not used); label: column 3
        inputs.append([int(atrib[0]), int(atrib[2])])
        outputs.append(int(atrib[3]))

In [46]:
# fraction of the samples used for training (the rest is the test set)
p = 0.6

In [49]:
# fit on the first `lim` rows and predict the remaining ones
lim = int(p * len(inputs))
n_test = len(inputs) - lim

# only n_neighbors is set; every other parameter keeps its default
neigh = KNeighborsClassifier(n_neighbors = 15)
neigh.fit(inputs[:lim], outputs[:lim])
lbl = neigh.predict(inputs[lim:])

# count the predictions that equal the true label of the same test row
matches = sum(1 for predicted, actual in zip(lbl, outputs[lim:]) if predicted == actual)

print ('Total train: %d' % lim)
print ('Total test:  %d' % n_test)
print ('Total match: %d' % matches)
print ('Pct match:   %.2f%%' % (100 * matches / n_test))

Total train: 183
Total test:  123
Total match: 92
Pct match:   74.80%


In [50]:
# KNN with numpy and scikit-learn
# dataset: http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/
import numpy as np

In [64]:
# x holds the inputs (columns 1 to 4), y the outputs (column 0)
# a modified copy of the dataset is used for convenience
balance_path = '../data/balance-scale-new.data'
x = np.genfromtxt(balance_path, delimiter=',', usecols=(1, 2, 3, 4))
y = np.genfromtxt(balance_path, delimiter=',', usecols=0)


In [66]:
# inspect the loaded inputs (x) and outputs (y)
print(x)
print(y)

[[ 1.  1.  1.  1.]
 [ 1.  1.  1.  2.]
 [ 1.  1.  1.  3.]
 ..., 
 [ 5.  5.  5.  3.]
 [ 5.  5.  5.  4.]
 [ 5.  5.  5.  5.]]
[ 2.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.
  3.  3.  3.  3.  3.  3.  3.  1.  2.  3.  3.  3.  2.  3.  3.  3.  3.  3.
  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  1.  1.  2.  3.
  3.  1.  3.  3.  3.  3.  2.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.
  3.  3.  3.  1.  1.  1.  2.  3.  1.  2.  3.  3.  3.  1.  3.  3.  3.  3.
  2.  3.  3.  3.  3.  3.  3.  3.  3.  3.  1.  1.  1.  1.  2.  1.  1.  3.
  3.  3.  1.  3.  3.  3.  3.  1.  3.  3.  3.  3.  2.  3.  3.  3.  3.  1.
  2.  3.  3.  3.  2.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.
  3.  3.  3.  3.  3.  3.  1.  1.  1.  2.  3.  1.  2.  3.  3.  3.  1.  3.
  3.  3.  3.  2.  3.  3.  3.  3.  3.  3.  3.  3.  3.  1.  1.  1.  1.  1.
  1.  1.  2.  3.  3.  1.  2.  3.  3.  3.  1.  3.  3.  3.  3.  1.  3.  3.
  3.  3.  1.  1.  1.  1.  1.  1.  1.  1.  2.  3.  1.  1.  3.  3.  3.  1.
  

In [67]:
from sklearn.model_selection import train_test_split

In [86]:
# generate the train and test datasets: 30% of the rows are held out for
# testing, and random_state is fixed so the split is the same on every run
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size = 0.3,
    random_state = 42,
)

In [69]:
# number of rows in the training split
print(len(x_train))

437


In [88]:
# show the held-out input rows
x_test

array([[ 4.,  3.,  5.,  3.],
       [ 4.,  5.,  3.,  1.],
       [ 2.,  4.,  4.,  1.],
       [ 2.,  4.,  3.,  3.],
       [ 4.,  5.,  2.,  1.],
       [ 5.,  1.,  4.,  1.],
       [ 2.,  2.,  3.,  4.],
       [ 1.,  4.,  1.,  3.],
       [ 5.,  2.,  1.,  3.],
       [ 2.,  1.,  5.,  1.],
       [ 3.,  1.,  4.,  1.],
       [ 1.,  2.,  2.,  2.],
       [ 1.,  3.,  2.,  1.],
       [ 1.,  4.,  4.,  1.],
       [ 5.,  3.,  2.,  3.],
       [ 1.,  4.,  1.,  2.],
       [ 1.,  1.,  1.,  3.],
       [ 3.,  5.,  4.,  4.],
       [ 4.,  1.,  5.,  1.],
       [ 3.,  2.,  3.,  2.],
       [ 2.,  1.,  2.,  2.],
       [ 2.,  1.,  5.,  4.],
       [ 1.,  2.,  5.,  5.],
       [ 5.,  5.,  5.,  2.],
       [ 4.,  3.,  3.,  3.],
       [ 1.,  5.,  3.,  1.],
       [ 3.,  1.,  2.,  3.],
       [ 5.,  4.,  1.,  4.],
       [ 5.,  5.,  5.,  1.],
       [ 3.,  1.,  2.,  2.],
       [ 3.,  4.,  2.,  5.],
       [ 3.,  1.,  3.,  2.],
       [ 2.,  3.,  2.,  5.],
       [ 5.,  3.,  1.,  5.],
       [ 2.,  

In [89]:
# NOTE: this rebinds the name `knn`, which earlier in the notebook refers to
# the hand-written knn() function; cells calling that function must not be
# re-run after this one without re-defining it.
# fit() returns the fitted estimator, so construction and fitting are chained
knn = KNeighborsClassifier(n_neighbors=17).fit(x_train, y_train)

# one predicted label per test row
labels = knn.predict(x_test)
print(len(labels))

188


In [91]:
# number of predicted labels (same value the previous cell printed)
len(labels)

188

In [92]:
# number of test rows whose predicted label equals the true label
n_match = np.sum(labels == y_test)

print ('Total train: %d' % len(x_train))
print ('Total test:  %d' % len(x_test))
print ('Total match: %d' % n_match)
print ('Pct match:   %.2f%%' % (100 * n_match / len(x_test)))

Total train: 437
Total test:  188
Total match: 163
Pct match:   86.70%


In [93]:
# score from the classifier's own score method; the value shown below
# matches the manually computed match percentage
knn.score(x_test, y_test)

0.86702127659574468

In [94]:
# same figure computed directly: the fraction of predictions equal to the true label
np.mean(labels == y_test)

0.86702127659574468