In [1]:
import pandas as pd
import numpy as np
np.random.seed(42)
from sklearn.datasets import load_iris
from scipy.spatial import distance
iris = load_iris()

# prepare dataset

In [2]:
data = iris.data
labels = iris.target
names = iris.target_names

# implement the knn class

In [3]:
class KNeighborsClassifier:
    def __init__(self, k: int, data: np.ndarray, names: list, labels: np.ndarray, test_data: list):
        self.k = k
        self.data = data
        self.names = names
        self.labels = labels
        self.test_data = test_data

        normalized_data = self.normalizeData()
        distances = self.euclideanDistance()
        sorted_distances = self.sortDistances(distances)
        nNeighbors = self.determineNeighbors(sorted_distances, labels)

        print('The {} closest neighbors of searched class {} are:'.format(self.k, self.test_data))
        for name in nNeighbors:
            print("class: ", names[name])

        self.determinePropability(nNeighbors)

    def normalizeData(self):
        x = self.data
        data_min = np.min(x, axis=0)
        data_max = np.max(x, axis=0)
        x_transformed = (x - data_min) / (data_max - data_min)
        return x_transformed

    def euclideanDistance(self):
        distances = list()
        for i in self.data:
            distances.append(distance.euclidean(i, self.test_data))
        return distances

    def sortDistances(self, x:list):
        return np.argsort(x)

    def determineNeighbors(self, distances: np.ndarray, y: np.ndarray):
        targetList = list()
        neighbors = distances[0:self.k]
        for i in neighbors:
            targetList.append(y[i])
        return targetList

    def determinePropability(self, nNeighbors:list):
        amount = len(nNeighbors)
        closedNeighbor = nNeighbors[0]
        occurrences = nNeighbors.count(closedNeighbor)
        propability = occurrences/amount
        print('Propability for {} by examine next {} neighbors is {} %'.format(self.names[closedNeighbor], self.k, propability*100))


# testing some data

In [4]:
test_data = [4.8, 2.5, 5.3, 2.4] #DATA FROM EXERCISE

In [5]:
clf = KNeighborsClassifier(20, data, names, labels, test_data)

The 20 closest neighbors of searched class [4.8, 2.5, 5.3, 2.4] are:
class:  virginica
class:  virginica
class:  virginica
class:  virginica
class:  virginica
class:  virginica
class:  virginica
class:  versicolor
class:  versicolor
class:  virginica
class:  versicolor
class:  versicolor
class:  virginica
class:  virginica
class:  virginica
class:  virginica
class:  versicolor
class:  versicolor
class:  virginica
class:  virginica
Propability for virginica by examine next 20 neighbors is 70.0 %


In [6]:
test_data = [5.2,4.1,1.5,0.1] 

In [7]:
clf = KNeighborsClassifier(20, data, names, labels, test_data)

The 20 closest neighbors of searched class [5.2, 4.1, 1.5, 0.1] are:
class:  setosa
class:  setosa
class:  setosa
class:  setosa
class:  setosa
class:  setosa
class:  setosa
class:  setosa
class:  setosa
class:  setosa
class:  setosa
class:  setosa
class:  setosa
class:  setosa
class:  setosa
class:  setosa
class:  setosa
class:  setosa
class:  setosa
class:  setosa
Propability for setosa by examine next 20 neighbors is 100.0 %


In [8]:
test_data = [5.2,2.7,3.9,1.4]

In [9]:
clf = KNeighborsClassifier(20, data, names, labels, test_data)

The 20 closest neighbors of searched class [5.2, 2.7, 3.9, 1.4] are:
class:  versicolor
class:  versicolor
class:  versicolor
class:  versicolor
class:  versicolor
class:  versicolor
class:  versicolor
class:  versicolor
class:  versicolor
class:  versicolor
class:  versicolor
class:  versicolor
class:  versicolor
class:  versicolor
class:  versicolor
class:  versicolor
class:  versicolor
class:  versicolor
class:  virginica
class:  versicolor
Propability for versicolor by examine next 20 neighbors is 95.0 %


# using the class "KNeighborsClassifier" for just 2 dimensions

In [10]:
data2 = iris.data[:,[0,1]]
labels2 = iris.target
names2 = iris.target_names

In [11]:
test_data = [4.8, 2.5]

In [12]:
clf = KNeighborsClassifier(10, data2, names2, labels2, test_data)

The 10 closest neighbors of searched class [4.8, 2.5] are:
class:  virginica
class:  versicolor
class:  versicolor
class:  versicolor
class:  setosa
class:  versicolor
class:  setosa
class:  setosa
class:  setosa
class:  versicolor
Propability for virginica by examine next 10 neighbors is 10.0 %


In [13]:
test_data = [5.2,4.1] 

In [14]:
clf = KNeighborsClassifier(10, data2, names2, labels2, test_data)

The 10 closest neighbors of searched class [5.2, 4.1] are:
class:  setosa
class:  setosa
class:  setosa
class:  setosa
class:  setosa
class:  setosa
class:  setosa
class:  setosa
class:  setosa
class:  setosa
Propability for setosa by examine next 10 neighbors is 100.0 %


# using class "KNeighborsClassifier" for simple categorization movies

In [15]:
movies = np.array(pd.read_csv("movies.csv"))

In [16]:
data2 = movies[:,[0,1]]
names2 = np.unique(movies[:,2])
labels2 = movies[:,3]

In [17]:
print(movies[:10])

[[3.0 104.0 'Romance' 0]
 [2.0 100.0 'Romance' 0]
 [1.0 81.0 'Romance' 0]
 [101.0 10.0 'Action' 1]
 [99.0 5.0 'Action' 1]
 [98.0 2.0 'Action' 1]
 [5.0 99.0 'Romance' 0]
 [8.0 110.0 'Romance' 0]
 [6.0 85.0 'Romance' 0]
 [120.0 12.0 'Action' 1]]


In [18]:
test_data2 = [5.0, 140.0]

In [19]:
clf1 = KNeighborsClassifier(5, data2, names2, labels2, test_data2)

The 5 closest neighbors of searched class [5.0, 140.0] are:
class:  Action
class:  Action
class:  Action
class:  Action
class:  Action
Propability for Action by examine next 5 neighbors is 100.0 %


In [20]:
test_data2 = [140.0, 2.0]

In [21]:
clf1 = KNeighborsClassifier(5, data2, names2, labels2, test_data2)

The 5 closest neighbors of searched class [140.0, 2.0] are:
class:  Romance
class:  Romance
class:  Romance
class:  Romance
class:  Romance
Propability for Romance by examine next 5 neighbors is 100.0 %
