In [1]:
import numpy as np
import pandas as pd

In [2]:
class KM:
    def __init__(self, k , tol = 0.001, max_iter=300):
        self.k = k
        self.max_iter = max_iter
        self.tol = tol

    def process(self,df):
        self.centroids = {}

        for i in range(self.k):
            self.centroids[i] = df[i]  # centroids

        for i in range(self.max_iter):
            self.classifications = {} # centroid : nearest point
            for i in range(self.k):
                classifications[i] = [] # for each centroid

            for point in df: 
                distance = [np.linalg.norm(point-self.centroids[centroid]) for centroid in centroids]
                index = np.argmin(distance) # centroid number 
                self.classifications[index].append(point) 

            k_centroids = self.centroids.todict()

            for classification in classifications:
                self.centroids[classification] = np.average(self.classifications[classification], axis=0)  

            done = True
            for centroid in self.centroids: 
                old = k_centroids[centroid]
                new = centroids[centroid]
                if np.sum((current-original)/original*100.0) > self.tol: 
                    done = False
            if done: 
                break
                
            
    def to_numeric(self,df):
        columns = list(df.keys())
        for column in columns: 
            values = {}
            def convert_to_int(val):
                return values[val]

            if df[column].dtype != np.int64 and df[column].dtype != np.float64:
                col_content = df[column]
                unique_elements = np.unique(df[column])
                x = 0
                for unique in unique_elements:
                    if unique not in values:
                        values[unique] =x
                        x+=1

                df[column] = list(map(convert_to_int,df[column]))


        return df

    def predict(self, data):
        distances = [np.linalg.norm(data-self.centroids[centroid]) for centroid in self.centroids] # distance 
        classification = distances.index(min(distances))
        # print(distances)
        return classification


In [3]:
df = pd.read_csv('data.csv')
df

Unnamed: 0,5.1,3.5,1.4,0.2,Iris-setosa
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
...,...,...,...,...,...
99,7.4,2.8,6.1,1.9,Iris-virginica
100,7.9,3.8,6.4,2.0,Iris-virginica
101,6.4,2.8,5.6,2.2,Iris-virginica
102,6.3,2.8,5.1,1.5,Iris-virginica


In [4]:
col_names = list(df.keys())
col_names

['5.1', '3.5', '1.4', '0.2', 'Iris-setosa']

In [8]:
data = pd.read_csv('data.csv', skiprows = 1, names = col_names[:-1] + ['name'])
data

Unnamed: 0,5.1,3.5,1.4,0.2,name
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
...,...,...,...,...,...
99,7.4,2.8,6.1,1.9,Iris-virginica
100,7.9,3.8,6.4,2.0,Iris-virginica
101,6.4,2.8,5.6,2.2,Iris-virginica
102,6.3,2.8,5.1,1.5,Iris-virginica


In [9]:
km = KM(4)

In [10]:
data = km.to_numeric(data)
data

Unnamed: 0,5.1,3.5,1.4,0.2,name
0,4.9,3.0,1.4,0.2,0
1,4.7,3.2,1.3,0.2,0
2,4.6,3.1,1.5,0.2,0
3,5.0,3.6,1.4,0.2,0
4,5.4,3.9,1.7,0.4,0
...,...,...,...,...,...
99,7.4,2.8,6.1,1.9,2
100,7.9,3.8,6.4,2.0,2
101,6.4,2.8,5.6,2.2,2
102,6.3,2.8,5.1,1.5,2


In [25]:
km.process(data)

KeyError: 0