In [1]:
import numpy as np
import pandas as pd

class K_Means:
    def __init__(self, k, tol=0.001, max_iter=300):
        self.k = k
        self.tol = tol
        self.max_iter = max_iter
        
    def calculate(self, dataset):
        self.centroids = {}
        
        for i in range(self.k):
            self.centroids[i] = dataset[i] # list of centroids
            
        for i in range(self.max_iter):
            self.classifications = {}
            
            for a in range(self.k): # a of the kth centroid 
                self.classifications[a] = []
                
            for data in dataset:
                distances = [np.linalg.norm(data-self.centroids[centroid]) for centroid in self.centroids] # distance point with centroids
                classification = distances.index(min(distances)) # min distance index - got centroid number 
                self.classifications[classification].append(data) # centroid : [..points]
            
            k_centroids = dict(self.centroids) # convert to dict
            
            for classification in self.classifications:
                self.centroids[classification] = np.average(self.classifications[classification], axis=0) # update the average
                
            done = True
            
            for centroid in self.centroids:
                original = k_centroids[centroid] # old centroid
                current = self.centroids[centroid] # new centroid
                if np.sum((current-original)/original*100.0) > self.tol: 
                    # print(np.sum((current-original)/original*100.0)) 
                    done = False
                    
            if done:
                break

    # you have a final list of distance of this from each centroid and minimum distance - returns classified centroid
    def predict(self, data):
        distances = [np.linalg.norm(data-self.centroids[centroid]) for centroid in self.centroids]
        classification = distances.index(min(distances))
        # print(distances)
        return classification

    
    def non_numeric_to_numeric(self, file):
        columns = file.columns.values # 
        
        for column in columns:
            values = {}
            
            def convert_to_int(val):
                return values[val]
            
            if file[column].dtype != np.int64 and file[column].dtype != np.float64:
                col_content = file[column].values.tolist()
                unique_elements = set(col_content)
                
                x = 0
                
                for unique in unique_elements:
                    if unique not in values:
                        values[unique] = x
                        x += 1
                
                file[column] = list(map(convert_to_int, file[column]))
                
        return file




In [2]:

file_name = 'data.csv'
k = 4

cols = pd.read_csv(file_name, nrows = 1).columns.tolist() # columns names
file = pd.read_csv(file_name, skiprows = 1, names = cols[:-1] + ['name']) # data rows, last column=name
logic = K_Means(k)

dataset = logic.non_numeric_to_numeric(file)
correct_answers = np.array(dataset['name']) # true value
dataset.drop(['name'], axis=1, inplace = True) # xtrain
data = np.array(dataset.astype(float)) # xtrain
logic.calculate(data)   
correct = 0

for i in range(len(data)):
    predict = np.array(data[i].astype(float)) 
    predict = predict.reshape(-1, len(predict))
    result = logic.predict(predict)
    print(f'{np.array(data[i].astype(float))} {result}')
    
    if result == correct_answers[i]:
        correct += 1
print(f'Accuracy: {(correct/len(data))*100 if (correct/len(data))*100 > 50 else (1.0-(correct/len(data)))*100}%')

[4.9 3.  1.4 0.2] 1
[4.7 3.2 1.3 0.2] 1
[4.6 3.1 1.5 0.2] 1
[5.  3.6 1.4 0.2] 1
[5.4 3.9 1.7 0.4] 1
[4.6 3.4 1.4 0.3] 1
[5.  3.4 1.5 0.2] 1
[4.4 2.9 1.4 0.2] 1
[4.9 3.1 1.5 0.1] 1
[5.4 3.7 1.5 0.2] 1
[4.8 3.4 1.6 0.2] 1
[4.8 3.  1.4 0.1] 1
[4.3 3.  1.1 0.1] 1
[5.8 4.  1.2 0.2] 1
[5.7 4.4 1.5 0.4] 1
[5.4 3.9 1.3 0.4] 1
[5.1 3.5 1.4 0.3] 1
[5.7 3.8 1.7 0.3] 1
[5.1 3.8 1.5 0.3] 1
[5.4 3.4 1.7 0.2] 1
[5.1 3.7 1.5 0.4] 1
[4.6 3.6 1.  0.2] 1
[5.1 3.3 1.7 0.5] 1
[4.8 3.4 1.9 0.2] 1
[5.  3.  1.6 0.2] 1
[5.  3.4 1.6 0.4] 1
[5.2 3.5 1.5 0.2] 1
[5.2 3.4 1.4 0.2] 1
[4.7 3.2 1.6 0.2] 1
[4.8 3.1 1.6 0.2] 1
[5.4 3.4 1.5 0.4] 1
[5.2 4.1 1.5 0.1] 1
[5.5 4.2 1.4 0.2] 1
[4.9 3.1 1.5 0.1] 1
[7.  3.2 4.7 1.4] 2
[6.4 3.2 4.5 1.5] 2
[6.9 3.1 4.9 1.5] 0
[5.5 2.3 4.  1.3] 2
[6.5 2.8 4.6 1.5] 2
[5.7 2.8 4.5 1.3] 2
[6.3 3.3 4.7 1.6] 2
[4.9 2.4 3.3 1. ] 2
[6.6 2.9 4.6 1.3] 2
[5.2 2.7 3.9 1.4] 2
[5.  2.  3.5 1. ] 2
[5.9 3.  4.2 1.5] 2
[6.  2.2 4.  1. ] 2
[6.1 2.9 4.7 1.4] 2
[5.6 2.9 3.6 1.3] 2
[6.7 3.1 4.4 1.4] 2


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


In [3]:
data # xtrain

array([[4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [7. , 3