In [1]:
import numpy as np
import random
from scipy.spatial.distance import cdist 


# Make sure the number of centroids does not exceed the number of data points in the dataset
class Kmeans:
    """Minimal k-means clustering (Lloyd-style iterations) over a NumPy array.

    Typical use::

        model = Kmeans(centroids=3, iterations=100)
        model.load(array_2d)
        labels = model.fit()

    Initial centroids are rows of the data drawn through NumPy's global
    random state, so call ``np.random.seed(...)`` beforehand for
    reproducible results.
    """

    def __init__(self,centroids=3,iterations=1000):
        """Store the hyperparameters; no data is attached until ``load``.

        Args:
            centroids: number of clusters (k) to form.
            iterations: number of centroid-update passes ``fit`` performs.
        """
        # Hyperparameter - number of centroids
        self.centroids = centroids

        # data to be used for clustering (replaced by an ndarray in load())
        self.data = []

        # centroid history of the most recent fit(): entry 0 holds the
        # randomly chosen starting centroids, followed by one (k, ...) array
        # per iteration
        self.centroids_ls =[]

        # number of iterations
        self.iterations = iterations

    def load(self,data):
        """Attach the dataset that ``fit`` will cluster.

        Args:
            data: NumPy array with at least two dimensions. ``fit`` passes it
                to ``scipy.spatial.distance.cdist``, so in practice it should
                be 2-D with one observation per row.

        Raises:
            ValueError: if ``data`` is not a NumPy array or has fewer than
                two dimensions.
        """
        # isinstance (rather than an exact type comparison) also admits
        # ndarray subclasses.
        if isinstance(data, np.ndarray) and data.ndim > 1:
            self.data = data
        else:
            raise ValueError('Input Data not in Valid Format. Type should be N dimensional Numpy array')

    def _nearest_centroid(self, centroid_matrix):
        """Return, for every data row, the index of its closest centroid."""
        distances = cdist(self.data, centroid_matrix, 'euclidean')
        # argmin along axis 1 picks the first minimum on ties, exactly like
        # calling np.argmin on each row separately.
        return np.argmin(distances, axis=1)

    def fit(self):
        """Cluster the loaded data and return one label per row.

        Starting centroids are ``self.centroids`` distinct rows of the data.
        Each iteration moves every centroid to the mean of the rows currently
        assigned to it and then reassigns all rows to their nearest centroid.
        Exactly ``self.iterations`` update passes are run, and every set of
        centroids is recorded in ``self.centroids_ls``.

        Returns:
            Integer array of length ``len(self.data)`` with values in
            ``[0, self.centroids)``.

        Raises:
            ValueError: if ``self.centroids`` is smaller than 1 or larger
                than the number of rows (which includes the case where no
                data has been loaded).
        """
        n_rows = len(self.data)
        if self.centroids < 1:
            raise ValueError("Number of Centroids must be at least 1")
        if self.centroids > n_rows:
            raise ValueError("Number of Centroids more than Number of rows of Dataset")

        # Start a fresh history so a repeated fit() does not mix two runs.
        self.centroids_ls = []

        # Sample row indexes WITHOUT replacement: picking the same row twice
        # would create two identical centroids, leaving one cluster empty
        # from the very first assignment.
        seed_rows = np.random.choice(n_rows, size=self.centroids, replace=False)
        initial = self.data[seed_rows, :]

        # storing centroids we get at each stage  --- ADDITIONAL STEP
        self.centroids_ls.append(initial)

        # Work on a float copy so fractional means are not truncated when
        # the data has an integer dtype.
        current = initial.astype(float)
        points = self._nearest_centroid(current)

        for _ in range(self.iterations):
            updated = current.copy()
            for c in range(self.centroids):
                members = self.data[points == c]
                # An empty cluster (possible when rows are duplicated) has
                # no mean; keep its previous centroid instead of writing NaN.
                if len(members):
                    updated[c] = members.mean(axis=0)
            current = updated

            # on every iteration storing new centroids formed
            self.centroids_ls.append(current)

            points = self._nearest_centroid(current)
        return points


In [2]:
from sklearn.datasets import load_iris
#Loading the data
data = load_iris()

#Preparing the data
x = data.data    # feature array handed to the clusterer below
y = data.target  # target labels; not used by the clustering in this cell

# Kmeans.fit() picks its starting centroids through NumPy's global random
# state, so seed it here to get the same `preds` on every Restart & Run All.
np.random.seed(42)

k = Kmeans(centroids=3,iterations=100)

k.load(x)

preds = k.fit()