In [1]:
import numpy as np

In [15]:
class KMeans:
    """Minimal k-means clustering (Lloyd's algorithm) built on NumPy.

    Parameters
    ----------
    n_clusters : int
        Number of clusters (and therefore centroids) to fit.
    max_iter : int, default 100
        Upper bound on the number of assign/update rounds run by ``fit``.
    random_state : int or None, default None
        Seed used when picking the initial centroids. ``None`` leaves
        NumPy's global RNG untouched; any integer, including ``0``, seeds it.

    Attributes
    ----------
    centroids : ndarray of shape (n_clusters, n_features) or None
        Current centroid positions; ``None`` until ``initialize_clusters``
        or ``fit`` has been called.
    """

    def __init__(self, n_clusters, max_iter=100, random_state=None):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state
        self.centroids = None

    def initialize_clusters(self, X):
        """Pick ``n_clusters`` distinct rows of ``X`` as the starting centroids.

        Raises
        ------
        ValueError
            Raised by ``np.random.choice`` when ``n_clusters`` exceeds the
            number of rows in ``X`` (sampling is without replacement).
        """
        X = np.asarray(X)
        # Compare against None rather than relying on truthiness so that a
        # seed of 0 is honoured instead of being treated as "no seed".
        if self.random_state is not None:
            # NOTE: this seeds NumPy's *global* legacy RNG, exactly as the
            # original implementation did.
            np.random.seed(self.random_state)
        chosen_rows = np.random.choice(len(X), self.n_clusters, replace=False)
        self.centroids = X[chosen_rows]

    def compute_distance_and_get_label(self, X):
        """Return, for each row of ``X``, the index of its nearest centroid.

        Euclidean distances from every point to every centroid are computed
        and the ``argmin`` over the centroids is taken for each point.

        Raises
        ------
        AttributeError
            If no centroids have been initialised yet.
        """
        if self.centroids is None:
            raise AttributeError(
                "KMeans has no centroids yet; call fit() before predict()."
            )
        X = np.asarray(X)
        if X.size == 0:
            # Nothing to label; mirror the empty result of an argmin over
            # an (0, n_clusters) distance matrix.
            return np.zeros(0, dtype=np.intp)
        # Broadcast (n_samples, 1, n_features) against
        # (1, n_clusters, n_features) to get all pairwise differences at once.
        differences = X[:, np.newaxis, :] - self.centroids[np.newaxis, :, :]
        distance = np.linalg.norm(differences, axis=2)
        return np.argmin(distance, axis=1)

    def update_clusters(self, X, labels):
        """Return new centroids as the mean of the points assigned to each cluster.

        A cluster that received no points keeps its current centroid; taking
        the mean of an empty slice would yield NaN, and a NaN centroid then
        wins every subsequent ``argmin``.
        """
        X = np.asarray(X)
        labels = np.asarray(labels)
        new_centroid = np.zeros((self.n_clusters, X.shape[1]))
        for i in range(self.n_clusters):
            members = X[labels == i]
            if len(members):
                new_centroid[i] = members.mean(axis=0)
            else:
                new_centroid[i] = self.centroids[i]
        return new_centroid

    def fit(self, X):
        """Run k-means on ``X``, stopping early once the centroids stop moving."""
        X = np.asarray(X)
        self.initialize_clusters(X)
        for _ in range(self.max_iter):
            labels = self.compute_distance_and_get_label(X)
            new_centroids = self.update_clusters(X, labels)

            # Identical centroids mean the assignments can no longer change.
            if np.array_equal(new_centroids, self.centroids):
                break

            self.centroids = new_centroids

    def predict(self, X):
        """Assign each row of ``X`` to its nearest fitted centroid."""
        return self.compute_distance_and_get_label(X)

In [26]:
random_state = 42

# Draw the sample from a generator seeded with `random_state` so that a fresh
# kernel reproduces the same points; drawing from the unseeded global RNG made
# both the data and the resulting labels differ on every run.
X = np.random.default_rng(random_state).random((100, 2))

clusters = 30

model = KMeans(clusters, 100, random_state)

model.fit(X)

labels = model.predict(X)

labels

array([ 9, 28, 28,  3, 15, 29,  5, 19, 24, 28,  8,  9, 18,  6, 26, 25,  5,
        6, 10, 28, 11, 23, 24, 11, 24, 29, 22, 11, 26, 20, 11, 19, 15, 13,
        1,  6,  6, 15, 29,  5, 26, 25, 23,  5,  4,  3, 18,  1, 23,  1, 11,
        4, 17,  1, 19, 20,  6, 14,  3, 17,  8,  1,  4,  3,  3,  5, 24,  4,
       19, 24,  2, 15, 29, 12, 26, 20, 16, 17, 14, 10,  7, 26, 25,  0,  6,
       16, 19, 16, 21,  8, 14, 24, 27,  9, 23, 13, 27,  5, 19, 19])

In [25]:
# One row per cluster label: column 0 is the label, column 1 its point count.
np.stack(np.unique(labels, return_counts=True), axis=1)

array([[ 0,  1],
       [ 1,  5],
       [ 2,  1],
       [ 3,  5],
       [ 4,  4],
       [ 5,  6],
       [ 6,  6],
       [ 7,  1],
       [ 8,  3],
       [ 9,  3],
       [10,  2],
       [11,  5],
       [12,  1],
       [13,  2],
       [14,  3],
       [15,  4],
       [16,  3],
       [17,  3],
       [18,  2],
       [19,  7],
       [20,  3],
       [21,  1],
       [22,  1],
       [23,  4],
       [24,  6],
       [25,  3],
       [26,  5],
       [27,  2],
       [28,  4],
       [29,  4]])

In [28]:
# Every predicted label must be a valid cluster index in [0, clusters).
((labels >= 0) & (labels < clusters)).all()

True

In [30]:
# Build a (label, count) table so the per-cluster counts can be summed and checked.
agg = np.column_stack(np.unique(labels, return_counts=True))

In [33]:
# The per-cluster counts must add up to the number of samples; compare against
# len(X) rather than a hard-coded 100 so the check follows the data size.
agg[:, 1].sum() == len(X)

True