In [None]:
# We want to implement a class called KMeans that accepts
# max_iter, the number of clusters, and optionally a random state,
# and exposes fit and predict methods to assign clusters to the data points passed to fit.

In [14]:
import numpy as np

class KMeans:
    """Minimal k-means clustering (Lloyd's algorithm) built on NumPy.

    Parameters
    ----------
    n_clusters : int
        Number of clusters (and therefore centroids) to fit.
    max_iter : int, default 300
        Upper bound on the number of assign/update rounds run by ``fit``.
    random_state : int or None, default None
        Seed used to pick the initial centroids. Any integer, including 0,
        gives a reproducible initialization. ``None`` draws from NumPy's
        global random state.

    Attributes
    ----------
    centroids : np.ndarray of shape (n_clusters, n_features) or None
        Current centroid positions; ``None`` until centroids are initialized.
    """

    def __init__(self, n_clusters: int, max_iter: int = 300, random_state: "int | None" = None):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state
        self.centroids = None

    def initialize_centroids(self, X):
        """Pick ``n_clusters`` distinct rows of ``X`` as the starting centroids.

        Raises
        ------
        ValueError
            If ``n_clusters`` is not between 1 and the number of rows in ``X``.
        """
        X = np.asarray(X)
        n_samples = len(X)
        if not 1 <= self.n_clusters <= n_samples:
            raise ValueError(
                f"n_clusters={self.n_clusters} must be between 1 and the "
                f"number of samples ({n_samples})"
            )
        # `is None` (not truthiness) so that a seed of 0 is honoured.
        # A private RandomState yields the same draws as np.random.seed(seed)
        # followed by np.random.choice, without reseeding the global generator.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        picked = rng.choice(n_samples, size=self.n_clusters, replace=False)
        self.centroids = X[picked].astype(float)

    def compute_distance_and_get_new_centroids(self, X) -> np.ndarray:
        """Return, for each row of ``X``, the index of its nearest centroid.

        Despite the name, this returns cluster labels (an integer array of
        length ``len(X)``), not centroid coordinates. Distances are Euclidean.
        """
        X = np.asarray(X)
        # Broadcast (n, 1, d) against (1, k, d) to get every point-centroid offset at once.
        offsets = X[:, np.newaxis, :] - self.centroids[np.newaxis, :, :]
        distance = np.linalg.norm(offsets, axis=2)  # shape (n_samples, n_clusters)
        return np.argmin(distance, axis=1)

    def update_cluster(self, X: np.ndarray, labels: np.ndarray) -> np.ndarray:
        """Compute new centroids as the mean of the points assigned to each cluster.

        A cluster that received no points keeps its current centroid instead of
        becoming NaN (the mean of an empty selection), which would otherwise
        poison every later distance computation.
        """
        X = np.asarray(X)
        labels = np.asarray(labels)
        new_centroids = np.zeros((self.n_clusters, X.shape[1]))
        for idx in range(self.n_clusters):
            members = X[labels == idx]
            if len(members):
                new_centroids[idx] = members.mean(axis=0)
            elif self.centroids is not None:
                new_centroids[idx] = self.centroids[idx]
            else:
                # No previous position to fall back on; mark as undefined.
                new_centroids[idx] = np.nan
        return new_centroids

    def fit(self, X):
        """Fit centroids to ``X`` and return ``self``.

        Alternates nearest-centroid assignment and centroid update until the
        centroids stop changing or ``max_iter`` rounds have run.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or has fewer rows than ``n_clusters``.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        # Initialization happens exactly once per fit.
        self.initialize_centroids(X)
        for _ in range(self.max_iter):
            labels = self.compute_distance_and_get_new_centroids(X)
            new_centroids = self.update_cluster(X, labels)
            # Unchanged centroids mean unchanged assignments: converged.
            if np.array_equal(self.centroids, new_centroids):
                break
            self.centroids = new_centroids
        return self

    def predict(self, X) -> np.ndarray:
        """Return the index of the nearest fitted centroid for each row of ``X``.

        Raises
        ------
        RuntimeError
            If called before centroids have been initialized by ``fit``.
        """
        if self.centroids is None:
            raise RuntimeError("KMeans instance is not fitted yet; call fit() first")
        return self.compute_distance_and_get_new_centroids(np.asarray(X, dtype=float))

In [15]:
# Demo: cluster 100 random 2-D points into 5 groups.
random_state = 42  # single seed shared by data generation and centroid initialization

# Draw the points from a seeded generator so the data, and therefore the
# printed labels, are identical on every Restart & Run All.
X = np.random.default_rng(random_state).random((100, 2))

model = KMeans(5, 100, random_state=random_state)

model.fit(X)

labels = model.predict(X)

print(labels)

[1 0 0 3 2 1 0 1 1 3 2 1 2 1 1 4 0 1 3 0 2 2 1 2 1 1 2 2 0 4 2 1 2 2 3 1 1
 2 1 0 1 4 2 0 0 3 2 3 2 3 2 0 1 3 1 4 1 2 3 3 2 3 0 3 3 0 1 4 2 1 2 2 1 4
 0 4 3 1 2 0 4 1 4 0 1 3 1 3 2 2 2 1 4 1 2 2 4 0 2 1]
