**K-means Clustering Coding**

Step 1: Select the number of clusters that we want, k.\
Step 2: Randomly choose k distinct points from the data as the initial centroids.\
Step 3: Assign each data point to its nearest centroid.\
Step 4: Recompute the centroid of each cluster as the mean of the points assigned to it.\
Step 5: Repeat Step 3 and Step 4 until a stopping criterion is met: no point changes cluster, the centroids move by less than some threshold, or a maximum number of iterations is reached.

In [3]:
import numpy as np
import random

In [45]:
def train_k_means(X, k, max_iter=300, tol=0.01):
    """Fit k centroids to the rows of X with the standard k-means loop.

    The initial centroids are k distinct rows of X drawn with
    ``random.sample``. Each round assigns every point to its nearest
    centroid (Euclidean distance, lowest index wins ties) and then moves
    each centroid to the mean of its assigned points.

    Args:
        X: 2-D array-like of shape (n, p), one data point per row.
        k: Number of clusters.
        max_iter: Upper bound on the number of assign/update rounds, so
            the loop always terminates.
        tol: Stop as soon as the summed absolute change of all centroid
            coordinates between two rounds is at most this value.

    Returns:
        Float array of shape (k, p) with the final centroids.

    Raises:
        ValueError: If X is not 2-D, or if k is negative or larger than
            the number of rows (the latter raised by ``random.sample``).
    """
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n, p)")
    n = X.shape[0]

    # Fancy indexing copies the rows, so updating centroids never touches X.
    centroids = X[random.sample(range(n), k)]

    for _ in range(max_iter):
        # Squared distances of every point to every centroid, shape (n, k).
        # The square root is skipped: it is monotonic, so the nearest
        # centroid is the same.
        deltas = X[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        clusters = np.argmin(np.sum(deltas ** 2, axis=2), axis=1)

        new_centroids = centroids.copy()
        for c in range(k):
            members = X[clusters == c]
            # A cluster can end up empty (e.g. duplicate points picked as
            # initial centroids). The mean of an empty slice is NaN, which
            # would poison the result, so keep the previous centroid.
            if len(members):
                new_centroids[c] = members.mean(axis=0)

        diff = np.sum(np.absolute(centroids - new_centroids))
        centroids = new_centroids
        if diff <= tol:
            break
    return centroids

In [46]:
def predict(X, centroids):
    """Return, for every row of X, the index of its nearest centroid.

    Distances are Euclidean; when two centroids are equally close the
    lower index is returned (``np.argmin`` picks the first minimum).

    Args:
        X: Array-like of shape (n, p), one data point per row.
        centroids: Array-like of shape (k, p), one centroid per row.

    Returns:
        Integer array of shape (n,) with values in ``range(k)``. An empty
        X yields an empty integer array.
    """
    points = np.asarray(X)
    centers = np.asarray(centroids)
    if points.size == 0:
        # Nothing to label; return early so the result keeps an integer
        # dtype and a flat empty input does not fail the 2-D indexing below.
        return np.empty(0, dtype=np.intp)

    # Broadcast (n, 1, p) against (1, k, p) to get all point-to-centroid
    # differences at once instead of looping over points in Python.
    # This allocates an (n, k, p) temporary.
    deltas = points[:, np.newaxis, :] - centers[np.newaxis, :, :]
    distances = np.sqrt(np.sum(deltas ** 2, axis=2))
    return np.argmin(distances, axis=1)

In [47]:
def classify(x, centroids):
    """Return the row index of the centroid closest to the point x.

    Closeness is the Euclidean distance between x and each row of
    centroids; on a tie ``np.argmin`` returns the first minimal index.
    """
    offsets = centroids - x
    squared_lengths = np.sum(np.square(offsets), axis=1)
    euclidean = np.sqrt(squared_lengths)
    return np.argmin(euclidean)

In [48]:
# Toy data set: three points near (1, 2) and three points near (8, 8).
X = np.array([
    [1, 2], [1, 3], [2, 1],
    [8, 9], [7, 8], [10, 7],
])

# Fit two clusters. The initial centroids are drawn at random, so the
# result can differ between runs.
centroids_hat = train_k_means(X, k=2)
print(centroids_hat)

[[1.33333333 2.        ]
 [8.33333333 8.        ]]
