In [83]:
import numpy as np


In [84]:
# K-means alternates two steps: assign each point to its nearest centroid, then move each centroid to the mean of its assigned points.

In [85]:
def initialize_centroids(points, k):
    """Pick starting centroids by sampling rows of ``points`` at random.

    A copy of ``points`` is shuffled in place with the global NumPy random
    state and its first ``k`` entries are returned; ``points`` itself is
    left untouched.

    Args:
        points: collection of observations exposing ``.copy()`` (one row per
            point when it is a 2-D array).
        k: number of centroids requested.

    Returns:
        The first ``k`` rows of the shuffled copy. If ``k`` exceeds the
        number of points, fewer than ``k`` rows come back.
    """
    candidates = points.copy()
    # Shuffle reorders along the first axis only, so each row stays intact.
    np.random.shuffle(candidates)
    chosen = candidates[:k]
    return chosen

In [86]:
def cluster_assignment(points, centroids):
    """Label every point with the index of its nearest centroid.

    Args:
        points: 2-D array, one row per point.
        centroids: 2-D array, one row per centroid, with the same number of
            columns as ``points``.

    Returns:
        1-D integer array with one entry per point holding the row index of
        the closest centroid (Euclidean distance). On ties the lowest
        centroid index wins, because ``np.argmin`` returns the first minimum.
    """
    # Insert a middle axis so (k, 1, d) broadcasts against (n, d) -> (k, n, d):
    # every centroid is paired with every point.
    expanded = centroids[:, np.newaxis, :]
    squared_diffs = np.square(points - expanded)
    # Collapse the coordinate axis: row i, column j is the distance from
    # centroid i to point j.
    euclidean = np.sqrt(squared_diffs.sum(axis=2))
    # For each point (column), pick the centroid (row) with smallest distance.
    nearest = np.argmin(euclidean, axis=0)
    return nearest
    

In [87]:
def move_centroid(points, clusters, centroids):
    """Recompute each centroid as the mean of the points assigned to it.

    Args:
        points: 2-D array, one row per point.
        clusters: 1-D array with one centroid index per point.
        centroids: current centroids, one row per centroid. Its length sets
            the number of clusters, and its rows are the fallback for
            clusters that received no points.

    Returns:
        Array with one row per centroid. A cluster with at least one member
        gets the mean of its members; an empty cluster keeps its previous
        centroid.
    """
    updated = []
    for k, previous in enumerate(centroids):
        members = points[clusters == k]
        if members.shape[0] > 0:
            updated.append(members.mean(axis=0))
        else:
            # The mean of zero rows is NaN, and a NaN centroid then wins every
            # np.argmin in the assignment step, collapsing all clusters.
            # Keeping the old position avoids that.
            updated.append(previous)
    return np.array(updated)

In [88]:
def run_kmeans_clustering(data=None, num_clusters=5, num_iterations=100):
    """Run k-means on ``data`` and return the final centroids.

    Starting from centroids chosen by ``initialize_centroids``, alternates
    ``cluster_assignment`` and ``move_centroid`` for at most
    ``num_iterations`` rounds.

    Args:
        data: array of points, one row per point. Required.
        num_clusters: number of centroids to fit.
        num_iterations: upper bound on assignment/update rounds.

    Returns:
        Array of centroids, one row per cluster (not per-point labels).

    Raises:
        ValueError: if ``data`` is None.
    """
    if data is None:
        raise ValueError('Data cannot be none')
    centroids = initialize_centroids(data, num_clusters)
    for _ in range(num_iterations):
        clusters = cluster_assignment(data, centroids)
        previous, centroids = centroids, move_centroid(data, clusters, centroids)
        # Fixed point reached: identical centroids yield identical assignments
        # and therefore identical updates, so remaining rounds are no-ops.
        if np.array_equal(previous, centroids):
            break
    return centroids
            
            

In [89]:
# Seed the global NumPy RNG so the synthetic data (and any later use of
# np.random in this notebook) is reproducible under Restart & Run All.
np.random.seed(0)

# Three Gaussian blobs of 5 two-dimensional points each, stacked into one
# (15, 2) array: standard-normal draws scaled by a spread and shifted to a center.
data = np.vstack((
    np.random.randn(5, 2) * 0.75 + np.array([1, 0]),
    np.random.randn(5, 2) * 0.25 + np.array([-0.5, 0.5]),
    np.random.randn(5, 2) * 0.5 + np.array([-0.5, -0.5]),
))

In [90]:
# Fit 2 clusters with at most 10 rounds. Despite its name, `clusters` holds
# the fitted centroid coordinates (one row per cluster), not per-point labels.
clusters = run_kmeans_clustering(data=data, num_clusters=2, num_iterations=10)
clusters

array([[ 0.32563176,  1.21878261],
       [-0.27550991, -0.1871433 ]])