In [1]:
import numpy as np


def kmeans(data, numofclasses, options=None):
    """
    Calculates the clusters using k-means algorithm and returns cluster labels and centroids
    :param data: Data to cluster structured as [Number of observations x Data dimensions(variables)].
                 Any number of dimensions is supported; values are converted to floats so that
                 centroids of integer data are not truncated.
    :param numofclasses: Number of classes you want to cluster the data into.
    :param options: Optional data in dictionary form to overwrite the defaults
                    max_iteration - int: Maximum number of iterations (default 100).
                                    The spelling "max_iterations" is accepted as an alias;
                                    "max_iteration" wins when both are given.
                    all_stages - bool: If true returns labels and clusters for iterations
    :return: {"numofiterations": number of iterations used to define cluster labels,
              "centroids": Centroids of the clusters for final or all iterations,
              "labels": Cluster labels for the final or all iterations }
             When all_stages is false and no iteration ran (max_iteration <= 0),
             "labels" is an empty list and "centroids" holds the seed points.
    :raises ValueError: If data is not a non-empty 2-D array or numofclasses < 1.
    """

    allCentroids = []
    allLabels = []

    # Defaults
    max_iteration = 100
    all_stages = False

    # Overwrite defaults according to options
    if options is not None:
        # The docstring historically advertised "max_iterations" while the code
        # read "max_iteration"; honour both so neither kind of caller breaks.
        if "max_iterations" in options:
            max_iteration = options["max_iterations"]
        if "max_iteration" in options:
            max_iteration = options["max_iteration"]
        if "all_stages" in options:
            all_stages = options["all_stages"]

    data = np.asarray(data, dtype=float)
    if data.ndim != 2 or data.shape[0] == 0:
        raise ValueError("data must be a non-empty 2-D array "
                         "[observations x dimensions]")
    if numofclasses < 1:
        raise ValueError("numofclasses must be at least 1")

    # Randomly pick up seed mean points. Distinct observations are used whenever
    # there are enough of them, so two clusters never start on the same point.
    numofpoints = data.shape[0]
    seeds = np.random.choice(numofpoints, size=numofclasses,
                             replace=numofclasses > numofpoints)
    # Fancy indexing copies, so updating the centroids never touches `data`.
    singleCentroid = data[seeds, :]
    singleLabel = []

    # Number of iterations
    iteration = 0

    while iteration < max_iteration:

        # Reference cluster centers: the centroids before this update
        refcenters = np.copy(singleCentroid)

        # Squared distance between every data point and every centroid, shape
        # [observations x classes]. The square root is skipped because it does
        # not change which centroid is the nearest.
        offsets = data[:, np.newaxis, :] - singleCentroid[np.newaxis, :, :]
        dist = np.sum(np.square(offsets), axis=2)

        # Data point label to the minimum distance
        singleLabel = np.argmin(dist, axis=1)

        for i in range(numofclasses):
            members = data[singleLabel == i]
            # A cluster that attracted no points keeps its previous centroid
            if len(members) > 0:
                singleCentroid[i] = members.mean(axis=0)
        # endfor

        if all_stages:
            allLabels.append(singleLabel)
            allCentroids.append(np.copy(singleCentroid))

        iteration += 1

        # Converged: this update did not move the centroids
        if np.allclose(refcenters, singleCentroid):
            break
    # endwhile

    if all_stages:
        return {"numofiterations": iteration, "centroids": allCentroids, "labels": allLabels}
    else:
        return {"numofiterations": iteration, "centroids": singleCentroid, "labels": singleLabel}
    # endifelse

# enddef

#
# Data generation and running k-means
#

import matplotlib.pyplot as plt
import glob, os

# Start from a clean folder: remove every PNG left in the working directory
for stale_png in glob.glob("*.png"):
    os.remove(stale_png)

numofclasses = 5

# Data generation: one Gaussian blob of 100 points per class.
# `mean` is a flat list of (x, y) centre pairs, one pair per class.
mean = [-25, -25, 25, 25, -25, 25, 25, -25, 0, 0]
cov = [[50, 0], [0, 50]]
colors = ['r', 'g', 'b', 'brown', 'm']

x = []
y = []
for k in range(numofclasses):
    blob = np.random.multivariate_normal(mean[2 * k:2 * k + 2], cov, 100)
    x.extend(blob[:, 0])
    y.extend(blob[:, 1])
x = np.asarray(x)
y = np.asarray(y)

# Stack the coordinates as [observations x 2] and cluster, keeping every stage
data = np.vstack((x, y)).T
print(data)
output = kmeans(data, numofclasses, {"all_stages": True})

# Display original data (unlabelled) and save it
plt.plot(data[:, 0], data[:, 1], 'x', color='black')
plt.axis('equal')
plt.savefig('data.png')
plt.clf()

# Display the output stages: one image per iteration, named by its index
numofiterations = output["numofiterations"]
for stage in range(numofiterations):
    plt.clf()
    stage_centers = output["centroids"][stage]
    stage_labels = output["labels"][stage]
    for n in range(numofclasses):
        in_cluster = stage_labels == n
        # Members of cluster n as crosses, its centroid as a dot, same colour
        plt.plot(data[:, 0][in_cluster], data[:, 1][in_cluster], 'x', color=colors[n])
        plt.plot(stage_centers[n, 0], stage_centers[n, 1], 'o', color=colors[n])
    plt.axis('equal')
    plt.savefig(str(stage) + ".png")


[[-2.84969298e+01 -2.85504144e+01]
 [-1.22580552e+01 -2.91292734e+01]
 [-2.15066603e+01 -1.20906730e+01]
 [-2.20866209e+01 -2.17759524e+01]
 [-1.56359742e+01 -2.38363248e+01]
 [-1.35869114e+01 -3.62336556e+01]
 [-2.98130748e+01 -1.51657138e+01]
 [-2.56492927e+01 -1.41130851e+01]
 [-1.57623544e+01 -3.24853320e+01]
 [-3.63746975e+01 -3.04915635e+01]
 [-3.19729108e+01 -2.15030297e+01]
 [-1.80152369e+01 -2.77555744e+01]
 [-1.87328609e+01 -2.70070886e+01]
 [-2.73271833e+01 -2.67158139e+01]
 [-2.11846036e+01 -1.88060034e+01]
 [-2.72590298e+01 -1.60621738e+01]
 [-3.09030672e+01 -3.73202951e+01]
 [-3.22976855e+01 -1.17467598e+01]
 [-3.28047910e+01 -1.85407637e+01]
 [-1.41913036e+01 -1.76474308e+01]
 [-2.07788441e+01 -2.79018493e+01]
 [-2.43354346e+01 -3.22661236e+01]
 [-2.31458604e+01 -3.05609313e+01]
 [-3.68258511e+01 -2.93453879e+01]
 [-1.17838878e+01 -2.35136507e+01]
 [-2.11107110e+01 -2.84470956e+01]
 [-1.95625208e+01 -2.89627364e+01]
 [-3.23410927e+01 -3.03420777e+01]
 [-2.15650865e+01 -1