In [2]:
from scipy.io.arff import loadarff
import pandas as pd
import numpy as np
import math

In [41]:
## Function Utils
def rand_centroids(K, X):
    """Draw K random starting centroids shaped like the rows of X.

    rand_centroids(K=Int, X=Float_array)

    Each centroid is sampled from a normal distribution whose per-column
    mean and standard deviation are taken from the data X (computed along
    axis 0), so the starting points land in roughly the same region as
    the data.

    Returns a numpy array holding the K sampled centroids, one per row.
    """
    mu = np.mean(X, axis=0)
    sigma = np.std(X, axis=0)
    draws = []
    for _ in range(K):
        # One independent draw per centroid, broadcast over the columns.
        draws.append(np.random.normal(mu, sigma))
    return np.array(draws)

def euc_distance(X, Y):
    """Pairwise Euclidean distances between the rows of Y and the rows of X.

    euc_distance(X=Float_array, Y=Float_array)

    Y gets an extra axis inserted after its first one so that subtracting
    it from X broadcasts every row of Y against every row of X. The
    squared differences are summed over axis 2 and square-rooted, so
    entry [j, i] of the result is the distance between Y[j] and X[i].
    """
    offsets = X - Y[:, np.newaxis]
    return np.sqrt(np.square(offsets).sum(axis=2))

def compute_clusters(K, C, X, verbose=True, max_iter=300):
    """Refine the centroids C on data X until they stop moving.

    compute_clusters(K=Int, C=Float_array, X=Float_array)

    Every pass assigns each row of X to its nearest centroid (Euclidean
    distance) and then moves each centroid to the mean of the rows that
    were assigned to it. Iteration stops as soon as a pass leaves all
    centroids unchanged, or after ``max_iter`` passes.

    Parameters
    ----------
    K : int
        Number of clusters; C is expected to hold K rows.
    C : array-like, shape (K, n_features)
        Starting centroids. The caller's array is not modified.
    X : array-like, shape (n_samples, n_features)
        Data points to cluster.
    verbose : bool, optional
        When true (the default) print the centroids before and after each
        update together with the summed point-to-centroid distances.
    max_iter : int, optional
        Upper bound on the number of passes, as a guard against a run
        that never settles.

    Returns
    -------
    numpy.ndarray, shape (K, n_features)
        The final centroids. A centroid that never attracts any point
        keeps the position it was given.
    """
    X = np.asarray(X, dtype=float)
    C = np.array(C, dtype=float)  # private copy; updated pass by pass

    # A loop instead of self-recursion: no dependence on the interpreter's
    # recursion limit, and the number of passes can be capped.
    for _ in range(max_iter):
        if verbose:
            print('=> OldCluster:', C)

        # D[k, i] is the distance from centroid k to point i, so the argmin
        # over axis 0 is the index of the nearest centroid for every point.
        D = euc_distance(X, C)
        CC = np.argmin(D, axis=0)

        new_C = C.copy()
        for k in range(K):
            members = X[CC == k]
            # The mean of an empty selection is NaN, which would poison every
            # later distance; an empty cluster keeps its previous centroid.
            if len(members) > 0:
                new_C[k] = members.mean(axis=0)

        if verbose:
            print('=> NewCluster:', new_C)
            print('=> DistanceSums:', D.sum(), euc_distance(X, new_C).sum())

        # Compare the centroids themselves rather than summed distances:
        # equal sums only indirectly indicate convergence, and a NaN sum
        # never compares equal to itself.
        if np.array_equal(new_C, C):
            return new_C
        C = new_C

    return C

def k_means(K, X):
    """Run k-means with K clusters on dataset X.

    k_means(K=Int, X=Float_array)

    The starting centroids are drawn at random by rand_centroids and then
    handed to compute_clusters, whose result is returned unchanged.
    """
    return compute_clusters(K, rand_centroids(K, X), X)

In [47]:
# Load iris data set values into a numpy array
# loadarff returns a pair: the records at index 0 and the metadata at index 1.
iris = loadarff('./datasets/iris.arff')
headers = iris[1]
values = iris[0]
# Keep only the first four fields of every record; any remaining field
# (for iris, presumably the class label) is dropped so the array is numeric.
# The builtin float is used for the cast because the np.float alias was
# removed from NumPy and raises AttributeError on current releases.
iris_data = np.array(
    [[val[0], val[1], val[2], val[3]] for val in values]
).astype(float)

In [49]:
# Perform k-means
# Cluster the iris measurements into K=2 groups; the returned centroid array
# is the cell's displayed result.
# NOTE(review): the starting centroids are drawn from numpy's global random
# state and no seed is set in the visible cells, so the printed trace and the
# final centroids can differ between runs.
k_means(2, iris_data)

('=> OldCluster:', array([[ 6.04833039,  3.29899686,  0.63365464,  0.65314573],
       [ 5.8665646 ,  2.68828682,  2.44194179,  1.01416518]]))
('=> NewCluster:', array([[ 5.02272727,  3.45454545,  1.43636364,  0.23636364],
       [ 6.18396226,  2.88773585,  4.72264151,  1.59811321]]))
('=> DistanceSums:', 880.11748145615582, 724.1276926276804)
('=> OldCluster:', array([[ 5.02272727,  3.45454545,  1.43636364,  0.23636364],
       [ 6.18396226,  2.88773585,  4.72264151,  1.59811321]]))
('=> NewCluster:', array([[ 5.00784314,  3.4       ,  1.49411765,  0.26078431],
       [ 6.27373737,  2.87575758,  4.92525253,  1.68181818]]))
('=> DistanceSums:', 724.1276926276804, 728.73145837830452)
('=> OldCluster:', array([[ 5.00784314,  3.4       ,  1.49411765,  0.26078431],
       [ 6.27373737,  2.87575758,  4.92525253,  1.68181818]]))
('=> NewCluster:', array([[ 5.00566038,  3.36037736,  1.56226415,  0.28867925],
       [ 6.30103093,  2.88659794,  4.95876289,  1.69587629]]))
('=> DistanceSums:', 7

array([[ 5.00566038,  3.36037736,  1.56226415,  0.28867925],
       [ 6.30103093,  2.88659794,  4.95876289,  1.69587629]])