In [2]:
from scipy.io.arff import loadarff
import pandas as pd
import numpy as np
import math

In [180]:
## Function Utils
def rand_centroids(K, X):
    # rand_centroids(K=Int, X=Float_array):
    # Return a numpy array of size K with each element 
    # being a normally random distributed var with mu and sigma calculated 
    # from the mean and std of the data X
    mean, std = np.mean(X, axis=0), np.std(X, axis=0)
    clusters = [np.random.normal(mean, std) for n in range(K)]
    return np.array(clusters)

def euc_distance(X, Y):    
    # euc_distance(X=Float_array, Y=Float_array):
    # Returns an array of euclidean distances, 
    # for the square root of the sum of the square of the differences
    # of array X and array Y
    diff = X - Y[:, np.newaxis]
    squared_diff = diff**2
    sum_squared_diff = squared_diff.sum(axis=2)
    return np.sqrt(sum_squared_diff)

def compute_clusters(K, C, X):
    # compute_clusters(K=Int, C=Float_array, X=Float_array)
    # Compute the clusters for cluster size K, clusters C and data X
    # where a new cluster is calculated as the mean of the data points 
    # which share a common nearest cluster. Repeats
    # until the sum of the euc distances between clusters
    # and points does not change
    D = euc_distance(X, C)
    CC = np.argmin(D, axis=0)
    C = []
    for k in range(K):
        x = X[CC==k]
        # ignore cluster if not closest to any
        if (len(x) > 0):
            C.append(x.mean(axis=0))
        
    C = np.array(C)    
    D2 = euc_distance(X, C)
    if (D.sum() == D2.sum()):
        return C
    else:
        return compute_clusters(K, C, X)

def k_means(K, X):
    # k_means(K=Int, X=Float_array)
    # K-means for clust size K on dataset X using random initialised centroids
    C = rand_centroids(K, X)
    return compute_clusters(K, C, X)

In [16]:
# Load iris data set values into a numpy array
iris = loadarff('./datasets/iris.arff')
headers = iris[1]
values = iris[0]
iris_data = []
for val in values:
    iris_data.append([val[0], val[1], val[2], val[3]]) 
iris_data = np.array(iris_data).astype(np.float)

In [198]:
# Perform k-means
clust_size = 3
result = k_means(clust_size, iris_data)
from sklearn.cluster import KMeans
sk_means = KMeans(n_clusters=clust_size, init='random').fit(iris_data)
print(result)
print('==')
print(sk_means.cluster_centers_)

[[ 6.30103093  2.88659794  4.95876289  1.69587629]
 [ 5.00566038  3.36037736  1.56226415  0.28867925]]
==
[[ 5.006       3.418       1.464       0.244     ]
 [ 6.85        3.07368421  5.74210526  2.07105263]
 [ 5.9016129   2.7483871   4.39354839  1.43387097]]
