In [227]:
from scipy.io.arff import loadarff
import pandas as pd
import numpy as np
import math

In [474]:
## Function Utils
def rand_centroids(K, X):
    """Draw K random centroids from a normal distribution fitted to X.

    The per-column mean and standard deviation of X (taken along axis 0)
    are used as the location and scale of the normal distribution, and
    one sample per column is drawn for each of the K centroids.

    Args:
        K: number of centroids to draw.
        X: array of data points; statistics are taken along axis 0.

    Returns:
        A numpy array holding the K sampled centroids, one per row.
    """
    mu = np.mean(X, axis=0)
    sigma = np.std(X, axis=0)
    centroids = []
    for _ in range(K):
        # Each draw yields one value per column of X
        centroids.append(np.random.normal(mu, sigma))
    return np.array(centroids)

def euc_distance(X, Y):
    """Pairwise Euclidean distances between the rows of Y and the rows of X.

    Y gains a new axis after its first one so that the subtraction
    broadcasts every row of Y against every row of X; the squared
    differences are then summed over the last axis (axis 2).

    Args:
        X: 2-D array of points, one per row.
        Y: 2-D array of points, one per row, with the same number of
           columns as X.

    Returns:
        Array of shape (len(Y), len(X)) where entry [i, j] is the
        distance between Y[i] and X[j].
    """
    deltas = X - Y[:, np.newaxis]
    return np.sqrt(np.sum(np.square(deltas), axis=2))

def compute_clusters(K, C, X, max_iter=1000):
    """Refine the centroids C on data X until the assignment cost stops changing.

    Each iteration assigns every data point to its nearest centroid,
    replaces every centroid through new_cluster (the mean of its points,
    or a fresh random centroid when it has none), and compares the total
    point-to-centroid distance before and after the update. The loop stops
    as soon as the two totals are equal.

    The refinement is iterative rather than recursive so that a slow or
    non-converging run (for instance when the distances contain NaN, which
    never compares equal) cannot exhaust the interpreter's recursion limit.

    Args:
        K: number of clusters.
        C: array of the K initial centroids, one per row.
        X: array of data points, one per row.
        max_iter: upper bound on the number of update steps. The default
            of 1000 is roughly the depth at which the former recursive
            version would have failed with RecursionError.

    Returns:
        A tuple (C, labels): the final centroids and, for every data
        point, the index of its nearest centroid. If max_iter is reached
        first, the most recent centroids and labels are returned.
    """
    # Distances have shape (len(C), len(X)): one row per centroid
    D = euc_distance(X, C)
    for _ in range(max_iter):
        # Index of the nearest centroid for every data point
        CC = np.argmin(D, axis=0)
        C = np.array([new_cluster(k, X, CC) for k in range(K)])
        D_new = euc_distance(X, C)
        converged = D.sum() == D_new.sum()
        # Carry the fresh distances forward instead of recomputing them
        D = D_new
        if converged:
            break
    return C, np.argmin(D, axis=0)

def new_cluster(k, X, CC):
    """Compute the updated centroid for cluster k.

    Args:
        k: index of the cluster being updated.
        X: array of data points, one per row.
        CC: array giving, for every row of X, the index of its cluster.

    Returns:
        The mean of the rows of X assigned to cluster k, or a freshly
        drawn random centroid when no row is assigned to it.
    """
    members = X[CC == k]
    if len(members) == 0:
        # Empty cluster: re-seed it instead of averaging nothing
        return rand_centroids(1, X)[0]
    return members.mean(axis=0)
    
def k_means(K, X):
    """Run k-means with K clusters on X from randomly initialised centroids.

    Args:
        K: number of clusters.
        X: array of data points, one per row.

    Returns:
        Whatever compute_clusters returns: the final centroids and the
        predicted cluster index of every data point.
    """
    return compute_clusters(K, rand_centroids(K, X), X)

In [475]:
# Read the iris ARFF file and split it into features and integer class labels
iris_data, iris_meta = loadarff('./datasets/iris.arff')
# Feature matrix: the first four fields of every record
data = np.array([[record[col] for col in range(4)] for record in iris_data])
# `labels` holds the distinct values of the fifth field (sorted by np.unique);
# `labels_true` holds, for every record, the index of its value in `labels`
labels, labels_true = np.unique([record[4] for record in iris_data], return_inverse=True)

In [489]:
# Run the k-means implementation defined above for K clusters and show the centroids
K = 2
clusters, labels_pred = k_means(K=K, X=data)
print(clusters)

[[ 6.30103093  2.88659794  4.95876289  1.69587629]
 [ 5.00566038  3.36037736  1.56226415  0.28867925]]


In [490]:
# Reference run: scikit-learn's KMeans with random initialisation on the same data
from sklearn.cluster import KMeans
sk_means = KMeans(n_clusters=K, init='random')
sk_means.fit(data)
print(sk_means.cluster_centers_)

[[ 5.00566038  3.36037736  1.56226415  0.28867925]
 [ 6.30103093  2.88659794  4.95876289  1.69587629]]


In [491]:
# External metric: agreement between the true classes and the predicted clusters
from sklearn.metrics import adjusted_rand_score
adjusted_rand_score(labels_true=labels_true, labels_pred=labels_pred)

0.53992182942071232

In [493]:
# Calinski-Harabasz score (internal metric: needs the samples and the predicted labels)
# scikit-learn renamed the misspelled `calinski_harabaz_score` to
# `calinski_harabasz_score` and later removed the old name, so import the
# correct spelling and fall back to the legacy one on old releases.
try:
    from sklearn.metrics import calinski_harabasz_score
except ImportError:
    from sklearn.metrics import calinski_harabaz_score as calinski_harabasz_score
# Keep the legacy name bound because a later cell still calls it
calinski_harabaz_score = calinski_harabasz_score
calinski_harabasz_score(data, labels_pred)

513.30384335175665

In [540]:
# Score k-means with both metrics for every cluster count from 2 up to MAX_K - 1
MAX_K = 20
accuracies = []
for k in range(2, MAX_K):
    clusters, labels_pred = k_means(k, data)
    rand_score = adjusted_rand_score(labels_true, labels_pred)
    # The internal metric needs the samples as well as the predicted labels;
    # passing only the labels raises a TypeError
    calinski = calinski_harabaz_score(data, labels_pred)
    accuracies.append([rand_score, calinski])
# One row per cluster count: column 0 = adjusted Rand score, column 1 = Calinski score
accuracies = np.array(accuracies)

In [541]:
# Plot both metrics against the number of clusters, each on its own y-axis
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import Scatter, Figure, Layout

# Cluster counts scored by the sweep; materialised as a list so the x values
# are a plain sequence shared by both traces
cluster_sizes = list(range(2, MAX_K))

traces = [
    Scatter(x=cluster_sizes, y=accuracies[:, 0], name='adjusted_rand_score'),
    # The second metric lives on a different scale, so it gets the secondary axis
    Scatter(x=cluster_sizes, y=accuracies[:, 1], name='calinski_score', yaxis='y2'),
]

layout = Layout(
    # The sweep stops at MAX_K - 1, so derive the upper bound instead of hard-coding it
    title='K-means Metrics over cluster size 2-{}'.format(MAX_K - 1),
    xaxis=dict(title='cluster_size'),
    yaxis=dict(title='adjusted_rand_score'),
    yaxis2=dict(title='calinski_score', overlaying='y', side='right')
)

fig = Figure(data=traces, layout=layout)
plot(fig)

'file:///Users/jnalexander/Projects/MAI-machine-learning/temp-plot.html'