# Experimental fun with pycluster

## Initial setup and configuration

Import statements:

In [1]:
import sklearn.datasets as skdatasets
import sklearn.cluster as skcluster
import sklearn.metrics as skmetrics
import kmeans
import utils
from initialisations import random, ikmeans

Set up our options:

In [2]:
args = {
    'dataset':'iris',
    'algorithm':'ikmeans',
    'K':3
}

Configuration available:

In [3]:
datasets = {
    'iris':  skdatasets.load_iris,
    'wine':  skdatasets.load_wine,
    'bc':    skdatasets.load_breast_cancer,
}

algorithms = {
    'random': random.generate,
    'ikmeans': ikmeans.generate,
}                

Run some setup code:

In [10]:
dataloader = datasets[args['dataset']]
initialiser = algorithms[args['algorithm']]
K = args['K']

dataset = dataloader()

data = utils.standardise(dataset.data)
target = dataset.target

## Discover some centroids:

In [11]:
centroids = initialiser(data, K)
print("Centroids:\n", centroids)

Centroids:
 [[-0.60777778  0.19       -0.84338983 -0.87833333]
 [-0.5        -0.31045752 -0.45550528 -0.66666667]
 [ 0.71980676 -0.00854701  0.62666667  0.37037037]]


## Run k-means clustering algorithm:

In [6]:
Z, U, clusters, iterations = kmeans.cluster(data, K, centroids)

est1 = skcluster.KMeans(n_clusters=K, n_init=1, init='random')
est1.fit(data)

est2 = skcluster.KMeans(n_clusters=K)
est2.fit(data)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

## Some output

In [7]:
print('Me:\n', U)
print("SKL (naive):\n", est1.labels_)
print("SKL (smarter):\n", est2.labels_)
print("Target:\n", target)

Me:
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1
 1 1 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1 2 2 2 2
 2 2 1 2 2 2 2 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 1 2 2 2 2 2
 2 2]
SKL (naive):
 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 0 0 0 0 2 0 0 0 0
 0 0 2 0 0 0 0 0 2 0 2 0 2 0 0 2 2 0 0 0 0 0 2 2 0 0 0 2 0 0 0 2 0 0 0 2 0
 0 2]
SKL (smarter):
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1 2 2 2 2
 2 2 1 2 2 2 2 2 1 2 1 2 1 2 2 1 1 2 2 2 2 2 1 1 2 2 2 1 2 2 2 1 2 2 2 1 2
 2 1]
Target:
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

## Run metrics

In [8]:
acc_me = skmetrics.accuracy_score(target, U)
acc_them_n = skmetrics.accuracy_score(target, est1.labels_)
acc_them_s = skmetrics.accuracy_score(target, est2.labels_)

print("\nAccuracy Score:")
print("Me:", acc_me, "| SKL (naive):", acc_them_n, "| SKL (smarter):", acc_them_s)

ari_me = skmetrics.adjusted_rand_score(target, U)
ari_them_n = skmetrics.adjusted_rand_score(target, est1.labels_)
ari_them_s = skmetrics.adjusted_rand_score(target, est2.labels_)

print("\nAdjusted Rand Index:")
print("Me:", ari_me, "| SKL (naive):", ari_them_n, "| SKL (smarter):", ari_them_s)

print("\nIterations:")
print("Me:", iterations, "| SKL (naive):", est1.n_iter_ , "| SKL (smarter):", est2.n_iter_ )
print("")


Accuracy Score:
Me: 0.88 | SKL (naive): 0.09333333333333334 | SKL (smarter): 0.8866666666666667

Adjusted Rand Index:
Me: 0.7008666982225341 | SKL (naive): 0.7163421126838475 | SKL (smarter): 0.7163421126838475

Iterations:
Me: 8 | SKL (naive): 7 | SKL (smarter): 6

