# K-Means trial run

Run our own implementation of k-means alongside scikit-learn's, starting both from the same initial centroids, to check that the results are consistent.

## 0. Initial setup and configuration

In [1]:
import imports
import sklearn.datasets as skdatasets
import sklearn.cluster as skcluster
import sklearn.metrics as skmetrics    
import kmeans
from metrics import accuracy
from initialisations import random, ikmeans, erisoglu

Available configuration options (datasets and initialisation algorithms):

In [2]:
# Dataset loaders selectable through args['dataset'], keyed by short name.
datasets = dict(
    iris=skdatasets.load_iris,
    wine=skdatasets.load_wine,
    bc=skdatasets.load_breast_cancer,
)

# Centroid initialisation routines selectable through args['algorithm'].
algorithms = dict(
    random=random.generate,
    ikmeans=ikmeans.generate,
    erisoglu=erisoglu.generate,
)

Set up the options for this run:

In [3]:
# Options for this run.
#   dataset:   one of 'iris', 'wine', 'bc'
#   algorithm: one of 'random', 'ikmeans', 'erisoglu'
#   K:         number of clusters to look for
args = dict(
    dataset='iris',
    algorithm='erisoglu',
    K=3,
)

Run some setup code:

In [4]:
# Resolve the options chosen in `args` into the loader, the initialisation
# routine and the cluster count used by the rest of the notebook.
dataloader = datasets[args['dataset']]
initialiser = algorithms[args['algorithm']]
K = args['K']

# Load the dataset and pull out the feature matrix and the true labels.
dataset = dataloader()
data, target = dataset.data, dataset.target

# For the ikmeans initialisation the features are first passed through
# ikmeans.standardise; every later cell then works on that version of `data`.
if args['algorithm'] == 'ikmeans':
    data = ikmeans.standardise(data)

## 1. Discover some centroids

In [5]:
# Ask the selected initialisation routine for the starting centroids.
centroids = initialiser(data, K)

# sep="\n " puts the array on its own line, indented by one space.
print("Centroids:", centroids, sep="\n ")

Centroids:
 [[5.7 4.4 1.5 0.4]
 [7.7 2.6 6.9 2.3]
 [4.5 2.3 1.3 0.3]]


## 2. Run k-means clustering algorithm

In [6]:
# Each implementation receives its own copy of the same starting centroids,
# so the two runs can be compared like for like.

# Homemade version:
result = kmeans.cluster(data, K, centroids.copy())

# SKLearn version: a single run (n_init=1) from the supplied centroids.
est1 = skcluster.KMeans(
    init=centroids.copy(),
    n_init=1,
    n_clusters=K,
)
est1.fit(data)

KMeans(algorithm='auto', copy_x=True,
    init=array([[5.7, 4.4, 1.5, 0.4],
       [7.7, 2.6, 6.9, 2.3],
       [4.5, 2.3, 1.3, 0.3]]),
    max_iter=300, n_clusters=3, n_init=1, n_jobs=None,
    precompute_distances='auto', random_state=None, tol=0.0001, verbose=0)

## 3. Show clustering outcome

In [7]:
# Cluster assignments from both implementations, followed by the true labels.
# sep="\n " puts each array on its own line, indented by one space.
print("Me:", result['labels'], sep="\n ")
print("SKL:", est1.labels_, sep="\n ")
print("Target:", target, sep="\n ")

# Iteration counts reported by each implementation.
print("\nIterations:")
print("Me: {} | SKL: {}".format(result['iterations'], est1.n_iter_))

Me:
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 1 1 1 2 1 1 1 1
 1 1 2 2 1 1 1 1 2 1 2 1 2 1 1 2 2 1 1 1 1 1 2 1 1 1 1 2 1 1 1 2 1 1 1 2 1
 1 2]
SKL:
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 1 1 1 2 1 1 1 1
 1 1 2 2 1 1 1 1 2 1 2 1 2 1 1 2 2 1 1 1 1 1 2 1 1 1 1 2 1 1 1 2 1 1 1 2 1
 1 2]
Target:
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]

Iterations:
Me: 14 | SKL: 14


### Centroids

In [8]:
# Final centroids: scikit-learn first, then the homemade implementation.
print("Centroids (them):", est1.cluster_centers_, sep="\n ")
print("Centroids (me):", result['centroids'], sep="\n ")

Centroids (them):
 [[5.006      3.428      1.462      0.246     ]
 [6.85384615 3.07692308 5.71538462 2.05384615]
 [5.88360656 2.74098361 4.38852459 1.43442623]]
Centroids (me):
 [[5.006      3.428      1.462      0.246     ]
 [6.85384615 3.07692308 5.71538462 2.05384615]
 [5.88360656 2.74098361 4.38852459 1.43442623]]


## 4. Run metrics

### Main metrics

In [9]:
# Accuracy of each labelling against the true classes, using the project's
# own `metrics.accuracy` module.
acc_me = accuracy.score(target, result['labels'])
acc_them = accuracy.score(target, est1.labels_)

print("\nAccuracy Score:")
print("Me: {} | SKL: {}".format(acc_me, acc_them))

# Adjusted Rand Index of each labelling, computed by scikit-learn.
ari_me = skmetrics.adjusted_rand_score(target, result['labels'])
ari_them = skmetrics.adjusted_rand_score(target, est1.labels_)

print("\nAdjusted Rand Index:")
print("Me: {} | SKL: {}".format(ari_me, ari_them))


Accuracy Score:
Me: 0.8866666666666667 | SKL: 0.8866666666666667

Adjusted Rand Index:
Me: 0.7163421126838475 | SKL: 0.7163421126838475


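### Confusion matrices

Rows are the true classes and columns are the cluster indices. K-means numbers its clusters arbitrarily, so the largest counts need not lie on the diagonal; compare the two matrices with each other rather than reading the diagonal as accuracy.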

In [10]:
# Confusion matrix of each labelling: rows are true classes, columns are
# the cluster indices assigned by the respective implementation.
cm_me = skmetrics.confusion_matrix(target, result['labels'])
cm_them = skmetrics.confusion_matrix(target, est1.labels_)

# end=" \n\n" leaves the same blank line between the two matrices.
print("Me:", cm_me, sep="\n ", end=" \n\n")
print("SKL:", cm_them, sep="\n ")

Me:
 [[50  0  0]
 [ 0  3 47]
 [ 0 36 14]] 

SKL:
 [[50  0  0]
 [ 0  3 47]
 [ 0 36 14]]
