# K-Means trial run

Run our own implementation of k-means alongside scikit-learn's, starting both from the same initial centroids, to check that the results are consistent.

## 0. Initial setup and configuration

In [1]:
import imports  # NOTE(review): project-local module, presumably imported for its side effects — keep it first
import scipy.optimize as spoptimize
import sklearn.datasets as skdatasets
import sklearn.cluster as skcluster
import sklearn.metrics as skmetrics
import kmeans
import utils
from initialisations import random, ikmeans, erisoglu

Available configuration options (datasets and centroid initialisation algorithms):

In [2]:
# Registry of selectable datasets: short name -> scikit-learn loader function.
# The loader is stored uncalled, so nothing is loaded until a dataset is chosen.
datasets = dict(
    iris=skdatasets.load_iris,
    wine=skdatasets.load_wine,
    bc=skdatasets.load_breast_cancer,
)

# Registry of selectable centroid initialisers: short name -> generate function.
algorithms = dict(
    random=random.generate,
    ikmeans=ikmeans.generate,
    erisoglu=erisoglu.generate,
)

Set up the options for this run:

In [3]:
# Options for this run. Each value must be a key of the matching registry:
#   dataset:   'iris' | 'wine' | 'bc'
#   algorithm: 'random' | 'ikmeans' | 'erisoglu'
#   K:         number of clusters to look for
args = dict(
    dataset='iris',
    algorithm='ikmeans',
    K=3,
)

Run some setup code:

In [4]:
# Resolve the run options into concrete objects.
K = args['K']
initialiser = algorithms[args['algorithm']]
dataloader = datasets[args['dataset']]

# Load the chosen dataset and split it into ground-truth labels and features.
dataset = dataloader()
target = dataset.target

# Features go through the project's standardise helper before clustering.
data = utils.standardise(dataset.data)

## 1. Generate the initial centroids

In [5]:
# Ask the configured initialisation algorithm for the starting centroids.
# The same centroids are later handed to both k-means implementations.
centroids = initialiser(data, K)
# Show them so the starting point of this run is recorded in the notebook.
print("Centroids:\n", centroids)

Centroids:
 [[-0.60777778  0.19       -0.84338983 -0.87833333]
 [-0.5        -0.31045752 -0.45550528 -0.66666667]
 [ 0.71980676 -0.00854701  0.62666667  0.37037037]]


## 2. Run the k-means clustering algorithm

In [6]:
# Our own implementation. It is given a copy of the initial centroids so the
# original array stays available for the scikit-learn run below.
result = kmeans.cluster(data, K, centroids.copy())

# scikit-learn reference run: a single initialisation (n_init=1) seeded with
# the very same starting centroids, so the two runs are directly comparable.
est1 = skcluster.KMeans(
    init=centroids.copy(),
    n_clusters=K,
    n_init=1,
)
# Left as the last bare expression so the fitted estimator is displayed.
est1.fit(data)

KMeans(algorithm='auto', copy_x=True,
    init=array([[-0.60778,  0.19   , -0.84339, -0.87833],
       [-0.5    , -0.31046, -0.45551, -0.66667],
       [ 0.71981, -0.00855,  0.62667,  0.37037]]),
    max_iter=300, n_clusters=3, n_init=1, n_jobs=None,
    precompute_distances='auto', random_state=None, tol=0.0001, verbose=0)

## 3. Show clustering outcome

In [7]:
# Cluster assignments from both implementations, followed by the ground truth.
for heading, assignments in (
    ('Me:\n', result['labels']),
    ("SKL:\n", est1.labels_),
    ("Target:\n", target),
):
    print(heading, assignments)

# Number of iterations each implementation ran for.
print("\nIterations:")
print("Me:", result['iterations'], "| SKL:", est1.n_iter_)

Me:
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1
 1 1 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1 2 2 2 2
 2 2 1 2 2 2 2 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 1 2 2 2 2 2
 2 2]
SKL:
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1
 1 1 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1 2 2 2 2
 2 2 1 2 2 2 2 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 1 2 2 2 2 2
 2 2]
Target:
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]

Iterations:
Me: 9 | SKL: 9


## 4. Run metrics

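### Main metrics

Note: k-means cluster ids are arbitrary, so the accuracy score depends on how cluster ids are matched to class labels. The Adjusted Rand Index does not change when the clusters are relabelled.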

In [8]:
def align_cluster_labels(true_labels, cluster_labels):
    """Rename cluster ids so they agree with the true classes as far as possible.

    K-means numbers its clusters arbitrarily, so comparing raw cluster ids with
    class labels only gives a meaningful accuracy when the numbering happens to
    line up. This finds the one-to-one cluster -> class mapping that maximises
    the number of matching samples (Hungarian algorithm) and applies it.

    Parameters:
        true_labels: ground-truth class label of every sample.
        cluster_labels: cluster id assigned to every sample.

    Returns:
        A list with one entry per sample: its cluster id renamed to the
        matched class label.
    """
    # Work over the union of both label sets so the overlap table is square
    # even when the number of clusters differs from the number of classes.
    label_values = sorted(set(true_labels) | set(cluster_labels))
    overlap = skmetrics.confusion_matrix(
        true_labels, cluster_labels, labels=label_values)

    # linear_sum_assignment minimises total cost, so negate the counts in
    # order to maximise the total agreement instead.
    class_idx, cluster_idx = spoptimize.linear_sum_assignment(-overlap)
    relabel = {
        label_values[col]: label_values[row]
        for row, col in zip(class_idx, cluster_idx)
    }
    return [relabel[label] for label in cluster_labels]


# Accuracy is only meaningful once cluster ids have been matched to classes.
acc_me = skmetrics.accuracy_score(
    target, align_cluster_labels(target, result['labels']))
acc_them = skmetrics.accuracy_score(
    target, align_cluster_labels(target, est1.labels_))

print("\nAccuracy Score:")
print("Me:", acc_me, "| SKL:", acc_them)

# The Adjusted Rand Index is invariant to relabelling, so the raw cluster ids
# can be used directly.
ari_me = skmetrics.adjusted_rand_score(target, result['labels'])
ari_them = skmetrics.adjusted_rand_score(target, est1.labels_)

print("\nAdjusted Rand Index:")
print("Me:", ari_me, "| SKL:", ari_them)


Accuracy Score:
Me: 0.88 | SKL: 0.88

Adjusted Rand Index:
Me: 0.7008666982225341 | SKL: 0.7008666982225341


### Confusion matrices

In [9]:
# Rows are the true classes; columns are the cluster ids exactly as each
# implementation produced them (no relabelling is applied here).
cm_me = skmetrics.confusion_matrix(y_true=target, y_pred=result['labels'])
cm_them = skmetrics.confusion_matrix(y_true=target, y_pred=est1.labels_)

# Our table first, then scikit-learn's, separated by a blank line.
print("Me:\n", cm_me, "\n")
print("SKL:\n", cm_them)

Me:
 [[50  0  0]
 [ 0 40 10]
 [ 0  8 42]] 

SKL:
 [[50  0  0]
 [ 0 40 10]
 [ 0  8 42]]
