# Yuan 2004 "New" Algorithm

See: [A new algorithm to get the initial centroids](https://ieeexplore.ieee.org/abstract/document/1382371)

In [1]:
import imports

import numpy as np
import sklearn.datasets as skdatasets
import sklearn.metrics as skmetrics
from sklearn.cluster import KMeans

from initialisations import yuan2004 as yuan
import kmeans
from metrics import accuracy

## Run against Iris

In [2]:
# Number of clusters to look for; reused by every clustering call below.
K = 3

# Load the Iris bunch and split it into the feature matrix and class labels.
dataset_i = skdatasets.load_iris()
data_i, target_i = dataset_i.data, dataset_i.target

### Find distances between all rows

In [3]:
# Build the pairwise distance table for every row of the Iris data.
# The printed table is symmetric with NaN on the diagonal (see the output), and
# its entries look consistent with Euclidean distance -- TODO confirm in yuan2004.
distances = yuan.distance_table(data_i)
print(distances)

[[       nan 0.53851648 0.50990195 ... 4.45982062 4.65080638 4.14004831]
 [0.53851648        nan 0.3        ... 4.49888875 4.71805044 4.15331193]
 [0.50990195 0.3               nan ... 4.66154481 4.84871117 4.29883705]
 ...
 [4.45982062 4.49888875 4.66154481 ...        nan 0.6164414  0.64031242]
 [4.65080638 4.71805044 4.84871117 ... 0.6164414         nan 0.76811457]
 [4.14004831 4.15331193 4.29883705 ... 0.64031242 0.76811457        nan]]



### Find the closest two rows in U (the full set of data points)

In [4]:
# Locate the closest pair of rows in the data set.
# The result is a pair of row indices: it is used directly to index data_i
# on the last line so the two matching rows can be inspected.
closest = yuan.find_closest(data_i)

print(closest)
print(data_i[closest])

[7, 39]
[[5.  3.4 1.5 0.2]
 [5.1 3.4 1.5 0.2]]


### Find centroids

In [5]:
# Generate K initial centroids from the data with the Yuan (2004) initialisation.
# These are the seeds handed to scikit-learn's KMeans further down.
centroids_i = yuan.generate(data_i, K)
print(centroids_i)

[[4.98421053 3.37631579 1.49210526 0.24473684]
 [6.4        2.97631579 5.35263158 2.01842105]
 [5.74473684 2.7        4.10526316 1.26842105]]


### Run k-means

In [6]:
# Seed scikit-learn's k-means with the precomputed centroids. A single run
# (n_init=1) is requested because the starting centroids are given explicitly.
# fit() returns the fitted estimator, so construction and fitting are chained.
est_i = KMeans(init=centroids_i, n_init=1, n_clusters=K).fit(data_i)

# Cluster assignments first, ground-truth classes second, one array per line.
print(est_i.labels_, target_i, sep="\n")

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 1 1 1 2 1 1 1 1
 1 1 2 2 1 1 1 1 2 1 2 1 2 1 1 2 2 1 1 1 1 1 2 1 1 1 1 2 1 1 1 2 1 1 1 2 1
 1 2]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


### Run some metrics

In [7]:
# NOTE: accuracy_score compares the two label arrays element-wise, but k-means
# cluster IDs are arbitrary, so acc and num are only meaningful once each
# cluster has been matched to a class (compare the low score printed below).
acc = skmetrics.accuracy_score(target_i, est_i.labels_)
# normalize=False returns the count of matching labels instead of the fraction.
num = skmetrics.accuracy_score(target_i, est_i.labels_, normalize=False)
# The adjusted Rand index is invariant to permutations of the cluster IDs,
# so it is unaffected by the labelling issue above.
ari = skmetrics.adjusted_rand_score(target_i, est_i.labels_)

print("Accuracy Score:", acc)
print("Num classified:", num, "/", len(est_i.labels_))
print("Adjusted Rand Index:", ari)

Accuracy Score: 0.44666666666666666
Num classified: 67 / 150
Adjusted Rand Index: 0.7163421126838475


### Actual accuracy score

Paper claims: **0.886667**

In [8]:
# Hand-computed cross-check: 50, 47 and 36 are presumably the number of samples
# per class that fall in that class's majority cluster, read off the label
# arrays printed earlier -- TODO re-derive if the clustering changes.
print((50 + 47 + 36)/len(data_i))
# Project-local accuracy metric. NOTE(review): it appears to align cluster IDs
# with classes before scoring (it agrees with the hand count) -- confirm in
# metrics.accuracy.
print(accuracy.score(target_i, est_i.labels_))

0.8866666666666667
0.8866666666666667


### ...and with my k-means

In [9]:
# Repeat the clustering with the project-local k-means implementation.
# NOTE(review): centroids_i is not passed here, so presumably kmeans.cluster
# selects its own initial centroids -- confirm against the kmeans module.
mine = kmeans.cluster(data_i, K)

# The cluster assignment is read from the 'labels' entry of the result.
print(accuracy.score(target_i, mine['labels']))

0.8866666666666667


## Run against Wine

Paper claims: **0.685393** (the run below scores 0.702247, slightly higher than the paper's figure)

In [10]:
# Same pipeline as for Iris, applied to the Wine data set.
K = 3

# Load the Wine bunch and split it into the feature matrix and class labels.
dataset_w = skdatasets.load_wine()
data_w, target_w = dataset_w.data, dataset_w.target

# Yuan initialisation, then a single k-means run started from those centroids.
# fit() returns the fitted estimator, so construction and fitting are chained.
centroids_w = yuan.generate(data_w, K)
est_w = KMeans(init=centroids_w, n_init=1, n_clusters=K).fit(data_w)

# Cluster assignments, then ground-truth classes, then the aligned accuracy.
print(est_w.labels_, target_w, sep="\n")
print(accuracy.score(target_w, est_w.labels_))

[1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 1 1 2 2 1 1 2 1 1 1 1 1 1 2 2
 1 1 2 2 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 2 0 2 0 0 2 0 0 2 2 2 0 0 1
 2 0 0 0 2 0 0 2 2 0 0 0 0 0 2 2 0 0 0 0 0 2 2 0 2 0 2 0 0 0 2 0 0 0 0 2 0
 0 2 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 2 0 0 2 2 2 2 0 0 0 2 2 0 0 2 2 0 2
 2 0 0 0 0 2 2 2 0 2 2 2 0 2 0 2 2 0 2 2 2 2 0 0 2 2 2 2 2 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
0.702247191011236
