# Yuan 2004 "New" Algorithm

See: [A new algorithm to get the initial centroids](https://ieeexplore.ieee.org/abstract/document/1382371)

In [1]:
import imports
import numpy as np
import sklearn.datasets as skdatasets
import sklearn.metrics as skmetrics
from initialisations import yuan2004 as yuan
from metrics import accuracy
import kmeans

## Run against Iris

In [2]:
# Load Iris and split the bunch into the feature rows and the true class labels.
dataset = skdatasets.load_iris()
data, target = dataset.data, dataset.target

# Number of clusters requested from both the initialisation and k-means below.
K = 3

### Find distances between all rows

In [3]:
# Build the table of distances between every pair of rows in `data`.
# In the printed output the diagonal (a row against itself) is nan and the
# visible corners mirror each other, i.e. the table appears symmetric.
# NOTE(review): the values look like Euclidean distances - confirm in yuan2004.
distances = yuan.distance_table(data)
print(distances)

[[       nan 0.53851648 0.50990195 ... 4.45982062 4.65080638 4.14004831]
 [0.53851648        nan 0.3        ... 4.49888875 4.71805044 4.15331193]
 [0.50990195 0.3               nan ... 4.66154481 4.84871117 4.29883705]
 ...
 [4.45982062 4.49888875 4.66154481 ...        nan 0.6164414  0.64031242]
 [4.65080638 4.71805044 4.84871117 ... 0.6164414         nan 0.76811457]
 [4.14004831 4.15331193 4.29883705 ... 0.64031242 0.76811457        nan]]



### Find the closest two rows in the data set (U)

In [4]:
# Ask the initialisation module for the closest pair of rows in `data`.
# The result prints as a pair of row indices.
closest = yuan.find_closest(data)

print(closest)
# Index `data` with the pair to show the two rows themselves; in the recorded
# run they are identical, i.e. the data contains a duplicated row.
print(data[closest])

[101, 142]
[[5.8 2.7 5.1 1.9]
 [5.8 2.7 5.1 1.9]]


### Find centroids

In [5]:
# Produce the K initial centroids with the Yuan (2004) initialisation.
# The printed array has one row per centroid and one column per feature.
centroids = yuan.generate(data, K)
print(centroids)

[[5.13684211 3.45789474 1.59473684 0.28947368]
 [4.90789474 3.02105263 1.40526316 0.2       ]
 [4.72105263 3.21578947 1.30526316 0.20263158]]


### Run k-means

In [6]:
# Run k-means seeded with the centroids generated above. A copy is passed so
# that `centroids` keeps its printed values if kmeans.cluster updates its
# argument in place (NOTE(review): not visible from this notebook - confirm).
# U prints as one cluster label per row of `data`.
Z, U, clusters, iterations = kmeans.cluster(data, K, centroids.copy())

# Show the cluster labels followed by the true classes for a by-eye comparison.
# The cluster ids are arbitrary: here cluster 2 lines up with class 0.
print(U)
print(target)

[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 0 0 1 0 0 0 0
 0 0 1 1 0 0 0 0 1 0 1 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0
 0 1]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


### Run some metrics

In [7]:
# accuracy_score and balanced_accuracy_score compare the two label arrays
# position by position, so they are only meaningful when the cluster ids happen
# to coincide with the class ids. In the recorded run they do not (cluster 2
# corresponds to class 0), which is why both come out low.
acc = skmetrics.accuracy_score(target, U)
bas = skmetrics.balanced_accuracy_score(target, U)
# normalize=False yields the number of matching labels instead of the fraction.
num = skmetrics.accuracy_score(target, U, normalize=False)
# The adjusted Rand index does not depend on how the clusters are numbered.
ari = skmetrics.adjusted_rand_score(target, U)

# One tuple of print() arguments per report line, emitted in order.
report = [
    ("Accuracy Score:", acc),
    ("Balanced accuracy score:", bas),
    ("Num classified:", num, "/", len(U)),
    ("Adjusted Rand Index:", ari),
]
for fields in report:
    print(*fields)

Accuracy Score: 0.31333333333333335
Balanced accuracy score: 0.3133333333333333
Num classified: 47 / 150
Adjusted Rand Index: 0.7163421126838475


### Confusion matrix

In [8]:
# Cross-tabulate true classes against cluster labels:
# rows follow `target` (true class), columns follow `U` (k-means cluster id).
cm = skmetrics.confusion_matrix(target, U)
print(cm)

[[ 0  0 50]
 [ 3 47  0]
 [36 14  0]]


### Actual accuracy score

Paper claims: **0.886667**

In [9]:
# Hand check of the accuracy: credit each cluster (a column of `cm`) with its
# largest class count, then divide by the number of rows. The counts are read
# from `cm` instead of being typed in from the printed matrix, so the check
# cannot go stale if the clustering result changes.
print(cm.max(axis=0).sum() / len(data))
# Figure reported by the project's accuracy helper, for comparison.
print(accuracy.from_matrix(cm))


0.8866666666666667
0.8866666666666667


## Run against Wine

Paper claims: **0.685393**

In [10]:
# Repeat the Iris pipeline on the Wine data set: load, initialise, cluster, score.
dataset = skdatasets.load_wine()
data, target = dataset.data, dataset.target

K = 3

# Yuan initialisation, then k-means seeded with a copy of those centroids.
centroids = yuan.generate(data, K)
Z, U, clusters, iterations = kmeans.cluster(data, K, centroids.copy())

# Cluster labels first, then the true classes, each on its own output block.
print(U, target, sep="\n")

# Confusion matrix (true class by cluster id) and the accuracy derived from it.
cm = skmetrics.confusion_matrix(target, U)
print(cm)
print(accuracy.from_matrix(cm))

[0 0 2 2 0 2 2 2 0 0 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 2 2 0 0 2 2 0 2 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 2 2 2 2 2 0 2 0 2 2 1 1 1 1 1 1 1 1 1 0 0 0 1 1 0
 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 0 1 1
 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 0 0 1]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
[[31  1 27]
 [ 7 64  0]
 [11 37  0]]
0.6853932584269663
