# Erisoglu 2011 "New" Algorithm

Brief description of task

## Initial setup and configuration

In [1]:
import imports
import numpy as np
import sklearn.datasets as skdatasets
#import sklearn.cluster as skcluster
import sklearn.metrics as skmetrics    
import kmeans
import utils
from initialisations import erisoglu

In [2]:
eris = erisoglu.Erisoglu()

dataset = skdatasets.load_iris()
data = dataset.data
target = dataset.target

K = 3

## A few inpections

In [3]:
main = eris._find_main_axis(data.T)
secondary = eris._find_secondary_axis(data.T, main)

print("Axes are: [%i, %i]" % (main, secondary))
print("Main feature deemed to be: %s" % (dataset.feature_names[main]))

Axes are: [3, 1]
Main feature deemed to be: petal width (cm)


In [4]:
initial, axes = eris._initialise(dataset.data)
center = eris._find_center(data, axes)

print("Center was:", center)
print("Initial seed is:", initial)

Center was: [2.3499999999999996, 2.375]
Initial seed is: 15


## Experiments and results

Attempts to reproduce the published numbers.

### Table 1: Descriptive statistics

In [5]:
for i in range(0, len(dataset.feature_names)):
    
    mean = np.mean(dataset.data.T[i])
    stddev = np.std(dataset.data.T[i])
    cvj = eris.variation_coefficient(dataset.data.T[i])
    
    print('%s: mean(%f), stdddev(%f), cvj(%f), ' % (dataset.feature_names[i], mean, stddev, cvj))
    

sepal length (cm): mean(5.843333), stdddev(0.825301), cvj(0.141238), 
sepal width (cm): mean(3.057333), stdddev(0.434411), cvj(0.142088), 
petal length (cm): mean(3.758000), stdddev(1.759404), cvj(0.468176), 
petal width (cm): mean(1.199333), stdddev(0.759693), cvj(0.633429), 


**TODO:** why is standard deviation slightly different?

### Table 2: Correlations with other variables

In [6]:
for i in [0,1,2]:
    
    corr = eris.correlation_coefficient(data.T[3], data.T[i])
    
    print('%s:\t%f' % (dataset.feature_names[i], corr))

sepal length (cm):	0.817941
sepal width (cm):	-0.366126
petal length (cm):	0.962865


## Main execution

### Find seeds

In [7]:
seeds = eris.generate(dataset.data, K)

print(seeds)

[[5.7 4.4 1.5 0.4]
 [7.7 2.6 6.9 2.3]
 [4.5 2.3 1.3 0.3]]


### Run k-means

In [8]:
Z, U, clusters, iterations = kmeans.cluster(data, K, seeds.copy())

print(U)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 1 1 1 2 1 1 1 1
 1 1 2 2 1 1 1 1 2 1 2 1 2 1 1 2 2 1 1 1 1 1 2 1 1 1 1 2 1 1 1 2 1 1 1 2 1
 1 2]


### Initial metrics

In [9]:
acc = skmetrics.accuracy_score(target, U)
ari = skmetrics.adjusted_rand_score(target, U)

print("Accuracy Score:", acc)
print("Adjusted Rand Index:", ari)

Accuracy Score: 0.44666666666666666
Adjusted Rand Index: 0.7163421126838475
