# Hatamlou 2012 Binary Search Algorithm

See: [In search of optimal centroids on data clustering using a binary search algorithm](https://www.sciencedirect.com/science/article/pii/S0167865512001961)

In [25]:
import imports
import numpy as np
import sklearn.datasets as skdatasets
import sklearn.metrics as skmetrics
from initialisations import hatamlou2012 as hatamlou
#from metrics import accuracy
import kmeans

In [2]:
# Load the iris dataset bundled with scikit-learn and pull out the
# sample matrix and the reference labels.
dataset = skdatasets.load_iris()
data, target = dataset.data, dataset.target

# Number of clusters / centroids used throughout the notebook.
K = 3

### Define a temporary "objective function"

In [3]:
def objective_function(labels, target):
    """Score a labelling against the reference labels.

    Both arguments are forwarded unchanged, in this order, to
    ``skmetrics.adjusted_rand_score`` and its result is returned.
    """
    score = skmetrics.adjusted_rand_score(labels, target)
    return score
    

### Find the per-attribute min and max of the dataset

In [4]:
# Lower and upper bounds of the dataset as reported by the
# hatamlou2012 initialisation module.
min_ds, max_ds = hatamlou.find_min_max(data)

# Show both bounds, one per line.
print(min_ds, max_ds, sep="\n")

[4.3 2.  1.  0.1]
[7.9 4.4 6.9 2.5]


### Find G, the per-attribute step size: (max − min) / K

In [5]:
# Width of the value range of each attribute ...
attribute_range = max_ds - min_ds

# ... split into K equal steps.
G = attribute_range / K

print(G)

[1.2        0.8        1.96666667 0.8       ]


### Suggest initial centroids

In [28]:
# Place the K initial centroids at evenly spaced steps of size G,
# starting from the minimum of the dataset.
centroid_rows = [min_ds + step * G for step in range(K)]

for centroid_row in centroid_rows:
    print(centroid_row)

# Stack the rows into a single 2-D array, one centroid per row.
centroids = np.array(centroid_rows)

[4.3 2.  1.  0.1]
[5.5        2.8        2.96666667 0.9       ]
[6.7        3.6        4.93333333 1.7       ]


### Clustering function

In [29]:
def do_clustering(data, centroids):
    """Assign each sample to the centroid with the smallest distance.

    Parameters
    ----------
    data : array
        Samples to assign; indexed by row with a boolean mask.
    centroids : sequence
        Candidate centroids; ``len(centroids)`` clusters are produced.

    Returns
    -------
    clusters : list
        One entry per centroid, holding the rows of ``data`` assigned to it.
    assignment : array
        For every sample, the index of the centroid it was assigned to.
    """
    # Distance table comes from the project's kmeans helper; argmin over
    # axis 1 picks the winning centroid index for each row of the table.
    assignment = kmeans.distance_table(data, centroids).argmin(1)

    clusters = [data[assignment == idx, :] for idx in range(len(centroids))]

    return clusters, assignment
        

### Initial assignment of data to centroids

In [30]:
# Assign every sample to its nearest initial centroid.
clusters, labels = do_clustering(data, centroids)

# Score the assignment that was just returned. `U` only exists inside
# do_clustering, so referring to it here fails on a fresh kernel (or silently
# scores a stale array left over from earlier experimentation).
score = objective_function(labels, target)
print(score)

0.3898597211698147


### Main reassignment loops

In [13]:
# TODO: the termination criterion for this search is still undecided.

# Scaffold for the main loop: visit every attribute value of every
# centroid in turn (currently just echoes each value).
for centroid_row in centroids:
    for attribute_value in centroid_row:

        print(attribute_value)

4.3
2.0
1.0
0.1
5.5
2.8000000000000003
2.966666666666667
0.8999999999999999
6.7
3.6000000000000005
4.933333333333334
1.7
