# Bradley & Fayyad 1998 Algorithm


### Questions:

 - What is the sample size? Steinley (2007) says |X|/J, i.e. the number of data points divided by the number of subsamples J.
 - What is SP (the "starting point")?
 - Can sample sets overlap or must they be removed from the pool?
 - Why is FMS never used?
 - Remove duplicate centroids from CM?

In [1]:
import imports
import numpy as np
from sklearn.cluster import KMeans
import sklearn.datasets as skdatasets
import sklearn.metrics as skmetrics 
from metrics import accuracy
from initialisations import bradleyfayyad1998 as bf
from initialisations import random as ran

In [2]:
# Load scikit-learn's bundled Iris dataset and split it into the feature
# matrix (used for clustering) and the labels (used only for scoring).
dataset = skdatasets.load_iris()
data, target = dataset.data, dataset.target

## Configuration/Parameters

In [3]:
K = 3   # number of clusters requested from k-means
J = 10  # number of random subsamples drawn by refine()

# Subsample size |X| / J, following Steinley (2007)
sample_size = len(data) // J

# Starting point handed to refine(). What the paper intends for SP is still
# an open question (see the list at the top), so the project's random
# initialiser is used here -- presumably K rows picked from data; confirm.
SP = ran.generate(data, K)

print(SP)

[[4.8 3.  1.4 0.1]
 [4.9 3.1 1.5 0.2]
 [5.  3.3 1.4 0.2]]


### k_means()

In [4]:
def k_means(seeds, data, K):
    '''Fit standard k-means to data, starting from the given seeds.

    seeds -- initial cluster centres, passed to KMeans as `init`
    data  -- the points to cluster
    K     -- number of clusters

    Returns the fitted KMeans estimator. A single run (n_init=1) is used
    because the starting centres are supplied explicitly.
    '''
    return KMeans(n_clusters=K, init=seeds, n_init=1).fit(data)

### k_means_mod()

In [5]:
def k_means_mod(SP, Si, K):
    '''Cluster the subsample Si from starting point SP; return the centres.

    SP -- initial cluster centres
    Si -- the subsample to cluster
    K  -- number of clusters

    Currently a thin wrapper around k_means() that returns only the
    resulting cluster centres.
    NOTE(review): the paper's KMeansMod is expected to treat empty clusters
    specially; nothing here does that explicitly -- confirm whether the
    underlying estimator's behaviour is sufficient.
    '''
    return k_means(SP, Si, K).cluster_centers_

## Refine()

In [6]:


def refine(SP, data, K, J):
    '''Refine the starting point SP using J random subsamples of data.

    SP   -- initial cluster centres used to seed every subsample run
    data -- 2-D array of points, one row per point
    K    -- number of clusters
    J    -- number of subsamples to draw (must be at least 1)

    Each subsample is clustered from SP, giving J candidate sets of K
    centres. The pooled candidates (CM) are then clustered once per
    candidate set, using that set as the seeds, and the centres of the
    run with the lowest inertia over CM are returned.

    Raises ValueError if J is smaller than 1.
    '''
    if J < 1:
        raise ValueError("J must be at least 1")

    # Subsample size |X| / J (Steinley 2007), derived from the arguments
    # rather than a notebook global. Clamped so a subsample never has fewer
    # than K points (k-means needs them) nor more points than data holds.
    n_points = data.shape[0]
    sample_size = min(n_points, max(K, n_points // J))

    # The J candidate solutions, one array of K centres per subsample
    CMI = []
    for _ in range(J):
        picked = np.random.choice(n_points, sample_size, replace=False)
        CMI.append(k_means_mod(SP, data[picked, :], K))

    # All the centres found so far, with exact duplicate rows removed
    CM = np.unique(np.vstack(CMI), axis=0)

    best = None
    for candidate in CMI:
        km = k_means(candidate, CM, K)
        if best is None or km.inertia_ < best.inertia_:
            best = km

    # Return the winning run, not merely the last one evaluated
    return best.cluster_centers_

In [7]:
# Run the refinement on a copy of the data (so refine() works on its own
# array) and show the refined centres it returns.
solution = refine(SP, data.copy(), K, J)
print(solution)

[[4.73777778 3.12592593 1.42222222 0.21444444]
 [6.34614469 2.84969197 4.95966783 1.64643773]
 [5.31291667 3.75104167 1.45875    0.25354167]]


### Metrics

In [8]:
# Cluster the full dataset once, seeded with the refined centres
final_est = KMeans(n_clusters=K, init=solution, n_init=1).fit(data)

# Score the resulting labels against the known classes
ari = skmetrics.adjusted_rand_score(target, final_est.labels_)
print("Adjusted Rand Index:", ari)

print("Accuracy score:", accuracy.score(target, final_est.labels_))

Adjusted Rand Index: 0.4289511167236898
Accuracy score: 0.6666666666666666
