In [1]:
import imports

import math

import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from scipy.special import erfcinv

from datasets import loader
from initialisations import khanahmad2004 as ka

In [2]:
dataset = loader.load_iris()
data = normalize(dataset.data)

K = 3

### k_means_clustering()

In [3]:

def k_means_clustering(data, means, K):
    est = KMeans(K, init=means, n_init=1)
    est.fit(data)
    return est.labels_
    

### cluster_numeric_attribute()

In [4]:

def cluster_numeric_attribute(attrib, data):

    xs = []

    mn = np.mean(attrib)
    sd = np.std(attrib)
    
    for i in range(0, K):
        percentile = (2*(i+1)-1) / (2*K)
        z = math.sqrt(2) * erfcinv(2*percentile)
        xs.append(z * sd * mn)
        
    ad = attrib.reshape(-1, 1)        
    seeds = np.array(xs).reshape(-1, 1)
    
    return k_means_clustering(ad, seeds, K)
    

### generate_cluster_string()

In [5]:
def generate_cluster_string(mystr, data):
    '''
    Find new centers corresponding to this attribute's cluster 
    allotments and allot data objects based on cluster allotments
    
    TODO: this is just calculating means. Vectorise it
    '''
    
    num_attrs = data.shape[1]
    
    clust = np.zeros((K, num_attrs))
    count = [0] * K
        
    # for each data point label
    for i in range(0, len(mystr)):
        
        # for each attribute
        for j in range(0, num_attrs): 
            clust[mystr[i]][j] += data[i][j]

        count[mystr[i]] += 1

    # same loops again to get means
    for i in range(0, K):
        for j in range(0, num_attrs): 
            clust[i][j] = clust[i][j]/count[i]

    return k_means_clustering(data, clust, K)
   

### extract_cluster_strings()

In [6]:

def extract_cluster_strings(cluster_string, data):
    '''Extract clustering strings for the whole data'''
    
    num_samples = data.shape[0]
    num_attrs = data.shape[1]
    
    cstr = []

    for i in range(0, num_samples):
        cstr.append("")
        
        for j in range(0, num_attrs-1):
            cstr[i] = cstr[i] + str(cluster_string[i][j]) + ","
        
        cstr[i] += str(cluster_string[i][num_attrs-1])

    print(cstr)


In [7]:
# Main CCIA method
num_samples = data.shape[0]
num_attrs = data.shape[1]

cluster_string = np.zeros((num_samples, num_attrs))

# Find centers corresponding to each attribute
for i in range (0, num_attrs):
    val = data[:,i]
    
    mystr = cluster_numeric_attribute(val, data)
    #print(mystr)
    
    membership = generate_cluster_string(mystr, data)
    #print(membership)
    
    for l in range(0, num_samples):
        cluster_string[l][i] = membership[l]
    
# end for each attribute
 
cstr = extract_cluster_strings(cluster_string, data)
print(cstr)
#Map<String, Integer> distinctClassStr = findUniqueClusterStrings(cstr);
#double [][] initCenters = findInitialCenters(cstr,distinctClassStr, data);
#return initCenters;
    

['1.0,1.0,0.0,0.0', '1.0,2.0,0.0,0.0', '1.0,1.0,0.0,0.0', '1.0,2.0,0.0,0.0', '1.0,1.0,0.0,0.0', '1.0,1.0,0.0,0.0', '1.0,1.0,0.0,0.0', '1.0,2.0,0.0,0.0', '1.0,2.0,0.0,0.0', '1.0,2.0,0.0,0.0', '1.0,1.0,0.0,0.0', '1.0,1.0,0.0,0.0', '1.0,2.0,0.0,0.0', '1.0,1.0,0.0,0.0', '1.0,1.0,0.0,0.0', '1.0,1.0,0.0,0.0', '1.0,1.0,0.0,0.0', '1.0,1.0,0.0,0.0', '1.0,2.0,0.0,0.0', '1.0,1.0,0.0,0.0', '1.0,2.0,0.0,0.0', '1.0,1.0,0.0,0.0', '1.0,1.0,0.0,0.0', '1.0,2.0,0.0,0.0', '1.0,2.0,0.0,0.0', '1.0,2.0,0.0,0.0', '1.0,2.0,0.0,0.0', '1.0,2.0,0.0,0.0', '1.0,2.0,0.0,0.0', '1.0,2.0,0.0,0.0', '1.0,2.0,0.0,0.0', '1.0,2.0,0.0,0.0', '1.0,1.0,0.0,0.0', '1.0,1.0,0.0,0.0', '1.0,2.0,0.0,0.0', '1.0,2.0,0.0,0.0', '1.0,2.0,0.0,0.0', '1.0,1.0,0.0,0.0', '1.0,2.0,0.0,0.0', '1.0,2.0,0.0,0.0', '1.0,1.0,0.0,0.0', '1.0,2.0,0.0,0.0', '1.0,1.0,0.0,0.0', '1.0,1.0,0.0,0.0', '1.0,1.0,0.0,0.0', '1.0,2.0,0.0,0.0', '1.0,1.0,0.0,0.0', '1.0,1.0,0.0,0.0', '1.0,1.0,0.0,0.0', '1.0,2.0,0.0,0.0', '2.0,0.0,2.0,2.0', '2.0,0.0,2.0,2.0', '2.0,0.0,2.