In [498]:
import pandas as pd
import numpy as np
import random

# Function definitions

In [499]:
def total_distance_to_centers(centers, data_attribute):
    """Sum, over all non-missing measurements, of the distance to the closest center.

    Parameters
    ----------
    centers : array-like of float
        Positions of the centers. Any sequence that numpy can convert to a
        float array is accepted (list, tuple, ndarray, pandas Series).
    data_attribute : array-like of float
        Measurements of a numerical variable. NaN entries are ignored.

    Returns
    -------
    float
        Sum of ``min(|center - x|)`` over every non-NaN ``x``; ``0.0`` when
        there is no non-NaN measurement.

    Raises
    ------
    ValueError
        If `centers` is empty while there is at least one non-NaN measurement
        (there is no closest center to measure against).
    """
    centers = np.asarray(centers, dtype=float).ravel()
    values = np.asarray(data_attribute, dtype=float).ravel()

    # Missing measurements do not contribute to the total.
    values = values[~np.isnan(values)]
    if values.size == 0:
        return 0.0

    # Broadcast to a (n_values, n_centers) table of absolute distances, keep the
    # smallest distance of each row and add them up.
    return np.abs(values[:, np.newaxis] - centers).min(axis=1).sum()

In [553]:
# NOTE(review): in the visible notebook `data` and `attribute` are only assigned in
# later cells, so on a fresh kernel this line fails with NameError unless they are
# defined elsewhere. compute_kohonen_centers below does not read this global.
data_attribute = data[attribute]

def compute_kohonen_centers(n_centers, data_attribute_input, max_iterations=1000000, dataset_rounds=15,
                            debug=False):
    """Fit `n_centers` centers to a one-dimensional numeric attribute with an online,
    Kohonen-style winner-takes-all update.

    The centers start equally spaced between the minimum and the maximum of the
    data. In every dataset round each non-missing value is drawn once, in random
    order, and the center closest to the drawn value is moved towards it by a
    fraction `eta` (0.6 at the start, multiplied by 0.99999 after every update).

    Parameters
    ----------
    n_centers : int
        Number of centers to fit; must be at least 1.
    data_attribute_input : array-like of float
        Measurements (for example a pandas Series). NaN entries are ignored.
        The input object is never modified.
    max_iterations : int, optional
        Upper bound on the total number of single-sample updates.
    dataset_rounds : int, optional
        Upper bound on the number of passes over the data.
    debug : bool, optional
        When True, print per-iteration and final diagnostics.

    Returns
    -------
    numpy.ndarray
        The fitted centers, rounded to 5 decimals.

    Raises
    ------
    ValueError
        If `n_centers` is smaller than 1 or the input has no non-missing value.

    Notes
    -----
    Sampling uses the global `random` and `numpy.random` generators, so results
    differ between runs unless the caller seeds both.
    """
    if n_centers < 1:
        raise ValueError("n_centers must be at least 1")

    # Work on a private float array: the caller's Series/array is left untouched
    # (no in-place dropna) and no pandas-version-specific accessor is needed.
    values = np.asarray(data_attribute_input, dtype=float).ravel()
    values = values[~np.isnan(values)]
    if values.size == 0:
        raise ValueError("data_attribute_input contains no non-missing values")

    # Initialize equidistant centers. np.linspace also covers n_centers == 1,
    # where a hand-computed step would divide by zero.
    centers = np.linspace(values.min(), values.max(), n_centers)

    # Compute sum of distances of each point to the closest center
    D = total_distance_to_centers(centers, values)

    D_previous = D + 1.0  # guarantees that the first update is attempted
    dataset_round = 0
    it = 0
    eta = 0.6
    perturbation_precision = 5
    perturbation_scale = 10.0 ** (-perturbation_precision)

    while dataset_round < dataset_rounds and it < max_iterations:
        # Every round starts again from the full set of values. np.delete returns
        # a new array, so `values` itself is never altered.
        pool = values

        while len(pool) > 0 and np.abs(D_previous - D) > 0 and it < max_iterations:

            # Randomly pop a value from the pool
            random_index = random.randrange(len(pool))
            random_x = pool[random_index]
            pool = np.delete(pool, random_index)

            # Get closest center and its position
            closest_center_index = np.argmin(np.abs(centers - random_x))
            closest_center = centers[closest_center_index]

            # Small perturbation to prevent iterations without updates, in case
            # random_x is equal to closest_center. It needs a non-zero float in (0, 1).
            random_float = np.random.rand()
            while random_float == 0:
                random_float = np.random.rand()

            # The perturbation is scaled by the spread of the values still in the
            # pool and by eta, so it vanishes as eta decays. An empty pool has no
            # spread (and std() of an empty array is undefined), so use zero there.
            spread = pool.std() if len(pool) > 0 else 0.0
            perturbation = perturbation_scale * eta * spread * random_float

            # Update rule for the centers
            centers[closest_center_index] = closest_center + eta * (random_x - closest_center + perturbation)

            if debug:
                print("\nIteration: ", it)

                print("random_index", random_index)
                print("random_x", random_x)

                print("closest_center_index", closest_center_index)
                print("closest_center", closest_center)

                print("centers: ", centers)

            eta = 0.99999 * eta
            D_previous = D
            # The distance is measured on the values not yet drawn in this round.
            # NOTE(review): confirm that the shrinking pool, rather than the full
            # data set, is the intended basis for the stopping criterion.
            D = total_distance_to_centers(centers, pool)
            it = it + 1

        dataset_round = dataset_round + 1

    # Attenuate perturbations in the centers
    centers = np.round(centers, perturbation_precision)

    if debug:
        print("Final centers: ", centers)
        print("Final distance to centers: ", D)
        print("Dataset rounds: ", dataset_round)
        print("Iterations: ", it)

    return centers

# Load the dataset and compute centers for every floating-point attribute

In [564]:
# Read dataset as a pandas dataframe. Columns to be indexed by attribute name
# NOTE(review): the paths below are absolute and machine-specific.
dataset = "hd"
file_path = "/home/cecilia/uncertainty/revision/" + dataset + "/hd.data"
header_path = "/home/cecilia/uncertainty/revision/" + dataset + "/hd.header"

# Read the header file: "|"-separated rows of attribute name, type and values.
header = pd.read_csv(header_path, sep="|", names=["attribute", "type", "values"])

# Name of the first attribute whose type is "target". Label-based selection is
# used because Series.as_matrix() no longer exists in pandas >= 1.0.
target_name = header.loc[header["type"] == "target", "attribute"].iloc[0]

# Read the data, naming its columns after the attributes listed in the header.
data = pd.read_csv(file_path, sep="|", header=None, names=header["attribute"].tolist())

# Remove target column
#target = data[target_name]
#data.drop(target_name, axis=1, inplace=True)

In [562]:
n_cols = data.shape[1]
n_centers = 3

centers = {}

# Only attributes declared as "floating" in the header get centers.
floating_attributes = set(header.loc[header["type"] == "floating", "attribute"])

for attribute in data:
    if attribute not in floating_attributes:
        continue

    # Fit on a NaN-free copy so the column stored in `data` is never altered and
    # missing values are not counted as an extra distinct value.
    observed = data[attribute].dropna()

    print("")
    print("Attribute: ", attribute)

    # Never ask for more centers than there are distinct observed values.
    n_centers_to_use = min(observed.nunique(), n_centers)
    print("Number of centers: ", n_centers_to_use)

    if n_centers_to_use == 0:
        # The column has no observed value, so there is nothing to fit.
        continue

    centers[attribute] = compute_kohonen_centers(n_centers_to_use, observed, dataset_rounds=2)


Attribute:  age
Number of centers:  3

Attribute:  trestbps
Number of centers:  3

Attribute:  chol
Number of centers:  3

Attribute:  fbs
Number of centers:  2

Attribute:  thalach
Number of centers:  3

Attribute:  oldpeak
Number of centers:  3

Attribute:  slope
Number of centers:  3

Attribute:  ca
Number of centers:  3


In [563]:
# Display the dictionary of fitted centers, keyed by attribute name.
centers

{'age': array([ 41.23564,  58.00499,  66.94293]),
 'ca': array([ 0.     ,  1.25978,  3.     ]),
 'chol': array([ 186.22735,  295.61832,  504.27244]),
 'fbs': array([ 0.,  1.]),
 'oldpeak': array([ 0.36059,  1.20578,  3.75686]),
 'slope': array([ 1.,  2.,  3.]),
 'thalach': array([ 105.67928,  129.40093,  180.45974]),
 'trestbps': array([ 114.92883,  130.88244,  177.12998])}