In [1]:
import numpy as np
import scipy as sp
from pprint import pprint
from utils import preprocess, select_dim, calc_selection_threshold, score_function_i

In [2]:
# Load the dataset through the project-local preprocess helper (imported from utils).
data = preprocess('dataset_diabetes/diabetic_data.csv')
# data is two-dimensional: N is the number of rows, V the number of columns.
N, V = data.shape

In [3]:
# Display the preprocessed array together with its row count and column count.
data, N, V

(array([[  2278392,   8222157,         6, ...,         0,         0,
                 1],
        [   149190,  55629189,         1, ...,         0,         0,
                 9],
        [    64410,  86047875,         1, ...,         0,         1,
                 6],
        ...,
        [443854148,  41088789,         1, ...,         0,         0,
                13],
        [443857166,  31693671,         2, ...,         0,         1,
                 9],
        [443867222, 175429310,         1, ...,         0,         0,
                 9]]), 101766, 13)

In [4]:
# Threshold later handed to select_dim; how it is derived is defined in utils.calc_selection_threshold.
selection_threshold = calc_selection_threshold(data)

In [5]:
# Example index lists; presumably each inner list holds the row indices of one cluster - TODO confirm.
C = [[1, 2, 3], [9, 39, 43, 341], [32, 21, 432]]
# Try select_dim on the rows of data picked by the first index list (rows 1, 2 and 3).
select_dim(data[C[0]], selection_threshold)

[0, 1, 2, 3, 4, 5, 8, 10, 11]

In [19]:
# labeled_objects[i][j] = j-th object in i-th cluster.
# Each entry is a row index into data (initialize reads data[labeled_objects[i]]).
labeled_objects = [[9, 23, 543], [456, 34, 52], [76, 500, 381, 245]]

# labeled_dimensions[i][j] = j-th dimension relevant to i-th cluster.
# Each entry is a column index into data (initialize reads data[:, building_dims]).
labeled_dimensions = [[1, 3, 5], [9, 7, 8], [4, 2, 6]]

In [20]:
def bin_cell(data, edges):
    """
    Collect the samples of data that fall inside the cell bounded by edges.

    edges[j] holds the (lower, upper) bounds for coordinate j, both inclusive.
    The matching samples are returned as a list, in their original order.
    """
    def outside(point):
        # A point is outside as soon as one coordinate leaves its interval.
        return any(
            point[axis] < edges[axis][0] or point[axis] > edges[axis][1]
            for axis in range(len(point))
        )

    return [point for point in data if not outside(point)]

In [21]:
def define_edges(centre, edge_lengths):
    """
    Build the (lower, upper) bounds of a cell centred on centre.

    One pair is produced per entry of edge_lengths; each interval extends half
    an edge length on either side of the matching centre coordinate.
    """
    bounds = []
    for axis in range(len(edge_lengths)):
        lower = centre[axis] - edge_lengths[axis] / 2
        upper = centre[axis] + edge_lengths[axis] / 2
        bounds.append((lower, upper))
    return bounds

In [22]:
def hill_climb(data, curr_centre, step_lengths):
    """
    Hill-climbing to find the cell with highest density.

    Starting from the cell centred on curr_centre, repeatedly move to the
    densest of the axis-aligned neighbouring cells (one step_lengths[i] away
    along dimension i, in either direction) until no neighbour holds strictly
    more samples than the current cell.

    Parameters
    ----------
    data : iterable of samples, as accepted by bin_cell.
    curr_centre : sequence of numbers giving the starting centre. Not modified.
    step_lengths : sequence giving, per dimension, both the cell edge length
        and the distance between neighbouring cell centres.

    Returns
    -------
    list
        The samples of data lying in the final (locally densest) cell.
    """
    # Work on a float copy: the caller's centre stays untouched, and stepping an
    # integer centre by a fractional step length is not silently truncated.
    centre = np.array(curr_centre, dtype=float)
    best_bin = bin_cell(data, define_edges(centre, step_lengths))

    # Iterate instead of recursing so long climbs cannot hit the recursion limit.
    # The sample count strictly increases on every move, so the loop terminates.
    while True:
        next_centre = None

        # Explore the neighbouring cells.
        for i in range(len(step_lengths)):
            for sign in (-1, 1):

                # Move to the neighbouring centre.
                cell_centre = centre.copy()
                cell_centre[i] += sign * step_lengths[i]

                # Bin the neighbouring cell.
                cell_bin = bin_cell(data, define_edges(cell_centre, step_lengths))

                # Keep the densest cell seen so far (first one wins on ties).
                if len(cell_bin) > len(best_bin):
                    next_centre = cell_centre
                    best_bin = cell_bin

        # Found the maximum: no neighbour is denser than the current cell.
        if next_centre is None:
            return best_bin

        # best_bin already holds the samples of the new centre cell, so it does
        # not need to be binned again on the next pass.
        centre = next_centre

In [24]:
def get_peak(data, step_lengths):
    """
    Find the absolute density peak of the grid.

    The value range of every dimension is divided into cells roughly
    step_lengths[i] wide, the samples are histogrammed on that grid, and the
    samples falling in the fullest cell are returned.

    Parameters
    ----------
    data : 2-D array-like, one sample per row.
    step_lengths : sequence with the desired cell width per dimension. A
        non-positive width, or a dimension with zero value range, yields a
        single cell along that dimension.

    Returns
    -------
    list
        The rows of data lying in the densest cell, in their original order.
        Empty when data is empty.
    """
    points = np.asarray(data)
    if len(points) == 0:
        return []

    # Bin the data into cells. np.histogramdd needs integer bin counts of at
    # least 1, so truncate the ratio and guard against zero steps / ranges.
    vals_range = np.ptp(points, axis=0)
    bin_nums = []
    for i in range(len(step_lengths)):
        if step_lengths[i] > 0 and vals_range[i] > 0:
            bin_nums.append(max(1, int(vals_range[i] / step_lengths[i])))
        else:
            bin_nums.append(1)
    H, edges = np.histogramdd(points, bin_nums)

    # Find absolute density peak.
    max_indices = np.unravel_index(H.argmax(), H.shape)

    # Find data points in the peak cell, using the same interval convention as
    # the histogram: [lower, upper) for every bin except the last one along a
    # dimension, which also includes its upper edge. A closed interval on both
    # sides would wrongly pull in points that were counted in the next cell.
    in_cell = np.ones(len(points), dtype=bool)
    for i in range(len(edges)):
        idx = max_indices[i]
        lower, upper = edges[i][idx], edges[i][idx + 1]
        column = points[:, i]
        if idx == len(edges[i]) - 2:
            in_cell &= (column >= lower) & (column <= upper)
        else:
            in_cell &= (column >= lower) & (column < upper)

    return [sample for sample, inside in zip(data, in_cell) if inside]

In [25]:
# TODO: Find MaxMin for the no-input case.
def max_min_dist():
    """
    Placeholder for the seed search used when a cluster has no input at all.

    Not implemented yet: always hands back a new empty list.
    """
    return list()

In [26]:
building_dim_num = 3

def initialize(data, k, labeled_objects, labeled_dimensions):
    """
    Build one private seed group per cluster.

    For each of the k clusters the seed group is found in one of three ways:

    * labeled objects available: candidate dimensions come from select_dim on
      those objects (plus any labeled dimensions); building dimensions are
      drawn from the candidates and a hill-climbing search starts from the
      median of the labeled objects;
    * only labeled dimensions available: building dimensions are drawn from
      them and the absolute density peak of the resulting grid is used;
    * no input: max_min_dist() is used.

    Parameters
    ----------
    data : 2-D NumPy array, one sample per row.
    k : number of clusters.
    labeled_objects : labeled_objects[i] lists row indices of data known to
        belong to cluster i; may be shorter than k or contain empty lists.
    labeled_dimensions : labeled_dimensions[i] lists column indices known to be
        relevant to cluster i; may be shorter than k or contain empty lists.

    Returns
    -------
    list
        G, where G[i] is the seed group found for cluster i.

    Raises
    ------
    ValueError
        If a cluster with labeled objects ends up with no candidate dimension.

    At most building_dim_num building dimensions are used per cluster; fewer
    are used when fewer candidates exist.
    """
    selection_threshold = calc_selection_threshold(data)

    # Private seeds.
    G = []

    # Loop over clusters.
    for i in range(k):
        has_objects = i < len(labeled_objects) and len(labeled_objects[i]) > 0
        has_dimensions = i < len(labeled_dimensions) and len(labeled_dimensions[i]) > 0

        # Clusters with labeled objects.
        if has_objects:

            # Extract i-th cluster's labeled objects.
            data_i = data[labeled_objects[i]]
            med_i = np.median(data_i, axis=0)

            # Define a candidate set that includes the dimensions selected by SelectDim as well as
            # those in labeled_dimensions (if any).
            candidate_set = select_dim(data_i, selection_threshold, med_i)
            if has_dimensions:
                # Union as plain ints so the result stays usable as column indices
                # even when select_dim selects nothing, and has a stable order.
                candidate_set = sorted(
                    {int(d) for d in candidate_set} | {int(d) for d in labeled_dimensions[i]}
                )
            if len(candidate_set) == 0:
                raise ValueError("no candidate dimension for cluster %d" % i)

            # TODO: Ask Vic to fix score function.
            # score_ij = score_function_i(i, candidate_set, med_i)
            score_ij = np.random.rand(len(candidate_set))

            # Each dimension in the set has a probability proportional to
            # the score function of being selected as a building dimension of a grid.
            score_ij /= sum(score_ij)
            # Sampling without replacement cannot draw more dimensions than exist.
            dim_count = min(building_dim_num, len(candidate_set))
            building_dims = np.random.choice(a=candidate_set, size=dim_count, replace=False, p=score_ij)

            # Extract the data with building dimensions.
            data_grid = data[:, building_dims]
            sd_grid = np.std(data_grid, axis=0)
            med_i_grid = med_i[building_dims]

            # Apply hill-climbing search to find most dense cell.
            G_i = hill_climb(data_grid, med_i_grid, sd_grid)

        # Clusters with labeled dimensions only.
        elif has_dimensions:
            candidate_set = labeled_dimensions[i]
            # Sampling without replacement cannot draw more dimensions than exist.
            dim_count = min(building_dim_num, len(candidate_set))
            building_dims = np.random.choice(a=candidate_set, size=dim_count, replace=False)

            # Extract the data with building dimensions.
            data_grid = data[:, building_dims]
            sd_grid = np.std(data_grid, axis=0)

            # Find absolute peak.
            G_i = get_peak(data_grid, sd_grid)

        # Clusters with no input.
        else:
            G_i = max_min_dist()

        G.append(G_i)
    return G
initialize(data, 5, labeled_objects, labeled_dimensions)

[[array([       0, 91530936,        9]),
  array([        0, 114304932,         9]),
  array([        0, 112367394,         9]),
  array([        0, 105345450,         9]),
  array([       0, 92840139,        9]),
  array([        0, 102190059,         9]),
  array([       0, 93228012,        9]),
  array([       0, 84615111,        9]),
  array([        0, 102780567,         9]),
  array([        0, 100494054,         9]),
  array([       0, 89785206,        9]),
  array([        0, 109656828,         9]),
  array([       0, 87601464,        9]),
  array([        0, 114181587,         9]),
  array([       0, 86880384,        9]),
  array([        0, 106739226,         9]),
  array([       0, 95282361,        9]),
  array([        0, 113314410,         9]),
  array([        0, 101037321,         9]),
  array([       0, 99220410,        9]),
  array([       0, 96806466,        9]),
  array([        0, 110830923,         9]),
  array([       0, 99898803,        9]),
  array([        0, 1

In [None]:
# Simple closest centre search.
# Draft cell: it relies on names that only exist inside initialize
# (data_grid, med_i_grid, sd_grid, G, G_i, i) and on per-sample
# seed_group_label / med_dist arrays initialised to zero.
for j in range(len(data_grid)):
    sample = data_grid[j]

    # Check whether the point lies within the grid.
    in_grid = True

    # Calculate the standardized distance to the median.
    std_dist = 0
    for building_dim in range(building_dim_num):
        # sample and sd_grid are in building-dimension order, so the median has
        # to be the one restricted to the building dimensions as well.
        std_dist_comp = abs(sample[building_dim] - med_i_grid[building_dim]) / sd_grid[building_dim]
        # Reject on the current component; testing the running sum would never
        # check the last dimension.
        if std_dist_comp > 1:
            in_grid = False
            break
        std_dist += std_dist_comp ** 2
    std_dist = std_dist ** 0.5

    # If the point lies within the grid, check whether this is the closest median.
    if in_grid and (seed_group_label[j] == 0 or std_dist < med_dist[j]):

        # Update its label and closest to-median distance.
        if seed_group_label[j] != 0:
            # Labels are 1-based; cast to int so the value can index the list.
            G[int(seed_group_label[j]) - 1].remove(j)
        G_i.append(j)
        seed_group_label[j] = i + 1
        med_dist[j] = std_dist

In [None]:
# Scratch check of np.histogramdd: 100 random 3-D points binned on a 5 x 8 x 4 grid.
r = np.random.randn(100,3)
H, edges = np.histogramdd(r, bins = (5, 8, 4))
# Show the bin edges computed for each dimension.
edges