In [1]:
import numpy as np
import scipy as sp
from pprint import pprint
from utils import preprocess, select_dim, calc_selection_threshold, score_function_i

In [2]:
# Load the diabetes dataset through the project's preprocess helper.
data = preprocess('dataset_diabetes/diabetic_data.csv')
# data is 2-D: N is its number of rows, V its number of columns.
N, V = data.shape



In [3]:
# Display the preprocessed array together with its row count N and column count V.
data, N, V

(array([[-1.58733032, -1.19154543,  0.42755739, ...,  0.92659052,
         -1.82986799,  0.8368795 ],
        [-1.60807473,  0.03356374,  0.42755739, ..., -1.07922537,
          0.54648751, -0.62496982],
        [-1.60890073,  0.81965411, -1.70370587, ...,  0.92659052,
          0.54648751,  0.8368795 ],
        ...,
        [ 2.71485842, -0.34219438,  0.42755739, ..., -1.07922537,
          0.54648751,  0.8368795 ],
        [ 2.71488782, -0.58498632,  0.42755739, ..., -1.07922537,
          0.54648751,  0.8368795 ],
        [ 2.71498579,  3.12948056,  0.42755739, ...,  0.92659052,
         -1.82986799,  0.8368795 ]]), 101766, 50)

In [4]:
# Compute the threshold from the full data set; it is passed to select_dim as its second argument.
selection_threshold = calc_selection_threshold(data)

In [5]:
# Hard-coded example groups of row indices into data; only the first group is used here.
C = [[1, 2, 3], [9, 39, 43, 341], [32, 21, 432]]
# Run select_dim on the rows of the first group; the displayed result holds one 0/1 entry per column of data.
select_dim(data[C[0]], selection_threshold)

array([1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1.,
       1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1.,
       1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 0., 1., 0.])

In [6]:
# Hard-coded example supervision for three clusters.
# labeled_objects[i][j] = row index (into data) of the j-th labeled object in the i-th cluster.
labeled_objects = [[9, 23, 543], [456, 34, 52], [76, 500, 381, 245]]

# labeled_dimensions[i][j] = j-th dimension labeled as relevant to the i-th cluster.
labeled_dimensions = [[1, 3, 5], [9, 20, 48], [23, 12, 31]]

In [None]:
# Number of building dimensions drawn for each cluster's grid inside initialize().
building_dim_num = 3

def initialize(data, k, labeled_objects, labeled_dimensions):
    """Build a private seed group for every cluster that has labeled objects.

    For each such cluster a grid is formed around the per-dimension median of
    its labeled objects, restricted to ``building_dim_num`` randomly drawn
    building dimensions. Every sample that lies within one standard deviation
    of that median in each building dimension joins the cluster's seed group,
    unless it already belongs to a group whose median is at least as close.

    Args:
        data: 2-D array with one row per sample and one column per dimension.
        k: Number of clusters to loop over.
        labeled_objects: labeled_objects[i] lists the row indices of the
            labeled objects of cluster i.
        labeled_dimensions: labeled_dimensions[i] lists the column indices
            labeled as relevant to cluster i.

    Returns:
        A list ``G`` in which ``G[i]`` is the list of row indices forming the
        seed group of cluster i. Clusters with ``i >= len(labeled_objects)``
        are not handled here and get no entry.
    """
    selection_threshold = calc_selection_threshold(data)
    n_samples = len(data)

    # seed_group_label[j] = index of the seed group holding sample j, or -1
    # while unassigned (0 cannot mark "unassigned": it is a valid group index).
    seed_group_label = np.full(n_samples, -1, dtype=int)

    # med_dist[j] = distance from sample j to the closest median seen so far;
    # infinity lets the first grid containing a sample always claim it.
    med_dist = np.full(n_samples, np.inf)

    # Private seeds.
    G = []

    # Loop over clusters.
    for i in range(k):
        if i < len(labeled_objects):

            # Extract i-th cluster's labeled objects and their per-dimension median.
            data_i = data[labeled_objects[i]]
            med_i = np.median(data_i, axis=0)

            # Candidate dimensions: those flagged by select_dim (which returns a
            # 0/1 mask with one entry per dimension) plus the labeled ones.
            # Sorted so the candidates have a fixed order for the draw below.
            selected_mask = np.asarray(select_dim(data_i, selection_threshold))
            candidate_set = sorted(
                set(np.flatnonzero(selected_mask).tolist()) | set(labeled_dimensions[i])
            )

            # NOTE(review): assumes score_function_i returns one score per
            # candidate, in the order of candidate_set -- confirm against utils.
            score_ij = np.asarray(score_function_i(i, candidate_set, med_i), dtype=float)

            # Each candidate is drawn as a building dimension with probability
            # proportional to its score.
            score_ij = score_ij / score_ij.sum()
            building_dims = np.random.choice(
                a=candidate_set, size=building_dim_num, replace=False, p=score_ij
            )

            # Standardized per-dimension distance of every sample to the median,
            # measured in the building dimensions only.
            data_grid = data[:, building_dims]
            sd_grid = np.std(data_grid, axis=0)
            # A constant column has zero spread; the resulting inf/nan values
            # simply fail the comparisons below, so silence the warnings.
            with np.errstate(divide='ignore', invalid='ignore'):
                std_dist_comp = np.abs(data_grid - med_i[building_dims]) / sd_grid

            # A sample lies within the grid when it is within 1 std of the
            # median in every building dimension.
            in_grid = np.all(std_dist_comp <= 1, axis=1)
            std_dist = np.sqrt(np.sum(std_dist_comp ** 2, axis=1))

            # Claim the in-grid samples for which this median is the closest so far.
            G_i = []
            for j in np.flatnonzero(in_grid & (std_dist < med_dist)):
                j = int(j)
                previous_group = seed_group_label[j]
                if previous_group >= 0:
                    # Take the sample away from the group that held it before.
                    G[previous_group].remove(j)
                G_i.append(j)
                seed_group_label[j] = i
                med_dist[j] = std_dist[j]
            G.append(G_i)

    return G
            