In [7]:
import pickle
import numpy as np
import networkx as nx
import time

N = 1000 # number of data points in the training set (checked against the loaded data below)
k = 10 # each node in the k-NN graph has edges to its 10 nearest neighbors
S = 50 # soft labels use the empirical distribution over the S nearest neighbors (needs an S-NN graph), S >> k
M = 4 # number of parts in the partition; must be a power of two (asserted when the partitions are loaded)
num_points = 8 # each example has 8 points
num_features_per_point = 2 # each point has 2 features

# directory that holds every pickled input read by this notebook
folder = '../data/imbalanced_toy/'

### Model Setup

In [None]:
# load the training data
# NOTE: pickle.load can execute arbitrary code -- only point `folder` at trusted files.
# `with` blocks make sure the file handles are closed even if unpickling fails.
with open(folder + 'points.pickle', 'rb') as points_file:
    points = pickle.load(points_file)
assert len(points) == N
with open(folder + 'distances.pickle', 'rb') as distances_file:
    distances = pickle.load(distances_file)

In [None]:
def total_distance(med, cluster):
    """Return the summed distance from every member of `cluster` to `med`.

    Distances are looked up in the module-level `distances` table, keyed by
    the `points` entries of the two ids. An empty cluster yields 0.

    NOTE(review): the table is indexed with `points[...]` values rather than
    with the ids themselves -- confirm this matches how distances.pickle is
    keyed.
    """
    return sum(distances[points[member], points[med]] for member in cluster)

def calculate_medoids_for_partition(partition):
    """Pick a medoid for every part of `partition`.

    The medoid of a part is the member with the smallest `total_distance`
    to all members of that part. Ties keep the member encountered first.

    Args:
        partition: iterable of parts, each an iterable of node ids.

    Returns:
        A list with one medoid id per part, in the order of `partition`.

    Raises:
        IndexError: if a part is empty.
    """
    medoids = []
    for part in partition:
        members = list(part)
        best = members[0]
        # Cache the incumbent's cost so it is not recomputed per candidate.
        best_cost = total_distance(best, part)
        for candidate in members[1:]:
            candidate_cost = total_distance(candidate, part)
            if candidate_cost < best_cost:
                best, best_cost = candidate, candidate_cost
        medoids.append(best)
    return medoids

In [148]:
# import the knn graph and partitions
# NOTE(review): nx.read_gpickle was removed in NetworkX 3.0; these calls need networkx < 3.
G = nx.read_gpickle(folder + 'knn_graph.gpickle')

# (Disabled alternative: partition by component, i.e. four contiguous blocks
#  of 250 node ids each, instead of loading graph_partitions.pickle.)
graph_partition = nx.read_gpickle(folder + 'graph_partitions.pickle')
km_partitions = nx.read_gpickle(folder + 'km_partitions.pickle')

graph_medoids = calculate_medoids_for_partition(graph_partition)
# The 'rb' mode belongs to open(), not pickle.load(); the file must be opened in binary mode.
with open(folder + 'medoids.pickle', 'rb') as medoids_file:
    km_medoids = pickle.load(medoids_file)

# sanity checks: data sizes agree with the configuration and M is a power of two
assert N == len(G.nodes())
assert M == len(graph_partition)
assert M == len(km_partitions)
levels = int(np.log2(M))  # depth of the binary label tree
assert M == 2 ** levels

In [10]:
def get_flat_X_unsorted(points):
    """Flatten each example's feature matrix into one zero-padded vector.

    Examples may have different numbers of rows. Every flattened vector is
    padded with trailing zeros up to (largest row count) * (column count),
    where the column count is read from the first example. The inputs are
    copied, not modified.

    Returns:
        A 2-D array with one flattened example per row.
    """
    max_rows = max((example.shape[0] for example in points), default=0)
    n_cols = points[0].shape[1]
    flat_len = max_rows * n_cols

    rows = []
    for example in points:
        padded = np.copy(example)
        # in-place resize flattens the copy and zero-fills the added tail
        padded.resize((flat_len, ))
        rows.append(padded)
    return np.array(rows)

def get_flat_X_sorted(points):
    """Flatten each example into a zero-padded vector after sorting its rows.

    Rows of every example are first ordered by their first column, then the
    matrix is flattened and padded with trailing zeros up to
    (largest row count) * (column count). The column count is read from the
    first example. The inputs are copied, not modified.

    Returns:
        A 2-D array with one flattened example per row.
    """
    max_rows = max((example.shape[0] for example in points), default=0)
    n_cols = points[0].shape[1]
    flat_len = max_rows * n_cols

    rows = []
    for example in points:
        ordered = np.copy(example)
        # reorder the rows by ascending value of the first column
        ordered = ordered[ordered[:, 0].argsort()]
        # in-place resize flattens the array and zero-fills the added tail
        ordered.resize((flat_len, ))
        rows.append(ordered)
    return np.array(rows)
    
def get_class_labels(partition):
    """Return the part index of every node id in range(N).

    Entry i of the result is the index (within `partition`) of the part that
    contains node i. If a node appears in several parts the last one wins.

    Raises:
        KeyError: if some id in range(N) is in none of the parts.
    """
    # map each node to the index of its part
    part_of_node = {
        node: part_index
        for part_index, part in enumerate(partition)
        for node in part
    }
    return np.array([part_of_node[node] for node in range(N)])

def get_hierarchical_labels(partition):
    """Encode every node's class label as a path of `levels` split decisions.

    The class label from `get_class_labels` is decomposed most-significant
    digit first: at each level the label is divided by the matching power of
    two, the quotient is recorded and the remainder carried to the next level.

    Returns:
        A 2-D array with one row of `levels` entries per node.
    """
    class_ids = get_class_labels(partition)

    paths = []
    for remaining in class_ids:
        decisions = []
        for exponent in range(levels - 1, -1, -1):
            decision, remaining = divmod(remaining, 2 ** exponent)
            decisions.append(decision)
        paths.append(decisions)
    return np.array(paths)

def get_soft_labels(partition):
    """Return a soft label (distribution over the M parts) for every node.

    For node i the soft label is the normalized histogram of the hard class
    labels of the neighbors listed for i in nearest_neighbors.pickle
    (presumably its S nearest neighbors -- confirm against the file).

    Returns:
        A 2-D array with N rows and M columns.
    """
    # hard labels first, then the stored neighbor lists
    hard_labels = get_class_labels(partition)
    neighbor_lists = nx.read_gpickle(folder + 'nearest_neighbors.pickle')

    soft = []
    for node in range(N):
        counts = np.zeros(M)
        for neighbor in neighbor_lists[node]:
            counts[hard_labels[neighbor]] += 1
        soft.append(counts / np.sum(counts))
    return np.array(soft)

In [150]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression

def lr_train_hierarchical(models, prefix, X, y, level):
    """Recursively train one logistic-regression classifier per tree node.

    The classifier stored under `prefix` predicts column `level` of `y`.
    Samples it predicts as 0 are used to train the child `prefix + '0'`,
    samples it predicts as 1 train the child `prefix + '1'`. Recursion stops
    once `level` reaches the module-level `levels`.

    Args:
        models: dict filled in place, mapping prefix string -> fitted classifier.
        prefix: string of '0'/'1' decisions leading to this node ('' for the root).
        X: 2-D feature array of the samples routed to this node.
        y: 2-D label array (one column per level) for the same samples.
        level: depth of this node, i.e. the column of `y` to learn.
    """
    if level == levels:
        return

    clf = LogisticRegression().fit(X, y[:, level])
    models[prefix] = clf

    predictions = clf.predict(X)

    # left child: samples routed to branch 0
    goes_left = predictions == 0
    lr_train_hierarchical(models, prefix + '0', X[goes_left], y[goes_left], level + 1)

    # right child: samples routed to branch 1
    goes_right = predictions == 1
    lr_train_hierarchical(models, prefix + '1', X[goes_right], y[goes_right], level + 1)
    
def lr_predict(model, x, num_levels=levels):
    """Walk the classifier tree for `x` and return the decisions taken.

    Starting at the root (key ''), the classifier stored under the current
    key predicts the next decision, which is appended to the key. The walk
    continues while the key is shorter than `num_levels`.

    Args:
        model: dict mapping prefix string -> fitted classifier.
        x: a single sample in the 2-D shape the classifiers' predict() expects.
        num_levels: how deep to descend (defaults to the full tree depth).

    Returns:
        An array holding the decision made at every visited level.
    """
    decisions = []
    key = ''
    while len(key) < num_levels:
        decision = model[key].predict(x)[0]
        decisions.append(decision)
        key = key + str(decision)
    return np.array(decisions)
    
def lr_prediction_accuracy(model, X, y):
    """Return the fraction of samples whose full decision path is predicted correctly.

    Args:
        model: dict mapping prefix string -> fitted classifier.
        X: 2-D feature array, one sample per row.
        y: 2-D hierarchical label array aligned with `X`.

    Returns:
        Number of exactly matching paths divided by len(X).

    Raises:
        ZeroDivisionError: if `X` is empty.
    """
    num_levels = levels # how deep in the tree to check
    num_correct = 0

    # check each data point
    for i in range(len(X)):
        x = X[i:i+1, :]  # keep the sample 2-D for predict()
        prediction = lr_predict(model, x, num_levels)

        if (y[i][:num_levels] == prediction).all():
            num_correct += 1

    return num_correct / len(X)

def lr_predict_best_partition(model, x):
    """Return the index of the partition the classifier tree routes `x` to.

    The decision path from `lr_predict` is read as a binary number, most
    significant decision first.
    """
    bit_weights = np.power(2, range(levels)[::-1])
    return np.dot(bit_weights, lr_predict(model, x))

In [None]:
# K-Medoids Classifier code
import ot

def ot_distance(x, y):
    """Return the optimal-transport cost between two point sets.

    Both inputs are reshaped to rows of 2 values before the pairwise cost
    matrix is built. Empty weight lists are passed to ot.emd2 (which POT
    documents as uniform weights -- confirm for the installed version).
    """
    cost_matrix = ot.dist(x.reshape(-1, 2), y.reshape(-1, 2))
    return ot.emd2([], [], cost_matrix)

def km_predict_best_partition(medoids, x):
    """Return the position in `medoids` of the medoid closest to `x`.

    Closeness is measured with `ot_distance` against the medoid's entry in
    the module-level `points`. Ties keep the earliest medoid.
    """
    closest = 0
    closest_cost = ot_distance(x, points[medoids[closest]])
    for position, medoid in enumerate(medoids[1:], start=1):
        cost = ot_distance(x, points[medoid])
        if cost < closest_cost:
            closest, closest_cost = position, cost
    return closest

In [None]:
X = get_flat_X_sorted(points)
# Labels are built from graph_partition directly: `partition` is only bound in
# the experiments cell further down, so using it here breaks a top-to-bottom run.
y = get_hierarchical_labels(graph_partition)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TODO: load unseen points for evaluation. Until a dedicated query set exists,
# the held-out test split stands in as the query set.
X_query = X_test

### Experiments - KL Partitions

In [None]:
# Select the partition, and the medoids computed for it, that the cells below evaluate.
partition = graph_partition
medoids = graph_medoids

In [151]:
# Sorted Logistic Regression Training
# Fit on the training split only, so the accuracy printed below is measured on
# samples the classifiers have not seen.
lr_sorted_model = {}
lr_train_hierarchical(lr_sorted_model, "", X_train, y_train, 0)

# now assess test accuracy
print(lr_prediction_accuracy(lr_sorted_model, X_test, y_test))



In [None]:
# Ground Truth: exact k nearest neighbors of every query point by brute force
knn_real = []
t1 = time.time()
for i in range(len(X_query)):
    x = X_query[i:i+1, :]
    # Named `query_distances` (not `distances`) so the pairwise-distance table
    # loaded earlier, which total_distance() reads, is not overwritten.
    query_distances = []
    for node_id in range(N):
        d = ot_distance(x, X[node_id:node_id+1, :])
        query_distances.append((node_id, d))
    query_distances.sort(key=lambda pair: pair[1])
    knn_real.append([u[0] for u in query_distances[:k]])
t2 = time.time()

print('Time: {}'.format(t2-t1))

In [1]:
# Model Evaluation
# For every query: route it to one partition, search only that partition for
# its k nearest neighbors, and compare against the brute-force ground truth.
for name, predict_partition, model in [
        ('Log Reg Sorted', lr_predict_best_partition, lr_sorted_model),
        ('K-Medoids', km_predict_best_partition, medoids)]:
    print('Model: {}'.format(name))

    knn_sample = []
    t1 = time.time()
    # Iterate over X_query so knn_sample[i] and knn_real[i] describe the same query.
    for i in range(len(X_query)):
        x = X_query[i:i+1, :]
        label = predict_partition(model, x)
        assigned_partition = partition[label]

        # Named `candidate_distances` (not `distances`) so the pairwise-distance
        # table read by total_distance() is not overwritten.
        candidate_distances = []
        for node_id in assigned_partition:
            d = ot_distance(x, X[node_id:node_id+1, :])
            candidate_distances.append((node_id, d))
        candidate_distances.sort(key=lambda pair: pair[1])
        knn_sample.append([u[0] for u in candidate_distances[:k]])
    t2 = time.time()

    print('Time: {}'.format(t2-t1))

    # evaluate accuracy: share of the true k nearest neighbors that were found
    percents_captured = []
    for i in range(len(knn_sample)):
        num_correct = len(np.intersect1d(knn_sample[i], knn_real[i]))
        percents_captured.append(num_correct / k)
    print('Average Accuracy/Recall: {}'.format(np.mean(percents_captured)))

NameError: name 'lr_predict_best_partition' is not defined

### Other Stuff

In [None]:
import matplotlib.pyplot as plt

# Draw the k-NN graph with every node colored by the part it belongs to.
labels = get_class_labels(partition)
colors = {0: 'red', 1: 'blue', 2: 'green', 3: 'yellow'}
color_map = [colors[labels[i]] for i in range(N)]

nx.draw(G, node_color = color_map)
plt.show()