In [2]:
import fastcluster as fc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as sch
import seaborn as sns
from scipy.spatial.distance import squareform

# Render matplotlib figures inline in the notebook output area.
%matplotlib inline
# Select seaborn's "poster" plotting context for all subsequent figures.
sns.set_context("poster")

In [3]:
class CondensedTreeNode:
    """A node of the condensed cluster tree.

    Attributes
    ----------
    id : int or str
        Identifier of the node. Cluster nodes built by ``condense_tree``
        carry integer ids; individual points carry string ids of the form
        ``"POINT <n>"``.
    dist : float
        Distance value copied from the dendrogram node this node was
        created from.
    children : list
        Child ``CondensedTreeNode`` objects. The list passed to the
        constructor is stored as-is (not copied).
    child_size : int
        Number of points represented by this node.
    is_leaf : bool
        True when the node represents a single point.
    """

    def __init__(self, id, dist, children, size, is_leaf):
        self.id = id
        self.dist = dist
        self.children = children
        self.child_size = size
        self.is_leaf = is_leaf

    def add_child(self, child):
        """Append ``child`` to this node's list of children."""
        self.children.append(child)

    def __repr__(self):
        return '<CondensedTreeNode object at %s>' % (
            hex(id(self))
            )

    def __str__(self):
        # ``%s`` is used for id and dist: ids may be strings ("POINT 3") and
        # dist is a float, so ``%d`` would raise or silently truncate.
        return "ID: %s, Distance: %s, Number of children: %d, " \
               "Child size: %d, Leaf node: %s" % (self.id, self.dist, len(self.children),
                                                  self.child_size, self.is_leaf)

def get_leaves(tree):
    """Return whatever ``tree.pre_order`` collects when given the identity function.

    For the scipy ``ClusterNode`` trees this file builds with ``sch.to_tree``,
    that is the list of leaf nodes in pre-order.
    """
    def _identity(node):
        return node

    return tree.pre_order(_identity)

def condense_tree(tree, min_cluster_size=10, next_id=0):
    """Condense a binary dendrogram into a tree of sufficiently large clusters.

    At every split, a side holding ``min_cluster_size`` points or fewer is
    dissolved into individual ``"POINT <id>"`` leaves attached to the current
    node. A side that is large enough is condensed recursively: it keeps the
    current node's id when its sibling was dissolved (the cluster merely lost
    points), and receives a fresh id when both sides are large (a true split).

    Parameters
    ----------
    tree : node exposing ``count``, ``dist``, ``left``, ``right``, ``id`` and
        ``pre_order`` (e.g. the result of ``scipy.cluster.hierarchy.to_tree``).
    min_cluster_size : int
        Largest subtree size that is still dissolved into points.
    next_id : int
        Id assigned to the node created for ``tree``.

    Returns
    -------
    (CondensedTreeNode, int)
        The condensed node for ``tree`` and the last cluster id handed out
        within it. The same tuple shape is returned on every path, including
        the degenerate empty / single-node inputs.

    Notes
    -----
    The function recurses once per retained split, so very deep dendrograms
    may hit Python's recursion limit.
    """
    # Degenerate inputs: report and return a placeholder node.
    if tree is None or tree.count == 0:
        print("Invalid input: Null tree")
        return CondensedTreeNode(-1, -1, [], -1, False), next_id
    if tree.count == 1:
        # A single observation: nothing to condense, return it as a leaf.
        return CondensedTreeNode(-1, tree.dist, [], 1, True), next_id

    result = CondensedTreeNode(next_id, tree.dist, [],
                               tree.left.count + tree.right.count, False)

    left_is_small = tree.left.count <= min_cluster_size
    right_is_small = tree.right.count <= min_cluster_size

    # Handle left first, then right, so ids are handed out in a stable order.
    for subtree, is_small, sibling_is_small in (
            (tree.left, left_is_small, right_is_small),
            (tree.right, right_is_small, left_is_small)):
        if is_small:
            # Too small to be a cluster: its points fall out individually.
            for leaf in get_leaves(subtree):
                result.add_child(
                    CondensedTreeNode("POINT %i" % leaf.id, subtree.dist, [], 1, True))
        else:
            # Same id when the sibling dissolved (cluster persists),
            # a new id when both sides survive (genuine split).
            child_id = next_id if sibling_is_small else next_id + 1
            child, next_id = condense_tree(subtree, min_cluster_size, child_id)
            result.add_child(child)

    return result, next_id
    
def flatten_node(tree_node):
    """Return the edges from ``tree_node`` to its children as tuples.

    Each tuple is ``(parent_id, child_id, lambda, child_size)`` where
    ``lambda`` is ``1 / tree_node.dist``. Children that share the parent's id
    are skipped: no edge is emitted for them.

    A parent distance of exactly zero maps to an infinite lambda instead of
    raising ``ZeroDivisionError``.
    """
    parent_dist = tree_node.dist
    lambda_value = 1.0 / parent_dist if parent_dist != 0 else float("inf")
    return [(tree_node.id, child.id, lambda_value, child.child_size)
            for child in tree_node.children
            if tree_node.id != child.id]

def flatten_tree_recursion(tree):
    """Collect the edge tuples of ``tree`` and all of its descendants.

    Returns a flat list of the tuples produced by ``flatten_node``: first the
    edges of ``tree`` itself, then those of each child subtree in order.
    Leaf nodes contribute nothing.
    """
    if tree.is_leaf:
        return []
    result = flatten_node(tree)
    for subtree in tree.children:
        # Recurse into this function, not flatten_tree: the latter returns a
        # DataFrame, and extending a list with it would append column labels.
        result.extend(flatten_tree_recursion(subtree))
    return result

def flatten_tree(tree):
    """Flatten a condensed tree into a DataFrame with one row per edge.

    The columns are ``parent``, ``child``, ``lambda`` and ``child_size``.
    """
    edge_columns = ("parent","child","lambda","child_size")
    edge_rows = flatten_tree_recursion(tree)
    return pd.DataFrame(edge_rows, columns=edge_columns)

In [6]:
class HDBSCAN (object):
    """Prototype HDBSCAN pipeline operating on a precomputed distance matrix.

    Parameters
    ----------
    min_cluster_size : int
        Subtrees with this many points or fewer are dissolved into individual
        points when the dendrogram is condensed.
    min_points : int or None
        Index of the neighbour (in each sorted column of the distance matrix,
        which includes the zero self-distance) used as the core distance.
        Defaults to ``min_cluster_size``.

    The stages store their results on the instance:
    ``_mutual_reachability_graph`` (square array), ``_raw_tree`` (linkage
    matrix) and ``_tree_frame`` (DataFrame of condensed-tree edges).
    """

    def __init__(self, min_cluster_size=5, min_points=None):
        self.min_cluster_size = min_cluster_size
        if min_points is None:
            self.min_points = min_cluster_size
        else:
            self.min_points = min_points
        self._mutual_reachability_graph = None
        self._raw_tree = None
        self._tree_frame = None

    def _mutual_reachability(self, distance_matrix):
        """Compute and store the mutual reachability matrix.

        Entry ``(i, j)`` is ``max(core[i], core[j], distance_matrix[i, j])``,
        where ``core[i]`` is the ``min_points``-th smallest value in column
        ``i`` of ``distance_matrix``.

        Raises ``ValueError`` if ``distance_matrix`` is not a non-empty
        square 2-D array.
        """
        distance_matrix = np.asarray(distance_matrix)
        if (distance_matrix.ndim != 2
                or distance_matrix.shape[0] != distance_matrix.shape[1]
                or distance_matrix.shape[0] == 0):
            raise ValueError("distance_matrix must be a non-empty square 2-D array")

        dim = distance_matrix.shape[0]
        # Clamp so small data sets do not push np.partition out of bounds.
        kth = min(self.min_points, dim - 1)
        core_distances = np.partition(distance_matrix, kth, axis=0)[kth]
        # Broadcasting avoids materialising an (n, n, 3) stack.
        self._mutual_reachability_graph = np.maximum(
            np.maximum(core_distances[:, np.newaxis], core_distances[np.newaxis, :]),
            distance_matrix)
        return

    def _single_linkage(self):
        """Run single-linkage clustering on the mutual reachability matrix."""
        assert self._mutual_reachability_graph is not None
        # fastcluster treats a 2-D input as observation vectors, so the square
        # matrix must be condensed first. checks=False because the diagonal
        # holds core distances, not zeros.
        condensed = squareform(self._mutual_reachability_graph, checks=False)
        self._raw_tree = fc.single(condensed)
        return

    def _condense_tree(self):
        """Condense the raw linkage tree and store it as an edge DataFrame."""
        assert self._raw_tree is not None
        base_tree = sch.to_tree(self._raw_tree)
        condensed_tree, _ = condense_tree(base_tree, self.min_cluster_size)
        self._tree_frame = flatten_tree(condensed_tree)

    def _compute_stable_clusters(self):
        """Placeholder: cluster extraction from the condensed tree is not implemented."""
        assert self._tree_frame is not None
        pass

    def fit(self, distance_matrix):
        """Run every pipeline stage on ``distance_matrix`` and return ``self``."""
        self._mutual_reachability(distance_matrix)
        self._single_linkage()
        self._condense_tree()
        self._compute_stable_clusters()
        return self
        