In [7]:
import pandas as pd
import numpy as np
import fastcluster as fc
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.datasets as data

%matplotlib inline
sns.set_context("poster")

In [21]:
class CondensedTreeNode:    
    def __init__(self, id, dist, children, size, is_leaf):
        self.id = id
        self.dist = dist
        self.children = children
        self.child_size = size
        self.is_leaf = is_leaf
    
    def add_child(self, child):
        self.children.append(child)

    def __repr__(self):
        return '<Node object at %s>' % (
            hex(id(self))
            )
        
    def __str__(self):
        return "ID: %d, Lambda %d, Number of Children %d, " \
               "Number of children %d, Leaf node: %s" % (self.id, self.dist, len(self.children), 
                                                         self.child_size, self.is_leaf)

def get_leaves(tree):
    """Consume a tree object and return a list of leaf nodes"""
    return tree.pre_order(lambda x: x)

def condense_tree(tree, min_cluster_size=10, next_id=0):
        
    #Verbose assert
    if tree.count == 0:
        print("Invalid input: Null tree")
        result = Node(-1, -1, [], -1, False)
        return result
    elif tree.count == 1:
        #Passed in a single node. only that node
        result = Node(-1, tree.dist, [], 1, True)
        return result
        
    result = CondensedTreeNode(next_id, tree.dist, [], tree.left.count + tree.right.count, 0)
        
    #If the left node is too small, add a leaf
    if tree.left.count <= min_cluster_size:
        leaves = get_leaves(tree.left)
        for leaf in leaves:
            result.add_child(CondensedTreeNode(-(leaf.id + 1), tree.left.dist, [], 1, True))
    elif tree.right.count <= min_cluster_size:
        child, next_id = condense_tree(tree.left, min_cluster_size, next_id)
        result.add_child(child)
    else:
        child, next_id = condense_tree(tree.left, min_cluster_size, next_id + 1)
        result.add_child(child)
            
    #If the right node is too small, add a leaf
    if tree.right.count <= min_cluster_size:
        leaves = get_leaves(tree.right)
        for leaf in leaves:
            result.add_child(CondensedTreeNode(-(leaf.id + 1), tree.right.dist, [], 1, True))
    elif tree.left.count <= min_cluster_size:
        child, next_id = condense_tree(tree.right, min_cluster_size, next_id)
        result.add_child(child)        
    else:
        child, next_id = condense_tree(tree.right, min_cluster_size, next_id + 1)
        result.add_child(child)
        
    return result, next_id
    

In [22]:
def flatten_node(tree_node):
    return [(tree_node.id, x.id, 1.0/tree_node.dist, x.child_size) for x in tree_node.children
             if tree_node.id != x.id]

def flatten_tree_recursion(tree):
    if tree.is_leaf:
        return []
    result = flatten_node(tree)
    for subtree in tree.children:
        result.extend(flatten_tree_recursion(subtree))
    return result

def flatten_tree(tree):
    result = flatten_tree_recursion(tree)
    return pd.DataFrame(result, columns=("parent","child","lambda","child_size"))

In [23]:
def stability(row):
    return (row["lambda_death"] - row["lambda_birth"]) * row["child_size"]

def compute_stability(cluster_tree):
    births = cluster_tree.groupby("child").min()[["lambda"]]
    births_and_deaths = cluster_tree.join(births, on="parent", lsuffix="_death", rsuffix="_birth")
    births_and_deaths["stability"] = births_and_deaths.apply(stability, axis=1)
    return births_and_deaths.groupby("parent")[["stability"]].sum() / \
            pd.DataFrame(births_and_deaths.parent.value_counts(), columns=["stability"])
    
def stability_score(tree_node, stability_table):
    node_stability = stability_table.loc[tree_node.id][0]
    child_stability = sum(max(stability_score(x, stability_table)) for x in tree_node.children if not x.is_leaf)
    return (node_stability, child_stability)

def get_clusters(tree_node, stability_table, results={}):
    tree_node.score = stability_score(tree_node, stability_table)
    if tree_node.score[0] > tree_node.score[1]:
        tree_node.is_cluster = True
        cluster_id = max(results.keys()) + 1 if results.keys() else 0 
        results[cluster_id] = tree_node.points
    else:
        tree_node.is_cluster = False
    if not tree_node.is_cluster:
        for node in tree_node.children:
            if not node.is_leaf:
                get_clusters(node, stability_table, results)
    return results

def get_leaf_point_ids(tree):
    results = []
    for node in tree.children:
        if node.is_leaf:
            results.append(-(node.id - 1))
        else:
            results.extend(get_leaf_point_ids(node))
    return results

def reduce_tree(tree):
    result = CondensedTreeNode(tree.id, 0, [], 0, 0)
    result.points = get_leaf_point_ids(tree)
    
    children_to_process = tree.children[:]
    
    for child in children_to_process:
        if child.is_leaf:
            continue
        if child.id == tree.id:
            children_to_process.extend(child.children)
        else:
            result.children.append(reduce_tree(child))
            
    return result

In [52]:
class HDBSCAN (object):
    
    def __init__(self, min_cluster_size=5, min_points=None):
        self.min_cluster_size = min_cluster_size
        if min_points is None:
            self.min_points = min_cluster_size
        else:
            self.min_points = min_points
        self._mutual_reachability_graph = None
        self._raw_tree = None
        self._tree_frame = None
        
    def _mutual_reachability(self, distance_matrix):
        dim = distance_matrix.shape[0]
        core_distances = np.partition(distance_matrix, self.min_points, axis=0)[self.min_points]
        core_distance_matrix = core_distances.repeat(dim).reshape((dim,dim))
        self._mutual_reachability_graph = np.dstack((core_distance_matrix, 
                                                     core_distance_matrix.T, 
                                                     distance_matrix)).max(axis=2)
        return
    
    def _single_linkage(self):
        assert(self._mutual_reachability_graph is not None)
        #self._raw_tree = single_linkage(self._mutual_reachability_graph)
        pdist_array = distance_matrix_to_pdist(self._mutual_reachability_graph)
        self._raw_tree = fc.single(pdist_array)
        return
    
    def _condense_tree(self):
        assert(self._raw_tree is not None)
        base_tree = sch.to_tree(self._raw_tree)
        self._condensed_tree, final_id = condense_tree(base_tree, self.min_cluster_size)
        self._tree_frame = flatten_tree(self._condensed_tree)
        
    def _compute_stable_clusters(self):
        assert(self._tree_frame is not None)
        self._stability_table = compute_stability(self._tree_frame)
        self._reduced_tree = reduce_tree(self._condensed_tree)
        self._cluster_dict = get_clusters(self._reduced_tree, self._stability_table)
    
    def fit(self, distance_matrix):
        self._mutual_reachability(distance_matrix)
        self._single_linkage()
        self._condense_tree()
        self._compute_stable_clusters()
        return self._cluster_dict
        

In [25]:
import scipy.spatial.distance as dist

iris = pd.read_csv("iris.csv")
distance_matrix = dist.squareform(dist.pdist(iris.ix[:,:4].as_matrix()))

In [53]:
%%timeit
clusterer = HDBSCAN(10)
clusterer.fit(distance_matrix)

10 loops, best of 3: 73.6 ms per loop


In [27]:
digits = data.load_digits()
digits_distance_matrix = dist.squareform(dist.pdist(digits.data))

In [54]:
%%timeit
clusterer = HDBSCAN(10)
clusterer.fit(digits_distance_matrix)

1 loops, best of 3: 1.47 s per loop


In [55]:
%%prun
clusterer = HDBSCAN(10)
clusterer.fit(digits_distance_matrix)

 

In [30]:
%timeit fc.single(digits_distance_matrix)

1 loops, best of 3: 36.7 s per loop


In [31]:
def mst_linkage_core(distance_matrix):
    result = []
    node_labels = np.arange(distance_matrix.shape[0])
    current_node = 0
    last_distances = np.infty * np.ones(node_labels.shape[0])
    current_distances = np.infty * np.ones(node_labels.shape[0])
    current_labels = node_labels
    for i in xrange(1,node_labels.shape[0]):
        label_filter = current_labels != current_node
        current_labels = current_labels[label_filter]
        comparison_distances = np.vstack((last_distances[label_filter],
                                            distance_matrix[current_node][current_labels]))
        current_distances = np.min(comparison_distances, axis=0)
        last_distances = current_distances
        
        new_node_index = np.argmin(current_distances)
        new_node = current_labels[new_node_index]
        result.append([current_node, new_node, current_distances[new_node_index]])
        current_node = new_node
        
    return result

In [32]:
%timeit mst_linkage_core(digits_distance_matrix)

1 loops, best of 3: 345 ms per loop


In [43]:
class UnionFind:
    
    def __init__(self, N):
        self.parent = np.array([-1] * (2*N-1))
        self.next_label = N
        self.size = np.array([1] * N)
        self.size = np.concatenate([self.size, np.array([0] * (N-1))])
        
    def union(self, m, n):
        self.size[self.next_label] = self.size[m] + self.size[n]
        self.parent[m] = self.next_label
        self.parent[n] = self.next_label
        self.size[self.next_label] = self.size[m] + self.size[n]
        self.next_label += 1
        
    def find(self, n):
        while self.parent[n] != -1 :
            n = self.parent[n]
        return n
    
    def fast_find(self, n):
        p = n
        while self.parent[n] != -1 :
            n = self.parent[n]
        while self.parent[p] != n: # label all the way up to the root
            (p, self.parent[p]) = (self.parent[p], n)
        return n

In [41]:
def label(L, do_fast_find=True):
    LL = []
    N = len(L) + 1 # QUESTION: why isn't this len(L)??
    U = UnionFind(N)
    for idx, (a, b, delta) in enumerate(L):
        if do_fast_find:
            aa, bb = (U.fast_find(a), U.fast_find(b))
        else:
            aa, bb = (U.find(a), U.find(b))
        LL.append((aa, bb, delta, U.size[aa] + U.size[bb]))      
        U.union(aa, bb)
    return LL

In [42]:
def single_linkage(distance_matrix):
    hierarchy = np.array(mst_linkage_core(distance_matrix))
    sort_order = np.argsort(hierarchy.T[2])
    sorted_hierarchy = hierarchy[sort_order,:]
    return label(sorted_hierarchy)

In [50]:
def distance_matrix_to_pdist(matrix):
    result = []
    for index, row in enumerate(distance_matrix):
        result.append(row[index:])
    return np.hstack(result)

In [51]:
%timeit distance_matrix_to_pdist(digits_distance_matrix)

100 loops, best of 3: 2.11 ms per loop
