## Label


**Input:** numpy array of triples: (a, b, delta) where a and b are points (representing clusters) that were merged together at time delta. The array must already be sorted by delta.

**Output:** list of triples (ca, cb, delta), where ca and cb are the labels of the clusters being merged at time delta.

In [65]:
import numpy as np
import sys
import pandas as pd

In [92]:
class UnionFind:
    """Union-find forest that hands out a fresh label for every merge.

    Labels ``0 .. N-1`` are the initial singleton clusters. Every call to
    ``union`` takes the next unused label (starting at ``N``) and makes it the
    parent of the two merged clusters. A parent value of ``-1`` marks a root.
    """

    def __init__(self, N):
        """Create ``N`` singleton clusters with room for ``N - 1`` merges."""
        # N leaves plus at most N - 1 merge labels; -1 means "no parent yet".
        self.parent = np.full(max(2 * N - 1, 0), -1, dtype=int)
        self.next_label = N

    def union(self, m, n):
        """Merge the clusters containing ``m`` and ``n`` under a new label.

        Both arguments are resolved to their current roots before linking, so
        any member of a cluster may be passed. Linking a non-root directly
        would detach it from the cluster it already belongs to.
        """
        root_m = self.find(m)
        root_n = self.find(n)
        self.parent[root_m] = self.next_label
        self.parent[root_n] = self.next_label
        self.next_label += 1

    def find(self, n):
        """Return the root label of the cluster containing ``n``."""
        while self.parent[n] != -1:
            n = self.parent[n]
        return n

    def fast_find(self, n):
        """Return the root of ``n`` and compress the path walked to reach it.

        Every node visited on the way up is re-pointed straight at the root.
        The root itself is never modified.
        """
        root = n
        while self.parent[root] != -1:
            root = self.parent[root]
        # Second pass: read the old parent *before* overwriting it. A tuple
        # assignment of the form ``p, parent[p] = parent[p], root`` rebinds
        # ``p`` first and would therefore write to the wrong slot.
        while n != root:
            above = self.parent[n]
            self.parent[n] = root
            n = above
        return root

In [93]:
def label(L, do_fast_find=True):
    """Translate point-level merges into cluster-label merges.

    Parameters
    ----------
    L : sequence of (a, b, delta)
        Merges in the order they happen (already sorted by ``delta``).
        ``a`` and ``b`` are point indices in ``0 .. len(L)``; float-valued
        indices, as produced by a numeric numpy array, are accepted.
    do_fast_find : bool, optional
        Use the path-compressing ``UnionFind.fast_find`` when true (default),
        otherwise the plain ``UnionFind.find``.

    Returns
    -------
    list of (ca, cb, delta)
        For each input merge, the labels of the two clusters that contained
        ``a`` and ``b`` just before the merge, with ``delta`` passed through.
    """
    N = len(L) + 1
    U = UnionFind(N)
    find = U.fast_find if do_fast_find else U.find

    LL = []
    for a, b, delta in L:
        # Cast so that ids read from a float array can index the parent table.
        ca = find(int(a))
        cb = find(int(b))
        LL.append((ca, cb, delta))
        # Link the cluster roots, not the raw points: re-parenting a point
        # that was merged earlier would detach it from its cluster and make
        # later lookups return stale labels.
        U.union(ca, cb)
    return LL

## Testing

In [2]:
import sklearn.datasets
import scipy.cluster.hierarchy as hac


In [3]:
# Load the iris dataset through scikit-learn's dataset loader.
iris = sklearn.datasets.load_iris()

In [5]:
# Pull out the `data` attribute of the loaded iris object.
data = iris.data

In [100]:
import scipy

In [102]:
# Eleven 2-D sample points: seven on or inside the unit square, followed by
# the four corners of the square spanning (2, 2) to (3, 3).
a = np.array(
    [
        (0.0, 0.0),
        (1.0, 0.0),
        (0.0, 1.0),
        (1.0, 1.0),
        (0.5, 0.0),
        (0.0, 0.5),
        (0.5, 0.5),
        (2.0, 2.0),
        (2.0, 3.0),
        (3.0, 2.0),
        (3.0, 3.0),
    ]
)

## Example

In [None]:
import scipy.cluster.hierarchy as hac
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Eleven 2-D example points, one (x, y) pair per row, clustered by the
# linkage calls further down.
a = np.array(
    [
        (0.1, 2.5),
        (1.5, 0.4),
        (0.3, 1.0),
        (1.0, 0.8),
        (0.5, 0.0),
        (0.0, 0.5),
        (0.5, 0.5),
        (2.7, 2.0),
        (2.2, 3.1),
        (3.0, 2.0),
        (3.2, 1.3),
    ]
)

hac

In [None]:
# One row of three axes per linkage method: scree plot, then the two
# candidate partitions. `axes23` was referenced but never created in this
# notebook, so build the 2x3 grid here.
fig, axes23 = plt.subplots(2, 3)

# Colour palette, indexed directly by the cluster id returned by fcluster.
# It does not depend on the method, so it is defined once outside the loop.
clr = ['#2200CC', '#D9007E', '#FF6600', '#FFCC00', '#ACE600', '#0099CC',
       '#8900CC', '#FF0000', '#FF9900', '#FFFF00', '#00CC01', '#0055CC']

for method, axes in zip(['single', 'complete'], axes23):
    z = hac.linkage(a, method=method)

    # Scree plot: merge distances (column 2 of the linkage matrix) in
    # reverse order, plotted against positions 1 .. len(z).
    axes[0].plot(range(1, len(z)+1), z[::-1, 2])
    # Second-order difference of the same curve; it has two fewer entries,
    # so entry i is plotted at x = i + 2.
    knee = np.diff(z[::-1, 2], 2)
    axes[0].plot(range(2, len(z)), knee)

    # Strongest knee, then the runner-up after zeroing out the first peak.
    num_clust1 = knee.argmax() + 2
    knee[knee.argmax()] = 0
    num_clust2 = knee.argmax() + 2

    axes[0].text(num_clust1, z[::-1, 2][num_clust1-1], 'possible\n<- knee point')

    # Cut the tree into at most num_clust1 / num_clust2 flat clusters.
    part1 = hac.fcluster(z, num_clust1, 'maxclust')
    part2 = hac.fcluster(z, num_clust2, 'maxclust')

    # Scatter each partition on its own axes, one colour per cluster id.
    for part, ax in zip([part1, part2], axes[1:]):
        for cluster in set(part):
            ax.scatter(a[part == cluster, 0], a[part == cluster, 1],
                       color=clr[cluster])

    m = '\n(method: {})'.format(method)
    plt.setp(axes[0], title='Screeplot{}'.format(m), xlabel='partition',
             ylabel='{}\ncluster distance'.format(m))
    plt.setp(axes[1], title='{} Clusters'.format(num_clust1))
    plt.setp(axes[2], title='{} Clusters'.format(num_clust2))

plt.tight_layout()
plt.show()