I started from [this tutorial](https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/), but let me know if there are other sources I should check out.

In [1]:
from scipy.cluster.hierarchy import leaves_list, linkage
import pandas
import numpy as np
from time import time

In [2]:
def _leaf_order(observations):
    """Return the Ward-linkage dendrogram leaf order for the rows of a 2-D array."""
    count = len(observations)
    # linkage() cannot cluster fewer than two observations; with zero or one
    # there is nothing to reorder, so keep the existing order.
    if count < 2:
        return list(range(count))
    return leaves_list(linkage(observations, 'ward')).tolist()


def cluster(dataframe):
    """Return a copy of *dataframe* with rows and columns in clustered order.

    Rows and columns are each hierarchically clustered (Ward linkage) and
    then arranged in the leaf order of the resulting dendrogram, so that
    similar rows (and similar columns) end up next to each other.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Numeric table to reorder.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        Same shape, values and labels as the input, reordered.
    """
    values = dataframe.values
    cols_order = _leaf_order(values.T)
    rows_order = _leaf_order(values)

    # Reorder by position rather than by label: label-based selection
    # repeats rows/columns whenever the index or columns contain duplicates.
    return dataframe.iloc[rows_order, cols_order]

In [3]:
# Smoke test: rows r1/r3 and r2/r4 are near-identical, as are columns c1/c3
# and c2/c4, so clustering should place each pair side by side.
dataframe = pandas.DataFrame(
    [
        [1, 4, 1, 5],
        [8, 4, 8, 5],
        [2, 4, 2, 5],
        [9, 4, 9, 5]
    ],
    columns=['c1', 'c2', 'c3', 'c4'],
    index=['r1', 'r2', 'r3', 'r4']
)

clustered = cluster(dataframe)

# DataFrame.as_matrix() was removed in pandas 1.0; .values is available in
# both old and new pandas releases.
assert clustered.values.tolist() == [
    [1, 1, 4, 5],
    [2, 2, 4, 5],
    [8, 8, 4, 5],
    [9, 9, 4, 5]
]
assert clustered.columns.tolist() == ['c1', 'c3', 'c2', 'c4']
assert clustered.index.tolist() == ['r1', 'r3', 'r2', 'r4']

In [4]:
def random_df(rows,cols):
    """Build a ``rows`` x ``cols`` DataFrame filled by ``np.random.rand``.

    Rows are labelled ``gene-0``, ``gene-1``, ... and columns ``cond-0``,
    ``cond-1``, ...
    """
    values = np.random.rand(rows, cols)
    gene_names = list(map('gene-{}'.format, range(rows)))
    cond_names = list(map('cond-{}'.format, range(cols)))
    return pandas.DataFrame(values, index=gene_names, columns=cond_names)

In [10]:
def const_df(rows,cols):
    """Build a ``rows`` x ``cols`` DataFrame in which every value is 0.0.

    Rows are labelled ``gene-0``, ``gene-1``, ... and columns ``cond-0``,
    ``cond-1``, ...
    """
    zeros = np.zeros((rows, cols))
    gene_names = list(map('gene-{}'.format, range(rows)))
    cond_names = list(map('cond-{}'.format, range(cols)))
    return pandas.DataFrame(zeros, index=gene_names, columns=cond_names)

In [12]:
def time_cluster(df):
    """Return the elapsed time, in seconds, of a single ``cluster(df)`` call.

    The clustered result is discarded; only the duration is reported.
    """
    # timeit's default_timer is a clock intended for measuring intervals
    # (perf_counter on Python 3); time() follows the wall clock and can jump
    # if the system time is adjusted mid-measurement.
    from timeit import default_timer

    start = default_timer()
    cluster(df)
    return default_timer() - start

In [16]:
# Time clustering of all-zero frames: one table row per row count, one table
# column per column count.  Each cell is the duration of one cluster() call.
cols = [5, 25]
rows = [10, 50, 100, 500, 1000, 5000, 10000]
pandas.DataFrame(
    data=[
        [time_cluster(const_df(n_rows, n_cols)) for n_cols in cols]
        for n_rows in rows
    ],
    index=rows,
    columns=cols,
)

Unnamed: 0,5,25
10,0.001986,0.001898
50,0.002218,0.002403
100,0.001894,0.001976
500,0.006028,0.008274
1000,0.01722,0.026527
5000,0.410697,0.595262
10000,1.57984,2.417594


In [14]:
# Time clustering of randomly filled frames: one table row per row count, one
# table column per column count.  Each cell is the duration of one cluster()
# call.
cols = [5, 25]
rows = [10, 50, 100, 500, 1000, 5000, 10000]
pandas.DataFrame(
    data=[
        [time_cluster(random_df(n_rows, n_cols)) for n_cols in cols]
        for n_rows in rows
    ],
    index=rows,
    columns=cols,
)

Unnamed: 0,5,25
10,0.001995,0.002233
50,0.001941,0.001865
100,0.002065,0.002064
500,0.008196,0.010492
1000,0.024445,0.033961
5000,0.910093,1.082504
10000,3.795155,4.658869


In [17]:
# Repeat the random-data benchmark at larger row counts.  ``cols`` is not set
# here; it is reused from a previously executed benchmark cell.
rows = [10000, 20000, 30000]
pandas.DataFrame(
    data=[
        [time_cluster(random_df(n_rows, n_cols)) for n_cols in cols]
        for n_rows in rows
    ],
    index=rows,
    columns=cols,
)

Unnamed: 0,5,25
10000,3.866267,4.759982
20000,18.907857,22.474682
30000,52.796474,61.883403


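Takeaways:
- Clustering speed depends on the characteristics of the input: at 10,000 rows, random data took roughly twice as long to cluster as constant data.
- Increasing the column count won't matter that much at the scale we're dealing with: going from 5 to 25 columns increased the runtime by well under a factor of two at every size measured.
- This gives us a better sense of when we might start running into multi-minute runtimes: 30,000 rows already takes about a minute, and the runtime grows much faster than linearly with the row count.
- Letting the UI offer reclustering of subsets is not inconceivable.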