## UMAP reproducibility benchmark (runtime & trustworthiness)

In [1]:
from umap import UMAP as umap_learn
from cuml.manifold import UMAP as umap_cuml
from cuml.metrics import trustworthiness

from sklearn.datasets import make_blobs
import time
import numpy as np

In [2]:
def generate_data(n_samples):
    """Create a synthetic blob dataset with ``n_samples`` rows.

    The feature count is read from the module-level ``args`` dict
    (``args['n_features']``), so ``args`` must exist before this is called.
    The number of blob centers is ``int(n_samples / 20)`` and every blob uses
    a standard deviation of 8.0.

    Only the sample matrix returned by ``make_blobs`` is handed back; the
    labels are discarded.
    """
    blob_kwargs = {
        'n_samples': n_samples,
        'n_features': args['n_features'],
        # One center per 20 samples.
        'centers': int(n_samples / 20),
        'cluster_std': 8.0,
    }
    samples, _labels = make_blobs(**blob_kwargs)
    return samples

In [3]:
def benchmark_model(model_constr, args, data, n_components):
    """Time repeated ``fit_transform`` runs of a model and score the embeddings.

    A fresh model is built and fitted ``args['n_iter'] + 1`` times. The first
    run is treated as a warm-up: its duration is excluded from the runtime
    statistics. Its trustworthiness score is still included in the score
    average.

    Parameters
    ----------
    model_constr : callable
        Model constructor accepting ``n_components``, ``n_neighbors``,
        ``n_epochs`` and ``random_state`` keyword arguments and returning an
        object with a ``fit_transform`` method.
    args : dict
        Must provide ``'n_iter'``, ``'n_neighbors'``, ``'n_epochs'`` and
        ``'random_state'``.
    data : array-like
        Input passed to ``fit_transform`` and to ``trustworthiness``.
    n_components : int
        Dimensionality of the embedding requested from the model.

    Returns
    -------
    tuple
        ``(mean runtime, runtime variance, mean trustworthiness)``; runtimes
        are in seconds.
    """
    durations = []
    trust_scores = []
    # "+ 1" adds the warm-up iteration whose timing is dropped further down.
    for _ in range(args['n_iter'] + 1):
        # Instantiate model
        model = model_constr(n_components=n_components,
                             n_neighbors=args['n_neighbors'],
                             n_epochs=args['n_epochs'],
                             random_state=args['random_state'])

        # Perform transformation and measure time. perf_counter() is a
        # monotonic, high-resolution clock, so the measured interval cannot be
        # skewed by system clock adjustments the way time.time() can.
        start = time.perf_counter()
        transformed = model.fit_transform(data)
        durations.append(time.perf_counter() - start)

        # Compute trustworthiness score
        trust_scores.append(
            trustworthiness(data, transformed, n_neighbors=args['n_neighbors']))

    # Discard the warm-up timing (index 0) before computing statistics.
    durations = np.array(durations[1:])
    trust_scores = np.array(trust_scores)

    # Compute runtime average and variance as well as trustworthiness score average
    return durations.mean(), durations.var(), trust_scores.mean()

In [4]:
def benchmark(args):
    """Run the seeded vs. unseeded UMAP benchmark over a grid of settings.

    For every combination of ``args['n_samples']`` and ``args['n_components']``
    a dataset is generated with ``generate_data`` and each model is benchmarked
    twice via ``benchmark_model``: once with ``random_state=None`` and once
    with ``random_state=42``. Results are printed; nothing is returned.

    umap-learn is only benchmarked when ``n_samples * args['n_features']`` is
    at most 10,000,000; cuML UMAP is always benchmarked.

    Note: ``args['random_state']`` is overwritten in place on the caller's
    dict (set to ``None``, then ``42``) and is left at ``42`` afterwards.

    Parameters
    ----------
    args : dict
        Must provide ``'n_samples'`` and ``'n_components'`` (iterables) and
        ``'n_features'``, plus whatever ``benchmark_model`` reads from it.
    """
    # Largest dataset (in elements) for which umap-learn is still benchmarked.
    max_umap_learn_elements = 10000000

    for n_samples in args['n_samples']:
        for n_components in args['n_components']:
            # Generate dataset
            X = generate_data(n_samples)

            # Benchmarks the two models
            print("For dataset of shape ({}, {}) and n_components = {}:".format(n_samples, args['n_features'], n_components))

            n_elements = n_samples * args['n_features']
            if n_elements <= max_umap_learn_elements:
                print("\tUMAP-LEARN:")
                args['random_state'] = None
                ul_inconsistent = benchmark_model(umap_learn, args, X, n_components)
                args['random_state'] = 42
                ul_consistent = benchmark_model(umap_learn, args, X, n_components)
                print_results(ul_inconsistent, ul_consistent)

            print("\tCUML UMAP:")
            args['random_state'] = None
            cuml_inconsistent = benchmark_model(umap_cuml, args, X, n_components)
            args['random_state'] = 42
            cuml_consistent = benchmark_model(umap_cuml, args, X, n_components)
            print_results(cuml_inconsistent, cuml_consistent)

            # Element 0 of each result tuple is the mean runtime.
            seeded_mean = cuml_consistent[0]
            unseeded_mean = cuml_inconsistent[0]
            # "X% slower" is measured relative to the unseeded baseline, so the
            # baseline runtime is the denominator (a run that takes twice as
            # long reports 100%, not 50%).
            slowdown = ((seeded_mean - unseeded_mean) / unseeded_mean) * 100
            print('\t\tcuML consistent pathway is {:.2f}% slower\n'.format(slowdown))

In [5]:
def print_results(inconsistent, consistent):
    """Print the benchmark summary for an unseeded and a seeded run.

    Parameters
    ----------
    inconsistent : tuple
        ``(runtime mean, runtime variance, trustworthiness)`` measured without
        a random seed.
    consistent : tuple
        The same three values measured with a random seed.

    Each value is printed with two decimal places, one line per run.
    """
    ic_dur_mean, ic_dur_var, ic_trust = inconsistent
    print("\t\tWithout random seed: runtime avg - var: {:.2f} - {:.2f}, trustworthiness: {:.2f}".format(ic_dur_mean, ic_dur_var, ic_trust))
    c_dur_mean, c_dur_var, c_trust = consistent
    print("\t\tWith random seed: runtime avg - var: {:.2f} - {:.2f}, trustworthiness: {:.2f}".format(c_dur_mean, c_dur_var, c_trust))

In [6]:
import warnings

# Ignore every warning from here on.
warnings.filterwarnings('ignore')

# Benchmark configuration consumed by benchmark(), benchmark_model() and
# generate_data().
args = {
    # Dataset sizes (rows) and embedding dimensionalities to sweep over.
    'n_samples': [1000, 10000, 100000],
    'n_features': 1000,
    # NOTE(review): 'centers' does not appear to be read by the functions
    # above (generate_data derives the center count from n_samples) - confirm.
    'centers': 500,
    'n_components': [2, 8, 16],
    'n_neighbors': 15,
    'n_epochs': 500,
    # Number of timed runs per configuration (a warm-up run is added on top).
    'n_iter': 3,
}

benchmark(args)

For dataset of shape (1000, 1000) and n_components = 2:
	UMAP-LEARN:
		Without random seed: runtime avg - var: 2.56 - 0.00, tustworthiness: 1.00
		With random seed: runtime avg - var: 2.54 - 0.00, tustworthiness: 1.00
	CUML UMAP:
		Without random seed: runtime avg - var: 0.24 - 0.00, tustworthiness: 1.00
		With random seed: runtime avg - var: 0.24 - 0.00, tustworthiness: 1.00
		cuML consistent pathway is -3.18% slower

For dataset of shape (1000, 1000) and n_components = 8:
	UMAP-LEARN:
		Without random seed: runtime avg - var: 2.75 - 0.00, tustworthiness: 1.00
		With random seed: runtime avg - var: 2.71 - 0.00, tustworthiness: 1.00
	CUML UMAP:
		Without random seed: runtime avg - var: 0.25 - 0.00, tustworthiness: 1.00
		With random seed: runtime avg - var: 0.26 - 0.00, tustworthiness: 1.00
		cuML consistent pathway is 5.35% slower

For dataset of shape (1000, 1000) and n_components = 16:
	UMAP-LEARN:
		Without random seed: runtime avg - var: 2.85 - 0.00, tustworthiness: 1.00
		With ra