In [2]:
import numpy as np
import pandas as pd

import cudf
import os

from sklearn import datasets
from sklearn.metrics import adjusted_rand_score
from sklearn.cluster import KMeans
from sklearn.manifold.t_sne import trustworthiness

from cuml.manifold.umap import UMAP

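The following generates 5 random clusters and demonstrates UMAP's ability to project the input data into a low-dimensional embedding in which the 5 clusters are still separable. Separability is checked by running KMeans on the embedding and comparing the predicted clusters with the true labels using the adjusted Rand score.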

In [14]:
# Synthetic benchmark: 50,000 points in 10 dimensions drawn around 5 centers.
# A fixed random_state makes the generated data (and therefore the clustering
# check that consumes it) reproducible across kernel restarts.
data, labels = datasets.make_blobs(
    n_samples=50000, n_features=10, centers=5, random_state=42)

In [15]:
%%time
embedding = UMAP().fit_transform(data)

CPU times: user 15.4 s, sys: 1.71 s, total: 17.1 s
Wall time: 4.53 s


In [17]:
# Cluster the embedding into 5 groups and compare the assignment with the
# true blob labels. KMeans is seeded so the check is repeatable.
predicted_clusters = KMeans(n_clusters=5, random_state=0).fit_predict(embedding)
score = adjusted_rand_score(labels, predicted_clusters)

# The adjusted Rand score is a float: compare with a tolerance instead of
# exact equality, and report the value on failure so it can be diagnosed.
if np.isclose(score, 1.0):
    print("Cluster demonstration completed successfully")
else:
    print("Cluster demonstration failed: " + str(score))

Cluster demonstration completed successfully


The following uses the iris dataset to demonstrate UMAP's ability to produce low-dimensional embeddings that preserve local neighborhood structure. This is evaluated using the trustworthiness score, a neighborhood-preservation metric that scikit-learn provides alongside its t-SNE implementation.

In [20]:
# Load the iris dataset; `iris` is kept around because later cells index
# into it again, and `data` is the array handed to UMAP.
iris = datasets.load_iris()
data = iris["data"]

Evaluate the trustworthiness of an embedding produced with random initialization.

In [21]:
%%time
embedding = UMAP(
    n_neighbors=10, min_dist=0.01,  init="random"
).fit_transform(data)

CPU times: user 9.84 s, sys: 380 ms, total: 10.2 s
Wall time: 945 ms


In [22]:
# Score how well the embedding keeps each sample's 10 nearest neighbors.
# n_neighbors is passed by keyword: recent scikit-learn releases accept it
# as a keyword-only argument, and older releases accept the keyword too.
trust = trustworthiness(iris.data, embedding, n_neighbors=10)
if trust >= 0.95:
    print("Trustworthiness on random initialization passed successfully")
else:
    # Include the score so a failure can be diagnosed from the output alone.
    print("Trustworthiness failed on random initialization: " + str(trust))

Trustworthiness on random initialization passed successfully


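Evaluate the trustworthiness of embeddings produced with spectral initialization. Here the model is fitted on a random subset (about 75%) of the iris samples and then used to transform the held-out samples, so the score reflects how well the fitted model embeds unseen data.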

In [103]:
# Dedicated, seeded generator so the fit / held-out split is identical on
# every run; the pass/fail threshold applied later depends on this split.
split_rng = np.random.RandomState(42)

# Boolean mask with one entry per iris sample: True (~75%) selects a sample
# for fitting, False (~25%) leaves it for the transform step.
iris_selection = split_rng.choice(
    [True, False], len(iris.data), replace=True, p=[0.75, 0.25])
data = iris.data[iris_selection]

In [106]:
%%time
fitter = UMAP(n_neighbors=10, min_dist=0.01, verbose=True, init = "spectral")
fitter.fit(data)

CPU times: user 9.83 s, sys: 296 ms, total: 10.1 s
Wall time: 832 ms


In [107]:
# Complement of the selection mask: the samples that were not used for fitting.
held_out_mask = np.logical_not(iris_selection)
new_data = iris.data[held_out_mask]

# Embed the unseen samples with the already-fitted model.
embedding = fitter.transform(new_data)

In [108]:
# Score how well the transformed held-out samples keep their 10 nearest
# neighbors. n_neighbors is passed by keyword: recent scikit-learn releases
# accept it as a keyword-only argument, and older releases accept the keyword too.
trust = trustworthiness(new_data, embedding, n_neighbors=10)
if trust >= 0.96:
    print("Trustworthiness on spectral initialization passed successfully")
else:
    print("Trustworthiness failed on spectral initialization: " + str(trust))

Trustworthiness on spectral initialization passed successfully
