# UMAP Experiment

In [None]:
import sys
sys.path.append("..")

import datasets

from umap_bench.funcs import build_and_train
from umap_bench.funcs import draw_chart
from umap_bench.funcs import _run_build_and_train_once
from umap_bench.funcs import store_results
from umap_bench.funcs import maybe_load_results
from umap_bench.funcs import maybe_get_results

from umap_bench.funcs import perform_n_samples_test
from umap_bench.funcs import perform_n_components_test

from umap_bench import loaders

import warnings
warnings.filterwarnings("ignore")

import pickle
import rmm
import time
import numpy as np

from cuml.metrics import trustworthiness

import matplotlib.pyplot as plt

from umap import UMAP as UMAP_LEARN
from cuml.manifold import UMAP as UMAP_CUML

import os
os.getcwd()

Set the number of threads that the multi-core CPU UMAP implementation is allowed to use.

In [None]:
import os
# Thread count handed to numba through its NUMBA_NUM_THREADS environment variable.
# NOTE(review): `umap` is already imported in the first cell; confirm numba still
# honours this variable when it is set after that import, otherwise move this
# cell above the imports.
# NOTE(review): 80 is machine-specific -- adjust it to the host's core count.
os.environ["NUMBA_NUM_THREADS"] = "80"

Since the GPUMAP project is no longer maintained, we make a best effort to keep its benchmarks reproducible. GPUMAP is optional, so the other implementations can still be evaluated if it is not installed.

In [None]:
# GPUMAP is an optional dependency: record whether it could be imported so
# later cells can tell if UMAP_GPUMAP is available.
try:
    from gpumap import GPUMAP as UMAP_GPUMAP
except ImportError:
    has_gpumap = False
else:
    has_gpumap = True

has_gpumap

In [None]:
# Pickle files the benchmark results are loaded from and stored to.
RESULTS_FILE = "results/results.pickle"
SCALE_RESULTS_FILE = "results/scale_results.pickle"

# Number of GB to use for the device memory pool.
POOL_SIZE_GB = 15

# Number of rows to use per batch for computing trustworthiness.
TRUST_BATCH_SIZE = 5000

# Keys under which each UMAP implementation's results are recorded.
KEY_UMAPCUML = "umapcuml"
KEY_UMAPLEARN = "umaplearn"
KEY_UMAPGPUMAP = "umapgpumap"

In [None]:
# Reinitialize RMM with a pooled allocator whose initial size comes from
# POOL_SIZE_GB (converted from GiB to bytes).
rmm.reinitialize(
    pool_allocator=True,  # enable the pool allocator
    managed_memory=False,  # do not use managed memory
    initial_pool_size=int(POOL_SIZE_GB * 1024**3),  # POOL_SIZE_GB GiB, in bytes
    devices=0,  # GPU device IDs to register
    logging=False,  # left off: logging has performance overhead
)

In [None]:
final_results = maybe_load_results(RESULTS_FILE)

In [None]:
final_results

## Pen Digits Dataset

In [None]:
KEY_DIGITS = "digits"

X, y = loaders.load_digits()

In [None]:
results_digits = maybe_get_results(final_results, KEY_DIGITS)

In [None]:
results_digits[KEY_UMAPCUML] = build_and_train(UMAP_CUML, X, y, {})

In [None]:
results_digits[KEY_UMAPLEARN] = build_and_train(UMAP_LEARN, X, y, {})

In [None]:
# GPUMAP is optional: UMAP_GPUMAP only exists when its import succeeded, so
# skip this run instead of raising NameError when it is not installed.
if has_gpumap:
    results_digits[KEY_UMAPGPUMAP] = build_and_train(UMAP_GPUMAP, X, y, {})
else:
    print("GPUMAP is not installed; skipping.")

In [None]:
final_results[KEY_DIGITS] = results_digits

In [None]:
final_results[KEY_DIGITS]

In [None]:
store_results(final_results, RESULTS_FILE)

## Fashion MNIST Dataset

In [None]:
# https://github.com/zalandoresearch/fashion-mnist/blob/master/utils/mnist_reader.py
KEY_FASHION_MNIST = "fashion_mnist"

In [None]:
# Load both Fashion MNIST splits from data/fashion.
train, train_labels = loaders.load_fashion_mnist("data/fashion", kind="train")
test, test_labels = loaders.load_fashion_mnist("data/fashion", kind="t10k")

# Stack the splits, keep the first 50,000 rows, scale by 1/255 in float64 and
# store the result as float32.
X = np.vstack([train, test]).astype(np.float64)[:50000]
X = (X / 255.0).astype(np.float32)

# Labels are truncated the same way and cast to float32.
y = np.hstack([train_labels, test_labels])[:50000].astype(np.float32)

In [None]:
results_fashion = maybe_get_results(final_results, KEY_FASHION_MNIST)

In [None]:
results_fashion[KEY_UMAPCUML] = build_and_train(UMAP_CUML, X, y, {})

In [None]:
results_fashion[KEY_UMAPLEARN] = build_and_train(UMAP_LEARN, X, y, {})

In [None]:
# GPUMAP is optional: UMAP_GPUMAP only exists when its import succeeded, so
# skip this run instead of raising NameError when it is not installed.
if has_gpumap:
    results_fashion[KEY_UMAPGPUMAP] = build_and_train(UMAP_GPUMAP, X, y, {})
else:
    print("GPUMAP is not installed; skipping.")

In [None]:
final_results[KEY_FASHION_MNIST] = results_fashion

In [None]:
store_results(final_results, RESULTS_FILE)

In [None]:
final_results[KEY_FASHION_MNIST]

In [None]:
classes = [
    'T-shirt/top',
    'Trouser',
    'Pullover',
    'Dress',
    'Coat',
    'Sandal',
    'Shirt',
    'Sneaker',
    'Bag',
    'Ankle boot']

In [None]:
draw_chart(UMAP_LEARN(n_neighbors=10, min_dist=0.01), X, y, "Fashion MNIST", "UMAP-learn", classes)

In [None]:
draw_chart(UMAP_CUML(n_neighbors=10, min_dist=0.01), X, y, "Fashion MNIST", "cuML UMAP", classes)

In [None]:
# GPUMAP is optional: only draw its chart when the import succeeded. The chart
# label uses the project's name, "GPUMAP".
if has_gpumap:
    draw_chart(UMAP_GPUMAP(n_neighbors=10, min_dist=0.01), X, y, "Fashion MNIST", "GPUMAP", classes)
else:
    print("GPUMAP is not installed; skipping.")

## CIFAR-100 Dataset

In [None]:
KEY_CIFAR100 = "cifar100"

train, test = loaders.load_cifar100("data/cifar100/cifar-100-python")

# From each split keep only the `data` and `fine_labels` entries; the labels
# are read first because `train` / `test` are rebound to the data arrays.
train_labels = train[b"fine_labels"]
train = train[b"data"]
test_labels = test[b"fine_labels"]
test = test[b"data"]

In [None]:
# Stack the splits, keep the first 60,000 rows, scale by 1/255 in float64 and
# store the result as float32.
X = np.vstack([train, test]).astype(np.float64)[:60000]
X = (X / 255.0).astype(np.float32)

# Labels are truncated the same way and cast to float32.
y = np.hstack([train_labels, test_labels])[:60000].astype(np.float32)

In [None]:
results_cifar100 = maybe_get_results(final_results, KEY_CIFAR100)

In [None]:
results_cifar100[KEY_UMAPLEARN] = build_and_train(UMAP_LEARN, X, y, {})

In [None]:
results_cifar100[KEY_UMAPCUML] = build_and_train(UMAP_CUML, X, y, {})

In [None]:
# GPUMAP is optional: UMAP_GPUMAP only exists when its import succeeded, so
# skip this run instead of raising NameError when it is not installed.
if has_gpumap:
    results_cifar100[KEY_UMAPGPUMAP] = build_and_train(UMAP_GPUMAP, X, y, {})
else:
    print("GPUMAP is not installed; skipping.")

In [None]:
final_results[KEY_CIFAR100] = results_cifar100
store_results(final_results, RESULTS_FILE)

In [None]:
results_cifar100

## Shuttle Dataset

In [None]:
KEY_SHUTTLE = "shuttle"

X, y = loaders.load_shuttle("data/shuttle.mat")

In [None]:
results_shuttle = maybe_get_results(final_results, KEY_SHUTTLE)

In [None]:
results_shuttle[KEY_UMAPCUML] = build_and_train(UMAP_CUML, X, y, {})

In [None]:
results_shuttle[KEY_UMAPLEARN] = build_and_train(UMAP_LEARN, X, y, {})

In [None]:
# GPUMAP is optional: UMAP_GPUMAP only exists when its import succeeded, so
# skip this run instead of raising NameError when it is not installed.
if has_gpumap:
    results_shuttle[KEY_UMAPGPUMAP] = build_and_train(UMAP_GPUMAP, X, y, {})
else:
    print("GPUMAP is not installed; skipping.")

In [None]:
final_results[KEY_SHUTTLE] = results_shuttle
store_results(final_results, RESULTS_FILE)

In [None]:
results_shuttle

## COIL-20 Dataset

In [None]:
KEY_COIL20 = "coil20"

X, y = loaders.load_coil20("data/")

In [None]:
results_coil20 = maybe_get_results(final_results, KEY_COIL20)

In [None]:
results_coil20[KEY_UMAPCUML] = build_and_train(UMAP_CUML, X, y, {})

In [None]:
# GPUMAP is optional: UMAP_GPUMAP only exists when its import succeeded, so
# skip this run instead of raising NameError when it is not installed.
if has_gpumap:
    results_coil20[KEY_UMAPGPUMAP] = build_and_train(UMAP_GPUMAP, X, y, {})
else:
    print("GPUMAP is not installed; skipping.")

In [None]:
results_coil20[KEY_UMAPLEARN] = build_and_train(UMAP_LEARN, X, y, {})

In [None]:
final_results[KEY_COIL20] = results_coil20
store_results(final_results, RESULTS_FILE)

In [None]:
final_results[KEY_COIL20]

## MNIST Dataset

In [None]:
KEY_MNIST = "mnist"

X, y = loaders.load_mnist("data/")

In [None]:
results_mnist = maybe_get_results(final_results, KEY_MNIST)

In [None]:
results_mnist[KEY_UMAPCUML] = build_and_train(UMAP_CUML, X, y, {})

In [None]:
results_mnist[KEY_UMAPLEARN] = build_and_train(UMAP_LEARN, X, y, {})

In [None]:
# GPUMAP is optional: UMAP_GPUMAP only exists when its import succeeded, so
# skip this run instead of raising NameError when it is not installed.
if has_gpumap:
    results_mnist[KEY_UMAPGPUMAP] = build_and_train(UMAP_GPUMAP, X, y, {})
else:
    print("GPUMAP is not installed; skipping.")

In [None]:
final_results[KEY_MNIST] = results_mnist
store_results(final_results, RESULTS_FILE)

In [None]:
results_mnist

## scRNA

This benchmark requires a pickle file produced by the GPU notebook [here](https://github.com/clara-parabricks/rapids-single-cell-examples). The file is expected at `data/scrna.pickle`.

In [None]:
KEY_SCRNA = "scrna"

# NOTE: unpickling can execute arbitrary code -- only load a file you produced
# yourself from the referenced notebook.
# The context manager closes the file handle once the data has been read.
with open("data/scrna.pickle", "rb") as scrna_file:
    X = pickle.load(scrna_file)

In [None]:
X.shape

In [None]:
results_scrna = maybe_get_results(final_results, KEY_SCRNA)

In [None]:
results_scrna[KEY_UMAPCUML] = build_and_train(UMAP_CUML, X, None, {})

In [None]:
results_scrna[KEY_UMAPLEARN] = build_and_train(UMAP_LEARN, X, None, {})

In [None]:
# GPUMAP is optional: UMAP_GPUMAP only exists when its import succeeded, so
# skip this run instead of raising NameError when it is not installed.
if has_gpumap:
    results_scrna[KEY_UMAPGPUMAP] = build_and_train(UMAP_GPUMAP, X, None, {})
else:
    print("GPUMAP is not installed; skipping.")

In [None]:
final_results[KEY_SCRNA] = results_scrna
store_results(final_results, RESULTS_FILE)

In [None]:
results_scrna

## Scale Benchmark

Test UMAP variants at different `n_samples` and `n_components`. This requires downloading the "GoogleNews-vectors-negative300.bin.gz" dataset first.

In [None]:
# The bare name `load_word2vec` is never imported in this notebook; call it
# through the `loaders` module like the other dataset loaders.
X = loaders.load_word2vec("data/")

In [None]:
scale_results = maybe_load_results(SCALE_RESULTS_FILE)

In [None]:
scale_results

In [None]:
%%time
# Run the n_components scale test for cuML UMAP.
# NOTE(review): the return value is not captured here, unlike the
# perform_n_samples_test call further down; confirm whether the result should
# be assigned into scale_results before the following store_results cell.
perform_n_components_test(UMAP_CUML, X, KEY_UMAPCUML)

In [None]:
store_results(scale_results, SCALE_RESULTS_FILE)

In [None]:
%%time
scale_results[KEY_UMAPCUML] = perform_n_samples_test(UMAP_CUML, X)

In [None]:
store_results(scale_results, SCALE_RESULTS_FILE)