# Benchmark and Bounds Tests

The purpose of this notebook is to benchmark all of the single-GPU cuML algorithms against their scikit-learn counterparts, while also providing the ability to find and verify upper bounds. This version of the `cuml_benchmarks` notebook is meant to complete faster than the full version and to run on GPUs with smaller memory capacities.  If you need an exhaustive benchmark, please use the `cuml_benchmarks` notebook.

This benchmark will persist results into a file so that benchmarking may be continued, in the case of failure. 

Also supported is the ability to draw charts with the results, which should aid in presentations and transparency to end-users. 

**Note: if you get a Memory Error, please reduce your upper bound bench_rows to something that will fit in your GPU's memory.  This benchmark is Single GPU only, and you will have the opportunity to choose which GPU you want to benchmark**

## Notebook Credits
**Authorship**<br />
Original Author: Taurean Dyer, based on the work of Corey Nolet's original [cuML Benchmarks](intermediate_notebooks/benchmarks/cuml_benchmarks.ipynb)<br />
Last Edit: Taurean Dyer, 9/25/2019<br />

**Test System Specs**<br />
Test System Hardware: GV100<br />
Test System Software: Ubuntu 18.04<br />
RAPIDS Version: 0.10.0a - Docker Install<br />
Driver: 410.79<br />
CUDA: 10.0<br />


**Known Working Systems**<br />
RAPIDS Versions: 0.10

In [None]:
import numpy as np
import pandas as pd
import cudf
import os
import time
import pickle
import cuml

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from pylab import rcParams
rcParams['figure.figsize'] = 40, 20
rcParams['figure.dpi'] = 100

sns.set_style("darkgrid")


print(cuml.__version__)

## Please choose the GPU you'll be benchmarking and set its ID in the OS environment

In [None]:
!nvidia-smi

In [None]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Choose GPU here

In [None]:
# Default parameters

N_JOBS_SKLEARN = -1            # Passed to the n_jobs parameter, indicates number of cpu jobs to run
                               # Note that some sklearn algorithms do not support n_jobs (e.g. PCA), so they run a single job
RERUN_BENCH = True             # Set to true to force re-running even if a result is cached
MAX_BENCH_ROW_COUNTS = -1      # When iterating over many row sizes, only consider first N options (for faster testing, set to -1 for all options)
MAX_BENCH_FEATURE_COUNTS = -1  # When iterating over many feature counts, only consider first N options (for faster testing, set to -1 for all options)

# Benchmark function definitions

### Data loading functions

In [None]:
import gzip

def load_data_mortgage_X(nrows, ncols, cached = '../../data/mortgage/mortgage.npy.gz',source='mortgage', dtype = np.float32):
    """
    Build a pandas DataFrame with columns fea0..feaN for benchmarking.

    When the gzipped .npy file at `cached` exists and `source` is
    'mortgage', `nrows` rows are sampled (with replacement) from it and the
    first `ncols` columns are kept. Otherwise a (nrows, ncols) array of
    uniform random values from np.random.random is generated instead.
    In both cases the values are cast to `dtype` and NaNs are replaced by 0.
    """
    print("Loading " + str(cached))
    if os.path.exists(cached) and source=='mortgage':
        print('use mortgage data')
        with gzip.open(cached) as f:
            X = np.load(f)
        # randint's upper bound is exclusive: pass the row count itself so the
        # last row can be drawn too (and a one-row file does not raise).
        row_ids = np.random.randint(0, X.shape[0], nrows)
        X = X[row_ids, :ncols].astype(dtype, copy=False)
    else:
        print('use random data')
        X = np.random.random((nrows,ncols)).astype(dtype)
    df = pd.DataFrame({'fea%d'%i:X[:,i] for i in range(X.shape[1])}).fillna(0)
    return df

def load_data_mortgage_Xy(nrows, ncols, dtype = np.float32):
    """
    Return an (X, y) tuple built from two independent calls to
    load_data_mortgage_X: X is the nrows x ncols feature frame, y is the
    "fea0" column of a separate nrows x 1 frame.
    """
    features = load_data_mortgage_X(nrows, ncols, dtype = dtype)
    target_frame = load_data_mortgage_X(nrows, 1, dtype = dtype)
    return (features, target_frame["fea0"])


def load_data_X(nrows, ncols, dtype = np.float32):
    """
    Create a pandas DataFrame of shape (nrows, ncols) whose columns
    fea0..feaN hold uniform random values in [-1, 1) cast to `dtype`.
    """
    values = np.random.uniform(-1, 1, (nrows,ncols))
    columns = {}
    for col in range(values.shape[1]):
        columns['fea%d' % col] = values[:, col].astype(dtype)
    return pd.DataFrame(columns)

def load_data_Xy(nrows, ncols, dtype = np.float32):
    """
    Return an (X, y) tuple of random data: X is the nrows x ncols frame
    from load_data_X, y is the "fea0" column of a second nrows x 1 frame.
    """
    features = load_data_X(nrows, ncols, dtype)
    target_frame = load_data_X(nrows, 1, dtype)
    return (features, target_frame["fea0"])

def load_data_X_npy(nrows, ncols, dtype=np.float32):
    """
    Return a (nrows, ncols) numpy array of uniform random values in [-1, 1).

    The array is cast to `dtype`; previously the argument was accepted but
    ignored, so callers always received float64 data.
    """
    return np.random.uniform(-1, 1, (nrows, ncols)).astype(dtype)

def load_data_Xy_npy(nrows, ncols, dtype = np.float32):
    """
    Return an (X, y) tuple of numpy arrays from load_data_X_npy:
    X has ncols columns and y has a single column.
    """
    features = load_data_X_npy(nrows, ncols, dtype)
    target = load_data_X_npy(nrows, 1, dtype)
    return features, target
    

In [None]:
def pandas_convert(data):
    """
    Convert pandas objects to their cudf equivalents.

    Tuples are converted element by element (recursively); DataFrames and
    Series go through the matching cudf `from_pandas` constructor. Any
    other type raises an Exception.
    """
    if isinstance(data, tuple):
        return tuple(pandas_convert(item) for item in data)
    if isinstance(data, pd.DataFrame):
        return cudf.DataFrame.from_pandas(data)
    if isinstance(data, pd.Series):
        return cudf.Series.from_pandas(data)
    raise Exception("Unsupported type %s" % str(type(data)))
        
def no_convert(data):
    """
    Pass-through converter for data that is already in numpy form.

    A tuple comes back as a tuple of the same elements, an ndarray comes
    back unchanged, and any other type raises an Exception.
    """
    if isinstance(data, tuple):
        return tuple(data)
    if isinstance(data, np.ndarray):
        return data
    raise Exception("Unsupported type %s" % str(type(data)))


### Pluggable benchmark function 

In [None]:
class SpeedupBenchmark(object):
    """
    Benchmark that reports how many times faster algo.cuml runs than algo.sk.

    `converter` turns the loaded host data into whatever the cuML side
    consumes (pandas_convert by default).
    """

    def __init__(self, converter = pandas_convert):
        self.name = "speedup"
        self.converter = converter

    def __str__(self):
        return "Speedup"

    def run(self, algo, rows, dims, data):
        """
        Time one algo.cuml call and one algo.sk call and return the ratio
        sk_elapsed / cu_elapsed. `rows` and `dims` are accepted for
        interface compatibility but are not used here.
        """
        # The conversion happens before the timer starts, so it is not
        # counted against the cuML run.
        converted = self.converter(data)

        # perf_counter is monotonic and high resolution, unlike time.time,
        # which can jump if the system clock is adjusted mid-benchmark.
        cu_start = time.perf_counter()
        algo.cuml(converted)
        cu_elapsed = time.perf_counter() - cu_start

        sk_start = time.perf_counter()
        algo.sk(data)
        sk_elapsed = time.perf_counter() - sk_start

        # Only the speedup ratio is returned; the runner pairs it with
        # this benchmark's `name`.
        return sk_elapsed / cu_elapsed

In [None]:
class BenchmarkRunner(object):
    """
    Runs benchmarks for an algorithm over a grid of row counts and feature
    counts, persists the results to a pickle file after every grid point so
    an interrupted run can be resumed, and charts the stored results.
    """

    def __init__(self,
                 benchmarks = None,
                 out_filename = "benchmark.pickle",
                 rerun = RERUN_BENCH,
                 n_runs = 3,
                 bench_rows = None,
                 bench_dims = None):
        """
        benchmarks   -- benchmark objects exposing `name` and `run(...)`;
                        defaults to a single SpeedupBenchmark
        out_filename -- pickle file used to persist results
        rerun        -- when true, recompute grid points already in the file
        n_runs       -- repetitions averaged per grid point
        bench_rows   -- row counts to test; defaults to 2**13 .. 2**19
        bench_dims   -- feature counts to test; defaults to 64, 128, 256, 512
        """
        # Defaults are created per instance rather than in the signature so
        # that no list (or benchmark object) is shared between runners.
        if benchmarks is None:
            benchmarks = [SpeedupBenchmark()]
        if bench_rows is None:
            bench_rows = [2**x for x in range(13, 20)]
        if bench_dims is None:
            bench_dims = [64, 128, 256, 512]

        self.benchmarks = benchmarks
        self.rerun = rerun
        self.n_runs = n_runs
        self.bench_rows = self._limit(bench_rows, MAX_BENCH_ROW_COUNTS)
        self.bench_dims = self._limit(bench_dims, MAX_BENCH_FEATURE_COUNTS)
        self.out_filename = out_filename

    @staticmethod
    def _limit(options, max_count):
        """
        Return the first `max_count` options, or all of them when
        `max_count` is negative. A plain `options[:-1]` slice would
        silently drop the last option instead of keeping everything.
        """
        options = list(options)
        if max_count is None or max_count < 0:
            return options
        return options[:max_count]

    def load_results(self):
        """
        Return the results dict stored in `out_filename`, or an empty dict
        when the file does not exist.
        """
        if not os.path.exists(self.out_filename):
            return {}

        # NOTE: pickle can execute arbitrary code on load; only point
        # out_filename at files produced by this notebook.
        with open(self.out_filename, 'rb') as f:
            loaded = pickle.load(f)
        print("Loaded previous benchmark results from %s" % (self.out_filename))
        return loaded

    def store_results(self, final_results):
        """
        Overwrite `out_filename` with the pickled results dict.
        """
        with open(self.out_filename, 'wb') as f:
            pickle.dump(final_results, f)

    def run(self, algo):
        """
        Run every benchmark for `algo` on each (rows, dims) grid point that
        is missing from the stored results (or on all points when `rerun`
        is set), saving after each point.
        """
        final_results = self.load_results()
        results = final_results.setdefault(algo.name, {})

        for benchmark in self.benchmarks:
            for n_rows in self.bench_rows:
                for n_dims in self.bench_dims:
                    key = (n_rows, n_dims, benchmark.name)
                    if key in results and not self.rerun:
                        continue

                    print("Running %s. (nrows=%d, n_dims=%d)" % (str(algo), n_rows, n_dims))

                    # The same dataset is reused for all repetitions.
                    data = algo.load_data(n_rows, n_dims)
                    runs = [benchmark.run(algo, n_rows, n_dims, data) for _ in range(self.n_runs)]
                    results[key] = np.mean(runs)

                    print("Benchmark for %s = %f" % (str(key), results[key]))

                    # Persist after every grid point so a crash (e.g. out of
                    # memory) loses at most the point in progress.
                    self.store_results(final_results)

    def chart(self, algo, title = "cuML vs SKLearn"):
        """
        Plot, for each benchmark, one line per feature count showing the
        stored result against log2(number of rows).
        """
        stored = self.load_results()
        if algo.name not in stored:
            print("No stored results for %s in %s" % (str(algo), self.out_filename))
            return
        results = stored[algo.name]

        for benchmark in self.benchmarks:
            plts = []
            for dim in self.bench_dims:
                # Keep only this benchmark's points for this feature count.
                points = [(k[0], v) for (k, v) in results.items()
                          if k[1] == dim and k[2] == benchmark.name]
                if not points:
                    # Nothing was recorded for this feature count; skip it
                    # rather than plotting stale or missing data.
                    continue
                points.sort(key = lambda p: p[0])

                keys = [np.log2(p[0]) for p in points]
                values = [p[1] for p in points]
                line = plt.plot(keys, values, label = str(dim), linewidth = 3,  marker = 'o', markersize = 7)
                plts.append(line[0])

            if not plts:
                print("No %s results to chart for %s" % (str(benchmark), str(algo)))
                continue

            leg = plt.legend(handles = plts, fontsize = 30)
            leg.set_title("Dimensions", prop = {'size':'x-large'})
            plt.title("%s %s: %s" % (algo, benchmark, title), fontsize = 30)

            plt.ylabel(str(benchmark), fontsize = 20)
            plt.xlabel("Training Examples (2^x)", fontsize = 40)

            plt.tick_params(axis='both', which='major', labelsize=15)
            plt.tick_params(axis='both', which='minor', labelsize=15)

            plt.show()

In [None]:
class BaseAlgorithm(object):
    """
    Common base for the benchmarked algorithms: stores the data-loading
    callable that BenchmarkRunner invokes as load_data(n_rows, n_dims).
    """
    def __init__(self, load_data = load_data_X):
        # Callable used to produce the benchmark dataset for this algorithm.
        self.load_data = load_data

# Benchmarks and Results

### Nearest Neighbors

In [None]:
from sklearn.neighbors import NearestNeighbors
from cuml.neighbors import NearestNeighbors as cumlNN

class NearestNeighborsAlgo(BaseAlgorithm):
    """
    Nearest-neighbors benchmark: fits on the full dataset, then queries the
    neighbors of its first 100 rows, with scikit-learn (brute force) and cuML.
    """

    def __init__(self, n_neighbors = 1024, load_data = load_data_X):
        BaseAlgorithm.__init__(self, load_data)
        self.name = "nearest_neighbors"
        self.n_neighbors = n_neighbors

    def __str__(self):
        return "NearestNeighbors"

    def sk(self, X):
        # scikit-learn side of the comparison
        model = NearestNeighbors(n_neighbors = self.n_neighbors, algorithm = 'brute', n_jobs=N_JOBS_SKLEARN)
        model.fit(X)
        query = X[0:100]
        distances, indices = model.kneighbors(query)

    def cuml(self, X):
        # cuML side of the comparison
        model = cumlNN(n_neighbors = self.n_neighbors)
        model.fit(X)
        query = X[0:100]
        distances, indices = model.kneighbors(query)


In [None]:
runner = BenchmarkRunner(benchmarks = [SpeedupBenchmark(no_convert)], bench_rows = [2**x for x in range(11, 17)])
runner.run(NearestNeighborsAlgo(load_data = load_data_X_npy))

In [None]:
runner = BenchmarkRunner()
runner.chart(NearestNeighborsAlgo())

### DBSCAN

In [None]:
from sklearn.cluster import DBSCAN as skDBSCAN
from cuml import DBSCAN as cumlDBSCAN

class DBSCANAlgo(BaseAlgorithm):
    """
    DBSCAN benchmark: fits scikit-learn's DBSCAN (brute force) and cuML's
    DBSCAN with the same `eps` and `min_samples`.
    """

    def __init__(self, eps = 3, min_samples = 2):
        self.name = "dbscan"
        # Use the caller's values; these were previously hard-coded to 3 and
        # 2, so the constructor arguments had no effect.
        self.eps = eps
        self.min_samples = min_samples
        BaseAlgorithm.__init__(self)

    def __str__(self):
        return "DBSCAN"

    def sk(self, X):
        clustering_sk = skDBSCAN(eps = self.eps, min_samples = self.min_samples, algorithm = "brute", n_jobs=N_JOBS_SKLEARN)
        clustering_sk.fit(X)

    def cuml(self, X):
        clustering_cuml = cumlDBSCAN(eps = self.eps, min_samples = self.min_samples)
        clustering_cuml.fit(X)

In [None]:
runner = BenchmarkRunner(bench_rows = [2**x for x in range(10, 17)])
runner.run(DBSCANAlgo())

In [None]:
runner = BenchmarkRunner(bench_rows = [2**x for x in range(10, 17)])
runner.chart(DBSCANAlgo())

### UMAP

In [None]:
from umap import UMAP as skUMAP
from cuml.manifold.umap import UMAP as cumlUMAP

class UMAPAlgo(BaseAlgorithm):
    """
    UMAP benchmark: fits umap-learn's UMAP and cuML's UMAP with the same
    `n_neighbors` and `n_epochs`.
    """

    def __init__(self, n_neighbors = 5, n_epochs = 500):
        BaseAlgorithm.__init__(self)
        self.name = "umap"
        self.n_epochs = n_epochs
        self.n_neighbors = n_neighbors

    def __str__(self):
        return "UMAP"

    def _params(self):
        # Keyword arguments shared by both implementations.
        return dict(n_neighbors = self.n_neighbors, n_epochs = self.n_epochs)

    def sk(self, X):
        skUMAP(**self._params()).fit(X)

    def cuml(self, X):
        cumlUMAP(**self._params()).fit(X)

In [None]:
runner = BenchmarkRunner(bench_rows = [2**x for x in range(12, 16)])
runner.run(UMAPAlgo())

In [None]:
runner = BenchmarkRunner(bench_rows = [2**x for x in range(12, 16)])
runner.chart(UMAPAlgo())

### K-means Clustering

In [None]:
from sklearn.cluster import KMeans as skKmeans
from cuml.cluster import KMeans as cumlKmeans

class KMeansAlgo(BaseAlgorithm):
    """
    K-means benchmark on numpy data from load_data_X_npy: fits scikit-learn's
    KMeans and cuML's KMeans with the same number of clusters.
    """

    def __init__(self, n_clusters=5):
        BaseAlgorithm.__init__(self, load_data_X_npy)
        self.n_clusters = n_clusters
        self.name = "kmeans"

    def __str__(self):
        return "KMeans"

    def sk(self, X):
        skKmeans(n_clusters=self.n_clusters, n_jobs=N_JOBS_SKLEARN).fit(X)

    def cuml(self, X):
        cumlKmeans(n_clusters=self.n_clusters).fit(X)

In [None]:
runner = BenchmarkRunner(benchmarks = [SpeedupBenchmark(no_convert)], bench_rows = [2**x for x in range(12, 18, 2)])
runner.run(KMeansAlgo())

In [None]:
runner = BenchmarkRunner(bench_rows = [2**x for x in range(12, 18, 2)])
runner.chart(KMeansAlgo())

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression as skLR
from cuml.linear_model import LinearRegression as cumlLR

class LinearRegressionAlgo(BaseAlgorithm):
    """
    Linear-regression benchmark on (X, y) data from load_data_Xy: fits
    scikit-learn's and cuML's LinearRegression.
    """

    def __init__(self):
        self.name = "linear_regression"
        BaseAlgorithm.__init__(self, load_data_Xy)

    def __str__(self):
        return "Linear Regression"

    def sk(self, data):
        features, target = data
        skLR(n_jobs=N_JOBS_SKLEARN).fit(features, target)

    def cuml(self, data):
        features, target = data
        cumlLR().fit(features, target)

In [None]:
runner = BenchmarkRunner(bench_rows = [2**x for x in range(15, 18)])
runner.run(LinearRegressionAlgo())

In [None]:
runner = BenchmarkRunner(bench_rows = [2**x for x in range(15, 18)])
runner.chart(LinearRegressionAlgo())

### PCA / SVD

In [None]:
from sklearn.decomposition import PCA as skPCA
from cuml import PCA as cumlPCA

class PCAAlgo(BaseAlgorithm):
    """
    PCA benchmark: fits scikit-learn's PCA and cuML's PCA with the same
    number of components. Data comes from load_data_mortgage_X by default.
    """

    def __init__(self, n_components = 10, load_data = load_data_mortgage_X):
        # Store the requested value; it was previously hard-coded to 10 here
        # and in both fit methods, so the argument was ignored.
        self.n_components = n_components
        self.name = "pca"
        BaseAlgorithm.__init__(self, load_data = load_data)

    def __str__(self):
        return "PCA"

    def sk(self, X):
        skpca = skPCA(n_components = self.n_components)
        skpca.fit(X)

    def cuml(self, X):
        cumlpca = cumlPCA(n_components = self.n_components)
        cumlpca.fit(X)

In [None]:
runner = BenchmarkRunner(bench_rows = [2**x for x in range(18, 20)])
runner.run(PCAAlgo())

In [None]:
runner = BenchmarkRunner(bench_rows = [2**x for x in range(18, 20)])
runner.chart(PCAAlgo())

In [None]:
from sklearn.ensemble import RandomForestClassifier as skRFC
from cuml.ensemble import RandomForestClassifier as cumlRFC

class RandomForestClassifierAlgo(BaseAlgorithm):
    """
    Random-forest classifier benchmark on (X, y) data: fits scikit-learn's
    and cuML's RandomForestClassifier with the same `n_estimators` and
    `max_depth`. Labels are cast to int32 before fitting.
    """

    def __init__(self, n_estimators = 1000, max_depth = 8, load_data = load_data_mortgage_Xy):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.name = "random_forest_classifier"
        BaseAlgorithm.__init__(self, load_data = load_data)

    def __str__(self):
        return "Random Forest Classifier"

    def sk(self, data):
        X, y = data
        # Use the notebook-wide N_JOBS_SKLEARN setting, like the other
        # algorithms, instead of a hard-coded -1.
        skrfc = skRFC(n_jobs = N_JOBS_SKLEARN, n_estimators = self.n_estimators, max_depth = self.max_depth)
        skrfc.fit(X, y.astype(np.int32))

    def cuml(self, data):
        X, y = data
        cumlrfc = cumlRFC(n_estimators = self.n_estimators, max_depth = self.max_depth)
        cumlrfc.fit(X, y.astype(np.int32))

In [None]:
runner = BenchmarkRunner(bench_rows = [2**x for x in range(18, 20)])
runner.run(RandomForestClassifierAlgo())

In [None]:
runner = BenchmarkRunner(bench_rows = [2**x for x in range(18, 20)])
runner.chart(RandomForestClassifierAlgo())

In [None]:
from sklearn.random_projection import GaussianRandomProjection as skGRP
from cuml.random_projection import GaussianRandomProjection as cumlGRP

class GaussianRandomProjectionAlgo(BaseAlgorithm):
    """
    Gaussian random projection benchmark: fits a 2-component projection and
    transforms the same data with scikit-learn and with cuML.
    """

    def __init__(self, load_data = load_data_mortgage_X):
        BaseAlgorithm.__init__(self, load_data = load_data)
        self.name = "gaussian_random_projection"

    def __str__(self):
        return "Gaussian Random Projection"

    def sk(self, data):
        projector = skGRP(n_components = 2)
        projector.fit(data)
        projector.transform(data)

    def cuml(self, data):
        projector = cumlGRP(n_components = 2)
        projector.fit(data)
        projector.transform(data)

In [None]:
runner = BenchmarkRunner(bench_rows = [2**x for x in range(11, 20)])
runner.run(GaussianRandomProjectionAlgo())

In [None]:
from sklearn.random_projection import SparseRandomProjection as skSRP
from cuml.random_projection import SparseRandomProjection as cumlSRP

class SparseRandomProjection(BaseAlgorithm):
    """
    Sparse random projection benchmark: fits a 2-component projection and
    transforms the same data with scikit-learn and with cuML.
    """

    def __init__(self, load_data = load_data_mortgage_X):
        # The name is the key under which results are stored; it must differ
        # from the Gaussian benchmark's key or the two overwrite each other.
        self.name = "sparse_random_projection"
        BaseAlgorithm.__init__(self, load_data = load_data)

    def __str__(self):
        return "Sparse Random Projection"

    def sk(self, data):
        X = data
        sk_srp = skSRP(n_components = 2)
        sk_srp.fit(X)
        sk_srp.transform(X)

    def cuml(self, data):
        X = data
        cuml_srp = cumlSRP(n_components = 2)
        cuml_srp.fit(X)
        cuml_srp.transform(X)

In [None]:
runner = BenchmarkRunner(bench_rows = [2**x for x in range(11, 25)])
runner.run(SparseRandomProjection())

In [None]:
# Newer scikit-learn releases expose trustworthiness from sklearn.manifold;
# older ones keep it in the t_sne module.
try:
    from sklearn.manifold import trustworthiness as skTrust
except ImportError:
    from sklearn.manifold.t_sne import trustworthiness as skTrust
# NOTE(review): import path assumed for the installed cuML version - confirm.
from cuml.metrics import trustworthiness as cumlTrust

# Imported here as well so this cell does not depend on an earlier one.
from sklearn.random_projection import SparseRandomProjection as skSRP
from cuml.random_projection import SparseRandomProjection as cumlSRP

# TODO: this cell looks like the start of a trustworthiness benchmark that was
# never finished; skTrust / cumlTrust are imported but not used yet, and the
# class below still benchmarks sparse random projection.
class SparseRandomProjection(BaseAlgorithm):
    """
    Sparse random projection benchmark: fits a 2-component projection and
    transforms the same data with scikit-learn and with cuML.
    """

    def __init__(self, load_data = load_data_mortgage_X):
        # Results are stored under this key, so it must not collide with
        # the Gaussian random projection benchmark's key.
        self.name = "sparse_random_projection"
        BaseAlgorithm.__init__(self, load_data = load_data)

    def __str__(self):
        return "Sparse Random Projection"

    def sk(self, data):
        X = data
        sk_srp = skSRP(n_components = 2)
        sk_srp.fit(X)
        sk_srp.transform(X)

    def cuml(self, data):
        X = data
        cuml_srp = cumlSRP(n_components = 2)
        cuml_srp.fit(X)
        cuml_srp.transform(X)