# Benchmark and Bounds Tests

The purpose of this notebook is to benchmark each single-GPU cuML algorithm against its scikit-learn counterpart, while also providing the ability to check upper bounds.

In [None]:
import numpy as np
import pandas as pd
import cudf
import os
import time
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

from pylab import rcParams
# Global matplotlib figure defaults: 10x5 figure size at 100 dpi.
rcParams.update({'figure.figsize': (10, 5), 'figure.dpi': 100})

# Apply seaborn's "darkgrid" theme.
sns.set_style("darkgrid")

### Data loading functions

In [None]:
import gzip

def load_data_X(nrows, ncols, dtype = np.float32):
    """
    Build a pandas DataFrame of uniform random values in [0, 1).

    The frame has `nrows` rows and `ncols` columns named "fea0", "fea1", ...;
    every column is cast to `dtype`.
    """
    values = np.random.rand(nrows, ncols)

    # Iterating over the transpose yields one column of `values` at a time.
    columns = {}
    for idx, column in enumerate(values.T):
        columns['fea%d' % idx] = column.astype(dtype)

    return pd.DataFrame(columns)

def load_data_Xy(nrows, ncols):
    """
    Generate a random feature dataframe and a matching target series.

    Returns a tuple (X, y): X is the `nrows` x `ncols` frame produced by
    load_data_X, and y is the "fea0" column of a separately generated
    `nrows` x 1 frame.
    """
    # Use the function's own parameters; the previous body referenced the
    # undefined names `rows` and `cols`.
    X = load_data_X(nrows, ncols)
    y = load_data_X(nrows, 1)["fea0"]
    return X, y

In [None]:
def pandas_convert(data):
    """
    Convert pandas data to its cuDF equivalent.

    `data` is either a single pandas object or a tuple of them. A tuple is
    converted element by element and returned as a tuple. A pandas Series
    becomes a cudf.Series; anything else is handed to
    cudf.DataFrame.from_pandas.
    """
    def _to_cudf(obj):
        # Tuples such as (X, y) carry a Series as the target, which must not
        # go through the DataFrame converter.
        if isinstance(obj, pd.Series):
            return cudf.Series.from_pandas(obj)
        return cudf.DataFrame.from_pandas(obj)

    if isinstance(data, tuple):
        return tuple(_to_cudf(d) for d in data)
    return _to_cudf(data)

### Pluggable benchmark function 

In [None]:
class SpeedupBenchmark:
    """
    Benchmark reporting how many times faster cuML is than scikit-learn.

    The result is sk_elapsed / cu_elapsed, so values above 1 mean the cuML
    implementation finished sooner.
    """

    def __init__(self):
        # Identifier used as part of the key under which results are stored.
        self.name = "speedup"

    def __str__(self):
        return "Speedup"

    def run(self, algo, rows, dims, data):
        """
        Time algo.sk and algo.cuml on the same data and return the ratio.

        algo : object exposing sk(data) and cuml(data)
        rows, dims : not used by this benchmark
        data : pandas input; it is converted with pandas_convert before the
               cuML call, and that conversion is excluded from the timing

        Returns sk_elapsed / cu_elapsed as a float.
        """
        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump if the system clock is
        # adjusted during a run.
        sk_start = time.perf_counter()
        algo.sk(data)
        sk_elapsed = time.perf_counter() - sk_start

        gpu_data = pandas_convert(data)
        cu_start = time.perf_counter()
        algo.cuml(gpu_data)
        cu_elapsed = time.perf_counter() - cu_start

        return sk_elapsed / cu_elapsed

In [None]:
class BenchmarkRunner:
    """
    Run benchmarks over a grid of dataset shapes and cache results on disk.

    Results are kept in a pickle file shaped as
    {algo.name: {(n_rows, n_dims, benchmark.name): mean_value}}.

    benchmarks   : list of benchmark objects exposing `name`, `__str__` and
                   `run(algo, rows, dims, data)`; defaults to
                   [SpeedupBenchmark()]
    out_filename : path of the pickle file used as the result cache
    rerun        : when True, recompute entries that are already cached
    n_runs       : number of repetitions averaged for each grid point
    bench_rows   : row counts to test; defaults to 2**13 .. 2**19
    bench_dims   : column counts to test; defaults to [64, 128, 256, 512]
    """

    def __init__(self,
                 benchmarks = None,
                 out_filename = "benchmark.pickle",
                 rerun = False,
                 n_runs = 3,
                 bench_rows = None,
                 bench_dims = None):

        # Defaults are built per instance instead of in the signature, so
        # separate runners never share the same list objects.
        if benchmarks is None:
            benchmarks = [SpeedupBenchmark()]
        if bench_rows is None:
            bench_rows = [2**x for x in range(13, 20)]
        if bench_dims is None:
            bench_dims = [64, 128, 256, 512]

        self.benchmarks = benchmarks
        self.rerun = rerun
        self.n_runs = n_runs
        self.bench_rows = bench_rows
        self.bench_dims = bench_dims
        self.out_filename = out_filename

    def load_results(self):
        """Return the cached results dict, or {} when no cache file exists."""
        if not os.path.exists(self.out_filename):
            return {}

        # NOTE(security): pickle.load can execute arbitrary code. Only point
        # out_filename at files written by store_results.
        with open(self.out_filename, 'rb') as f:
            loaded = pickle.load(f)
        print("Loaded previous benchmark results from %s" % (self.out_filename))
        return loaded

    def store_results(self, final_results):
        """Overwrite the cache file with `final_results`."""
        with open(self.out_filename, 'wb') as f:
            pickle.dump(final_results, f)

    def run(self, algo):
        """
        Benchmark `algo` on every (rows, dims) combination.

        Grid points already present in the cache are skipped unless
        `self.rerun` is set. Each new value is the mean of `self.n_runs`
        repetitions on one dataset obtained from algo.load_data.

        Returns self, so the runner can be kept or chained, e.g.
        BenchmarkRunner().run(algo).chart(algo).
        """
        final_results = self.load_results()
        results = final_results.setdefault(algo.name, {})

        for benchmark in self.benchmarks:
            for n_rows in self.bench_rows:
                for n_dims in self.bench_dims:
                    key = (n_rows, n_dims, benchmark.name)
                    if key in results and not self.rerun:
                        continue

                    print("Running %s. (nrows=%d, n_dims=%d)" % (str(algo), n_rows, n_dims))

                    data = algo.load_data(n_rows, n_dims)
                    runs = [benchmark.run(algo, n_rows, n_dims, data) for _ in range(self.n_runs)]
                    results[key] = np.mean(runs)

                    print("Benchmark for %s = %f" % (str(key), results[key]))

                    # Persist after every grid point so an interrupted
                    # session keeps the work done so far.
                    self.store_results(final_results)

        return self

    def chart(self, algo, title = "cuML vs SKLearn"):
        """
        Plot the cached results for `algo`, one figure per benchmark.

        Each figure has one line per entry of `self.bench_dims`, with
        log2(number of rows) on the x axis. Dimensions without cached
        values are left out of the figure.

        Raises KeyError when the cache has no entry for algo.name.
        """
        results = self.load_results()[algo.name]

        for benchmark in self.benchmarks:
            plts = []
            for dim in self.bench_dims:
                # Keep only this benchmark's values for this dimension.
                points = [(k[0], v) for k, v in results.items()
                          if k[1] == dim and k[2] == benchmark.name]
                if not points:
                    continue
                points.sort(key = lambda p: p[0])

                keys = [np.log2(n_rows) for n_rows, _ in points]
                values = [value for _, value in points]

                line = plt.plot(keys, values, label = str(dim), linewidth = 3,  marker = 'o', markersize = 7)
                plts.append(line[0])

            leg = plt.legend(handles = plts, fontsize = 10)
            leg.set_title("Dimensions", prop = {'size':'x-large'})
            plt.title("%s %s: %s" % (algo, benchmark, title), fontsize = 20)

            plt.ylabel(str(benchmark), fontsize = 10)
            plt.xlabel("Training Examples (2^x)", fontsize = 10)

            plt.tick_params(axis='both', which='major', labelsize=5)
            plt.tick_params(axis='both', which='minor', labelsize=5)

            plt.show()

In [None]:
class BaseAlgorithm:
    """
    Base class for the algorithm wrappers in this notebook.

    Instances get a `load_data` attribute bound to load_data_X.
    """

    def __init__(self):
        # Stored per instance as a plain attribute.
        self.load_data = load_data_X

In [None]:
from sklearn.neighbors import NearestNeighbors
from cuml import KNN as cumlKNN

class kNNAlgo(BaseAlgorithm):
    """
    k-nearest-neighbours wrapper: fits on X, then queries X against itself
    for `n_neighbors` neighbours with either scikit-learn or cuML.
    """

    def __init__(self, n_neighbors = 1024):
        super().__init__()
        self.name = "knn"
        self.n_neighbors = n_neighbors

    def __str__(self):
        return "kNN"

    def sk(self, X):
        """Fit scikit-learn brute-force NearestNeighbors on X and query X."""
        model = NearestNeighbors(n_neighbors = self.n_neighbors, algorithm = 'brute')
        model.fit(X)
        distances, indices = model.kneighbors(X)

    def cuml(self, X):
        """Fit cuML KNN on X and query X for self.n_neighbors neighbours."""
        model = cumlKNN()
        model.fit(X)
        distances, indices = model.query(X, self.n_neighbors)

In [None]:
# Bind `runner` to the BenchmarkRunner itself (not to the value returned by
# run()) so the charting cell can call runner.chart().
runner = BenchmarkRunner(bench_rows = [2**x for x in range(13, 20)])
runner.run(kNNAlgo())

In [None]:
# Plot the stored kNN benchmark results.
runner.chart(kNNAlgo())

In [None]:
from sklearn.cluster import DBSCAN as skDBSCAN
from cuml import DBSCAN as cumlDBSCAN

class DBSCANAlgo(BaseAlgorithm):
    """
    DBSCAN wrapper that fits the scikit-learn or cuML implementation on X.

    eps         : value passed as `eps` to both implementations
    min_samples : value passed as `min_samples` to both implementations
    """

    def __init__(self, eps = 3, min_samples = 2):
        self.name = "dbscan"
        # Store the caller's values; they were previously hard-coded to the
        # defaults, so constructor arguments had no effect.
        self.eps = eps
        self.min_samples = min_samples
        BaseAlgorithm.__init__(self)

    def __str__(self):
        return "DBSCAN"

    def sk(self, X):
        """Fit scikit-learn DBSCAN (brute-force neighbour search) on X."""
        clustering_sk = skDBSCAN(eps = self.eps, min_samples = self.min_samples, algorithm = "brute")
        clustering_sk.fit(X)

    def cuml(self, X):
        """Fit cuML DBSCAN on X."""
        clustering_cuml = cumlDBSCAN(eps = self.eps, min_samples = self.min_samples)
        clustering_cuml.fit(X)

In [None]:
# Bind `runner` to the BenchmarkRunner itself (not to the value returned by
# run()) so the charting cell can call runner.chart().
runner = BenchmarkRunner(bench_rows = [2**x for x in range(10, 17)])
runner.run(DBSCANAlgo())

In [None]:
# Plot the stored DBSCAN benchmark results.
runner.chart(DBSCANAlgo())

In [None]:
from sklearn.linear_model import LinearRegression as skLR
from cuml import LinearRegression as cumlLR

class LinearRegressionAlgo(BaseAlgorithm):
    """
    Linear regression wrapper that fits the scikit-learn or cuML model.

    Unlike the unsupervised wrappers, its data is an (X, y) tuple produced
    by load_data_Xy.
    """

    def __init__(self):
        BaseAlgorithm.__init__(self)
        self.name = "linear_regression"
        # Replace the base loader: this algorithm needs a target as well.
        # The attribute must be called `load_data`, which is what
        # BenchmarkRunner.run invokes.
        self.load_data = load_data_Xy

    def __str__(self):
        return "Linear Regression"

    def sk(self, data):
        """Fit scikit-learn LinearRegression on the (X, y) tuple `data`."""
        X, y = data
        model_sk = skLR()
        model_sk.fit(X, y)

    def cuml(self, data):
        """
        Fit cuML LinearRegression on the (X, y) tuple `data`.

        Accepts either pandas or cuDF inputs; pandas objects are converted
        here, while anything else is passed through untouched.
        """
        X, y = data
        # Convert only when still holding pandas objects, so data that has
        # already been moved to cuDF is not converted a second time.
        if isinstance(X, pd.DataFrame):
            X = cudf.DataFrame.from_pandas(X)
        if isinstance(y, pd.Series):
            y = cudf.Series.from_pandas(y)

        cuml_lr = cumlLR()
        cuml_lr.fit(X, y)


In [None]:
# Bind `runner` to the BenchmarkRunner itself (not to the value returned by
# run()) so the charting cell can call runner.chart().
runner = BenchmarkRunner()
runner.run(LinearRegressionAlgo())

In [None]:
# Plot the stored linear regression benchmark results.
runner.chart(LinearRegressionAlgo())

In [None]:
from sklearn.decomposition import PCA as skPCA
from cuml import PCA as cumlPCA

class PCAAlgo(BaseAlgorithm):
    """
    PCA wrapper that fits the scikit-learn or cuML implementation on X.

    n_components : number of components requested from both implementations
    """

    def __init__(self, n_components = 10):
        # BenchmarkRunner keys its stored results on `name`.
        self.name = "pca"
        # Store the caller's value; it was previously hard-coded to 10.
        self.n_components = n_components
        BaseAlgorithm.__init__(self)

    def __str__(self):
        return "PCA"

    def sk(self, X):
        """Fit scikit-learn PCA on X."""
        skpca = skPCA(n_components = self.n_components)
        skpca.fit(X)

    def cuml(self, X):
        """
        Fit cuML PCA on X.

        Accepts either a pandas or a cuDF dataframe; a pandas frame is
        converted here, while anything else is passed through untouched.
        """
        # Convert only when still holding a pandas frame, so data that has
        # already been moved to cuDF is not converted a second time.
        if isinstance(X, pd.DataFrame):
            X = cudf.DataFrame.from_pandas(X)
        cumlpca = cumlPCA(n_components = self.n_components)
        cumlpca.fit(X)

In [None]:
# Bind `runner` to the BenchmarkRunner itself (not to the value returned by
# run()) so the charting cell can call runner.chart().
runner = BenchmarkRunner()
runner.run(PCAAlgo())

In [None]:
# Plot the stored PCA benchmark results.
runner.chart(PCAAlgo())