# Benchmark and Bounds Tests

The purpose of this notebook is to benchmark all of the single-GPU cuML algorithms against their scikit-learn counterparts, while also providing the ability to find and verify upper bounds.

The results of each benchmark are collected into a pandas DataFrame, which can then be analyzed, manipulated, and stored to disk. 

## Notebook Credits
**Authorship**
Original Author: Corey Nolet <br />
Last Edit: Taurean Dyer, 9/25/2019<br />

Last Edit: Corey Nolet, 10/04/2019
    
### Test System Specs
Test System Hardware: DGX-1 
Test System Software: Ubuntu 16.04  
RAPIDS Version: 0.10.0pre - Conda Install  
Driver: 410.48
CUDA: 10.0  

### Known Working Systems
RAPIDS Versions: 0.10+

In [None]:
import cuml
import pandas as pd

from cuml.benchmark.runners import SpeedupComparisonRunner
from cuml.benchmark.algorithms import algorithm_by_name

# Record the installed cuML version in the notebook output.
print(cuml.__version__)

In [None]:
N_REPS = 3  # Number of times each test is repeated

# Dataset names passed to the runners below as `dataset_name`.
DATA_NEIGHBORHOODS = "blobs"
DATA_CLASSIFICATION = "classification"
DATA_REGRESSION = "regression"
DATA_NLP = "20newsgroups"

# Default `input_type` handed to the runners (the MultinomialNB cell passes
# "csr" instead).
INPUT_TYPE = "numpy"

# Accumulates the enriched results from every execute_benchmark() call.
benchmark_results = []

def enrich_result(algorithm, runner, result):
    """Tag one benchmark result with the context it was produced in.

    The ``result`` mapping is modified in place: the keys ``"algo"``,
    ``"dataset_name"`` and ``"input_type"`` are set from ``algorithm`` and
    from the runner's ``dataset_name`` / ``input_type`` attributes.

    Returns the same ``result`` object that was passed in.
    """
    extra_fields = {
        "algo": algorithm,
        "dataset_name": runner.dataset_name,
        "input_type": runner.input_type,
    }
    for field, value in extra_fields.items():
        result[field] = value
    return result

def execute_benchmark(algorithm, runner):
    """Run one algorithm through ``runner`` and store the tagged results.

    ``algorithm`` is resolved with ``algorithm_by_name`` and passed to
    ``runner.run`` with ``verbose=True``. Every result it yields is passed
    through ``enrich_result`` and the whole batch is then appended to the
    module-level ``benchmark_results`` list. Nothing is returned.
    """
    algo = algorithm_by_name(algorithm)
    raw_results = runner.run(algo, verbose=True)
    # Build the full batch first so nothing is appended if enrichment fails.
    benchmark_results.extend(
        [enrich_result(algorithm, runner, entry) for entry in raw_results]
    )

## Neighbors

In [None]:
# NearestNeighbors speedup benchmark: DATA_NEIGHBORHOODS data, row counts
# 2**8 .. 2**11, dimensions 64/128/256, N_REPS repetitions.
runner = SpeedupComparisonRunner(
    bench_rows=[2 ** exponent for exponent in range(8, 12)],
    bench_dims=[64, 128, 256],
    dataset_name=DATA_NEIGHBORHOODS,
    input_type=INPUT_TYPE,
    n_reps=N_REPS,
)

execute_benchmark("NearestNeighbors", runner)

In [None]:
# Display the results collected so far as the cell output.
benchmark_results

### Nearest Neighbors

### KNeighborsClassifier

In [None]:
# KNeighborsClassifier speedup benchmark: DATA_CLASSIFICATION data, row counts
# 2**11 .. 2**23, dimensions 64/128/256, N_REPS repetitions.
runner = SpeedupComparisonRunner(
    bench_rows=[2 ** exponent for exponent in range(11, 24)],
    bench_dims=[64, 128, 256],
    dataset_name=DATA_CLASSIFICATION,
    input_type=INPUT_TYPE,
    n_reps=N_REPS,
)

execute_benchmark("KNeighborsClassifier", runner)

### KNeighborsRegressor

In [None]:
# KNeighborsRegressor speedup benchmark: DATA_REGRESSION data, row counts
# 2**11 .. 2**23, dimensions 64/128/256, N_REPS repetitions.
algorithm = "KNeighborsRegressor"
runner = cuml.benchmark.runners.SpeedupComparisonRunner(
    bench_rows=[2**x for x in range(11, 24)],
    bench_dims=[64, 128, 256],
    dataset_name=DATA_REGRESSION,
    input_type=INPUT_TYPE,
    n_reps=N_REPS
)

# Pass the name defined above instead of repeating the string literal.
execute_benchmark(algorithm, runner)

## Clustering

### DBSCAN

In [None]:
# DBSCAN speedup benchmark: DATA_NEIGHBORHOODS data, row counts
# 2**11 .. 2**23, dimensions 64/128/256, N_REPS repetitions.
algorithm = "DBSCAN"
runner = cuml.benchmark.runners.SpeedupComparisonRunner(
    bench_rows=[2**x for x in range(11, 24)],
    bench_dims=[64, 128, 256],
    dataset_name=DATA_NEIGHBORHOODS,
    input_type=INPUT_TYPE,
    n_reps=N_REPS
)

# Pass the name defined above instead of repeating the string literal.
execute_benchmark(algorithm, runner)

### K-means Clustering

In [None]:
# KMeans speedup benchmark: DATA_NEIGHBORHOODS data, row counts
# 2**12 .. 2**21, dimensions 64/128/256, N_REPS repetitions.
algorithm = "KMeans"
runner = cuml.benchmark.runners.SpeedupComparisonRunner(
    bench_rows=[2**x for x in range(12, 22)],
    bench_dims=[64, 128, 256],
    dataset_name=DATA_NEIGHBORHOODS,
    input_type=INPUT_TYPE,
    n_reps=N_REPS
)

# Pass the name defined above instead of repeating the string literal.
execute_benchmark(algorithm, runner)

## Manifold Learning

### UMAP

In [None]:
# UMAP speedup benchmark: DATA_NEIGHBORHOODS data, row counts
# 2**11 .. 2**23, dimensions 100/500/1000/10000, N_REPS repetitions.
algorithm = "UMAP"
runner = cuml.benchmark.runners.SpeedupComparisonRunner(
    bench_rows=[2**x for x in range(11, 24)],
    bench_dims=[100, 500, 1000, 10000],
    dataset_name=DATA_NEIGHBORHOODS,
    input_type=INPUT_TYPE,
    n_reps=N_REPS
)

# Pass the name defined above instead of repeating the string literal.
execute_benchmark(algorithm, runner)

### T-SNE

In [None]:
# TSNE speedup benchmark: DATA_NEIGHBORHOODS data, row counts
# 2**11 .. 2**23, dimensions 100/500/1000/10000, N_REPS repetitions.
algorithm = "TSNE"
runner = cuml.benchmark.runners.SpeedupComparisonRunner(
    bench_rows=[2**x for x in range(11, 24)],
    bench_dims=[100, 500, 1000, 10000],
    dataset_name=DATA_NEIGHBORHOODS,
    input_type=INPUT_TYPE,
    n_reps=N_REPS
)

# Pass the name defined above instead of repeating the string literal.
execute_benchmark(algorithm, runner)

## Linear Models

### Linear Regression

In [None]:
# LinearRegression speedup benchmark: DATA_REGRESSION data, row counts
# 2**11 .. 2**23, dimensions 64/128/256, N_REPS repetitions.
algorithm = "LinearRegression"
runner = cuml.benchmark.runners.SpeedupComparisonRunner(
    bench_rows=[2**x for x in range(11, 24)],
    bench_dims=[64, 128, 256],
    dataset_name=DATA_REGRESSION,
    input_type=INPUT_TYPE,
    n_reps=N_REPS
)

# Pass the name defined above instead of repeating the string literal.
execute_benchmark(algorithm, runner)

### Logistic Regression

In [None]:
# LogisticRegression speedup benchmark: DATA_CLASSIFICATION data, row counts
# 2**11 .. 2**23, dimensions 64/128/256, N_REPS repetitions.
# The name is bound to `algorithm`, matching the other benchmark cells.
algorithm = "LogisticRegression"
runner = cuml.benchmark.runners.SpeedupComparisonRunner(
    bench_rows=[2**x for x in range(11, 24)],
    bench_dims=[64, 128, 256],
    dataset_name=DATA_CLASSIFICATION,
    input_type=INPUT_TYPE,
    n_reps=N_REPS
)

# Pass the name defined above instead of repeating the string literal.
execute_benchmark(algorithm, runner)

### Ridge Regression

In [None]:
# Ridge speedup benchmark: DATA_REGRESSION data, row counts
# 2**11 .. 2**23, dimensions 64/128/256, N_REPS repetitions.
algorithm = "Ridge"
runner = cuml.benchmark.runners.SpeedupComparisonRunner(
    bench_rows=[2**x for x in range(11, 24)],
    bench_dims=[64, 128, 256],
    dataset_name=DATA_REGRESSION,
    input_type=INPUT_TYPE,
    n_reps=N_REPS
)

# Pass the name defined above instead of repeating the string literal.
execute_benchmark(algorithm, runner)

### Lasso Regression

In [None]:
# Lasso speedup benchmark: DATA_REGRESSION data, row counts
# 2**11 .. 2**23, dimensions 64/128/256, N_REPS repetitions.
algorithm = "Lasso"
runner = cuml.benchmark.runners.SpeedupComparisonRunner(
    bench_rows=[2**x for x in range(11, 24)],
    bench_dims=[64, 128, 256],
    dataset_name=DATA_REGRESSION,
    input_type=INPUT_TYPE,
    n_reps=N_REPS
)

# Pass the name defined above instead of repeating the string literal.
execute_benchmark(algorithm, runner)

### ElasticNet Regression

In [None]:
# ElasticNet speedup benchmark: DATA_REGRESSION data, row counts
# 2**11 .. 2**23, dimensions 64/128/256, N_REPS repetitions.
algorithm = "ElasticNet"
runner = cuml.benchmark.runners.SpeedupComparisonRunner(
    bench_rows=[2**x for x in range(11, 24)],
    bench_dims=[64, 128, 256],
    dataset_name=DATA_REGRESSION,
    input_type=INPUT_TYPE,
    n_reps=N_REPS
)

# Pass the name defined above instead of repeating the string literal.
execute_benchmark(algorithm, runner)

### Mini-batch SGD Classifier

In [None]:
# MBSGDClassifier speedup benchmark: DATA_CLASSIFICATION data, row counts
# 2**11 .. 2**23, dimensions 64/128/256, N_REPS repetitions.
algorithm = "MBSGDClassifier"
runner = cuml.benchmark.runners.SpeedupComparisonRunner(
    bench_rows=[2**x for x in range(11, 24)],
    bench_dims=[64, 128, 256],
    dataset_name=DATA_CLASSIFICATION,
    input_type=INPUT_TYPE,
    n_reps=N_REPS
)

# Pass the name defined above instead of repeating the string literal.
execute_benchmark(algorithm, runner)

## Decomposition

### PCA

In [None]:
# PCA speedup benchmark: DATA_NEIGHBORHOODS data, row counts
# 2**11 .. 2**23, dimensions 100/500/1000/10000, N_REPS repetitions.
algorithm = "PCA"
runner = cuml.benchmark.runners.SpeedupComparisonRunner(
    bench_rows=[2**x for x in range(11, 24)],
    bench_dims=[100, 500, 1000, 10000],
    dataset_name=DATA_NEIGHBORHOODS,
    input_type=INPUT_TYPE,
    n_reps=N_REPS
)

# Pass the name defined above instead of repeating the string literal.
execute_benchmark(algorithm, runner)

### TSVD

In [None]:
# TSVD speedup benchmark: DATA_NEIGHBORHOODS data, row counts
# 2**11 .. 2**23, dimensions 100/500/1000/10000, N_REPS repetitions.
algorithm = "TSVD"
runner = cuml.benchmark.runners.SpeedupComparisonRunner(
    bench_rows=[2**x for x in range(11, 24)],
    bench_dims=[100, 500, 1000, 10000],
    dataset_name=DATA_NEIGHBORHOODS,
    input_type=INPUT_TYPE,
    n_reps=N_REPS
)

# Pass the name defined above instead of repeating the string literal.
execute_benchmark(algorithm, runner)

## Ensemble

### Random Forest Classifier

In [None]:
# RandomForestClassifier speedup benchmark: DATA_CLASSIFICATION data, row
# counts 2**11 .. 2**23, dimensions 64/128/256, N_REPS repetitions.
algorithm = "RandomForestClassifier"
runner = cuml.benchmark.runners.SpeedupComparisonRunner(
    bench_rows=[2**x for x in range(11, 24)],
    bench_dims=[64, 128, 256],
    dataset_name=DATA_CLASSIFICATION,
    input_type=INPUT_TYPE,
    n_reps=N_REPS
)

# Pass the name defined above instead of repeating the string literal.
execute_benchmark(algorithm, runner)

### Random Forest Regressor

In [None]:
# RandomForestRegressor speedup benchmark: DATA_REGRESSION data, row counts
# 2**11 .. 2**23, dimensions 64/128/256, N_REPS repetitions.
algorithm = "RandomForestRegressor"
runner = cuml.benchmark.runners.SpeedupComparisonRunner(
    bench_rows=[2**x for x in range(11, 24)],
    bench_dims=[64, 128, 256],
    dataset_name=DATA_REGRESSION,
    input_type=INPUT_TYPE,
    n_reps=N_REPS
)

# Pass the name defined above instead of repeating the string literal.
execute_benchmark(algorithm, runner)

### FIL

In [None]:
# FIL speedup benchmark: DATA_CLASSIFICATION data, row counts
# 2**11 .. 2**23, dimensions 64/128/256, N_REPS repetitions.
algorithm = "FIL"
runner = cuml.benchmark.runners.SpeedupComparisonRunner(
    bench_rows=[2**x for x in range(11, 24)],
    bench_dims=[64, 128, 256],
    dataset_name=DATA_CLASSIFICATION,
    input_type=INPUT_TYPE,
    n_reps=N_REPS
)

# Pass the name defined above instead of repeating the string literal.
execute_benchmark(algorithm, runner)

### Sparse FIL

In [None]:
# Sparse-FIL-SKL speedup benchmark: DATA_CLASSIFICATION data, row counts
# 2**11 .. 2**23, dimensions 64/128/256, N_REPS repetitions.
algorithm = "Sparse-FIL-SKL"
runner = cuml.benchmark.runners.SpeedupComparisonRunner(
    bench_rows=[2**x for x in range(11, 24)],
    bench_dims=[64, 128, 256],
    dataset_name=DATA_CLASSIFICATION,
    input_type=INPUT_TYPE,
    n_reps=N_REPS
)

# Pass the name defined above instead of repeating the string literal.
execute_benchmark(algorithm, runner)

## Naive Bayes

### Multinomial Naive Bayes

In [None]:
# MultinomialNB speedup benchmark: DATA_NLP data, row counts 2**11 .. 2**23,
# dimensions 64/128/256, N_REPS repetitions. Unlike the other cells, this one
# passes input_type="csr" rather than the notebook-wide INPUT_TYPE.
algorithm = "MultinomialNB"
runner = cuml.benchmark.runners.SpeedupComparisonRunner(
    bench_rows=[2**x for x in range(11, 24)],
    bench_dims=[64, 128, 256],
    dataset_name=DATA_NLP,
    input_type="csr",
    n_reps=N_REPS
)

# Pass the name defined above instead of repeating the string literal.
execute_benchmark(algorithm, runner)

## Random Projection

### Gaussian Random Projection

In [None]:
# GaussianRandomProjection speedup benchmark: DATA_NEIGHBORHOODS data, row
# counts 2**17 .. 2**23, dimensions 64/128/256, N_REPS repetitions.
runner = SpeedupComparisonRunner(
    bench_rows=[2 ** exponent for exponent in range(17, 24)],
    bench_dims=[64, 128, 256],
    dataset_name=DATA_NEIGHBORHOODS,
    input_type=INPUT_TYPE,
    n_reps=N_REPS,
)

execute_benchmark("GaussianRandomProjection", runner)

### Sparse Random Projection

In [None]:
# SparseRandomProjection speedup benchmark: DATA_NEIGHBORHOODS data, row
# counts 2**17 .. 2**23, dimensions 64/128/256, N_REPS repetitions.
runner = SpeedupComparisonRunner(
    bench_rows=[2 ** exponent for exponent in range(17, 24)],
    bench_dims=[64, 128, 256],
    dataset_name=DATA_NEIGHBORHOODS,
    input_type=INPUT_TYPE,
    n_reps=N_REPS,
)

execute_benchmark("SparseRandomProjection", runner)

## SVM

### SVC

In [None]:
# SVC speedup benchmark: DATA_CLASSIFICATION data, a single row count of
# 2**11, dimensions 64/128/256, N_REPS repetitions.
runner = SpeedupComparisonRunner(
    bench_rows=[2 ** exponent for exponent in range(11, 12)],
    bench_dims=[64, 128, 256],
    dataset_name=DATA_CLASSIFICATION,
    input_type=INPUT_TYPE,
    n_reps=N_REPS,
)

execute_benchmark("SVC", runner)

### SVR

In [None]:
# SVR speedup benchmark: DATA_REGRESSION data, row counts 2**11 .. 2**23,
# dimensions 64/128/256, N_REPS repetitions.
runner = SpeedupComparisonRunner(
    bench_rows=[2 ** exponent for exponent in range(11, 24)],
    bench_dims=[64, 128, 256],
    dataset_name=DATA_REGRESSION,
    input_type=INPUT_TYPE,
    n_reps=N_REPS,
)

execute_benchmark("SVR", runner)

## Build Dataframe With Results

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Assemble everything collected in benchmark_results into a single DataFrame.
df = pd.DataFrame(benchmark_results)

In [None]:
def chart_single_algo_speedup(df, algorithm):
    """Draw a bar chart of the speedup measured for a single algorithm.

    Rows of ``df`` whose ``algo`` column equals ``algorithm`` are pivoted so
    that ``n_samples`` forms the index, ``n_features`` the columns and
    ``speedup`` the values; the pivoted table is then plotted as bars.
    Nothing is returned.
    """
    algo_rows = df[df.algo == algorithm]
    speedup_table = algo_rows.pivot(
        index="n_samples",
        columns="n_features",
        values="speedup",
    )
    speedup_table.plot.bar()

In [None]:
def chart_all_algo_speedup(df):
    """Draw a bar chart of the mean speedup of every algorithm in ``df``.

    Only the ``algo`` and ``speedup`` columns are used: rows are grouped by
    ``algo``, the mean ``speedup`` of each group is taken, and the result is
    plotted as bars. Nothing is returned.
    """
    speedups = df[["algo", "speedup"]]
    mean_by_algo = speedups.groupby(["algo"]).mean()
    mean_by_algo.plot.bar()

In [None]:
# Chart the NearestNeighbors speedups. The per-algorithm chart helper is named
# chart_single_algo_speedup; calling `chart_algo_speedup` raised NameError.
chart_single_algo_speedup(df, "NearestNeighbors")

In [None]:
# Chart the mean speedup of every benchmarked algorithm.
chart_all_algo_speedup(df)

In [None]:
# Write the results to benchmark_results.csv (relative path, so it is
# created in the current working directory).
df.to_csv("benchmark_results.csv")