# Tutorial: Accelerating pygpc by using different computing backends

This tutorial shows how to accelerate pygpc by choosing different computing backends. At the moment, the following backends are available:
1. Implementation in **Python**: pygpc names this backend **python**   
2. Implementation in **C++**: pygpc names this backend **cpu**
3. Implementation in **C++** and **OpenMP**: pygpc names this backend **omp**
4. Implementation in **CUDA-C++**: pygpc names this backend **cuda**, an Nvidia GPU is required

# Let's get started

## First, we set up the benchmark parameters
We define the number of samples, the dimensionality of the parameter space and the maximum order of the basis functions. These determine the size of the gPC matrix and therefore the compute time.

In [1]:
# Number of random input parameters (one parameter is created per dimension)
n_dim = 4
# Number of grid points handed to the random grid
n_samples = 100_000
# Order used for every dimension and as the maximum order of the gPC
n_basis_order = 4
# Number of columns of the coefficient matrix used in get_approximation
n_coeffs = 1000

# Number of timed repetitions of the benchmark loop
n_iterations = 50

## Then, we set up the problem and the grid

In [2]:
import pygpc
import numpy as np
from collections import OrderedDict

# Model under investigation: one of the test functions shipped with pygpc
model = pygpc.testfunctions.DiscontinuousRidgeManufactureDecay()

# One Beta-distributed random parameter per dimension, keyed "x0", "x1", ...
parameters = OrderedDict(
    ("x" + str(i_dim), pygpc.Beta(pdf_shape=[1, 1], pdf_limits=[1.2, 2]))
    for i_dim in range(n_dim)
)

# The problem couples the model with its parameters
problem = pygpc.Problem(model, parameters)

# Random grid with n_samples points and a fixed seed
grid_options = {"n_grid": n_samples, "seed": 1}
grid = pygpc.Random(
    parameters_random=problem.parameters_random,
    n_grid=n_samples,
    options=grid_options,
)

# Regression gPC object (constructed with an empty options dict);
# the grid is attached afterwards
options = {}
gpc = pygpc.Reg(
    problem=problem,
    order=[n_basis_order] * n_dim,
    order_max=n_basis_order,
    order_max_norm=1,
    interaction_order=n_dim,
    interaction_order_current=n_dim,
    options=options,
)
gpc.grid = grid

# Number of basis functions (only used for the plot titles further down)
n_basis = pygpc.get_num_coeffs_sparse(
    [n_basis_order] * n_dim, n_basis_order, n_dim, n_dim, n_dim, 1
)

# Coefficient matrix of ones: one row per basis function, n_coeffs columns
coeffs = np.ones((len(gpc.basis.b), n_coeffs))

## Now, we use the different backends for computation
By default, the **omp** backend is set. Let's try them all and see how the performance changes.

In [None]:
%%capture

import time

def _elapsed(func, *args, **kwargs):
    """Return the seconds one call of ``func(*args, **kwargs)`` takes.

    The return value of ``func`` is discarded. ``time.perf_counter`` is used
    instead of ``time.time`` because it is monotonic and offers the highest
    available resolution, which is what interval measurements need.
    """
    start = time.perf_counter()
    func(*args, **kwargs)
    return time.perf_counter() - start


def _noop(*args, **kwargs):
    """Do nothing; stands in for the calls of a disabled backend."""


# Set to True to really execute the CUDA calls (an Nvidia GPU is required).
# While False, the CUDA calls are skipped and only the bare timer overhead is
# recorded under "CUDA", i.e. those numbers are placeholders, not measurements.
benchmark_cuda = False

# (label used in the result dicts, pygpc backend name, execute the calls?)
backends = [
    ("Python", "python", True),
    ("C++", "cpu", True),
    ("C++ OpenMP", "omp", True),
    ("CUDA", "cuda", benchmark_cuda),
]

# One list of timings (in seconds) per backend label, in benchmark order
time_create_gpc_matrix = OrderedDict((label, []) for label, _, _ in backends)
time_get_approximation = OrderedDict((label, []) for label, _, _ in backends)

# warmup to wake gpu up from idle
for _ in range(10):
    gpc.backend = "cuda"
    if benchmark_cuda:
        gpc.create_gpc_matrix(b=gpc.basis.b, x=gpc.grid.coords_norm)

# benchmark: every iteration times both operations once for each backend
for _ in range(n_iterations):
    for label, backend, enabled in backends:
        gpc.backend = backend

        # Look the methods up only after the backend has been switched
        create_gpc_matrix = gpc.create_gpc_matrix if enabled else _noop
        get_approximation = gpc.get_approximation if enabled else _noop

        time_create_gpc_matrix[label].append(
            _elapsed(create_gpc_matrix, b=gpc.basis.b, x=gpc.grid.coords_norm)
        )
        time_get_approximation[label].append(
            _elapsed(get_approximation, coeffs, x=gpc.grid.coords_norm)
        )

## Finally, let's plot the results

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

sns.set()

# One figure per benchmarked operation: (timings per backend, title template)
figures = (
    (time_create_gpc_matrix, "create_gpc_matrix, n_samples: {}, n_basis: {}"),
    (time_get_approximation, "get_approximation, n_samples: {}, n_basis: {}"),
)

for timings, title_template in figures:
    plt.figure()

    # Bar positions and their tick labels, in the insertion order of the dict
    label_lst = list(timings)
    ind_lst = list(range(len(label_lst)))

    # One bar per backend: mean runtime, standard deviation as error bar
    for ind, label in zip(ind_lst, label_lst):
        times = timings[label]
        plt.bar(ind, np.mean(times), yerr=np.std(times))

    plt.yscale("log")
    plt.xticks(ind_lst, label_lst)
    plt.title(title_template.format(n_samples, n_basis))
    plt.show()