## Adding a kernel to RAJAPerf 

In this section we add a simple kernel to RAJAPerf, in a new group called tutorial, with the kernel name Tutorial_KERNEL. 
The tutorial uses several skeleton files covering the sequential and OpenMP back ends, each in Base and RAJA variants.
The bulk of this notebook is spent populating these skeleton files.

In a subsequent notebook we will compile and run them, using the same charting techniques we demonstrated earlier.

[Back to Table of Contents](./00-intro-and-contents.ipynb)

## Always run the following cell to init/reinit notebook


In [None]:
// Point cling at the conda include and library directories, then load libomp.
#pragma cling add_include_path("/opt/conda/include/")
#pragma cling add_library_path("/opt/conda/lib/")
#pragma cling load("libomp")

// Add the Spack-installed camp and RAJA include directories and the RAJA library directory, then load libRAJA.
#pragma cling add_include_path("/home/jovyan/spack/opt/spack/linux-ubuntu22.04-x86_64/gcc-10.4.0/camp-2022.03.2-nh5kkatqzxgiwxusi4ddpyhf2zcqxyow/include/")
#pragma cling add_include_path("/home/jovyan/spack/opt/spack/linux-ubuntu22.04-x86_64/gcc-10.4.0/raja-2022.03.0-f7p4trkfeq4gcmqnt5623haejgcvwlvl/include/")
#pragma cling add_library_path("/home/jovyan/spack/opt/spack/linux-ubuntu22.04-x86_64/gcc-10.4.0/raja-2022.03.0-f7p4trkfeq4gcmqnt5623haejgcvwlvl/lib/")
#pragma cling load("libRAJA")

// Add the RAJAPerf build and source include directories and the build library directory, then load libcommon.
#pragma cling add_include_path("/home/jovyan/code/RAJAPerf/build_gcc/include/")
#pragma cling add_include_path("/home/jovyan/code/RAJAPerf/src/")
#pragma cling add_library_path("/home/jovyan/code/RAJAPerf/build_gcc/lib/")
#pragma cling load("libcommon")

## Introducing Polybench Covariance kernel as our surrogate

Using version 4 of the kernel
```
static
void kernel_covariance(int m, int n,
		       DATA_TYPE float_n,
		       DATA_TYPE POLYBENCH_2D(data,N,M,n,m),
		       DATA_TYPE POLYBENCH_2D(cov,M,M,m,m),
		       DATA_TYPE POLYBENCH_1D(mean,M,m))
{
  int i, j, k;
#pragma scop
  for (j = 0; j < _PB_M; j++)
    {
      mean[j] = SCALAR_VAL(0.0);
      for (i = 0; i < _PB_N; i++)
        mean[j] += data[i][j];
      mean[j] /= float_n;
    }

  for (i = 0; i < _PB_N; i++)
    for (j = 0; j < _PB_M; j++)
      data[i][j] -= mean[j];

  for (i = 0; i < _PB_M; i++)
    for (j = i; j < _PB_M; j++)
      {
        cov[i][j] = SCALAR_VAL(0.0);
        for (k = 0; k < _PB_N; k++)
	  cov[i][j] += data[k][i] * data[k][j];
        cov[i][j] /= (float_n - SCALAR_VAL(1.0));
        cov[j][i] = cov[i][j];
      }
#pragma endscop

}


```

## Create macros for loop bodies
We'll use flat (1-D) offset indexing, because the buffers are Real_ptr pointers rather than actual 2-D matrices.
For consistency, in all the variants below i and k index the M space and j indexes the N space.

In [None]:

// These definitions go into the header for the kernel.
//
// Index convention shared by every body below:
//   i, k : positions in the m (column) index space
//   j    : position in the n (row) index space
// ldata is an n-by-m matrix stored row-major in a flat buffer, so element
// (row j, column i) lives at ldata[(j*m)+i]. symmat is m-by-m, also row-major.
// mean has m entries, one per column.
//
// Every macro carries its own trailing semicolon, so it can be used with or
// without one at the point of use. The bodies expect the surrounding scope to
// provide m, float_n, mean, ldata, symmat and the loop indices they mention.

// Reset the accumulator for the mean of column i.
#define POLYBENCH_VAR_BODY0 \
  mean[i] = 0.0;

// Add element (j, i) to the running sum for column i.
#define POLYBENCH_VAR_BODY1 \
  mean[i] += ldata[(j*m)+i];

// Turn the column sum into the column mean.
#define POLYBENCH_VAR_BODY2 \
  mean[i] /= float_n;

// Centre element (j, i) on the mean of its own column. The mean must be
// indexed by the column index i: mean only has m entries, while j runs to n.
#define POLYBENCH_VAR_BODY3 \
  ldata[(j*m)+i] -= mean[i];

// Reset the covariance accumulator for the column pair (i, k).
#define POLYBENCH_VAR_BODY4 \
  symmat[(i*m)+k] = 0.0;

// Accumulate the product of the centred columns i and k taken at row j.
#define POLYBENCH_VAR_BODY5 \
  symmat[(i*m)+k] += ldata[(j*m)+i] * ldata[(j*m)+k];

// Normalise by (n - 1) and mirror the value into the lower triangle.
#define POLYBENCH_VAR_BODY6 \
  symmat[(i*m)+k] /= (float_n - 1.0); \
  symmat[(k*m)+i] = symmat[(i*m)+k];

In [None]:
#include "RAJA/RAJA.hpp"
#include "common/DataUtils.hpp"
// Problem dimensions for the small check runs: the data buffer holds N*M values.
const int M = 2;
const int N = 4;
// Buffers shared by the kernel variants exercised in the following cells.
rajaperf::Real_ptr g_data;
rajaperf::Real_ptr g_mean;
rajaperf::Real_ptr g_symmat;
// Reset the data-init counter before allocating, then create the input (random sign) and the two outputs (constant 0.0).
rajaperf::detail::resetDataInitCount();
rajaperf::detail::allocAndInitDataRandSign(g_data, N*M, RAJA::DATA_ALIGN, static_cast<rajaperf::VariantID>(0));
rajaperf::detail::allocAndInitDataConst(g_mean, M, RAJA::DATA_ALIGN, 0.0, static_cast<rajaperf::VariantID>(0));
rajaperf::detail::allocAndInitDataConst(g_symmat, M*M, RAJA::DATA_ALIGN, 0.0, static_cast<rajaperf::VariantID>(0));

In [None]:
#include "RAJA/RAJA.hpp"
#include "common/DataUtils.hpp"
// let's create a pure sequential kernel for now to check what the output looks like
// Note by default we create 1d index spaces but later we'll setup RAJA views to index 2d
void kernel_cov(int m, int n, rajaperf::Real_ptr data, rajaperf::Real_ptr mean, rajaperf::Real_ptr symmat)
{
  int i, j, k;

  int _PB_M = m;
  int _PB_N = n;
  double float_n = n;
  
  rajaperf::Real_ptr ldata;
  rajaperf::detail::allocData(ldata,n*m,RAJA::DATA_ALIGN,(rajaperf::VariantID)0);
  std::memcpy(ldata,data,sizeof(rajaperf::Real_ptr)*n*m);
    
  /* Determine mean of column vectors of input data matrix */
  for (i = 0; i < _PB_M; i++) {
      POLYBENCH_VAR_BODY0;
      for (j = 0; j < _PB_N; j++) {
        POLYBENCH_VAR_BODY1; 
      }
      POLYBENCH_VAR_BODY2;
  }

  /* Center the column vectors. */
  for (j = 0; j < _PB_N; j++) {
    for (i = 0; i < _PB_M; i++) {
      POLYBENCH_VAR_BODY3;
    }
  }

  /* Calculate the m * m covariance matrix. */
  for (i = 0; i < _PB_M; i++) {
    for (k = i; k < _PB_M; k++) { // note this iteration space k is variable dependent on i
      POLYBENCH_VAR_BODY4;
      for (j = 0; j < _PB_N; j++) {
	    POLYBENCH_VAR_BODY5;
      }
      POLYBENCH_VAR_BODY6
    }
  }
    
  rajaperf::detail::deallocData(ldata, (rajaperf::VariantID)0);
}

In [None]:
#include "RAJA/RAJA.hpp"
#include "common/DataUtils.hpp"
{
  // Run the sequential kernel, then print one checksum per buffer in the order mean, symmat, data.
  kernel_cov(M, N, g_data, g_mean, g_symmat);
  const long double checksums[] = {
    rajaperf::calcChecksum(g_mean, M, 1.0),
    rajaperf::calcChecksum(g_symmat, M*M, 1.0),
    rajaperf::calcChecksum(g_data, N*M, 1.0)
  };
  for (const long double value : checksums) {
    printf("%8.12Lf\n", value);
  }
}

## Set up a sequential lambda version to make it easier to code up the RAJA Seq variant

In [None]:
#include "RAJA/RAJA.hpp"
#include "common/DataUtils.hpp"
void kernel_cov_lambda(int m, int n, rajaperf::Real_ptr data, rajaperf::Real_ptr mean, rajaperf::Real_ptr symmat)
{
    int i, j, k;
    rajaperf::Real_ptr ldata;
    rajaperf::detail::allocData(ldata,m*n,RAJA::DATA_ALIGN,(rajaperf::VariantID)0);
    std::memcpy(ldata,data,sizeof(rajaperf::Real_ptr)*m*n);
    double float_n = n;
    
    using Index_type = RAJA::Index_type;
    using Real_type = RAJA::Real_type;
    
    auto poly_var_base_lam0 = [=] (Index_type i) {
      POLYBENCH_VAR_BODY0;
    };
    
    auto poly_var_base_lam1 = [=] (Index_type i, Index_type j) {
      POLYBENCH_VAR_BODY1;
    };
    
    auto poly_var_base_lam2 = [=] (Index_type i) {
      POLYBENCH_VAR_BODY2;
    };
  
    auto poly_var_base_lam3 = [=] (Index_type i, Index_type j) {
      POLYBENCH_VAR_BODY3;
    };
    
    auto poly_var_base_lam4 = [=] (Index_type i, Index_type k) {
      POLYBENCH_VAR_BODY4;
    };

    auto poly_var_base_lam5 = [=] (Index_type i, Index_type j, Index_type k) {
      POLYBENCH_VAR_BODY5;
    };
    
    auto poly_var_base_lam6 = [=] (Index_type i, Index_type k) {
      POLYBENCH_VAR_BODY6;
    };

    for(Index_type i = 0; i < m; ++i) {
        poly_var_base_lam0(i);
        for(Index_type j = 0; j < n; ++j) {
            poly_var_base_lam1(i,j);
        }
        poly_var_base_lam2(i);
    }
    for(Index_type j = 0; j < n; ++j) {
        for(Index_type i = 0; i < m; ++i) {
            poly_var_base_lam3(i,j);
        }
    }
#if 1
    for(Index_type i = 0; i < m; ++i) {
        for(Index_type k = i; k < m; ++k) {
            poly_var_base_lam4(i,k);
            for(Index_type j = 0; j < n; ++j) {
                poly_var_base_lam5(i,j,k);
            }
            poly_var_base_lam6(i,k);
        }
    }
#endif                
    rajaperf::detail::deallocData(ldata, (rajaperf::VariantID)0);
}

In [None]:
#include "RAJA/RAJA.hpp"
#include "common/DataUtils.hpp"
{
  // Run the lambda-based kernel, then print one checksum per buffer in the order mean, symmat, data.
  kernel_cov_lambda(M, N, g_data, g_mean, g_symmat);
  const long double checksums[] = {
    rajaperf::calcChecksum(g_mean, M, 1.0),
    rajaperf::calcChecksum(g_symmat, M*M, 1.0),
    rajaperf::calcChecksum(g_data, N*M, 1.0)
  };
  for (const long double value : checksums) {
    printf("%8.12Lf\n", value);
  }
}

## Set up RAJA views and RAJA lambda bodies in terms of the RAJA views
The cell below illustrates how to set up a view with a simple 2D layout type.

In [None]:
#include "RAJA/RAJA.hpp"
// Side length of the square demo matrix.
int L=2;

// Flat backing storage for the view (L*L doubles).
// NOTE(review): this buffer is never released in this cell; free it with
// delete[] once Aview is no longer needed.
double* A = new double [L * L];

// Wrap the flat buffer in a 2-D view with extents L x L, then fill it through
// the view's (row, column) call operator.
RAJA::View<double,RAJA::Layout<2> > Aview(A, L, L);
Aview(0,0) = 0.0;
Aview(0,1) = 1.0;
Aview(1,0) = 2.0;
Aview(1,1) = 3.0;
// Print the address of every element to show how (i, j) maps onto the buffer.
for(int i = 0; i < L; ++i) {
    for(int j = 0; j < L; ++j) {
      // %p expects a void*, so convert the element address explicitly.
      printf("%p ", static_cast<void*>(&Aview(i,j)));
    }
}

In [None]:
// Writes the first N entries of C to std::cout, one per line, in the form
// "array[<index>] = <value>". Each line is terminated with std::endl.
template <typename T>
void printValues(T* C, int N)
{
  int idx = 0;
  while (idx < N) {
    std::cout << "array[" << idx << "] = " << C[idx] << std::endl;
    ++idx;
  }
}

In [None]:
// View-based counterparts of the loop bodies.
//
// Index convention used by the *_RAJA bodies (it follows the original
// Polybench listing): i is a row index in [0, n), while j, j1 and j2 are
// column indices in [0, m).
//
// The data buffer holds n rows of m values each (row-major), so the 2-D view
// over it must be built with extents (n, m); dataview(i, j) then addresses
// ldata[i*m + j]. The macros expect Real_type, mean, ldata, symmat, m, n and
// float_n to be visible at the point of use.
#define POLYBENCH_VAR_VIEWS_RAJA \
  using VIEW_1 = RAJA::View<Real_type, RAJA::Layout<1> >; \
  using VIEW_2 = RAJA::View<Real_type, RAJA::Layout<2> >; \
  VIEW_1 meanview(mean,m); \
  VIEW_2 dataview(ldata,n,m); \
  VIEW_2 symmatview(symmat,m,m);

// Reset the accumulator for the mean of column j.
#define POLYBENCH_VAR_BODY0_RAJA \
  meanview(j) = 0.0;

// Add element (i, j) to the running sum for column j.
#define POLYBENCH_VAR_BODY1_RAJA \
  meanview(j) += dataview(i,j);

// Turn the column sum into the column mean.
#define POLYBENCH_VAR_BODY2_RAJA \
  meanview(j) /= float_n;

// Centre element (i, j) on the mean of its column.
#define POLYBENCH_VAR_BODY3_RAJA \
  dataview(i,j) -= meanview(j);

// Reset the covariance accumulator for the column pair (j1, j2).
#define POLYBENCH_VAR_BODY4_RAJA \
  symmatview(j1,j2) = 0.0;

// Accumulate the product of the centred columns j1 and j2 taken at row i.
#define POLYBENCH_VAR_BODY5_RAJA \
  symmatview(j1,j2) += dataview(i,j1) * dataview(i,j2);

// Normalise by (n - 1), matching POLYBENCH_VAR_BODY6, then mirror the value
// into the lower triangle.
#define POLYBENCH_VAR_BODY6_RAJA \
  symmatview(j1,j2) /= (float_n - 1.0); \
  symmatview(j2,j1) = symmatview(j1,j2);



## Set up a RAJA Seq function


In [None]:
#include "RAJA/RAJA.hpp"
#include "common/DataUtils.hpp"
void kernel_cov_raja(int m, int n, rajaperf::Real_ptr data, rajaperf::Real_ptr mean, rajaperf::Real_ptr symmat)
{
#if 1
    using Index_type = RAJA::Index_type;
    using Real_type = RAJA::Real_type;
    //Index_type i,j,k;
    rajaperf::Real_ptr ldata;
    rajaperf::detail::allocData(ldata,m*n,RAJA::DATA_ALIGN,(rajaperf::VariantID)0);
    std::memcpy(ldata,data,sizeof(rajaperf::Real_ptr)*m*n);
#endif
    double float_n = n;
   
    #define POLYBENCH_VAR_VIEWS_RAJA \
      using VIEW_1 = RAJA::View<Real_type, RAJA::Layout<1> >; \
      using VIEW_2 = RAJA::View<Real_type, RAJA::Layout<2> >; \
      VIEW_1 meanview(mean,m); \
      VIEW_2 dataview(ldata,n,m); \
      VIEW_2 symmatview(symmat,m,m);
    
    POLYBENCH_VAR_VIEWS_RAJA;

    //RAJA::View<Real_type, RAJA::Layout<1> > meanview(mean,m);
    //RAJA::View<Real_type, RAJA::Layout<2> > dataview(ldata,n,m);
    //RAJA::View<Real_type, RAJA::Layout<2> > symmatview(symmat,m,m);

    auto poly_var_base_lam0 = [=] (Index_type i) {
        meanview(i) = 0.0;
    };
    
   auto poly_var_base_lam1 = [=] (Index_type i, Index_type j) {
       meanview(i) += dataview(j,i);
    };
    
    auto poly_var_base_lam2 = [=] (Index_type i) {
        meanview(i) /= float_n;
    };
  
    auto poly_var_base_lam3 = [=] (Index_type i, Index_type j) {
        dataview(j,i) -= meanview(i);
    };
    
    auto poly_var_base_lam4 = [=] (Index_type i) {
        for(Index_type kk=i; kk < m; kk++) {
          symmatview(i,kk) = 0.0;
          for(Index_type jj=0; jj<n; jj++) {
              symmatview(i,kk) += dataview(jj,i) * dataview(jj,kk);
          }
          symmatview(kk,i) = symmatview(i,kk);
        }
    };

     using EXECPOL = RAJA::KernelPolicy<
                        RAJA::statement::For<0, RAJA::loop_exec,    // over m
                          RAJA::statement::Lambda<0, RAJA::Segs<0> >,  // i
                          RAJA::statement::For<1, RAJA::loop_exec,  // over n
                            RAJA::statement::Lambda<1, RAJA::Segs<0,1> > // i,j
                          >,
                          RAJA::statement::Lambda<2, RAJA::Segs<0> > // i
                        >,
                        RAJA::statement::For<1, RAJA::loop_exec,  // over n
                          RAJA::statement::For<0, RAJA::loop_exec, // over m
                            RAJA::statement::Lambda<3, RAJA::Segs<0,1> > // i,j
                          >
                        >,
                        RAJA::statement::For<0, RAJA::loop_exec, // over m
                          RAJA::statement::Lambda<4, RAJA::Segs<0> >
                        >
                      >;  
 
   RAJA::kernel_param<EXECPOL>(
          RAJA::make_tuple(RAJA::RangeSegment{0,m}, RAJA::RangeSegment{0,n}),
          RAJA::tuple<double>{0.0}, // we're using kernel_param for the convenience of specifying iteration spaces (Segs), so we need this param tuple
          poly_var_base_lam0,
          poly_var_base_lam1,
          poly_var_base_lam2,
          poly_var_base_lam3,
          poly_var_base_lam4
        );
 
  rajaperf::detail::deallocData(ldata, (rajaperf::VariantID)0);
}

In [None]:
#include "RAJA/RAJA.hpp"
#include "common/DataUtils.hpp"
{
  // Run the RAJA kernel, then print one checksum per buffer in the order mean, symmat, data.
  kernel_cov_raja(M, N, g_data, g_mean, g_symmat);
  const long double checksums[] = {
    rajaperf::calcChecksum(g_mean, M, 1.0),
    rajaperf::calcChecksum(g_symmat, M*M, 1.0),
    rajaperf::calcChecksum(g_data, N*M, 1.0)
  };
  for (const long double value : checksums) {
    printf("%8.12Lf\n", value);
  }
}