## Adding a kernel to RAJAPerf 

In this section we will be adding a simple kernel to RAJAPerf, into a new group called tutorial with kernel name Tutorial_KERNEL. 
The tutorial uses several skeletal files for sequential and openmp, base and RAJA variants.
The bulk of this notebook is to populate these skeletal files.

In a subsequent notebook we will compile and run them, using the same charting techniques we demonstrated earlier.

[Back to Table of Contents](./00-intro-and-contents.ipynb)

## Always run the following cell to init/reinit notebook


In [None]:
#pragma cling add_include_path("/opt/conda/include/")
#pragma cling add_library_path("/opt/conda/lib/")
#pragma cling load("libomp")

#pragma cling add_include_path("/home/jovyan/spack/opt/spack/linux-ubuntu22.04-x86_64/gcc-10.4.0/camp-2022.03.2-nh5kkatqzxgiwxusi4ddpyhf2zcqxyow/include/")
#pragma cling add_include_path("/home/jovyan/spack/opt/spack/linux-ubuntu22.04-x86_64/gcc-10.4.0/raja-2022.03.0-f7p4trkfeq4gcmqnt5623haejgcvwlvl/include/")
#pragma cling add_library_path("/home/jovyan/spack/opt/spack/linux-ubuntu22.04-x86_64/gcc-10.4.0/raja-2022.03.0-f7p4trkfeq4gcmqnt5623haejgcvwlvl/lib/")
#pragma cling load("libRAJA")

#pragma cling add_include_path("/home/jovyan/code/RAJAPerf/build_gcc/include/")
#pragma cling add_include_path("/home/jovyan/code/RAJAPerf/src/")
#pragma cling add_library_path("/home/jovyan/code/RAJAPerf/build_gcc/lib/")
#pragma cling load("libcommon")

## Introducing Polybench Covariance kernel as our surrogate

Using version 4 of the kernel
```
static
void kernel_covariance(int m, int n,
		       DATA_TYPE float_n,
		       DATA_TYPE POLYBENCH_2D(data,M,N,m,n),
		       DATA_TYPE POLYBENCH_2D(cov,M,M,m,m),
		       DATA_TYPE POLYBENCH_1D(mean,M,m))
{
  int i, j, k;
#pragma scop
  for (j = 0; j < _PB_M; j++)
    {
      mean[j] = SCALAR_VAL(0.0);
      for (i = 0; i < _PB_N; i++)
        mean[j] += data[i][j];
      mean[j] /= float_n;
    }

  for (i = 0; i < _PB_N; i++)
    for (j = 0; j < _PB_M; j++)
      data[i][j] -= mean[j];

  for (i = 0; i < _PB_M; i++)
    for (j = i; j < _PB_M; j++)
      {
        cov[i][j] = SCALAR_VAL(0.0);
        for (k = 0; k < _PB_N; k++)
	  cov[i][j] += data[k][i] * data[k][j];
        cov[i][j] /= (float_n - SCALAR_VAL(1.0));
        cov[j][i] = cov[i][j];
      }
#pragma endscop

}


```

## create macros for loop bodies
We'll use flat offset indexing (e.g. `ldata[(j*m)+i]`) for the non-RAJA variants, since the data is held in plain Real_ptr buffers rather than actual 2D matrices.

For consistency we'll use i,j,k = M,N,M index spaces in all the variants below

For the RAJA variants we'll use RAJA Views

Note for the last loop body, the inner for loop is dependent on the outer loop index.
So, most of the symmat calculation is put into a loop body.

In [None]:
// Loop bodies shared by the base (non-RAJA) covariance variants.
// Every macro expands against names supplied by the surrounding scope:
//   i, j      - current column / row index
//   m, n      - number of columns / rows
//   float_n   - divisor used for the mean (and, minus one, for the covariance)
//   mean      - m entries
//   ldata     - n*m entries, element (row j, column i) stored at j*m + i
//   symmat    - m*m entries, element (i, c) stored at i*m + c

// Clear the running sum for column i.
#define POLYBENCH_VAR_BODY0 \
  mean[i] = 0.0;

// Accumulate element (j, i) into the sum for column i.
#define POLYBENCH_VAR_BODY1 \
  mean[i] += ldata[i + (j*m)];

// Turn the column sum into the column mean.
#define POLYBENCH_VAR_BODY2 \
  mean[i] /= float_n;

// Centre element (j, i) on its column mean.
#define POLYBENCH_VAR_BODY3 \
  ldata[i + (j*m)] -= mean[i];

// Fill row i of the upper triangle (columns i..m-1) and mirror each value
// into the lower triangle. The inner loops live in the macro because their
// start depends on the outer index i.
#define POLYBENCH_VAR_BODY4 \
  for(int col = i; col < m; col++) { \
    symmat[col + (i*m)] = 0.0; \
    for(int row = 0; row < n; row++) { \
      symmat[col + (i*m)] += ldata[i + (row*m)] * ldata[col + (row*m)]; \
    } \
    symmat[col + (i*m)] /= (float_n - 1.0); \
    symmat[i + (col*m)] = symmat[col + (i*m)]; \
  }


## setup RAJA views and RAJA lambda bodies in terms of the RAJA views
The cell below illustrates how to set up a view with a simple 2D layout type.
Note that the rightmost index, j in our example, is the fastest-varying one.

In [None]:
#include "RAJA/RAJA.hpp"
// Small demonstration of a 2D RAJA::View over a flat allocation.
// L is the extent of each of the two dimensions.
int L=2;

// Backing storage for the L x L view; released at the end of the cell.
double* A = new double [L * L];

RAJA::View<double,RAJA::Layout<2> > Aview(A, L, L);
Aview(0,0) = 0.0;
Aview(0,1) = 1.0;
Aview(1,0) = 2.0;
Aview(1,1) = 3.0;

// Print the address of every element in (i,j) order so the memory layout of
// the view can be read off from the output.
for(int i = 0; i < L; ++i) {
    for(int j = 0; j < L; ++j) {
      // %p is only defined for void*, so convert the element address explicitly.
      printf("%p ",static_cast<void*>(&Aview(i,j)));
    }
}

delete[] A;

In [None]:
#include "RAJA/RAJA.hpp"

// Wraps the flat buffers in RAJA views so the loop bodies can use (row, column)
// indexing. Expects mean, ldata, symmat, m, n and the Real_type alias to be
// visible where the macro is expanded.
//   meanview   - 1D, extent m
//   dataview   - 2D, extents (n, m): first index is the row j, second the column i
//   symmatview - 2D, extents (m, m)
#define POLYBENCH_VAR_VIEWS_RAJA \
  using VIEW_1 = RAJA::View<Real_type, RAJA::Layout<1> >; \
  using VIEW_2 = RAJA::View<Real_type, RAJA::Layout<2> >; \
  VIEW_2 dataview(ldata,n,m); \
  VIEW_1 meanview(mean,m); \
  VIEW_2 symmatview(symmat,m,m);

// Clear the running sum for column i.
#define POLYBENCH_VAR_BODY0_RAJA \
  meanview(i) = 0.0;

// Accumulate element (j, i) into the sum for column i.
#define POLYBENCH_VAR_BODY1_RAJA \
  meanview(i) += dataview(j,i);

// Turn the column sum into the column mean.
#define POLYBENCH_VAR_BODY2_RAJA \
  meanview(i) /= float_n;

// Centre element (j, i) on its column mean.
#define POLYBENCH_VAR_BODY3_RAJA \
  dataview(j,i) -= meanview(i);

// Fill row i of the upper triangle (columns i..m-1) and mirror each value
// into the lower triangle; the inner loops start at i, so they stay in the body.
#define POLYBENCH_VAR_BODY4_RAJA \
  for(Index_type col=i; col < m; col++) { \
    symmatview(i,col) = 0.0; \
    for(Index_type row=0; row<n; row++) { \
      symmatview(i,col) += dataview(row,i) * dataview(row,col); \
    } \
    symmatview(i,col) /= (float_n - 1.0); \
    symmatview(col,i) = symmatview(i,col); \
  }



In [None]:
#include "RAJA/RAJA.hpp"
#include "common/DataUtils.hpp"

// Extents used by the in-notebook experiments: the data buffer holds N*M
// values, the mean buffer M values and the symmat buffer M*M values.
const int M = 2;
const int N = 4;
//rajaperf::Real_ptr g_data = (rajaperf::Real_ptr)rajaperf::detail::allocHostData(N*M, RAJA::DATA_ALIGN);
//rajaperf::Real_ptr g_mean = (rajaperf::Real_ptr)rajaperf::detail::allocHostData(M, RAJA::DATA_ALIGN);
//rajaperf::Real_ptr g_symmat = (rajaperf::Real_ptr)rajaperf::detail::allocHostData(M*M, RAJA::DATA_ALIGN);

// Buffers are sized in units of the element type (Real_type). Sizing them with
// sizeof(Real_ptr) would only be right where a pointer and a Real_type happen
// to have the same size.
rajaperf::Real_ptr g_data = (rajaperf::Real_ptr)malloc(N*M*sizeof(rajaperf::Real_type));
rajaperf::Real_ptr g_mean = (rajaperf::Real_ptr)malloc(M*sizeof(rajaperf::Real_type));
rajaperf::Real_ptr g_symmat = (rajaperf::Real_ptr)malloc(M*M*sizeof(rajaperf::Real_type));

// Initialise the inputs with the RAJAPerf data helpers; mean and symmat start at 0.0.
rajaperf::detail::resetDataInitCount();
rajaperf::detail::initDataRandSign(g_data,N*M);
rajaperf::detail::initDataConst(g_mean,M,0.0);
rajaperf::detail::initDataConst(g_symmat,M*M,0.0);

// Show the initial contents of the mean buffer.
for(int i=0;i<M;i++) printf("%lf ",g_mean[i]);

In [None]:
#include "RAJA/RAJA.hpp"
#include "common/DataUtils.hpp"
// Pure sequential reference version of the covariance kernel, used to check
// what the output looks like. All buffers are indexed as flat 1D arrays here;
// the RAJA variants wrap the same buffers in 2D views.
//
//   m, n    - number of columns / rows of the data matrix
//   data    - input, n*m values; not modified (the kernel works on a copy)
//   mean    - output, m values: mean of each column
//   symmat  - output, m*m values: symmetric covariance matrix
void kernel_cov(int m, int n, rajaperf::Real_ptr data, rajaperf::Real_ptr mean, rajaperf::Real_ptr symmat)
{
  // i and j are the index names the POLYBENCH_VAR_BODY* macros expand against.
  int i, j;

  int _PB_M = m;
  int _PB_N = n;
  double float_n = n;

  // Scratch copy of the input, sized from the arguments (not the notebook
  // globals) and in units of the element type rather than the pointer type.
  rajaperf::Real_ptr ldata = (rajaperf::Real_ptr)malloc(n*m*sizeof(rajaperf::Real_type));
  std::memcpy(ldata,data,sizeof(rajaperf::Real_type)*n*m);

  /* Determine mean of column vectors of input data matrix */
  for (i = 0; i < _PB_M; i++) {
      POLYBENCH_VAR_BODY0;
      for (j = 0; j < _PB_N; j++) {
        POLYBENCH_VAR_BODY1;
      }
      POLYBENCH_VAR_BODY2;
  }

  /* Center the column vectors. */
  for (j = 0; j < _PB_N; j++) {
    for (i = 0; i < _PB_M; i++) {
      POLYBENCH_VAR_BODY3;
    }
  }

  /* Calculate the m * m covariance matrix. */
  for (i = 0; i < _PB_M; i++) {
    POLYBENCH_VAR_BODY4;
  }

  printf("Done with kernel\n");

  free(ldata);
}

In [None]:
#include "RAJA/RAJA.hpp"
#include "common/DataUtils.hpp"
{
  // Run the sequential kernel, then print the checksums of mean, symmat and
  // the (untouched) input data, one per line, in that order.
  kernel_cov(M,N,g_data,g_mean,g_symmat);
  const long double sums[] = {
    rajaperf::detail::calcChecksum(g_mean,M,1.0),
    rajaperf::detail::calcChecksum(g_symmat,M*M,1.0),
    rajaperf::detail::calcChecksum(g_data,N*M,1.0)
  };
  for (const long double value : sums) {
    printf("%8.12Lf\n",value);
  }
}

## Set up a lambda-based sequential version to make it easier to code up the RAJA seq variant

In [None]:
#include "RAJA/RAJA.hpp"
#include "common/DataUtils.hpp"
void kernel_cov_lambda(int m, int n, rajaperf::Real_ptr data, rajaperf::Real_ptr mean, rajaperf::Real_ptr symmat)
{
    int i, j, k;
    rajaperf::Real_ptr ldata = (rajaperf::Real_ptr)malloc(N*M*sizeof(rajaperf::Real_ptr));
    std::memcpy(ldata,data,sizeof(rajaperf::Real_ptr)*m*n);
    double float_n = n;
    
    using Index_type = RAJA::Index_type;
    using Real_type = RAJA::Real_type;
    
    auto poly_var_base_lam0 = [=] (Index_type i) {
      POLYBENCH_VAR_BODY0;
    };
    
    auto poly_var_base_lam1 = [=] (Index_type i, Index_type j) {
      POLYBENCH_VAR_BODY1;
    };
    
    auto poly_var_base_lam2 = [=] (Index_type i) {
      POLYBENCH_VAR_BODY2;
    };
  
    auto poly_var_base_lam3 = [=] (Index_type i, Index_type j) {
      POLYBENCH_VAR_BODY3;
    };
    
    auto poly_var_base_lam4 = [=] (Index_type i) {
      POLYBENCH_VAR_BODY4;
    };

    for(Index_type i = 0; i < m; ++i) {
        poly_var_base_lam0(i);
        for(Index_type j = 0; j < n; ++j) {
            poly_var_base_lam1(i,j);
        }
        poly_var_base_lam2(i);
    }
    
    for(Index_type j = 0; j < n; ++j) {
        for(Index_type i = 0; i < m; ++i) {
            poly_var_base_lam3(i,j);
        }
    }

    for(Index_type i = 0; i < m; ++i) {
        poly_var_base_lam4(i);
    }
              
    free(ldata);
}

In [None]:
#include "RAJA/RAJA.hpp"
#include "common/DataUtils.hpp"
{
  // Run the lambda-based kernel, then print the checksums of mean, symmat and
  // the (untouched) input data, one per line, in that order.
  kernel_cov_lambda(M,N,g_data,g_mean,g_symmat);
  const long double sums[] = {
    rajaperf::detail::calcChecksum(g_mean,M,1.0),
    rajaperf::detail::calcChecksum(g_symmat,M*M,1.0),
    rajaperf::detail::calcChecksum(g_data,N*M,1.0)
  };
  for (const long double value : sums) {
    printf("%8.12Lf\n",value);
  }
}

## setup a RAJA Seq function


In [None]:
//%%file raja_seq_cov.txt
#include "RAJA/RAJA.hpp"
#include "common/DataUtils.hpp"
// RAJA sequential variant of the covariance kernel: the loop bodies are the
// view-based lambdas and the loop structure is expressed as a RAJA kernel policy.
//
//   m, n    - number of columns / rows of the data matrix
//   data    - input, n*m values; not modified (the kernel works on a copy)
//   mean    - output, m values
//   symmat  - output, m*m values
void kernel_cov_raja(int m, int n, rajaperf::Real_ptr data, rajaperf::Real_ptr mean, rajaperf::Real_ptr symmat)
{
    using Index_type = RAJA::Index_type;
    using Real_type = RAJA::Real_type;

    // Scratch copy of the input, sized from the arguments (not the notebook
    // globals) and in units of the element type rather than the pointer type.
    rajaperf::Real_ptr ldata = (rajaperf::Real_ptr)malloc(m*n*sizeof(rajaperf::Real_type));
    std::memcpy(ldata,data,sizeof(rajaperf::Real_type)*m*n);

    double float_n = n;

    // Declares meanview, dataview and symmatview over mean, ldata and symmat.
    POLYBENCH_VAR_VIEWS_RAJA;

    auto poly_var_base_lam0 = [=] (Index_type i) {
      POLYBENCH_VAR_BODY0_RAJA;
    };

    auto poly_var_base_lam1 = [=] (Index_type i, Index_type j) {
      POLYBENCH_VAR_BODY1_RAJA;
    };

    auto poly_var_base_lam2 = [=] (Index_type i) {
      POLYBENCH_VAR_BODY2_RAJA;
    };

    auto poly_var_base_lam3 = [=] (Index_type i, Index_type j) {
      POLYBENCH_VAR_BODY3_RAJA;
    };

    auto poly_var_base_lam4 = [=] (Index_type i) {
      POLYBENCH_VAR_BODY4_RAJA;
    };

    // Three loop nests listed one after another. Segment 0 is the range over
    // m (index i) and segment 1 the range over n (index j).
    using EXECPOL = RAJA::KernelPolicy<
                        RAJA::statement::For<0, RAJA::loop_exec,    // over m
                          RAJA::statement::Lambda<0, RAJA::Segs<0> >,  // i
                          RAJA::statement::For<1, RAJA::loop_exec,  // over n
                            RAJA::statement::Lambda<1, RAJA::Segs<0,1> > // i,j
                          >,
                          RAJA::statement::Lambda<2, RAJA::Segs<0> > // i
                        >,
                        RAJA::statement::For<1, RAJA::loop_exec,  // over n
                          RAJA::statement::For<0, RAJA::loop_exec, // over m
                            RAJA::statement::Lambda<3, RAJA::Segs<0,1> > // i,j
                          >
                        >,
                        RAJA::statement::For<0, RAJA::loop_exec, // over m
                          RAJA::statement::Lambda<4, RAJA::Segs<0> > // i
                        >
                      >;

    RAJA::kernel_param<EXECPOL>(
          RAJA::make_tuple(RAJA::RangeSegment{0,m}, RAJA::RangeSegment{0,n}),
          RAJA::tuple<double>{0.0}, // we're using kernel_param for the convenience of specifying iteration spaces (Segs), so we need this param tuple
          poly_var_base_lam0,
          poly_var_base_lam1,
          poly_var_base_lam2,
          poly_var_base_lam3,
          poly_var_base_lam4
        );

    free(ldata);
}

In [None]:
#include "RAJA/RAJA.hpp"
#include "common/DataUtils.hpp"
{
  // Run the RAJA sequential kernel, then print the checksums of mean, symmat
  // and the (untouched) input data, one per line, in that order.
  kernel_cov_raja(M,N,g_data,g_mean,g_symmat);
  const long double sums[] = {
    rajaperf::detail::calcChecksum(g_mean,M,1.0),
    rajaperf::detail::calcChecksum(g_symmat,M*M,1.0),
    rajaperf::detail::calcChecksum(g_data,N*M,1.0)
  };
  for (const long double value : sums) {
    printf("%8.12Lf\n",value);
  }
}

In [None]:
#include "RAJA/RAJA.hpp"
#include "common/DataUtils.hpp"
// Base OpenMP variant of the covariance kernel: the same three loop nests as
// the sequential version, each run as an OpenMP parallel loop.
//
//   m, n    - number of columns / rows of the data matrix
//   data    - input, n*m values; not modified (the kernel works on a copy)
//   mean    - output, m values
//   symmat  - output, m*m values
void kernel_cov_openmp(int m, int n, rajaperf::Real_ptr data, rajaperf::Real_ptr mean, rajaperf::Real_ptr symmat)
{
  // i and j are the index names the POLYBENCH_VAR_BODY* macros expand against.
  int i, j;

  int _PB_M = m;
  int _PB_N = n;
  double float_n = n;

  // Scratch copy of the input, sized from the arguments (not the notebook
  // globals) and in units of the element type rather than the pointer type.
  rajaperf::Real_ptr ldata = (rajaperf::Real_ptr)malloc(n*m*sizeof(rajaperf::Real_type));
  std::memcpy(ldata,data,sizeof(rajaperf::Real_type)*n*m);

  /* Determine mean of column vectors of input data matrix */
  // Only the loop variable of the associated loop (i) is made private
  // automatically. j is declared outside the construct, so it must be listed
  // as private or every thread would share one inner-loop counter.
  #pragma omp parallel for private(j)
  for (i = 0; i < _PB_M; i++) {
      POLYBENCH_VAR_BODY0;
      for (j = 0; j < _PB_N; j++) {
        POLYBENCH_VAR_BODY1;
      }
      POLYBENCH_VAR_BODY2;
  }

  /* Center the column vectors. */
  // collapse(2) associates both loops with the construct, so j and i are both
  // private here without an explicit clause.
  #pragma omp parallel for collapse(2)
  for (j = 0; j < _PB_N; j++) {
    for (i = 0; i < _PB_M; i++) {
      POLYBENCH_VAR_BODY3;
    }
  }

  /* Calculate the m * m covariance matrix. */
  // The inner loops of this body declare their own counters inside the macro.
  #pragma omp parallel for
  for (i = 0; i < _PB_M; i++) {
    POLYBENCH_VAR_BODY4;
  }

  free(ldata);
}

In [None]:
#include "RAJA/RAJA.hpp"
#include "common/DataUtils.hpp"
{
  // Run the base OpenMP kernel, then print the checksums of mean, symmat and
  // the (untouched) input data, one per line, in that order.
  kernel_cov_openmp(M,N,g_data,g_mean,g_symmat);
  const long double sums[] = {
    rajaperf::detail::calcChecksum(g_mean,M,1.0),
    rajaperf::detail::calcChecksum(g_symmat,M*M,1.0),
    rajaperf::detail::calcChecksum(g_data,N*M,1.0)
  };
  for (const long double value : sums) {
    printf("%8.12Lf\n",value);
  }
}

In [None]:
//%%file raja_openmp_cov.txt
#include "RAJA/RAJA.hpp"
#include "common/DataUtils.hpp"
// RAJA OpenMP variant of the covariance kernel: same lambdas as the RAJA
// sequential variant, with the outermost loop of each nest run under
// RAJA::omp_parallel_for_exec.
//
//   m, n    - number of columns / rows of the data matrix
//   data    - input, n*m values; not modified (the kernel works on a copy)
//   mean    - output, m values
//   symmat  - output, m*m values
void kernel_cov_raja_openmp(int m, int n, rajaperf::Real_ptr data, rajaperf::Real_ptr mean, rajaperf::Real_ptr symmat)
{

    using Index_type = RAJA::Index_type;
    using Real_type = RAJA::Real_type;

    // Scratch copy of the input, sized from the arguments (not the notebook
    // globals) and in units of the element type rather than the pointer type.
    rajaperf::Real_ptr ldata = (rajaperf::Real_ptr)malloc(m*n*sizeof(rajaperf::Real_type));
    std::memcpy(ldata,data,sizeof(rajaperf::Real_type)*m*n);

    double float_n = n;

    // Declares meanview, dataview and symmatview over mean, ldata and symmat.
    POLYBENCH_VAR_VIEWS_RAJA;

    auto poly_var_base_lam0 = [=] (Index_type i) {
      POLYBENCH_VAR_BODY0_RAJA;
    };

    auto poly_var_base_lam1 = [=] (Index_type i, Index_type j) {
      POLYBENCH_VAR_BODY1_RAJA;
    };

    auto poly_var_base_lam2 = [=] (Index_type i) {
      POLYBENCH_VAR_BODY2_RAJA;
    };

    auto poly_var_base_lam3 = [=] (Index_type i, Index_type j) {
      POLYBENCH_VAR_BODY3_RAJA;
    };

    auto poly_var_base_lam4 = [=] (Index_type i) {
      POLYBENCH_VAR_BODY4_RAJA;
    };

    // Three loop nests listed one after another. Segment 0 is the range over
    // m (index i) and segment 1 the range over n (index j). Only the outer
    // loop of each nest uses the OpenMP policy.
    using EXECPOL = RAJA::KernelPolicy<
                        RAJA::statement::For<0, RAJA::omp_parallel_for_exec,    // over m
                          RAJA::statement::Lambda<0, RAJA::Segs<0> >,  // i
                          RAJA::statement::For<1, RAJA::loop_exec,  // over n
                            RAJA::statement::Lambda<1, RAJA::Segs<0,1> > // i,j
                          >,
                          RAJA::statement::Lambda<2, RAJA::Segs<0> > // i
                        >,
                        RAJA::statement::For<1, RAJA::omp_parallel_for_exec,  // over n
                          RAJA::statement::For<0, RAJA::loop_exec, // over m
                            RAJA::statement::Lambda<3, RAJA::Segs<0,1> > // i,j
                          >
                        >,
                        RAJA::statement::For<0, RAJA::omp_parallel_for_exec, // over m
                          RAJA::statement::Lambda<4, RAJA::Segs<0> > // i
                        >
                      >;

    RAJA::kernel_param<EXECPOL>(
          RAJA::make_tuple(RAJA::RangeSegment{0,m}, RAJA::RangeSegment{0,n}),
          RAJA::tuple<double>{0.0}, // we're using kernel_param for the convenience of specifying iteration spaces (Segs), so we need this param tuple
          poly_var_base_lam0,
          poly_var_base_lam1,
          poly_var_base_lam2,
          poly_var_base_lam3,
          poly_var_base_lam4
        );

    free(ldata);
}

In [None]:
#include "RAJA/RAJA.hpp"
#include "common/DataUtils.hpp"
{
  // Run the RAJA OpenMP kernel, then print the checksums of mean, symmat and
  // the (untouched) input data, one per line, in that order.
  kernel_cov_raja_openmp(M,N,g_data,g_mean,g_symmat);
  const long double sums[] = {
    rajaperf::detail::calcChecksum(g_mean,M,1.0),
    rajaperf::detail::calcChecksum(g_symmat,M*M,1.0),
    rajaperf::detail::calcChecksum(g_data,N*M,1.0)
  };
  for (const long double value : sums) {
    printf("%8.12Lf\n",value);
  }
}

## Save code segments 
This next section will save code fragments to files which we will copy/paste into the skeletons via script in the next notebook  

We'll use the %%file Jupyter magic which Xeus-cling provides to do this.  

Currently, we cannot use both %%file and run the code, so we do the saving in a separate section

## TUTORIAL_KERNEL.hpp
So in the following source, we'll insert these code fragments found in the following cells
data_setup.txt
loop_bodies.txt
class_variables.txt
```
///
/// TUTORIAL_KERNEL kernel reference implementation:
///


#ifndef RAJAPerf_TUTORIAL_KERNEL_HPP
#define RAJAPerf_TUTORIAL_KERNEL_HPP

// insert data_setup.txt

// insert loop_bodies.txt

#include "common/KernelBase.hpp"

namespace rajaperf
{

class RunParams;

namespace tutorial
{

class TUTORIAL_KERNEL : public KernelBase
{
public:

  TUTORIAL_KERNEL(const RunParams& params);

  ~TUTORIAL_KERNEL();

  void setUp(VariantID vid, size_t tune_idx);
  void updateChecksum(VariantID vid, size_t tune_idx);
  void tearDown(VariantID vid, size_t tune_idx);

  void runSeqVariant(VariantID vid, size_t tune_idx);
  void runOpenMPVariant(VariantID vid, size_t tune_idx);
  
private:
  // insert class_variables.txt
};

} // end namespace tutorial
} // end namespace rajaperf

#endif // closing endif for header file include guard
```

## data setup macro
The kernel header will define a DATA_SETUP macro which introduces the loop body variables in terms of the class variables m_

In [None]:
%%file ../work/data_setup.txt
// Binds the kernel object's members (m_*) to the local names that the loop
// body macros and code fragments refer to: extents m/n (and their _PB_
// aliases), loop indices i/j, the divisor float_n, and the four buffers.
#define TUTORIAL_KERNEL_DATA_SETUP \
  const Index_type m = m_m; \
  const Index_type n = m_n; \
  Index_type _PB_M = m_m; \
  Index_type _PB_N = m_n; \
  Index_type i, j; \
  Real_type float_n = static_cast<Real_type>(m_n); \
  Real_ptr data = m_data; \
  Real_ptr ldata = m_ldata; \
  Real_ptr mean = m_mean; \
  Real_ptr symmat = m_symmat;


In [None]:
%%file ../work/loop_bodies.txt

// Loop bodies for the base (non-RAJA) variants. They expand against the
// names i, j, m, n, float_n, mean, ldata and symmat; ldata holds element
// (row j, column i) at j*m + i and symmat holds element (i, c) at i*m + c.

// Clear the running sum for column i.
#define POLYBENCH_VAR_BODY0 \
  mean[i] = 0.0;

// Accumulate element (j, i) into the sum for column i.
#define POLYBENCH_VAR_BODY1 \
  mean[i] += ldata[i + (j*m)];

// Turn the column sum into the column mean.
#define POLYBENCH_VAR_BODY2 \
  mean[i] /= float_n;

// Centre element (j, i) on its column mean.
#define POLYBENCH_VAR_BODY3 \
  ldata[i + (j*m)] -= mean[i];

// Fill row i of the upper triangle (columns i..m-1) and mirror each value
// into the lower triangle; the inner loops start at i, so they stay in the body.
#define POLYBENCH_VAR_BODY4 \
  for(int col = i; col < m; col++) { \
    symmat[col + (i*m)] = 0.0; \
    for(int row = 0; row < n; row++) { \
      symmat[col + (i*m)] += ldata[i + (row*m)] * ldata[col + (row*m)]; \
    } \
    symmat[col + (i*m)] /= (float_n - 1.0); \
    symmat[i + (col*m)] = symmat[col + (i*m)]; \
  }

// RAJA views over the same buffers: meanview is 1D with extent m, dataview is
// 2D with extents (n, m), symmatview is 2D with extents (m, m).
#define TUTORIAL_KERNEL_VIEWS_RAJA \
  using VIEW_1 = RAJA::View<Real_type, RAJA::Layout<1> >; \
  using VIEW_2 = RAJA::View<Real_type, RAJA::Layout<2> >; \
  VIEW_2 dataview(ldata,n,m); \
  VIEW_1 meanview(mean,m); \
  VIEW_2 symmatview(symmat,m,m);

// View-based counterparts of the five bodies above.
#define POLYBENCH_VAR_BODY0_RAJA \
  meanview(i) = 0.0;

#define POLYBENCH_VAR_BODY1_RAJA \
  meanview(i) += dataview(j,i);

#define POLYBENCH_VAR_BODY2_RAJA \
  meanview(i) /= float_n;

#define POLYBENCH_VAR_BODY3_RAJA \
  dataview(j,i) -= meanview(i);

#define POLYBENCH_VAR_BODY4_RAJA \
  for(Index_type col=i; col < m; col++) { \
    symmatview(i,col) = 0.0; \
    for(Index_type row=0; row<n; row++) { \
      symmatview(i,col) += dataview(row,i) * dataview(row,col); \
    } \
    symmatview(i,col) /= (float_n - 1.0); \
    symmatview(col,i) = symmatview(i,col); \
  }




In [None]:
%%file ../work/class_variables.txt
// Problem extents; both are assigned in the constructor (defaults.txt).
Index_type m_m;
Index_type m_n;
// Column means; set up with m_m entries (kernel_setup.txt).
Real_ptr m_mean;
// Input data; set up with m_m * m_n entries.
Real_ptr m_data;
// Result matrix; set up with m_m * m_m entries and used for the checksum.
Real_ptr m_symmat;
// Scratch buffer of m_n * m_m entries; the kernels copy m_data into it and work on the copy.
Real_ptr m_ldata;

## TUTORIAL_KERNEL.cpp
In the following source we'll insert the following code fragments
defaults.txt
kernel_setup.txt
checksum.txt
kernel_teardown.txt
```
#include "TUTORIAL_KERNEL.hpp"

#include "RAJA/RAJA.hpp"
#include "common/DataUtils.hpp"


namespace rajaperf
{
namespace tutorial
{


TUTORIAL_KERNEL::TUTORIAL_KERNEL(const RunParams& params)
  : KernelBase(rajaperf::Tutorial_KERNEL, params)
{
  // insert defaults.txt 
  checksum_scale_factor = 0.001 *
              ( static_cast<Checksum_type>(getDefaultProblemSize()) /
                                           getActualProblemSize() );

  setUsesFeature(Kernel);

  setVariantDefined( Base_Seq );
  setVariantDefined( RAJA_Seq );

  setVariantDefined( Base_OpenMP );
  setVariantDefined( RAJA_OpenMP );

}

TUTORIAL_KERNEL::~TUTORIAL_KERNEL()
{
}

void TUTORIAL_KERNEL::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
{
  (void) vid;
  // insert kernel_setup.txt
}

void TUTORIAL_KERNEL::updateChecksum(VariantID vid, size_t tune_idx)
{
  // insert checksum.txt
  
}

void TUTORIAL_KERNEL::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
{
  (void) vid;
  // insert kernel_teardown.txt
}

} // end namespace tutorial
} // end namespace rajaperf
```

In [None]:
%%file ../work/defaults.txt
  // Default problem: 1000 x 1000 elements, one repetition.
  const Index_type default_rows = 1000;
  const Index_type default_cols = 1000;

  setDefaultProblemSize(default_rows * default_cols);
  setDefaultReps(1);

  // Square problem: side = sqrt(target size) + 1, truncated to an integer on
  // assignment; both extents get the same value.
  m_n = m_m = std::sqrt( getTargetProblemSize() ) +1;

  // Every per-rep metric below is expressed in terms of the element count.
  const Index_type element_count = m_m * m_n;

  setActualProblemSize(element_count);
  setItsPerRep (element_count); // not quite correct but ok for tutorial
  setKernelsPerRep(1);
  setBytesPerRep(sizeof(Real_type) * element_count);
  setFLOPsPerRep(element_count);
  

In [None]:
%%file ../work/kernel_setup.txt
  // NOTE(review): the reset presumably makes the initial values independent of
  // earlier setUp calls, which would make the call order below significant —
  // confirm against common/DataUtils.
  rajaperf::detail::resetDataInitCount();
  // Input matrix: m_m * m_n entries.
  allocAndInitDataRandSign(m_data,m_m * m_n,vid);
  // Column means: m_m entries, initial value 0.0.
  allocAndInitDataConst(m_mean,m_m,0.0,vid);
  // Result matrix: m_m * m_m entries, initial value 0.0.
  allocAndInitDataConst(m_symmat,m_m*m_m,0.0,vid);
  // Scratch buffer of m_n * m_m entries; the kernel fragments copy m_data into it before each run.
  allocData(m_ldata,m_n*m_m,vid);

In [None]:
%%file ../work/checksum.txt
  checksum[vid][tune_idx] += rajaperf::detail::calcChecksum(m_symmat,m_m * m_m, checksum_scale_factor);
  

In [None]:
%%file ../work/kernel_teardown.txt
  // Release the four buffers allocated in setUp (kernel_setup.txt).
  deallocData(m_data, vid);
  deallocData(m_mean, vid);
  deallocData(m_symmat, vid);
  deallocData(m_ldata, vid);

## TUTORIAL_KERNEL-seq.cpp
In the sequential variants definition we'll insert the following code fragments
cov_seq.txt
cov_lambda_bodies.txt
raja_exec_policy.txt
raja_kernel_seq.txt (note: the cell further below saves this fragment as raja_kernel.txt)
```
#include "TUTORIAL_KERNEL.hpp"

#include "RAJA/RAJA.hpp"

#include <iostream>


namespace rajaperf
{
namespace tutorial
{


void TUTORIAL_KERNEL::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
{
  const Index_type run_reps= getRunReps();

  TUTORIAL_KERNEL_DATA_SETUP;

  switch ( vid ) {

    case Base_Seq : {

      startTimer();
      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
       // insert cov_seq.txt
      }
      stopTimer();

      break;
    }

#if defined(RUN_RAJA_SEQ)
    case RAJA_Seq : {

      TUTORIAL_KERNEL_VIEWS_RAJA;

      // insert cov_lambda_bodies.txt

      // insert raja_exec_policy.txt
      
      startTimer();
      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
        // insert raja_kernel_seq.txt
      }
      stopTimer();

      break;
    }
#endif // RUN_RAJA_SEQ

    default : {
      getCout() << "\n  TUTORIAL_KERNEL : Unknown variant id = " << vid << std::endl;
    }

  }

}

} // end namespace tutorial
} // end namespace rajaperf
```


In [None]:
%%file ../work/cov_seq.txt
  // Refresh the scratch copy from the untouched input on every repetition.
  // The byte count is in units of the element type (Real_type); sizeof(Real_ptr)
  // would only be right where a pointer and a Real_type have the same size.
  std::memcpy(ldata,data,sizeof(rajaperf::Real_type)*m_n*m_m);

  /* Determine mean of column vectors of input data matrix */
  for (i = 0; i < _PB_M; i++) {
      POLYBENCH_VAR_BODY0;
      for (j = 0; j < _PB_N; j++) {
        POLYBENCH_VAR_BODY1;
      }
      POLYBENCH_VAR_BODY2;
  }

  /* Center the column vectors. */
  for (j = 0; j < _PB_N; j++) {
    for (i = 0; i < _PB_M; i++) {
      POLYBENCH_VAR_BODY3;
    }
  }

  /* Calculate the m * m covariance matrix. */
  for (i = 0; i < _PB_M; i++) {
    POLYBENCH_VAR_BODY4;
  }

In [None]:
%%file ../work/cov_lambda_bodies.txt
    // One lambda per loop body, in the order they are passed to
    // RAJA::kernel_param. The parameter names i and j must stay as they are:
    // the POLYBENCH_VAR_BODY*_RAJA macros refer to them by name.

    // Clears the mean entry for index i.
    auto poly_var_base_lam0 = [=] (Index_type i) {
      POLYBENCH_VAR_BODY0_RAJA; 
    };
    
    // Adds data element (j, i) to the mean entry for index i.
    auto poly_var_base_lam1 = [=] (Index_type i, Index_type j) {
      POLYBENCH_VAR_BODY1_RAJA; 
    };
    
    // Divides the mean entry for index i by float_n.
    auto poly_var_base_lam2 = [=] (Index_type i) {
      POLYBENCH_VAR_BODY2_RAJA; 
    };
  
    // Subtracts the mean entry for index i from data element (j, i).
    auto poly_var_base_lam3 = [=] (Index_type i, Index_type j) {
      POLYBENCH_VAR_BODY3_RAJA; 
    };
    
    // Computes row i of the symmat upper triangle and mirrors it; the inner loops are inside the macro.
    auto poly_var_base_lam4 = [=] (Index_type i) {
      POLYBENCH_VAR_BODY4_RAJA;
    };

In [None]:
%%file ../work/raja_exec_policy.txt
   // Sequential kernel policy made of three loop nests listed one after another:
   //   1. loop over segment 0: lambda 0, then an inner loop over segment 1
   //      running lambda 1, then lambda 2
   //   2. loop over segment 1 with an inner loop over segment 0 running lambda 3
   //   3. loop over segment 0 running lambda 4
   // The kernel launch (raja_kernel.txt) passes RangeSegment{0,m} as segment 0
   // and RangeSegment{0,n} as segment 1.
   using EXECPOL = RAJA::KernelPolicy<
                        RAJA::statement::For<0, RAJA::loop_exec,    // over m
                          RAJA::statement::Lambda<0, RAJA::Segs<0> >,  // i
                          RAJA::statement::For<1, RAJA::loop_exec,  // over n
                            RAJA::statement::Lambda<1, RAJA::Segs<0,1> > // i,j
                          >,
                          RAJA::statement::Lambda<2, RAJA::Segs<0> > // i
                        >,
                        RAJA::statement::For<1, RAJA::loop_exec,  // over n
                          RAJA::statement::For<0, RAJA::loop_exec, // over m
                            RAJA::statement::Lambda<3, RAJA::Segs<0,1> > // i,j
                          >
                        >,
                        RAJA::statement::For<0, RAJA::loop_exec, // over m
                          RAJA::statement::Lambda<4, RAJA::Segs<0> >
                        >
                      >;  
 

In [None]:
%%file ../work/raja_kernel.txt
  // Refresh the scratch copy from the untouched input on every repetition.
  // The byte count is in units of the element type (Real_type); sizeof(Real_ptr)
  // would only be right where a pointer and a Real_type have the same size.
  std::memcpy(ldata,data,sizeof(rajaperf::Real_type)*m_n*m_m);

  // Segment 0 is the range over m (index i), segment 1 the range over n
  // (index j). The lambdas are passed in the order the policy numbers them.
  RAJA::kernel_param<EXECPOL>(
          RAJA::make_tuple(RAJA::RangeSegment{0,m}, RAJA::RangeSegment{0,n}),
          RAJA::tuple<double>{0.0}, // we're using kernel_param for the convenience of specifying iteration spaces (Segs), so we need this param tuple
          poly_var_base_lam0,
          poly_var_base_lam1,
          poly_var_base_lam2,
          poly_var_base_lam3,
          poly_var_base_lam4
        );

## TUTORIAL_KERNEL-omp.cpp
In the openmp variants definition we'll insert the following code fragments
cov_openmp.txt
cov_lambda_bodies.txt
raja_exec_policy_omp.txt
raja_kernel.txt
```
#include "TUTORIAL_KERNEL.hpp"

#include "RAJA/RAJA.hpp"

#include <iostream>

namespace rajaperf
{
namespace tutorial
{


void TUTORIAL_KERNEL::runOpenMPVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
{
#if defined(RAJA_ENABLE_OPENMP) && defined(RUN_OPENMP)

  const Index_type run_reps= getRunReps();

  TUTORIAL_KERNEL_DATA_SETUP;

  switch ( vid ) {

    case Base_OpenMP : {

      startTimer();
      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
        // insert cov_openmp.txt
      }
      stopTimer();

      break;
    }

    case RAJA_OpenMP : {

      TUTORIAL_KERNEL_VIEWS_RAJA;

      // insert cov_lambda_bodies.txt

      // insert raja_exec_policy_omp.txt
    
      startTimer();
      for (RepIndex_type irep = 0; irep < run_reps; ++irep) {
        // insert raja_kernel_omp.txt
      }
      stopTimer();

      break;
    }

    default : {
      getCout() << "\n  TUTORIAL_KERNEL : Unknown variant id = " << vid << std::endl;
    }

  }

#else
  RAJA_UNUSED_VAR(vid);
#endif
}

} // end namespace tutorial
} // end namespace rajaperf
```

In [None]:
%%file ../work/cov_openmp.txt
  // Refresh the scratch copy from the untouched input on every repetition.
  // The byte count is in units of the element type (Real_type); sizeof(Real_ptr)
  // would only be right where a pointer and a Real_type have the same size.
  std::memcpy(ldata,data,sizeof(rajaperf::Real_type)*n*m);

  /* Determine mean of column vectors of input data matrix */
  // i and j are declared by TUTORIAL_KERNEL_DATA_SETUP, outside the parallel
  // construct. Only the associated loop variable (i) becomes private
  // automatically, so j must be listed as private or every thread would share
  // one inner-loop counter.
  #pragma omp parallel for private(j)
  for (i = 0; i < _PB_M; i++) {
      POLYBENCH_VAR_BODY0;
      for (j = 0; j < _PB_N; j++) {
        POLYBENCH_VAR_BODY1;
      }
      POLYBENCH_VAR_BODY2;
  }

  /* Center the column vectors. */
  // collapse(2) associates both loops with the construct, so j and i are both
  // private here without an explicit clause.
  #pragma omp parallel for collapse(2)
  for (j = 0; j < _PB_N; j++) {
    for (i = 0; i < _PB_M; i++) {
      POLYBENCH_VAR_BODY3;
    }
  }

  /* Calculate the m * m covariance matrix. */
  // The inner loops of this body declare their own counters inside the macro.
  #pragma omp parallel for
  for (i = 0; i < _PB_M; i++) {
    POLYBENCH_VAR_BODY4;
  }
    

In [None]:
%%file ../work/raja_exec_policy_omp.txt
  // OpenMP kernel policy with the same three loop nests as the sequential
  // policy. The outermost loop of each nest uses RAJA::omp_parallel_for_exec;
  // the inner loops use RAJA::loop_exec.
  //   1. loop over segment 0: lambda 0, inner loop over segment 1 running
  //      lambda 1, then lambda 2
  //   2. loop over segment 1 with an inner loop over segment 0 running lambda 3
  //   3. loop over segment 0 running lambda 4
  using EXECPOL = RAJA::KernelPolicy<
                        RAJA::statement::For<0, RAJA::omp_parallel_for_exec,    // over m
                          RAJA::statement::Lambda<0, RAJA::Segs<0> >,  // i
                          RAJA::statement::For<1, RAJA::loop_exec,  // over n
                            RAJA::statement::Lambda<1, RAJA::Segs<0,1> > // i,j
                          >,
                          RAJA::statement::Lambda<2, RAJA::Segs<0> > // i
                        >,
                        RAJA::statement::For<1, RAJA::omp_parallel_for_exec,  // over n
                          RAJA::statement::For<0, RAJA::loop_exec, // over m
                            RAJA::statement::Lambda<3, RAJA::Segs<0,1> > // i,j
                          >
                        >,
                        RAJA::statement::For<0, RAJA::omp_parallel_for_exec, // over m
                          RAJA::statement::Lambda<4, RAJA::Segs<0> >
                        >
                      >;  
 