# Assignment 2
**Samuel Sheehy (Student ID 18143565)**

**Techniques of High Performance Computing (PHAS0102)**

**MSc Scientific Computing, University College London**, **2019-2020**

## Setup

In [2]:
from scipy.sparse import find, random, csr_matrix
from scipy.sparse.linalg import LinearOperator
import numpy as np
import pyopencl as cl

import time

class Timer:
    """
    Context manager that measures how long its ``with`` body takes.

    After the block exits, ``interval`` holds the elapsed time in seconds.
    ``start`` and ``end`` are raw clock readings that are only meaningful
    as a difference.
    """

    def __enter__(self):
        # perf_counter is monotonic and high-resolution, whereas time.time()
        # can jump if the system clock is adjusted during the measurement.
        self.start = time.perf_counter()
        return self

    def __exit__(self, *args):
        self.end = time.perf_counter()
        self.interval = self.end - self.start

## Question 1: OpenCL CSR matrix-vector product
Key Points:
* Derive from LinearOperator (only need to provide ```matvec```)
* Initialise the operator with the CSR elements (data, indices and indptr arrays)
* Only transfer these elements to the device in its initialisation
* Make use of the SIMD features provided by Intel's AVX2 technology (i.e. use the CPU, use the SIMD data types, and make sure the automatic parallelisation is being used)

#### Approach:
The matvec is a series of additions and multiplications that must be done multiple times for different data elements. To optimise performance, we parallelise each row/vector-element operation, and within each of these we use SIMD features to maximise execution speed.


In [3]:
def generate_sparse_matrix(rows, columns, density=0.3):
    """
    Generate a random sparse matrix of shape (rows x columns) in CSR format.

    Parameters
    ----------
    rows, columns : int
        Dimensions of the matrix.
    density : float, optional
        Fraction of entries that are non-zero.

    Returns
    -------
    tuple
        ``((indptr, indices, data), dense)`` where the first element holds
        the three CSR arrays (index pointer, column indices, values) and
        ``dense`` is a dense ndarray copy of the same matrix.
    """
    sparse_matrix = random(rows, columns, density=density, format='csr')
    csr_parts = (sparse_matrix.indptr, sparse_matrix.indices,
                 sparse_matrix.data)
    # toarray() is the supported spelling of the `.A` shorthand, which
    # newer SciPy releases no longer provide on sparse matrices.
    return csr_parts, sparse_matrix.toarray()

# Problem size used by the benchmark further down: a square matrix
rows = 10000
columns = 10000
# Density 0.5: half of the entries are non-zero. The CSR arrays feed the
# OpenCL operator; the dense copy is kept as the reference for the check.
(sparse_matrix), A_copy = generate_sparse_matrix(rows, columns, 0.5)

In [6]:
def bytesize(N, dtype_str):
    """
    Return the number of bytes occupied by an array of N elements.

    Parameters
    ----------
    N : int
        Number of elements in the array.
    dtype_str : str
        Name of the element type as understood by ``numpy.dtype``,
        e.g. ``'double'``, ``'float64'`` or ``'uint64'``.

    Returns
    -------
    int
        ``N`` times the size in bytes of one element.

    Raises
    ------
    TypeError
        If ``dtype_str`` is not a data type NumPy recognises.
    """
    # Ask NumPy for the element size instead of hard-coding it: the
    # previous table reported 4 bytes for the 8-byte float64 and uint64.
    return N * np.dtype(dtype_str).itemsize


class OpenCL_LOp(LinearOperator):
    """
    Linear operator whose matrix-vector product is evaluated by an OpenCL
    kernel built from the CSR arrays of a sparse matrix.

    Created in reference to the PyLops Linear Operator tutorial.
    """
    def __init__(self, rows, columns, sparse_matrix, data_type=np.float64):
        """
        Set up the operator and build its OpenCL matvec.

        Parameters
        ----------
        rows, columns : int
            Dimensions of the matrix the operator represents.
        sparse_matrix : tuple
            The CSR arrays in the order ``(indptr, indices, data)``.
        data_type : data-type, optional
            Element type reported by the operator.
        """
        # Let LinearOperator validate and store the shape (as a tuple) and
        # the dtype (as a numpy dtype); a list-valued shape compares unequal
        # to the tuple shapes of other operators and arrays.
        super().__init__(np.dtype(data_type), (rows, columns))
        # Flag from the PyLops convention: the operator is not backed by an
        # explicit matrix that could be inverted directly.
        self.explicit = False
        # Store the sparse matrix
        self.indptr, self.indices, self.data = sparse_matrix
        # Create the OpenCL tool
        self.openCL_matvec = initialise_OCL_kernel(self.indptr,
            self.indices, self.data, self.shape)

    def _matvec(self, x):
        """
        Apply the initialised operator to the given vector.
        """
        return self.openCL_matvec(x)


def initialise_slow_kernel(indptr, indicies, data, shape):
    """
    Generate a simple (pure NumPy) operator to perform the matrix-vector
    product of the given CSR sparse matrix on a vector x.

    Parameters
    ----------
    indptr : array of int
        CSR row pointers; row i owns entries ``indptr[i]:indptr[i+1]``.
    indicies : array of int
        Column index of every stored entry.
    data : array of float
        Value of every stored entry.
    shape : pair of int
        ``(rows, columns)`` of the matrix; only the row count is used.

    Returns
    -------
    callable
        ``func(x)`` returning the product as a new array of length
        ``shape[0]``.
    """
    def func(x):
        """Return the product of the captured CSR matrix with x."""
        y = np.zeros(shape[0])
        for i in range(len(indptr) - 1):
            start, end = indptr[i], indptr[i + 1]
            # Dot the row's stored values with the matching entries of x.
            # An empty row yields an empty dot product, i.e. 0.
            y[i] = data[start:end].dot(x[indicies[start:end]])
        return y
    return func


def initialise_OCL_kernel(indptr, indicies, data, shape):
    """
    Generate an OpenCL-implemented operator to perform the matrix-vector
    product of the given CSR sparse matrix on a vector x.

    The three CSR arrays are copied to the device once, here. Each call of
    the returned function only transfers the vector to the device and the
    result back to the host.

    Parameters
    ----------
    indptr : array of int
        CSR row pointers; row i owns entries ``indptr[i]:indptr[i+1]``.
    indicies : array of int
        Column index of every stored entry.
    data : array of float
        Value of every stored entry.
    shape : pair of int
        ``(rows, columns)`` of the matrix.

    Returns
    -------
    callable
        ``exec_kernel(x)`` taking a vector with ``columns`` elements and
        returning a new float64 array with ``rows`` elements. It raises
        ``ValueError`` when x has the wrong number of elements.
    """
    # Unpack shape parameter
    mat_rows, columns = shape
    # OpenCL C-Language Kernel
    # This implements the row-wise dot product: one work-item per row,
    # processing the row's entries eight at a time with double8 vectors
    # and finishing the remainder with a scalar loop.
    kernel = """
        __kernel void matvec(
            __global uint *indptr,
            __global uint *indicies,
            __global double *data,
            __global double *myvector,
            __global double *result
        )
        {
        // Get Global Identifiers i.e. the row index
        int row_index = get_global_id(0);

        // Do the dot product
        int start = indptr[row_index];
        int end = indptr[row_index + 1];
        int k;
        double sum = 0;
        double8 data_8;
        double8 vector_8;
        double8 product_8;
        int extra = (end-start) % 8;

        for(k=start; k < end-extra; k=k+8){

            data_8 = (double8)(data[k + 0],
                               data[k + 1],
                               data[k + 2],
                               data[k + 3],
                               data[k + 4],
                               data[k + 5],
                               data[k + 6],
                               data[k + 7]);

            vector_8 = (double8)(myvector[indicies[k + 0]],
                                 myvector[indicies[k + 1]],
                                 myvector[indicies[k + 2]],
                                 myvector[indicies[k + 3]],
                                 myvector[indicies[k + 4]],
                                 myvector[indicies[k + 5]],
                                 myvector[indicies[k + 6]],
                                 myvector[indicies[k + 7]]);

            product_8 = data_8 * vector_8;

            sum += product_8.s1 + product_8.s2
                   + product_8.s3 + product_8.s4
                   + product_8.s5 + product_8.s6
                   + product_8.s7 + product_8.s0;
        }

        // Move the counter up to the last accessed
        // element and calculate the products individually for the
        // terms that are too few for a double8
        for(int k = end-extra; k < end; k=k+1){
            sum += data[k]*myvector[indicies[k]];
        }

        // Output answer
        result[row_index] = sum;

        }
    """
    # The kernel reads 32-bit unsigned indices and 64-bit floats, so make
    # the host arrays match that layout exactly before uploading them
    # (SciPy may hand over int32 or int64 index arrays).
    indptr = np.ascontiguousarray(indptr, dtype=np.uint32)
    indicies = np.ascontiguousarray(indicies, dtype=np.uint32)
    data = np.ascontiguousarray(data, dtype=np.float64)
    double_size = np.dtype(np.float64).itemsize

    # Initialise the OpenCL environment
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    # Create the buffers to hold the sparse matrix data
    indptr_buffer = cl.Buffer(ctx, mf.COPY_HOST_PTR | mf.READ_ONLY, hostbuf=indptr)
    indicies_buffer = cl.Buffer(ctx, mf.COPY_HOST_PTR | mf.READ_ONLY, hostbuf=indicies)
    data_buffer = cl.Buffer(ctx, mf.COPY_HOST_PTR | mf.READ_ONLY, hostbuf=data)

    # Create the buffer to hold the vector data (only read by the kernel)
    vector_buffer = cl.Buffer(ctx, mf.READ_ONLY, size=columns * double_size)

    # Create buffer to hold the result (only written by the kernel)
    result_buffer = cl.Buffer(ctx, mf.WRITE_ONLY, size=mat_rows * double_size)

    # Build the Kernel
    prg = cl.Program(ctx, kernel)
    prg.build()
    matvec_kernel = prg.matvec

    # Define what to be executed when matvec is called
    def exec_kernel(x):
        """Upload x, run the kernel over all rows and return the result."""
        host_vector = np.ascontiguousarray(x, dtype=np.float64).ravel()
        if host_vector.size != columns:
            raise ValueError(
                "expected a vector with %d elements, got %d"
                % (columns, host_vector.size))
        # Copy the vector explicitly on every call. Writing through a
        # buffer mapping is only valid while the mapping is held, and a
        # mapping created once up front is released after its first use.
        cl.enqueue_copy(queue, vector_buffer, host_vector)
        # Run the kernel
        matvec_kernel(
            # Workspace parameters: one work-item per row; passing None as
            # the local size lets the runtime choose the work-group size
            # instead of forcing groups of a single work-item.
            queue, (mat_rows, ), None,
            # Buffers
            indptr_buffer, indicies_buffer, data_buffer,
            vector_buffer, result_buffer
        )
        # Read back into a fresh host array (blocking copy), so the caller
        # owns the result and later calls cannot overwrite it.
        result = np.empty(mat_rows, dtype=np.float64)
        cl.enqueue_copy(queue, result, result_buffer)
        return result
    # Pass the kernel executor to the class
    return exec_kernel


# Build the OpenCL operator from the CSR arrays and a vector of ones to
# apply it to
mylop = OpenCL_LOp(rows, columns, sparse_matrix)
x = np.ones(columns)

# Reference: time the dense NumPy product
with Timer() as t:
    control = A_copy.dot(x)
print(t.interval)

# Time the OpenCL operator on the same vector
with Timer() as t:
    myres = mylop.matvec(x)
print(t.interval)

# print(np.linalg.norm(control - myres))
# Check that both products agree within np.allclose's default tolerances
np.allclose(control, myres)

0.07194209098815918
0.05453634262084961


True

## Question 2: Solving a Poisson Problem with OpenCL

To consider:
* Parallelisation is over all interior points

Steps:
1. Implement a LOP
2. Integrate LOP into CG solver
3. Analyse with M: (a) convergence of residual, (b) number of iterations
4. Plot the solution

### Implementation of the LOP

In [3]:
class Poisson_CLOP(LinearOperator):
    """
    Linear operator for the Poisson problem whose matrix-vector product is
    evaluated by an OpenCL kernel.

    Created in reference to the PyLops Linear Operator tutorial.
    """
    def __init__(self, rows, columns, A_matrix, data_type=np.float64):
        """
        Create a Linear Operator for solving the 5-point stencil
        implementation of the Poisson Problem (using OpenCL to
        perform the calculations).

        Parameters
        ----------
        rows, columns : int
            Dimensions of the operator.
        A_matrix : array
            The system matrix handed to the OpenCL kernel builder.
        data_type : data-type, optional
            Element type reported by the operator.
        """
        # Let LinearOperator validate and store the shape (as a tuple) and
        # the dtype (as a numpy dtype); a list-valued shape compares unequal
        # to the tuple shapes of other operators and arrays.
        super().__init__(np.dtype(data_type), (rows, columns))
        # Flag from the PyLops convention: the operator is not backed by an
        # explicit matrix that could be inverted directly.
        self.explicit = False
        # Create the OpenCL tool
        self.poisson_matvec = initialise_Poisson_kernel(A_matrix)

    def _matvec(self, u):
        """
        Apply the linear operator to the given vector u.
        """
        return self.poisson_matvec(u)
    
def initialise_Poisson_kernel(A_matrix):
    """
    Setup an OpenCL kernel that will apply the matrix A_matrix
    to a vector of compatible size.

    The matrix is uploaded to the device once, here, as a dense row-major
    array of doubles. Each call of the returned function only transfers
    the vector to the device and the result back to the host.

    Parameters
    ----------
    A_matrix : 2-D array
        Dense matrix of shape ``(rows, columns)``.

    Returns
    -------
    callable
        ``exec_kernel(x)`` taking a vector with ``columns`` elements and
        returning a new float64 array with ``rows`` elements. It raises
        ``ValueError`` when x has the wrong number of elements.
    """
    # The kernel indexes the matrix as contiguous row-major doubles, so
    # force that layout on the host copy before uploading it.
    host_matrix = np.ascontiguousarray(A_matrix, dtype=np.float64)
    # Unpack shape parameter
    mat_rows, columns = host_matrix.shape
    double_size = np.dtype(np.float64).itemsize
    # OpenCL C-Language Kernel
    # This implements the row-wise dot product: one work-item per row.
    # (Assigning `result = vector` inside a kernel only rebinds a local
    # pointer and never writes to the output buffer.)
    kernel = """
        __kernel void matvec(
            __global const double *matrix,
            __global const double *vector,
            __global double *result,
            const uint columns
        )
        {
        // One work-item per matrix row
        size_t row = get_global_id(0);
        __global const double *row_ptr = matrix + row * columns;

        double sum = 0;
        for(uint k = 0; k < columns; k = k + 1){
            sum += row_ptr[k] * vector[k];
        }

        result[row] = sum;
        }
    """
    # Initialise the OpenCL environment
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    # Create the buffer to hold the matrix data
    matrix_buffer = cl.Buffer(ctx, mf.COPY_HOST_PTR | mf.READ_ONLY, hostbuf=host_matrix)

    # Create the buffer to hold the vector data (only read by the kernel)
    vector_buffer = cl.Buffer(ctx, mf.READ_ONLY, size=columns * double_size)

    # Create buffer to hold the result (only written by the kernel)
    result_buffer = cl.Buffer(ctx, mf.WRITE_ONLY, size=mat_rows * double_size)

    # Build the Kernel
    prg = cl.Program(ctx, kernel)
    prg.build()
    matvec_kernel = prg.matvec

    # Define what to be executed when matvec is called
    def exec_kernel(x):
        """Upload x, run the kernel over all rows and return the result."""
        host_vector = np.ascontiguousarray(x, dtype=np.float64).ravel()
        if host_vector.size != columns:
            raise ValueError(
                "expected a vector with %d elements, got %d"
                % (columns, host_vector.size))
        # Copy the vector explicitly on every call. Writing through a
        # buffer mapping is only valid while the mapping is held, and a
        # mapping created once up front is released after its first use.
        cl.enqueue_copy(queue, vector_buffer, host_vector)
        # Run the kernel
        matvec_kernel(
            # Workspace parameters: one work-item per row; passing None as
            # the local size lets the runtime choose the work-group size.
            queue, (mat_rows, ), None,
            # Buffers
            matrix_buffer,
            vector_buffer,
            result_buffer,
            # Row length, passed as a 32-bit scalar to match `uint`
            np.uint32(columns)
        )
        # Read back into a fresh host array (blocking copy), so the caller
        # owns the result and later calls cannot overwrite it.
        result = np.empty(mat_rows, dtype=np.float64)
        cl.enqueue_copy(queue, result, result_buffer)
        return result
    # Pass the kernel executor to the class
    return exec_kernel