# Environment Setup

In [39]:
# Check Linux distro. and its version
!cat /etc/os-release

PRETTY_NAME="Ubuntu 22.04.4 LTS"
NAME="Ubuntu"
VERSION_ID="22.04"
VERSION="22.04.4 LTS (Jammy Jellyfish)"
VERSION_CODENAME=jammy
ID=ubuntu
ID_LIKE=debian
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
UBUNTU_CODENAME=jammy


In [40]:
# Check if CUDA (including drivers) is installed
!/usr/local/cuda/bin/nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Tue_Oct_29_23:50:19_PDT_2024
Cuda compilation tools, release 12.6, V12.6.85
Build cuda_12.6.r12.6/compiler.35059454_0


In [None]:
# If not, then install CUDA. Important notes:
# 1. Better follow the instruction on https://developer.nvidia.com/cuda-downloads
# 2. Mind your PC's settings: Operating System -> Architecture -> Distribution -> Version -> Installer Type: deb (local)
# 3. Sample (student uncomments below code snippet and run it on purpose)
#!wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin
#!mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600
#!wget https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda-repo-ubuntu2204-12-6-local_12.6.3-560.35.05-1_amd64.deb
#!dpkg -i cuda-repo-ubuntu2204-12-6-local_12.6.3-560.35.05-1_amd64.deb
#!cp /var/cuda-repo-ubuntu2204-12-6-local/cuda-*-keyring.gpg /usr/share/keyrings/
#!apt-get update
#!apt-get -y install cuda-toolkit-12-6

# Task 1: Getting Started with OpenCL

In [48]:
util = """
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <sys/time.h>

#define CL_TARGET_OPENCL_VERSION 120
#include <CL/opencl.h>

#define MAX_SOURCE_SIZE (0x100000)

#define MAXVALUE 100

//--------------------------------------------------------
void dump (int *x, int N, int M)
{
    for (int i = 0; i < N; i++)
        {
            for (int j = 0; j < M; j++)
                std::cout << x[i * M + j] << " ";
            std::cout << std::endl;
        }
    std::cout << "----------------------------------------" << std::endl;
}

void numberGen (int N, int max, int *store)
{
    #pragma omp parallel for
    for (int i = 0; i < N; i++)
        store[i] = rand () % max;
}

const char* getErrorString(cl_int error) {
    switch (error) {
        case CL_SUCCESS: return "Success!";
        case CL_DEVICE_NOT_FOUND: return "Device not found.";
        case CL_DEVICE_NOT_AVAILABLE: return "Device not available";
        case CL_COMPILER_NOT_AVAILABLE: return "Compiler not available";
        case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "Memory object allocation failure";
        case CL_OUT_OF_RESOURCES: return "Out of resources";
        case CL_OUT_OF_HOST_MEMORY: return "Out of host memory";
        case CL_PROFILING_INFO_NOT_AVAILABLE: return "Profiling information not available";
        case CL_MEM_COPY_OVERLAP: return "Memory copy overlap";
        case CL_IMAGE_FORMAT_MISMATCH: return "Image format mismatch";
        case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "Image format not supported";
        case CL_BUILD_PROGRAM_FAILURE: return "Program build failure";
        case CL_MAP_FAILURE: return "Map failure";
        case CL_MISALIGNED_SUB_BUFFER_OFFSET: return "Misaligned sub buffer offset";
        case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: return "Execution status error for events in wait list";
        case CL_COMPILE_PROGRAM_FAILURE: return "Compile program failure";
        case CL_LINKER_NOT_AVAILABLE: return "Linker not available";
        case CL_LINK_PROGRAM_FAILURE: return "Link program failure";
        case CL_DEVICE_PARTITION_FAILED: return "Device partition failed";
        case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: return "Kernel argument information not available";
        default: return "Unknown error";
    }
}

"""

In [118]:
main = """
int main(int argc, char *argv[]) {
    if (argc != 4) {
        std::cout << "Use : " << argv[0] << " N K M" << std::endl;
        exit(1);
    }
    srand(time(0));
    const int N = atoi(argv[1]);
    const int K = atoi(argv[2]);
    const int M = atoi(argv[3]);

    // Create the 3 vectors
    int *A = new int[N * K];
    int *B = new int[K * M];
    int *C = new int[N * M];

    numberGen(N * K, MAXVALUE, A);
    numberGen(K * M, MAXVALUE, B);
    
    struct timeval begin, end;

    
    // Load the kernel source code into the array source_str
    FILE *fp;
    char *source_str;
    size_t source_size;

    fp = fopen("matmul_opencl_kernel.cl", "r");
    if (!fp) {
        std::cerr << "Failed to load kernel" << std::endl;
        exit(1);
    }

    source_str = (char *)malloc(MAX_SOURCE_SIZE);
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);

    // Get platform and device information
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret;
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, 1, &device_id, &ret_num_devices);

    gettimeofday(&begin, 0);

    // Check available platforms
    cl_uint num_platforms;
    ret = clGetPlatformIDs(0, NULL, &num_platforms);
    if (ret != CL_SUCCESS) {
        std::cerr << "Failed to get number of platforms: " << getErrorString(ret) << std::endl;
        return -1;
    }
    
    std::cout << "Number of OpenCL platforms: " << num_platforms << std::endl;
    
    // Allocate memory for platform IDs
    cl_platform_id *platforms = (cl_platform_id *)malloc(num_platforms * sizeof(cl_platform_id));
    ret = clGetPlatformIDs(num_platforms, platforms, NULL);
    if (ret != CL_SUCCESS) {
        std::cerr << "Failed to get platform IDs: " << getErrorString(ret) << std::endl;
        free(platforms);
        return -1;
    }
    
    // Print platform names
    for (cl_uint i = 0; i < num_platforms; i++) {
        char platform_name[128];
        clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
        std::cout << "Platform " << i << ": " << platform_name << std::endl;
    }
    
    // Get device IDs
    cl_uint num_devices;
    ret = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
    if (ret != CL_SUCCESS) {
        std::cerr << "Failed to get number of devices: " << getErrorString(ret) << std::endl;
        free(platforms);
        return -1;
    }
    
    std::cout << "Number of devices: " << num_devices << std::endl;
    
    // Allocate memory for device IDs
    cl_device_id *devices = (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
    ret = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
    if (ret != CL_SUCCESS) {
        std::cerr << "Failed to get device IDs: " << getErrorString(ret) << std::endl;
        free(devices);
        free(platforms);
        return -1;
    }
    
    // Print device names
    for (cl_uint i = 0; i < num_devices; i++) {
        char device_name[128];
        clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(device_name), device_name, NULL);
        std::cout << "Device " << i << ": " << device_name << std::endl;
    }
    
    // Clean up
    free(devices);
    free(platforms);

    // Create an OpenCL context for device in device_id
    cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
    if (ret != CL_SUCCESS) {
        std::cerr << "Failed to create OpenCL context: " << getErrorString(ret) << std::endl;
        return -1;
    }

    // Create a command queue for device in device_id
    cl_command_queue queue = clCreateCommandQueue(context, device_id, 0, &ret);
    if (ret != CL_SUCCESS) {
        std::cerr << "Failed to create command queue: " << getErrorString(ret) << std::endl;
        return -1;
    }

    // Declare memory buffers
    cl_mem a_mem_obj;
    cl_mem b_mem_obj;
    cl_mem c_mem_obj;

    // Create memory buffers on the device for each vector
    a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, N * K * sizeof(int), NULL, &ret);
    if (ret != CL_SUCCESS) {
        std::cerr << "Failed to create buffer for A: " << getErrorString(ret) << std::endl;
        return -1;
    }

    b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, K * M * sizeof(int), NULL, &ret);
    if (ret != CL_SUCCESS) {
        std::cerr << "Failed to create buffer for B: " << getErrorString(ret) << std::endl;
        return -1;
    }

    c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, N * M * sizeof(int), NULL, &ret);
    if (ret != CL_SUCCESS) {
        std::cerr << "Failed to create buffer for C: " << getErrorString(ret) << std::endl;
        return -1;
    }

    // Copy the vectors to their respective memory buffers
    ret = clEnqueueWriteBuffer(queue, a_mem_obj, CL_TRUE, 0, N * K * sizeof(int), A, 0, NULL, NULL);
    if (ret != CL_SUCCESS) {
        std::cerr << "Failed to write to A buffer: " << getErrorString(ret) << std::endl;
        return -1;
    }

    ret = clEnqueueWriteBuffer(queue, b_mem_obj, CL_TRUE, 0, K * M * sizeof(int), B, 0, NULL, NULL);
    if (ret != CL_SUCCESS) {
        std::cerr << "Failed to write to B buffer: " << getErrorString(ret) << std::endl;
        return -1;
    }

    // Create a program from the kernel source
    cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
    if (ret != CL_SUCCESS) {
        std::cerr << "Failed to create program: " << getErrorString(ret) << std::endl;
        return -1;
    }

    // Build the program
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    if (ret != CL_SUCCESS) {
        size_t log_size;
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, MAX_SOURCE_SIZE, source_str, &log_size);
        source_str[log_size] = '0'; // Null-terminate the string correctly
        std::cerr << "Error in kernel compilation:" << source_str << std::endl; // Correctly formatted print statement
        return -1;
    }

    // Create the OpenCL kernel
    cl_kernel kernel = clCreateKernel(program, "matmul_kernel", &ret);
    if (ret != CL_SUCCESS) {
        std::cerr << "Failed to create kernel: " << getErrorString(ret) << std::endl;
        return -1;
    }

    // Set the arguments of the kernel
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
    ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
    ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
    ret |= clSetKernelArg(kernel, 3, sizeof(int), (void *)&N);
    ret |= clSetKernelArg(kernel, 4, sizeof(int), (void *)&K);
    ret |= clSetKernelArg(kernel, 5, sizeof(int), (void *)&M);

    // Set local and global workgroup sizes
    size_t global_work_size[2] = {static_cast<size_t>(N), static_cast<size_t>(M)};
    size_t local_work_size[2] = {1, 1}; // You can adjust this based on your device capabilities

    // Execute the OpenCL kernel
    ret = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL);
    if (ret != CL_SUCCESS) {
        std::cerr << "Failed to execute kernel: " << getErrorString(ret) << std::endl;
        return -1;
    }

    // Read the memory buffer C on the device to the local variable C
    ret = clEnqueueReadBuffer(queue, c_mem_obj, CL_TRUE, 0, N * M * sizeof(int), C, 0, NULL, NULL);
    if (ret != CL_SUCCESS) {
        std::cerr << "Failed to read C buffer: " << getErrorString(ret) << std::endl;
        return -1;
    }

    gettimeofday(&end, 0);

    // Print the result to the terminal
    std::cout << "Matrix A:" << std::endl;
    dump(A, N, K);
    
    std::cout << "Matrix B:" << std::endl;
    dump(B, K, M);
    
    std::cout << "Matrix C (Result):" << std::endl;
    dump(C, N, M);

    // Calculate execution time
    long seconds = end.tv_sec - begin.tv_sec;
    long microseconds = end.tv_usec - begin.tv_usec;
    double elapsed = seconds + microseconds * 1e-6;
    std::cout << "Time elapsed: " << elapsed << " seconds." << std::endl;
    

    // Clean up
    clFlush(queue);
    clFinish(queue);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseMemObject(a_mem_obj);
    clReleaseMemObject(b_mem_obj);
    clReleaseMemObject(c_mem_obj);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    free(source_str);
    delete[] A;
    delete[] B;
    delete[] C;

    return 0;
}

"""

In [119]:
kernel = """
__kernel void matmul_kernel(__global int *A, __global int *B, __global int *C, int N, int K, int M) {
    // Get Index
    int row = get_global_id(0);
    int col = get_global_id(1);

    // Calculate C[row][col]
    if (row < N && col < M) {
        int sum = 0;
        for (int k = 0; k < K; k++) {
            sum += A[row * K + k] * B[k * M + col];
        }
        C[row * M + col] = sum;
    }
}
"""

In [141]:
util = """
#include <CL/opencl.h>
#include <iostream>
#define CL_TARGET_OPENCL_VERSION 120
const char* getErrorString(cl_int error) {
    switch (error) {
        case CL_SUCCESS: return "Success!";
        case CL_DEVICE_NOT_FOUND: return "Device not found.";
        case CL_DEVICE_NOT_AVAILABLE: return "Device not available";
        case CL_COMPILER_NOT_AVAILABLE: return "Compiler not available";
        case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "Memory object allocation failure";
        case CL_OUT_OF_RESOURCES: return "Out of resources";
        case CL_OUT_OF_HOST_MEMORY: return "Out of host memory";
        case CL_PROFILING_INFO_NOT_AVAILABLE: return "Profiling information not available";
        case CL_MEM_COPY_OVERLAP: return "Memory copy overlap";
        case CL_IMAGE_FORMAT_MISMATCH: return "Image format mismatch";
        case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "Image format not supported";
        case CL_BUILD_PROGRAM_FAILURE: return "Program build failure";
        case CL_MAP_FAILURE: return "Map failure";
        case CL_MISALIGNED_SUB_BUFFER_OFFSET: return "Misaligned sub buffer offset";
        case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: return "Execution status error for events in wait list";
        case CL_COMPILE_PROGRAM_FAILURE: return "Compile program failure";
        case CL_LINKER_NOT_AVAILABLE: return "Linker not available";
        case CL_LINK_PROGRAM_FAILURE: return "Link program failure";
        case CL_DEVICE_PARTITION_FAILED: return "Device partition failed";
        case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: return "Kernel argument information not available";
        default: return "Unknown error";
    }
}
"""

In [142]:
main = """
#define CL_TARGET_OPENCL_VERSION 120

int main() {
    cl_uint num_platforms;
    cl_int ret = clGetPlatformIDs(0, NULL, &num_platforms);
    if (ret != CL_SUCCESS) {
        std::cerr << "Failed to get number of platforms: " << getErrorString(ret) << std::endl;
        return -1;
    }

    std::cout << "Number of OpenCL platforms: " << num_platforms << std::endl;

    return 0;
}

"""

In [143]:
cpp_file = open("matmul_opencl.cpp", "w")
cpp_file.write(util)
cpp_file.write(main)
cpp_file.close()

In [121]:
opencl_kernel_file = open("matmul_opencl_kernel.cl", "w")
opencl_kernel_file.write(kernel)
opencl_kernel_file.close()

In [144]:
!g++ -o matmul matmul_opencl.cpp -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lOpenCL

    4 | #define CL_TARGET_OPENCL_VERSION 120
      | 
In file included from [01m[K/usr/local/cuda/include/CL/cl.h:20[m[K,
                 from [01m[K/usr/local/cuda/include/CL/opencl.h:24[m[K,
                 from [01m[Kmatmul_opencl.cpp:2[m[K:
[01m[K/usr/local/cuda/include/CL/cl_version.h:23:[m[K [01;36m[Knote: [m[Kthis is the location of the previous definition
   23 | #define CL_TARGET_OPENCL_VERSION 300
      | 
[01m[K/usr/local/cuda/include/CL/cl_version.h:22:104:[m[K [01;36m[Knote: [m[K‘[01m[K#pragma message: cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 300 (OpenCL 3.0)[m[K’
   22 | _TARGET_OPENCL_VERSION is not defined. Defaulting to 300 (OpenCL 3.0)"[01;36m[K)[m[K
      |                                                                       [01;36m[K^[m[K



In [145]:
!./matmul

Failed to get number of platforms: Unknown error


In [123]:
!./matmul 2 2 2

Failed to get number of platforms: Unknown error


## Q1. Compare the result of parallel function to sequential function.

TO-DO

## Q2. Try with different sizes of input.

TO-DO

## Q3. Using `matplotlib` to compare the correlation between input size and running time of sequential and parallel methods.

TO-DO

Suggestion: 
+ Basic: Run ./matmul multiple times with multiple sizes of data to get raw output -> Put the results in Python's lists and use matplotlib to plot.
+ Elegant: Use Python to invoke ./matmul, parse different sizes of data as the arguments of ./matmul, collect program's results, process and plot it.

## Q4. Compare runtime and efficiency of OpenMP and OpenCL.

TO-DO

## Q5. Edit your OpenCL solution to use local variable.

In [None]:
kernel = """
    // TO-DO
"""

In [None]:
!g++ -o matmul matmul_opencl.cpp -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lOpenCL
!./matmul 100 100 100

## Q6. Compare the execution time of your global and local solutions, on different sizes of input.

TO-DO (Same as Q3)

# Task 2: Pixelate an Image

In [None]:
# Install libjpeg for JPEG manipulating in C/C++
!apt-get install libjpeg-dev
# Kindly check if libjpeg is installed
!file /usr/include/jpeglib.h

In [None]:
# Prepare input image
!wget "https://raw.githubusercontent.com/opencv/opencv/refs/heads/4.x/samples/data/lena.jpg" -O input.jpg

In [None]:
seq_func="""
#define BLOCK_SIZE 8
// Sequential implementation of pixelization
void pixelize_seq(ImageData& input, ImageData& output) {
    for (int y = 0; y < input.height; y += BLOCK_SIZE) {
        for (int x = 0; x < input.width; x += BLOCK_SIZE) {
            // Calculate average color for block
            int r = 0, g = 0, b = 0;
            int count = 0;
            
            // Sum all pixels in block
            for (int by = 0; by < BLOCK_SIZE && y + by < input.height; by++) {
                for (int bx = 0; bx < BLOCK_SIZE && x + bx < input.width; bx++) {
                    int idx = ((y + by) * input.width + (x + bx)) * input.channels;
                    r += input.data[idx];
                    g += input.data[idx + 1];
                    b += input.data[idx + 2];
                    count++;
                }
            }
            
            // Calculate average
            r /= count;
            g /= count;
            b /= count;
            
            // Apply average color to block
            for (int by = 0; by < BLOCK_SIZE && y + by < input.height; by++) {
                for (int bx = 0; bx < BLOCK_SIZE && x + bx < input.width; bx++) {
                    int idx = ((y + by) * input.width + (x + bx)) * input.channels;
                    output.data[idx] = r;
                    output.data[idx + 1] = g;
                    output.data[idx + 2] = b;
                }
            }
        }
    }
}
"""

## Q1. Complete the program invoking pixelize_seq()

In [None]:
TO-DO

In [None]:
!g++ -o pixelate pixelate_opencl.cpp -I/usr/local/cuda/include -L/usr/local/cuda/lib64  -lOpenCL -ljpeg
!./pixelate input.jpg

|Input|Output|
|---|---|
|![input.jpg](./input.jpg "input.jpg")|![output_seq.jpg](./output_seq.jpg "output_seq.jpg")|

## Q2. Implement pixelize_kernel() in `pixelize_kernel.cl`

In [None]:
TO-DO

In [None]:
!g++ -o pixelate pixelate_opencl.cpp -I/usr/local/cuda/include -L/usr/local/cuda/lib64  -lOpenCL -ljpeg
!./pixelate input.jpg

|Output Seq.|Output OCL|
|---|---|
|![output_seq.jpg](./output_seq.jpg "output_seq.jpg")|![output_ocl.jpg](./output_ocl.jpg "output_ocl.jpg")|

## Q3. Using `matplotlib` to compare the correlation between input size and running time of sequential and parallel methods.

In [None]:
TO-DO

## Q4. If the kernel can be optimized, implement it and use `matplotlib` to plot the comparison.