In [1]:
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpyg3r04va".


In [2]:
import subprocess

# Get OpenCV compiler flags
opencv_flags = subprocess.check_output(["pkg-config", "--cflags", "--libs", "opencv4"]).decode().strip()

# Define additional compiler arguments
custom_flags = "-gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_75,code=compute_75"

# Combine both
COMPILER_ARGS = f"{custom_flags} {opencv_flags}"

print(COMPILER_ARGS)


-gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_75,code=compute_75 -I/usr/include/opencv4 -lopencv_stitching -lopencv_alphamat -lopencv_aruco -lopencv_barcode -lopencv_bgsegm -lopencv_bioinspired -lopencv_ccalib -lopencv_dnn_objdetect -lopencv_dnn_superres -lopencv_dpm -lopencv_face -lopencv_freetype -lopencv_fuzzy -lopencv_hdf -lopencv_hfs -lopencv_img_hash -lopencv_intensity_transform -lopencv_line_descriptor -lopencv_mcc -lopencv_quality -lopencv_rapid -lopencv_reg -lopencv_rgbd -lopencv_saliency -lopencv_shape -lopencv_stereo -lopencv_structured_light -lopencv_phase_unwrapping -lopencv_superres -lopencv_optflow -lopencv_surface_matching -lopencv_tracking -lopencv_highgui -lopencv_datasets -lopencv_text -lopencv_plot -lopencv_ml -lopencv_videostab -lopencv_videoio -lopencv_viz -lopencv_wechat_qrcode -lopencv_ximgproc -lopencv_video -lopencv_xobjdetect -lopencv_objdetect -lopencv_calib3d -lopencv_imgcodecs -lopencv_features2d -lopencv_dnn -lopencv_flann -lopencv_xphoto -lop

In [None]:
# Not needed here
# from google.colab import files
# uploaded = files.upload()

In [4]:
%%cuda --compiler-args "$COMPILER_ARGS"
#include <stdio.h>
#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>

#define CHANNELS 3  // RGB Image

__global__ void rgb_to_grayscale(unsigned char *rgb, unsigned char *gray, int width, int height) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height) {
        int idx = (y * width + x) * CHANNELS; // RGB has 3 channels
        int gray_idx = y * width + x;

        // Convert to grayscale
        gray[gray_idx] = 0.299f * rgb[idx] + 0.587f * rgb[idx + 1] + 0.114f * rgb[idx + 2];
    }
}

void launch_grayscale_conversion(unsigned char *d_rgb, unsigned char *d_gray, int width, int height) {
    dim3 blockDim(16, 16);
    dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y);

    // Measure GPU execution time
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaDeviceSynchronize();
    cudaEventRecord(start);
    rgb_to_grayscale<<<gridDim, blockDim>>>(d_rgb, d_gray, width, height);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("CUDA Error: %s\n", cudaGetErrorString(err));
    }
    cudaDeviceSynchronize();
    cudaEventRecord(stop);

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("GPU Execution Time: %f ms\n", milliseconds);
}

int main() {
    // Load image using OpenCV
    cv::Mat image = cv::imread("90mb.png");
    if (image.empty()) {
        printf("Error: Image not found!\n");
        return -1;
    }

    int width = image.cols;
    int height = image.rows;

    // Convert to contiguous array
    cv::Mat image_rgb;
    cv::cvtColor(image, image_rgb, cv::COLOR_BGR2RGB);

    // Allocate CPU memory for grayscale images
    cv::Mat gray_cv(height, width, CV_8UC1);
    cv::Mat gray_cpu(height, width, CV_8UC1);
    cv::Mat gray_gpu(height, width, CV_8UC1);

    // --------------------------------------
    // CPU Grayscale Conversion (OpenCV)
    // --------------------------------------
    auto start_cv = std::chrono::high_resolution_clock::now();
    cv::cvtColor(image, gray_cv, cv::COLOR_BGR2GRAY);
    auto end_cv = std::chrono::high_resolution_clock::now();
    double cv_time = std::chrono::duration<double, std::milli>(end_cv - start_cv).count();
    printf("OpenCV Execution Time: %f ms\n", cv_time);

    // --------------------------------------
    // CPU Grayscale Conversion
    // --------------------------------------
    auto start_cpu = std::chrono::high_resolution_clock::now();
    uchar *data = image_rgb.data;
    for(int i=0; i < height; i++) {
        for(int j=0; j < width; j++) {
            int idx = (i * width + j) * CHANNELS;
            uchar r = data[idx];
            uchar g = data[idx + 1];
            uchar b = data[idx + 2];
            gray_cpu.at<uchar>(i, j) = (0.299f * r + 0.587f * g + 0.114f * b);
        }
    }
    auto end_cpu = std::chrono::high_resolution_clock::now();
    double cpu_time = std::chrono::duration<double, std::milli>(end_cpu - start_cpu).count();
    printf("CPU Execution Time: %f ms\n", cpu_time);

    // --------------------------------------
    // GPU Grayscale Conversion (CUDA)
    // --------------------------------------
    unsigned char *d_rgb, *d_gray;
    cudaError_t err;

    err = cudaMalloc((void**)&d_rgb, width * height * CHANNELS);
    if (err != cudaSuccess) {
        printf("CUDA malloc failed for d_rgb: %s\n", cudaGetErrorString(err));
        return -1;
    }

    err = cudaMalloc((void**)&d_gray, width * height);
    if (err != cudaSuccess) {
        printf("CUDA malloc failed for d_gray: %s\n", cudaGetErrorString(err));
        return -1;
    }

    // Copy image to device
    cudaMemcpy(d_rgb, image_rgb.data, width * height * CHANNELS, cudaMemcpyHostToDevice);

    // Launch CUDA Kernel
    launch_grayscale_conversion(d_rgb, d_gray, width, height);

    // Copy back result
    cudaMemcpy(gray_gpu.data, d_gray, width * height, cudaMemcpyDeviceToHost);

    // Free GPU memory
    cudaFree(d_rgb);
    cudaFree(d_gray);

    // Display results
    // cv::imshow("cv_grayscale.png", gray_cv);
    // cv::imshow("cpu_grayscale.png", gray_cpu);
    // cv::imshow("gpu_grayscale.png", gray_gpu);
    // cv::waitKey(0);

    // Store results
    cv::imwrite("cv_grayscale.png", gray_cv);
    cv::imwrite("cpu_grayscale.png", gray_cpu);
    cv::imwrite("gpu_grayscale.png", gray_gpu);

    return 0;
}

OpenCV Execution Time: 1.302096 ms
CPU Execution Time: 15.204304 ms
GPU Execution Time: 0.000000 ms



In [5]:
%%cuda --compiler-args "$COMPILER_ARGS"
#include <stdio.h>
#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>
#include <chrono>

#define CHANNELS 3
#define FILTER_WIDTH 9
#define RADIUS (FILTER_WIDTH / 2)
#define FILTER_SCALE 369.0f

__constant__ float filter[FILTER_WIDTH * FILTER_WIDTH] = {
     1,  2,  3,  4,  5,  4,  3,  2,  1,
     2,  3,  4,  5,  6,  5,  4,  3,  2,
     3,  4,  5,  6,  7,  6,  5,  4,  3,
     4,  5,  6,  7,  8,  7,  6,  5,  4,
     5,  6,  7,  8,  9,  8,  7,  6,  5,
     4,  5,  6,  7,  8,  7,  6,  5,  4,
     3,  4,  5,  6,  7,  6,  5,  4,  3,
     2,  3,  4,  5,  6,  5,  4,  3,  2,
     1,  2,  3,  4,  5,  4,  3,  2,  1
};

__global__ void image_blur(unsigned char* input, unsigned char* output, int width, int height) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height) return;

    float r = 0.0f, g = 0.0f, b = 0.0f;

    for (int fy = -RADIUS; fy <= RADIUS; fy++) {
        for (int fx = -RADIUS; fx <= RADIUS; fx++) {
            int ix = min(max(x + fx, 0), width - 1);
            int iy = min(max(y + fy, 0), height - 1);
            int image_idx = (iy * width + ix) * CHANNELS;
            int kernel_idx = (fy + RADIUS) * FILTER_WIDTH + (fx + RADIUS);

            float weight = filter[kernel_idx] / FILTER_SCALE;

            r += input[image_idx]     * weight;
            g += input[image_idx + 1] * weight;
            b += input[image_idx + 2] * weight;
        }
    }

    int out_idx = (y * width + x) * CHANNELS;
    output[out_idx]     = static_cast<unsigned char>(min(max(int(r), 0), 255));
    output[out_idx + 1] = static_cast<unsigned char>(min(max(int(g), 0), 255));
    output[out_idx + 2] = static_cast<unsigned char>(min(max(int(b), 0), 255));
}

void launch_blur(unsigned char* d_input, unsigned char* d_output, int width, int height) {
    dim3 blockDim(16, 16);
    dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y);

    // Timing GPU blur
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaDeviceSynchronize();
    cudaEventRecord(start);

    image_blur<<<gridDim, blockDim>>>(d_input, d_output, width, height);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("CUDA Error: %s\n", cudaGetErrorString(err));
    }

    cudaDeviceSynchronize();
    cudaEventRecord(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("GPU Execution Time: %f ms\n", milliseconds);
}

int main() {
    // Load image using OpenCV
    cv::Mat image = cv::imread("90mb.png");
    if (image.empty()) {
        printf("Error: Image not found!\n");
        return -1;
    }

    int width = image.cols;
    int height = image.rows;

    // Convert BGR to RGB
    cv::Mat image_rgb;
    cv::cvtColor(image, image_rgb, cv::COLOR_BGR2RGB);

    // Output images
    cv::Mat blur_cv, blur_cpu(height, width, CV_8UC3), blur_gpu(height, width, CV_8UC3);


    // --------------------------------------
    // CPU Gaussian Blur (OpenCV)
    // --------------------------------------
    auto start_cv = std::chrono::high_resolution_clock::now();
    cv::GaussianBlur(image_rgb, blur_cv, cv::Size(FILTER_WIDTH, FILTER_WIDTH), 0);
    auto end_cv = std::chrono::high_resolution_clock::now();
    double cv_time = std::chrono::duration<double, std::milli>(end_cv - start_cv).count();
    printf("OpenCV Execution Time: %f ms\n", cv_time);


    // --------------------------------------
    // CPU Gaussian Blur
    // --------------------------------------
    unsigned char* input_data = image_rgb.data;
    unsigned char* output_data = blur_cpu.data;
    auto start_cpu = std::chrono::high_resolution_clock::now();
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            float r = 0.0f, g = 0.0f, b = 0.0f;

            // Apply filter kernel
            for (int fy = -RADIUS; fy <= RADIUS; fy++) {
                for (int fx = -RADIUS; fx <= RADIUS; fx++) {
                    // Handle border pixels by clamping
                    int ix = min(max(x + fx, 0), width - 1);
                    int iy = min(max(y + fy, 0), height - 1);

                    int image_idx = (iy * width + ix) * CHANNELS;
                    int kernel_idx = (fy + RADIUS) * FILTER_WIDTH + (fx + RADIUS);

                    float weight = filter[kernel_idx] / FILTER_SCALE;

                    r += input_data[image_idx]     * weight;
                    g += input_data[image_idx + 1] * weight;
                    b += input_data[image_idx + 2] * weight;
                }
            }

            // Write result to output
            int out_idx = (y * width + x) * CHANNELS;
            output_data[out_idx]     = static_cast<unsigned char>(min(max(int(r), 0), 255));
            output_data[out_idx + 1] = static_cast<unsigned char>(min(max(int(g), 0), 255));
            output_data[out_idx + 2] = static_cast<unsigned char>(min(max(int(b), 0), 255));
        }
    }
    auto end_cpu = std::chrono::high_resolution_clock::now();
    double cpu_time = std::chrono::duration<double, std::milli>(end_cpu - start_cpu).count();
    printf("CPU Execution Time: %f ms\n", cpu_time);


    // --------------------------------------
    // GPU Gaussian Blur
    // --------------------------------------

    // GPU memory allocation
    unsigned char *d_input, *d_output;
    cudaMalloc((void**)&d_input, width * height * CHANNELS);
    cudaMalloc((void**)&d_output, width * height * CHANNELS);

    // Copy to device
    cudaMemcpy(d_input, image_rgb.data, width * height * CHANNELS, cudaMemcpyHostToDevice);

    // Run GPU blur
    launch_blur(d_input, d_output, width, height);

    // Copy back result
    cudaMemcpy(blur_gpu.data, d_output, width * height * CHANNELS, cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_input);
    cudaFree(d_output);

    // Save results
    cv::cvtColor(blur_cv, blur_cv, cv::COLOR_RGB2BGR);
    cv::cvtColor(blur_cpu, blur_cpu, cv::COLOR_RGB2BGR);
    cv::cvtColor(blur_gpu, blur_gpu, cv::COLOR_RGB2BGR);
    cv::imwrite("cv_blur.png", blur_cv);
    cv::imwrite("cpu_blur.png", blur_cpu);
    cv::imwrite("gpu_blur.png", blur_gpu);

    return 0;
}

OpenCV Execution Time: 6.366156 ms
CPU Execution Time: 1816.760842 ms
GPU Execution Time: 0.000000 ms

