In [1]:
!pip install git+https://github.com/afnan47/cuda.git

Collecting git+https://github.com/afnan47/cuda.git
  Cloning https://github.com/afnan47/cuda.git to /tmp/pip-req-build-558m61xw
  Running command git clone --filter=blob:none --quiet https://github.com/afnan47/cuda.git /tmp/pip-req-build-558m61xw
  Resolved https://github.com/afnan47/cuda.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4289 sha256=26324170f8d8be86fc543c08eef3c566793cdeb1f0ec99db87c0cab2f1f09226
  Stored in directory: /tmp/pip-ephem-wheel-cache-u0t50008/wheels/aa/f3/44/e10c1d226ec561d971fcd4b0463f6bff08602afa928a3e7bc7
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [2]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [3]:
%%cu

#include <stdio.h>
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <bits/stdc++.h>

using namespace std;

/*
 * Element-wise vector addition on the device: C[id] = A[id] + B[id].
 *
 * Each thread handles the single element whose index is its flat 1D global
 * thread id (blockDim.x * blockIdx.x + threadIdx.x).
 *
 * A, B : input arrays, at least N ints each
 * C    : output array, at least N ints
 * N    : number of elements to add
 *
 * The launch configuration rounds the thread count up to a whole number of
 * blocks, so there may be more threads than elements; threads whose id is
 * not below N must not touch memory.
 */
__global__ void add_parallel(int* A, int* B, int* C, int N) {

    int id = blockDim.x * blockIdx.x + threadIdx.x;

    // Guard the grid tail: without this, surplus threads index past the arrays.
    if (id < N)
        C[id] = A[id] + B[id];
}

/*
 * Host-side reference for the vector addition.
 * Stores A[k] + B[k] into C[k] for every k in [0, N), one element at a time.
 */
void add_serial(int* A, int* B, int* C, int N) {

    int idx = 0;

    while (idx < N) {
        const int total = A[idx] + B[idx];
        C[idx] = total;
        ++idx;
    }
}

/*
 * Fills the first N slots of `array` with pseudo-random values in [1, 100].
 * Values come from rand(), drawn once per element in ascending index order.
 */
void initialize(int* array, int N) {

    int pos = 0;

    while (pos < N) {
        const int draw = rand() % 100;   // 0..99
        array[pos] = draw + 1;           // shift to 1..100
        ++pos;
    }
}

/*
 * Writes the first N elements of `array` to standard output on one line.
 * Every element is followed by two spaces; the line is ended (and flushed)
 * with std::endl.
 */
void print(int* array, int N) {

    int idx = 0;

    while (idx < N) {
        std::cout << array[idx] << "  ";
        ++idx;
    }

    std::cout << std::endl;
}


/* Stops the program with a readable message if a CUDA runtime call failed. */
static void check_cuda(cudaError_t status, const char* what) {

    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA failure (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Driver for the vector-addition demo.
 *
 * Fills two host arrays of N random ints, copies them to the device, runs
 * add_parallel over them, copies the sum back and prints A, B and C (one
 * line each). Returns 0 on success; exits with a failure status if a host
 * allocation or any CUDA call fails.
 */
int main(void) {

    int N = 20;

    int size_in_bytes = N * sizeof(int);

    int* A = (int*) malloc (size_in_bytes);
    int* B = (int*) malloc (size_in_bytes);
    int* C = (int*) malloc (size_in_bytes);

    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }

    initialize(A, N);
    initialize(B, N);

    // Device-side counterparts of A, B and C.
    int *X, *Y, *Z;

    check_cuda(cudaMalloc(&X, size_in_bytes), "cudaMalloc X");
    check_cuda(cudaMalloc(&Y, size_in_bytes), "cudaMalloc Y");
    check_cuda(cudaMalloc(&Z, size_in_bytes), "cudaMalloc Z");

    check_cuda(cudaMemcpy(X, A, size_in_bytes, cudaMemcpyHostToDevice), "copy A to device");
    check_cuda(cudaMemcpy(Y, B, size_in_bytes, cudaMemcpyHostToDevice), "copy B to device");

    // Ceil-div so that blocksPerGrid * threadsPerBlock >= N.
    int threadsPerBlock = 32;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    add_parallel<<<blocksPerGrid, threadsPerBlock>>> (X, Y, Z, N);

    // A launch does not return a status; bad configurations surface here.
    check_cuda(cudaGetLastError(), "add_parallel launch");

    // Blocking copy: also waits for the kernel and reports execution errors.
    check_cuda(cudaMemcpy(C, Z, size_in_bytes, cudaMemcpyDeviceToHost), "copy C to host");

    print(A, N);
    print(B, N);
    print(C, N);

    // The host buffers came from malloc, so they must be released with free
    // (releasing them with delete[] is undefined behaviour).
    free(A);
    free(B);
    free(C);

    check_cuda(cudaFree(X), "cudaFree X");
    check_cuda(cudaFree(Y), "cudaFree Y");
    check_cuda(cudaFree(Z), "cudaFree Z");

    return 0;
}


84  87  78  16  94  36  87  93  50  22  63  28  91  60  64  27  41  27  73  37  
12  69  68  30  83  31  63  24  68  36  30  3  23  59  70  68  94  57  12  43  
96  156  146  46  177  67  150  117  118  58  93  31  114  119  134  95  135  84  85  80  



In [None]:
%%cu

#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <bits/stdc++.h>

#define BLOCK_SIZE 32

using namespace std;

/*
 * Square matrix product on the device: device_C = device_A * device_B.
 *
 * All three matrices are N x N ints addressed as element (r, c) at r * N + c.
 * The kernel expects a 2D launch: the y dimension of the global thread id
 * selects the row and the x dimension the column, and each thread computes
 * exactly one element of device_C. Threads that fall outside the N x N
 * range do nothing, so the grid may be larger than the matrix.
 */
__global__ void matrix_multiplication(int* device_A, int* device_B, int* device_C, int N) {

    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row >= N || col >= N)
        return;

    // Accumulate in a thread-local variable and store once at the end,
    // instead of doing a read-modify-write on device_C for every k.
    int sum = 0;

    for (int k = 0; k < N; k++)
        sum += device_A[row * N + k] * device_B[k * N + col];

    device_C[row * N + col] = sum;
}

/*
 * Prints an N x N matrix (element (r, c) stored at N * r + c) to standard
 * output: one text line per row, two spaces after every element, and one
 * extra empty line after the last row.
 */
void print_matrix(int* matrix, int N) {

    for (int row = 0; row < N; ++row) {
        const int* cells = matrix + N * row;   // start of the current row

        for (int col = 0; col < N; ++col) {
            std::cout << cells[col] << "  ";
        }

        std::cout << std::endl;
    }

    std::cout << std::endl;
}

/*
 * Fills an N x N matrix with pseudo-random values in [1, 100], drawing from
 * rand() row by row, then writes a single empty line to standard output.
 */
void populate_matrix(int* matrix, int N) {

    int row = 0;

    while (row < N) {
        int col = 0;

        while (col < N) {
            const int draw = rand() % 100;      // 0..99
            matrix[N * row + col] = draw + 1;   // shift to 1..100
            ++col;
        }

        ++row;
    }

    std::cout << std::endl;
}

/* Stops the program with a readable message if a CUDA runtime call failed. */
static void check_cuda(cudaError_t status, const char* what) {

    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA failure (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Driver for the matrix-multiplication demo.
 *
 * Builds two random N x N host matrices, prints them, multiplies them on
 * the device with matrix_multiplication and prints the product. Returns 0
 * on success; exits with a failure status if a host allocation or any CUDA
 * call fails.
 */
int main(void) {

    int N = 32;
    int size_in_bytes = N * N * sizeof(int);

    int* host_A = (int*) malloc(size_in_bytes);
    int* host_B = (int*) malloc(size_in_bytes);
    int* host_C = (int*) malloc(size_in_bytes);

    if (host_A == NULL || host_B == NULL || host_C == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }

    populate_matrix(host_A, N);
    populate_matrix(host_B, N);

    cout << "Matrix A " << endl;
    print_matrix(host_A, N);

    cout << "Matrix B " << endl;
    print_matrix(host_B, N);

    int *device_A, *device_B, *device_C;

    check_cuda(cudaMalloc(&device_A, size_in_bytes), "cudaMalloc device_A");
    check_cuda(cudaMalloc(&device_B, size_in_bytes), "cudaMalloc device_B");
    check_cuda(cudaMalloc(&device_C, size_in_bytes), "cudaMalloc device_C");

    check_cuda(cudaMemcpy(device_A, host_A, size_in_bytes, cudaMemcpyHostToDevice), "copy A to device");
    check_cuda(cudaMemcpy(device_B, host_B, size_in_bytes, cudaMemcpyHostToDevice), "copy B to device");

    // Ceil-div: the smallest grid for which grid_size * block_size >= N.
    // (N * BLOCK_SIZE - 1) / BLOCK_SIZE would yield N - 1 blocks per
    // dimension, launching far more threads than there are elements.
    int block_size = BLOCK_SIZE;
    int grid_size = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;

    dim3 blocks(block_size, block_size);
    dim3 grids (grid_size, grid_size);

    matrix_multiplication<<<grids, blocks>>>(device_A, device_B, device_C, N);

    // A launch does not return a status; bad configurations surface here.
    check_cuda(cudaGetLastError(), "matrix_multiplication launch");

    // Blocking copy: also waits for the kernel and reports execution errors.
    check_cuda(cudaMemcpy(host_C, device_C, size_in_bytes, cudaMemcpyDeviceToHost), "copy C to host");

    cout << "Matrix C " << endl;
    print_matrix(host_C, N);

    free(host_A);
    free(host_B);
    free(host_C);

    check_cuda(cudaFree(device_A), "cudaFree device_A");
    check_cuda(cudaFree(device_B), "cudaFree device_B");
    check_cuda(cudaFree(device_C), "cudaFree device_C");

    return 0;
}




Matrix A 
84  87  78  16  94  36  87  93  50  22  63  28  91  60  64  27  41  27  73  37  12  69  68  30  83  31  63  24  68  36  30  3  
23  59  70  68  94  57  12  43  30  74  22  20  85  38  99  25  16  71  14  27  92  81  57  74  63  71  97  82  6  26  85  28  
37  6  47  30  14  58  25  96  83  46  15  68  35  65  44  51  88  9  77  79  89  85  4  52  55  100  33  61  77  69  40  13  
27  87  95  40  96  71  35  79  68  2  98  3  18  93  53  57  2  81  87  42  66  90  45  20  41  30  32  18  98  72  82  76  
10  28  68  57  98  54  87  66  7  84  20  25  29  72  33  30  4  20  71  69  9  16  41  50  97  24  19  46  47  52  22  56  
80  89  65  29  42  51  94  1  35  65  25  15  88  57  44  92  28  66  60  37  33  52  38  29  76  8  75  22  59  96  30  38  
36  94  19  29  44  12  29  30  77  5  44  64  14  39  7  41  5  19  29  89  70  18  18  97  25  44  71  84  91  100  73  26  
45  91  6  40  55  87  70  83  43  65  98  8  56  5  49  12  23  29  100  44  47  69  41  23  12  1

In [8]:
%%cu

#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <ctime>
#include <bits/stdc++.h>

#define BLOCK_SIZE 32

using namespace std;

/*
 * Square matrix product on the device: device_C = device_A * device_B.
 *
 * All three matrices are N x N ints addressed as element (r, c) at r * N + c.
 * The kernel expects a 2D launch: the y dimension of the global thread id
 * selects the row and the x dimension the column, and each thread computes
 * exactly one element of device_C. Threads that fall outside the N x N
 * range do nothing, so the grid may be larger than the matrix.
 */
__global__ void parallel_matrix_multiplication(int* device_A, int* device_B, int* device_C, int N) {

    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row >= N || col >= N)
        return;

    // Keep the running dot product in a thread-local variable and write the
    // result once, rather than updating device_C in global memory per step.
    int dot = 0;

    for (int k = 0; k < N; k++)
        dot += device_A[row * N + k] * device_B[k * N + col];

    device_C[row * N + col] = dot;
}

/*
 * Host-side reference product: host_result_matrix = host_A * host_B.
 *
 * All matrices are N x N ints with element (r, c) stored at r * N + c.
 * Each output element is zeroed and then built up in place, one product
 * term at a time.
 */
void serial_matrix_multiplication(int* host_A, int* host_B, int* host_result_matrix, int N) {

    for (int row = 0; row < N; ++row) {
        for (int col = 0; col < N; ++col) {

            // Work directly on the destination element.
            int& cell = host_result_matrix[N * row + col];
            cell = 0;

            int k = 0;
            while (k < N) {
                cell += host_A[row * N + k] * host_B[k * N + col];
                ++k;
            }
        }
    }
}

/*
 * Writes an N x N matrix (element (r, c) at N * r + c) to standard output.
 * Each row goes on its own line with two spaces after every element; an
 * additional empty line follows the final row.
 */
void print_matrix(int* matrix, int N) {

    int row = 0;

    while (row < N) {
        int col = 0;

        while (col < N) {
            std::cout << matrix[N * row + col] << "  ";
            ++col;
        }

        std::cout << std::endl;
        ++row;
    }

    std::cout << std::endl;
}

/*
 * Assigns every element of an N x N matrix a pseudo-random value in
 * [1, 100] taken from rand(), visiting elements row by row, and afterwards
 * emits one empty line on standard output.
 */
void populate_matrix(int* matrix, int N) {

    for (int row = 0; row < N; ++row) {
        int* cells = matrix + N * row;   // start of the current row

        for (int col = 0; col < N; ++col) {
            const int draw = rand() % 100;   // 0..99
            cells[col] = 1 + draw;           // shift to 1..100
        }
    }

    std::cout << std::endl;
}

/* Stops the program with a readable message if a CUDA runtime call failed. */
static void check_cuda(cudaError_t status, const char* what) {

    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA failure (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Driver comparing serial and parallel matrix multiplication.
 *
 * Builds two random N x N host matrices and prints them, multiplies them on
 * the host (timed with clock()) and on the device (timed with CUDA events
 * around the kernel launch only), and prints both timings and both
 * products. Returns 0 on success; exits with a failure status if a host
 * allocation or any CUDA call fails.
 */
int main(void) {

    int N = 16;
    int size_in_bytes = N * N * sizeof(int);

    int* host_A = (int*) malloc(size_in_bytes);
    int* host_B = (int*) malloc(size_in_bytes);
    int* host_C = (int*) malloc(size_in_bytes);
    int* host_result_matrix = (int*) malloc (size_in_bytes);

    if (host_A == NULL || host_B == NULL || host_C == NULL || host_result_matrix == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }

    populate_matrix(host_A, N);
    populate_matrix(host_B, N);

    cout << "Matrix A " << endl;
    print_matrix(host_A, N);

    cout << "Matrix B " << endl;
    print_matrix(host_B, N);

    // Host reference run, timed in CPU clock ticks.
    clock_t serial_start_time = clock();
    serial_matrix_multiplication(host_A, host_B, host_result_matrix, N);
    clock_t serial_end_time = clock();
    double serial_time = double(serial_end_time - serial_start_time) / CLOCKS_PER_SEC;
    cout << "Serial Matrix Multiplication Time: " << serial_time << " seconds" << endl;

    cout << "Serial Matrix Multiplication : " << endl;
    print_matrix(host_result_matrix, N);

    int *device_A, *device_B, *device_C;

    check_cuda(cudaMalloc((void**) &device_A, size_in_bytes), "cudaMalloc device_A");
    check_cuda(cudaMalloc((void**) &device_B, size_in_bytes), "cudaMalloc device_B");
    check_cuda(cudaMalloc((void**) &device_C, size_in_bytes), "cudaMalloc device_C");

    check_cuda(cudaMemcpy(device_A, host_A, size_in_bytes, cudaMemcpyHostToDevice), "copy A to device");
    check_cuda(cudaMemcpy(device_B, host_B, size_in_bytes, cudaMemcpyHostToDevice), "copy B to device");

    // Ceil-div: the smallest grid for which grid_size * block_size >= N.
    // (N * BLOCK_SIZE - 1) / BLOCK_SIZE would yield N - 1 blocks per
    // dimension, launching far more threads than there are elements.
    int block_size = BLOCK_SIZE;
    int grid_size = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;

    dim3 blocks(block_size, block_size);
    dim3 grids (grid_size, grid_size);

    cudaEvent_t start, stop;
    check_cuda(cudaEventCreate(&start), "cudaEventCreate start");
    check_cuda(cudaEventCreate(&stop), "cudaEventCreate stop");

    check_cuda(cudaEventRecord(start), "cudaEventRecord start");
    parallel_matrix_multiplication<<<grids, blocks>>>(device_A, device_B, device_C, N);
    // A launch does not return a status; bad configurations surface here.
    check_cuda(cudaGetLastError(), "parallel_matrix_multiplication launch");
    check_cuda(cudaEventRecord(stop), "cudaEventRecord stop");

    // Wait until the stop event (and therefore the kernel) has completed.
    check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize stop");

    float parallel_time;   // milliseconds, as reported by cudaEventElapsedTime
    check_cuda(cudaEventElapsedTime(&parallel_time, start, stop), "cudaEventElapsedTime");
    cout << "Parallel Matrix Multiplication Time: " << parallel_time / 1000 << " seconds" << endl;

    check_cuda(cudaMemcpy(host_C, device_C, size_in_bytes, cudaMemcpyDeviceToHost), "copy C to host");

    cout << "Matrix C " << endl;
    print_matrix(host_C, N);

    // Release everything that was acquired, including the timing events and
    // the serial result buffer.
    check_cuda(cudaEventDestroy(start), "cudaEventDestroy start");
    check_cuda(cudaEventDestroy(stop), "cudaEventDestroy stop");

    free(host_A);
    free(host_B);
    free(host_C);
    free(host_result_matrix);

    check_cuda(cudaFree(device_A), "cudaFree device_A");
    check_cuda(cudaFree(device_B), "cudaFree device_B");
    check_cuda(cudaFree(device_C), "cudaFree device_C");

    return 0;
}




Matrix A 
84  87  78  16  94  36  87  93  50  22  63  28  91  60  64  27  
41  27  73  37  12  69  68  30  83  31  63  24  68  36  30  3  
23  59  70  68  94  57  12  43  30  74  22  20  85  38  99  25  
16  71  14  27  92  81  57  74  63  71  97  82  6  26  85  28  
37  6  47  30  14  58  25  96  83  46  15  68  35  65  44  51  
88  9  77  79  89  85  4  52  55  100  33  61  77  69  40  13  
27  87  95  40  96  71  35  79  68  2  98  3  18  93  53  57  
2  81  87  42  66  90  45  20  41  30  32  18  98  72  82  76  
10  28  68  57  98  54  87  66  7  84  20  25  29  72  33  30  
4  20  71  69  9  16  41  50  97  24  19  46  47  52  22  56  
80  89  65  29  42  51  94  1  35  65  25  15  88  57  44  92  
28  66  60  37  33  52  38  29  76  8  75  22  59  96  30  38  
36  94  19  29  44  12  29  30  77  5  44  64  14  39  7  41  
5  19  29  89  70  18  18  97  25  44  71  84  91  100  73  26  
45  91  6  40  55  87  70  83  43  65  98  8  56  5  49  12  
23  29  100  44  47  69  41  2