<a href="https://colab.research.google.com/github/siavashadpey/gpu_intro/blob/master/tiled_matrix_multiplication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-gqt061ub
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-gqt061ub
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4307 sha256=0e1ee533c034636eae7974a98f393f5f33c58dd2a384374d0f4f0eb68b666c67
  Stored in directory: /tmp/pip-ephem-wheel-cache-jdcyfbtx/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2
created output directory at /content/src
Out bin /content/result.out


In [26]:
%%cu
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <cstring>
#include <vector>
#include <fstream>
#include <iostream>
#include <time.h>

#include <cuda_runtime.h>
#include <cassert>

// Edge length of a square tile: used as the thread-block dimensions at launch
// and as the dimensions of the shared-memory tiles inside the kernel.
#define Nblock 16

// Tiled matrix multiplication kernel: computes C = A*B for square N x N
// row-major matrices stored in global memory.
//
// Launch layout: 2D thread blocks of exactly Nblock x Nblock threads and a 2D
// grid with at least ceil(N/Nblock) blocks along x (columns of C) and along
// y (rows of C). Each thread produces one element of C.
// Shared memory: two static Nblock x Nblock float tiles per block.
// N does not have to be a multiple of Nblock: tile entries that fall outside
// the matrices are zero-filled and threads outside C skip the final store.
__global__ void MatrixMultiplicationKernel(const float *A, const float *B, float *C, const int N) {

    // staging buffers holding the current tile of A and of B for this block
    __shared__ float Atile[Nblock][Nblock];
    __shared__ float Btile[Nblock][Nblock];

    // indices of element within the tile
    const int lcol = threadIdx.x;
    const int lrow = threadIdx.y;

    // global indices of element (i.e. within the matrix)
    const int grow = blockIdx.y*Nblock + threadIdx.y;
    const int gcol = blockIdx.x*Nblock + threadIdx.x;

    // running value of this thread's element of C
    float cij = 0.0f;

    // number of tiles along the inner dimension, rounded up so that a partial
    // last tile is still visited
    const int ntiles = (N + Nblock - 1)/Nblock;

    // loop through the different tiles contributing to Cij
    for (int itile = 0; itile < ntiles; ++itile) {

        // fetch this thread's element of the current A and B tiles into
        // shared memory; entries outside the matrices contribute zero
        const int acol = itile*Nblock + lcol;
        const int brow = itile*Nblock + lrow;
        Atile[lrow][lcol] = (grow < N && acol < N) ? A[(size_t)grow*N + acol] : 0.0f;
        Btile[lrow][lcol] = (brow < N && gcol < N) ? B[(size_t)brow*N + gcol] : 0.0f;
        // sync to make sure Atile and Btile are fully loaded
        __syncthreads();

        // add current tile's contribution to cij
        for (int k = 0; k < Nblock; ++k) {
            cij += Atile[lrow][k]*Btile[k][lcol];
        }
        // sync threads again before loading the next tile's A and B
        __syncthreads();
    }

    // assign calculated value to its location in C; no thread returns early
    // above, so every thread of the block reaches both barriers
    if (grow < N && gcol < N) {
        C[(size_t)grow*N + gcol] = cij;
    }
}

// Reference CPU implementation of C = A*B for square N x N row-major matrices.
// Provides the ground truth for checking the GPU result and a timing baseline.
void matmul(const float *A, const float *B, float *C, const int N) {
    for (int row = 0; row < N; ++row) {
        const float *a_row = A + row*N;
        for (int col = 0; col < N; ++col) {
            // accumulate directly into the output element
            float *out = C + row*N + col;
            *out = 0.0;
            for (int inner = 0; inner < N; ++inner) {
                *out += a_row[inner]*B[inner*N + col];
            }
        }
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "Error %s (%s)\n", cudaGetErrorString(status), what);
    return false;
}

// Multiplies two random N x N matrices on the CPU and on the GPU, compares
// the two results element by element and prints both timings in milliseconds.
// Returns 0 when every CUDA call succeeded and the results agree, 1 otherwise.
int run() {
    const int N = 512;
    const size_t count = (size_t)N*N;
    const size_t size = count*sizeof(float);

    // Host matrices live on the heap: four N x N float arrays (4 MiB for
    // N = 512) as local arrays would risk overflowing the stack.
    std::vector<float> A(count), B(count), C_cpu(count), C_gpu(count);

    // Produce A and B matrices
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            A[i*N + j] = (float) (rand())/(float)(RAND_MAX);
            B[i*N + j] = (float) (rand())/(float)(RAND_MAX);
        }
    }

    // time computation of C=A*B on CPU with a host clock (no GPU work is
    // involved here, so CUDA events are not needed)
    const clock_t cpu_begin = clock();
    matmul(&A[0], &B[0], &C_cpu[0], N);
    const clock_t cpu_end = clock();
    const float dt_cpu = 1000.0f*(float)(cpu_end - cpu_begin)/(float)CLOCKS_PER_SEC;

    // GPU resources start out null so the cleanup below is valid on any path
    float *dA = NULL;
    float *dB = NULL;
    float *dC = NULL;
    cudaEvent_t start_gpu = NULL;
    cudaEvent_t stop_gpu = NULL;
    float dt_gpu = -1;

    // allocate memory on GPU, copy input matrices over and start the timer;
    // the chain stops at the first failing call
    bool ok = cudaOk(cudaMalloc(&dA, size), "cudaMalloc(dA)")
           && cudaOk(cudaMalloc(&dB, size), "cudaMalloc(dB)")
           && cudaOk(cudaMalloc(&dC, size), "cudaMalloc(dC)")
           && cudaOk(cudaMemcpy(dA, &A[0], size, cudaMemcpyHostToDevice), "cudaMemcpy(A -> dA)")
           && cudaOk(cudaMemcpy(dB, &B[0], size, cudaMemcpyHostToDevice), "cudaMemcpy(B -> dB)")
           && cudaOk(cudaEventCreate(&start_gpu), "cudaEventCreate(start)")
           && cudaOk(cudaEventCreate(&stop_gpu), "cudaEventCreate(stop)")
           && cudaOk(cudaEventRecord(start_gpu), "cudaEventRecord(start)");

    if (ok) {
        // one Nblock x Nblock thread block per tile of C; the block count is
        // rounded up, which equals N/Nblock for the current N (a multiple of
        // Nblock)
        const int ntiles = (N + Nblock - 1)/Nblock;
        dim3 NBlocksPerGrid(ntiles, ntiles);
        dim3 NThreadsPerBlock(Nblock, Nblock);
        MatrixMultiplicationKernel<<<NBlocksPerGrid, NThreadsPerBlock>>>(dA, dB, dC, N);

        // check the launch right away, stop the timer before the copy back so
        // that only the kernel is measured, then fetch the GPU's output
        ok = cudaOk(cudaGetLastError(), "kernel launch")
          && cudaOk(cudaEventRecord(stop_gpu), "cudaEventRecord(stop)")
          && cudaOk(cudaEventSynchronize(stop_gpu), "cudaEventSynchronize(stop)")
          && cudaOk(cudaEventElapsedTime(&dt_gpu, start_gpu, stop_gpu), "cudaEventElapsedTime")
          && cudaOk(cudaMemcpy(&C_gpu[0], dC, size, cudaMemcpyDeviceToHost), "cudaMemcpy(dC -> C_gpu)");

        if (ok)
            fprintf(stderr, "No kernel error detected\n");
    }

    // verify GPU computation against the CPU reference
    int mismatches = 0;
    if (ok) {
        // Relative tolerance: a few float roundings per accumulated term are
        // expected to differ between host and device arithmetic.
        const float rtol = 1E-5f;
        // only the first few mismatches are printed to keep the output short
        const int max_reports = 10;
        for (int i = 0; i < N; ++i) {
            for (int j = 0; j < N; ++j) {
                const float cpu = C_cpu[i*N + j];
                const float gpu = C_gpu[i*N + j];
                const float diff = cpu - gpu;
                // magnitude-based test without a division, so both signs of
                // the error and a zero reference are handled; written in
                // negated form so that a NaN result counts as a mismatch
                if (!(std::fabs(diff) <= rtol*std::fabs(cpu))) {
                    if (mismatches < max_reports) {
                        printf("i= %d.\n", i);
                        printf("j= %d.\n", j);
                        printf("cpu: %f.\n", cpu);
                        printf("gpu: %f.\n", gpu);
                        printf("diff: %f.\n", diff);
                    }
                    ++mismatches;
                }
            }
        }
    }

    // release GPU resources (cudaFree accepts a null pointer)
    if (start_gpu != NULL)
        cudaEventDestroy(start_gpu);
    if (stop_gpu != NULL)
        cudaEventDestroy(stop_gpu);
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);

    if (!ok)
        return 1;

    // print info
    if (mismatches == 0)
        printf("All is good!\n");
    else
        printf("%d mismatching elements found.\n", mismatches);
    printf("CPU computation time: %f.\n", dt_cpu);
    printf("GPU computation time: %f.\n", dt_gpu);

    return mismatches == 0 ? 0 : 1;
}

// Program entry point: runs the CPU/GPU comparison and forwards its status as
// the process exit code. Command-line arguments are not used.
int main(int argc, char **argv) {
    return run();
}

No kernel error detected
All is good!
CPU computation time: 1367.365112.
GPU computation time: 2.310752.

