<a href="https://colab.research.google.com/github/siavashadpey/gpu_intro/blob/master/tiled_matrix_multiplication_nicer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-mxqxzny6
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-mxqxzny6
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4307 sha256=184d7ee9e0d7b91ffc93f66709c0ba4d2c897be2107aacf2060cbc9cd5584ee7
  Stored in directory: /tmp/pip-ephem-wheel-cache-srcdya9d/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2
created output directory at /content/src
Out bin /content/result.out


In [28]:
%%cu
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <cstring>
#include <vector>
#include <fstream>
#include <iostream>
#include <time.h>

#include <cuda_runtime.h>
#include <cassert>

#define Nblock 16

typedef struct {
    int width;
    int height;
    int stride;
    float* elements;
} Matrix;

__device__ float GetElement(const Matrix A, int row, int col)
{
 return A.elements[row * A.stride + col];
}
// Set a matrix element
__device__ void SetElement(Matrix A, int row, int col, float value)
{
    A.elements[row * A.stride + col] = value;
}

__device__ Matrix GetSubMatrix(Matrix A, int row, int col) 
{
    Matrix Asub;
    Asub.width    = Nblock;
    Asub.height   = Nblock;
    Asub.stride   = A.stride;
    Asub.elements = &A.elements[A.stride * Nblock * row + Nblock * col];
    return Asub;
}

// Matrix multiplication kernel
__global__ void MatrixMultiplicationKernel(const Matrix A, const Matrix B, const Matrix C) {
    // Block row and column
    int blockRow = blockIdx.y;
    int blockCol = blockIdx.x;

    //Matrix Csub = GetSubMatrix(C, blockRow, blockCol);

    float cij = 0;
    int N = A.width;

    int row = threadIdx.y;
    int col = threadIdx.x;

    // loop through different tiles impacting Cij
    for (int itile = 0; itile < N/Nblock; ++itile) {
        
        Matrix Asub = GetSubMatrix(A, blockRow, itile);
        Matrix Bsub = GetSubMatrix(B, itile, blockCol);
        
        __shared__ float As[Nblock][Nblock];
        __shared__ float Bs[Nblock][Nblock];

        As[row][col] = GetElement(Asub, row, col);
        Bs[row][col] = GetElement(Bsub, row, col);
        __syncthreads();

        // compute current tile's contribution to cij
        for (int k = 0; k < Nblock; ++k)
            cij += As[row][k]*Bs[k][col];
        // sync threads again before loading new tile's A and B
        __syncthreads();
    }

    //SetElement(Csub, row, col, cij);
    C.elements[(blockRow*Nblock + row)*N + blockCol*Nblock + col] = cij;
}

void matmul(const float *A, const float *B, float *C, const int N) {
    for (int i=0; i < N; ++i) {
        for (int j=0; j < N; ++j) {
            C[i*N + j] = 0.0;
            for (int k=0; k < N; ++k) {
                C[i*N + j] += A[i*N + k]*B[k*N + j];
            }
        }
    }
}

int run() {
    // Produce A and B matrices
    const int N = 512;
    float A[N][N];
    float B[N][N];
    for (int i = 0; i < N; ++i) { 
        for (int j = 0; j < N; ++j) {
            A[i][j] = (float) (rand())/(float)(RAND_MAX);
            B[i][j] = (float) (rand())/(float)(RAND_MAX);
        }
    }

    //for (int j = 0; j < N; ++j) {
    //    printf("%f, ", A[0][j]);
    //    printf("%f, \n", B[j][0]);
    //}

    // time computation of C=A*B on CPU
    cudaEvent_t start_cpu, stop_cpu;
    float dt_cpu = -1;
    cudaEventCreate(&start_cpu);
    cudaEventCreate(&stop_cpu);
    cudaEventRecord(start_cpu);

    float C_cpu[N][N];
    matmul((float*)A, (float*)B, (float*)C_cpu, N);
    
    cudaEventRecord(stop_cpu);
    cudaEventSynchronize(stop_cpu);
    cudaEventElapsedTime(&dt_cpu, start_cpu, stop_cpu);

    // allocate memory on GPU
    Matrix dA; dA.width = N; dA.height = N; dA.stride = N;
    Matrix dB; dB.width = N; dB.height = N; dB.stride = N;
    Matrix dC; dC.width = N; dC.height = N; dC.stride = N;
    const size_t size = N*N*sizeof(float);
    cudaMalloc(&dA.elements, size);
    cudaMalloc(&dB.elements, size);
    cudaMalloc(&dC.elements, size);

    // copy matrices to GPU
    cudaMemcpy(dA.elements, A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(dB.elements, B, size, cudaMemcpyHostToDevice);

    // queue kernel
    cudaEvent_t start_gpu, stop_gpu;
    float dt_gpu = -1;
    cudaEventCreate(&start_gpu);
    cudaEventCreate(&stop_gpu);
    cudaEventRecord(start_gpu);

    dim3 NBlocksPerGrid(N/Nblock,N/Nblock);
    dim3 NThreadsPerBlock(Nblock, Nblock);
    MatrixMultiplicationKernel<<<NBlocksPerGrid, NThreadsPerBlock>>>(dA, dB, dC);

    // copy GPU's output to CPU
    float C_gpu[N][N];
    cudaMemcpy(C_gpu, dC.elements, size, cudaMemcpyDeviceToHost);
    
    cudaEventRecord(stop_gpu);
    cudaEventSynchronize(stop_gpu);
    cudaEventElapsedTime(&dt_gpu, start_gpu, stop_gpu);

    // Check for errors on kernel call
    cudaError err = cudaGetLastError();
    if (cudaSuccess != err)
        fprintf(stderr, "Error %s\n", cudaGetErrorString(err));
    else
        fprintf(stderr, "No kernel error detected\n");


    // check GPU computation
    for (int i = 0; i < N; ++i) { 
        for (int j = 0; j < N; ++j) {
            /**
            printf("i= %d.\n", i);
            printf("j= %d.\n", j);
            printf("cpu: %f.\n", C_cpu[i][j]);
            printf("gpu: %f.\n", C_gpu[i][j]);
            printf("diff: %f.\n", C_cpu[i][j] -  C_gpu[i][j]);
            **/
            if ((C_cpu[i][j] - C_gpu[i][j])/C_cpu[i][j] >= 1E-6) {
                printf("i= %d.\n", i);
                printf("j= %d.\n", j);
                printf("cpu: %f.\n", C_cpu[i][j]);
                printf("gpu: %f.\n", C_gpu[i][j]);
                printf("diff: %f.\n", C_cpu[i][j] -  C_gpu[i][j]);
            }
        }
    }

    // free space
    cudaFree(dA.elements);
    cudaFree(dB.elements);
    cudaFree(dC.elements);

    // print info
    printf("All is good!.\n");
    printf("CPU computation time: %f.\n", dt_cpu);
    printf("GPU computation time: %f.\n", dt_gpu);

    return 0;
}

int main(int argc, char **argv) {
    int err = run();
    return err;
}

No kernel error detected
All is good!.
CPU computation time: 1065.824219.
GPU computation time: 0.926784.

