In [8]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git


Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-vo55o_2q
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-vo55o_2q
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4307 sha256=444d46d03a337a376c7fb85aaa6c81a5b672e716363b52aae36da2597eab3c0e
  Stored in directory: /tmp/pip-ephem-wheel-cache-u7oi7xkf/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin


In [9]:
%load_ext nvcc_plugin

The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [11]:
%%cu 
#include <stdio.h>
#include <stdlib.h>

#define BLK_SIZE 1 

// Computes the matrix product C = A * B for row-major int matrices.
//   a : m x n input  (element (r, i) at a[r*n + i])
//   b : n x k input  (element (i, c) at b[i*k + c])
//   c : m x k output (element (r, c) at c[r*k + c])
// Launch layout: a 2D grid where the x dimension spans the k output columns
// and the y dimension spans the m output rows; each thread produces one
// element of c. The grid may overshoot the matrix, extra threads exit early.
// No shared memory is used.
__global__ void matMulBoth(int *a,int *b ,int *c,int m ,int n,int k){
    int rowId = blockIdx.y * blockDim.y + threadIdx.y;
    int colId = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads that fall outside the m x k output have nothing to do.
    if(rowId >= m || colId >= k){
        return;
    }

    // Dot product of row rowId of a with column colId of b.
    int temp = 0;
    for(int i = 0; i < n; i++){
        temp += a[rowId*n+i] * b[i*k+colId];
    }
    c[rowId*k+colId] = temp;
}

// Terminates the program with a diagnostic if a CUDA runtime call failed.
// `what` names the operation for the error message.
static void checkCuda(cudaError_t status, const char *what){
    if(status != cudaSuccess){
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Prints `label` on its own line, then every element of the row-major
// rows x cols matrix `mat` on a single line.
static void printMatrix(const char *label, const int *mat, int rows, int cols){
    printf("%s\n", label);
    for(int i=0;i<rows;i++){
        for(int j=0;j<cols;j++){
            printf("%d , = ",mat[i * cols + j]);
        }
    }
    printf("\n");
}

// Builds a 3x3 matrix A and a 3x4 matrix B on the host, runs matMulBoth on
// the device to fill the 3x4 matrix C, and prints all three matrices.
// Returns 0 on success; exits with EXIT_FAILURE on any CUDA error.
int main(){
    int m =3,n=3,k=4;
    size_t bytesA = sizeof(int)*m*n;
    size_t bytesB = sizeof(int)*n*k;
    size_t bytesC = sizeof(int)*m*k;

    // Page-locked (pinned) host buffers.
    int *h_a, *h_b, *h_c;
    checkCuda(cudaMallocHost((void **) &h_a, bytesA), "cudaMallocHost(h_a)");
    checkCuda(cudaMallocHost((void **) &h_b, bytesB), "cudaMallocHost(h_b)");
    checkCuda(cudaMallocHost((void **) &h_c, bytesC), "cudaMallocHost(h_c)");

    // Fill A and B with deterministic values derived from their indices.
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            h_a[i * n + j] = 3*i+j;
        }
    }

    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < k; ++j) {
            h_b[i * k + j] = i+4*j;
        }
    }

    // Device buffers.
    int *d_a, *d_b, *d_c;
    checkCuda(cudaMalloc((void **) &d_a, bytesA), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc((void **) &d_b, bytesB), "cudaMalloc(d_b)");
    checkCuda(cudaMalloc((void **) &d_c, bytesC), "cudaMalloc(d_c)");

    // Copy the inputs from host to device memory.
    checkCuda(cudaMemcpy(d_a, h_a, bytesA, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_b, h_b, bytesB, cudaMemcpyHostToDevice), "copy B to device");

    // Ceil-divide so the grid covers every output element: x spans the k
    // columns of C, y spans its m rows.
    unsigned int gridRows = (m + BLK_SIZE - 1) / BLK_SIZE;
    unsigned int gridCols = (k + BLK_SIZE - 1) / BLK_SIZE;
    dim3 dimGrid(gridCols, gridRows);
    dim3 dimBlock(BLK_SIZE, BLK_SIZE);

    matMulBoth<<<dimGrid,dimBlock>>>(d_a,d_b,d_c,m,n,k);
    // A launch does not return a status; query it explicitly.
    checkCuda(cudaGetLastError(), "matMulBoth launch");
    // The blocking copy also surfaces any fault raised while the kernel ran.
    checkCuda(cudaMemcpy(h_c, d_c, bytesC, cudaMemcpyDeviceToHost), "copy C to host");

    printMatrix("A", h_a, m, n);
    printMatrix("B", h_b, n, k);
    printMatrix("C", h_c, m, k);

    checkCuda(cudaFree(d_a), "cudaFree(d_a)");
    checkCuda(cudaFree(d_b), "cudaFree(d_b)");
    checkCuda(cudaFree(d_c), "cudaFree(d_c)");
    // Memory obtained from cudaMallocHost must be released with cudaFreeHost,
    // not free().
    checkCuda(cudaFreeHost(h_a), "cudaFreeHost(h_a)");
    checkCuda(cudaFreeHost(h_b), "cudaFreeHost(h_b)");
    checkCuda(cudaFreeHost(h_c), "cudaFreeHost(h_c)");
    return 0;
}


