In [129]:
%%cu
#include<bits/stdc++.h>
using namespace std;

/*
 * Element-wise integer addition: result[i] = a[i] + b[i] for 0 <= i < n.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element. The grid must
 * supply at least n threads; any surplus threads exit without touching memory.
 * All three pointers must be dereferenceable by the device and hold at least
 * n ints.
 */
__global__ void vectorAdd(int *a, int *b, int *result, int n){
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads in the tail of the last block have no element to process.
    if(idx >= n) return;

    result[idx] = a[idx] + b[idx];
}

/*
 * Matrix-vector product for a square, row-major n x n integer matrix:
 *     result[r] = sum over k of a[r*n + k] * b[k]
 *
 * Launch layout: 1D grid of 1D blocks, one thread per matrix row. The grid
 * must supply at least n threads; surplus threads exit immediately.
 * a must hold n*n ints, b and result must hold n ints each.
 */
__global__ void matVecMul(int *a, int *b, int *result, int n){
    const int r = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads beyond the last row have nothing to compute.
    if(r >= n) return;

    // Walk this thread's row once, accumulating the dot product in a register.
    const int *rowPtr = a + r*n;
    int acc = 0;
    for(int k=0; k<n; ++k){
        acc += rowPtr[k] * b[k];
    }
    result[r] = acc;
}

/*
 * Square matrix product C = A * B for row-major n x n integer matrices:
 *     c[row*n + col] = sum over j of a[row*n + j] * b[j*n + col]
 *
 * Launch layout: 2D grid of 2D blocks, one thread per output element, with
 * x indexing columns and y indexing rows. The grid must cover at least n
 * threads in each dimension; threads outside the matrix exit without writing.
 *
 * The output is overwritten, not accumulated into: c may contain arbitrary
 * data on entry (for example a fresh cudaMalloc allocation, which is not
 * zero-initialised).
 */
__global__ void matMul(int *a, int *b, int *c, int n){
    int row = threadIdx.y + blockDim.y * blockIdx.y;
    int col = threadIdx.x + blockDim.x * blockIdx.x;

    if(row<n && col<n){
        // Accumulate in a register and store once, so the result does not
        // depend on whatever c held before the launch and global memory is
        // written a single time per element.
        int sum = 0;
        for(int j=0;j<n;j++){
            sum += a[row*n + j] * b[j*n + col];
        }
        c[n*row + col] = sum;
    }
}


// Terminates the program with a diagnostic if a CUDA runtime call failed.
// `what` names the call for the error message.
static void checkCuda(cudaError_t status, const char *what){
    if(status != cudaSuccess){
        cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << endl;
        exit(EXIT_FAILURE);
    }
}

/*
 * Runs three GPU benchmarks in sequence -- vector addition, matrix-vector
 * multiplication and matrix-matrix multiplication -- on random integers in
 * [1, 10]. For each one it times the kernel alone with CUDA events (the
 * host<->device copies are outside the timed region), recomputes the result
 * on the CPU, and prints the number of mismatching elements and the elapsed
 * kernel time in milliseconds.
 *
 * Every CUDA runtime call and kernel launch is checked; the program exits
 * with a non-zero status on the first failure.
 */
int main(){
    int *a, *b, *c;
    int *a_dev, *b_dev, *c_dev;
    float elapsed_ms = 0.0f;

    cudaEvent_t start, end;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&end), "cudaEventCreate(end)");

    // ---------------------------------------------------------------- //
    cout << "\nVector Addition\n";
    int n = 1<<24;
    // Byte counts are kept in size_t so they cannot overflow int.
    size_t size = (size_t)n * sizeof(int);

    a = new int [n];
    b = new int [n];
    c = new int [n];

    checkCuda(cudaMalloc(&a_dev, size), "cudaMalloc(a_dev)");
    checkCuda(cudaMalloc(&b_dev, size), "cudaMalloc(b_dev)");
    checkCuda(cudaMalloc(&c_dev, size), "cudaMalloc(c_dev)");

    for(int i=0; i<n; i++){
        a[i] = rand() % 10 + 1;
        b[i] = rand() % 10 + 1;
    }

    checkCuda(cudaMemcpy(a_dev, a, size, cudaMemcpyHostToDevice), "cudaMemcpy(a -> a_dev)");
    checkCuda(cudaMemcpy(b_dev, b, size, cudaMemcpyHostToDevice), "cudaMemcpy(b -> b_dev)");

    int threads = 1<<10;
    int blocks = (n+threads-1)/threads;   // ceil-div: cover every element

    checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
    vectorAdd<<<blocks,threads>>>(a_dev, b_dev, c_dev, n);
    // A kernel launch returns nothing; a bad configuration only shows up here.
    checkCuda(cudaGetLastError(), "vectorAdd launch");
    checkCuda(cudaEventRecord(end), "cudaEventRecord(end)");
    // Waiting on `end` also surfaces any fault raised while the kernel ran.
    checkCuda(cudaEventSynchronize(end), "cudaEventSynchronize(end)");
    checkCuda(cudaEventElapsedTime(&elapsed_ms, start, end), "cudaEventElapsedTime");

    checkCuda(cudaMemcpy(c, c_dev, size, cudaMemcpyDeviceToHost), "cudaMemcpy(c_dev -> c)");

    int err_cnt = 0;
    for(int i=0; i<n; i++)
        if(a[i]+b[i]!=c[i]) err_cnt++;

    cout << "Error Count : " << err_cnt << endl;

    checkCuda(cudaFree(a_dev), "cudaFree(a_dev)");
    checkCuda(cudaFree(b_dev), "cudaFree(b_dev)");
    checkCuda(cudaFree(c_dev), "cudaFree(c_dev)");

    delete[] a;
    delete[] b;
    delete[] c;

    cout << "Time Elapsed : " << elapsed_ms << "ms" << endl;

    // ---------------------------------------------------------------- //
    cout << "\nMatrix and Vector Multiplication" << endl;
    n = 1<<12;
    size = (size_t)n * sizeof(int);       // bytes in one vector / one matrix row
    size_t mat_size = (size_t)n * size;   // bytes in the full n x n matrix

    a = new int [(size_t)n*n];
    b = new int [n];
    c = new int [n];

    checkCuda(cudaMalloc(&a_dev, mat_size), "cudaMalloc(a_dev)");
    checkCuda(cudaMalloc(&b_dev, size), "cudaMalloc(b_dev)");
    checkCuda(cudaMalloc(&c_dev, size), "cudaMalloc(c_dev)");

    for(int i=0; i<n; i++){
        for(int j=0; j<n; j++)
            a[(i*n) + j] = rand() % 10 + 1;
        b[i] = rand() % 10 + 1;
    }

    checkCuda(cudaMemcpy(a_dev, a, mat_size, cudaMemcpyHostToDevice), "cudaMemcpy(a -> a_dev)");
    checkCuda(cudaMemcpy(b_dev, b, size, cudaMemcpyHostToDevice), "cudaMemcpy(b -> b_dev)");

    threads = 256;
    blocks = (n+threads-1)/threads;       // one thread per matrix row

    checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
    matVecMul<<<blocks,threads>>>(a_dev, b_dev, c_dev, n);
    checkCuda(cudaGetLastError(), "matVecMul launch");
    checkCuda(cudaEventRecord(end), "cudaEventRecord(end)");
    checkCuda(cudaEventSynchronize(end), "cudaEventSynchronize(end)");
    checkCuda(cudaEventElapsedTime(&elapsed_ms, start, end), "cudaEventElapsedTime");

    checkCuda(cudaMemcpy(c, c_dev, size, cudaMemcpyDeviceToHost), "cudaMemcpy(c_dev -> c)");

    err_cnt = 0;
    for(int i=0; i<n; i++){
        int sum = 0;
        for(int j=0; j<n; j++)
            sum += a[i*n + j] * b[j];
        if(sum != c[i]) err_cnt++;
    }
    cout << "\nError Count : " << err_cnt << endl;

    checkCuda(cudaFree(a_dev), "cudaFree(a_dev)");
    checkCuda(cudaFree(b_dev), "cudaFree(b_dev)");
    checkCuda(cudaFree(c_dev), "cudaFree(c_dev)");

    delete[] a;
    delete[] b;
    delete[] c;

    cout << "Time Elapsed : " << elapsed_ms << "ms" << endl;

    // ---------------------------------------------------------------- //
    cout << "\nMatrix Multiplication" << endl;
    n = 1<<10;
    size = (size_t)n * n * sizeof(int);

    a = new int [n*n];
    b = new int [n*n];
    c = new int [n*n];

    checkCuda(cudaMalloc(&a_dev, size), "cudaMalloc(a_dev)");
    checkCuda(cudaMalloc(&b_dev, size), "cudaMalloc(b_dev)");
    checkCuda(cudaMalloc(&c_dev, size), "cudaMalloc(c_dev)");

    for(int i=0; i<n*n; i++){
        a[i] = rand() % 10 + 1;
        b[i] = rand() % 10 + 1;
    }

    checkCuda(cudaMemcpy(a_dev, a, size, cudaMemcpyHostToDevice), "cudaMemcpy(a -> a_dev)");
    checkCuda(cudaMemcpy(b_dev, b, size, cudaMemcpyHostToDevice), "cudaMemcpy(b -> b_dev)");
    // cudaMalloc does not initialise memory. Zero the output so that a kernel
    // which accumulates into c starts from a defined value.
    checkCuda(cudaMemset(c_dev, 0, size), "cudaMemset(c_dev)");

    const int tile = 16;
    dim3 threadsPerBlock(tile, tile);
    // ceil-div in both dimensions so sizes that are not a multiple of the
    // tile width are still fully covered (the kernel bounds-checks the tail).
    dim3 blocksPerGrid((n+tile-1)/tile, (n+tile-1)/tile);

    checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
    matMul<<<blocksPerGrid, threadsPerBlock>>>(a_dev, b_dev, c_dev, n);
    checkCuda(cudaGetLastError(), "matMul launch");
    checkCuda(cudaEventRecord(end), "cudaEventRecord(end)");
    checkCuda(cudaEventSynchronize(end), "cudaEventSynchronize(end)");
    checkCuda(cudaEventElapsedTime(&elapsed_ms, start, end), "cudaEventElapsedTime");

    checkCuda(cudaMemcpy(c, c_dev, size, cudaMemcpyDeviceToHost), "cudaMemcpy(c_dev -> c)");

    err_cnt = 0;
    for(int i=0; i<n; i++){
        for(int j=0; j<n; j++){
            int sum = 0;
            for(int k=0; k<n; k++)
                sum += a[i*n + k] * b[k*n + j];
            if(sum != c[i*n + j]){
                //cout << sum << "\t" << c[i*n + j] << endl;
                err_cnt++;
            }
        }
    }
    cout << "\nError Count : " << err_cnt << endl;

    checkCuda(cudaFree(a_dev), "cudaFree(a_dev)");
    checkCuda(cudaFree(b_dev), "cudaFree(b_dev)");
    checkCuda(cudaFree(c_dev), "cudaFree(c_dev)");

    delete[] a;
    delete[] b;
    delete[] c;

    cout << "Time Elapsed : " << elapsed_ms << "ms" << endl;

    // Release the timing events created at the top of main.
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(end), "cudaEventDestroy(end)");

    return 0;
}


Vector Addition
Error Count : 0
Time Elapsed : 0.375872ms

Matrix and Vector Multiplication

Error Count : 0
Time Elapsed : 1.00682ms

Matrix Multiplication

Error Count : 0
Time Elapsed : 6.63306ms

