<a href="https://colab.research.google.com/github/rbaygildin/learn-gpgpu/blob/master/notebooks/hello_world.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Learn CUDA C++ - Hello world!

In [None]:
%%shell
# Report the GPU, driver version and CUDA version visible to this runtime.
nvidia-smi

Thu Jan 13 16:55:38 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces



In [None]:
%%writefile hello.cu

#include <stdio.h>

// Device kernel: each thread that runs it emits one greeting line
// through device-side printf. Takes no arguments and writes no memory.
__global__ void cuda_hello(void)
{
    printf("Hello World from GPU!\n");
}

// Launches cuda_hello on a single block with a single thread and waits for it.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main() {
    cuda_hello<<<1,1>>>();

    // A kernel launch returns nothing; configuration/launch errors are only
    // visible through cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        // Blocks until the kernel has finished. This is also the point at
        // which buffered device-side printf output is flushed to the host.
        status = cudaDeviceSynchronize();
    }

    if (status != cudaSuccess) {
        fprintf(stderr, "cuda_hello failed: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}

Writing hello.cu


In [None]:
%%shell
# Compile hello.cu with the CUDA compiler into an executable named "hello".
nvcc hello.cu -o hello



In [None]:
%%shell
# Run the program once on its own, then once more under nvprof to get
# timing for the kernel and for each CUDA API call.
./hello
nvprof ./hello

Hello World from GPU!
==156== NVPROF is profiling process 156, command: ./hello
Hello World from GPU!
==156== Profiling application: ./hello
==156== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  79.390us         1  79.390us  79.390us  79.390us  cuda_hello(void)
      API calls:   99.76%  299.15ms         1  299.15ms  299.15ms  299.15ms  cudaLaunchKernel
                    0.15%  442.63us         1  442.63us  442.63us  442.63us  cuDeviceTotalMem
                    0.05%  152.58us       101  1.5100us     144ns  63.574us  cuDeviceGetAttribute
                    0.03%  99.210us         1  99.210us  99.210us  99.210us  cudaDeviceSynchronize
                    0.01%  27.066us         1  27.066us  27.066us  27.066us  cuDeviceGetName
                    0.00%  6.1700us         1  6.1700us  6.1700us  6.1700us  cuDeviceGetPCIBusId
                    0.00%  1.3930us         3     464ns     205ns     831ns  cuDev



In [None]:
%%writefile random.cu

#include <stdio.h>

// Number of elements in each vector; also used as the number of blocks
// in the add_vectors launch (one block per element).
#define N 5
// Size in bytes of one N-element int vector; used for every host and
// device allocation and for every host<->device copy.
#define NSIZE (N * sizeof(int))

// Element-wise vector addition: res[i] = a[i] + b[i], where i is the index
// of the block executing the kernel.
//
// a, b  - device pointers to the input vectors
// res   - device pointer to the output vector
//
// The element index comes from blockIdx.x only (threadIdx is not used), so
// the kernel is meant to be launched with one thread per block and one block
// per element. There is no bounds check: the caller must not launch more
// blocks than the vectors have elements.
__global__ void add_vectors(int* a, int* b, int* res){
    int i = blockIdx.x;
    // Blocks are scheduled in no particular order, so these trace lines can
    // appear in any order across blocks.
    printf("Call at block %d\n", i);
    res[i] = a[i] + b[i];
    printf("Set res as %d + %d = %d\n", a[i], b[i], res[i]);
}

// Returns true when `status` is cudaSuccess. Otherwise prints the failing
// step together with the runtime's error description to stderr and returns
// false.
static bool cuda_ok(cudaError_t status, const char* step){
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", step, cudaGetErrorString(status));
    return false;
}

// Host driver for add_vectors:
//   1. allocates and fills two N-element host vectors (a[i] = i, b[i] = N - i),
//   2. copies them to the device,
//   3. launches add_vectors with N blocks of one thread each,
//   4. copies the result back and prints it.
//
// Every allocation, copy and launch is checked; after the first failure the
// remaining steps are skipped. Host and device buffers are always released.
// Returns 0 on success and 1 if any step failed.
int main(){
    int* h_a = (int*) malloc(NSIZE);
    int* h_b = (int*) malloc(NSIZE);
    int* h_res = (int*) malloc(NSIZE);

    int* d_a = NULL;
    int* d_b = NULL;
    int* d_res = NULL;

    bool ok = (h_a != NULL && h_b != NULL && h_res != NULL);
    if (!ok) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", (size_t) NSIZE);
    }

    if (ok) {
        printf("Initialize a and b\n");
        for(int i = 0; i < N; i++){
            h_a[i] = i;
            h_b[i] = N - i;
            printf("Set a = %d, b = %d\n", h_a[i], h_b[i]);
        }
    }

    // `ok && ...` short-circuits, so nothing below runs after a failure.
    ok = ok && cuda_ok(cudaMalloc((void**)&d_a, NSIZE), "cudaMalloc(d_a)");
    ok = ok && cuda_ok(cudaMalloc((void**)&d_b, NSIZE), "cudaMalloc(d_b)");
    ok = ok && cuda_ok(cudaMalloc((void**)&d_res, NSIZE), "cudaMalloc(d_res)");

    ok = ok && cuda_ok(cudaMemcpy(d_a, h_a, NSIZE, cudaMemcpyHostToDevice), "copy of a to device");
    ok = ok && cuda_ok(cudaMemcpy(d_b, h_b, NSIZE, cudaMemcpyHostToDevice), "copy of b to device");

    if (ok) {
        // One block per element, one thread per block.
        add_vectors<<<N, 1>>>(d_a, d_b, d_res);
        // The launch itself returns nothing; launch errors are reported
        // through cudaGetLastError().
        ok = cuda_ok(cudaGetLastError(), "add_vectors launch");
    }
    // Wait for the kernel; errors raised while it runs are reported here.
    ok = ok && cuda_ok(cudaDeviceSynchronize(), "add_vectors execution");

    ok = ok && cuda_ok(cudaMemcpy(h_res, d_res, NSIZE, cudaMemcpyDeviceToHost), "copy of res to host");

    // Pointers that were never allocated are still NULL, which cudaFree
    // accepts as a no-op.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_res);

    // Only print the result when it was actually computed and copied back.
    if (ok) {
        for(int i = 0; i < N; i++){
            printf("res[%d] = %d\n", i, h_res[i]);
        }
    }

    // free(NULL) is a no-op, so this is safe after a failed malloc too.
    free(h_a);
    free(h_b);
    free(h_res);
    return ok ? 0 : 1;
}

Overwriting random.cu


In [None]:
%%shell
# Build, and run only if the build succeeded, so a failed compilation never
# executes a stale ./random left over from an earlier run.
nvcc random.cu -o random && ./random

Initialize a and b
Set a = 0, b = 5
Set a = 1, b = 4
Set a = 2, b = 3
Set a = 3, b = 2
Set a = 4, b = 1
Call at block 2
Call at block 0
Call at block 4
Call at block 3
Call at block 1
Set res as 2 + 3 = 5
Set res as 0 + 5 = 5
Set res as 1 + 4 = 5
Set res as 3 + 2 = 5
Set res as 4 + 1 = 5
res[0] = 5
res[1] = 5
res[2] = 5
res[3] = 5
res[4] = 5


