In [11]:
!nvcc --version
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Mar_28_02:30:10_Pacific_Daylight_Time_2024
Cuda compilation tools, release 12.4, V12.4.131
Build cuda_12.4.r12.4/compiler.34097967_0
The nvcc4jupyter extension is already loaded. To reload it, use:
  %reload_ext nvcc4jupyter




In [14]:
%%cuda
#include <stdio.h>
#include <cuda_runtime.h>

/*
 * Element-wise integer addition: c[i] = a[i] + b[i] for 0 <= i < n.
 *
 * Each thread handles at most one element, selected by its flat 1D global
 * index (built from blockIdx.x, blockDim.x and threadIdx.x). Threads whose
 * index is n or greater exit without touching memory, so the launch may
 * contain more threads than there are elements.
 *
 * a, b : input arrays, read at indices [0, n)
 * c    : output array, written at indices [0, n)
 * n    : number of elements to process
 */
__global__
void addArrays(int *a, int *b, int *c, int n) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Surplus threads past the end of the data have nothing to do.
    if (idx >= n) {
        return;
    }

    const int lhs = a[idx];
    const int rhs = b[idx];
    c[idx] = lhs + rhs;
}

/*
 * Reports a failed CUDA runtime call on stderr.
 *
 * status : return code of the call being checked
 * what   : short label naming the call, included in the message
 *
 * Returns true when status is cudaSuccess, false otherwise.
 */
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Adds two 10-element integer arrays with the addArrays kernel and prints
 * one "a + b = c" line per element.
 *
 * Every CUDA runtime call is checked. After the first failure the remaining
 * steps are skipped, the device buffers are released, and the program
 * returns 1 without printing results. Returns 0 on success.
 */
int main() {
    const int n = 10;
    int a[n], b[n], c[n];
    // Start as null so the cleanup below is safe even if an allocation fails.
    int *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;

    const size_t sz = n * sizeof(int);

    for (int i = 0; i < n; ++i) {
        a[i] = i;
        b[i] = 2 * i;
    }

    // Ceiling division: enough blocks to give every element a thread.
    const int blockSize = 256;
    const int numBlocks = (n + blockSize - 1) / blockSize;

    // `ok && ...` short-circuits, so nothing runs after the first failure.
    bool ok = cudaOk(cudaMalloc((void **)&d_a, sz), "cudaMalloc(d_a)");
    ok = ok && cudaOk(cudaMalloc((void **)&d_b, sz), "cudaMalloc(d_b)");
    ok = ok && cudaOk(cudaMalloc((void **)&d_c, sz), "cudaMalloc(d_c)");

    ok = ok && cudaOk(cudaMemcpy(d_a, a, sz, cudaMemcpyHostToDevice),
                      "cudaMemcpy(a -> d_a)");
    ok = ok && cudaOk(cudaMemcpy(d_b, b, sz, cudaMemcpyHostToDevice),
                      "cudaMemcpy(b -> d_b)");

    if (ok) {
        addArrays<<<numBlocks, blockSize>>>(d_a, d_b, d_c, n);

        // A launch returns no status: configuration errors show up through
        // cudaGetLastError(), execution faults at the next synchronization.
        ok = cudaOk(cudaGetLastError(), "addArrays launch");
        ok = ok && cudaOk(cudaDeviceSynchronize(), "addArrays execution");
        ok = ok && cudaOk(cudaMemcpy(c, d_c, sz, cudaMemcpyDeviceToHost),
                          "cudaMemcpy(d_c -> c)");
    }

    // Always release device memory; cudaFree on a null pointer is a no-op.
    // Return codes are deliberately ignored here: this is best-effort
    // cleanup and any earlier failure has already been reported.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    if (!ok) {
        // c[] was never filled in, so do not print it as a result.
        return 1;
    }

    for (int i = 0; i < n; ++i) {
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }

    return 0;
}


0 + 0 = 0
1 + 2 = 3
2 + 4 = 6
3 + 6 = 9
4 + 8 = 12
5 + 10 = 15
6 + 12 = 18
7 + 14 = 21
8 + 16 = 24
9 + 18 = 27

