In [1]:
!nvidia-smi


Mon Jun 16 21:05:29 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   60C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [42]:
%%writefile vector_addition.cu
#include <cstdio>
#include <iostream>

// Element-wise vector addition: result[i] = v1[i] + v2[i] for every i in [0, N).
//
// Launch layout: 1D grid of 1D blocks. Any grid/block size is valid: each
// thread starts at its flat global index and then advances by the total number
// of launched threads, so a grid with fewer than N threads still covers every
// element. No shared memory is used.
//
// Parameters:
//   v1, v2  - input vectors, read only; each must hold at least N floats and
//             be addressable from the device.
//   result  - output vector of at least N floats, addressable from the device.
//   N       - number of elements to process; N <= 0 performs no work.
__global__ void addVectors(const float *v1, const float *v2, float *result, int N) {
    // 64-bit arithmetic keeps the index and the stride from wrapping when
    // gridDim.x * blockDim.x approaches or exceeds the range of a 32-bit int.
    const long long count = N;
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    for (long long index = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
         index < count; index += stride) {
      result[index] = v1[index] + v2[index];
    }
}

// Reports a failed CUDA runtime call on stderr and returns whether `status`
// is cudaSuccess. `what` names the operation for the error message.
static bool cudaSucceeded(cudaError_t status, const char *what) {
  if (status == cudaSuccess) {
    return true;
  }
  std::cerr << "CUDA error during " << what << ": "
            << cudaGetErrorString(status) << std::endl;
  return false;
}

// Adds two 10-element host vectors on the GPU and prints one line per element
// with both inputs and the sum.
//
// Returns 0 on success and 1 if any CUDA runtime call or the kernel launch
// fails; in that case the failing operation is reported on stderr and the
// result table is not printed. Device buffers are released on every path.
int main() {
  const int N = 10;
  const size_t bytes = N * sizeof(float);
  float A[N], B[N], RES[N];

  for (int i = 0; i < N; i++) {
    A[i] = i;
    B[i] = i;
  }

  std::cout << "Initialized A[0] = " << A[0] << ", B[0] = " << B[0] << std::endl;

  // Start as null so the cudaFree calls below are harmless no-ops for any
  // buffer whose allocation was never reached or failed.
  float *device_A = nullptr, *device_B = nullptr, *device_RES = nullptr;

  // Short-circuit chain: the first failing step is reported and the remaining
  // steps are skipped.
  bool ok =
      cudaSucceeded(cudaMalloc(&device_A, bytes), "cudaMalloc(device_A)") &&
      cudaSucceeded(cudaMalloc(&device_B, bytes), "cudaMalloc(device_B)") &&
      cudaSucceeded(cudaMalloc(&device_RES, bytes), "cudaMalloc(device_RES)") &&
      cudaSucceeded(cudaMemcpy(device_A, A, bytes, cudaMemcpyHostToDevice),
                    "cudaMemcpy(A -> device_A)") &&
      cudaSucceeded(cudaMemcpy(device_B, B, bytes, cudaMemcpyHostToDevice),
                    "cudaMemcpy(B -> device_B)");

  if (ok) {
    // 256 threads per block = 8 warps; block sizes should be a multiple of
    // the warp size (32).
    int blockSize = 256;
    // Integer ceiling division: enough blocks to cover all N elements.
    int gridSize = (N + blockSize - 1) / blockSize;
    addVectors<<<gridSize, blockSize>>>(device_A, device_B, device_RES, N);

    // A kernel launch returns no status itself: launch-configuration errors
    // are read with cudaGetLastError(), execution errors surface at the
    // following synchronization.
    ok = cudaSucceeded(cudaGetLastError(), "addVectors launch") &&
         cudaSucceeded(cudaDeviceSynchronize(), "addVectors execution") &&
         cudaSucceeded(cudaMemcpy(RES, device_RES, bytes, cudaMemcpyDeviceToHost),
                       "cudaMemcpy(device_RES -> RES)");
  }

  cudaFree(device_A);
  cudaFree(device_B);
  cudaFree(device_RES);

  if (!ok) {
    // RES may be uninitialized here; do not print it.
    return 1;
  }

  for (int i = 0; i < N; i++) {
    printf("%f %f %f\n", A[i], B[i], RES[i]);
  }

  return 0;

}

Overwriting vector_addition.cu


In [43]:
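# Build without an explicit target architecture (kept for reference):
#   !nvcc -o vector_addit vector_addition.cu
# The active command below targets sm_75, the compute capability of the Tesla T4 listed by nvidia-smi.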
!nvcc -o vector_addit vector_addition.cu -arch=sm_75

In [44]:
!./vector_addit

Initialized A[0] = 0, B[0] = 0
0.000000 0.000000 0.000000
1.000000 1.000000 2.000000
2.000000 2.000000 4.000000
3.000000 3.000000 6.000000
4.000000 4.000000 8.000000
5.000000 5.000000 10.000000
6.000000 6.000000 12.000000
7.000000 7.000000 14.000000
8.000000 8.000000 16.000000
9.000000 9.000000 18.000000
