In [3]:
%%writefile vector_addition.cu
#include <cstdlib>
#include <iostream>
#include <vector>
#include <cuda_runtime.h>
using namespace std;

// CUDA kernel: element-wise vector addition, C[i] = A[i] + B[i] for 0 <= i < N.
//
// A, B and C must point to device-accessible arrays of at least N ints.
// Valid for any 1D launch configuration: each thread starts at its global
// index (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total
// number of launched threads, so every element is covered even when fewer
// than N threads are launched. A single-block launch of N threads processes
// exactly one element per thread. N <= 0 makes the kernel a no-op.
__global__ void add(int* A, int* B, int* C, int N) {
    // 64-bit arithmetic so neither the global index nor the stride can overflow int.
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    for (; i < N; i += stride) C[i] = A[i] + B[i];
}

// Writes the first N values of arr to standard output, each one followed by
// a single space, then terminates the line with a newline.
// Nothing but the newline is written when N <= 0.
void print(const int* arr, int N) {
    const int* cursor = arr;
    int remaining = N;
    while (remaining > 0) {
        cout << *cursor << " ";
        ++cursor;
        --remaining;
    }
    cout << "\n";
}

// Reports a failed CUDA runtime call on stderr and terminates the program.
// `what` names the operation so the message identifies the failing step.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << "\n";
        exit(EXIT_FAILURE);
    }
}

// Reads `count` integers from stdin into dst.
// Returns false as soon as a value cannot be parsed (malformed input or EOF).
static bool readInts(int* dst, int count) {
    for (int i = 0; i < count; i++) {
        if (!(cin >> dst[i])) return false;
    }
    return true;
}

// Reads two integer arrays of equal length from stdin, adds them on the GPU
// with the `add` kernel and prints the element-wise sum.
// Returns 0 on success and 1 on invalid input; any CUDA failure terminates
// the process with EXIT_FAILURE via checkCuda.
int main() {
    int N;
    cout << "Enter the number of elements: ";
    if (!(cin >> N) || N <= 0) {
        cerr << "The number of elements must be a positive integer.\n";
        return 1;
    }

    // Host buffers; vector releases them on every exit path.
    vector<int> A(N), B(N), C(N);

    cout << "Enter " << N << " elements for array A: ";
    if (!readInts(A.data(), N)) {
        cerr << "Invalid input for array A.\n";
        return 1;
    }

    cout << "Enter " << N << " elements for array B: ";
    if (!readInts(B.data(), N)) {
        cerr << "Invalid input for array B.\n";
        return 1;
    }

    int *dA = nullptr, *dB = nullptr, *dC = nullptr;
    // Widen before multiplying so the byte count is computed in size_t.
    const size_t size = static_cast<size_t>(N) * sizeof(int);

    checkCuda(cudaMalloc(&dA, size), "cudaMalloc dA");
    checkCuda(cudaMalloc(&dB, size), "cudaMalloc dB");
    checkCuda(cudaMalloc(&dC, size), "cudaMalloc dC");

    checkCuda(cudaMemcpy(dA, A.data(), size, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(dB, B.data(), size, cudaMemcpyHostToDevice), "copy B to device");

    // A block cannot hold more threads than the device limit, so a single
    // <<<1, N>>> launch fails for large N. Query the limit and process the
    // arrays in slices, one single-block launch per slice.
    int device = 0;
    int maxThreads = 0;
    checkCuda(cudaGetDevice(&device), "cudaGetDevice");
    checkCuda(cudaDeviceGetAttribute(&maxThreads, cudaDevAttrMaxThreadsPerBlock, device),
              "query max threads per block");

    int offset = 0;
    while (offset < N) {
        const int remaining = N - offset;
        const int count = remaining < maxThreads ? remaining : maxThreads;
        // Offset pointers so each launch sees its slice as elements [0, count).
        add<<<1, count>>>(dA + offset, dB + offset, dC + offset, count);
        // Kernel launches do not return a status; fetch launch errors explicitly.
        checkCuda(cudaGetLastError(), "add kernel launch");
        offset += count;
    }
    // Surface errors raised while the kernels were executing.
    checkCuda(cudaDeviceSynchronize(), "add kernel execution");

    checkCuda(cudaMemcpy(C.data(), dC, size, cudaMemcpyDeviceToHost), "copy C to host");

    cout << "A + B = ";
    print(C.data(), N);

    checkCuda(cudaFree(dA), "cudaFree dA");
    checkCuda(cudaFree(dB), "cudaFree dB");
    checkCuda(cudaFree(dC), "cudaFree dC");

    return 0;
}


Overwriting vector_addition.cu


In [4]:
!nvcc -arch=sm_75 vector_addition.cu -o vector_addition
!./vector_addition


Enter the number of elements: 4
Enter 4 elements for array A: 4 7 2 6
Enter 4 elements for array B: 3 2 1 8
A + B = 7 9 3 14 
