<a href="https://colab.research.google.com/github/romiebanerjee/cuda-examples/blob/master/hello_cuda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [85]:
%%writefile vector_add.cu
#include <stdio.h>
#include <cuda_runtime.h> // Needed for CUDA functions and types

// 1. KERNEL DEFINITION
// This function will execute on the GPU
__global__ void addArrays(int n, float *a, float *b, float *result) {
    // Calculate a unique index for each thread
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < 10){
    printf("index: %d = %d * %d + %d \n", index, blockIdx.x, blockDim.x, threadIdx.x);

   // printf("bockIdx = %d %d %d \n", blockIdx.x, blockIdx.y, blockIdx.z);
   // printf("blockDim = %d %d %d\n", blockDim.x, blockDim.y, blockDim.z);
   // printf("threadIdx = %d %d %d \n", threadIdx.x, threadIdx.y, threadIdx.z);
    }
    // Check if this thread's index is within the array bounds
    if (index < n) {
        // This single line of code is executed in parallel by ALL threads
        result[index] = a[index] + b[index];
    }
}

int main() {
    // 2. SETUP PROBLEM SIZE AND HOST (CPU) MEMORY
    int numElements = 1000000;
    size_t size = numElements * sizeof(float);

    // Allocate and initialize host arrays
    float *h_a = (float *)malloc(size);
    float *h_b = (float *)malloc(size);
    float *h_result = (float *)malloc(size); // To store results from GPU

    for (int i = 0; i < numElements; i++) {
        h_a[i] = 1.0f; // Initialize array a with 1.0
        h_b[i] = 2.0f; // Initialize array b with 2.0
    }

    // 3. ALLOCATE DEVICE (GPU) MEMORY
    float *d_a = NULL, *d_b = NULL, *d_result = NULL;
    cudaMalloc((void**)&d_a, size);
    cudaMalloc((void**)&d_b, size);
    cudaMalloc((void**)&d_result, size);

    // 4. COPY DATA FROM HOST TO DEVICE
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    // 5. CONFIGURE AND LAUNCH THE KERNEL
    // Define the execution configuration
    int threadsPerBlock = 256; // A common choice
    printf("threadsPerBlock = %d \n", threadsPerBlock);

    // Calculate the number of blocks needed to cover the entire array
    int blocksPerGrid = (numElements) / threadsPerBlock;
    printf("blocksPerGrid = %d \n", blocksPerGrid);

    // Launch the kernel on the GPU
    // Syntax: <<<Number of Blocks, Threads per Block>>>
    addArrays<<<blocksPerGrid, threadsPerBlock>>>(numElements, d_a, d_b, d_result);

    // 6. COPY RESULT BACK FROM DEVICE TO HOST
    cudaMemcpy(h_result, d_result, size, cudaMemcpyDeviceToHost);

    // 7. VERIFY THE RESULTS
    // Check the first and last few elements for correctness
    for (int i = 0; i < 5; i++) {
        printf("Element %d: %.1f + %.1f = %.1f (expected 3.0)\n",
               i, h_a[i], h_b[i], h_result[i]);
    }
    printf("...\n");
    for (int i = numElements-5; i < numElements; i++) {
        printf("Element %d: %.1f + %.1f = %.1f (expected 3.0)\n",
               i, h_a[i], h_b[i], h_result[i]);
    }

    // 8. FREE ALL ALLOCATED MEMORY
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_result);
    free(h_a);
    free(h_b);
    free(h_result);

    printf("Done!\n");
    return 0;
}

Overwriting vector_add.cu


In [86]:
!nvcc -arch=sm_89 vector_add.cu -o vector_add

In [87]:
!./vector_add

threadsPerBlock = 256 
blocksPerGrid = 3906 
index: 0 = 0 * 256 + 0 
index: 1 = 0 * 256 + 1 
index: 2 = 0 * 256 + 2 
index: 3 = 0 * 256 + 3 
index: 4 = 0 * 256 + 4 
index: 5 = 0 * 256 + 5 
index: 6 = 0 * 256 + 6 
index: 7 = 0 * 256 + 7 
index: 8 = 0 * 256 + 8 
index: 9 = 0 * 256 + 9 
Element 0: 1.0 + 2.0 = 3.0 (expected 3.0)
Element 1: 1.0 + 2.0 = 3.0 (expected 3.0)
Element 2: 1.0 + 2.0 = 3.0 (expected 3.0)
Element 3: 1.0 + 2.0 = 3.0 (expected 3.0)
Element 4: 1.0 + 2.0 = 3.0 (expected 3.0)
...
Element 999995: 1.0 + 2.0 = 0.0 (expected 3.0)
Element 999996: 1.0 + 2.0 = 0.0 (expected 3.0)
Element 999997: 1.0 + 2.0 = 0.0 (expected 3.0)
Element 999998: 1.0 + 2.0 = 0.0 (expected 3.0)
Element 999999: 1.0 + 2.0 = 0.0 (expected 3.0)
Done!


In [88]:
%%writefile matrix_add.cu
#include <cuda_runtime.h>
#include <iostream>

// Kernel definition for matrix addition
__global__ void matrixAdd(int N, float* A, float* B, float* C) {
    // Calculate row and column indices
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;

    // Check if within bounds
    if (i < N && j < N) {
        // Linear index for 1D array representation
        int idx = j * N + i;
        C[idx] = A[idx] + B[idx];
    }
}

int main() {
    const int N = 1024; // Matrix size (1024x1024)
    size_t size = N * N * sizeof(float);

    // Allocate host memory
    float* h_A = (float*)malloc(size);
    float* h_B = (float*)malloc(size);
    float* h_C = (float*)malloc(size);

    // Initialize host matrices
    for (int i = 0; i < N * N; i++) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }

    // Allocate device memory
    float *d_A, *d_B, *d_C;
    cudaMalloc((void**)&d_A, size);
    cudaMalloc((void**)&d_B, size);
    cudaMalloc((void**)&d_C, size);

    // Copy data from host to device
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    // Define block and grid dimensions
    dim3 threadsPerBlock(16, 16); // 256 threads per block
    dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (N + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Launch kernel
    matrixAdd<<<numBlocks, threadsPerBlock>>>(N, d_A, d_B, d_C);

    // Copy result back to host
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    // Verify result (optional)
    bool success = true;
    for (int i = 0; i < N * N; i++) {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
            success = false;
            break;
        }
    }
    std::cout << "Result: " << (success ? "PASS" : "FAIL") << std::endl;

    // Cleanup
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}

Writing matrix_add.cu


In [89]:
!nvcc -arch=sm_89 matrix_add.cu -o matrix_add

In [90]:
!./matrix_add

Result: PASS
