<a href="https://colab.research.google.com/github/romiebanerjee/cuda-examples/blob/master/hello_cuda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
%%writefile hello-2dim.cu
#include <stdio.h>
#include <unistd.h>

//DEMO program that shows the identifying variables of threads in a 2-dim grid and thread block:
__global__ void hello( )
{
   printf("grid coord: (%d,%d), thread coord: (%d,%d), grid dim:   (%d,%d), block dim:    (%d,%d)\n",
           blockIdx.x, blockIdx.y, threadIdx.x, threadIdx.y,
           gridDim.x, gridDim.y, blockDim.x, blockDim.y);
}

int main()
{
   dim3 gridShape  = dim3( 3, 2 );
   dim3 blockShape = dim3( 2, 3 );

   hello<<< gridShape, blockShape>>>( );

   printf("I am the CPU: Hello World ! \n");
   cudaDeviceSynchronize();
}

Overwriting hello-2dim.cu


In [28]:
!nvcc -arch=sm_89 hello-2dim.cu -o hello-2dim

In [29]:
!./hello-2dim

I am the CPU: Hello World ! 
grid coord: (1,0), thread coord: (0,0), grid dim:   (3,2), block dim:    (2,3)
grid coord: (1,0), thread coord: (1,0), grid dim:   (3,2), block dim:    (2,3)
grid coord: (1,0), thread coord: (0,1), grid dim:   (3,2), block dim:    (2,3)
grid coord: (1,0), thread coord: (1,1), grid dim:   (3,2), block dim:    (2,3)
grid coord: (1,0), thread coord: (0,2), grid dim:   (3,2), block dim:    (2,3)
grid coord: (1,0), thread coord: (1,2), grid dim:   (3,2), block dim:    (2,3)
grid coord: (2,1), thread coord: (0,0), grid dim:   (3,2), block dim:    (2,3)
grid coord: (2,1), thread coord: (1,0), grid dim:   (3,2), block dim:    (2,3)
grid coord: (2,1), thread coord: (0,1), grid dim:   (3,2), block dim:    (2,3)
grid coord: (2,1), thread coord: (1,1), grid dim:   (3,2), block dim:    (2,3)
grid coord: (2,1), thread coord: (0,2), grid dim:   (3,2), block dim:    (2,3)
grid coord: (2,1), thread coord: (1,2), grid dim:   (3,2), block dim:    (2,3)
grid coord: (0,0), thre

In [37]:
%%writefile hello-2dim-ID.cu
//DEMO program that shows how to compute unique ID for threads in a 2-dim grid and thread block
#include <stdio.h>
#include <unistd.h>

__global__ void hello( )
{
   printf("blockIdx:(%d,%d), threadIdx:(%d,%d) -> Row,Col=(%d,%d)\n",
           blockIdx.x, blockIdx.y, threadIdx.x, threadIdx.y,
           blockIdx.x * blockDim.x + threadIdx.x,   // rowID
           blockIdx.y * blockDim.y + threadIdx.y);  // columnID
}

int main()
{
   dim3 blockShape = dim3( 2, 3 );
   dim3 gridShape  = dim3( 3, 2 );

   hello<<< gridShape, blockShape>>>( );

   printf("I am the CPU: Hello World ! \n");
   cudaDeviceSynchronize();
}

Overwriting hello-2dim-ID.cu


In [38]:
!nvcc -arch=sm_89 hello-2dim-ID.cu -o hello-2dim-ID

In [39]:
!./hello-2dim-ID

I am the CPU: Hello World ! 
blockIdx:(1,0), threadIdx:(0,0) -> Row,Col=(2,0)
blockIdx:(1,0), threadIdx:(1,0) -> Row,Col=(3,0)
blockIdx:(1,0), threadIdx:(0,1) -> Row,Col=(2,1)
blockIdx:(1,0), threadIdx:(1,1) -> Row,Col=(3,1)
blockIdx:(1,0), threadIdx:(0,2) -> Row,Col=(2,2)
blockIdx:(1,0), threadIdx:(1,2) -> Row,Col=(3,2)
blockIdx:(2,1), threadIdx:(0,0) -> Row,Col=(4,3)
blockIdx:(2,1), threadIdx:(1,0) -> Row,Col=(5,3)
blockIdx:(2,1), threadIdx:(0,1) -> Row,Col=(4,4)
blockIdx:(2,1), threadIdx:(1,1) -> Row,Col=(5,4)
blockIdx:(2,1), threadIdx:(0,2) -> Row,Col=(4,5)
blockIdx:(2,1), threadIdx:(1,2) -> Row,Col=(5,5)
blockIdx:(0,0), threadIdx:(0,0) -> Row,Col=(0,0)
blockIdx:(0,0), threadIdx:(1,0) -> Row,Col=(1,0)
blockIdx:(0,0), threadIdx:(0,1) -> Row,Col=(0,1)
blockIdx:(0,0), threadIdx:(1,1) -> Row,Col=(1,1)
blockIdx:(0,0), threadIdx:(0,2) -> Row,Col=(0,2)
blockIdx:(0,0), threadIdx:(1,2) -> Row,Col=(1,2)
blockIdx:(2,0), threadIdx:(0,0) -> Row,Col=(4,0)
blockIdx:(2,0), threadIdx:(1,0) -> Row,C

In [40]:
%%writefile vector_add.cu
#include <stdio.h>
#include <cuda_runtime.h> // Needed for CUDA functions and types

// 1. KERNEL DEFINITION
// This function will execute on the GPU
__global__ void addArrays(int n, float *a, float *b, float *result) {
    // Calculate a unique index for each thread
    int index = blockIdx.x * blockDim.x + threadIdx.x;

    // Check if this thread's index is within the array bounds
    if (index < n) {
        // This single line of code is executed in parallel by ALL threads
        result[index] = a[index] + b[index];
    }
}

int main() {
    // 2. SETUP PROBLEM SIZE AND HOST (CPU) MEMORY
    int numElements = 1000000;
    size_t size = numElements * sizeof(float);

    // Allocate and initialize host arrays
    float *h_a = (float *)malloc(size);
    float *h_b = (float *)malloc(size);
    float *h_result = (float *)malloc(size); // To store results from GPU

    for (int i = 0; i < numElements; i++) {
        h_a[i] = 1.0f; // Initialize array a with 1.0
        h_b[i] = 2.0f; // Initialize array b with 2.0
    }

    // 3. ALLOCATE DEVICE (GPU) MEMORY
    float *d_a = NULL, *d_b = NULL, *d_result = NULL;
    cudaMalloc((void**)&d_a, size);
    cudaMalloc((void**)&d_b, size);
    cudaMalloc((void**)&d_result, size);

    // 4. COPY DATA FROM HOST TO DEVICE
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    // 5. CONFIGURE AND LAUNCH THE KERNEL
    // Define the execution configuration
    int threadsPerBlock = 256; // A common choice
    printf("threadsPerBlock = %d \n", threadsPerBlock);

    // Calculate the number of blocks needed to cover the entire array
    int blocksPerGrid = (numElements) / threadsPerBlock;
    printf("blocksPerGrid = %d \n", blocksPerGrid);

    // Launch the kernel on the GPU
    // Syntax: <<<Number of Blocks, Threads per Block>>>
    addArrays<<<blocksPerGrid, threadsPerBlock>>>(numElements, d_a, d_b, d_result);

    // 6. COPY RESULT BACK FROM DEVICE TO HOST
    cudaMemcpy(h_result, d_result, size, cudaMemcpyDeviceToHost);

    // 7. VERIFY THE RESULTS
    // Check the first and last few elements for correctness
    for (int i = 0; i < 5; i++) {
        printf("Element %d: %.1f + %.1f = %.1f (expected 3.0)\n",
               i, h_a[i], h_b[i], h_result[i]);
    }
    printf("...\n");
    for (int i = numElements-5; i < numElements; i++) {
        printf("Element %d: %.1f + %.1f = %.1f (expected 3.0)\n",
               i, h_a[i], h_b[i], h_result[i]);
    }

    // 8. FREE ALL ALLOCATED MEMORY
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_result);
    free(h_a);
    free(h_b);
    free(h_result);

    printf("Done!\n");
    return 0;
}

Writing vector_add.cu


In [41]:
!nvcc -arch=sm_89 vector_add.cu -o vector_add

In [42]:
!./vector_add

threadsPerBlock = 256 
blocksPerGrid = 3906 
Element 0: 1.0 + 2.0 = 3.0 (expected 3.0)
Element 1: 1.0 + 2.0 = 3.0 (expected 3.0)
Element 2: 1.0 + 2.0 = 3.0 (expected 3.0)
Element 3: 1.0 + 2.0 = 3.0 (expected 3.0)
Element 4: 1.0 + 2.0 = 3.0 (expected 3.0)
...
Element 999995: 1.0 + 2.0 = 0.0 (expected 3.0)
Element 999996: 1.0 + 2.0 = 0.0 (expected 3.0)
Element 999997: 1.0 + 2.0 = 0.0 (expected 3.0)
Element 999998: 1.0 + 2.0 = 0.0 (expected 3.0)
Element 999999: 1.0 + 2.0 = 0.0 (expected 3.0)
Done!


In [43]:
%%writefile matrix_add.cu
#include <cuda_runtime.h>
#include <iostream>

// Kernel definition for matrix addition
__global__ void matrixAdd(int N, float* A, float* B, float* C) {
    // Calculate row and column indices
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;

    // Check if within bounds
    if (i < N && j < N) {
        // Linear index for 1D array representation
        int idx = j * N + i;
        C[idx] = A[idx] + B[idx];
    }
}

int main() {
    const int N = 1024; // Matrix size (1024x1024)
    size_t size = N * N * sizeof(float);

    // Allocate host memory
    float* h_A = (float*)malloc(size);
    float* h_B = (float*)malloc(size);
    float* h_C = (float*)malloc(size);

    // Initialize host matrices
    for (int i = 0; i < N * N; i++) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }
    printf("h_A[0] = %f \n", h_A[0]);
    printf("h_B[0] = %f \n", h_B[0]);

    // Allocate device memory
    float *d_A, *d_B, *d_C;
    cudaMalloc((void**)&d_A, size);
    cudaMalloc((void**)&d_B, size);
    cudaMalloc((void**)&d_C, size);

    // Copy data from host to device
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    // Define block and grid dimensions
    dim3 threadsPerBlock(16, 16); // 256 threads per block
    dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (N + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Launch kernel
    matrixAdd<<<numBlocks, threadsPerBlock>>>(N, d_A, d_B, d_C);

    // Copy result back to host
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
    printf("h_C[0] = %f \n", h_C[0]);

    // Verify result (optional)
    bool success = true;
    for (int i = 0; i < N * N; i++) {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
            success = false;
            break;
        }
    }
    std::cout << "Result: " << (success ? "PASS" : "FAIL") << std::endl;

    // Cleanup
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}

Overwriting matrix_add.cu


In [44]:
!nvcc -arch=sm_89 matrix_add.cu -o matrix_add

In [45]:
!./matrix_add

h_A[0] = 0.840188 
h_B[0] = 0.394383 
h_C[0] = 1.234571 
Result: PASS


In [46]:
%%writefile matrix_multiply.cu
#include <cuda_runtime.h>
#include <iostream>
#include <cstdlib> // for rand()

__global__ void matrixMulKernel(int M, int N, int K, const float* A, const float* B, float* C) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < M && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < K; ++k) {
            sum += A[row * K + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}

void matrixMultiply(int M, int N, int K, const float* h_A, const float* h_B, float* h_C) {
    size_t sizeA = M * K * sizeof(float);
    size_t sizeB = K * N * sizeof(float);
    size_t sizeC = M * N * sizeof(float);

    float *d_A, *d_B, *d_C;

    // 1. Allocate device memory
    cudaMalloc((void**)&d_A, sizeA);
    cudaMalloc((void**)&d_B, sizeB);
    cudaMalloc((void**)&d_C, sizeC);

    // 2. Copy data to device
    cudaMemcpy(d_A, h_A, sizeA, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, sizeB, cudaMemcpyHostToDevice);

    // 3. Configure and launch kernel
    dim3 blockDim(16, 16);
    dim3 gridDim((N + blockDim.x - 1) / blockDim.x, (M + blockDim.y - 1) / blockDim.y);

    matrixMulKernel<<<gridDim, blockDim>>>(M, N, K, d_A, d_B, d_C);

    // 4. Copy result back
    cudaMemcpy(h_C, d_C, sizeC, cudaMemcpyDeviceToHost);

    // 5. Cleanup
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
}

int main() {
    int M = 512, N = 512, K = 512; // Matrix dimensions

    // Allocate host memory
    size_t sizeA = M * K * sizeof(float);
    size_t sizeB = K * N * sizeof(float);
    size_t sizeC = M * N * sizeof(float);

    float* h_A = (float*)malloc(sizeA);
    float* h_B = (float*)malloc(sizeB);
    float* h_C = (float*)malloc(sizeC);

    // Initialize matrices with random values
    for (int i = 0; i < M * K; i++) h_A[i] = static_cast<float>(rand()) / RAND_MAX;
    for (int i = 0; i < K * N; i++) h_B[i] = static_cast<float>(rand()) / RAND_MAX;

    // Run matrix multiplication
    matrixMultiply(M, N, K, h_A, h_B, h_C);

    // Print a sample result
    std::cout << "C[0][0] = " << h_C[0] << std::endl;
    std::cout << "C[100][100] = " << h_C[100 * N + 100] << std::endl;

    // Cleanup
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}

Overwriting matrix_multiply.cu


In [47]:
!nvcc -arch=sm_89 matrix_multiply.cu -o matrix_multiply

In [48]:
!./matrix_multiply

C[0][0] = 135.775
C[100][100] = 124.573
