### Basic Command for accessing the CUDA Version and GPU Information
####Runtime > Change runtime type > Setting the Hardware accelerator to GPU > Save

In [1]:
!ls /usr/local/

bin    cuda	cuda-12.5	  etc	 include  libexec     man  sbin   src
colab  cuda-12	dist_metrics.pxd  games  lib	  LICENSE.md  opt  share


In [2]:
!which nvcc

/usr/local/cuda/bin/nvcc


In [3]:
!nvidia-smi

Tue May  6 04:55:58 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   51C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

### Random Input

 #### Matrix Multiplication using CUDA C

In [4]:
%%writefile matrix_mul.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#define N 16  // You can increase this to 512 or 1024 for bigger matrices

// CUDA Kernel for Matrix Multiplication
__global__ void matrixMulKernel(int *A, int *B, int *C, int width) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < width && col < width) {
        int sum = 0;
        for (int k = 0; k < width; k++) {
            sum += A[row * width + k] * B[k * width + col];
        }
        C[row * width + col] = sum;
    }
}

void fillMatrix(int *mat, int width) {
    for (int i = 0; i < width * width; i++) {
        mat[i] = rand() % 10; // fill with random values 0-9
    }
}

void printMatrix(int *mat, int width) {
    for (int i = 0; i < width; i++) {
        for (int j = 0; j < width; j++) {
            printf("%4d ", mat[i * width + j]);
        }
        printf("\n");
    }
}

int main() {
    int size = N * N * sizeof(int);

    // Allocate memory on host
    int *h_A = (int *)malloc(size);
    int *h_B = (int *)malloc(size);
    int *h_C = (int *)malloc(size);

    // Fill host matrices with random values
    fillMatrix(h_A, N);
    fillMatrix(h_B, N);

    // Allocate memory on device
    int *d_A, *d_B, *d_C;
    cudaMalloc((void **)&d_A, size);
    cudaMalloc((void **)&d_B, size);
    cudaMalloc((void **)&d_C, size);

    // Copy host matrices to device
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    // Define grid and block dimensions
    dim3 dimBlock(16, 16);
    dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x,
                 (N + dimBlock.y - 1) / dimBlock.y);

    // Launch CUDA kernel
    matrixMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, N);

    // Copy result back to host
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    // Print results
    printf("Matrix A:\n");
    printMatrix(h_A, N);
    printf("\nMatrix B:\n");
    printMatrix(h_B, N);
    printf("\nMatrix C (A x B):\n");
    printMatrix(h_C, N);

    // Free memory
    free(h_A); free(h_B); free(h_C);
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);

    return 0;
}

Writing matrix_mul.cu


In [5]:
!nvcc -arch=sm_75 matrix_mul.cu -o matrix_mul

In [6]:
!./matrix_mul

Matrix A:
   3    6    7    5    3    5    6    2    9    1    2    7    0    9    3    6 
   0    6    2    6    1    8    7    9    2    0    2    3    7    5    9    2 
   2    8    9    7    3    6    1    2    9    3    1    9    4    7    8    4 
   5    0    3    6    1    0    6    3    2    0    6    1    5    5    4    7 
   6    5    6    9    3    7    4    5    2    5    4    7    4    4    3    0 
   7    8    6    8    8    4    3    1    4    9    2    0    6    8    9    2 
   6    6    4    9    5    0    4    8    7    1    7    2    7    2    2    6 
   1    0    6    1    5    9    4    9    0    9    1    7    7    1    1    5 
   9    7    7    6    7    3    6    5    6    3    9    4    8    1    2    9 
   3    9    0    8    8    5    0    9    6    3    8    5    6    1    1    5 
   9    8    4    8    1    0    3    0    4    4    4    4    7    6    3    1 
   7    5    9    6    2    1    7    8    5    7    4    1    8    5    9    7 
   5    3    8    

#### Addition of two large vectors

In [7]:
%%writefile vector_add.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#define N 1000000  // 1 million elements

// CUDA Kernel to perform vector addition
__global__ void vectorAdd(int *a, int *b, int *c, int n) {
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

// Fill array with random integers
void fillArray(int *arr, int n) {
    for (int i = 0; i < n; i++) {
        arr[i] = rand() % 100;
    }
}

int main() {
    int size = N * sizeof(int);

    // Allocate memory on host
    int *h_a = (int *)malloc(size);
    int *h_b = (int *)malloc(size);
    int *h_c = (int *)malloc(size);

    // Fill vectors with random data
    fillArray(h_a, N);
    fillArray(h_b, N);

    // Allocate memory on device
    int *d_a, *d_b, *d_c;
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);

    // Copy vectors to device
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    // Launch the kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, N);

    // Copy result back to host
    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    // Print first 10 results for verification
    printf("Vector Addition Result (first 10 elements):\n");
    for (int i = 0; i < 10; i++) {
        printf("%d + %d = %d\n", h_a[i], h_b[i], h_c[i]);
    }

    // Free memory
    free(h_a); free(h_b); free(h_c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);

    return 0;
}

Writing vector_add.cu


In [8]:
!nvcc -arch=sm_75 vector_add.cu -o vector_add

In [9]:
!./vector_add

Vector Addition Result (first 10 elements):
83 + 89 = 172
86 + 63 = 149
77 + 84 = 161
15 + 93 = 108
93 + 81 = 174
35 + 55 = 90
86 + 6 = 92
92 + 93 = 185
49 + 61 = 110
21 + 50 = 71


### User Entered Input

#### Addition of two large vectors

In [10]:
%%writefile vector_add.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

// CUDA Kernel to perform vector addition
__global__ void vectorAdd(int *a, int *b, int *c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}

int main() {
    int n;
    printf("Enter the number of elements (up to 1 million): ");
    scanf("%d", &n);

    if (n <= 0 || n > 1000000) {
        printf("Invalid input. Please enter a number between 1 and 1000000.\n");
        return 1;
    }

    size_t size = n * sizeof(int);
    int *h_a, *h_b, *h_c;
    int *d_a, *d_b, *d_c;

    // Allocate host memory
    h_a = (int *)malloc(size);
    h_b = (int *)malloc(size);
    h_c = (int *)malloc(size);

    // Initialize host arrays with user input
    printf("Enter %d elements for vector A:\n", n);
    for (int i = 0; i < n; i++) {
        scanf("%d", &h_a[i]);
    }

    printf("Enter %d elements for vector B:\n", n);
    for (int i = 0; i < n; i++) {
        scanf("%d", &h_b[i]);
    }

    // Allocate device memory
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);

    // Copy data from host to device
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    // Launch the vector addition kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);

    // Copy the result back to host
    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    // Print the first 10 and last 10 results
    printf("Results (first 10 and last 10):\n");
    for (int i = 0; i < 10; i++) {
        printf("%d + %d = %d\n", h_a[i], h_b[i], h_c[i]);
    }
    printf("...\n");
    for (int i = n - 10; i < n; i++) {
        printf("%d + %d = %d\n", h_a[i], h_b[i], h_c[i]);
    }

    // Free memory
    free(h_a); free(h_b); free(h_c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);

    return 0;
}

Overwriting vector_add.cu


In [11]:
!nvcc -arch=sm_75 vector_add.cu -o vector_add

In [12]:
!./vector_add

Enter the number of elements (up to 1 million): 10
Enter 10 elements for vector A:
1 2 3 4 5 6 7 8 9 10
Enter 10 elements for vector B:
10 2 30 40 5 0 70 90 50 100
Results (first 10 and last 10):
1 + 10 = 11
2 + 2 = 4
3 + 30 = 33
4 + 40 = 44
5 + 5 = 10
6 + 0 = 6
7 + 70 = 77
8 + 90 = 98
9 + 50 = 59
10 + 100 = 110
...
1 + 10 = 11
2 + 2 = 4
3 + 30 = 33
4 + 40 = 44
5 + 5 = 10
6 + 0 = 6
7 + 70 = 77
8 + 90 = 98
9 + 50 = 59
10 + 100 = 110


In [13]:
%%writefile matrix_mul.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

// CUDA Kernel for Matrix Multiplication
__global__ void matrixMulKernel(int *A, int *B, int *C, int width) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < width && col < width) {
        int sum = 0;
        for (int k = 0; k < width; k++) {
            sum += A[row * width + k] * B[k * width + col];
        }
        C[row * width + col] = sum;
    }
}

int main() {
    int width;
    printf("Enter the matrix size (width x width): ");
    scanf("%d", &width);

    if (width <= 0 || width > 1024) {
        printf("Invalid size. Please enter a value between 1 and 1024.\n");
        return 1;
    }

    size_t size = width * width * sizeof(int);
    int *h_A, *h_B, *h_C;
    int *d_A, *d_B, *d_C;

    // Allocate host memory
    h_A = (int *)malloc(size);
    h_B = (int *)malloc(size);
    h_C = (int *)malloc(size);

    // Initialize matrices with user input
    printf("Enter %d x %d elements for matrix A:\n", width, width);
    for (int i = 0; i < width * width; i++) {
        scanf("%d", &h_A[i]);
    }

    printf("Enter %d x %d elements for matrix B:\n", width, width);
    for (int i = 0; i < width * width; i++) {
        scanf("%d", &h_B[i]);
    }

    // Allocate device memory
    cudaMalloc((void **)&d_A, size);
    cudaMalloc((void **)&d_B, size);
    cudaMalloc((void **)&d_C, size);

    // Copy data from host to device
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    // Define block and grid dimensions
    dim3 blockDim(16, 16);
    dim3 gridDim((width + blockDim.x - 1) / blockDim.x,
                 (width + blockDim.y - 1) / blockDim.y);

    // Launch kernel
    matrixMulKernel<<<gridDim, blockDim>>>(d_A, d_B, d_C, width);

    // Copy result back to host
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    // Print a portion of the result matrix
    printf("\nResult Matrix (first 4x4 and last 4x4 blocks):\n");
    printf("First 4x4 block:\n");
    for (int i = 0; i < 4 && i < width; i++) {
        for (int j = 0; j < 4 && j < width; j++) {
            printf("%6d ", h_C[i * width + j]);
        }
        printf("\n");
    }

    if (width > 4) {
        printf("\nLast 4x4 block:\n");
        for (int i = width - 4; i < width; i++) {
            for (int j = width - 4; j < width; j++) {
                printf("%6d ", h_C[i * width + j]);
            }
            printf("\n");
        }
    }

    // Free memory
    free(h_A); free(h_B); free(h_C);
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);

    return 0;
}

Overwriting matrix_mul.cu


In [14]:
!nvcc -arch=sm_75 matrix_mul.cu -o matrix_mul

In [16]:
!./matrix_mul

Enter the matrix size (width x width): 2
Enter 2 x 2 elements for matrix A:
1 2 3 4
Enter 2 x 2 elements for matrix B:
5 6 7 8

Result Matrix (first 4x4 and last 4x4 blocks):
First 4x4 block:
    19     22 
    43     50 


In [None]:
2