In [1]:
!ls /usr/local

bin    cuda	cuda-12.5	  etc	 include  libexec     man  sbin   src
colab  cuda-12	dist_metrics.pxd  games  lib	  LICENSE.md  opt  share


In [2]:
!which nvcc

/usr/local/cuda/bin/nvcc


In [3]:
!nvidia-smi

Tue Apr 15 12:58:18 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   45C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

**Vector Addition using CUDA**

In [6]:
%%writefile vector_add.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#define N 1000000

// CUDA kernel: element-wise vector addition, C[i] = A[i] + B[i] for 0 <= i < n.
//
// A and B are only read, so they are const-qualified; callers that pass plain
// int* still compile unchanged. Only the .x launch dimensions are used, so a
// 1D grid of 1D blocks is expected.
//
// Each thread starts at its global index and then advances by the total number
// of launched threads, so every element is processed even when the launch has
// fewer threads than n. With a launch that covers n, each thread handles at
// most one element, exactly as before.
__global__ void vectorAdd(const int* A, const int* B, int* C, int n) {
  // A non-positive n means there is nothing to do.
  const size_t count = n > 0 ? (size_t)n : 0;
  // Widen before multiplying so the index arithmetic cannot overflow 32 bits.
  const size_t stride = (size_t)gridDim.x * blockDim.x;
  for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < count; i += stride) {
    C[i] = A[i] + B[i];
  }
}

// Populate the first n entries of arr with pseudo-random values in [0, 99].
// Values are drawn from rand() in index order, so the result depends on the
// current srand() seed. A non-positive n writes nothing.
void fillArray(int *arr, int n){
  int *cursor = arr;
  for (int remaining = n; remaining > 0; --remaining) {
    *cursor++ = rand() % 100;
  }
}

// Abort the program with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Allocate `bytes` of host memory as an int buffer, aborting on failure.
static int *allocHostInts(size_t bytes) {
  int *buffer = (int*)malloc(bytes);
  if (buffer == NULL) {
    fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
    exit(EXIT_FAILURE);
  }
  return buffer;
}

// Adds two N-element random vectors on the GPU and prints the first ten sums.
//
// Steps: allocate and fill the host inputs, copy them to the device, launch
// vectorAdd, copy the result back, print, and release all buffers. Every
// allocation, copy and the kernel launch is checked; on failure the program
// prints a diagnostic to stderr and exits with EXIT_FAILURE.
// Returns 0 on success.
int main() {
  // Byte counts are kept in size_t (what malloc/cudaMalloc take) rather than
  // being narrowed to int.
  const size_t size = (size_t)N * sizeof(int);

  // Allocate memory on host
  int *h_A = allocHostInts(size);
  int *h_B = allocHostInts(size);
  int *h_C = allocHostInts(size);

  // Initialize arrays on host
  fillArray(h_A, N);
  fillArray(h_B, N);

  // Allocate memory on device
  int *d_A = NULL, *d_B = NULL, *d_C = NULL;
  checkCuda(cudaMalloc((void**)&d_A, size), "cudaMalloc d_A");
  checkCuda(cudaMalloc((void**)&d_B, size), "cudaMalloc d_B");
  checkCuda(cudaMalloc((void**)&d_C, size), "cudaMalloc d_C");

  // Copy data from host to device
  checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "copy A to device");
  checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "copy B to device");

  // Launch kernel on GPU; round the block count up so all N elements are covered
  const int threadsPerBlock = 256;
  const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
  vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
  // A launch does not return a status; configuration errors surface here.
  checkCuda(cudaGetLastError(), "vectorAdd launch");

  // Copy result back to host. This blocking copy also waits for the kernel,
  // so any error raised while it executed is reported by this call.
  checkCuda(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "copy C to host");

  // Print the first 10 elements of the result
  printf("Vector Addition Result (first 10 element):\n");
  for (int i = 0; i < 10; i++) {
    printf("%d + %d = %d\n", h_A[i], h_B[i], h_C[i]);
  }

  // Free memory
  checkCuda(cudaFree(d_A), "cudaFree d_A");
  checkCuda(cudaFree(d_B), "cudaFree d_B");
  checkCuda(cudaFree(d_C), "cudaFree d_C");
  free(h_A);
  free(h_B);
  free(h_C);

  return 0;
}

Overwriting vector_add.cu


In [7]:
!nvcc -arch=sm_75 vector_add.cu -o vector_add

In [8]:
!./vector_add

Vector Addition Result (first 10 element):
83 + 89 = 172
86 + 63 = 149
77 + 84 = 161
15 + 93 = 108
93 + 81 = 174
35 + 55 = 90
86 + 6 = 92
92 + 93 = 185
49 + 61 = 110
21 + 50 = 71


**Matrix Multiplication using CUDA C**

In [9]:
%%writefile matrix_mul.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#define N 16

// CUDA kernel: square matrix product C = A x B.
//
// All three matrices are width x width and indexed row-major
// (element (r, c) lives at r * width + c). Each thread derives one (row, col)
// position from the 2D block/thread indices and computes that single output
// element as the dot product of row `row` of A and column `col` of B.
__global__ void matrixMul(int *A, int *B, int *C, int width) {
  const int col = blockIdx.x * blockDim.x + threadIdx.x;
  const int row = blockIdx.y * blockDim.y + threadIdx.y;

  // Threads that fall outside the matrix have no output element to produce.
  if (row >= width || col >= width) {
    return;
  }

  const int *aRow = A + row * width;  // start of row `row` in A
  const int *bCol = B + col;          // top of column `col` in B
  int acc = 0;
  for (int k = 0; k < width; ++k) {
    acc += aRow[k] * bCol[k * width];
  }
  C[row * width + col] = acc;
}

// Fill a width x width matrix, stored as one contiguous array of
// width * width ints, with pseudo-random digits in [0, 9] drawn from rand()
// in storage order.
void fillMatrix(int *matrix, int width) {
  const int total = width * width;
  int idx = 0;
  while (idx < total) {
    matrix[idx] = rand() % 10;
    ++idx;
  }
}

// Print a width x width matrix to stdout, one matrix row per output line.
// Each value is right-aligned in a 4-character field followed by a space.
void printMatrix(int *matrix, int width) {
  // The matrix is read in storage order, so a single cursor visits
  // element (row, col) at offset row * width + col.
  const int *cell = matrix;
  for (int row = 0; row < width; ++row) {
    for (int col = 0; col < width; ++col, ++cell) {
      printf("%4d ", *cell);
    }
    printf("\n");
  }
}

// Abort the program with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Allocate `bytes` of host memory as an int buffer, aborting on failure.
static int *allocHostInts(size_t bytes) {
  int *buffer = (int*)malloc(bytes);
  if (buffer == NULL) {
    fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
    exit(EXIT_FAILURE);
  }
  return buffer;
}

// Multiplies two random N x N matrices on the GPU and prints A, B and A x B.
//
// Steps: allocate and fill the host inputs, copy them to the device, launch
// matrixMul on a 2D grid, copy the product back, print all three matrices,
// and release all buffers. Every allocation, copy and the kernel launch is
// checked; on failure the program prints a diagnostic to stderr and exits
// with EXIT_FAILURE. Returns 0 on success.
int main() {
  // Byte counts are kept in size_t (what malloc/cudaMalloc take) rather than
  // being narrowed to int.
  const size_t size = (size_t)N * N * sizeof(int);

  // Allocate memory on host
  int *h_A = allocHostInts(size);
  int *h_B = allocHostInts(size);
  int *h_C = allocHostInts(size);

  // Initialize matrices on host
  fillMatrix(h_A, N);
  fillMatrix(h_B, N);

  // Allocate memory on device
  int *d_A = NULL, *d_B = NULL, *d_C = NULL;
  checkCuda(cudaMalloc((void**)&d_A, size), "cudaMalloc d_A");
  checkCuda(cudaMalloc((void**)&d_B, size), "cudaMalloc d_B");
  checkCuda(cudaMalloc((void**)&d_C, size), "cudaMalloc d_C");

  // Copy data from host to device
  checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "copy A to device");
  checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "copy B to device");

  // Define grid and block dimensions. Each grid extent is rounded up using
  // the block extent of the SAME axis, so the grid still covers the matrix
  // if the block is ever made non-square.
  dim3 dimBlock(16, 16);
  dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x, (N + dimBlock.y - 1) / dimBlock.y);

  // Launch kernel on GPU
  matrixMul<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, N);
  // A launch does not return a status; configuration errors surface here.
  checkCuda(cudaGetLastError(), "matrixMul launch");

  // Copy result back to host. This blocking copy also waits for the kernel,
  // so any error raised while it executed is reported by this call.
  checkCuda(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "copy C to host");

  // Print results
  printf("Matrix A:\n");
  printMatrix(h_A, N);
  printf("\nMatrix B:\n");
  printMatrix(h_B, N);
  printf("\nMatrix C (A x B):\n");
  printMatrix(h_C, N);

  // Free memory
  checkCuda(cudaFree(d_A), "cudaFree d_A");
  checkCuda(cudaFree(d_B), "cudaFree d_B");
  checkCuda(cudaFree(d_C), "cudaFree d_C");
  free(h_A);
  free(h_B);
  free(h_C);

  return 0;
}

Writing matrix_mul.cu


In [10]:
!nvcc -arch=sm_75 matrix_mul.cu -o matrix_mul

In [11]:
!./matrix_mul

Matrix A:
   3    6    7    5    3    5    6    2    9    1    2    7    0    9    3    6 
   0    6    2    6    1    8    7    9    2    0    2    3    7    5    9    2 
   2    8    9    7    3    6    1    2    9    3    1    9    4    7    8    4 
   5    0    3    6    1    0    6    3    2    0    6    1    5    5    4    7 
   6    5    6    9    3    7    4    5    2    5    4    7    4    4    3    0 
   7    8    6    8    8    4    3    1    4    9    2    0    6    8    9    2 
   6    6    4    9    5    0    4    8    7    1    7    2    7    2    2    6 
   1    0    6    1    5    9    4    9    0    9    1    7    7    1    1    5 
   9    7    7    6    7    3    6    5    6    3    9    4    8    1    2    9 
   3    9    0    8    8    5    0    9    6    3    8    5    6    1    1    5 
   9    8    4    8    1    0    3    0    4    4    4    4    7    6    3    1 
   7    5    9    6    2    1    7    8    5    7    4    1    8    5    9    7 
   5    3    8    