<a href="https://colab.research.google.com/github/reddyroh/cuda-training/blob/main/CUDA_Training_HW1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!ls /usr/local

bin    cuda	cuda-11.8  games	       include	lib64	   man	 share
colab  cuda-11	etc	   _gcs_config_ops.so  lib	licensing  sbin  src


In [4]:
!which nvcc

/usr/local/cuda/bin/nvcc


In [5]:
!nvidia-smi

Sat Oct 21 18:47:34 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [8]:
%%writefile hello.cu

#include <stdio.h>

__global__ void hello(){

  printf("Hello from block: %u, thread: %u\n", blockIdx.x, threadIdx.x);
}

int main(){

  hello<<<2, 2>>>();
  cudaDeviceSynchronize();
}

Overwriting hello.cu


In [9]:
!nvcc -o hello hello.cu

In [10]:
!./hello

Hello from block: 0, thread: 0
Hello from block: 0, thread: 1
Hello from block: 1, thread: 0
Hello from block: 1, thread: 1


In [109]:
%%writefile vector_add.cu

#include <stdio.h>
#include <cstdlib>

// error checking macro
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)


const int block_size = 1024;  // CUDA maximum is 1024

__global__ void add(float* a, float* b, float* c, int N){
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index < N) {
    c[index] = a[index] + b[index];
  }
}

void initialize(float* a, int N) {
  for (int i = 0; i < N; i++) {
    a[i] = std::rand()/(float)RAND_MAX;
  }
}

int main() {

  int N = block_size*20;
  int size = N * sizeof(float);
  float *a, *b, *c;
  float *d_a, *d_b, *d_c;

  cudaMalloc((void **)&d_a, size);
  cudaMalloc((void **)&d_b, size);
  cudaMalloc((void **)&d_c, size);

  cudaCheckErrors("cuda Malloc");

  a = (float *)malloc(size);
  initialize(a, N);
  b = (float *)malloc(size);
  initialize(b, N);
  c = (float *)malloc(size);

  cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);

  cudaCheckErrors("cuda memcpy host to device");

  add<<<(N + block_size - 1)/block_size, block_size>>>(d_a, d_b, d_c, N);

  cudaCheckErrors("kernel launch");

  cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);

  cudaCheckErrors("cuda memcpy device to host");

  for (int i = 0; i < 3; i++) {
    printf("a[%u]: %f, b[%u]: %f, c[%u]: %f \n", i, a[i], i, b[i], i, c[i]);
  }
}



Overwriting vector_add.cu


In [110]:
!nvcc -o vector_add vector_add.cu

In [111]:
!./vector_add

a[0]: 0.840188, b[0]: 0.118948, c[0]: 0.959136 
a[1]: 0.394383, b[1]: 0.235666, c[1]: 0.630049 
a[2]: 0.783099, b[2]: 0.196639, c[2]: 0.979739 


In [112]:
%%timeit
!./vector_add

a[0]: 0.840188, b[0]: 0.118948, c[0]: 0.959136 
a[1]: 0.394383, b[1]: 0.235666, c[1]: 0.630049 
a[2]: 0.783099, b[2]: 0.196639, c[2]: 0.979739 
a[0]: 0.840188, b[0]: 0.118948, c[0]: 0.959136 
a[1]: 0.394383, b[1]: 0.235666, c[1]: 0.630049 
a[2]: 0.783099, b[2]: 0.196639, c[2]: 0.979739 
a[0]: 0.840188, b[0]: 0.118948, c[0]: 0.959136 
a[1]: 0.394383, b[1]: 0.235666, c[1]: 0.630049 
a[2]: 0.783099, b[2]: 0.196639, c[2]: 0.979739 
a[0]: 0.840188, b[0]: 0.118948, c[0]: 0.959136 
a[1]: 0.394383, b[1]: 0.235666, c[1]: 0.630049 
a[2]: 0.783099, b[2]: 0.196639, c[2]: 0.979739 
a[0]: 0.840188, b[0]: 0.118948, c[0]: 0.959136 
a[1]: 0.394383, b[1]: 0.235666, c[1]: 0.630049 
a[2]: 0.783099, b[2]: 0.196639, c[2]: 0.979739 
a[0]: 0.840188, b[0]: 0.118948, c[0]: 0.959136 
a[1]: 0.394383, b[1]: 0.235666, c[1]: 0.630049 
a[2]: 0.783099, b[2]: 0.196639, c[2]: 0.979739 
a[0]: 0.840188, b[0]: 0.118948, c[0]: 0.959136 
a[1]: 0.394383, b[1]: 0.235666, c[1]: 0.630049 
a[2]: 0.783099, b[2]: 0.196639, c[2]: 0.

In [102]:
%%writefile vector_add_cpu.cu

#include <stdio.h>
#include <cstdlib>

void initialize(float* a, int N) {
  for (int i = 0; i < N; i++) {
    a[i] = std::rand()/(float)RAND_MAX;
  }
}

int main() {

  int N = block_size*20;
  int size = N * sizeof(float);
  float *a, *b, *c;

  a = (float *)malloc(size);
  initialize(a, N);
  b = (float *)malloc(size);
  initialize(b, N);
  c = (float *)malloc(size);

  for (int i = 0; i < N; i++) {
    c[i] = a[i] + b[i];
  }

  for (int i = 0; i < 3; i++) {
    printf("a[%u]: %f, b[%u]: %f, c[%u]: %f \n", i, a[i], i, b[i], i, c[i]);
  }
}


Overwriting vector_add_cpu.cu


In [103]:
!nvcc -o vector_add_cpu vector_add_cpu.cu

[01m[0m[01mvector_add_cpu.cu(13)[0m: [01;31merror[0m: identifier "[01mblock_size[0m" is undefined

1 error detected in the compilation of "vector_add_cpu.cu".


In [104]:
!./vector_add_cpu

a[0]: 0.840188, b[0]: 0.118948, c[0]: 0.959136 
a[1]: 0.394383, b[1]: 0.235666, c[1]: 0.630049 
a[2]: 0.783099, b[2]: 0.196639, c[2]: 0.979739 


In [105]:
%%timeit
!./vector_add_cpu

a[0]: 0.840188, b[0]: 0.118948, c[0]: 0.959136 
a[1]: 0.394383, b[1]: 0.235666, c[1]: 0.630049 
a[2]: 0.783099, b[2]: 0.196639, c[2]: 0.979739 
a[0]: 0.840188, b[0]: 0.118948, c[0]: 0.959136 
a[1]: 0.394383, b[1]: 0.235666, c[1]: 0.630049 
a[2]: 0.783099, b[2]: 0.196639, c[2]: 0.979739 
a[0]: 0.840188, b[0]: 0.118948, c[0]: 0.959136 
a[1]: 0.394383, b[1]: 0.235666, c[1]: 0.630049 
a[2]: 0.783099, b[2]: 0.196639, c[2]: 0.979739 
a[0]: 0.840188, b[0]: 0.118948, c[0]: 0.959136 
a[1]: 0.394383, b[1]: 0.235666, c[1]: 0.630049 
a[2]: 0.783099, b[2]: 0.196639, c[2]: 0.979739 
a[0]: 0.840188, b[0]: 0.118948, c[0]: 0.959136 
a[1]: 0.394383, b[1]: 0.235666, c[1]: 0.630049 
a[2]: 0.783099, b[2]: 0.196639, c[2]: 0.979739 
a[0]: 0.840188, b[0]: 0.118948, c[0]: 0.959136 
a[1]: 0.394383, b[1]: 0.235666, c[1]: 0.630049 
a[2]: 0.783099, b[2]: 0.196639, c[2]: 0.979739 
a[0]: 0.840188, b[0]: 0.118948, c[0]: 0.959136 
a[1]: 0.394383, b[1]: 0.235666, c[1]: 0.630049 
a[2]: 0.783099, b[2]: 0.196639, c[2]: 0.

In [82]:
# CUDA malloc is likely taking up the runtime https://stackoverflow.com/questions/10378124/cuda-gpu-slower-than-cpu
# Though for most cases we would be more concerned if CUDA memcpy is the bottleneck.

In [119]:
%%writefile matrix_multiply.cu

#include <stdio.h>

// these are just for timing measurments
#include <time.h>

// error checking macro
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)


const int DSIZE = 4096;
const int block_size = 16;  // CUDA maximum is 1024 *total* threads in block
const float A_val = 1.0f;
const float B_val = 2.0f;

// square matrix multiply (naive) kernel: C = A * B
__global__ void mmul(const float *A, const float *B, float *C, int ds) {

  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  int idy = blockIdx.y * blockDim.y + threadIdx.y;
  if (idx < ds && idy < ds) {
    float tmp = 0;
    for (int i = 0; i < ds; i++) {
      tmp += A[idy*ds + i] * B[i*ds + idx];
    }
    C[idy * ds + idx] = tmp;
  }
}

int main(){

  float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;

  // these are just for timing
  clock_t t0, t1, t2;
  double t1sum=0.0;
  double t2sum=0.0;

  // start timing
  t0 = clock();

  h_A = new float[DSIZE*DSIZE];
  h_B = new float[DSIZE*DSIZE];
  h_C = new float[DSIZE*DSIZE];
  for (int i = 0; i < DSIZE*DSIZE; i++){
    h_A[i] = A_val;
    h_B[i] = B_val;
    h_C[i] = 0;}

  // Initialization timing
  t1 = clock();
  t1sum = ((double)(t1-t0))/CLOCKS_PER_SEC;
  printf("Init took %f seconds.  Begin compute\n", t1sum);

  // Allocate device memory and copy input data over to GPU
  cudaMalloc(&d_A, DSIZE*DSIZE*sizeof(float));
  cudaMalloc(&d_B, DSIZE*DSIZE*sizeof(float));
  cudaMalloc(&d_C, DSIZE*DSIZE*sizeof(float));
  cudaCheckErrors("cudaMalloc failure");
  cudaMemcpy(d_A, h_A, DSIZE*DSIZE*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, h_B, DSIZE*DSIZE*sizeof(float), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy H2D failure");

  // Cuda processing sequence step 1 is complete

  // Launch kernel
  dim3 block(block_size, block_size);  // dim3 variable holds 3 dimensions
  dim3 grid((DSIZE+block.x-1)/block.x, (DSIZE+block.y-1)/block.y);
  mmul<<<grid, block>>>(d_A, d_B, d_C, DSIZE);
  cudaCheckErrors("kernel launch failure");

  // Cuda processing sequence step 2 is complete

  // Copy results back to host
  cudaMemcpy(h_C, d_C, DSIZE*DSIZE*sizeof(float), cudaMemcpyDeviceToHost);

  // GPU timing
  t2 = clock();
  t2sum = ((double)(t2-t1))/CLOCKS_PER_SEC;
  printf ("Done. Compute took %f seconds\n", t2sum);

  // Cuda processing sequence step 3 is complete

  // Verify results
  cudaCheckErrors("kernel execution failure or cudaMemcpy H2D failure");
  for (int i = 0; i < DSIZE*DSIZE; i++) if (h_C[i] != A_val*B_val*DSIZE) {printf("mismatch at index %d, was: %f, should be: %f\n", i, h_C[i], A_val*B_val*DSIZE); return -1;}
  printf("Success!\n");

  return 0;
}


Overwriting matrix_multiply.cu


In [120]:
!nvcc -o mmul matrix_multiply.cu

In [121]:
!./mmul

Init took 0.132243 seconds.  Begin compute
Done. Compute took 0.562176 seconds
Success!


In [122]:
!nvidia-smi

Sat Oct 21 20:20:33 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    26W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces