Ref: https://colab.research.google.com/drive/1RdttaoQSBXNgjTBUYOPnxBpo9VzZhSxQ

In [12]:
!nvidia-smi

Mon Apr  7 13:07:41 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Compiling and running OpenMP, MPI

In [13]:
%%sh
cat > openmp.c << EOF
#include <stdio.h>
int main()
{
  #pragma omp parallel
  printf("Hello, world!\n");
}
EOF
ls -l

total 1020
-rwxr-xr-x 1 root root   17016 Apr  7 12:44 a.out
-rw-r--r-- 1 root root     374 Apr  7 12:44 hello.cu
-rwxr-xr-x 1 root root 1003240 Apr  7 12:44 hello_cuda
-rw-r--r-- 1 root root     360 Apr  7 12:44 mpi.c
-rw-r--r-- 1 root root     401 Apr  7 12:44 mpi.f90
-rw-r--r-- 1 root root      86 Apr  7 13:07 openmp.c
drwxr-xr-x 1 root root    4096 Apr  3 13:37 sample_data


In [14]:
!gcc -fopenmp openmp.c && ./a.out

Hello, world!
Hello, world!


In [15]:
%%sh
cat > mpi.c << EOF
#include <mpi.h>
#include <stdio.h>
int main(int argc, char** argv) {
    MPI_Init(NULL, NULL);

    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    printf("Hello, world from rank %d out of %d processors\n", world_rank, world_size);

    MPI_Finalize();
    return 0;
}
EOF
ls -l

total 1016
-rwxr-xr-x 1 root root   16280 Apr  7 13:07 a.out
-rw-r--r-- 1 root root     374 Apr  7 12:44 hello.cu
-rwxr-xr-x 1 root root 1003240 Apr  7 12:44 hello_cuda
-rw-r--r-- 1 root root     360 Apr  7 13:07 mpi.c
-rw-r--r-- 1 root root     401 Apr  7 12:44 mpi.f90
-rw-r--r-- 1 root root      86 Apr  7 13:07 openmp.c
drwxr-xr-x 1 root root    4096 Apr  3 13:37 sample_data


In [16]:
!mpicc mpi.c && mpirun -n 8 --allow-run-as-root --oversubscribe ./a.out

Hello, world from rank 5 out of 8 processors
Hello, world from rank 2 out of 8 processors
Hello, world from rank 1 out of 8 processors
Hello, world from rank 4 out of 8 processors
Hello, world from rank 3 out of 8 processors
Hello, world from rank 0 out of 8 processors
Hello, world from rank 7 out of 8 processors
Hello, world from rank 6 out of 8 processors


In [17]:
%%sh
cat > mpi.f90 << EOF
program hello
   use mpi
   integer rank, size, ierror, strlen, status(MPI_STATUS_SIZE)
   character(len=MPI_MAX_PROCESSOR_NAME) :: hostname

   call MPI_INIT(ierror)
   call MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierror)
   call MPI_COMM_RANK(MPI_COMM_WORLD, rank, ierror)
   call MPI_GET_PROCESSOR_NAME( hostname, strlen, ierror )
   print*, trim(hostname), rank, size
   call MPI_FINALIZE(ierror)
end
EOF
mpif90 mpi.f90 && mpirun -n 8 --allow-run-as-root --oversubscribe a.out

 a29ab5129149           7           8
 a29ab5129149           0           8
 a29ab5129149           1           8
 a29ab5129149           2           8
 a29ab5129149           3           8
 a29ab5129149           4           8
 a29ab5129149           5           8
 a29ab5129149           6           8


## GPU programming in Python and CUDA

In [18]:
!pip -q install gputil psutil humanize
# Import packages
import humanize,psutil,GPUtil

# Define function
def mem_report():
  print("CPU RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ))

  GPUs = GPUtil.getGPUs()
  for i, gpu in enumerate(GPUs):
    print('GPU {:d} ... Mem Free: {:.0f}MB / {:.0f}MB | Utilization {:3.0f}%'
    .format(i, gpu.memoryFree, gpu.memoryTotal, gpu.memoryUtil*100))

# Execute function
mem_report()

CPU RAM Free: 12.5 GB
GPU 0 ... Mem Free: 15095MB / 15360MB | Utilization   0%


In [19]:
%%sh
cat > hello.cu << EOF
#include <stdio.h>

#define NUM_BLOCKS 16
#define BLOCK_WIDTH 1

__global__ void hello()
{
    printf("Hello world! I'm a thread in block %d\n", blockIdx.x);
}

int main(int argc,char **argv)
{
    // launch the kernel
    hello<<<NUM_BLOCKS, BLOCK_WIDTH>>>();

    // force the printf()s to flush
    cudaDeviceSynchronize();

    printf("That's all!\n");

    return 0;
}
EOF
nvcc -o hello_cuda hello.cu && ./hello_cuda

That's all!
