In [None]:
!lscpu

In [None]:
!uname -a && cat /etc/*release

Linux 8abd1bed502a 6.1.85+ #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024 x86_64 x86_64 x86_64 GNU/Linux
DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=22.04
DISTRIB_CODENAME=jammy
DISTRIB_DESCRIPTION="Ubuntu 22.04.3 LTS"
PRETTY_NAME="Ubuntu 22.04.3 LTS"
NAME="Ubuntu"
VERSION_ID="22.04"
VERSION="22.04.3 LTS (Jammy Jellyfish)"
VERSION_CODENAME=jammy
ID=ubuntu
ID_LIKE=debian
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
UBUNTU_CODENAME=jammy


In [None]:
!pwd
!ls -la

/content
total 32
drwxr-xr-x 1 root root  4096 Nov 23 10:24 .
drwxr-xr-x 1 root root  4096 Nov 23 10:23 ..
drwxr-xr-x 4 root root  4096 Nov 21 14:25 .config
-rw-r--r-- 1 root root 14196 Nov 23 10:24 devicequery.zip
drwxr-xr-x 1 root root  4096 Nov 21 14:25 sample_data


In [None]:
!nvidia-smi

Sat Nov 23 10:24:38 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   62C    P8              11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
!unzip devicequery.zip

Archive:  devicequery.zip
  inflating: devicequery.cu          
  inflating: helper_cuda.h           
  inflating: helper_string.h         


In [None]:
!nvcc devicequery.cu -o devicequery

In [None]:
!/content/devicequery

/content/devicequery Starting...

 CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "Tesla T4"
  CUDA Driver Version / Runtime Version          12.2 / 12.2
  CUDA Capability Major/Minor version number:    7.5
  Total amount of global memory:                 15102 MBytes (15835660288 bytes)
  (040) Multiprocessors, (064) CUDA Cores/MP:    2560 CUDA Cores
  GPU Max Clock rate:                            1590 MHz (1.59 GHz)
  Memory Clock rate:                             5001 Mhz
  Memory Bus Width:                              256-bit
  L2 Cache Size:                                 4194304 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
  Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
  Total amount of constant memory:               65536 bytes
  Total amount 

## CUDA

In [1]:
code=r'''#include <iostream>
#include <fstream>
#include <complex>
#include <chrono>
#include <omp.h>

#ifdef __NVCC__
#include <cuda.h>
#include <cuda/std/complex>
#endif

// Ranges of the set
// Rectangle of the complex plane that is rendered: real axis MIN_X..MAX_X,
// imaginary axis MIN_Y..MAX_Y. Every setting below can be overridden at
// compile time with -D<NAME>=<value>.
// NOTE: the bounds must be integers, because WIDTH and HEIGHT (derived from
// them) are used as array sizes and as operands of integer / and %.
#ifndef MIN_X
#define MIN_X -2
#endif
#ifndef MAX_X
#define MAX_X 1
#endif
#ifndef MIN_Y
#define MIN_Y -1
#endif
#ifndef MAX_Y
#define MAX_Y 1
#endif

// Image ratio
// Extent of the rendered rectangle along each axis (3 x 2 with the defaults).
#define RATIO_X (MAX_X - MIN_X)
#define RATIO_Y (MAX_Y - MIN_Y)

// Image size
// RESOLUTION is the number of pixels per unit of the complex plane, so the
// default image is 3000 x 2000 pixels.
#ifndef RESOLUTION
#define RESOLUTION 1000
#endif
#define WIDTH (RATIO_X * RESOLUTION)
#define HEIGHT (RATIO_Y * RESOLUTION)

// Distance in the complex plane between two neighbouring pixels. It reduces
// to 1.0 / RESOLUTION and is applied to both the column and the row index.
#define STEP ((double)RATIO_X / WIDTH)

#ifndef DEGREE
#define DEGREE 2        // Degree of the polynomial
#endif
#ifndef ITERATIONS
#define ITERATIONS 1000 // Maximum number of iterations
#endif

// Number of threads per block used for the CUDA kernel launch.
#ifndef BLOCK_SIZE
#define BLOCK_SIZE 16
#endif

using namespace std;

#ifdef __NVCC__

// Computes the escape iteration of one pixel per thread.
//
// image: device buffer of WIDTH * HEIGHT ints in row-major order. On return
//        image[pos] holds the first iteration i (1..ITERATIONS) at which
//        |z| >= 2, or 0 if the orbit never escaped within ITERATIONS steps.
//
// Expects a 1-D launch with at least WIDTH * HEIGHT threads in total.
__global__ void kernel(int* image)
{
  // Flat pixel index handled by this thread.
  const int pos = threadIdx.x + blockIdx.x * blockDim.x;

  // The grid is rounded up to a whole number of blocks, so trailing threads
  // can fall past the image. Valid indices are 0 .. WIDTH*HEIGHT-1, which is
  // why the guard must be >= (a plain > lets pos == WIDTH*HEIGHT write out
  // of bounds).
  if (pos >= WIDTH * HEIGHT) return;

  const int row = pos / WIDTH;
  const int col = pos % WIDTH;
  const cuda::std::complex<double> c(col * STEP + MIN_X, row * STEP + MIN_Y);

  // 0 marks a point that did not escape within ITERATIONS steps.
  int escaped_at = 0;

  // z = z^DEGREE + c
  cuda::std::complex<double> z(0, 0);
  for (int i = 1; i <= ITERATIONS; i++)
  {
      z = cuda::std::pow(z, DEGREE) + c;

      // Once |z| >= 2 the orbit is treated as divergent: record when.
      if (cuda::std::abs(z) >= 2)
      {
          escaped_at = i;
          break;
      }
  }

  // Single store to global memory per pixel.
  image[pos] = escaped_at;
}

#endif

// Renders the set into a HEIGHT x WIDTH image of iteration counts, prints the
// elapsed compute time, and writes the image as comma-separated rows to the
// file named by argv[1].
//
// When compiled with nvcc the image is computed by the CUDA kernel; otherwise
// it is computed on the CPU (in parallel only if OMP is defined).
//
// Returns 0 on success, -1 if no output file was given, -2 if the output file
// could not be opened, and -3 if a CUDA call failed (CUDA builds only).
int main(int argc, char **argv)
{
    int *const image = new int[HEIGHT * WIDTH];

    const auto start = omp_get_wtime();

    #ifdef __NVCC__

    const int size = WIDTH * HEIGHT;
    // cudaMalloc and cudaMemcpy take sizes in BYTES, not in elements. Passing
    // the element count allocated and copied only a quarter of the image.
    const size_t bytes = (size_t)size * sizeof(int);

    int *image_dev = nullptr;

    cudaError_t status = cudaMalloc((void**) &image_dev, bytes);

    if (status == cudaSuccess)
    {
        // No host-to-device upload: the host buffer is uninitialised and the
        // kernel overwrites every pixel anyway.
        const dim3 block_size(BLOCK_SIZE);
        // Round up so that every pixel gets a thread.
        const dim3 grid_size((size - 1) / BLOCK_SIZE + 1);

        kernel<<<grid_size, block_size>>>(image_dev);

        // Report launch-configuration errors (e.g. an invalid block size).
        status = cudaGetLastError();
    }
    if (status == cudaSuccess)
    {
        // Wait for the kernel so that execution errors surface here.
        status = cudaDeviceSynchronize();
    }
    if (status == cudaSuccess)
    {
        status = cudaMemcpy(image, image_dev, bytes, cudaMemcpyDeviceToHost);
    }

    // The device buffer is no longer needed, whether or not the run succeeded.
    cudaFree(image_dev);

    if (status != cudaSuccess)
    {
        cerr << "CUDA error: " << cudaGetErrorString(status) << endl;
        delete[] image;
        return -3;
    }

    #else

    #ifdef OMP
    # pragma omp parallel for default(none) shared(image)
    #endif
    for (int pos = 0; pos < HEIGHT * WIDTH; pos++)
    {
        image[pos] = 0;

        const int row = pos / WIDTH;
        const int col = pos % WIDTH;
        const complex<double> c(col * STEP + MIN_X, row * STEP + MIN_Y);

        // z = z^DEGREE + c
        complex<double> z(0, 0);
        for (int i = 1; i <= ITERATIONS; i++)
        {
            z = pow(z, DEGREE) + c;

            // Once |z| >= 2 the orbit is treated as divergent: record when.
            if (abs(z) >= 2)
            {
                image[pos] = i;
                break;
            }
        }
    }

    #endif
    const auto end = omp_get_wtime();
    cout << "Time elapsed: " << (end-start) << " seconds." << endl;

    // Write the result to a file
    ofstream matrix_out;

    if (argc < 2)
    {
        cout << "Please specify the output file as a parameter." << endl;
        return -1;
    }

    matrix_out.open(argv[1], ios::trunc);
    if (!matrix_out.is_open())
    {
        cout << "Unable to open file." << endl;
        return -2;
    }

    // One text line per image row, values separated by commas, with no
    // trailing comma and no newline after the last row.
    for (int row = 0; row < HEIGHT; row++)
    {
        for (int col = 0; col < WIDTH; col++)
        {
            matrix_out << image[row * WIDTH + col];

            if (col < WIDTH - 1)
                matrix_out << ',';
        }
        if (row < HEIGHT - 1)
            matrix_out << endl;
    }
    matrix_out.close();

    delete[] image; // It's here for coding style, but useless
    return 0;
}
'''
# Write the embedded C++/CUDA source to disk, overwriting any previous copy,
# so that the g++ / nvcc cells below can compile it.
with open('assignment.cpp', 'w') as f:
  f.write(code)


In [None]:
# NOTE(review): -fopenmp alone does not parallelise the loop. The pragma in
# assignment.cpp is guarded by `#ifdef OMP`, and -DOMP is not passed here, so
# this build times the serial CPU version (-fopenmp is still required for
# omp_get_wtime). Add -DOMP to time the multi-threaded version.
!g++ -O3 -ffast-math -fopenmp assignment.cpp -o assignment -DITERATIONS=10000 && /content/assignment /dev/null

Time elapsed: 73.6822 seconds.


In [None]:
!nvcc -x cu assignment.cpp -o assignment -Xcompiler -fopenmp -DITERATIONS=10000 -DBLOCK_SIZE=16 && /content/assignment /dev/null

Time elapsed: 10.522 seconds.


In [2]:
from os import system
from subprocess import check_output,check_call,run,PIPE

# Thread-block sizes to benchmark. BLOCK_SIZE is a compile-time macro in
# assignment.cpp, so every size needs its own build.
N=[32,64,128,256,512,1024]

# Loop-invariant settings.
# SIZE mirrors WIDTH*HEIGHT of the default build (3000 x 2000 pixels) and is
# only used to display the launch configuration; it is not passed to nvcc.
SIZE=6*10**6
ITERATIONS=70000

for n in N:
  block_size=n
  # Ceiling division: number of blocks needed to cover SIZE pixels.
  grid_size=-(-SIZE//block_size)
  print(f"{n=} <<<{grid_size},{block_size}>>>")

  build_cmd=f"nvcc -x cu assignment.cpp -o assignment -Xcompiler -fopenmp -DITERATIONS={ITERATIONS} -DBLOCK_SIZE={block_size}"
  run_cmd="/content/assignment /dev/null"
  # Build and, only if the build succeeds, run; capture both output streams.
  process=run(build_cmd+" && "+run_cmd,shell=True,text=True,stdout=PIPE,stderr=PIPE)

  if process.returncode==0:
    # Success: the program's stdout holds the "Time elapsed" line.
    print(process.stdout)
    continue

  # Build or run failed: show everything that was captured and stop the sweep.
  print(process.stdout,process.stderr)
  break

n=32 <<<187500,32>>>
Time elapsed: 35.8263 seconds.

n=64 <<<93750,64>>>
Time elapsed: 34.8702 seconds.

n=128 <<<46875,128>>>
Time elapsed: 34.8608 seconds.

n=256 <<<23438,256>>>
Time elapsed: 34.9985 seconds.

n=512 <<<11719,512>>>
Time elapsed: 35.2746 seconds.

n=1024 <<<5860,1024>>>
Time elapsed: 35.8884 seconds.

