## Setup CUDA
---

In [1]:
import os

# Add the directory containing the executable to the PATH
os.environ["PATH"] += os.pathsep + "/usr/local/cuda/bin"

# Check if the directory is added to the PATH
print(os.environ["PATH"])

/opt/tljh/user/bin:/bin:/usr/bin:/usr/local/cuda/bin


## Pure C implementation
---

In [2]:
%%writefile C_max.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define MIN_VAL -20
#define MAX_VAL 20
#define RUN_COUNT 30

#define print_array(type, n, arr, fmt) \
    do { \
        size_t i; \
        type *array = (type *)arr; \
        printf("["); \
        for (i = 0; i < 5; i++) { \
            printf(fmt, array[i]); \
        } \
        printf("..., "); \
        for (i = n - 5; i < n; i++) { \
            printf(fmt, array[i]); \
        } \
        printf("\b\b]\n"); \
    } while (0)

// C implementation
void C_max(size_t n, float A[], float B[], float C[], int idx[]) {
    for (size_t i = 0; i < n; i++) {
        if (A[i] >= B[i]) {
            C[i] = A[i];
            idx[i] = 0;
        }
        else {
            C[i] = B[i];
            idx[i] = 1;
        }
    }
}

// Dynamically allocates an array of size n and fills it with random float values
// WARNING: Pointer returned by this function must be freed!
float* malloc_rand(size_t n) {
    float* array = (float*) malloc(n * sizeof(float));

    // Gracefully exit in case malloc() fails
    if (array == NULL)
        return NULL;

    for (int i = 0; i < n; i++) {
        // Generates a random floating-point value between MIN_VAL and MAX_VAL
        array[i] = MIN_VAL + ((float) rand() / (float) RAND_MAX) * (MAX_VAL - MIN_VAL);
    }

    return array;
}

int main() {
    srand(339);
    size_t size = 1 << 28;

    float* A = malloc_rand(size);
    float* B = malloc_rand(size);
    float* C = (float*) malloc(size * sizeof(float));
    int* idx = (int*) malloc(size * sizeof(float));

    // Print input arrays
    
    printf("A = ");
    print_array(float, size, A, "%.2f, ");
    printf("B = ");
    print_array(float, size, B, "%.2f, ");
    
    printf("\nExecuting...\n\n");
    
    // Time execution
        
    clock_t start, end;
    long long int total_execution_time = 0;
    double avg_execution_time;

    for (int i = 0; i < RUN_COUNT; i++) {
        start = clock();
        C_max(size, A, B, C, idx);
        end = clock();
        total_execution_time += ((double) (end - start)) * 1E3 / CLOCKS_PER_SEC;
    }

    avg_execution_time = 1.0 * total_execution_time / RUN_COUNT;

    printf("\n===== size = %zu =====\n", size);

    printf("Average execution time over %d runs: %.6f ms\n",
        RUN_COUNT, avg_execution_time);
    
    // Print results
    
    printf("C = ");
    print_array(float, size, C, "%.2f, ");

    printf("idx = ");
    print_array(int, size, idx, "%d, ");

    free(A);
    free(B);
    free(C);
    free(idx);
        
    return 0;
}

Overwriting C_max.c


In [3]:
%%bash
gcc C_max.c -o C_max

In [4]:
%%bash
./C_max

A = [18.88, 0.47, 3.11, 19.78, 4.43, ..., 1.76, 8.19, -4.65, 1.50, -4.35]
B = [17.55, -10.48, 13.66, -1.62, -6.12, ..., 12.83, -12.63, 17.10, 15.21, -0.77]

Executing...


===== size = 268435456 =====
Average execution time over 30 runs: 5523.866667 ms
C = [18.88, 0.47, 13.66, 19.78, 4.43, ..., 12.83, 8.19, 17.10, 15.21, -0.77]
idx = [0, 0, 1, 0, 0, ..., 1, 0, 1, 1, 1]


## Grid-stride CUDA
---

In [5]:
%%writefile CUDA_max.cu
#include<stdio.h>
#include <stdlib.h>

#define MIN_VAL -20
#define MAX_VAL 20

__global__
void cuda_max(size_t n, float* max_arr, float* A, float* B, int* idx) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) {
        max_arr[i] = (A[i] > B[i]) ? A[i] : B[i];
        idx[i] = (A[i] >= B[i]) ? 0 : 1;
    }
}

int main() {
    const size_t ARRAY_SIZE = 1 << 28;
    const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
    
    // Execute 30 times
    const size_t loope = 30;
    
    // Declare arrays
    float *A, *B, *max_arr;
    int* idx;

    cudaMallocManaged(&A, ARRAY_BYTES);
    cudaMallocManaged(&B, ARRAY_BYTES);
    cudaMallocManaged(&max_arr, ARRAY_BYTES);
    cudaMallocManaged(&idx, ARRAY_BYTES);
    
    // Init array
    for (size_t i = 0; i < ARRAY_SIZE; i++) {
        A[i] =
            MIN_VAL + ((float)rand() / (float)RAND_MAX) * (MAX_VAL - MIN_VAL);
        B[i] =
            MIN_VAL + ((float)rand() / (float)RAND_MAX) * (MAX_VAL - MIN_VAL);
    }
    
    // Setup CUDA kernel
    size_t numThreads = 1024;
    size_t numBlocks = (ARRAY_SIZE + numThreads - 1) / numThreads;
    
    printf("numElements = %lu\n", ARRAY_SIZE);
    printf("numBlocks = %lu, numThreads = %lu \n", numBlocks, numThreads);

    // Execute kernel
    for (size_t i = 0; i < loope; i++)
        cuda_max<<<numBlocks, numThreads>>>(ARRAY_SIZE, max_arr, A, B, idx);
    
    // Barrier
    cudaDeviceSynchronize();
    
    // Error checking
    size_t err_count = 0;
    for (size_t i = 0; i < ARRAY_SIZE; i++) {
        if (((A[i] > B[i]) ? A[i] : B[i]) != max_arr[i]) err_count++;
    }
    printf("Error count(CUDA program): %zu\n", err_count);
    
    // Free memory
    cudaFree(A);
    cudaFree(B);
    cudaFree(max_arr);
    cudaFree(idx);
    
    return 0;
}

Overwriting CUDA_max.cu


In [6]:
%%bash
nvcc CUDA_max.cu -o CUDA_max -Wno-deprecated-gpu-targets

In [7]:
%%bash
nvprof ./CUDA_max

==1064265== NVPROF is profiling process 1064265, command: ./CUDA_max


numElements = 268435456
numBlocks = 262144, numThreads = 1024 
Error count(CUDA program): 0


==1064265== Profiling application: ./CUDA_max
==1064265== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  12.8406s        30  428.02ms  20.567ms  12.2354s  cuda_max(unsigned long, float*, float*, float*, int*)
      API calls:   87.09%  12.8434s         1  12.8434s  12.8434s  12.8434s  cudaDeviceSynchronize
                   10.27%  1.51448s         4  378.62ms  89.217us  1.51297s  cudaMallocManaged
                    2.56%  377.99ms         4  94.496ms  33.575ms  117.20ms  cudaFree
                    0.07%  9.8796ms        30  329.32us  13.221us  9.0893ms  cudaLaunchKernel
                    0.00%  684.64us       114  6.0050us     194ns  314.68us  cuDeviceGetAttribute
                    0.00%  229.76us         1  229.76us  229.76us  229.76us  cuDeviceGetName
                    0.00%  64.804us         1  64.804us  64.804us  64.804us  cuDeviceTotalMem
                    0.00%  40.534us         1  40.53

In [8]:
%%bash
nsys profile  -o CUDA_max ./CUDA_max

         This may increase runtime overhead and the likelihood of false
         dependencies across CUDA Streams. If you wish to avoid this, please
         disable the feature with --cuda-event-trace=false.
Try the 'nsys status --environment' command to learn more.

Try the 'nsys status --environment' command to learn more.



numElements = 268435456
numBlocks = 262144, numThreads = 1024 
Error count(CUDA program): 0
Collecting data...
Generating '/tmp/nsys-report-c3f1.qdstrm'
Generated:
	/home/jupyter-lorenz_marqueses@d-08e53/CUDA_max.nsys-rep


## Grid-stride CUDA (prefetch)
---

In [9]:
%%writefile CUDA_max2.cu
#include <stdio.h>
#include <stdlib.h>

#define MIN_VAL -20
#define MAX_VAL 20

__global__
void cuda_max(size_t n, float *max_arr, float *A, float *B, int *idx) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) {
        max_arr[i] = (A[i] > B[i]) ? A[i] : B[i];
        idx[i] = (A[i] >= B[i]) ? 0 : 1;
    }
}

int main() {
    const size_t ARRAY_SIZE = 1 << 28;
    const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // Execute 30 times
    const size_t loope = 30;
    
    // Declare array
    float *A, *B, *max_arr;
    int *idx;

    cudaMallocManaged(&A, ARRAY_BYTES);
    cudaMallocManaged(&B, ARRAY_BYTES);
    cudaMallocManaged(&max_arr, ARRAY_BYTES);
    cudaMallocManaged(&idx, ARRAY_BYTES);
    
    // Get GPU ID
    int device = -1;
    cudaGetDevice(&device);
    
    // Init array
    for (size_t i = 0; i < ARRAY_SIZE; i++) {
        A[i] =
            MIN_VAL + ((float)rand() / (float)RAND_MAX) * (MAX_VAL - MIN_VAL);
        B[i] =
            MIN_VAL + ((float)rand() / (float)RAND_MAX) * (MAX_VAL - MIN_VAL);
    }

    // Prefetch CPU -> GPU
    cudaMemPrefetchAsync(A, ARRAY_BYTES, device, NULL);
    cudaMemPrefetchAsync(B, ARRAY_BYTES, device, NULL);
    
    // Setup CUDA kernel
    size_t numThreads = 1024;
    size_t numBlocks = (ARRAY_SIZE + numThreads - 1) / numThreads;
    printf("*** function = MAX\n");
    printf("numElements = %lu\n", ARRAY_SIZE);
    printf("numBlocks = %lu, numThreads = %lu \n", numBlocks, numThreads);
    for (size_t i = 0; i < loope; i++)
        cuda_max<<<numBlocks, numThreads>>>(ARRAY_SIZE, max_arr, A, B, idx);
    
    // Barrier
    cudaDeviceSynchronize();

    // Prefetch GPU -> CPU
    cudaMemPrefetchAsync(max_arr, ARRAY_BYTES, cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(A, ARRAY_BYTES, cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(B, ARRAY_BYTES, cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(idx, ARRAY_BYTES, cudaCpuDeviceId, NULL);

    // Error checking
    size_t err_count = 0;
    for (size_t i = 0; i < ARRAY_SIZE; i++) {
        if (((A[i] > B[i]) ? A[i] : B[i]) != max_arr[i]) err_count++;
    }
    printf("Error count (CUDA program): %zu\n", err_count);
    
    // Free memory
    cudaFree(A);
    cudaFree(B);
    cudaFree(max_arr);
    cudaFree(idx);
    
    return 0;
}

Overwriting CUDA_max2.cu


In [10]:
%%bash
nvcc CUDA_max2.cu -o CUDA_max2 -Wno-deprecated-gpu-targets

In [11]:
%%bash
nvprof ./CUDA_max2

==1064483== NVPROF is profiling process 1064483, command: ./CUDA_max2


*** function = MAX
numElements = 268435456
numBlocks = 262144, numThreads = 1024 
Error count (CUDA program): 0


==1064483== Profiling application: ./CUDA_max2
==1064483== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  5.67483s        30  189.16ms  20.521ms  5.07373s  cuda_max(unsigned long, float*, float*, float*, int*)
      API calls:   52.25%  5.67741s         1  5.67741s  5.67741s  5.67741s  cudaDeviceSynchronize
                   21.83%  2.37187s         6  395.31ms  1.9833ms  757.97ms  cudaMemPrefetchAsync
                   18.30%  1.98823s         4  497.06ms  61.604us  1.98743s  cudaMallocManaged
                    5.21%  566.65ms         4  141.66ms  100.10ms  165.56ms  cudaFree
                    2.41%  261.54ms        30  8.7178ms  11.757us  260.68ms  cudaLaunchKernel
                    0.01%  796.19us       114  6.9840us     149ns  339.10us  cuDeviceGetAttribute
                    0.00%  306.64us         1  306.64us  306.64us  306.64us  cuDeviceGetName
                    0.00%  36.056us         1  

In [12]:
%%bash
nsys profile  -o CUDA_max2 ./CUDA_max2

         This may increase runtime overhead and the likelihood of false
         dependencies across CUDA Streams. If you wish to avoid this, please
         disable the feature with --cuda-event-trace=false.
Try the 'nsys status --environment' command to learn more.

Try the 'nsys status --environment' command to learn more.



*** function = MAX
numElements = 268435456
numBlocks = 262144, numThreads = 1024 
Error count (CUDA program): 0
Collecting data...
Generating '/tmp/nsys-report-6789.qdstrm'
Generated:
	/home/jupyter-lorenz_marqueses@d-08e53/CUDA_max2.nsys-rep


## Grid-stride CUDA (prefetch + page creation)
---

In [13]:
%%writefile CUDA_max3.cu
#include <stdio.h>
#include <stdlib.h>

#define MIN_VAL -20
#define MAX_VAL 20

__global__
void cuda_max(size_t n, float *max_arr, float *A, float *B, int *idx) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) {
        max_arr[i] = (A[i] > B[i]) ? A[i] : B[i];
        idx[i] = (A[i] >= B[i]) ? 0 : 1;
    }
}

int main() {
    const size_t ARRAY_SIZE = 1 << 28;
    const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
    
    // Execute 30 times
    const size_t loope = 30;
    
    // Declare array
    float *A, *B, *max_arr;
    int *idx;
    cudaMallocManaged(&A, ARRAY_BYTES);
    cudaMallocManaged(&B, ARRAY_BYTES);
    cudaMallocManaged(&max_arr, ARRAY_BYTES);
    cudaMallocManaged(&idx, ARRAY_BYTES);

    // Get gpu id
    int device = -1;
    cudaGetDevice(&device);
    //"prefetch data" to create CPU page memory
    // TODO: Tama ba to can u check
    cudaMemPrefetchAsync(A, ARRAY_BYTES, cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(B, ARRAY_BYTES, cudaCpuDeviceId, NULL);
    //"prefetch data" to create GPU page memory
    cudaMemPrefetchAsync(max_arr, ARRAY_BYTES, device, NULL);
    cudaMemPrefetchAsync(idx, ARRAY_BYTES, device, NULL);

    // *** init array
    for (size_t i = 0; i < ARRAY_SIZE; i++) {
        A[i] =
            MIN_VAL + ((float)rand() / (float)RAND_MAX) * (MAX_VAL - MIN_VAL);
        B[i] =
            MIN_VAL + ((float)rand() / (float)RAND_MAX) * (MAX_VAL - MIN_VAL);
    }

    // Prefetch
    cudaMemPrefetchAsync(A, ARRAY_BYTES, device, NULL);
    cudaMemPrefetchAsync(B, ARRAY_BYTES, device, NULL);
    // *** setup CUDA kernel
    size_t numThreads = 1024;
    size_t numBlocks = (ARRAY_SIZE + numThreads - 1) / numThreads;
    printf("*** function = MAX\n");
    printf("numElements = %lu\n", ARRAY_SIZE);
    printf("numBlocks = %lu, numThreads = %lu \n", numBlocks, numThreads);
    for (size_t i = 0; i < loope; i++)
        cuda_max<<<numBlocks, numThreads>>>(ARRAY_SIZE, max_arr, A, B, idx);
    // barrier
    cudaDeviceSynchronize();

    cudaMemPrefetchAsync(max_arr, ARRAY_BYTES, cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(A, ARRAY_BYTES, cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(B, ARRAY_BYTES, cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(idx, ARRAY_BYTES, cudaCpuDeviceId, NULL);

    // error checking
    size_t err_count = 0;
    for (size_t i = 0; i < ARRAY_SIZE; i++) {
        if (((A[i] > B[i]) ? A[i] : B[i]) != max_arr[i]) err_count++;
    }
    printf("Error count(CUDA program): %zu\n", err_count);
    // free memory
    cudaFree(A);
    cudaFree(B);
    cudaFree(max_arr);
    cudaFree(idx);
    return 0;
}

Overwriting CUDA_max3.cu


In [14]:
%%bash
nvcc CUDA_max3.cu -o CUDA_max3 -Wno-deprecated-gpu-targets

In [15]:
%%bash
nvprof ./CUDA_max3

==1064669== NVPROF is profiling process 1064669, command: ./CUDA_max3


*** function = MAX
numElements = 268435456
numBlocks = 262144, numThreads = 1024 
Error count(CUDA program): 0


==1064669== Profiling application: ./CUDA_max3
==1064669== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  641.54ms        30  21.385ms  20.674ms  26.005ms  cuda_max(unsigned long, float*, float*, float*, int*)
      API calls:   62.39%  5.97806s        10  597.81ms  5.1522ms  1.55978s  cudaMemPrefetchAsync
                   24.59%  2.35606s         4  589.01ms  308.77us  2.35324s  cudaMallocManaged
                    6.72%  644.32ms         1  644.32ms  644.32ms  644.32ms  cudaDeviceSynchronize
                    3.44%  329.32ms         4  82.331ms  75.808ms  86.327ms  cudaFree
                    2.84%  272.14ms        30  9.0712ms  7.7460us  271.59ms  cudaLaunchKernel
                    0.01%  1.2830ms       114  11.254us     127ns  435.49us  cuDeviceGetAttribute
                    0.00%  339.77us         1  339.77us  339.77us  339.77us  cuDeviceGetName
                    0.00%  151.96us         1  

In [16]:
%%bash
nsys profile  -o CUDA_max3 ./CUDA_max3

         This may increase runtime overhead and the likelihood of false
         dependencies across CUDA Streams. If you wish to avoid this, please
         disable the feature with --cuda-event-trace=false.
Try the 'nsys status --environment' command to learn more.

Try the 'nsys status --environment' command to learn more.



*** function = MAX
numElements = 268435456
numBlocks = 262144, numThreads = 1024 
Error count(CUDA program): 0
Collecting data...
Generating '/tmp/nsys-report-7cdc.qdstrm'
Generated:
	/home/jupyter-lorenz_marqueses@d-08e53/CUDA_max3.nsys-rep


## Grid-stride CUDA (prefetching + page creation + mem advise)
---

In [17]:
%%writefile CUDA_max4.cu
#include <stdio.h>
#include <stdlib.h>

#define MIN_VAL -20
#define MAX_VAL 20

__global__
void cuda_max(size_t n, float *max_arr, float *A, float *B, int *idx) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) {
        max_arr[i] = (A[i] > B[i]) ? A[i] : B[i];
        idx[i] = (A[i] >= B[i]) ? 0 : 1;
    }
}

int main() {
    const size_t ARRAY_SIZE = 1 << 28;
    const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
    // number of times the program is to be executed
    const size_t loope = 30;
    // declare array
    float *A, *B, *max_arr;
    int *idx;
    cudaMallocManaged(&A, ARRAY_BYTES);
    cudaMallocManaged(&B, ARRAY_BYTES);
    cudaMallocManaged(&max_arr, ARRAY_BYTES);
    cudaMallocManaged(&idx, ARRAY_BYTES);

    // get gpu id
    int device = -1;
    cudaGetDevice(&device);
    // mem advise
    cudaMemAdvise(A, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation,
                  cudaCpuDeviceId);
    cudaMemAdvise(A, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId);
    cudaMemAdvise(B, ARRAY_BYTES, cudaMemAdviseSetPreferredLocation,
                  cudaCpuDeviceId);
    cudaMemAdvise(B, ARRAY_BYTES, cudaMemAdviseSetReadMostly, cudaCpuDeviceId);

    //"prefetch data" to create CPU page memory
    // TODO: See previous cell
    cudaMemPrefetchAsync(A, ARRAY_BYTES, cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(B, ARRAY_BYTES, cudaCpuDeviceId, NULL);
    //"prefetch data" to create GPU page memory
    cudaMemPrefetchAsync(max_arr, ARRAY_BYTES, device, NULL);
    cudaMemPrefetchAsync(idx, ARRAY_BYTES, device, NULL);

    // *** init array
    for (size_t i = 0; i < ARRAY_SIZE; i++) {
        A[i] =
            MIN_VAL + ((float)rand() / (float)RAND_MAX) * (MAX_VAL - MIN_VAL);
        B[i] =
            MIN_VAL + ((float)rand() / (float)RAND_MAX) * (MAX_VAL - MIN_VAL);
    }

    cudaMemPrefetchAsync(A, ARRAY_BYTES, device, NULL);
    cudaMemPrefetchAsync(B, ARRAY_BYTES, device, NULL);

    // *** setup CUDA kernel
    size_t numThreads = 1024;
    size_t numBlocks = (ARRAY_SIZE + numThreads - 1) / numThreads;
    printf("*** function = MAX\n");
    printf("numElements = %lu\n", ARRAY_SIZE);
    printf("numBlocks = %lu, numThreads = %lu \n", numBlocks, numThreads);
    for (size_t i = 0; i < loope; i++)
        cuda_max<<<numBlocks, numThreads>>>(ARRAY_SIZE, max_arr, A, B, idx);
    // barrier
    cudaDeviceSynchronize();

    cudaMemPrefetchAsync(max_arr, ARRAY_BYTES, cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(A, ARRAY_BYTES, cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(B, ARRAY_BYTES, cudaCpuDeviceId, NULL);
    cudaMemPrefetchAsync(idx, ARRAY_BYTES, cudaCpuDeviceId, NULL);

    // error checking
    size_t err_count = 0;
    for (size_t i = 0; i < ARRAY_SIZE; i++) {
        if (((A[i] > B[i]) ? A[i] : B[i]) != max_arr[i]) err_count++;
    }
    printf("Error count(CUDA program): %zu\n", err_count);
    // free memory
    cudaFree(A);
    cudaFree(B);
    cudaFree(max_arr);
    cudaFree(idx);
    return 0;
}

Overwriting CUDA_max4.cu


In [18]:
%%bash
nvcc CUDA_max4.cu -o CUDA_max4 -Wno-deprecated-gpu-targets

In [19]:
%%bash
nvprof ./CUDA_max4

==1064832== NVPROF is profiling process 1064832, command: ./CUDA_max4


*** function = MAX
numElements = 268435456
numBlocks = 262144, numThreads = 1024 
Error count(CUDA program): 0


==1064832== Profiling application: ./CUDA_max4
==1064832== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  634.00ms        30  21.133ms  15.910ms  26.025ms  cuda_max(unsigned long, float*, float*, float*, int*)
      API calls:   56.98%  4.23570s        10  423.57ms  5.9735ms  1.15317s  cudaMemPrefetchAsync
                   28.50%  2.11811s         4  529.53ms  225.45us  2.11498s  cudaMallocManaged
                    8.57%  637.17ms         1  637.17ms  637.17ms  637.17ms  cudaDeviceSynchronize
                    5.82%  432.88ms         4  108.22ms  82.158ms  118.31ms  cudaFree
                    0.10%  7.3727ms        30  245.76us  9.5710us  6.7414ms  cudaLaunchKernel
                    0.01%  792.26us       114  6.9490us     108ns  328.18us  cuDeviceGetAttribute
                    0.01%  588.26us         4  147.07us  9.9940us  478.61us  cudaMemAdvise
                    0.00%  207.10us         1  20

In [20]:
%%bash
nsys profile  -o CUDA_max4 ./CUDA_max4

         This may increase runtime overhead and the likelihood of false
         dependencies across CUDA Streams. If you wish to avoid this, please
         disable the feature with --cuda-event-trace=false.
Try the 'nsys status --environment' command to learn more.

Try the 'nsys status --environment' command to learn more.



*** function = MAX
numElements = 268435456
numBlocks = 262144, numThreads = 1024 
Error count(CUDA program): 0


Failed to create '/home/jupyter-lorenz_marqueses@d-08e53/CUDA_max4.nsys-rep': File exists.
Use `--force-overwrite true` to overwrite existing files.


Collecting data...
Generating '/tmp/nsys-report-ce98.qdstrm'
Generated:
	/tmp/nsys-report-dca0.nsys-rep


## CUDA Classic MEMCPY
---

In [21]:
%%writefile CUDA_max5.cu
#include <stdio.h>
#include <stdlib.h>

#define MIN_VAL -20
#define MAX_VAL 20

__global__
void cuda_max(size_t n, float *max_arr, float *A, float *B, int *idx) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) {
        max_arr[i] = (A[i] > B[i]) ? A[i] : B[i];
        idx[i] = (A[i] >= B[i]) ? 0 : 1;
    }
}

int main() {
    const size_t ARRAY_SIZE = 1 << 28;
    const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
    // number of times the program is to be executed
    const size_t loope = 30;
    // declare array
    float *A_host, *B_host, *max_arr_host;
    float *A_device, *B_device, *max_arr_device;
    int *idx_host, *idx_device;

    // get gpu id
    int device = -1;
    cudaGetDevice(&device);

    // malloc host
    A_host = (float*)malloc(ARRAY_BYTES);
    B_host = (float*)malloc(ARRAY_BYTES);
    max_arr_host = (float*)malloc(ARRAY_BYTES);
    idx_host = (int*)malloc(ARRAY_BYTES);

    // malloc device
    cudaMalloc((void**)&A_device, ARRAY_BYTES);
    cudaMalloc((void**)&B_device, ARRAY_BYTES);
    cudaMalloc((void**)&max_arr_device, ARRAY_BYTES);
    cudaMalloc((void**)&idx_device, ARRAY_BYTES);
    
    // *** init array host
    for (size_t i = 0; i < ARRAY_SIZE; i++) {
        A_host[i] =
            MIN_VAL + ((float)rand() / (float)RAND_MAX) * (MAX_VAL - MIN_VAL);
        B_host[i] =
            MIN_VAL + ((float)rand() / (float)RAND_MAX) * (MAX_VAL - MIN_VAL);
    }

    cudaMemcpy(A_device, A_host, ARRAY_BYTES, cudaMemcpyHostToDevice);
    cudaMemcpy(B_device, B_host, ARRAY_BYTES, cudaMemcpyHostToDevice);


    // *** setup CUDA kernel
    size_t numThreads = 1024;
    size_t numBlocks = (ARRAY_SIZE + numThreads - 1) / numThreads;
    printf("*** function = MAX\n");
    printf("numElements = %lu\n", ARRAY_SIZE);
    printf("numBlocks = %lu, numThreads = %lu \n", numBlocks, numThreads);
    for (size_t i = 0; i < loope; i++)
        cuda_max<<<numBlocks, numThreads>>>(ARRAY_SIZE, max_arr_device, A_device, B_device, idx_device);
    // barrier
    cudaDeviceSynchronize();

    cudaMemcpy(max_arr_host, max_arr_device, ARRAY_BYTES, cudaMemcpyDeviceToHost);
    cudaMemcpy(idx_host, idx_device, ARRAY_BYTES, cudaMemcpyDeviceToHost);

    // error checking
    size_t err_count = 0;
    for (size_t i = 0; i < ARRAY_SIZE; i++) {
        if (((A_host[i] > B_host[i]) ? A_host[i] : B_host[i]) != max_arr_host[i]) err_count++;
    }
    printf("Error count(CUDA program): %zu\n", err_count);
    // free memory
    cudaFree(A_device);
    cudaFree(B_device);
    cudaFree(max_arr_device);
    cudaFree(idx_device);

    free(A_host);
    free(B_host);
    free(max_arr_host);
    free(idx_host);
    return 0;
}

Overwriting CUDA_max5.cu


In [22]:
%%bash
nvcc CUDA_max5.cu -o CUDA_max5 -Wno-deprecated-gpu-targets

In [23]:
%%bash
nvprof ./CUDA_max5

==1065000== NVPROF is profiling process 1065000, command: ./CUDA_max5


*** function = MAX
numElements = 268435456
numBlocks = 262144, numThreads = 1024 
Error count(CUDA program): 0


==1065000== Profiling application: ./CUDA_max5
==1065000== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   77.38%  11.9239s         2  5.96193s  5.55298s  6.37088s  [CUDA memcpy DtoH]
                   18.60%  2.86563s         2  1.43282s  1.15426s  1.71138s  [CUDA memcpy HtoD]
                    4.02%  619.47ms        30  20.649ms  15.669ms  26.049ms  cuda_max(unsigned long, float*, float*, float*, int*)
      API calls:   83.21%  14.8179s         4  3.70447s  1.15568s  6.38560s  cudaMemcpy
                   13.16%  2.34363s         4  585.91ms  296.60us  2.34187s  cudaMalloc
                    3.50%  622.47ms         1  622.47ms  622.47ms  622.47ms  cudaDeviceSynchronize
                    0.08%  14.677ms         4  3.6692ms  3.2173ms  4.0574ms  cudaFree
                    0.04%  6.3394ms        30  211.31us  10.684us  5.6986ms  cudaLaunchKernel
                    0.01%  1.1532ms       114  10.115us     154

In [24]:
%%bash
nsys profile  -o CUDA_max5 ./CUDA_max5

         This may increase runtime overhead and the likelihood of false
         dependencies across CUDA Streams. If you wish to avoid this, please
         disable the feature with --cuda-event-trace=false.
Try the 'nsys status --environment' command to learn more.

Try the 'nsys status --environment' command to learn more.



*** function = MAX
numElements = 268435456
numBlocks = 262144, numThreads = 1024 
Error count(CUDA program): 0
Collecting data...
Generating '/tmp/nsys-report-34ed.qdstrm'
Generated:
	/home/jupyter-lorenz_marqueses@d-08e53/CUDA_max5.nsys-rep


## CUDA data init in a CUDA kernel
---

In [25]:
%%writefile CUDA_max_init.cu
#include<stdio.h>
#include <stdlib.h>
#include <curand_kernel.h>

#define MIN_VAL -20
#define MAX_VAL 20

__global__
void cuda_max(size_t n, float* max_arr, float* A, float* B, int* idx) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) {
        max_arr[i] = (A[i] > B[i]) ? A[i] : B[i];
        idx[i] = (A[i] >= B[i]) ? 0 : 1;
    }
}

// For the purposes of testing we use i*2 and i for the initialized elements
// Shouldn't impact the performance measurement as they are still stored as FP
__global__
void initialize(size_t n, float* A, float* B) {
    // Init array
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (size_t i = index; i < n; i+= stride) {
        A[i] =
            i * 2;
        B[i] =
            i;
    }
}

int main() {
    const size_t ARRAY_SIZE = 1 << 28;
    const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);
    
    // Execute 30 times
    const size_t loope = 30;
    
    // Declare arrays
    float *A, *B, *max_arr;
    int* idx;

    cudaMallocManaged(&A, ARRAY_BYTES);
    cudaMallocManaged(&B, ARRAY_BYTES);
    cudaMallocManaged(&max_arr, ARRAY_BYTES);
    cudaMallocManaged(&idx, ARRAY_BYTES);
    
    // Setup CUDA kernel
    size_t numThreads = 1024;
    size_t numBlocks = (ARRAY_SIZE + numThreads - 1) / numThreads;
    
    printf("numElements = %lu\n", ARRAY_SIZE);
    printf("numBlocks = %lu, numThreads = %lu \n", numBlocks, numThreads);

    // Execute kernel
    initialize<<<numBlocks, numThreads>>>(ARRAY_SIZE, A, B);
    for (size_t i = 0; i < loope; i++)
        cuda_max<<<numBlocks, numThreads>>>(ARRAY_SIZE, max_arr, A, B, idx);
    
    // Barrier
    cudaDeviceSynchronize();
    
    // Error checking
    size_t err_count = 0;
    for (size_t i = 0; i < ARRAY_SIZE; i++) {
        if (((A[i] > B[i]) ? A[i] : B[i]) != max_arr[i]) err_count++;
    }
    printf("Error count(CUDA program): %zu\n", err_count);
    
    // Free memory
    cudaFree(A);
    cudaFree(B);
    cudaFree(max_arr);
    cudaFree(idx);
    
    return 0;
}

Overwriting CUDA_max_init.cu


In [26]:
%%bash
nvcc CUDA_max_init.cu -o CUDA_max_init -Wno-deprecated-gpu-targets

In [27]:
%%bash
nvprof ./CUDA_max_init

==1065191== NVPROF is profiling process 1065191, command: ./CUDA_max_init


numElements = 268435456
numBlocks = 262144, numThreads = 1024 
Error count(CUDA program): 0


==1065191== Profiling application: ./CUDA_max_init
==1065191== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   52.70%  745.52ms        30  24.851ms  5.4707ms  586.77ms  cuda_max(unsigned long, float*, float*, float*, int*)
                   47.30%  669.06ms         1  669.06ms  669.06ms  669.06ms  initialize(unsigned long, float*, float*)
      API calls:   52.54%  1.98188s         4  495.47ms  132.97us  1.98047s  cudaMallocManaged
                   37.49%  1.41407s         1  1.41407s  1.41407s  1.41407s  cudaDeviceSynchronize
                    9.30%  350.99ms         4  87.747ms  27.533ms  115.60ms  cudaFree
                    0.65%  24.487ms        31  789.91us  8.7370us  23.895ms  cudaLaunchKernel
                    0.02%  648.09us       114  5.6850us     135ns  231.90us  cuDeviceGetAttribute
                    0.00%  152.43us         1  152.43us  152.43us  152.43us  cuDeviceGetName
                    0.

In [28]:
%%bash
nsys profile  -o CUDA_max_init ./CUDA_max_init

         This may increase runtime overhead and the likelihood of false
         dependencies across CUDA Streams. If you wish to avoid this, please
         disable the feature with --cuda-event-trace=false.
Try the 'nsys status --environment' command to learn more.

Try the 'nsys status --environment' command to learn more.



numElements = 268435456
numBlocks = 262144, numThreads = 1024 
Error count(CUDA program): 0
Collecting data...
Generating '/tmp/nsys-report-e6a6.qdstrm'
Generated:
	/home/jupyter-lorenz_marqueses@d-08e53/CUDA_max_init.nsys-rep
