In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [3]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-y6qlrvnv
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-y6qlrvnv
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0d2ab99cccbbc682722e708515fe9c4cfc50185a
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4716 sha256=fbb1d95f8a90bb855b1cc5727aef57cbeebe8fe7fa55a439bcfb2cbc9040ad84
  Stored in directory: /tmp/pip-ephem-wheel-cache-ruf_5kr6/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [4]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [36]:
%%cu
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>


int* cube_host(int *a, int len, int *c){
    for(int i=0; i<len; i++){
        c[i] = a[i] * a[i] * a[i];
    }
    return c;
}

__global__ void cube_device(int *a, int *c) {
  int i = threadIdx.x + blockDim.x * blockIdx.x;
  c[i] = a[i] * a[i] * a[i];
}

int random(int min, int max){
   return min + rand() / (RAND_MAX / (max - min + 1) + 1);
}

int main() {
  clock_t start_t, end_t, start_t_k, end_t_k;
  double total_t;
  int *h_a, *h_c, *h_d;
  int N = 1000000000;
  int *d_a, *d_c;
  int BLOCK_SIZE=1024, GRID_SIZE=65535;

  // Allocating memory on the host
  h_a = (int *)malloc(N*sizeof(int));
  for(int i=0; i<N; i++){
      h_a[i]= random(0, 100);
  }

  for(int i=0; i<3; i++){
      printf("%d -> ", h_a[i]);
  }
  printf("\n");

  // Allocating memory on the device
  cudaMalloc((void **)&d_a, N*sizeof(int));
  cudaMalloc((void **)&d_c, N*sizeof(int));
  // Copy the data to the device

  start_t = clock();
  cudaMemcpy(d_a, h_a, N*sizeof(int), cudaMemcpyHostToDevice);
  // Launch the kernel
  start_t_k = clock();
  cube_device<<<GRID_SIZE, BLOCK_SIZE>>>(d_a, d_c);
  end_t_k = clock();
  total_t = (double)(end_t_k - start_t_k)/ CLOCKS_PER_SEC;
  printf("\n Device Kernel Time %f\n", total_t);

  h_c = (int *)malloc(N*sizeof(int));

  // Copy the result back to the host
  cudaMemcpy(h_c, d_c, N*sizeof(int), cudaMemcpyDeviceToHost);
  end_t = clock();
  total_t = (double)(end_t - start_t)/ CLOCKS_PER_SEC;
  printf("\n Device Time %f\n", total_t);

  // Print the result
  for(int i=0; i<3; i++){
      printf("%d -> ", h_c[i]);
  }
  printf("\n");

  // Free the device memory
  cudaFree(d_a);
  cudaFree(d_c);
  free(h_c);

  h_d = (int *)malloc(N*sizeof(int));
  start_t = clock();
  h_d = cube_host(h_a, N, h_d);
  end_t = clock();
  total_t = (double)(end_t - start_t)/ CLOCKS_PER_SEC;
  // Print the result
  for(int i=0; i<3; i++){
      printf("%d -> ", h_d[i]);
  }
  printf("\n CPU Time %f\n", total_t);

  free(h_a);
  free(h_d);


  return 0;
}

84 -> 39 -> 79 -> 

 Device Kernel Time 0.000267

 Device Time 3.925126
592704 -> 59319 -> 493039 -> 
592704 -> 59319 -> 493039 -> 
 CPU Time 6.170873



In [17]:
import math
math.pow(84, 3)

592704.0