<a href="https://colab.research.google.com/github/rekil156/cuda/blob/main/cuda_c%2B%2B_setup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/gdrive so it is reachable from the notebook's
# filesystem.
from google.colab import drive
drive.mount('/content/gdrive')    

In [None]:
# Switch the notebook's working directory to the "cuda" folder under
# "My Drive" on the /content/gdrive mount point.
%cd /content/gdrive/My\ Drive/cuda

## Just two lines for CUDA + C++

In [1]:
# Install the nvcc4jupyter plugin straight from its GitHub repository, then
# load its IPython extension (presumably what provides the %%cu cell magic
# used by the CUDA cells in this notebook).
# NOTE(review): the URL uses the unauthenticated git:// protocol, which GitHub
# appears to have disabled since this was written; this likely needs
# git+https://github.com/... instead. Newer releases of the package may also
# rename the extension and/or the cell magic - confirm before switching.
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-wlqt_hci
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-wlqt_hci
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4307 sha256=ea68560e73578f4470281db3552fe561c1d448a893250e0a46da31d9d70d4a08
  Stored in directory: /tmp/pip-ephem-wheel-cache-54oumxy8/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2
created output directory at /content/src
Out bin /content/result.out


In [2]:
# Print the version banner of the nvcc compiler driver available in this runtime.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


### Basic example

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
// Device kernel: reads the integers that `a` and `b` point to and stores
// their sum in the integer that `c` points to. No thread or block index is
// used, so every thread that runs it performs the same single-element write.
__global__ void add(int *a, int *b, int *c) {
    const int lhs = *a;
    const int rhs = *b;
    *c = lhs + rhs;
}
// Reports a failed CUDA runtime call on stdout.
//   err  - status returned by the CUDA runtime call being checked
//   what - short description of the step, inserted into the message
// Returns true when `err` is anything other than cudaSuccess, so callers can
// chain checks with `||` and stop at the first failure.
static bool cudaFailed(cudaError_t err, const char *what) {
    if (err == cudaSuccess) {
        return false;
    }
    printf("CUDA error %s: %s\n", what, cudaGetErrorString(err));
    return true;
}

// Adds two integers on the GPU with the add() kernel and prints the sum.
// Returns 0 on success, or 1 if any CUDA runtime call or the kernel launch
// failed (in which case the result line is not printed).
int main() {
    // Host copies of variables a, b & c.
    int a = 3;
    int b = 5;
    int c = 0;

    // Device copies of variables a, b & c. They start as NULL so the cleanup
    // below is safe even when an allocation never happened.
    int *d_a = NULL;
    int *d_b = NULL;
    int *d_c = NULL;
    const int size = sizeof(int);

    // Allocate space for the device copies, then copy the inputs to the
    // device. `||` short-circuits, so nothing runs after the first failure.
    bool failed =
        cudaFailed(cudaMalloc((void **)&d_a, size), "allocating d_a") ||
        cudaFailed(cudaMalloc((void **)&d_b, size), "allocating d_b") ||
        cudaFailed(cudaMalloc((void **)&d_c, size), "allocating d_c") ||
        cudaFailed(cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice),
                   "copying a to Device") ||
        cudaFailed(cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice),
                   "copying b to Device");

    if (!failed) {
        // Launch add() kernel on GPU: one block containing one thread.
        add<<<1,1>>>(d_a, d_b, d_c);

        // A launch does not return a status; cudaGetLastError() reports a bad
        // launch, and the blocking copy back to the host reports errors that
        // occurred while the kernel was running.
        failed =
            cudaFailed(cudaGetLastError(), "launching add kernel") ||
            cudaFailed(cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost),
                       "copying to Host");
    }

    // Only report a result that was actually computed and copied back.
    if (!failed) {
        printf("result is %d\n", c);
    }

    // Cleanup (best effort; cudaFree on a NULL pointer performs no operation).
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return failed ? 1 : 0;
}

### Cube of a number 

In [12]:
%%cu
#include <stdio.h>
// Device kernel: each thread cubes one element of d_in and writes it to the
// same position of d_out. The position is the thread's x-index within its
// block (threadIdx.x); the block index is not used.
__global__ void cube(float* d_out, float* d_in)
{
    const int tid = threadIdx.x;
    const float value = d_in[tid];
    const float squared = value * value;
    d_out[tid] = squared * value;
}
// Reports a failed CUDA runtime call on stdout.
//   err  - status returned by the CUDA runtime call being checked
//   what - short description of the step, inserted into the message
// Returns true when `err` is anything other than cudaSuccess, so callers can
// chain checks with `||` and stop at the first failure.
static bool cudaFailed(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
    {
        return false;
    }
    printf("CUDA error %s: %s\n", what, cudaGetErrorString(err));
    return true;
}

// Fills a 64-element array with 0..63, cubes every element on the GPU with
// the cube() kernel and prints the results four per line.
// Returns 0 on success, or 1 if any CUDA runtime call or the kernel launch
// failed (in which case no results are printed).
int main()
{
    const int ARRAY_SIZE = 64;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // Memory on host.
    float h_in[ARRAY_SIZE];
    float h_out[ARRAY_SIZE];

    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        h_in[i] = i;
    }

    // Memory on device.
    // In C one would write: int *ptr = (int*) malloc(100 * sizeof(int));
    // cudaMalloc instead takes the address of the pointer as a void **, i.e.
    // a pointer to a pointer to memory of unspecified type. A void ** can be
    // dereferenced only once (a void * cannot be dereferenced); apart from
    // that it behaves like any other pointer type, e.g. int *.
    // The pointers start as NULL so the cleanup below is safe even when an
    // allocation never happened.
    float *d_in = NULL;
    float *d_out = NULL;

    // Allocate each device buffer exactly once (a second cudaMalloc into
    // d_out would overwrite the pointer and leak the first allocation), then
    // move the input from host to gpu/device. cudaMemcpy takes dest, src.
    // `||` short-circuits, so nothing runs after the first failure.
    bool failed =
        cudaFailed(cudaMalloc((void**)&d_in, ARRAY_BYTES), "allocating d_in") ||
        cudaFailed(cudaMalloc((void**)&d_out, ARRAY_BYTES), "allocating d_out") ||
        cudaFailed(cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice),
                   "copying input to Device");

    if (!failed)
    {
        // Launch the kernel: one block with one thread per array element.
        cube<<<1, ARRAY_SIZE>>>(d_out, d_in);

        // A launch does not return a status; cudaGetLastError() reports a bad
        // launch, and the blocking copy from gpu/device to host reports
        // errors that occurred while the kernel was running.
        failed =
            cudaFailed(cudaGetLastError(), "launching cube kernel") ||
            cudaFailed(cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost),
                       "copying result to Host");
    }

    // Print the result, four values per line, only if it was really computed.
    if (!failed)
    {
        for (int i = 0; i < ARRAY_SIZE; i++)
        {
            printf("%f%s", h_out[i], ((i % 4) != 3) ? "\t" : "\n");
        }
    }

    // Cleanup (best effort; cudaFree on a NULL pointer performs no operation).
    cudaFree(d_in);
    cudaFree(d_out);

    return failed ? 1 : 0;
}

0.000000	1.000000	8.000000	27.000000
64.000000	125.000000	216.000000	343.000000
512.000000	729.000000	1000.000000	1331.000000
1728.000000	2197.000000	2744.000000	3375.000000
4096.000000	4913.000000	5832.000000	6859.000000
8000.000000	9261.000000	10648.000000	12167.000000
13824.000000	15625.000000	17576.000000	19683.000000
21952.000000	24389.000000	27000.000000	29791.000000
32768.000000	35937.000000	39304.000000	42875.000000
46656.000000	50653.000000	54872.000000	59319.000000
64000.000000	68921.000000	74088.000000	79507.000000
85184.000000	91125.000000	97336.000000	103823.000000
110592.000000	117649.000000	125000.000000	132651.000000
140608.000000	148877.000000	157464.000000	166375.000000
175616.000000	185193.000000	195112.000000	205379.000000
216000.000000	226981.000000	238328.000000	250047.000000



Solution

In [8]:
%%cu
#include <stdio.h>

// Device kernel: writes the cube of d_in[t] to d_out[t], where t is the
// calling thread's x-index within its block (threadIdx.x). The block index
// is not consulted.
__global__ void cube(float * d_out, float * d_in){
	const int t = threadIdx.x;
	const float x = d_in[t];
	d_out[t] = (x * x) * x;
}

// Reports a failed CUDA runtime call on stdout.
//   err  - status returned by the CUDA runtime call being checked
//   what - short description of the step, inserted into the message
// Returns true when `err` is anything other than cudaSuccess, so callers can
// chain checks with `||` and stop at the first failure.
static bool cudaFailed(cudaError_t err, const char * what) {
	if (err == cudaSuccess) {
		return false;
	}
	printf("CUDA error %s: %s\n", what, cudaGetErrorString(err));
	return true;
}

// Fills a 96-element array with 0..95, cubes every element on the GPU with
// the cube() kernel and prints the results four per line.
// argc and argv are accepted but not used.
// Returns 0 on success, or 1 if any CUDA runtime call or the kernel launch
// failed (in which case no results are printed).
int main(int argc, char ** argv) {
	const int ARRAY_SIZE = 96;
	const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

	// generate the input array on the host
	float h_in[ARRAY_SIZE];
	for (int i = 0; i < ARRAY_SIZE; i++) {
		h_in[i] = float(i);
	}
	float h_out[ARRAY_SIZE];

	// declare GPU memory pointers; NULL so the cleanup below is safe even
	// when an allocation never happened
	float * d_in = NULL;
	float * d_out = NULL;

	// allocate GPU memory, then transfer the array to the GPU;
	// `||` short-circuits, so nothing runs after the first failure
	bool failed =
		cudaFailed(cudaMalloc((void**) &d_in, ARRAY_BYTES), "allocating d_in") ||
		cudaFailed(cudaMalloc((void**) &d_out, ARRAY_BYTES), "allocating d_out") ||
		cudaFailed(cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice),
		           "copying input to Device");

	if (!failed) {
		// launch the kernel: one block with one thread per array element
		cube<<<1, ARRAY_SIZE>>>(d_out, d_in);

		// a launch does not return a status; cudaGetLastError() reports a bad
		// launch, and the blocking copy back to the CPU reports errors that
		// occurred while the kernel was running
		failed =
			cudaFailed(cudaGetLastError(), "launching cube kernel") ||
			cudaFailed(cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost),
			           "copying result to Host");
	}

	// print out the resulting array, four values per line, only if it was
	// really computed
	if (!failed) {
		for (int i = 0; i < ARRAY_SIZE; i++) {
			printf("%f%s", h_out[i], ((i % 4) != 3) ? "\t" : "\n");
		}
	}

	// cleanup (best effort; cudaFree on a NULL pointer performs no operation)
	cudaFree(d_in);
	cudaFree(d_out);

	return failed ? 1 : 0;
}

0.000000	1.000000	8.000000	27.000000
64.000000	125.000000	216.000000	343.000000
512.000000	729.000000	1000.000000	1331.000000
1728.000000	2197.000000	2744.000000	3375.000000
4096.000000	4913.000000	5832.000000	6859.000000
8000.000000	9261.000000	10648.000000	12167.000000
13824.000000	15625.000000	17576.000000	19683.000000
21952.000000	24389.000000	27000.000000	29791.000000
32768.000000	35937.000000	39304.000000	42875.000000
46656.000000	50653.000000	54872.000000	59319.000000
64000.000000	68921.000000	74088.000000	79507.000000
85184.000000	91125.000000	97336.000000	103823.000000
110592.000000	117649.000000	125000.000000	132651.000000
140608.000000	148877.000000	157464.000000	166375.000000
175616.000000	185193.000000	195112.000000	205379.000000
216000.000000	226981.000000	238328.000000	250047.000000
262144.000000	274625.000000	287496.000000	300763.000000
314432.000000	328509.000000	343000.000000	357911.000000
373248.000000	389017.000000	405224.000000	421875.000000
438976.000000	456533.00