In [None]:
%%writefile square.cu
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>

// Defining number of elements in Array
#define N 5

// Kernel: writes the square of each input element to the output array.
//   d_in  - pointer to the N input values (only read by this kernel)
//   d_out - pointer to the N output slots; d_out[i] receives d_in[i] * d_in[i]
// Each thread handles the single element at its global index. Threads whose
// index is >= N do nothing, so a launch with more than N threads (or with
// several blocks) cannot read or write past the end of the arrays.
__global__ void gpuSquare(float *d_in, float *d_out)
{
    // Global index of this thread across the whole grid; equals threadIdx.x
    // for the single-block launch used by main.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N) {
        float temp = d_in[tid];
        d_out[tid] = temp * temp;
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        return false;
    }
    return true;
}

// Squares the numbers 0..N-1 on the GPU and prints the results.
// Returns 0 on success, 1 if any CUDA call or the kernel fails (in which case
// the results are not printed, since h_out would not hold valid data).
int main(void)
{
    // Host-side input and output arrays
    float h_in[N], h_out[N];
    // Device-side buffers; NULL so the cudaFree calls below are safe even if
    // an allocation never happened.
    float *d_in = NULL, *d_out = NULL;
    const size_t bytes = N * sizeof(float);

    // Initializing Array
    for (int i = 0; i < N; i++) {
        h_in[i] = (float)i;
    }

    // Allocate the buffers on the device, then copy the input over.
    // && short-circuits, so nothing runs after the first failure.
    bool ok = cudaOk(cudaMalloc((void**)&d_in, bytes), "cudaMalloc(d_in)")
           && cudaOk(cudaMalloc((void**)&d_out, bytes), "cudaMalloc(d_out)")
           && cudaOk(cudaMemcpy(d_in, h_in, bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy host->device");

    if (ok) {
        // Calling square kernel with one block and N threads per block
        gpuSquare <<<1, N >>>(d_in, d_out);
        // A kernel launch returns nothing: launch errors are read back with
        // cudaGetLastError, execution errors surface at the synchronize.
        ok = cudaOk(cudaGetLastError(), "gpuSquare launch")
          && cudaOk(cudaDeviceSynchronize(), "gpuSquare execution")
          // Copying result back to host from device memory
          && cudaOk(cudaMemcpy(h_out, d_out, bytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy device->host");
    }

    if (ok) {
        // Printing result on console
        printf("Square of Number on GPU \n");
        for (int i = 0; i < N; i++) {
            printf("The square of %f is %f\n", h_in[i], h_out[i]);
        }
    }

    // Free up device memory (cudaFree on a NULL pointer performs no operation)
    cudaFree(d_in);
    cudaFree(d_out);
    return ok ? 0 : 1;
}

Writing square.cu


In [None]:
!nvcc square.cu -o square

In [None]:
!./square

Square of Number on GPU 
The square of 0.000000 is 0.000000
The square of 1.000000 is 0.000000
The square of 2.000000 is 0.000000
The square of 3.000000 is 0.000000
The square of 4.000000 is 0.000000


In [None]:
%%writefile squareroot.cu
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
# include <math.h>

// Defining number of elements in Array
#define N 100

// Kernel: writes the square root of each input element to the output array.
//   d_in  - pointer to the N input values (only read by this kernel)
//   d_out - pointer to the N output slots; d_out[i] receives sqrtf(d_in[i])
// Each thread handles the single element at its global index. Threads whose
// index is >= N do nothing, so a launch with more than N threads (or with
// several blocks) cannot read or write past the end of the arrays.
__global__ void gpuSquareroot(float *d_in, float *d_out)
{
    // Global index of this thread across the whole grid; equals threadIdx.x
    // for the single-block launch used by main.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N) {
        float temp = d_in[tid];
        // sqrtf makes the single-precision computation explicit.
        d_out[tid] = sqrtf(temp);
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        return false;
    }
    return true;
}

// Computes the square roots of 0..N-1 on the GPU and prints the results.
// Returns 0 on success, 1 if any CUDA call or the kernel fails (in which case
// the results are not printed, since h_out would not hold valid data).
int main(void)
{
    // Host-side input and output arrays
    float h_in[N], h_out[N];
    // Device-side buffers; NULL so the cudaFree calls below are safe even if
    // an allocation never happened.
    float *d_in = NULL, *d_out = NULL;
    const size_t bytes = N * sizeof(float);

    // Initializing Array
    for (int i = 0; i < N; i++) {
        h_in[i] = (float)i;
    }

    // Allocate the buffers on the device, then copy the input over.
    // && short-circuits, so nothing runs after the first failure.
    bool ok = cudaOk(cudaMalloc((void**)&d_in, bytes), "cudaMalloc(d_in)")
           && cudaOk(cudaMalloc((void**)&d_out, bytes), "cudaMalloc(d_out)")
           && cudaOk(cudaMemcpy(d_in, h_in, bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy host->device");

    if (ok) {
        // Calling square-root kernel with one block and N threads per block
        gpuSquareroot <<<1, N >>>(d_in, d_out);
        // A kernel launch returns nothing: launch errors are read back with
        // cudaGetLastError, execution errors surface at the synchronize.
        ok = cudaOk(cudaGetLastError(), "gpuSquareroot launch")
          && cudaOk(cudaDeviceSynchronize(), "gpuSquareroot execution")
          // Copying result back to host from device memory
          && cudaOk(cudaMemcpy(h_out, d_out, bytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy device->host");
    }

    if (ok) {
        // Printing result on console (banner previously said "Square",
        // a copy-paste leftover from the squaring program)
        printf("Square root of Number on GPU \n");
        for (int i = 0; i < N; i++) {
            printf("The square root of %f is %f\n", h_in[i], h_out[i]);
        }
    }

    // Free up device memory (cudaFree on a NULL pointer performs no operation)
    cudaFree(d_in);
    cudaFree(d_out);
    return ok ? 0 : 1;
}

Writing squareroot.cu


In [None]:
!nvcc squareroot.cu -o squareroot

In [None]:
!./squareroot

Square of Number on GPU 
The square root of 0.000000 is 0.000000
The square root of 1.000000 is 0.000000
The square root of 2.000000 is 0.000000
The square root of 3.000000 is 0.000000
The square root of 4.000000 is 0.000000
The square root of 5.000000 is 0.000000
The square root of 6.000000 is 0.000000
The square root of 7.000000 is 0.000000
The square root of 8.000000 is 0.000000
The square root of 9.000000 is 0.000000
The square root of 10.000000 is 0.000000
The square root of 11.000000 is 0.000000
The square root of 12.000000 is 0.000000
The square root of 13.000000 is 0.000000
The square root of 14.000000 is 0.000000
The square root of 15.000000 is 0.000000
The square root of 16.000000 is 0.000000
The square root of 17.000000 is 0.000000
The square root of 18.000000 is 0.000000
The square root of 19.000000 is 0.000000
The square root of 20.000000 is 0.000000
The square root of 21.000000 is 0.000000
The square root of 22.000000 is 0.000000
The square root of 23.000000 is 0.000000
T