<a href="https://colab.research.google.com/github/pszwed-ai/wyklad-imperatywne/blob/main/CUDA_perfect_numbers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#CUDA perfect numbers

# 1. Init CUDA

In [1]:
!nvcc --version
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Wed_Jul_22_19:09:09_PDT_2020
Cuda compilation tools, release 11.0, V11.0.221
Build cuda_11.0_bu.TC445_37.28845127_0
Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-3kk5auod
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-3kk5auod
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4307 sha256=af7fbb3838905bbc04a08efb2cc05ad08fbd5e1662f140bcd12e18623b98e4d4
  Stored in directory: /tmp/pip-ephem-wheel-cache-sbccay5d/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2
created output directory

# 2. Query device

In [2]:
%%cu

#include <stdio.h>



int main( void ) {
    cudaDeviceProp  prop;

    int count;
    cudaGetDeviceCount( &count ) ;
    for (int i=0; i< count; i++) {
        cudaGetDeviceProperties( &prop, i );
        printf( "   --- General Information for device %d ---\n", i );
        printf( "Name:  %s\n", prop.name );
        printf( "Compute capability:  %d.%d\n", prop.major, prop.minor );
        printf( "Clock rate:  %d\n", prop.clockRate );
        printf( "Device copy overlap:  " );
        if (prop.deviceOverlap)
            printf( "Enabled\n" );
        else
            printf( "Disabled\n");
        printf( "Kernel execution timeout :  " );
        if (prop.kernelExecTimeoutEnabled)
            printf( "Enabled\n" );
        else
            printf( "Disabled\n" );

        printf( "   --- Memory Information for device %d ---\n", i );
        printf( "Total global mem:  %ld\n", prop.totalGlobalMem );
        printf( "Total constant Mem:  %ld\n", prop.totalConstMem );
        printf( "Max mem pitch:  %ld\n", prop.memPitch );
        printf( "Texture Alignment:  %ld\n", prop.textureAlignment );

        printf( "   --- MP Information for device %d ---\n", i );
        printf( "Multiprocessor count:  %d\n",
                    prop.multiProcessorCount );
        printf( "Shared mem per mp:  %ld\n", prop.sharedMemPerBlock );
        printf( "Registers per mp:  %d\n", prop.regsPerBlock );
        printf( "Threads in warp:  %d\n", prop.warpSize );
        printf( "Max threads per block:  %d\n",
                    prop.maxThreadsPerBlock );
        printf( "Max thread dimensions:  (%d, %d, %d)\n",
                    prop.maxThreadsDim[0], prop.maxThreadsDim[1],
                    prop.maxThreadsDim[2] );
        printf( "Max grid dimensions:  (%d, %d, %d)\n",
                    prop.maxGridSize[0], prop.maxGridSize[1],
                    prop.maxGridSize[2] );
        printf( "\n" );
    }
}


   --- General Information for device 0 ---
Name:  Tesla T4
Compute capability:  7.5
Clock rate:  1590000
Device copy overlap:  Enabled
Kernel execution timeout :  Disabled
   --- Memory Information for device 0 ---
Total global mem:  15843721216
Total constant Mem:  65536
Max mem pitch:  2147483647
Texture Alignment:  512
   --- MP Information for device 0 ---
Multiprocessor count:  40
Shared mem per mp:  49152
Registers per mp:  65536
Threads in warp:  32
Max threads per block:  1024
Max thread dimensions:  (1024, 1024, 64)
Max grid dimensions:  (2147483647, 65535, 65535)




# Perfect numbers

## Sequential host version

In [None]:
%%cu
#include <math.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>

int is_perfect(int n){
    int sum = 1;
    int i;
    for(i=2; i*i<n;i++) {
      if (n%i == 0) {
          sum+=i;
          sum+=n/i;
      }
      i++;
    }
    if (i*i==n)sum+=i;
    return sum == n;
}

void find_perfect(int*tab,int N){
    tab[0]=0;
    for(int i=1;i<N;i++){
        tab[i]=is_perfect(i);
    }
}

void print_perfect(int*tab,int n){
    int cnt=0;
    for(int i=0;i<n;i++){
        if(tab[i]>0){
            printf("%d ",i);
            cnt++;
        }
    }
    printf("\nFound %d numbers\n",cnt);
}

int main(){
    int N=36000000;
    int*tab = (int*)malloc(N*sizeof(int));
    clock_t start = clock();
    find_perfect(tab,N);
    clock_t end = clock();
    double seconds = (double)(end - start) / CLOCKS_PER_SEC;
    print_perfect(tab,N);
    printf("\nt=%f",seconds);
    free(tab);
}

1 6 28 496 8128 33550336 
Found 6 numbers

t=286.523575


##CUDAfied version

In [None]:
%%cu
#include <math.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>

__device__ int is_perfect(int n){
    int sum = 1;
    int i;
    for(i=2; i*i<n;i++) {
      if (n%i == 0) {
          sum+=i;
          sum+=n/i;
      }
      i++;
    }
    if (i*i==n)sum+=i;
    return sum == n;
}


__global__ void  find_perfect(int*tab,int N) {
    int tid = blockIdx.x;    // this thread handles the data at its thread id
    if (tid >= N)return;
    
    tab[tid]=0;
    if(tid==0){
        return;
    }
    tab[tid]=is_perfect(tid);
}

void print_perfect(int*tab,int n){
    int cnt=0;
    for(int i=0;i<n;i++){
        if(tab[i]>0){
            printf("%d ",i);
            cnt++;
        }
    }
    printf("\nFound %d numbers\n",cnt);
}

int main(){
    int N=36000000;
    int*tab = (int*)malloc(N*sizeof(int));
    int*dev_tab;
    clock_t start = clock();
    cudaError_t err = cudaMalloc( (void**)&dev_tab, N * sizeof(int) );
    if(err!=cudaSuccess){
        printf( "%s in %s at line %d\n", 
               cudaGetErrorString( err ),__FILE__, __LINE__ );
        exit( -1 );
    }
    find_perfect<<<N,1>>>(dev_tab,N);
    
    err = cudaMemcpy( tab, dev_tab, N * sizeof(int),cudaMemcpyDeviceToHost );
    if(err!=cudaSuccess){
        printf( "%s in %s at line %d\n", 
               cudaGetErrorString( err ),__FILE__, __LINE__ );
        exit( -1 );
    }
    clock_t end = clock();
    double seconds = (double)(end - start) / CLOCKS_PER_SEC;
    print_perfect(tab,N);
    printf("\nN=%d t=%f",N,seconds);
    free(tab);
}

1 6 28 496 8128 33550336 
Found 6 numbers

N=36000000 t=15.825811
