In [None]:
%%writefile branching.cu
// branching.cu
// loosely derived from
//     http://gpuray.blogspot.com/2009/07/cuda-warps-and-branching.html
#include <stdio.h>

#define WORK 320000
#define threadsPerBlock 320

// Benchmark kernel: each thread runs at most one of four busy loops, chosen by
// comparing its thread index against multiples of n.
//
//   threadMod in [0,   n)  -> loop 1
//   threadMod in [n,  2n)  -> loop 2
//   threadMod in [2n, 3n)  -> loop 3
//   threadMod in [3n, 4n)  -> loop 4
//   threadMod >= 4n        -> no work at all
//
// Every loop performs WORK iterations of double-precision sin/fmax/sqrt/cos/pow
// with slightly different constants, so the four branches are distinct code
// paths. The kernel takes no pointers and writes nothing to memory.
//
// n: width, in threads, of each of the four index ranges above.
//
// NOTE(review): callers appear to pick n relative to a 32-thread warp so that
// n == 32 keeps every warp on a single branch while smaller n splits a warp
// across several branches -- confirm against the intended experiment.
__global__ void branching(int n)
{
    
 // threadIdx.x is reduced modulo threadsPerBlock (320); for any launch with
 // blockDim.x <= threadsPerBlock this is simply threadIdx.x.
 int threadMod = threadIdx.x%threadsPerBlock;

 // NOTE(review): theta5 is never stored or read in any branch, so the loops
 // have no observable result; a compiler would be entitled to drop them --
 // verify the generated code if timings look suspiciously small.
 if ( threadMod < n)
  for (int i=0; i < WORK; i++){
   double theta1 = sin((double)i / 14);
   double theta2 = sin((double)(i + 2) / 13);
   double theta3 = fmax(theta1, theta2);
   double theta4 = cos( sqrt (100.0 * theta3) );
   double theta5 = pow ( theta3, theta4 );
  }
 else if(threadMod<2*n)
  for (int i=0; i < WORK; i++){
   double theta1 = sin((double)i / 15);
   double theta2 = sin((double)(i + 1) / 15);
   double theta3 = fmax(theta1, theta2);
   double theta4 = cos( sqrt (10.0 * theta3) );
   double theta5 = pow ( theta3, theta4 );
  }
 else if(threadMod<3*n)
  for (int i=0; i < WORK; i++){
   double theta1 = sin((double)i / 17);
   double theta2 = sin((double)(i + 9) / 13);
   double theta3 = fmax(theta1, theta2);
   double theta4 = cos( sqrt (20.0 * theta3) );
   double theta5 = pow ( theta3, theta4 );
  }
 else if(threadMod<4*n)
  for (int i=0; i < WORK; i++){
   double theta1 = sin((double)i / 12);
   double theta2 = sin((double)(i + 5) / 12);
   double theta3 = fmax(theta1, theta2);
   double theta4 = cos( sqrt (30.0 * theta3) );
   double theta5 = pow ( theta3, theta4 );
  }
 // Threads with threadMod >= 4*n fall through here without doing any work.
 }


// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what)
{
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Times one launch of branching<<<1, threadsPerBlock>>>(n) using the supplied
// (already created) events and prints the elapsed time in milliseconds.
// Returns true on success; on any CUDA failure nothing is printed to stdout.
static bool timeBranching(int n, cudaEvent_t start, cudaEvent_t stop)
{
    float elapsedMs = 0.0f;

    bool ok = cudaOk(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    if (ok) {
        branching<<<1,threadsPerBlock>>>(n);
        // A kernel launch returns no status; launch errors must be fetched.
        ok = cudaOk(cudaGetLastError(), "branching launch");
    }
    ok = ok && cudaOk(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    // Waiting on the stop event also surfaces errors raised while the kernel ran.
    ok = ok && cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    ok = ok && cudaOk(cudaEventElapsedTime(&elapsedMs, start, stop),
                      "cudaEventElapsedTime");
    if (ok) {
        printf("Time to generate:  %3.1f ms \n", elapsedMs);
    }
    return ok;
}

// Runs the branching kernel three times with n = 32, 16 and 8 and prints one
// timing line per run. Returns 0 on success, 1 if any CUDA call failed.
int main(int argc, char ** argv) {
    (void)argc;   // command-line arguments are not used by this program
    (void)argv;

    // Values of n passed to the kernel, in the order they are timed:
    //   32: all 32 threads in a warp do the same thing
    //   16: half the threads in a warp do one thing, half another
    //    8: 1/4 of the threads each do different things
    const int splits[] = { 32, 16, 8 };
    const int numSplits = (int)(sizeof(splits) / sizeof(splits[0]));

    // Create the event pair once and reuse it for every measurement instead
    // of re-creating (and leaking) a new pair per run.
    cudaEvent_t start, stop;
    if (!cudaOk(cudaEventCreate(&start), "cudaEventCreate(start)")) {
        return 1;
    }
    if (!cudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop)")) {
        cudaEventDestroy(start);
        return 1;
    }

    bool ok = true;
    for (int s = 0; ok && s < numSplits; s++) {
        ok = timeBranching(splits[s], start, stop);
    }

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ok ? 0 : 1;
}



Overwriting branching.cu


In [None]:
!nvcc -o branching branching.cu
!ls

a.out  branching  branching.cu	branching_student.cu  sample_data


In [None]:
!time ./branching


Time to generate:  1059.6 ms 
Time to generate:  1225.4 ms 
Time to generate:  2278.6 ms 

real	0m4.741s
user	0m2.405s
sys	0m2.320s


In [None]:
%%writefile branching_student.cu
// branching_student.cu
// loosely derived from
//     http://gpuray.blogspot.com/2009/07/cuda-warps-and-branching.html
#include <stdio.h>

#define WORK 320000
#define threadsPerBlock 320

// Benchmark kernel: each thread runs at most one of four busy loops, chosen by
// comparing its thread index against multiples of n.
//
//   threadMod in [0,   n)  -> loop 1
//   threadMod in [n,  2n)  -> loop 2
//   threadMod in [2n, 3n)  -> loop 3
//   threadMod in [3n, 4n)  -> loop 4
//   threadMod >= 4n        -> no work at all
//
// Every loop performs WORK iterations of double-precision sin/fmax/sqrt/cos/pow
// with slightly different constants, so the four branches are distinct code
// paths. The kernel takes no pointers and writes nothing to memory.
//
// n: width, in threads, of each of the four index ranges above.
//
// NOTE(review): n appears to be chosen relative to a 32-thread warp so that
// n == 32 keeps every warp on a single branch while smaller n splits a warp
// across several branches -- confirm against the intended experiment.
__global__ void branching(int n)
{
    
 // threadIdx.x is reduced modulo threadsPerBlock (320); for any launch with
 // blockDim.x <= threadsPerBlock this is simply threadIdx.x.
 int threadMod = threadIdx.x%threadsPerBlock;

 // NOTE(review): theta5 is never stored or read in any branch, so the loops
 // have no observable result; a compiler would be entitled to drop them --
 // verify the generated code if timings look suspiciously small.
 if ( threadMod < n)
  for (int i=0; i < WORK; i++){
   double theta1 = sin((double)i / 14);
   double theta2 = sin((double)(i + 2) / 13);
   double theta3 = fmax(theta1, theta2);
   double theta4 = cos( sqrt (100.0 * theta3) );
   double theta5 = pow ( theta3, theta4 );
  }
 else if(threadMod<2*n)
  for (int i=0; i < WORK; i++){
   double theta1 = sin((double)i / 15);
   double theta2 = sin((double)(i + 1) / 15);
   double theta3 = fmax(theta1, theta2);
   double theta4 = cos( sqrt (10.0 * theta3) );
   double theta5 = pow ( theta3, theta4 );
  }
 else if(threadMod<3*n)
  for (int i=0; i < WORK; i++){
   double theta1 = sin((double)i / 17);
   double theta2 = sin((double)(i + 9) / 13);
   double theta3 = fmax(theta1, theta2);
   double theta4 = cos( sqrt (20.0 * theta3) );
   double theta5 = pow ( theta3, theta4 );
  }
 else if(threadMod<4*n)
  for (int i=0; i < WORK; i++){
   double theta1 = sin((double)i / 12);
   double theta2 = sin((double)(i + 5) / 12);
   double theta3 = fmax(theta1, theta2);
   double theta4 = cos( sqrt (30.0 * theta3) );
   double theta5 = pow ( theta3, theta4 );
  }
 // Threads with threadMod >= 4*n fall through here without doing any work.
 }


// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what)
{
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Times a single launch of the branching kernel.
//
// Usage: ./branching_student [nBreak]
//   nBreak  optional positional integer in [8, 32] passed to the kernel as n
//           (default 32).
//
// Returns 0 on success and 1 on a malformed/out-of-range argument or a CUDA
// failure.
int main(int argc, char ** argv) {

    int nBreak = 32;

    // The argument is positional (there is no "-n" flag), and the message
    // needs its own newline so it does not run into the following output.
    printf("USAGE: ./branching_student [nBreak (default 32)]\n");

    if (argc > 1 && sscanf(argv[1], "%d", &nBreak) != 1) {
        // Without this check a non-numeric argument silently ran with 32.
        fprintf(stderr, "Could not parse nBreak from \"%s\"\n", argv[1]);
        return 1;
    }
    if(nBreak>32 || nBreak<8) {
        printf("Example code is designed to be used with 8 <= nBreak <= 32 \n");
        return 1;   // invalid input is a failure, not a success
    }

    cudaEvent_t start, stop;
    if (!cudaOk(cudaEventCreate(&start), "cudaEventCreate(start)")) {
        return 1;
    }
    if (!cudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop)")) {
        cudaEventDestroy(start);
        return 1;
    }

    // Run and time the kernel with each of its four branches covering nBreak
    // consecutive thread indices.
    float elapsedMs = 0.0f;
    bool ok = cudaOk(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    if (ok) {
        branching<<<1,threadsPerBlock>>>(nBreak);
        // A kernel launch returns no status; launch errors must be fetched.
        ok = cudaOk(cudaGetLastError(), "branching launch");
    }
    ok = ok && cudaOk(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    // Waiting on the stop event also surfaces errors raised while the kernel ran.
    ok = ok && cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    ok = ok && cudaOk(cudaEventElapsedTime(&elapsedMs, start, stop),
                      "cudaEventElapsedTime");
    if (ok) {
        printf("Time to generate:  %3.1f ms \n", elapsedMs);
    }

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ok ? 0 : 1;
}

Overwriting branching_student.cu


In [None]:
!nvcc -o branching_student branching_student.cu
!ls

a.out	   branching.cu       branching_student.cu
branching  branching_student  sample_data


In [None]:
!./branching_student 33

USAGE: ./branching_student [-n nBreak (default 32)]Example code is designed to be used with 8 <= nBreak <= 32 
