Replace unsigned int types in OpenCL/CUDA code

Replace unsigned int types in OpenCL/CUDA code. For very large point sets this leads to an overflow and incorrect indexing of arrays. Add test scripts. Update CUDA makefile. Fixes #30.
pwollstadt · Nov 16, 2020 · 2a0d99a · 2a0d99a
1 parent de4ebb6
commit 2a0d99a
Show file tree

Hide file tree

Showing 11 changed files with 1,493 additions and 9 deletions.
diff --git a/dev/search_GPU/deliverable1_1/gpuKnnBF_kernel.cu b/dev/search_GPU/deliverable1_1/gpuKnnBF_kernel.cu
@@ -0,0 +1,278 @@
+#ifndef _TEMPLATE_KERNEL_H_
+#define _TEMPLATE_KERNEL_H_
+
+//#include <cpugpuKnn_common.h>
+#ifndef INFINITY
+#define INFINITY 0x7F800000
+#endif
+
+__device__ float
+insertPointKlist(int kth, float distance, int indexv,float* kdistances, int* kindexes){
+	int k=0;
+	while( (distance>*(kdistances+k)) && (k<kth-1)){k++;}
+	//Move value to the next
+	for(size_t k2=kth-1;k2>k;k2--){
+		*(kdistances+k2)=*(kdistances+k2-1);
+		*(kindexes+k2)=*(kindexes+k2-1);
+	}
+	//Replace
+	*(kdistances+k)=distance;
+	*(kindexes+k)=indexv;
+
+	//printf("\n -> Modificacion pila: %.f %.f. New max distance: %.f", *kdistances, *(kdistances+1), *(kdistances+kth-1));
+	return *(kdistances+kth-1);
+}
+
+__device__ float
+maxMetricPoints(const float* g_uquery, const float* g_vpoint, size_t pointdim, size_t signallength){
+	float	r_u1;
+	float	r_v1;
+	float	r_d1,r_dim=0;
+
+	r_dim=0;
+	for(size_t d=0; d<pointdim; d++){
+		r_u1 = *(g_uquery+d*signallength);
+		r_v1 = *(g_vpoint+d*signallength);
+		r_d1 = r_v1 - r_u1;
+		r_d1 = r_d1 < 0? -r_d1: r_d1;  //abs
+		r_dim= r_dim < r_d1? r_d1: r_dim;
+	}
+	return r_dim;
+}
+
+/*
+ * KNN
+ */
+
+//extern __shared__ char array[];
+
+__global__ void
+kernelKNNshared(const float* g_uquery, const float* g_vpointset, int *g_indexes, float* g_distances, const int pointdim, const size_t triallength, const size_t signallength, const int kth, const int exclude)
+{
+
+    // shared memory
+	extern __shared__ char array[];
+	float *kdistances;
+	int *kindexes;
+	kdistances = (float*)array;
+	kindexes = (int*)array+kth*blockDim.x;
+
+	const unsigned int tid = threadIdx.x + blockDim.x*blockIdx.x;
+	const unsigned int itrial = tid / triallength;  //  indextrial
+
+if(tid<signallength){
+
+	for(size_t k=0;k<kth;k++){
+		kdistances[threadIdx.x*kth+k] = INFINITY;
+	}
+
+	__syncthreads();
+
+	//int   r_index;
+	float r_kdist=INFINITY;
+	unsigned int indexi = tid-triallength*itrial;
+	for(size_t t=0; t<triallength; t++){
+			int indexu = tid;
+			int indexv = (t + itrial*triallength);
+			int condition1=indexi-exclude;
+			int condition2=indexi+exclude;
+			if((t<condition1)||(t>condition2)){
+				float temp_dist = maxMetricPoints(g_uquery+indexu, g_vpointset+indexv,pointdim, signallength);
+				if(temp_dist <= r_kdist){
+					r_kdist = insertPointKlist(kth,temp_dist,t,kdistances+threadIdx.x*kth,kindexes+threadIdx.x*kth);
+				//printf("\nId: %d, Temp_dist: %.f. r_index: %d", tid, temp_dist, r_index);
+				}
+			}
+			//printf("tid:%d indexes: %d, %d distances: %.f %.f\n",tid, *kindexes, *(kindexes+1), *kdistances, *(kdistances+1));
+	}
+
+	__syncthreads();
+	//COPY TO GLOBAL MEMORY
+	for(size_t k=0;k<kth;k++){
+		g_indexes[tid+k*signallength] = kindexes[threadIdx.x*kth+k];
+		g_distances[tid+k*signallength]= kdistances[threadIdx.x*kth+k];//*(kdistances+k);
+	}
+}
+
+}
+
+/*
+ * Only valid for 10 nearest neighbors
+ */
+
+__global__ void
+kernelKNN(const float* g_uquery, const float* g_vpointset, int *g_indexes, float* g_distances, int pointdim, int triallength, size_t signallength, int kth, int exclude)
+{
+
+   	const unsigned int tid = threadIdx.x + blockDim.x*blockIdx.x;
+	//const unsigned int tidim = tid*pointdim;
+	const unsigned int itrial = tid / triallength;  //  indextrial
+
+	int kindexes[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+	float kdistances[]= {INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, \
+						 INFINITY, INFINITY, INFINITY, INFINITY, INFINITY};
+
+if(tid<signallength){
+
+	//int   r_index;
+	float r_kdist=INFINITY;
+	int indexi = tid-triallength*itrial;
+	for(size_t t=0; t<triallength; t++){
+			int indexu = tid;
+			int indexv = (t + itrial*triallength);
+			int condition1=indexi-exclude;
+			int condition2=indexi+exclude;
+			if((t<condition1)||(t>condition2)){
+				float temp_dist = maxMetricPoints(g_uquery+indexu, g_vpointset+indexv,pointdim, signallength);
+				if(temp_dist <= r_kdist){
+					r_kdist = insertPointKlist(kth,temp_dist,t,kdistances,kindexes);
+					//printf("\nId: %d, Temp_dist: %.f. r_index: %d", tid, temp_dist, r_index);
+				}
+			}
+			//printf("tid:%d indexes: %d, %d distances: %.f %.f\n",tid, *kindexes, *(kindexes+1), *kdistances, *(kdistances+1));
+	}
+
+	__syncthreads();
+	//COPY TO GLOBAL MEMORY
+	for(size_t k=0;k<kth;k++){
+		g_indexes[tid+k*signallength] = *(kindexes+k);
+		g_distances[tid+k*signallength]= *(kdistances+k);
+	}
+
+}
+
+}
+
+
+/*
+ * Range search using bruteforce
+ */
+
+__global__ void
+kernelBFRSshared(const float* g_uquery, const float* g_vpointset, int *g_npoints, int pointdim, int triallength, size_t signallength, int exclude, float radius)
+{
+
+    // shared memory
+	extern __shared__ char array[];
+	int *s_npointsrange;
+	s_npointsrange = (int*)array;
+
+	const unsigned int tid = threadIdx.x + blockDim.x*blockIdx.x;
+	const unsigned int itrial = tid / triallength;  //  indextrial
+
+if(tid<signallength){
+
+	s_npointsrange[threadIdx.x] = 0;
+	__syncthreads();
+
+
+	unsigned int indexi = tid-triallength*itrial;
+	for(size_t t=0; t<triallength; t++){
+			int indexu = tid;
+			int indexv = (t + itrial*triallength);
+			int condition1=indexi-exclude;
+			int condition2=indexi+exclude;
+			if((t<condition1)||(t>condition2)){
+				float temp_dist = maxMetricPoints(g_uquery+indexu, g_vpointset+indexv,pointdim, signallength);
+				if(temp_dist < radius){
+					s_npointsrange[threadIdx.x]++;
+				}
+			}
+
+	}
+
+	__syncthreads();
+	//printf("\ntid:%d npoints: %d\n",tid, s_npointsrange[threadIdx.x]);
+	//COPY TO GLOBAL MEMORY
+	g_npoints[tid] = s_npointsrange[threadIdx.x];
+
+}
+}
+
+
+__global__ void
+kernelBFRSMultishared(const float* g_uquery, const float* g_vpointset, int *g_npoints, int pointdim, int triallength, int signallength, int exclude, const float* vecradius)
+{
+
+    // shared memory
+	extern __shared__ char array[];
+	int *s_npointsrange;
+	s_npointsrange = (int*)array;
+    float radius=0;
+	const unsigned int tid = threadIdx.x + blockDim.x*blockIdx.x;
+	const unsigned int itrial = tid / triallength;  //  indextrial
+
+if(tid<signallength){
+
+	s_npointsrange[threadIdx.x] = 0;
+	__syncthreads();
+
+    radius = *(vecradius+itrial);
+	unsigned int indexi = tid-triallength*itrial;
+	for(size_t t=0; t<triallength; t++){
+			int indexu = tid;
+			int indexv = (t + itrial*triallength);
+			int condition1=indexi-exclude;
+			int condition2=indexi+exclude;
+			if((t<condition1)||(t>condition2)){
+				float temp_dist = maxMetricPoints(g_uquery+indexu, g_vpointset+indexv,pointdim, signallength);
+				if(temp_dist < radius){
+					s_npointsrange[threadIdx.x]++;
+				}
+			}
+
+	}
+
+	__syncthreads();
+	//printf("\ntid:%d npoints: %d\n",tid, s_npointsrange[threadIdx.x]);
+	//COPY TO GLOBAL MEMORY
+	g_npoints[tid] = s_npointsrange[threadIdx.x];
+
+}
+}
+
+
+__global__ void
+kernelBFRSAllshared(const float* g_uquery, const float* g_vpointset, int *g_npoints, int pointdim, int triallength, size_t signallength, int exclude, const float* vecradius)
+{
+
+    // shared memory
+	extern __shared__ char array[];
+	int *s_npointsrange;
+	s_npointsrange = (int*)array;
+    float radius=0;
+	const unsigned int tid = threadIdx.x + blockDim.x*blockIdx.x;
+	const unsigned int itrial = tid / triallength;  //  indextrial
+
+if(tid<signallength){
+
+	s_npointsrange[threadIdx.x] = 0;
+	__syncthreads();
+
+    radius = *(vecradius+tid);
+	unsigned int indexi = tid-triallength*itrial;
+	for(size_t t=0; t<triallength; t++){
+			int indexu = tid;
+			int indexv = (t + itrial*triallength);
+			int condition1=indexi-exclude;
+			int condition2=indexi+exclude;
+			if((t<condition1)||(t>condition2)){
+				float temp_dist = maxMetricPoints(g_uquery+indexu, g_vpointset+indexv,pointdim, signallength);
+				if(temp_dist < radius){
+					s_npointsrange[threadIdx.x]++;
+				}
+			}
+
+	}
+
+	__syncthreads();
+	//printf("\ntid:%d npoints: %d\n",tid, s_npointsrange[threadIdx.x]);
+	//COPY TO GLOBAL MEMORY
+	g_npoints[tid] = s_npointsrange[threadIdx.x];
+
+}
+}
+
+#endif // #ifndef _TEMPLATE_KERNEL_H_
+
+