
started ksmallest project

1 parent 86f6f2a commit 8164743106b20fbb1e6df9eaac5191b3375be782 @seanbaxter committed Oct 6, 2011
Showing with 180 additions and 13 deletions.
  1. +29 −0 sort/src/kernels/common.cu
  2. +141 −0 sort/src/kernels/ksmallest.cu
  3. +1 −0 sort/vs9/vs9.sln
  4. +4 −4 support/src/segscan/segscan.cpp
  5. +5 −9 support/src/segscan/segscan.cu
sort/src/kernels/common.cu
@@ -155,3 +155,32 @@ DEVICE2 void StoreShifted(volatile uint* shared, uint shiftedIndex, uint val) {
*((volatile uint*)(((volatile char*)shared) + shiftedIndex)) = val;
}
+
+// Recover a float from its radix (sortable uint) representation.
+DEVICE float UintToFloat(uint u) {
+ int adjusted = (int)u;
+
+ // Negative now has high bit set, positive has high bit clear.
+ int flipped = adjusted - 0x80000000;
+
+ // Fill the register with set bits if negative.
+ int bits = flipped >> 31;
+
+ int x = flipped ^ (0x7fffffff & bits);
+
+ float f = __int_as_float(x);
+ return f;
+}
+
+// Put a float into radix order: a uint whose ordering matches float ordering.
+DEVICE uint FloatToUint(float f) {
+ int x = __float_as_int(f);
+ int bits = x >> 31;
+
+ int flipped = x ^ (0x7fffffff & bits);
+
+ int adjusted = 0x80000000 + flipped;
+
+ uint u = (uint)adjusted;
+ return u;
+}
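The pair above is a sortable bijection between floats and uints: all bits of a negative float are flipped, and only the sign bit of a non-negative one, so unsigned comparison on the mapped values matches float comparison. A minimal host-side sketch (editor's illustration, not part of the commit; FloatToUintHost is a hypothetical helper mirroring FloatToUint) demonstrating the ordering property:

    #include <cstdio>
    #include <cstring>

    // Host mirror of FloatToUint: flip all bits of negatives, flip only the
    // sign bit of non-negatives.
    unsigned FloatToUintHost(float f) {
        int x;
        std::memcpy(&x, &f, sizeof(x));
        int bits = x >> 31;                        // all ones if f is negative
        int flipped = x ^ (0x7fffffff & bits);     // flip low 31 bits if negative
        return (unsigned)flipped + 0x80000000u;    // flip the sign bit
    }

    int main() {
        float v[] = { -2.0f, -0.5f, 0.0f, 0.5f, 2.0f };
        for(int i = 0; i + 1 < 5; ++i)
            printf("%g < %g as uints: %d\n", v[i], v[i + 1],
                FloatToUintHost(v[i]) < FloatToUintHost(v[i + 1]));
        return 0;
    }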
sort/src/kernels/ksmallest.cu
@@ -0,0 +1,141 @@
+#include "common.cu"
+
+__device__ int ksmallestStream_global;
+__device__ int ksmallestLeft_global;
+
+#define NUM_THREADS 128
+#define NUM_WARPS 4
+#define VALUES_PER_THREAD 5
+#define VALUES_PER_WARP (VALUES_PER_THREAD * WARP_SIZE)
+
+// Reserve (VALUES_PER_THREAD + 1) * WARP_SIZE values per warp. As soon as
+// VALUES_PER_WARP values are queued in a warp, stream them to device memory.
+__shared__ volatile uint values_shared[NUM_THREADS * (VALUES_PER_THREAD + 1)];
+__shared__ volatile uint indices_shared[NUM_THREADS * (VALUES_PER_THREAD + 1)];
+
+DEVICE2 uint ConvertToUint(uint x) { return x; }
+DEVICE2 uint ConvertToUint(int x) { return (uint)x; }
+DEVICE2 uint ConvertToUint(float x) { return (uint)__float_as_int(x); }
+
+// Define absolute min and absolute max.
+
+////////////////////////////////////////////////////////////////////////////////
+// CompareAndStream
+
+DEVICE bool StreamToGlobal(uint* dest_global, uint* indices_global, int lane,
+ int& count, bool writeAll, int capacity, volatile uint* valuesShared,
+ volatile uint* indicesShared) {
+
+ int valuesToWrite = count;
+ if(!writeAll) valuesToWrite = ~(WARP_SIZE - 1) & count;
+ count -= valuesToWrite;
+
+ // To avoid atomic resource contention, have just one thread perform the
+ // interlocked operation.
+ // Broadcast through the last slot of this warp's shared array, which is
+ // never occupied by queued values.
+ volatile uint* advance_shared =
+ valuesShared + (VALUES_PER_THREAD + 1) * WARP_SIZE - 1;
+ if(!lane)
+ *advance_shared = atomicAdd(&ksmallestStream_global, valuesToWrite);
+ int target = *advance_shared;
+ if(target >= capacity) return false;
+
+ target += lane;
+ valuesToWrite -= lane;
+
+ uint source = lane;
+ while(valuesToWrite > 0) {
+ dest_global[target] = valuesShared[source];
+ indices_global[target] = indicesShared[source];
+ target += WARP_SIZE;
+ source += WARP_SIZE;
+ valuesToWrite -= WARP_SIZE;
+ }
+
+ // Copy the values from the end of the shared memory array to the front.
+ if(count > lane) {
+ valuesShared[lane] = valuesShared[source];
+ indicesShared[lane] = indicesShared[source];
+ }
+ return true;
+}
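One detail worth noting in StreamToGlobal: only lane 0 issues the atomicAdd, and the result is broadcast to the rest of the warp through volatile shared memory. A stripped-down sketch of that idiom (hypothetical names; assumes the Fermi-era warp-synchronous execution this commit targets, and the project's DEVICE qualifier macro):

    __device__ int streamCounter_global;

    // Lane 0 reserves n slots for the whole warp; every lane reads the base
    // offset back from the shared broadcast slot and adds its own lane index.
    DEVICE int WarpReserve(int lane, int n, volatile uint* broadcast_shared) {
        if(!lane)
            *broadcast_shared = atomicAdd(&streamCounter_global, n);
        return (int)*broadcast_shared + lane;
    }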
+
+
+template<typename T>
+DEVICE2 void ProcessStream(const T* source_global, uint* dest_global,
+ uint* indices_global, int2 warpRange, T left, T right, int capacity,
+ bool checkRange, uint lane, uint warp) {
+
+ uint leftCounter = 0;
+ int warpSharedPop = 0;
+ uint mask = bfi(0, 0xffffffff, 0, lane);
+
+ volatile uint* valuesShared = values_shared +
+ warp * (VALUES_PER_THREAD + 1) * WARP_SIZE;
+ volatile uint* indicesShared = indices_shared +
+ warp * (VALUES_PER_THREAD + 1) * WARP_SIZE;
+
+ while(warpRange.x < warpRange.y) {
+ uint source = warpRange.x + lane;
+
+ // When checkRange is set this is the final, possibly partial warp, so
+ // guard the load against the end of the range.
+ bool inBounds = !checkRange || ((int)source < warpRange.y);
+ T val = inBounds ? source_global[source] : left;
+
+ // There are three cases:
+ // 1) The value comes before the left splitter. For this, increment
+ // leftCounter.
+ // 2) The value comes after the right splitter. For this, do nothing.
+ // 3) The value comes between the splitters. For this, move the value
+ // to shared memory and periodically stream to global memory.
+
+ bool inRange = false;
+ if(inBounds) {
+ if(val < left) ++leftCounter;
+ else if(val <= right) inRange = true;
+ }
+
+ uint warpValues = __ballot(inRange);
+
+ // Mask out the votes at and above this lane to find the offset.
+ uint offset = __popc(mask & warpValues) + warpSharedPop;
+ uint advance = __popc(warpValues);
+ warpSharedPop += advance;
+
+ if(inRange) valuesShared[offset] = ConvertToUint(val);
+ if(inRange) indicesShared[offset] = source;
+
+ if(warpSharedPop >= VALUES_PER_WARP) {
+ bool success = StreamToGlobal(dest_global, indices_global, lane,
+ warpSharedPop, false, capacity, valuesShared, indicesShared);
+ if(!success) return;
+ }
+
+ warpRange.x += WARP_SIZE;
+ }
+
+ // Flush any values still queued in shared memory.
+ if(warpSharedPop > 0)
+ StreamToGlobal(dest_global, indices_global, lane, warpSharedPop, true,
+ capacity, valuesShared, indicesShared);
+
+ // Sum the per-lane left counters with an intra-warp inclusive scan; the
+ // last lane then holds the warp total.
+ valuesShared[lane] = leftCounter;
+ #pragma unroll
+ for(int i = 0; i < LOG_WARP_SIZE; ++i) {
+ int offset = 1<< i;
+ if(lane >= offset) leftCounter += valuesShared[lane - offset];
+ valuesShared[lane] = leftCounter;
+ }
+
+ if(WARP_SIZE - 1 == lane)
+ atomicAdd(&ksmallestLeft_global, leftCounter);
+}
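The core of ProcessStream is the __ballot/__popc compaction idiom: every lane votes on a predicate, and each voting lane ranks itself by counting the set bits below its own position. A self-contained single-warp sketch (hypothetical kernel, not part of the commit):

    __global__ void WarpCompact(const int* in_global, int* out_global,
        int* count_global) {

        uint lane = 31 & threadIdx.x;
        uint mask = (1u << lane) - 1;          // bits strictly below this lane
        int x = in_global[lane];
        bool keep = x > 0;                     // arbitrary predicate

        uint votes = __ballot(keep);           // one vote bit per lane
        uint rank = __popc(mask & votes);      // this lane's output slot
        if(keep) out_global[rank] = x;
        if(!lane) *count_global = __popc(votes);
    }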
+
+template<typename T>
+DEVICE2 void CompareAndStream(const T* source_global, uint* dest_global,
+ uint* indices_global, const int2* range_global, T left, T right,
+ int capacity) {
+
+ uint tid = threadIdx.x;
+ uint lane = (WARP_SIZE - 1) & tid;
+ uint warp = tid / WARP_SIZE;
+ uint gid = blockIdx.x * NUM_WARPS + warp;
+
+ int2 warpRange = range_global[gid];
+
+ // Only the final warp runs with range checking enabled; all other warps
+ // process full, WARP_SIZE-aligned ranges.
+ if(gid + 1 < NUM_WARPS * gridDim.x) {
+ ProcessStream<T>(source_global, dest_global, indices_global, warpRange,
+ left, right, capacity, false, lane, warp);
+ } else {
+ ProcessStream<T>(source_global, dest_global, indices_global, warpRange,
+ left, right, capacity, true, lane, warp);
+ }
+}
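CompareAndStream is a DEVICE2 template, and this commit does not yet define a __global__ entry point for it. A plausible wrapper and launch, where every name below is an assumption rather than part of the commit:

    // Hypothetical entry point specializing CompareAndStream for floats.
    extern "C" __global__ void KSmallestStreamFloat(const float* source_global,
        uint* dest_global, uint* indices_global, const int2* range_global,
        float left, float right, int capacity) {

        CompareAndStream(source_global, dest_global, indices_global,
            range_global, left, right, capacity);
    }

    // One int2 range per warp, NUM_WARPS warps per block:
    // KSmallestStreamFloat<<<numRanges / NUM_WARPS, NUM_THREADS>>>(
    //     source, dest, indices, ranges, leftSplitter, rightSplitter, capacity);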
sort/vs9/vs9.sln
@@ -22,6 +22,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "kernels", "kernels", "{6F25
..\src\kernels\hist3_64.cu = ..\src\kernels\hist3_64.cu
..\src\kernels\histgen.cu = ..\src\kernels\histgen.cu
..\src\kernels\histogram.cu = ..\src\kernels\histogram.cu
+ ..\src\kernels\ksmallest.cu = ..\src\kernels\ksmallest.cu
..\src\kernels\params.cu = ..\src\kernels\params.cu
..\src\kernels\sort.cu = ..\src\kernels\sort.cu
..\src\kernels\sortcommon.cu = ..\src\kernels\sortcommon.cu
support/src/segscan/segscan.cpp
@@ -11,7 +11,7 @@
std::tr1::mt19937 mt19937;
std::tr1::uniform_int<uint> r(0, 3);
-std::tr1::uniform_int<uint> r2(0, 16 * 8);
+std::tr1::uniform_int<uint> r2(0, 7);
void SegmentedScan(uint* packed, int count) {
int last = 0;
@@ -87,9 +87,9 @@ int main(int argc, char** argv) {
printf("Warp segmented scan (1 val/thread):\n");
PrintSegScan(&host1[0], WarpSize);
-// SegmentedScan(&host1[0], WarpSize);
-// printf("\nCPU segmented scan:\n");
-// PrintSegScan(&host1[0], WarpSize);
+ SegmentedScan(&host1[0], WarpSize);
+ printf("\nCPU segmented scan:\n");
+ PrintSegScan(&host1[0], WarpSize);
CuCallStack callStack;
callStack.Push(device1, device1);
support/src/segscan/segscan.cu
@@ -99,7 +99,6 @@ extern "C" __global__ void SegScanWarp8(const uint* dataIn_global,
for(int i = 0; i < VALUES_PER_THREAD; ++i)
packed[i] = shared[offset + i];
-
////////////////////////////////////////////////////////////////////////////
// UPSWEEP PASS
// Run a sequential segmented scan for all values in the packed array. Find
@@ -109,19 +108,19 @@ extern "C" __global__ void SegScanWarp8(const uint* dataIn_global,
uint last = 0;
uint hasHeadFlag = 0;
- uint x[VALUES_PER_THREAD];
+ uint scan[VALUES_PER_THREAD];
uint flags[VALUES_PER_THREAD];
#pragma unroll
for(int i = 0; i < VALUES_PER_THREAD; ++i) {
flags[i] = 0x80000000 & packed[i];
- x[i] = 0x7fffffff & packed[i];
+ uint x = 0x7fffffff & packed[i];
if(flags[i]) last = 0;
hasHeadFlag |= flags[i];
- last += x[i];
+ scan[i] = last;
+ last += x;
}
-
////////////////////////////////////////////////////////////////////////////
// SEGMENT PASS
// Run a ballot and clz to find the thread containing the start value for
@@ -137,7 +136,6 @@ extern "C" __global__ void SegScanWarp8(const uint* dataIn_global,
int preceding = 31 - __clz(warpFlags);
uint distance = tid - preceding;
-
////////////////////////////////////////////////////////////////////////////
// REDUCTION PASS
// Run a prefix sum scan over last to compute for each thread the sum of all
@@ -162,7 +160,6 @@ extern "C" __global__ void SegScanWarp8(const uint* dataIn_global,
// the preceding thread.
sum += first - last;
-
////////////////////////////////////////////////////////////////////////////
// DOWNSWEEP PASS
// Add sum to all the values in the continuing segment (that is, before the
@@ -171,8 +168,7 @@ extern "C" __global__ void SegScanWarp8(const uint* dataIn_global,
#pragma unroll
for(int i = 0; i < VALUES_PER_THREAD; ++i) {
if(flags[i]) sum = 0;
- shared[offset + i] = sum;
- sum += x[i];
+ shared[offset + i] = scan[i] + sum;
}
// Store the values back to global memory.
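For reference, the exclusive segmented scan the kernel now computes (head flag in bit 31, value in the low 31 bits) is easy to state sequentially. A host-side sketch in the spirit of segscan.cpp's SegmentedScan verifier (SegScanExclusiveRef is a hypothetical name):

    typedef unsigned int uint;

    // Exclusive scan that resets at every head flag: out[i] is the sum of the
    // values that precede i within i's segment.
    void SegScanExclusiveRef(const uint* packed, uint* out, int count) {
        uint sum = 0;
        for(int i = 0; i < count; ++i) {
            if(0x80000000 & packed[i]) sum = 0;   // head flag starts a segment
            out[i] = sum;
            sum += 0x7fffffff & packed[i];
        }
    }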
