/*
 * PrefixSum.cpp
 *
 * OpenCL host code: the PrefixSum wrapper class (kernels are loaded from
 * ./prefixsum.cl) and a benchmark main() that times PrefixSum::scan() over a
 * list of image and tile sizes, printing one CSV row per combination.
 */
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <vector>
#include <math.h>
#include <sys/time.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
using namespace std;
/* Local work size handed to every kernel launch (work-items per work-group). */
#define LW 256
/* Elements handled per work-item; presumably mirrored in the kernel source - TODO confirm. */
#define ELEMENTS_PER_THREAD 2
/* Elements covered by one work-group. */
#define ELEMENTS_PER_WORKGROUP (ELEMENTS_PER_THREAD*LW)
/* Maximum number of bytes of kernel source read from disk. */
#define MAX_SOURCE_SIZE (0x100000)
/* Number of timed scan() calls per benchmark configuration. */
#define ITERATIONS 1000
/*
 * TRUE/FALSE are not provided by the standard headers. Without a definition,
 * `#if PROFILE_GPU == TRUE` compares two undefined identifiers (both evaluate
 * to 0 in the preprocessor) and is true no matter what PROFILE_GPU is set to.
 */
#ifndef TRUE
#define TRUE 1
#endif
#ifndef FALSE
#define FALSE 0
#endif
/* Selects the `#if PROFILE_GPU == TRUE` sections (command-queue profiling and event timing). */
#define PROFILE_GPU TRUE
/* Returns the smallest power of two that is >= x; any x <= 1 yields 1. */
int pow2gt(int x) {
    int power = 1;
    for (; power < x; power <<= 1) {
        /* keep doubling until the bound is reached */
    }
    return power;
}
/*
 * Host-side wrapper around the three scan kernels built from ./prefixsum.cl.
 * Holds the command queue, the kernels and the partial-result buffers that
 * scan() passes to them.
 */
class PrefixSum {
private:
    /* Element count used by scan()/factory() when no length is supplied. */
    int capacity;
    /* Context supplied by the caller; buffers are created in it. */
    cl_context context;
    /* Queue created by the constructor; all kernels are enqueued on it. */
    cl_command_queue queue;
    /* Array of device buffers allocated by the constructor. */
    cl_mem *d_parts;
    /* Local work size for every launch. */
    size_t local;
    cl_kernel kern_scan_pad_to_pow2;
    cl_kernel kern_scan_subarrays;
    cl_kernel kern_scan_inc_subarrays;

public:
    /* Sum of (profiling end - profiling start) over all kernels run so far. */
    double elapsed;

    PrefixSum(cl_context context, cl_device_id device_id, int n_devices, int capacity);

    /* Buffer allocation: explicit length, or `capacity` elements. */
    cl_mem factory(int len);
    cl_mem factory();

    /* Scan of d_array: `capacity` elements, or an explicit `len`. */
    int scan(cl_mem d_array, cl_mem d_total);
    int scan(cl_mem d_array, cl_mem d_total, int len);
};
/*
 * Convenience overload: scans the first `capacity` elements of d_array by
 * forwarding to scan(d_array, d_total, len).
 *
 * Returns the value produced by that overload.
 */
int PrefixSum::scan(cl_mem d_array, cl_mem d_total) {
    /* The result must be returned: falling off the end of a non-void
       function is undefined behaviour. */
    return scan(d_array, d_total, capacity);
}
int PrefixSum::scan(cl_mem d_array, cl_mem d_total, int len) {
cl_event event;
#if PROFILE_GPU == TRUE
cl_ulong time_start, time_end;
#endif
int i;
int k = (len + ELEMENTS_PER_WORKGROUP - 1) / ELEMENTS_PER_WORKGROUP;
size_t global = k*LW;
cl_mem d_part;
cl_int ret;
if (k == 1) {
ret = clSetKernelArg(kern_scan_pad_to_pow2, 0, sizeof(cl_mem), &d_array);
ret = clSetKernelArg(kern_scan_pad_to_pow2, 1, ELEMENTS_PER_WORKGROUP*sizeof(int), NULL);
ret |= clSetKernelArg(kern_scan_pad_to_pow2, 2, sizeof(int), &len);
ret = clSetKernelArg(kern_scan_pad_to_pow2, 3, sizeof(cl_mem), &d_total);
if (ret != CL_SUCCESS)
{
printf("Error: Failed to set kernel arguments! %d\n", ret);
exit(1);
}
#if PROFILE_GPU == TRUE
ret = clEnqueueNDRangeKernel(queue, kern_scan_pad_to_pow2, 1, NULL, &global, &local, 0, NULL, &event);
clWaitForEvents(1, &event);
if (ret) {
printf("Error: Failed to execute kernel!\n");
return EXIT_FAILURE;
}
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
elapsed += time_end - time_start;
#endif
}
else {
d_part = d_parts[(int) (log(len)/log(ELEMENTS_PER_WORKGROUP)) - 1];
ret = clSetKernelArg(kern_scan_subarrays, 0, sizeof(cl_mem), &d_array);
ret |= clSetKernelArg(kern_scan_subarrays, 1, ELEMENTS_PER_WORKGROUP*sizeof(int), NULL);
ret |= clSetKernelArg(kern_scan_subarrays, 2, sizeof(cl_mem), &d_part);
ret |= clSetKernelArg(kern_scan_subarrays, 3, sizeof(int), &len);
if (ret != CL_SUCCESS)
{
printf("Error: Failed to set kernel arguments! %d\n", ret);
exit(1);
}
ret = clEnqueueNDRangeKernel(queue, kern_scan_subarrays, 1, NULL, &global, &local, 0, NULL, &event);
clWaitForEvents(1, &event);
if (ret) {
printf("Error: Failed to execute kernel!\n");
return EXIT_FAILURE;
}
#if PROFILE_GPU == TRUE
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
elapsed += time_end - time_start;
#endif
ret = clSetKernelArg(kern_scan_inc_subarrays, 0, sizeof(cl_mem), &d_array);
ret |= clSetKernelArg(kern_scan_inc_subarrays, 1, ELEMENTS_PER_WORKGROUP*sizeof(int), NULL);
ret |= clSetKernelArg(kern_scan_inc_subarrays, 2, sizeof(cl_mem), &d_part);
ret |= clSetKernelArg(kern_scan_inc_subarrays, 3, sizeof(int), &len);
if (ret != CL_SUCCESS)
{
printf("Error: Failed to set kernel arguments! %d\n", ret);
exit(1);
}
ret = clEnqueueNDRangeKernel(queue, kern_scan_inc_subarrays, 1, NULL, &global, &local, 0, NULL, &event);
clWaitForEvents(1, &event);
if (ret) {
printf("Error: Failed to execute kernel!\n");
return EXIT_FAILURE;
}
#if PROFILE_GPU == TRUE
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
elapsed += time_end - time_start;
#endif
}
clFinish(queue);
}
/* Allocates a device buffer for `capacity` elements via factory(int). */
cl_mem PrefixSum::factory() {
    const int requested = capacity;
    return factory(requested);
}
/*
 * Allocates a read/write device buffer of ints in this object's context.
 * The element count is `len` rounded up to a power of two by pow2gt().
 *
 * Returns the new buffer, or NULL when clCreateBuffer fails. The failure is
 * reported on stderr together with the OpenCL status code, so that it does
 * not get mixed into the CSV rows the benchmark writes to stdout.
 */
cl_mem PrefixSum::factory(int len) {
    cl_int ret = CL_SUCCESS;
    const size_t n_elems = (size_t) pow2gt(len);
    cl_mem buf = clCreateBuffer(context, CL_MEM_READ_WRITE, n_elems*sizeof(int), NULL, &ret);
    if (ret != CL_SUCCESS) {
        fprintf(stderr, "Error: Failed to create buffer of %lu ints! %d\n", (unsigned long) n_elems, ret);
        return NULL;
    }
    return buf;
}
PrefixSum::PrefixSum(cl_context p_context, cl_device_id device_id, int n_devices, int p_capacity) {
capacity = p_capacity;
context = p_context;
FILE *fp;
char fileName[] = "./prefixsum.cl";
char *source_str;
size_t source_size;
cl_program program = NULL;
cl_int ret;
/* Load the source code containing the kernel*/
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
/* Create Command Queue */
#if PROFILE_GPU == TRUE
queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret);
#else
queue = clCreateCommandQueue(context, device_id, 0, &ret);
#endif
/* Create Kernel Program from the source */
program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
/* Build Kernel Program */
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
/* Create OpenCL Kernel */
kern_scan_pad_to_pow2 = clCreateKernel(program, "scan_pad_to_pow2", &ret);
kern_scan_subarrays = clCreateKernel(program, "scan_subarrays", &ret);
kern_scan_inc_subarrays = clCreateKernel(program, "scan_inc_subarrays", &ret);
local = LW;
elapsed = 0;
int len = capacity/ELEMENTS_PER_WORKGROUP;
int n = (int) ceil(log((float) capacity)/log((float) ELEMENTS_PER_WORKGROUP));
d_parts = (cl_mem*) malloc(n*sizeof(cl_mem));
for (int i=0; i<n; i++) {
d_parts[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, len*sizeof(int), NULL, NULL);
len /= ELEMENTS_PER_WORKGROUP;
i++;
}
}
/* Width/height pair used for both image sizes and tile sizes. */
struct dim2d {
    int w;
    int h;
};
typedef struct dim2d dim2d_t;

/* Lists walked by main(); main() fills them from the constants below. */
vector<dim2d_t> tile_dims;
vector<dim2d_t> img_dims;

/* Tile sizes. */
dim2d_t tile_dims0  = { 16,  16};
dim2d_t tile_dims1  = { 32,  32};

/* Image sizes. */
dim2d_t img_dims0   = { 400,  300};
dim2d_t img_dims1   = { 640,  480};
dim2d_t img_dims2   = { 800,  600};
dim2d_t img_dims3   = {1024,  768};
dim2d_t img_dims4   = {1600, 1200};
dim2d_t img_dims5   = {1920, 1080};
dim2d_t img_dims6   = {2560, 1440};
dim2d_t img_dims7   = {2048, 2048};
dim2d_t img_dims8   = {3600, 2400};
dim2d_t img_dims9   = {4096, 4096};
dim2d_t img_dims10  = {8192, 8192};
int main() {
cl_context context = NULL;
cl_device_id device_id = NULL;
cl_platform_id platform_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
cl_command_queue queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret);
tile_dims.push_back(tile_dims0);
tile_dims.push_back(tile_dims1);
img_dims.push_back(img_dims0);
img_dims.push_back(img_dims1);
img_dims.push_back(img_dims2);
img_dims.push_back(img_dims3);
img_dims.push_back(img_dims4);
img_dims.push_back(img_dims5);
img_dims.push_back(img_dims6);
img_dims.push_back(img_dims7);
img_dims.push_back(img_dims8);
img_dims.push_back(img_dims9);
img_dims.push_back(img_dims10);
cout << "img dim,mp,tile dim,num tiles,total ms,kernels ms" << endl;
for(vector<dim2d_t>::iterator it = img_dims.begin(); it != img_dims.end(); ++it) {
int img_w = (*it).w;
int img_h = (*it).h;
float mp = (float) (img_w*img_h)/(1024*1024);
for(vector<dim2d_t>::iterator it = tile_dims.begin(); it != tile_dims.end(); ++it) {
int tile_w = (*it).w;
int tile_h = (*it).h;
int n_tiles = (img_w/tile_w)*(img_h/tile_h);
PrefixSum ps = PrefixSum(context, device_id, 1, n_tiles);
int h_total = 0;
cl_mem d_total = clCreateBuffer(context, CL_MEM_READ_WRITE, 1*sizeof(int), NULL, &ret);
cl_mem d_list = ps.factory();
int data[n_tiles];
for (int i=0; i<n_tiles; i++) {
data[i] = 1;
}
ret = clEnqueueWriteBuffer(queue, d_list, CL_TRUE, 0, n_tiles*sizeof(int), data, 0, NULL, NULL);
struct timeval t, t2;
double elapsed;
gettimeofday(&t, NULL);
for (int i=0; i<ITERATIONS; i++) {
ps.scan(d_list, d_total, n_tiles);
}
gettimeofday(&t2, NULL);
double seconds = t2.tv_sec - t.tv_sec;
double microseconds = t2.tv_usec - t.tv_usec;
elapsed = (seconds * 1.0e6 + microseconds);
/*
cout << mp << "mp, " << n_tiles << " (" << tile_w << "x" << tile_h << ") tiles" << endl;
cout << "total: " << (1e-3 * elapsed) / ITERATIONS << "ms" << endl;
#if PROFILE_GPU == TRUE
cout << "kernels: " << (1e-6 * ps.elapsed) / ITERATIONS << "ms" << endl;
#endif
cout << endl;
*/
cout << "(" << img_w << "x" << img_h << "),";
cout << mp << ",";
cout << "(" << tile_w << "x" << tile_h << "),";
cout << n_tiles << ",";
cout << (1e-3 * elapsed) / ITERATIONS << ",";
cout << (1e-6 * ps.elapsed) / ITERATIONS;
cout << endl;
ret = clEnqueueReadBuffer(queue, d_total, CL_TRUE, 0, 1*sizeof(int), &h_total, 0, NULL, NULL);
ret = clEnqueueReadBuffer(queue, d_list, CL_TRUE, 0, n_tiles*sizeof(int), &data, 0, NULL, NULL);
}
}
}