Skip to content
Permalink
Browse files
Args: add -yield: enable workaround for CUDA busy-wait
  • Loading branch information
preda committed Sep 22, 2019
1 parent d393f3a commit 5cca90dab8a817054620cd3eef8a1b016b87d3b8
Showing with 70 additions and 23 deletions.
  1. +3 −1 Args.cpp
  2. +5 −1 Args.h
  3. +1 −1 Gpu.cpp
  4. +17 −15 clpp.h
  5. +5 −5 clwrap.cpp
  6. +1 −0 clwrap.h
  7. +38 −0 tinycl.h
@@ -58,7 +58,8 @@ Command line options:
-pm1 <exponent> : run a single P-1 test and exit, ignoring worktodo.txt
-results <file> : name of results file, default 'results.txt'
-iters <N> : run next PRP test for <N> iterations and exit. Multiple of 10000.
-maxAlloc : limit GPU memory usage to this value in MB
-maxAlloc : limit GPU memory usage to this value in MB (needed on non-AMD GPUs)
-yield : enable work-around for CUDA busy wait taking up one CPU core
-use NEW_FFT8,OLD_FFT5,NEW_FFT10: comma separated list of defines, see the #if tests in gpuowl.cl (used for perf tuning).
-device <N> : select a specific device:
)", blockSize, logStep, B1, B2_B1_ratio);
@@ -107,6 +108,7 @@ void Args::parse(string line) {
else if (key == "-time") { timeKernels = true; }
else if (key == "-device" || key == "-d") { device = stoi(s); }
else if (key == "-dir") { dir = s; }
else if (key == "-yield") { cudaYield = true; }
else if (key == "-carry") {
if (s == "short" || s == "long") {
carry = s == "short" ? CARRY_SHORT : CARRY_LONG;
6 Args.h
@@ -23,12 +23,16 @@ class Args {
std::vector<std::string> flags;

int device = -1;

bool timeKernels = false;
bool enableTF = false;
bool cudaYield = false;

int carry = CARRY_AUTO;
u32 blockSize = 500;
u32 logStep = 50000;
int fftSize = 0;
bool enableTF = false;

u32 B1 = 500000;
u32 B2 = 0;
u32 B2_B1_ratio = 30;
@@ -176,7 +176,7 @@ Gpu::Gpu(const Args& args, u32 E, u32 W, u32 BIG_H, u32 SMALL_H, u32 nW, u32 nH,
device(device),
context{device},
program(compile(args, context.get(), N, E, W, SMALL_H, BIG_H / SMALL_H, nW)),
queue(Queue::make(context, timeKernels)),
queue(Queue::make(context, timeKernels, args.cudaYield)),

#define LOAD(name, workGroups) name(program.get(), queue, device, workGroups, #name)
LOAD(carryFused, BIG_H + 1),
32 clpp.h
@@ -5,6 +5,7 @@
#include "clwrap.h"
#include "AllocTrac.h"

#include <unistd.h>
#include <memory>
#include <string>
#include <utility>
@@ -73,17 +74,9 @@ void setArg(cl_kernel k, int pos, const Buffer<T>& buf) { setArg(k, pos, buf.get
// Thin RAII wrapper over a cl_event (ownership via EventHolder) adding
// the two queries the Queue needs: elapsed device time and completion.
class Event : public EventHolder {
public:
  // Elapsed device time of the command, in seconds.
  // Fix: compute in double. The original cast the u64 nanosecond count to
  // float first (`float(...) * 1e-9f`); float's 24-bit mantissa cannot
  // represent nanosecond counts beyond ~16.7e6 (i.e. kernels longer than
  // ~16.7 ms) exactly, so long-running kernels got truncated timings.
  double secs() { return double(getEventNanos(this->get())) * 1e-9; }

  // True once the command has finished executing on the device
  // (execution status == CL_COMPLETE). Used by the -yield polling loop.
  bool isComplete() { return getEventInfo(this->get()) == CL_COMPLETE; }
};

/*
struct TimeStats {
u64 nanos{};
u32 n{};
void add(u64 delta) { nanos += delta; ++n; }
void clear() { n = 0; nanos = 0; }
};
*/

struct TimeInfo {
double total{};
u32 n{};
@@ -100,10 +93,11 @@ class Queue : public QueueHolder {
TimeMap timeMap;
std::vector<std::pair<Event, TimeMap::iterator>> events;
bool profile{};
bool cudaYield{};

public:
// NOTE(review): the next two lines are the pre-change (removed) overloads as
// rendered by the diff view; the commit replaces them with the cudaYield-aware
// versions below.
explicit Queue(cl_queue q, bool profile) : QueueHolder{q}, profile{profile} {}
static QueuePtr make(const Context& context, bool profile) { return make_shared<Queue>(makeQueue(context.deviceId(), context.get(), profile), profile); }
// Takes ownership of q; `profile` enables per-kernel timing, `cudaYield`
// enables the polling workaround for CUDA's busy-wait in clFinish (see finish()).
Queue(cl_queue q, bool profile, bool cudaYield) : QueueHolder{q}, profile{profile}, cudaYield{cudaYield} {}
// Factory: builds the underlying cl_queue for the context's device and wraps
// it in a shared Queue carrying the profile/cudaYield flags.
static QueuePtr make(const Context& context, bool profile, bool cudaYield) { return make_shared<Queue>(makeQueue(context.deviceId(), context.get(), profile), profile, cudaYield); }

template<typename T> vector<T> read(const Buffer<T>& buf, size_t sizeOrFull = 0) {
auto size = sizeOrFull ? sizeOrFull : buf.size();
@@ -137,14 +131,22 @@ class Queue : public QueueHolder {

// Enqueues one kernel launch and records its event.
// When profiling, every event is kept (paired with its per-kernel TimeInfo
// slot) so finish() can accumulate timings. When NOT profiling, only the most
// recent event is retained (overwriting slot 0) — just enough for the
// cudaYield path in finish() to poll for completion of the last command.
void run(cl_kernel kernel, size_t groupSize, size_t workSize, const string &name) {
Event event{::run(get(), kernel, groupSize, workSize, name)};
// NOTE(review): the next line is the pre-change (removed) body as rendered by
// the diff view; it is superseded by the logic below.
if (profile) { events.emplace_back(std::move(event), timeMap.insert({name, TimeInfo{}}).first); }
// Only look up / create a TimeInfo slot when profiling; otherwise use a
// sentinel iterator that finish() will never dereference.
auto it = profile ? timeMap.insert({name, TimeInfo{}}).first : timeMap.end();
if (profile || events.empty()) {
events.emplace_back(std::move(event), it);
} else {
// Non-profiling steady state: recycle the single slot with the newest event.
events.front() = std::make_pair(std::move(event), it);
}
}

// Waits for all enqueued work to complete.
// With -yield (cudaYield): instead of clFinish() — which on NVIDIA's CUDA
// OpenCL busy-waits and pins a CPU core — poll the completion status of the
// most recently enqueued event, sleeping 50us between polls. Events complete
// in queue order on an in-order queue, so the last event finishing implies
// all prior work is done — presumably the queue is in-order; verify.
void finish() {
// NOTE(review): the next three lines are the pre-change (removed) body as
// rendered by the diff view; superseded by the logic below.
::finish(get());
if (profile) {
for (auto& [event, it] : events) { it->second.add(event.secs()); }
// Nothing enqueued since the last finish: nothing to wait for.
if (events.empty()) { return; }
if (cudaYield) {
while (!events.back().first.isComplete()) { usleep(50); } // std::this_thread::sleep_for();
} else {
::finish(get());
}
// Profiling: fold each completed event's duration into its kernel's TimeInfo.
if (profile) { for (auto& [event, it] : events) { it->second.add(event.secs()); } }
events.clear();
}

@@ -364,13 +364,13 @@ void fillBuf(cl_queue q, cl_mem buf, void *pat, size_t patSize, size_t size, siz
CHECK1(clEnqueueFillBuffer(q, buf, pat, patSize, start, size ? size : patSize, 0, 0, 0));
}

// NOTE(review): lines marked as removed by the diff view (the old
// `u64 getEventNanos` opener, the surrounding `/* ... */` block-comment
// markers, and the log() call) are interleaved below with the added function.
u64 getEventNanos(cl_event event) {
/*
// New in this commit: queries an event's execution status via
// CL_EVENT_COMMAND_EXECUTION_STATUS. Returns CL_COMPLETE / CL_RUNNING /
// CL_SUBMITTED / CL_QUEUED; used by Event::isComplete() for the -yield
// polling workaround.
u32 getEventInfo(cl_event event) {
u32 status = -1;
CHECK1(clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, 0));
log("event execution status %u\n", status);
*/

return status;
}

u64 getEventNanos(cl_event event) {
u64 start = 0;
u64 end = 0;
CHECK1(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, 0));
@@ -98,3 +98,4 @@ std::string getKernelArgName(cl_kernel k, int pos);

cl_device_id getDevice(int argsDevId);
u64 getEventNanos(cl_event event);
u32 getEventInfo(cl_event event);
@@ -130,6 +130,44 @@ int clSetKernelArgSVMPointer(cl_kernel, unsigned, const void *);
#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3
#define CL_EVENT_CONTEXT 0x11D4

/* Constants added for the -yield workaround. Values mirror the Khronos
   OpenCL headers (CL/cl.h) — verify against the official header when
   updating. Only the execution-status values below are used by the new
   getEventInfo()/Event::isComplete() path; the cl_command_type list is
   included for completeness. */
/* cl_command_type */
#define CL_COMMAND_NDRANGE_KERNEL 0x11F0
#define CL_COMMAND_TASK 0x11F1
#define CL_COMMAND_NATIVE_KERNEL 0x11F2
#define CL_COMMAND_READ_BUFFER 0x11F3
#define CL_COMMAND_WRITE_BUFFER 0x11F4
#define CL_COMMAND_COPY_BUFFER 0x11F5
#define CL_COMMAND_READ_IMAGE 0x11F6
#define CL_COMMAND_WRITE_IMAGE 0x11F7
#define CL_COMMAND_COPY_IMAGE 0x11F8
#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9
#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA
#define CL_COMMAND_MAP_BUFFER 0x11FB
#define CL_COMMAND_MAP_IMAGE 0x11FC
#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD
#define CL_COMMAND_MARKER 0x11FE
#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF
#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200
#define CL_COMMAND_READ_BUFFER_RECT 0x1201
#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202
#define CL_COMMAND_COPY_BUFFER_RECT 0x1203
#define CL_COMMAND_USER 0x1204
#define CL_COMMAND_BARRIER 0x1205
#define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206
#define CL_COMMAND_FILL_BUFFER 0x1207
#define CL_COMMAND_FILL_IMAGE 0x1208
#define CL_COMMAND_SVM_FREE 0x1209
#define CL_COMMAND_SVM_MEMCPY 0x120A
#define CL_COMMAND_SVM_MEMFILL 0x120B
#define CL_COMMAND_SVM_MAP 0x120C
#define CL_COMMAND_SVM_UNMAP 0x120D

/* command execution status */
/* Returned by clGetEventInfo(CL_EVENT_COMMAND_EXECUTION_STATUS);
   CL_COMPLETE is what Event::isComplete() tests for. */
#define CL_COMPLETE 0x0
#define CL_RUNNING 0x1
#define CL_SUBMITTED 0x2
#define CL_QUEUED 0x3
#define CL_INVALID_COMPILER_OPTIONS -66

#define CL_KERNEL_NUM_ARGS 0x1191

0 comments on commit 5cca90d

Please sign in to comment.