Skip to content

Commit

Permalink
Add support for cl_khr_command_buffer_multi_device
Browse files Browse the repository at this point in the history
cmdbuf: handle profiling queries for multidev buffers

cmdbuf: add new platform & device queries

multidev command buffers & remap

clRemapCommandBufferKHR: leave new command buffer in same state as the original

advertise cl_khr_command_buffer_multi_device

clEnqueueCommandBufferKHR: fix helper function name

command buffers: add multi device test

cmdbuf: make clRemapCommandBufferKHR actually available

cmdbuf: various bugfixes

cmdbuf: more correct cmdbuf flag handling

tests/cmdbuf: add automatic remapping flag to clRemapCommandBufferKHR

lib: update command buffer code to conform to latest specs

Rework *ndrangekernel & cmdbuf remapping

multidev-cmdbuf: Use correct queues to run cmds

tests: fix memory leaks in multidev cmdbuf test

tests: add missing include
  • Loading branch information
jansol committed Apr 4, 2024
1 parent a055b67 commit 59016cb
Show file tree
Hide file tree
Showing 19 changed files with 961 additions and 147 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1315,8 +1315,8 @@ set(HOST_DEVICE_CL_VERSION_MINOR 0)
set(HOST_DEVICE_EXTENSIONS "cl_khr_byte_addressable_store cl_khr_global_int32_base_atomics \
cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics \
cl_khr_local_int32_extended_atomics cl_khr_3d_image_writes \
cl_khr_command_buffer cl_khr_subgroups cl_intel_unified_shared_memory \
cl_exp_pinned_buffers")
cl_khr_command_buffer cl_khr_command_buffer_multi_device cl_khr_subgroups \
cl_intel_unified_shared_memory cl_exp_pinned_buffers")

# Host CPU device: list of OpenCL 3.0 features that are always enabled
# TODO: __opencl_c_atomic_scope_all_devices works with CPU device but not others
Expand Down
1 change: 1 addition & 0 deletions lib/CL/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ set(POCL_LIB_SOURCES "clCreateContextFromType.c"
"clGetCommandBufferInfoKHR.c"
"clReleaseCommandBufferKHR.c"
"clRetainCommandBufferKHR.c"
"clRemapCommandBufferKHR.c"
"clMemAllocINTEL.c"
"clMemFreeINTEL.c"
"clGetMemAllocInfoINTEL.c"
Expand Down
28 changes: 4 additions & 24 deletions lib/CL/clCommandNDRangeKernelKHR.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,29 +42,9 @@ POname (clCommandNDRangeKernelKHR) (

CMDBUF_VALIDATE_COMMON_HANDLES;

errcode = pocl_ndrange_kernel_common (
command_buffer, command_queue, properties, kernel, work_dim,
global_work_offset, global_work_size, local_work_size,
num_sync_points_in_wait_list, NULL, NULL, sync_point_wait_list,
sync_point, &cmd);

for (unsigned i = 0; i < kernel->meta->num_args; ++i)
{
struct pocl_argument_info *ai
= &cmd->command.run.kernel->meta->arg_info[i];
struct pocl_argument *a = &cmd->command.run.kernel->dyn_arguments[i];
if (ai->type == POCL_ARG_TYPE_SAMPLER)
POname (clRetainSampler) (cmd->command.run.arguments[i].value);
}

errcode = pocl_command_record (command_buffer, cmd, sync_point);
if (errcode != CL_SUCCESS)
goto ERROR;

return CL_SUCCESS;

ERROR:
pocl_mem_manager_free_command (cmd);
return errcode;
return pocl_record_ndrange_kernel (
command_buffer, command_queue, properties, kernel, kernel->dyn_arguments,
work_dim, global_work_offset, global_work_size, local_work_size,
num_sync_points_in_wait_list, sync_point_wait_list, sync_point);
}
POsym (clCommandNDRangeKernelKHR)
19 changes: 9 additions & 10 deletions lib/CL/clCreateCommandBufferKHR.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <CL/cl_ext.h>

#include "pocl_cl.h"
#include "pocl_util.h"

CL_API_ENTRY cl_command_buffer_khr CL_API_CALL
POname (clCreateCommandBufferKHR) (
Expand All @@ -34,11 +35,7 @@ POname (clCreateCommandBufferKHR) (
int errcode = 0;
cl_command_buffer_khr cmdbuf = NULL;

/* cl_khr_command_buffer_multi_device supports multiple queues but the basic
* extension does not, keep this check as is until we support that extension
*/
POCL_GOTO_ERROR_COND ((num_queues != 1), CL_INVALID_VALUE);

POCL_GOTO_ERROR_COND ((num_queues == 0), CL_INVALID_VALUE);
POCL_GOTO_ERROR_COND ((queues == NULL), CL_INVALID_VALUE);

/* All queues must have the same OpenCL context */
Expand Down Expand Up @@ -81,17 +78,19 @@ POname (clCreateCommandBufferKHR) (
}

const cl_command_buffer_properties_khr *val = key + 1;
cl_command_buffer_properties_khr tmp = *val;
switch (*key)
{
case CL_COMMAND_BUFFER_FLAGS_KHR:
/* For now only CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR is a known
* allowed value */
POCL_GOTO_ERROR_COND (
((*val & ~(CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR)) != 0),
CL_INVALID_VALUE);
/* Simultaneous use is always supported, no action needed */
tmp &= ~CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR;

/* If any of the devices associated with 'queues' does not
* support a requested capability, error out with
* CL_INVALID_PROPERTY */

/* Any flag bits not handled above are invalid */
POCL_GOTO_ERROR_COND ((tmp != 0), CL_INVALID_VALUE);
break;
default:
errcode = CL_INVALID_VALUE;
Expand Down
55 changes: 19 additions & 36 deletions lib/CL/clEnqueueCommandBufferKHR.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,18 +41,6 @@ buffer_finished_callback (cl_event event, cl_int event_command_status,
POname (clReleaseCommandBufferKHR) (command_buffer);
}

static cl_command_buffer_properties_khr
get_cmdbuf_property (cl_command_buffer_khr command_buffer,
cl_command_buffer_properties_khr name)
{
for (unsigned i = 0; i < command_buffer->num_properties; ++i)
{
if (command_buffer->properties[2 * i] == name)
return command_buffer->properties[2 * i + 1];
}
return 0;
}

CL_API_ENTRY cl_int
POname (clEnqueueCommandBufferKHR) (cl_uint num_queues,
cl_command_queue *queues,
Expand All @@ -70,7 +58,8 @@ POname (clEnqueueCommandBufferKHR) (cl_uint num_queues,
POCL_RETURN_ERROR_COND ((command_buffer->queues == NULL),
CL_INVALID_COMMAND_BUFFER_KHR);

const cl_command_queue *used_queues;
cl_uint num_used_queues = command_buffer->num_queues;
const cl_command_queue *used_queues = command_buffer->queues;

POCL_RETURN_ERROR_COND ((num_queues != 0 && queues == NULL),
CL_INVALID_VALUE);
Expand Down Expand Up @@ -114,14 +103,9 @@ POname (clEnqueueCommandBufferKHR) (cl_uint num_queues,
POCL_RETURN_ERROR_COND (
(queues[i]->properties != command_buffer->queues[i]->properties),
CL_INCOMPATIBLE_COMMAND_QUEUE_KHR);

used_queues = queues;
}
}
else
{
num_queues = command_buffer->num_queues;
used_queues = command_buffer->queues;
used_queues = queues;
num_used_queues = num_queues;
}

errcode = pocl_check_event_wait_list (
Expand All @@ -130,8 +114,8 @@ POname (clEnqueueCommandBufferKHR) (cl_uint num_queues,
return errcode;

cl_command_buffer_flags_khr flags
= (cl_command_buffer_flags_khr)get_cmdbuf_property (
command_buffer, CL_COMMAND_BUFFER_FLAGS_KHR);
= (cl_command_buffer_flags_khr)pocl_cmdbuf_get_property (
command_buffer, CL_COMMAND_BUFFER_FLAGS_KHR);
POCL_LOCK (command_buffer->mutex);
int is_ready
= command_buffer->state == CL_COMMAND_BUFFER_STATE_EXECUTABLE_KHR
Expand All @@ -146,12 +130,12 @@ POname (clEnqueueCommandBufferKHR) (cl_uint num_queues,
POCL_RETURN_ERROR_COND ((!is_ready), CL_INVALID_OPERATION);

/* Submit to queue(s) */
cl_command_queue q = used_queues[0];
if (num_queues == 1 && q->device->ops->run_command_buffer)
if (num_used_queues == 1 && used_queues[0]->device->ops->run_command_buffer)
{
/* TODO: add base event id & increment global event id counter by number
* of commands generated by the buffer */
return q->device->ops->run_command_buffer (q->device, command_buffer);
return used_queues[0]->device->ops->run_command_buffer (
used_queues[0]->device, command_buffer);
}
/* Submit individual commands manually */
else
Expand All @@ -166,18 +150,17 @@ POname (clEnqueueCommandBufferKHR) (cl_uint num_queues,
unsigned sync_id = 0;
LL_FOREACH (command_buffer->cmds, cmd)
{
q = used_queues[cmd->queue_idx];
unsigned j, k;
unsigned j = 0, k = 0;

/* Add events from syncpoints to waitlist */
for (j = 0; j < cmd->sync.syncpoint.num_sync_points_in_wait_list; ++j)
for (; j < cmd->sync.syncpoint.num_sync_points_in_wait_list; ++j)
{
// sync point ids start at 1
deps[j]
= syncpoints[cmd->sync.syncpoint.sync_point_wait_list[j] - 1];
}
/* Add events from command buffer dependencies to waitlist */
for (k = 0; k < num_events_in_wait_list; ++k, ++j)
for (; k < num_events_in_wait_list; ++k, ++j)
{
deps[j] = event_wait_list[k];
}
Expand All @@ -195,8 +178,8 @@ POname (clEnqueueCommandBufferKHR) (cl_uint num_queues,
sizeof (cl_mem) * cmd->memobj_count);
}
errcode = pocl_create_command (
&node, q, cmd->type, &syncpoints[sync_id], j, deps,
cmd->memobj_count, memobj_list, readonly_flag_list);
&node, used_queues[cmd->queue_idx], cmd->type, &syncpoints[sync_id],
j, deps, cmd->memobj_count, memobj_list, readonly_flag_list);
++sync_id;

POCL_MEM_FREE (readonly_flag_list);
Expand All @@ -219,7 +202,7 @@ POname (clEnqueueCommandBufferKHR) (cl_uint num_queues,
return errcode;
}

pocl_command_enqueue (q, node);
pocl_command_enqueue (used_queues[cmd->queue_idx], node);
}

/* We need an event for the completion of the command buffer as a whole.
Expand All @@ -229,9 +212,9 @@ POname (clEnqueueCommandBufferKHR) (cl_uint num_queues,
/* TODO: which queue should be managing the buffer completion event? */
_cl_command_node *node = NULL;
cl_event final_ev;
errcode = pocl_create_command (&node, q, CL_COMMAND_COMMAND_BUFFER_KHR,
&final_ev, command_buffer->num_syncpoints,
syncpoints, 0, NULL, NULL);
errcode = pocl_create_command (
&node, used_queues[0], CL_COMMAND_COMMAND_BUFFER_KHR, &final_ev,
command_buffer->num_syncpoints, syncpoints, 0, NULL, NULL);
if (errcode != CL_SUCCESS)
{
pocl_mem_manager_free_command (node);
Expand Down Expand Up @@ -259,7 +242,7 @@ POname (clEnqueueCommandBufferKHR) (cl_uint num_queues,
POname (clReleaseEvent) (syncpoints[i]);
}
POname (clRetainCommandBufferKHR) (command_buffer);
pocl_command_enqueue (q, node);
pocl_command_enqueue (used_queues[0], node);

return CL_SUCCESS;
}
Expand Down
14 changes: 3 additions & 11 deletions lib/CL/clEnqueueNDRangeKernel.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,27 +22,19 @@
IN THE SOFTWARE.
*/

#include "config.h"
#include "pocl_binary.h"
#include "pocl_cache.h"
#include "pocl_cl.h"
#include "pocl_context.h"
#include "pocl_cq_profiling.h"
#include "pocl_llvm.h"
#include "pocl_local_size.h"
#include "pocl_mem_management.h"
#include "pocl_shared.h"
#include "pocl_util.h"
#include "utlist.h"

#ifndef _WIN32
# include <unistd.h>
#else
# include "vccompat.hpp"
#endif
#include <assert.h>
#include <sys/stat.h>
#include <errno.h>
#include <string.h>

//#define DEBUG_NDRANGE
Expand All @@ -65,9 +57,9 @@ POname(clEnqueueNDRangeKernel)(cl_command_queue command_queue,
POCL_RETURN_ERROR_COND ((*(command_queue->device->available) == CL_FALSE),
CL_DEVICE_NOT_AVAILABLE);
errcode = pocl_ndrange_kernel_common (
NULL, command_queue, NULL, kernel, work_dim, global_work_offset,
global_work_size, local_work_size, num_events_in_wait_list,
event_wait_list, event, NULL, NULL, &cmd);
NULL, command_queue, NULL, kernel, kernel->dyn_arguments, work_dim,
global_work_offset, global_work_size, local_work_size,
num_events_in_wait_list, event_wait_list, event, NULL, NULL, &cmd);
POCL_RETURN_ERROR_COND (errcode != CL_SUCCESS, errcode);

if (pocl_cq_profiling_enabled)
Expand Down
29 changes: 24 additions & 5 deletions lib/CL/clGetDeviceInfo.c
Original file line number Diff line number Diff line change
Expand Up @@ -385,14 +385,33 @@ POname(clGetDeviceInfo)(cl_device_id device,

/** cl_khr_command_buffer queries **/
case CL_DEVICE_COMMAND_BUFFER_CAPABILITIES_KHR:
POCL_RETURN_GETINFO (
cl_device_command_buffer_capabilities_khr,
CL_COMMAND_BUFFER_CAPABILITY_KERNEL_PRINTF_KHR
| CL_COMMAND_BUFFER_CAPABILITY_SIMULTANEOUS_USE_KHR
| CL_COMMAND_BUFFER_CAPABILITY_OUT_OF_ORDER_KHR);
POCL_RETURN_GETINFO (cl_device_command_buffer_capabilities_khr,
CL_COMMAND_BUFFER_CAPABILITY_KERNEL_PRINTF_KHR
| CL_COMMAND_BUFFER_CAPABILITY_SIMULTANEOUS_USE_KHR
| CL_COMMAND_BUFFER_CAPABILITY_OUT_OF_ORDER_KHR
| CL_COMMAND_BUFFER_CAPABILITY_MULTIPLE_QUEUE_KHR);

case CL_DEVICE_COMMAND_BUFFER_REQUIRED_QUEUE_PROPERTIES_KHR:
POCL_RETURN_GETINFO (cl_command_queue_properties, 0);

case CL_DEVICE_COMMAND_BUFFER_NUM_SYNC_DEVICES_KHR:
if (device->ops->get_device_info_ext != NULL
&& device->ops->get_device_info_ext (device, param_name,
param_value_size, param_value,
param_value_size_ret)
== CL_SUCCESS)
return CL_SUCCESS;
/* If querying the actual device fails, it is probably safe to say there
* are no devices it can sync with on the device side */
POCL_RETURN_GETINFO (cl_uint, 0);

case CL_DEVICE_COMMAND_BUFFER_SYNC_DEVICES_KHR:
if (device->ops->get_device_info_ext != NULL)
/* Let devices fill in the list if supported, or no-op otherwise */
device->ops->get_device_info_ext (device, param_name, param_value_size,
param_value, param_value_size_ret);
return CL_SUCCESS; /* gracefully no-op in case the driver fails to handle
the query */
}

if(device->ops->get_device_info_ext != NULL) {
Expand Down
14 changes: 14 additions & 0 deletions lib/CL/clGetEventProfilingInfo.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,20 @@ POname(clGetEventProfilingInfo)(cl_event event,
POCL_RETURN_ERROR_ON((event->status != CL_COMPLETE), CL_PROFILING_INFO_NOT_AVAILABLE,
"Cannot return profiling info on events not CL_COMPLETE yet\n");

if (event->command_type == CL_COMMAND_COMMAND_BUFFER_KHR)
{
cl_command_buffer_khr buf = event->command->command.replay.buffer;
if (buf->num_queues > 1)
{
for (unsigned i = 0; i < buf->num_queues; ++i)
POCL_RETURN_ERROR_ON (
((buf->queues[i]->properties & CL_QUEUE_PROFILING_ENABLE) == 0),
CL_PROFILING_INFO_NOT_AVAILABLE,
"Profiling info from command buffers is only available if "
"profiling is enabled on all queues used in the buffer.\n");
}
}

if (param_value)
{
if (param_value_size < value_size) return CL_INVALID_VALUE;
Expand Down
5 changes: 5 additions & 0 deletions lib/CL/clGetExtensionFunctionAddressForPlatform.c
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,11 @@ CL_API_SUFFIX__VERSION_1_2
return (void *)&POname (clGetCommandBufferInfoKHR);
/* end of cl_khr_command_buffer */

/* cl_khr_command_buffer_multi_device */
if (strcmp (func_name, "clRemapCommandBufferKHR") == 0)
return (void *)&POname (clRemapCommandBufferKHR);
/* end of cl_khr_command_buffer_multi_device */

/* cl_intel_unified_shared_memory */
if (strcmp (func_name, "clHostMemAllocINTEL") == 0)
return (void *)&POname (clHostMemAllocINTEL);
Expand Down
7 changes: 7 additions & 0 deletions lib/CL/clGetPlatformInfo.c
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,13 @@ POname(clGetPlatformInfo)(cl_platform_id platform,
case CL_PLATFORM_HOST_TIMER_RESOLUTION:
POCL_RETURN_GETINFO(cl_ulong, 0);

/* cl_khr_command_buffer_multi_device */
case CL_PLATFORM_COMMAND_BUFFER_CAPABILITIES_KHR:
POCL_RETURN_GETINFO (cl_platform_command_buffer_capabilities_khr,
CL_COMMAND_BUFFER_PLATFORM_UNIVERSAL_SYNC_KHR
| CL_COMMAND_BUFFER_PLATFORM_REMAP_QUEUES_KHR
| CL_COMMAND_BUFFER_PLATFORM_AUTOMATIC_REMAP_KHR);

default:
return CL_INVALID_VALUE;
}
Expand Down

0 comments on commit 59016cb

Please sign in to comment.