Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pocl_util: fix numberous deadlocks on disconnect #1459

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions include/pocl.h
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,11 @@ struct _cl_command_node
cl_device_id device;
/* The index of the targeted device in the **program** device list. */
unsigned program_device_i;
/*
* -1: command_failed
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Best add macros for these to avoid magic numbers.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, enum even better as we allow them in C code.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i've changed it to an enum

* 0: not ready
* 1: ready
*/
cl_int ready;

/* fields needed by buffered commands only */
Expand Down
9 changes: 7 additions & 2 deletions lib/CL/devices/basic/basic.c
Original file line number Diff line number Diff line change
Expand Up @@ -538,8 +538,13 @@ pocl_basic_notify (cl_device_id device, cl_event event, cl_event finished)
return;
}

if (!node->ready)
return;
if (node->ready != 1)
{
POCL_MSG_PRINT_EVENTS (
"basic: command related to the notified event %lu not ready\n",
event->id);
return;
}

if (pocl_command_is_ready (event))
{
Expand Down
1 change: 1 addition & 0 deletions lib/CL/devices/common.c
Original file line number Diff line number Diff line change
Expand Up @@ -791,6 +791,7 @@ pocl_broadcast (cl_event brc_event)
if (target != brc_event->notify_list)
{
pocl_unlock_events_inorder (brc_event, target_event);
POname (clReleaseEvent) (target_event);
POCL_LOCK_OBJ (brc_event);
continue;
}
Expand Down
9 changes: 7 additions & 2 deletions lib/CL/devices/proxy/pocl_proxy.c
Original file line number Diff line number Diff line change
Expand Up @@ -1562,8 +1562,13 @@ pocl_proxy_notify (cl_device_id device, cl_event event, cl_event finished)
return;
}

if (!node->ready)
return;
if (node->ready != 1)
{
POCL_MSG_PRINT_EVENTS (
"proxy: command related to the notified event %lu not ready\n",
event->id);
return;
}

if (pocl_command_is_ready (node->sync.event.event))
{
Expand Down
9 changes: 7 additions & 2 deletions lib/CL/devices/pthread/pthread.c
Original file line number Diff line number Diff line change
Expand Up @@ -246,8 +246,13 @@ pocl_pthread_notify (cl_device_id device, cl_event event, cl_event finished)
return;
}

if (!node->ready)
return;
if (node->ready != 1)
{
POCL_MSG_PRINT_EVENTS (
"pthread: command related to the notified event %lu not ready\n",
event->id);
return;
}

if (pocl_command_is_ready (node->sync.event.event))
{
Expand Down
13 changes: 10 additions & 3 deletions lib/CL/devices/remote/remote.c
Original file line number Diff line number Diff line change
Expand Up @@ -1433,7 +1433,7 @@ pocl_remote_notify (cl_device_id device, cl_event event, cl_event finished)
return;
}

if (!node->ready)
if (node->ready != 1)
{
POCL_MSG_PRINT_EVENTS (
"remote: command related to the notified event %lu not ready\n",
Expand Down Expand Up @@ -2257,7 +2257,7 @@ remote_start_command (remote_device_data_t *d, _cl_command_node *node)
cl_command_queue cq = node->sync.event.event->queue;
if (*(cq->device->available) == CL_FALSE)
{
pocl_update_event_device_lost (event);
node->ready = -1;
goto EARLY_FINISH;
}
pocl_update_event_running (event);
Expand Down Expand Up @@ -2544,7 +2544,14 @@ pocl_remote_driver_pthread (void *cldev)
char msg[128] = "Event ";
strcat (msg, cstr);

POCL_UPDATE_EVENT_COMPLETE_MSG (event, msg);
if (finished->ready != 1)
{
pocl_update_event_device_lost (event);
}
else
{
POCL_UPDATE_EVENT_COMPLETE_MSG (event, msg);
}

POCL_FAST_LOCK (d->wq_lock);
}
Expand Down
19 changes: 11 additions & 8 deletions lib/CL/pocl_cl.h
Original file line number Diff line number Diff line change
Expand Up @@ -384,14 +384,17 @@ struct pocl_device_ops {
must be locked also on return.*/
void (*notify) (cl_device_id device, cl_event event, cl_event finished);

/* broadcast is(has to be) called by the device driver when a command is
completed.
It is used to broadcast notifications to device drivers waiting
this event to complete.
There is a default implementation for this. Use it if there is no need
to do anything special here.
The default implementation calls notify(event, target_event) for the
list of events waiting on 'event'. */
/**
* Broadcast is(has to be) called by the device driver when a command is
* completed.
* It is used to broadcast notifications to device drivers waiting
* this event to complete.
* There is a default implementation for this. Use it if there is no need
* to do anything special here.
* The default implementation calls notify(event, target_event) for the
* list of events waiting on 'event'.
* @warning Called with event (and its notify events) unlocked.
*/
void (*broadcast) (cl_event event);

/* wait_event is called by clWaitForEvents() and blocks the execution until
Expand Down
116 changes: 67 additions & 49 deletions lib/CL/pocl_util.c
Original file line number Diff line number Diff line change
Expand Up @@ -487,12 +487,23 @@ sort_and_uniq (cl_mem *objs, char *readonly_flags, size_t *num_objs)
extern unsigned long event_c;
extern unsigned long uevent_c;

/**
* Create a new event object.
* @param event pointer to event to be created.
* @param command_queue set to NULL for user events.
* @param command_type assigned to event.
* @param num_buffers can be zero.
* @param buffers needs to be non NULL if num_buffers is not zero.
* @param context assigned to event.
* @return CL_SUCCESS on success or an ocl error otherwise.
*/
cl_int
pocl_create_event (cl_event *event, cl_command_queue command_queue,
cl_command_type command_type, size_t num_buffers,
const cl_mem *buffers, cl_context context)
{
static uint64_t event_id_counter = 0;
cl_int errcode = CL_SUCCESS;

if (context == NULL)
return CL_INVALID_CONTEXT;
Expand All @@ -510,9 +521,12 @@ pocl_create_event (cl_event *event, cl_command_queue command_queue,

/* user events have a NULL command queue, don't retain it */
if (command_queue)
POname (clRetainCommandQueue) (command_queue);
errcode = POname (clRetainCommandQueue) (command_queue);
else
POname (clRetainContext) (context);
errcode = POname (clRetainContext) (context);

if (errcode != CL_SUCCESS)
goto ERROR;

(*event)->command_type = command_type;
(*event)->id = POCL_ATOMIC_INC (event_id_counter);
Expand All @@ -534,6 +548,10 @@ pocl_create_event (cl_event *event, cl_command_queue command_queue,
pocl_command_to_str (command_type));

return CL_SUCCESS;

ERROR:
pocl_mem_manager_free_event (*event);
return errcode;
}

static int
Expand Down Expand Up @@ -1020,10 +1038,6 @@ pocl_create_migration_commands (cl_device_id dev, cl_event *ev_export_p,
last_migration_event = ev_import;
}

/* we don't need it anymore. */
if (previous_last_event)
POname (clReleaseEvent (previous_last_event));

/* the final event must depend on the export/import commands */
if (last_migration_event)
{
Expand All @@ -1043,6 +1057,11 @@ pocl_create_migration_commands (cl_device_id dev, cl_event *ev_export_p,
}
POCL_UNLOCK_OBJ (mem);

// we don't need it anymore. Done after unlocking mem to avoid deadlocks
// with pocl_update_event_finished.
if (previous_last_event)
POname (clReleaseEvent (previous_last_event));

return CL_SUCCESS;
}

Expand Down Expand Up @@ -2434,18 +2453,32 @@ static void pocl_free_event_memobjs (cl_event event)
POCL_MEM_FREE (event->mem_objs);
}

// status can be complete or failed (<0)
/**
* Mark an event with the status and notify relevant objects of this change
* in status. Must be called with the event unlocked.
* @warning calls clReleaseEvent, therefore event can be freed.
* @param status can be complete or failed (<0)
* @param func used for debug messages
* @param line used for debug messages
* @param event to be updated
* @param msg used for debug messages
*/
void
pocl_update_event_finished (cl_int status, const char *func, unsigned line,
cl_event event, const char *msg)
pocl_update_event_finished (cl_int status,
const char *func,
unsigned line,
cl_event event,
const char *msg)
{
assert (event != NULL);
assert (event->queue != NULL);
assert (event->status > CL_COMPLETE);

cl_command_queue cq = event->queue;
POCL_LOCK_OBJ (cq);
POCL_LOCK_OBJ (event);

assert (event->status > CL_COMPLETE);

if ((cq->properties & CL_QUEUE_PROFILING_ENABLE)
&& (cq->device->has_own_timer == 0))
event->time_end = pocl_gettimemono_ns ();
Expand Down Expand Up @@ -2478,42 +2511,6 @@ pocl_update_event_finished (cl_int status, const char *func, unsigned line,
POCL_UNLOCK_OBJ (event);
ops->broadcast (event);

/* With remote being asynchronous it is possible that an event is signaled as
Copy link
Contributor Author

@RABijl RABijl May 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if an event is marked as failed, it will lock (when broadcasting), notify and mark all events in it's notify list as failed as well. event chains that look something like the image will be deadlocked
Untitled Diagram.pdf

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you convert this to a code comment somewhere reasonable? It will get lost if it's only a PR comment.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i've put this here as a comment since it is about code that is being removed. I could add a comment above the broadcast function warning about the locking behavior it has.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think adding more code documentation would be beneficial, yes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i've left a warning in the pocl_cl.h description of the broadcast function

* complete before some of its dependencies. Therefore this event has to be
* removed from the notify lists of any remaining events in the wait list.
*
* Mind the acrobatics of trying to avoid races with pocl_broadcast and
* pocl_create_event_sync. */
event_node *tmp;
POCL_LOCK_OBJ (event);
while ((tmp = event->wait_list))
{
cl_event notifier = tmp->event;
POCL_UNLOCK_OBJ (event);
pocl_lock_events_inorder (notifier, event);
if (tmp != event->wait_list)
{
pocl_unlock_events_inorder (notifier, event);
POCL_LOCK_OBJ (event);
continue;
}
event_node *tmp2;
LL_FOREACH (notifier->notify_list, tmp2)
{
if (tmp2->event == event)
{
LL_DELETE (notifier->notify_list, tmp2);
pocl_mem_manager_free_event_node (tmp2);
break;
}
}
LL_DELETE (event->wait_list, tmp);
pocl_unlock_events_inorder (notifier, event);
pocl_mem_manager_free_event_node (tmp);
POCL_LOCK_OBJ (event);
}
POCL_UNLOCK_OBJ (event);

#ifdef POCL_DEBUG_MESSAGES
if (msg != NULL)
{
Expand All @@ -2537,7 +2534,13 @@ pocl_update_event_finished (cl_int status, const char *func, unsigned line,
POname (clReleaseEvent) (event);
}


/**
* Mark the event as failed and notify all events waiting on this event.
* @warning Call with event locked.
* @param event to be marked as failed.
* @todo change this function to not require the event to be locked since
* clReleaseEvent is eventually called.
*/
void
pocl_update_event_failed (cl_event event)
{
Expand All @@ -2546,14 +2549,29 @@ pocl_update_event_failed (cl_event event)
POCL_LOCK_OBJ (event);
}

/**
* Mark the event as device_lost and notify all events waiting on this event.
* @warning Call with event unlocked.
* @warning Eventually clReleaseEvent is called, so this function can end up
* freeing the event.
* @param event to mark as device lost.
*/
void
pocl_update_event_device_lost (cl_event event)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the remote driver is the only one to call this function

{
POCL_UNLOCK_OBJ (event);
pocl_update_event_finished (CL_DEVICE_NOT_AVAILABLE, NULL, 0, event, NULL);
POCL_LOCK_OBJ (event);
}

/**
* Mark the event as complete and notify all events waiting on this event.
* @warning Call with event unlocked.
* @warning Eventually clReleaseEvent is called, so this function can end up
* freeing the event.
* @param func used for debug messages.
* @param line used for debug messages.
* @param event to mark as complete.
* @param msg used for debug messages.
*/
void
pocl_update_event_complete (const char *func, unsigned line,
cl_event event, const char *msg)
Expand Down