-
Notifications
You must be signed in to change notification settings - Fork 250
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
pocl_util: fix numberous deadlocks on disconnect #1459
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -487,12 +487,23 @@ sort_and_uniq (cl_mem *objs, char *readonly_flags, size_t *num_objs) | |
extern unsigned long event_c; | ||
extern unsigned long uevent_c; | ||
|
||
/** | ||
* Create a new event object. | ||
* @param event pointer to event to be created. | ||
* @param command_queue set to NULL for user events. | ||
* @param command_type assigned to event. | ||
* @param num_buffers can be zero. | ||
* @param buffers needs to be non NULL if num_buffers is not zero. | ||
* @param context assigned to event. | ||
* @return CL_SUCCESS on success or an ocl error otherwise. | ||
*/ | ||
cl_int | ||
pocl_create_event (cl_event *event, cl_command_queue command_queue, | ||
cl_command_type command_type, size_t num_buffers, | ||
const cl_mem *buffers, cl_context context) | ||
{ | ||
static uint64_t event_id_counter = 0; | ||
cl_int errcode = CL_SUCCESS; | ||
|
||
if (context == NULL) | ||
return CL_INVALID_CONTEXT; | ||
|
@@ -510,9 +521,12 @@ pocl_create_event (cl_event *event, cl_command_queue command_queue, | |
|
||
/* user events have a NULL command queue, don't retain it */ | ||
if (command_queue) | ||
POname (clRetainCommandQueue) (command_queue); | ||
errcode = POname (clRetainCommandQueue) (command_queue); | ||
else | ||
POname (clRetainContext) (context); | ||
errcode = POname (clRetainContext) (context); | ||
|
||
if (errcode != CL_SUCCESS) | ||
goto ERROR; | ||
|
||
(*event)->command_type = command_type; | ||
(*event)->id = POCL_ATOMIC_INC (event_id_counter); | ||
|
@@ -534,6 +548,10 @@ pocl_create_event (cl_event *event, cl_command_queue command_queue, | |
pocl_command_to_str (command_type)); | ||
|
||
return CL_SUCCESS; | ||
|
||
ERROR: | ||
pocl_mem_manager_free_event (*event); | ||
return errcode; | ||
} | ||
|
||
static int | ||
|
@@ -1020,10 +1038,6 @@ pocl_create_migration_commands (cl_device_id dev, cl_event *ev_export_p, | |
last_migration_event = ev_import; | ||
} | ||
|
||
/* we don't need it anymore. */ | ||
if (previous_last_event) | ||
POname (clReleaseEvent (previous_last_event)); | ||
|
||
/* the final event must depend on the export/import commands */ | ||
if (last_migration_event) | ||
{ | ||
|
@@ -1043,6 +1057,11 @@ pocl_create_migration_commands (cl_device_id dev, cl_event *ev_export_p, | |
} | ||
POCL_UNLOCK_OBJ (mem); | ||
|
||
// we don't need it anymore. Done after unlocking mem to avoid deadlocks | ||
// with pocl_update_event_finished. | ||
if (previous_last_event) | ||
POname (clReleaseEvent (previous_last_event)); | ||
|
||
return CL_SUCCESS; | ||
} | ||
|
||
|
@@ -2434,18 +2453,32 @@ static void pocl_free_event_memobjs (cl_event event) | |
POCL_MEM_FREE (event->mem_objs); | ||
} | ||
|
||
// status can be complete or failed (<0) | ||
/** | ||
* Mark an event with the status and notify relevant objects of this change | ||
* in status. Must be called with the event unlocked. | ||
* @warning calls clReleaseEvent, therefore event can be freed. | ||
* @param status can be complete or failed (<0) | ||
* @param func used for debug messages | ||
* @param line used for debug messages | ||
* @param event to be updated | ||
* @param msg used for debug messages | ||
*/ | ||
void | ||
pocl_update_event_finished (cl_int status, const char *func, unsigned line, | ||
cl_event event, const char *msg) | ||
pocl_update_event_finished (cl_int status, | ||
const char *func, | ||
unsigned line, | ||
cl_event event, | ||
const char *msg) | ||
{ | ||
assert (event != NULL); | ||
assert (event->queue != NULL); | ||
assert (event->status > CL_COMPLETE); | ||
|
||
cl_command_queue cq = event->queue; | ||
POCL_LOCK_OBJ (cq); | ||
POCL_LOCK_OBJ (event); | ||
|
||
assert (event->status > CL_COMPLETE); | ||
|
||
if ((cq->properties & CL_QUEUE_PROFILING_ENABLE) | ||
&& (cq->device->has_own_timer == 0)) | ||
event->time_end = pocl_gettimemono_ns (); | ||
|
@@ -2478,42 +2511,6 @@ pocl_update_event_finished (cl_int status, const char *func, unsigned line, | |
POCL_UNLOCK_OBJ (event); | ||
ops->broadcast (event); | ||
|
||
/* With remote being asynchronous it is possible that an event is signaled as | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if an event is marked as failed, it will lock (when broadcasting), notify and mark all events in it's notify list as failed as well. event chains that look something like the image will be deadlocked There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you convert this to a code comment somewhere reasonable? It will get lost if it's only a PR comment. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i've put this here as a comment since it is about code that is being removed. I could add a comment above the broadcast function warning about the locking behavior it has. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think adding more code documentation would be beneficial, yes. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i've left a warning in the pocl_cl.h description of the broadcast function |
||
* complete before some of its dependencies. Therefore this event has to be | ||
* removed from the notify lists of any remaining events in the wait list. | ||
* | ||
* Mind the acrobatics of trying to avoid races with pocl_broadcast and | ||
* pocl_create_event_sync. */ | ||
event_node *tmp; | ||
POCL_LOCK_OBJ (event); | ||
while ((tmp = event->wait_list)) | ||
{ | ||
cl_event notifier = tmp->event; | ||
POCL_UNLOCK_OBJ (event); | ||
pocl_lock_events_inorder (notifier, event); | ||
if (tmp != event->wait_list) | ||
{ | ||
pocl_unlock_events_inorder (notifier, event); | ||
POCL_LOCK_OBJ (event); | ||
continue; | ||
} | ||
event_node *tmp2; | ||
LL_FOREACH (notifier->notify_list, tmp2) | ||
{ | ||
if (tmp2->event == event) | ||
{ | ||
LL_DELETE (notifier->notify_list, tmp2); | ||
pocl_mem_manager_free_event_node (tmp2); | ||
break; | ||
} | ||
} | ||
LL_DELETE (event->wait_list, tmp); | ||
pocl_unlock_events_inorder (notifier, event); | ||
pocl_mem_manager_free_event_node (tmp); | ||
POCL_LOCK_OBJ (event); | ||
} | ||
POCL_UNLOCK_OBJ (event); | ||
|
||
#ifdef POCL_DEBUG_MESSAGES | ||
if (msg != NULL) | ||
{ | ||
|
@@ -2537,7 +2534,13 @@ pocl_update_event_finished (cl_int status, const char *func, unsigned line, | |
POname (clReleaseEvent) (event); | ||
} | ||
|
||
|
||
/** | ||
* Mark the event as failed and notify all events waiting on this event. | ||
* @warning Call with event locked. | ||
* @param event to be marked as failed. | ||
* @todo change this function to not require the event to be locked since | ||
* clReleaseEvent is eventually called. | ||
*/ | ||
void | ||
pocl_update_event_failed (cl_event event) | ||
{ | ||
|
@@ -2546,14 +2549,29 @@ pocl_update_event_failed (cl_event event) | |
POCL_LOCK_OBJ (event); | ||
} | ||
|
||
/** | ||
* Mark the event as device_lost and notify all events waiting on this event. | ||
* @warning Call with event unlocked. | ||
* @warning Eventually clReleaseEvent is called, so this function can end up | ||
* freeing the event. | ||
* @param event to mark as device lost. | ||
*/ | ||
void | ||
pocl_update_event_device_lost (cl_event event) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the remote driver is the only one to call this function |
||
{ | ||
POCL_UNLOCK_OBJ (event); | ||
pocl_update_event_finished (CL_DEVICE_NOT_AVAILABLE, NULL, 0, event, NULL); | ||
POCL_LOCK_OBJ (event); | ||
} | ||
|
||
/** | ||
* Mark the event as complete and notify all events waiting on this event. | ||
* @warning Call with event unlocked. | ||
* @warning Eventually clReleaseEvent is called, so this function can end up | ||
* freeing the event. | ||
* @param func used for debug messages. | ||
* @param line used for debug messages. | ||
* @param event to mark as complete. | ||
* @param msg used for debug messages. | ||
*/ | ||
void | ||
pocl_update_event_complete (const char *func, unsigned line, | ||
cl_event event, const char *msg) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Best add macros for these to avoid magic numbers.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, enum even better as we allow them in C code.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i've changed it to an enum