Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pocl_util: fix numberous deadlocks on disconnect #1459

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions include/pocl.h
Original file line number Diff line number Diff line change
Expand Up @@ -483,10 +483,16 @@ typedef union
_cl_command_svm_memadvise mem_advise;
} _cl_command_t;

typedef enum
{
COMMAND_FAILED = -1,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
COMMAND_FAILED = -1,
POCL_COMMAND_FAILED = -1,

And if you don't need to force the enum number, better just leave it out. 0 might have significance as it's the default if zero initialized.

COMMAND_NOT_READY = 0,
COMMAND_READY = 1,
} command_node_state;

// one item in the command queue or command buffer
typedef struct _cl_command_node _cl_command_node;
struct _cl_command_node
{
struct _cl_command_node {
_cl_command_t command;
cl_command_type type;
_cl_command_node *next; // for linked-list storage
Expand Down Expand Up @@ -516,7 +522,7 @@ struct _cl_command_node
cl_device_id device;
/* The index of the targeted device in the **program** device list. */
unsigned program_device_i;
cl_int ready;
command_node_state node_state;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
command_node_state node_state;
pocl_command_node_state node_state;

Due to lack of namespaces, best add name prefixes.


/* fields needed by buffered commands only */

Expand Down
11 changes: 8 additions & 3 deletions lib/CL/devices/almaif/almaif.cc
Original file line number Diff line number Diff line change
Expand Up @@ -719,7 +719,7 @@ bool only_custom_device_events_left(cl_event event) {

void pocl_almaif_submit(_cl_command_node *Node, cl_command_queue /*CQ*/) {

Node->ready = 1;
Node->node_state = COMMAND_READY;

struct AlmaifData *D = (AlmaifData *)Node->device->data;
cl_event E = Node->sync.event.event;
Expand Down Expand Up @@ -794,8 +794,13 @@ void pocl_almaif_notify(cl_device_id Device, cl_event Event, cl_event Finished)
return;
}

if (!Node->ready)
return;
if (Node->node_state != COMMAND_READY)
{
POCL_MSG_PRINT_EVENTS (
"almaif: command related to the notified event %lu not ready\n",
Event->id);
return;
}

if (Event->command->type != CL_COMMAND_NDRANGE_KERNEL) {
if (pocl_command_is_ready(Event)) {
Expand Down
11 changes: 8 additions & 3 deletions lib/CL/devices/basic/basic.c
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,7 @@ pocl_basic_submit (_cl_command_node *node, cl_command_queue cq)
if (node != NULL && node->type == CL_COMMAND_NDRANGE_KERNEL)
pocl_check_kernel_dlhandle_cache (node, CL_TRUE, CL_TRUE);

node->ready = 1;
node->node_state = COMMAND_READY;
POCL_LOCK (d->cq_lock);
pocl_command_push(node, &d->ready_list, &d->command_list);

Expand Down Expand Up @@ -538,8 +538,13 @@ pocl_basic_notify (cl_device_id device, cl_event event, cl_event finished)
return;
}

if (!node->ready)
return;
if (node->node_state != COMMAND_READY)
{
POCL_MSG_PRINT_EVENTS (
"basic: command related to the notified event %lu not ready\n",
event->id);
return;
}

if (pocl_command_is_ready (event))
{
Expand Down
1 change: 1 addition & 0 deletions lib/CL/devices/common.c
Original file line number Diff line number Diff line change
Expand Up @@ -791,6 +791,7 @@ pocl_broadcast (cl_event brc_event)
if (target != brc_event->notify_list)
{
pocl_unlock_events_inorder (brc_event, target_event);
POname (clReleaseEvent) (target_event);
POCL_LOCK_OBJ (brc_event);
continue;
}
Expand Down
11 changes: 8 additions & 3 deletions lib/CL/devices/proxy/pocl_proxy.c
Original file line number Diff line number Diff line change
Expand Up @@ -1506,7 +1506,7 @@ pocl_proxy_submit (_cl_command_node *node, cl_command_queue cq)
POCL_INIT_COND (e_d->event_cond);
e->data = (void *)e_d;

node->ready = 1;
node->node_state = COMMAND_READY;
if (pocl_command_is_ready (e))
{
pocl_update_event_submitted (e);
Expand Down Expand Up @@ -1562,8 +1562,13 @@ pocl_proxy_notify (cl_device_id device, cl_event event, cl_event finished)
return;
}

if (!node->ready)
return;
if (node->node_state != COMMAND_READY)
{
POCL_MSG_PRINT_EVENTS (
"proxy: command related to the notified event %lu not ready\n",
event->id);
return;
}

if (pocl_command_is_ready (node->sync.event.event))
{
Expand Down
11 changes: 8 additions & 3 deletions lib/CL/devices/pthread/pthread.c
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ pocl_pthread_run (void *data, _cl_command_node *cmd)
void
pocl_pthread_submit (_cl_command_node *node, cl_command_queue cq)
{
node->ready = 1;
node->node_state = COMMAND_READY;
if (pocl_command_is_ready (node->sync.event.event))
{
pocl_update_event_submitted (node->sync.event.event);
Expand Down Expand Up @@ -246,8 +246,13 @@ pocl_pthread_notify (cl_device_id device, cl_event event, cl_event finished)
return;
}

if (!node->ready)
return;
if (node->node_state != COMMAND_READY)
{
POCL_MSG_PRINT_EVENTS (
"pthread: command related to the notified event %lu not ready\n",
event->id);
return;
}

if (pocl_command_is_ready (node->sync.event.event))
{
Expand Down
15 changes: 11 additions & 4 deletions lib/CL/devices/remote/remote.c
Original file line number Diff line number Diff line change
Expand Up @@ -1370,7 +1370,7 @@ pocl_remote_submit (_cl_command_node *node, cl_command_queue cq)
POCL_INIT_COND (e_d->event_cond);
e->data = (void *)e_d;

node->ready = 1;
node->node_state = COMMAND_READY;
if (pocl_command_is_ready (node->sync.event.event))
{
pocl_update_event_submitted (node->sync.event.event);
Expand Down Expand Up @@ -1433,7 +1433,7 @@ pocl_remote_notify (cl_device_id device, cl_event event, cl_event finished)
return;
}

if (!node->ready)
if (node->node_state != COMMAND_READY)
{
POCL_MSG_PRINT_EVENTS (
"remote: command related to the notified event %lu not ready\n",
Expand Down Expand Up @@ -2257,7 +2257,7 @@ remote_start_command (remote_device_data_t *d, _cl_command_node *node)
cl_command_queue cq = node->sync.event.event->queue;
if (*(cq->device->available) == CL_FALSE)
{
pocl_update_event_device_lost (event);
node->node_state = COMMAND_FAILED;
goto EARLY_FINISH;
}
pocl_update_event_running (event);
Expand Down Expand Up @@ -2544,7 +2544,14 @@ pocl_remote_driver_pthread (void *cldev)
char msg[128] = "Event ";
strcat (msg, cstr);

POCL_UPDATE_EVENT_COMPLETE_MSG (event, msg);
if (finished->node_state != COMMAND_READY)
{
pocl_update_event_device_lost (event);
}
else
{
POCL_UPDATE_EVENT_COMPLETE_MSG (event, msg);
}

POCL_FAST_LOCK (d->wq_lock);
}
Expand Down
19 changes: 11 additions & 8 deletions lib/CL/pocl_cl.h
Original file line number Diff line number Diff line change
Expand Up @@ -384,14 +384,17 @@ struct pocl_device_ops {
must be locked also on return.*/
void (*notify) (cl_device_id device, cl_event event, cl_event finished);

/* broadcast is(has to be) called by the device driver when a command is
completed.
It is used to broadcast notifications to device drivers waiting
this event to complete.
There is a default implementation for this. Use it if there is no need
to do anything special here.
The default implementation calls notify(event, target_event) for the
list of events waiting on 'event'. */
/**
* Broadcast is(has to be) called by the device driver when a command is
* completed.
* It is used to broadcast notifications to device drivers waiting
* this event to complete.
* There is a default implementation for this. Use it if there is no need
* to do anything special here.
* The default implementation calls notify(event, target_event) for the
* list of events waiting on 'event'.
* @warning Called with event (and its notify events) unlocked.
*/
void (*broadcast) (cl_event event);

/* wait_event is called by clWaitForEvents() and blocks the execution until
Expand Down
Loading