Skip to content

Commit

Permalink
fix(api): restart workers on MIOPEN memory errors
Browse files (browse the repository at this point in the history)
  • Loading branch information
ssube committed Apr 1, 2023
1 parent 4a68984 commit 6aac0fe
Showing 1 changed file with 11 additions and 4 deletions.
15 changes: 11 additions & 4 deletions api/onnx_web/worker/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@
EXIT_REPLACED = 3
EXIT_SUCCESS = 0

# Substrings that mark a caught exception as a memory failure. The exception
# handler in worker_main tests each entry with a case-sensitive `in` check
# against str(e); on a match it logs the error, calls context.fail(), and
# exits with EXIT_MEMORY.
MEMORY_ERRORS = [
"Failed to allocate memory",
"out of memory",
# Treated as a memory error per the commit title ("MIOPEN memory errors").
# NOTE(review): confirm that MIOpen status 7 only occurs on memory exhaustion.
"MIOPEN failure 7",
]


def worker_main(context: WorkerContext, server: ServerContext):
apply_patches(server)
Expand Down Expand Up @@ -66,10 +72,11 @@ def worker_main(context: WorkerContext, server: ServerContext):
exit(EXIT_ERROR)
except Exception as e:
e_str = str(e)
if "Failed to allocate memory" in e_str or "out of memory" in e_str:
logger.error("detected out-of-memory error, exiting: %s", e)
context.fail()
exit(EXIT_MEMORY)
for e_mem in MEMORY_ERRORS:
if e_mem in e_str:
logger.error("detected out-of-memory error, exiting: %s", e)
context.fail()
exit(EXIT_MEMORY)
else:
logger.exception(
"error while running job",
Expand Down

0 comments on commit 6aac0fe

Please sign in to comment.