-
Notifications
You must be signed in to change notification settings - Fork 0
/
b31fa177c1ca26dcd2d5cd952e692ef87d95b528
84 lines (72 loc) · 2.81 KB
/
b31fa177c1ca26dcd2d5cd952e692ef87d95b528
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
Author: Thomas Hamel <hmlth@t-hamel.fr>
Description: Revert b31fa177c1ca26dcd2d5cd952e692ef87d95b528
Commit b31fa177c1ca26dcd2d5cd952e692ef87d95b528 was made to fix an issue
with nodes states but introduces a new regression wrt. energy power saving
mode where it leaves a way for jobs to start on nodes not ready yet.
We revert this patch for now until we find a better fix.
Upstream: https://github.com/SchedMD/slurm/commit/b31fa177c1ca26dcd2d5cd952e692ef87d95b528
--- a/doc/man/man5/slurm.conf.5
+++ b/doc/man/man5/slurm.conf.5
@@ -1464,8 +1464,10 @@ Note that the \fBHealthCheckProgram\fR will be executed at the same time
on all nodes to minimize its impact upon parallel programs.
This program will be killed if it does not terminate normally within
60 seconds.
-This program will also be executed when the slurmd daemon is first started and
-before it registers with the slurmctld daemon.
+When slurmd is first started, this program will be run repeatedly until
+it returns an exit code of 0, and will block slurmd from registering with
+slurmctld until this has been satisfied.
+The return code is otherwise ignored.
By default, no program will be executed.
.IP
--- a/src/slurmd/slurmd/slurmd.c
+++ b/src/slurmd/slurmd/slurmd.c
@@ -120,6 +120,8 @@
#include "src/slurmd/slurmd/req.h"
#include "src/slurmd/slurmd/slurmd.h"
+#define HEALTH_RETRY_DELAY 10
+
#define MAX_THREADS 256
#define _free_and_set(__dst, __src) \
@@ -215,6 +217,7 @@ static void _usage(void);
static void _usr_handler(int);
static int _validate_and_convert_cpu_list(void);
static void _wait_for_all_threads(int secs);
+static void _wait_health_check(void);
/**************************************************************************\
* To test for memory leaks, set MEMORY_LEAK_DEBUG to 1 using
@@ -381,7 +384,8 @@ main (int argc, char **argv)
slurm_conf_install_fork_handlers();
record_launched_jobs();
- run_script_health_check();
+ /* Wait for a successful health check if HealthCheckInterval != 0 */
+ _wait_health_check();
slurm_thread_create_detached(NULL, _registration_engine, NULL);
@@ -2588,6 +2592,28 @@ static void _resource_spec_fini(void)
}
/*
+ * Wait for health check to execute successfully
+ *
+ * Return immediately if a shutdown has been requested or
+ * if the HealthCheckInterval is 0.
+ */
+static void _wait_health_check(void)
+{
+ int last_check_time = 0;
+ while (!_shutdown && (slurm_conf.health_check_interval != 0) ) {
+ if (time(NULL) - last_check_time > HEALTH_RETRY_DELAY) {
+ if (run_script_health_check() == SLURM_SUCCESS) {
+ break;
+ }
+ last_check_time = time(NULL);
+ info ("Health Check failed, retrying in %ds...",
+ HEALTH_RETRY_DELAY);
+ }
+ usleep(10000);
+ }
+}
+
+/*
* Run the configured health check program
*
* Returns the run result. If the health check program