Skip to content

Commit 5847bd7

Browse files
MarshallGareydannyauble
authored andcommitted
Fix jobacct_gather/cgroup to work correctly when more than one task is
started on a node. Bug 5865
1 parent ae89960 commit 5847bd7

File tree

9 files changed

+165
-36
lines changed

9 files changed

+165
-36
lines changed

NEWS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ documents those changes that are of interest to users and administrators.
5252
-- Accept modifiers for TRES originally added in 6f0342e0358.
5353
-- When doing a reconfigure handle QOS' GrpJobsAccrue correctly.
5454
-- Remove unneeded extra parentheses from sh5util.
55+
-- Fix jobacct_gather/cgroup to work correctly when more than one task is
56+
started on a node.
5557

5658
* Changes in Slurm 18.08.3
5759
==========================

src/common/slurm_jobacct_gather.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ static void _init_tres_usage(struct jobacctinfo *jobacct,
172172
jobacct->tres_usage_out_min[i] = INFINITE64;
173173
jobacct->tres_usage_out_tot[i] = INFINITE64;
174174

175-
if (jobacct_id && jobacct_id->taskid != NO_VAL16) {
175+
if (jobacct_id && jobacct_id->taskid != NO_VAL) {
176176
jobacct->tres_usage_in_max_taskid[i] =
177177
(uint64_t) jobacct_id->taskid;
178178
jobacct->tres_usage_in_min_taskid[i] =
@@ -320,6 +320,10 @@ static void _pack_jobacct_id(jobacct_id_t *jobacct_id,
320320
pack16((uint16_t) jobacct_id->taskid, buffer);
321321
} else {
322322
pack32(NO_VAL, buffer);
323+
/*
324+
* This pack is not used in modern code, so leave
325+
* it NO_VAL16.
326+
*/
323327
pack16(NO_VAL16, buffer);
324328
}
325329
}
@@ -904,7 +908,7 @@ extern jobacctinfo_t *jobacctinfo_create(jobacct_id_t *jobacct_id)
904908
jobacct = xmalloc(sizeof(struct jobacctinfo));
905909

906910
if (!jobacct_id) {
907-
temp_id.taskid = NO_VAL16;
911+
temp_id.taskid = NO_VAL;
908912
temp_id.nodeid = NO_VAL;
909913
jobacct_id = &temp_id;
910914
}

src/common/slurm_jobacct_gather.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@
7373
#define CPU_TIME_ADJ 1000
7474

7575
typedef struct {
76-
uint16_t taskid; /* contains which task number it was on */
76+
uint32_t taskid; /* contains which task number it was on */
7777
uint32_t nodeid; /* contains which node number it was on */
7878
stepd_step_rec_t *job; /* contains stepd job pointer */
7979
} jobacct_id_t;

src/plugins/jobacct_gather/cgroup/jobacct_gather_cgroup.c

Lines changed: 52 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -99,17 +99,45 @@ const uint32_t plugin_version = SLURM_VERSION_NUMBER;
9999
/* Other useful declarations */
100100
static slurm_cgroup_conf_t slurm_cgroup_conf;
101101

102-
static void _prec_extra(jag_prec_t *prec)
102+
static void _prec_extra(jag_prec_t *prec, uint32_t taskid)
103103
{
104104
unsigned long utime, stime, total_rss, total_pgpgin;
105105
char *cpu_time = NULL, *memory_stat = NULL, *ptr;
106106
size_t cpu_time_size = 0, memory_stat_size = 0;
107+
xcgroup_t *task_cpuacct_cg = NULL;;
108+
xcgroup_t *task_memory_cg = NULL;
109+
bool exit_early = false;
110+
111+
/* Find which task cgroups to use */
112+
task_memory_cg = list_find_first(task_memory_cg_list,
113+
find_task_cg_info,
114+
&taskid);
115+
task_cpuacct_cg = list_find_first(task_cpuacct_cg_list,
116+
find_task_cg_info,
117+
&taskid);
118+
119+
/*
120+
* We should always find the task cgroups; if we don't for some reason,
121+
* just print an error and return.
122+
*/
123+
if (!task_cpuacct_cg) {
124+
error("%s: Could not find task_cpuacct_cg, this should never happen",
125+
__func__);
126+
exit_early = true;
127+
}
128+
if (!task_memory_cg) {
129+
error("%s: Could not find task_memory_cg, this should never happen",
130+
__func__);
131+
exit_early = true;
132+
}
133+
if (exit_early)
134+
return;
107135

108136
//DEF_TIMERS;
109137
//START_TIMER;
110138
/* info("before"); */
111139
/* print_jag_prec(prec); */
112-
xcgroup_get_param(&task_cpuacct_cg, "cpuacct.stat",
140+
xcgroup_get_param(task_cpuacct_cg, "cpuacct.stat",
113141
&cpu_time, &cpu_time_size);
114142
if (cpu_time == NULL) {
115143
debug2("%s: failed to collect cpuacct.stat pid %d ppid %d",
@@ -120,7 +148,7 @@ static void _prec_extra(jag_prec_t *prec)
120148
prec->ssec = stime;
121149
}
122150

123-
xcgroup_get_param(&task_memory_cg, "memory.stat",
151+
xcgroup_get_param(task_memory_cg, "memory.stat",
124152
&memory_stat, &memory_stat_size);
125153
if (memory_stat == NULL) {
126154
debug2("%s: failed to collect memory.stat pid %d ppid %d",
@@ -363,3 +391,24 @@ extern char* jobacct_cgroup_create_slurm_cg(xcgroup_ns_t* ns)
363391

364392
return pre;
365393
}
394+
395+
extern int find_task_cg_info(void *x, void *key)
396+
{
397+
task_cg_info_t *task_cg = (task_cg_info_t*)x;
398+
uint32_t taskid = *(uint32_t*)key;
399+
400+
if (task_cg->taskid == taskid)
401+
return 1;
402+
403+
return 0;
404+
}
405+
406+
extern void free_task_cg_info(void *object)
407+
{
408+
task_cg_info_t *task_cg = (task_cg_info_t *)object;
409+
410+
if (task_cg) {
411+
xcgroup_destroy(&task_cg->task_cg);
412+
xfree(task_cg);
413+
}
414+
}

src/plugins/jobacct_gather/cgroup/jobacct_gather_cgroup.h

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,17 @@
4343
#include "src/common/xcgroup_read_config.h"
4444
#include "src/slurmd/common/xcgroup.h"
4545

46-
extern xcgroup_t task_memory_cg;
47-
extern xcgroup_t task_cpuacct_cg;
46+
/*
47+
* There are potentially multiple tasks on a node, so we want to
48+
* track every task cgroup and which taskid it belongs to.
49+
*/
50+
typedef struct task_cg_info {
51+
xcgroup_t task_cg;
52+
uint32_t taskid;
53+
} task_cg_info_t;
54+
55+
extern List task_memory_cg_list;
56+
extern List task_cpuacct_cg_list;
4857

4958
extern int jobacct_gather_cgroup_cpuacct_init(
5059
slurm_cgroup_conf_t *slurm_cgroup_conf);
@@ -76,3 +85,7 @@ extern int jobacct_gather_cgroup_memory_attach_task(
7685
/* pid_t pid, jobacct_id_t *jobacct_id); */
7786

7887
extern char* jobacct_cgroup_create_slurm_cg (xcgroup_ns_t* ns);
88+
89+
extern int find_task_cg_info(void *x, void *key);
90+
91+
extern void free_task_cg_info(void *task_cg);

src/plugins/jobacct_gather/cgroup/jobacct_gather_cgroup_cpuacct.c

Lines changed: 37 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,8 @@ static xcgroup_ns_t cpuacct_ns;
5757
static xcgroup_t user_cpuacct_cg;
5858
static xcgroup_t job_cpuacct_cg;
5959
static xcgroup_t step_cpuacct_cg;
60-
xcgroup_t task_cpuacct_cg;
60+
61+
List task_cpuacct_cg_list = NULL;
6162

6263
static uint32_t max_task_id;
6364

@@ -68,6 +69,7 @@ jobacct_gather_cgroup_cpuacct_init(slurm_cgroup_conf_t *slurm_cgroup_conf)
6869
user_cgroup_path[0]='\0';
6970
job_cgroup_path[0]='\0';
7071
jobstep_cgroup_path[0]='\0';
72+
task_cgroup_path[0]='\0';
7173

7274
/* initialize cpuacct cgroup namespace */
7375
if (xcgroup_ns_create(slurm_cgroup_conf, &cpuacct_ns, "", "cpuacct")
@@ -77,6 +79,9 @@ jobacct_gather_cgroup_cpuacct_init(slurm_cgroup_conf_t *slurm_cgroup_conf)
7779
return SLURM_ERROR;
7880
}
7981

82+
FREE_NULL_LIST(task_cpuacct_cg_list);
83+
task_cpuacct_cg_list = list_create(free_task_cg_info);
84+
8085
return SLURM_SUCCESS;
8186
}
8287

@@ -90,7 +95,7 @@ jobacct_gather_cgroup_cpuacct_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
9095
if (user_cgroup_path[0] == '\0'
9196
|| job_cgroup_path[0] == '\0'
9297
|| jobstep_cgroup_path[0] == '\0'
93-
|| task_cgroup_path[0] == 0)
98+
|| task_cgroup_path[0] == '\0')
9499
return SLURM_SUCCESS;
95100

96101
/*
@@ -151,16 +156,16 @@ jobacct_gather_cgroup_cpuacct_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
151156
if (lock_ok == true)
152157
xcgroup_unlock(&cpuacct_cg);
153158

154-
xcgroup_destroy(&task_cpuacct_cg);
155159
xcgroup_destroy(&user_cpuacct_cg);
156160
xcgroup_destroy(&job_cpuacct_cg);
157161
xcgroup_destroy(&step_cpuacct_cg);
158162
xcgroup_destroy(&cpuacct_cg);
163+
FREE_NULL_LIST(task_cpuacct_cg_list);
159164

160165
user_cgroup_path[0]='\0';
161166
job_cgroup_path[0]='\0';
162167
jobstep_cgroup_path[0]='\0';
163-
task_cgroup_path[0] = 0;
168+
task_cgroup_path[0]='\0';
164169

165170
xcgroup_ns_destroy(&cpuacct_ns);
166171

@@ -180,6 +185,8 @@ jobacct_gather_cgroup_cpuacct_attach_task(pid_t pid, jobacct_id_t *jobacct_id)
180185
int fstatus = SLURM_SUCCESS;
181186
int rc;
182187
char* slurm_cgpath;
188+
task_cg_info_t *task_cg_info;
189+
bool need_to_add = false;
183190

184191
job = jobacct_id->job;
185192
uid = job->uid;
@@ -191,6 +198,8 @@ jobacct_gather_cgroup_cpuacct_attach_task(pid_t pid, jobacct_id_t *jobacct_id)
191198
if (taskid >= max_task_id)
192199
max_task_id = taskid;
193200

201+
xassert(task_cpuacct_cg_list);
202+
194203
debug("%s: jobid %u stepid %u taskid %u max_task_id %u",
195204
__func__, jobid, stepid, taskid, max_task_id);
196205

@@ -210,6 +219,7 @@ jobacct_gather_cgroup_cpuacct_attach_task(pid_t pid, jobacct_id_t *jobacct_id)
210219
return SLURM_ERROR;
211220
}
212221
}
222+
xfree(slurm_cgpath);
213223

214224
/* build job cgroup relative path if not set (may not be) */
215225
if (*job_cgroup_path == '\0') {
@@ -344,26 +354,40 @@ jobacct_gather_cgroup_cpuacct_attach_task(pid_t pid, jobacct_id_t *jobacct_id)
344354
goto error;
345355
}
346356

357+
if (!(task_cg_info = list_find_first(task_cpuacct_cg_list,
358+
find_task_cg_info,
359+
&taskid))) {
360+
task_cg_info = xmalloc(sizeof(*task_cg_info));
361+
task_cg_info->taskid = taskid;
362+
need_to_add = true;
363+
}
364+
347365
/*
348366
* Create task cgroup in the cpuacct ns
349367
*/
350-
if (xcgroup_create(&cpuacct_ns, &task_cpuacct_cg,
368+
if (xcgroup_create(&cpuacct_ns, &task_cg_info->task_cg,
351369
task_cgroup_path,
352370
uid, gid) != XCGROUP_SUCCESS) {
353371
/* do not delete user/job cgroup as they can exist for other
354372
* steps, but release cgroup structures */
355373
xcgroup_destroy(&user_cpuacct_cg);
356374
xcgroup_destroy(&job_cpuacct_cg);
375+
376+
/* Don't use free_task_cg_info as the task_cg isn't there */
377+
xfree(task_cg_info);
378+
357379
error("jobacct_gather/cgroup: unable to create jobstep %u.%u "
358380
"task %u cpuacct cgroup", jobid, stepid, taskid);
359381
fstatus = SLURM_ERROR;
360382
goto error;
361383
}
362384

363-
if (xcgroup_instantiate(&task_cpuacct_cg) != XCGROUP_SUCCESS) {
385+
if (xcgroup_instantiate(&task_cg_info->task_cg)
386+
!= XCGROUP_SUCCESS) {
364387
xcgroup_destroy(&user_cpuacct_cg);
365388
xcgroup_destroy(&job_cpuacct_cg);
366389
xcgroup_destroy(&step_cpuacct_cg);
390+
free_task_cg_info(task_cg_info);
367391
error("jobacct_gather/cgroup: unable to instantiate jobstep "
368392
"%u.%u task %u cpuacct cgroup", jobid, stepid, taskid);
369393
fstatus = SLURM_ERROR;
@@ -373,14 +397,18 @@ jobacct_gather_cgroup_cpuacct_attach_task(pid_t pid, jobacct_id_t *jobacct_id)
373397
/*
374398
* Attach the slurmstepd to the task cpuacct cgroup
375399
*/
376-
rc = xcgroup_add_pids(&task_cpuacct_cg, &pid, 1);
400+
rc = xcgroup_add_pids(&task_cg_info->task_cg, &pid, 1);
377401
if (rc != XCGROUP_SUCCESS) {
378-
error("jobacct_gather/cgroup: unable to add slurmstepd to "
379-
"cpuacct cg '%s'", task_cpuacct_cg.path);
402+
error("jobacct_gather/cgroup: unable to add slurmstepd to cpuacct cg '%s'",
403+
task_cg_info->task_cg.path);
380404
fstatus = SLURM_ERROR;
381405
} else
382406
fstatus = SLURM_SUCCESS;
383407

408+
/* Add the task cgroup to the list now that it is initialized. */
409+
if (need_to_add)
410+
list_append(task_cpuacct_cg_list , task_cg_info);
411+
384412
error:
385413
xcgroup_unlock(&cpuacct_cg);
386414
xcgroup_destroy(&cpuacct_cg);

0 commit comments

Comments
 (0)