diff --git a/scheds/rust-user/scx_layered/src/bpf/main.bpf.c b/scheds/rust-user/scx_layered/src/bpf/main.bpf.c
index cd74769e95..98d9418e1a 100644
--- a/scheds/rust-user/scx_layered/src/bpf/main.bpf.c
+++ b/scheds/rust-user/scx_layered/src/bpf/main.bpf.c
@@ -250,10 +250,20 @@ struct layer *lookup_layer(int idx)
 	return &layers[idx];
 }
 
+/*
+ * Because the layer membership is by the default hierarchy cgroups rather than
+ * the CPU controller membership, we can't use ops.cgroup_move(). Let's iterate
+ * the tasks manually and set refresh_layer.
+ *
+ * The iteration isn't synchronized and may fail spuriously. It's not a big
+ * practical problem as process migrations are very rare in most modern systems.
+ * That said, we eventually want this to be based on CPU controller membership.
+ */
 SEC("tp_btf/cgroup_attach_task")
 int BPF_PROG(tp_cgroup_attach_task, struct cgroup *cgrp, const char *cgrp_path,
 	     struct task_struct *leader, bool threadgroup)
 {
+	struct list_head *thread_head;
 	struct task_struct *next;
 	struct task_ctx *tctx;
 	int leader_pid = leader->pid;
@@ -265,6 +275,8 @@ int BPF_PROG(tp_cgroup_attach_task, struct cgroup *cgrp, const char *cgrp_path,
 	if (!threadgroup)
 		return 0;
 
+	thread_head = &leader->signal->thread_head;
+
 	if (!(next = bpf_task_acquire(leader))) {
 		scx_bpf_error("failed to acquire leader");
 		return 0;
@@ -274,18 +286,18 @@ int BPF_PROG(tp_cgroup_attach_task, struct cgroup *cgrp, const char *cgrp_path,
 		struct task_struct *p;
 		int pid;
 
-		p = container_of(next->thread_group.next, struct task_struct, thread_group);
+		p = container_of(next->thread_node.next, struct task_struct, thread_node);
 		bpf_task_release(next);
 
-		pid = BPF_CORE_READ(p, pid);
-		if (pid == leader_pid) {
+		if (&p->thread_node == thread_head) {
 			next = NULL;
 			break;
 		}
 
+		pid = BPF_CORE_READ(p, pid);
 		next = bpf_task_from_pid(pid);
 		if (!next) {
-			scx_bpf_error("thread iteration failed");
+			bpf_printk("scx_layered: tp_cgroup_attach_task: thread iteration failed");
 			break;
 		}
 
diff --git a/scheds/rust-user/scx_rusty/src/bpf/main.bpf.c b/scheds/rust-user/scx_rusty/src/bpf/main.bpf.c
index 53a95bd081..c85e95bf37 100644
--- a/scheds/rust-user/scx_rusty/src/bpf/main.bpf.c
+++ b/scheds/rust-user/scx_rusty/src/bpf/main.bpf.c
@@ -1009,7 +1009,15 @@ s32 BPF_STRUCT_OPS(rusty_prep_enable, struct task_struct *p,
 void BPF_STRUCT_OPS(rusty_disable, struct task_struct *p)
 {
 	pid_t pid = p->pid;
-	long ret = bpf_map_delete_elem(&task_data, &pid);
+	long ret;
+
+	/*
+	 * XXX - There's no reason delete should fail here but BPF's recursion
+	 * protection can unnecessarily fail the operation. The fact that
+	 * deletions aren't reliable means that we sometimes leak task_ctx and
+	 * can't use BPF_NOEXIST on allocation in .prep_enable().
+	 */
+	ret = bpf_map_delete_elem(&task_data, &pid);
 	if (ret) {
 		stat_add(RUSTY_STAT_TASK_GET_ERR, 1);
 		return;