scx_layered: Add override context for layer hinting #587

Closed
wants to merge 2 commits
10 changes: 10 additions & 0 deletions scheds/rust/scx_layered/src/bpf/intf.h
@@ -33,6 +33,8 @@ enum consts {
MAX_COMM = 16,
MAX_LAYER_MATCH_ORS = 32,
MAX_LAYERS = 16,
MAX_LAYER_OVERRIDES = 1024,
MAX_OVERRIDE_YIELDS = 64,
USAGE_HALF_LIFE = 100000000, /* 100ms */

HI_FALLBACK_DSQ = MAX_LAYERS * MAX_DOMS,
@@ -162,4 +164,12 @@ struct layer {
unsigned int perf;
};

struct override_ctx {
u64 gen;
u64 yield_gen; // managed from BPF side
u64 task_overrides[MAX_LAYER_OVERRIDES];
u64 task_yields[MAX_OVERRIDE_YIELDS];
u32 nr_overrides;
Contributor:

Can this have a generation number so that it doesn't have to be walked every time?
u32 nr_yields;
};
#endif /* __INTF_H */
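The override_ctx entries are exposed to userspace through a BPF_F_MMAPABLE array map that the scheduler pins to bpffs (see the main.bpf.c and main.rs changes below). A minimal userspace sketch of publishing a single-task override, assuming the map has been pinned at the path passed via the new override_ctx_mount option; the publish_override() helper and its error handling are illustrative only:

#include <stddef.h>
#include <sys/mman.h>
#include <bpf/bpf.h>
#include "intf.h"

/* Sketch: steer @pid into @layer_idx by rewriting that layer's override_ctx
 * in place and bumping its generation so refresh_overrides() notices. */
static int publish_override(const char *pin_path, int layer_idx, int pid)
{
	size_t len = sizeof(struct override_ctx) * MAX_LAYERS;
	struct override_ctx *ctxs;
	int fd;

	if ((fd = bpf_obj_get(pin_path)) < 0)
		return -1;

	/* A BPF_F_MMAPABLE array map exposes all entries as one contiguous region. */
	ctxs = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ctxs == MAP_FAILED)
		return -1;

	ctxs[layer_idx].task_overrides[0] = pid;
	ctxs[layer_idx].nr_overrides = 1;
	/* Publish last: the BPF side only rescans when gen moves forward. */
	__sync_fetch_and_add(&ctxs[layer_idx].gen, 1);

	munmap(ctxs, len);
	return 0;
}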
151 changes: 139 additions & 12 deletions scheds/rust/scx_layered/src/bpf/main.bpf.c
@@ -26,11 +26,13 @@ const volatile bool smt_enabled = true;
const volatile bool disable_topology = false;
const volatile s32 __sibling_cpu[MAX_CPUS];
const volatile unsigned char all_cpus[MAX_CPUS_U8];
const volatile bool allow_overrides = false;

private(all_cpumask) struct bpf_cpumask __kptr *all_cpumask;
struct layer layers[MAX_LAYERS];
u32 fallback_cpu;
static u32 preempt_cursor;
static u64 override_gens[MAX_LAYERS];

#define dbg(fmt, args...) do { if (debug) bpf_printk(fmt, ##args); } while (0)
#define trace(fmt, args...) do { if (debug > 1) bpf_printk(fmt, ##args); } while (0)
@@ -209,6 +211,26 @@ static void adj_load(u32 layer_idx, s64 adj, u64 now)
bpf_get_smp_processor_id(), layer_idx, layer->load, adj);
}

struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, u32);
__type(value, struct override_ctx);
__uint(max_entries, MAX_LAYERS);
__uint(map_flags, BPF_F_MMAPABLE);
} override_ctxs SEC(".maps");

static struct override_ctx *lookup_override_ctx(int layer_idx)
{
struct override_ctx *uctx;

if ((uctx = bpf_map_lookup_elem(&override_ctxs, &layer_idx))) {
return uctx;
} else {
scx_bpf_error("no layer override_ctx");
return NULL;
}
}

struct layer_cpumask_wrapper {
struct bpf_cpumask __kptr *cpumask;
};
@@ -294,17 +316,6 @@ u32 llc_node_id(u32 llc_id)
return *llc_ptr;
}

SEC("fentry")
int BPF_PROG(sched_tick_fentry)
{
int idx;

if (bpf_get_smp_processor_id() == 0)
bpf_for(idx, 0, nr_layers)
refresh_cpumasks(idx);
return 0;
}

struct task_ctx {
int pid;
int last_cpu;
@@ -314,6 +325,7 @@ struct task_ctx {
struct bpf_cpumask __kptr *layered_cpumask;

bool all_cpus_allowed;
u64 override_gen;
u64 runnable_at;
u64 running_at;
};
@@ -418,6 +430,96 @@ int BPF_PROG(tp_task_rename, struct task_struct *p, const char *buf)
return 0;
}

static void refresh_overrides(int nr_layers)
{
int i, layer_idx;
u64 override_gen;
struct task_ctx *tctx;
struct override_ctx *octx;
struct task_struct *p;

bpf_for(layer_idx, 0, nr_layers) {
struct layer *layer = &layers[layer_idx];
override_gen = override_gens[layer_idx];
if (!(octx = lookup_override_ctx(layer_idx)))
continue;
if (octx->gen == override_gen)
continue;

// If we had proper locks with userspace then this may be an
// allowed state for wrapping the override_gen. For now we'll
// allow resetting the gen to 0.
if (octx->gen < override_gen && octx->gen != 0) {
scx_bpf_error("override generation is lower than set generation");
return;
}

override_gens[layer_idx] = octx->gen;

bpf_for(i, 0, octx->nr_overrides) {
if (i >= MAX_LAYER_OVERRIDES) {
scx_bpf_error("too many overrides");
return;
}

p = bpf_task_from_pid(octx->task_overrides[i]);
if (!p)
continue;

if (!(tctx = lookup_task_ctx(p))) {
bpf_task_release(p);
continue;
}
if (octx->gen < tctx->override_gen && octx->gen != 0) {
bpf_task_release(p);
scx_bpf_error("task override generation is invalid");
return;
}
if (octx->gen == tctx->override_gen) {
bpf_task_release(p);
continue;
}
if (tctx->layer >= 0 && tctx->layer < nr_layers)
__sync_fetch_and_add(&layers[tctx->layer].nr_tasks, -1);
/*
* XXX: Need to adjust the vtime delta with the
* previous layer similar to maybe_refresh_layer.
* However, from this context the value is read only so
* it may be incorrect.
*
* p->scx.dsq_vtime = layer->vtime_now;
*
*/
tctx->layer = layer_idx;
Contributor:

So, the way to do this, I think, is just updating the generation numbers here and what the target index should be and let maybe_refresh_layer() do the actual update. That way, it'll update in a synchronized manner and vtime will be taken care of too.

Contributor Author:

Yeah, I think that makes sense... keep the updates in one place.
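A minimal sketch of that suggestion (not how this commit does it): refresh_overrides() would only record the target, and the existing layer-switch path would perform the move. The tctx->override_layer field is hypothetical:

/* In refresh_overrides(), instead of moving the task here: */
tctx->override_layer = layer_idx;	/* hypothetical field */
tctx->override_gen = octx->gen;
tctx->refresh_layer = true;

/* In maybe_refresh_layer(), pick the recorded layer instead of walking the
 * match rules, and let its existing layer-switch code handle the nr_tasks
 * accounting and the dsq_vtime handoff: */
if (tctx->override_gen && tctx->override_gen == override_gens[tctx->override_layer])
	idx = tctx->override_layer;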

tctx->override_gen = octx->gen;
__sync_fetch_and_add(&layer->nr_tasks, 1);
bpf_task_release(p);
}

// To yield control back to the BPF scheduler, userspace is
// allowed to yield a task. Reset all the yielded tasks.
bpf_for(i, 0, octx->nr_yields) {
Contributor:

When override gen changes, wouldn't that automatically revoke overrides of all tasks that aren't anymore in the current generation? ie. maybe_refresh_layer() can test whether its gen is behind the global gen and if so, revert back to the original layer and then clear the gen number to indicate that the situation has been dealt with.
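A short sketch of that revocation check, assuming the same deferred-update scheme as above:

/* In maybe_refresh_layer(): a task whose generation fell behind the layer's
 * current generation was dropped from the override set, so revert it to
 * normal rule matching and mark the override as handled. */
if (tctx->override_gen && tctx->override_gen < override_gens[tctx->layer]) {
	tctx->override_gen = 0;
	tctx->refresh_layer = true;
}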

if (i >= MAX_OVERRIDE_YIELDS) {
scx_bpf_error("too many override yields");
return;
}

p = bpf_task_from_pid(octx->task_yields[i]);
if (!p)
continue;

if (!(tctx = lookup_task_ctx(p))) {
bpf_task_release(p);
continue;
}
tctx->refresh_layer = true;
tctx->override_gen = 0;
bpf_task_release(p);
}
octx->yield_gen++;
}
}

static void maybe_refresh_layered_cpumask(struct cpumask *layered_cpumask,
struct task_struct *p, struct task_ctx *tctx,
const struct cpumask *layer_cpumask)
@@ -437,6 +539,21 @@ static void maybe_refresh_layered_cpumask(struct cpumask *layered_cpumask,
trace("%s[%d] cpumask refreshed to seq %llu", p->comm, p->pid, layer_seq);
}

SEC("fentry")
int BPF_PROG(sched_tick_fentry)
{
int idx;

if (bpf_get_smp_processor_id() == 0) {
bpf_for(idx, 0, nr_layers) {
refresh_cpumasks(idx);
}
if (allow_overrides)
refresh_overrides(nr_layers);
}
return 0;
}

static s32 pick_idle_cpu_from(const struct cpumask *cand_cpumask, s32 prev_cpu,
const struct cpumask *idle_smtmask)
{
@@ -1099,7 +1216,7 @@ static void maybe_refresh_layer(struct task_struct *p, struct task_ctx *tctx)
u64 idx; // XXX - int makes verifier unhappy
pid_t pid = p->pid;

if (!tctx->refresh_layer)
if (!tctx->refresh_layer || tctx->override_gen > 0)
return;
tctx->refresh_layer = false;

@@ -1404,6 +1521,7 @@ s32 BPF_STRUCT_OPS(layered_init_task, struct task_struct *p,
tctx->pid = p->pid;
tctx->last_cpu = -1;
tctx->layer = -1;
tctx->override_gen = 0;
tctx->refresh_layer = true;

if (all_cpumask)
@@ -1523,6 +1641,7 @@ void BPF_STRUCT_OPS(layered_dump, struct scx_dump_ctx *dctx)
s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
{
struct bpf_cpumask *cpumask;
struct override_ctx *octx;
int i, j, k, nr_online_cpus, ret;

ret = scx_bpf_create_dsq(HI_FALLBACK_DSQ, -1);
@@ -1571,6 +1690,14 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
i, layer->min_exec_ns, layer->open, layer->preempt,
layer->exclusive);

if (allow_overrides) {
if (!(octx = lookup_override_ctx(i)))
return -ENOENT;
octx->nr_overrides = 0;
octx->gen = 0;
override_gens[i] = 0;
}

if (layer->nr_match_ors > MAX_LAYER_MATCH_ORS) {
scx_bpf_error("too many ORs");
return -EINVAL;
13 changes: 13 additions & 0 deletions scheds/rust/scx_layered/src/main.rs
@@ -408,6 +408,11 @@ struct Opts {
#[clap(short = 'e', long)]
example: Option<String>,

/// User override bpffs mount path. If set, the override_ctxs BPF map (one override_ctx per
/// layer) is pinned at this path. This can be used by userspace to control the layer of a task.
#[clap(short = 'o', long)]
override_ctx_mount: Option<String>,

/// Enable stats monitoring with the specified interval.
#[clap(long)]
stats: Option<f64>,
@@ -1596,6 +1601,7 @@ impl<'a, 'b> Scheduler<'a, 'b> {
// Initialize skel according to @opts.
skel.struct_ops.layered_mut().exit_dump_len = opts.exit_dump_len;

skel.maps.rodata_data.allow_overrides = opts.override_ctx_mount.is_some();
skel.maps.rodata_data.debug = opts.verbose as u32;
skel.maps.rodata_data.slice_ns = opts.slice_us * 1000;
skel.maps.rodata_data.max_exec_ns = if opts.max_exec_us > 0 {
@@ -1617,6 +1623,13 @@

let mut skel = scx_ops_load!(skel, layered, uei)?;

if let Some(path) = opts.override_ctx_mount.clone() {
if !skel.maps.override_ctxs.is_pinned() {
skel.maps.override_ctxs.pin(path)?;
}
}

let mut layers = vec![];
for (idx, spec) in layer_specs.iter().enumerate() {
layers.push(Layer::new(