diff --git a/scheds/rust/scx_bpfland/src/bpf/main.bpf.c b/scheds/rust/scx_bpfland/src/bpf/main.bpf.c
index 823f18073d..56b9c8ca5a 100644
--- a/scheds/rust/scx_bpfland/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_bpfland/src/bpf/main.bpf.c
@@ -94,6 +94,24 @@ volatile u64 nr_online_cpus;
  */
 static u64 nr_cpu_ids;
 
+/*
+ * Runtime throttling.
+ *
+ * Throttle the CPUs by injecting @throttle_ns idle time every @slice_max.
+ */
+const volatile u64 throttle_ns;
+static volatile bool cpus_throttled;
+
+static inline bool is_throttled(void)
+{
+        return READ_ONCE(cpus_throttled);
+}
+
+static inline void set_throttled(bool state)
+{
+        WRITE_ONCE(cpus_throttled, state);
+}
+
 /*
  * Exit information.
  */
@@ -134,6 +152,20 @@ struct {
         __type(value, struct numa_timer);
 } numa_timer SEC(".maps");
 
+/*
+ * Timer used to inject idle cycles when CPU throttling is enabled.
+ */
+struct throttle_timer {
+        struct bpf_timer timer;
+};
+
+struct {
+        __uint(type, BPF_MAP_TYPE_ARRAY);
+        __uint(max_entries, 1);
+        __type(key, u32);
+        __type(value, struct throttle_timer);
+} throttle_timer SEC(".maps");
+
 /*
  * Per-node context.
  */
@@ -781,6 +813,9 @@ s32 BPF_STRUCT_OPS(bpfland_select_cpu, struct task_struct *p,
         bool is_idle = false;
         s32 cpu;
 
+        if (is_throttled())
+                return prev_cpu;
+
         cpu = pick_idle_cpu(p, prev_cpu, wake_flags, &is_idle);
         if (is_idle) {
                 int node = __COMPAT_scx_bpf_cpu_node(cpu);
@@ -807,6 +842,9 @@ static bool kick_idle_cpu(const struct task_struct *p, const struct task_ctx *tc
         s32 cpu = scx_bpf_task_cpu(p);
         int node = __COMPAT_scx_bpf_cpu_node(cpu);
 
+        if (is_throttled())
+                return false;
+
         /*
          * No need to look for full-idle SMT cores if SMT is disabled.
          */
@@ -877,6 +915,12 @@ static bool try_direct_dispatch(struct task_struct *p, struct task_ctx *tctx,
                 return true;
         }
 
+        /*
+         * Skip direct dispatch if the CPUs are forced to stay idle.
+         */
+        if (is_throttled())
+                return false;
+
         /*
          * If ops.select_cpu() has been skipped, try direct dispatch.
          */
@@ -1066,6 +1110,12 @@ void BPF_STRUCT_OPS(bpfland_dispatch, s32 cpu, struct task_struct *prev)
 {
         int node = __COMPAT_scx_bpf_cpu_node(cpu);
 
+        /*
+         * Let the CPU go idle if the system is throttled.
+         */
+        if (is_throttled())
+                return;
+
         /*
          * Consume regular tasks from the shared DSQ, transferring them to the
          * local CPU DSQ.
@@ -1429,6 +1479,50 @@ static void init_cpuperf_target(void)
         scx_bpf_put_cpumask(online_cpumask);
 }
 
+/*
+ * Throttle timer used to inject idle time across all the CPUs.
+ */
+static int throttle_timerfn(void *map, int *key, struct bpf_timer *timer)
+{
+        bool throttled = is_throttled();
+        u64 flags, duration;
+        s32 cpu;
+        int err;
+
+        /*
+         * Stop the CPUs by sending a preemption IPI (SCX_KICK_PREEMPT) if we
+         * need to interrupt the running tasks and inject the idle sleep.
+         *
+         * Otherwise, send a wakeup IPI to resume from the injected idle
+         * sleep.
+         */
+        if (throttled) {
+                flags = SCX_KICK_IDLE;
+                duration = slice_max;
+        } else {
+                flags = SCX_KICK_PREEMPT;
+                duration = throttle_ns;
+        }
+
+        /*
+         * Flip the throttled state.
+         */
+        set_throttled(!throttled);
+
+        bpf_for(cpu, 0, nr_cpu_ids)
+                scx_bpf_kick_cpu(cpu, flags);
+
+        /*
+         * Re-arm the duty-cycle timer, setting either the runtime or the
+         * idle time duration.
+         */
+        err = bpf_timer_start(timer, duration, 0);
+        if (err)
+                scx_bpf_error("Failed to re-arm duty cycle timer");
+
+        return 0;
+}
+
 /*
  * Refresh NUMA statistics.
  */
@@ -1545,21 +1639,42 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
         if (err)
                 return err;
 
+        timer = bpf_map_lookup_elem(&throttle_timer, &key);
+        if (!timer) {
+                scx_bpf_error("Failed to lookup throttle timer");
+                return -ESRCH;
+        }
+
+        /*
+         * Arm the throttle timer if CPU throttling is enabled.
+         */
+        if (throttle_ns) {
+                bpf_timer_init(timer, &throttle_timer, CLOCK_BOOTTIME);
+                bpf_timer_set_callback(timer, throttle_timerfn);
+                err = bpf_timer_start(timer, slice_max, 0);
+                if (err) {
+                        scx_bpf_error("Failed to arm throttle timer");
+                        return err;
+                }
+        }
+
         /* Do not update NUMA statistics if there's only one node */
         if (numa_disabled || __COMPAT_scx_bpf_nr_node_ids() <= 1)
                 return 0;
 
         timer = bpf_map_lookup_elem(&numa_timer, &key);
         if (!timer) {
-                scx_bpf_error("Failed to lookup central timer");
+                scx_bpf_error("Failed to lookup NUMA timer");
                 return -ESRCH;
         }
 
         bpf_timer_init(timer, &numa_timer, CLOCK_BOOTTIME);
         bpf_timer_set_callback(timer, numa_timerfn);
         err = bpf_timer_start(timer, NSEC_PER_SEC, 0);
-        if (err)
+        if (err) {
                 scx_bpf_error("Failed to start NUMA timer");
+                return err;
+        }
 
         return 0;
 }
diff --git a/scheds/rust/scx_bpfland/src/main.rs b/scheds/rust/scx_bpfland/src/main.rs
index b93d46126c..7b106b993d 100644
--- a/scheds/rust/scx_bpfland/src/main.rs
+++ b/scheds/rust/scx_bpfland/src/main.rs
@@ -135,6 +135,13 @@ struct Opts {
     #[clap(short = 'l', long, allow_hyphen_values = true, default_value = "20000")]
     slice_us_lag: i64,
 
+    /// Throttle the running CPUs by periodically injecting idle cycles.
+    ///
+    /// This option can help extend battery life on portable devices, reduce heating, fan noise,
+    /// and overall energy consumption (0 = disable).
+    #[clap(short = 't', long, default_value = "0")]
+    throttle_us: u64,
+
     /// Set CPU idle QoS resume latency in microseconds (-1 = disabled).
     ///
     /// Setting a lower latency value makes CPUs less likely to enter deeper idle states, enhancing
@@ -303,12 +310,16 @@ impl<'a> Scheduler<'a> {
         skel.maps.rodata_data.smt_enabled = smt_enabled;
         skel.maps.rodata_data.numa_disabled = opts.disable_numa;
         skel.maps.rodata_data.local_pcpu = opts.local_pcpu;
-        skel.maps.rodata_data.local_kthreads = opts.local_kthreads;
         skel.maps.rodata_data.no_preempt = opts.no_preempt;
         skel.maps.rodata_data.no_wake_sync = opts.no_wake_sync;
         skel.maps.rodata_data.slice_max = opts.slice_us * 1000;
         skel.maps.rodata_data.slice_min = opts.slice_us_min * 1000;
         skel.maps.rodata_data.slice_lag = opts.slice_us_lag * 1000;
+        skel.maps.rodata_data.throttle_ns = opts.throttle_us * 1000;
+
+        // Implicitly enable direct dispatch of per-CPU kthreads if CPU throttling is enabled
+        // (it's never a good idea to throttle per-CPU kthreads).
+        skel.maps.rodata_data.local_kthreads = opts.local_kthreads || opts.throttle_us > 0;
 
         // Set scheduler compatibility flags.
         skel.maps.rodata_data.__COMPAT_SCX_PICK_IDLE_IN_NODE = *compat::SCX_PICK_IDLE_IN_NODE;
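
Usage sketch (not part of the patch; assumes clap's default kebab-case long-option naming, i.e. -t / --throttle-us for the throttle_us field): enabling throttling with 10 ms of idle injection would look like:

    $ scx_bpfland --throttle-us 10000

Per the timer logic above, each CPU then alternates slice_max of runtime with throttle_ns of forced idle, giving an effective duty cycle of roughly slice_max / (slice_max + throttle_ns).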