diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c
index 2fe1e40471..9be312c0bc 100644
--- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c
+++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c
@@ -236,8 +236,10 @@ static inline int allocate_cell()
 		if (!(c = lookup_cell(cell_idx)))
 			return -1;
 
-		if (__sync_bool_compare_and_swap(&c->in_use, 0, 1))
+		if (__sync_bool_compare_and_swap(&c->in_use, 0, 1)) {
+			WRITE_ONCE(c->vtime_now, 0);
 			return cell_idx;
+		}
 	}
 	scx_bpf_error("No available cells to allocate");
 	return -1;
@@ -1130,6 +1132,10 @@ void BPF_STRUCT_OPS(mitosis_stopping, struct task_struct *p, bool runnable)
 	used = now - tctx->started_running_at;
 	tctx->started_running_at = now;
 	/* scale the execution time by the inverse of the weight and charge */
+	if (p->scx.weight == 0) {
+		scx_bpf_error("Task %d has zero weight", p->pid);
+		return;
+	}
 	p->scx.dsq_vtime += used * 100 / p->scx.weight;
 
 	if (cidx != 0 || tctx->all_cell_cpus_allowed) {
@@ -1210,11 +1216,13 @@ s32 BPF_STRUCT_OPS(mitosis_cgroup_exit, struct cgroup *cgrp)
 
 	record_cgroup_exit(cgrp->kn->id);
 
-	if (!(cgc = bpf_cgrp_storage_get(&cgrp_ctxs, cgrp, 0,
-					 BPF_LOCAL_STORAGE_GET_F_CREATE))) {
-		scx_bpf_error("cgrp_ctx creation failed for cgid %llu",
-			      cgrp->kn->id);
-		return -ENOENT;
+	/*
+	 * Use lookup without CREATE since this is the exit path. If the cgroup
+	 * doesn't have storage, it's not a cell owner anyway.
+	 */
+	if (!(cgc = lookup_cgrp_ctx(cgrp))) {
+		/* lookup_cgrp_ctx() already raised scx_bpf_error(); return 0 for the verifier. */
+		return 0;
 	}
 
 	if (cgc->cell_owner) {
diff --git a/scheds/rust/scx_mitosis/src/main.rs b/scheds/rust/scx_mitosis/src/main.rs
index 5810e3423f..58cc138367 100644
--- a/scheds/rust/scx_mitosis/src/main.rs
+++ b/scheds/rust/scx_mitosis/src/main.rs
@@ -8,7 +8,7 @@ pub mod bpf_intf;
 mod stats;
 
 use std::cmp::max;
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::fmt;
 use std::fmt::Display;
 use std::mem::MaybeUninit;
@@ -145,10 +145,14 @@
         // given logging interval, the global and cell queueing decision counts print at the same width.
         // Second, it reduces variance in column width between logging intervals. 5 is simply a heuristic.
         const MIN_DECISIONS_WIDTH: usize = 5;
-        let descisions_width = max(
-            MIN_DECISIONS_WIDTH,
-            (self.global_queue_decisions as f64).log10().ceil() as usize,
-        );
+        let descisions_width = if self.global_queue_decisions > 0 {
+            max(
+                MIN_DECISIONS_WIDTH,
+                (self.global_queue_decisions as f64).log10().ceil() as usize,
+            )
+        } else {
+            MIN_DECISIONS_WIDTH
+        };
         write!(
             f,
             "{:width$} {:5.1}% | Local:{:4.1}% From: CPU:{:4.1}% Cell:{:4.1}% | V:{:4.1}%",
@@ -441,15 +445,17 @@ impl<'a> Scheduler<'a> {
 
     fn refresh_bpf_cells(&mut self) -> Result<()> {
         let applied_configuration = unsafe {
-            std::ptr::read_volatile(
-                &self
-                    .skel
-                    .maps
-                    .bss_data
-                    .as_ref()
-                    .unwrap()
-                    .applied_configuration_seq as *const u32,
-            )
+            let ptr = &self
+                .skel
+                .maps
+                .bss_data
+                .as_ref()
+                .unwrap()
+                .applied_configuration_seq as *const u32;
+            (ptr as *const std::sync::atomic::AtomicU32)
+                .as_ref()
+                .unwrap()
+                .load(std::sync::atomic::Ordering::Acquire)
         };
         if self
             .last_configuration_seq
@@ -471,28 +477,35 @@
 
         // Create cells we don't have yet, drop cells that are no longer in use.
         // If we continue to drop cell metrics once a cell is removed, we'll need to make sure we
        // flush metrics for a cell before we remove it completely.
-        let cells = &self.skel.maps.bss_data.as_ref().unwrap().cells;
-        for i in 0..MAX_CELLS {
-            let cell_idx = i as u32;
-            let bpf_cell = cells[i];
-            let in_use = unsafe { std::ptr::read_volatile(&bpf_cell.in_use as *const u32) };
-            if in_use > 0 {
-                self.cells
-                    .entry(cell_idx)
-                    .or_insert_with(|| Cell {
-                        cpus: Cpumask::new(),
-                    })
-                    .cpus = cell_to_cpus
-                    .get(&cell_idx)
-                    .expect("missing cell in cpu map")
-                    .clone();
-                self.metrics.cells.insert(cell_idx, CellMetrics::default());
-            } else {
-                self.cells.remove(&cell_idx);
-                self.metrics.cells.remove(&cell_idx);
-            }
+        //
+        // IMPORTANT: We determine which cells exist based on CPU assignments (which are
+        // synchronized by applied_configuration_seq), NOT by reading the in_use field
+        // separately. This avoids a TOCTOU race where a cell's in_use is set before
+        // CPUs are assigned.
+
+        // Cell 0 (root cell) always exists even if it has no CPUs temporarily
+        let cells_with_cpus: HashSet<u32> = cell_to_cpus.keys().copied().collect();
+        let mut active_cells = cells_with_cpus.clone();
+        active_cells.insert(0);
+
+        for cell_idx in &active_cells {
+            let cpus = cell_to_cpus
+                .get(cell_idx)
+                .cloned()
+                .unwrap_or_else(|| Cpumask::new());
+            self.cells
+                .entry(*cell_idx)
+                .or_insert_with(|| Cell {
+                    cpus: Cpumask::new(),
+                })
+                .cpus = cpus;
+            self.metrics.cells.insert(*cell_idx, CellMetrics::default());
         }
+        // Remove cells that no longer have CPUs assigned
+        self.cells.retain(|&k, _| active_cells.contains(&k));
+        self.metrics.cells.retain(|&k, _| active_cells.contains(&k));
+
         self.last_configuration_seq = Some(applied_configuration);
 
         Ok(())
@@ -507,6 +520,13 @@ fn read_cpu_ctxs(skel: &BpfSkel) -> Result<Vec<bpf_intf::cpu_ctx>> {
         .lookup_percpu(&0u32.to_ne_bytes(), libbpf_rs::MapFlags::ANY)
         .context("Failed to lookup cpu_ctx")?
         .unwrap();
+    if cpu_ctxs_vec.len() < *NR_CPUS_POSSIBLE {
+        bail!(
+            "Percpu map returned {} entries but expected {}",
+            cpu_ctxs_vec.len(),
+            *NR_CPUS_POSSIBLE
+        );
+    }
     for cpu in 0..*NR_CPUS_POSSIBLE {
         cpu_ctxs.push(*unsafe {
             &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)
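Note on the refresh_bpf_cells() hunk above: read_volatile only guarantees the read itself is not elided; it imposes no ordering relative to the map reads that follow. Loading applied_configuration_seq through an AtomicU32 with Acquire ordering pairs with the BPF side publishing a new configuration, so the cell/CPU state read afterwards is at least as fresh as the sequence number. A minimal sketch of the idiom (the Bss struct and function names are illustrative stand-ins, not part of the patch):

use std::sync::atomic::{AtomicU32, Ordering};

// Illustrative stand-in for the generated skeleton's bss: a plain u32
// that the writer bumps after publishing a new configuration.
struct Bss {
    applied_configuration_seq: u32,
}

fn read_applied_seq(bss: &Bss) -> u32 {
    // Reinterpret the plain u32 as an AtomicU32; this mirrors what the
    // patch does and is sound because the two types have identical size
    // and alignment.
    let seq = unsafe { &*(&bss.applied_configuration_seq as *const u32 as *const AtomicU32) };
    // Acquire: writes made before the sequence bump (e.g. per-cell CPU
    // assignments) are visible to reads performed after this load.
    seq.load(Ordering::Acquire)
}

fn main() {
    let bss = Bss { applied_configuration_seq: 42 };
    assert_eq!(read_applied_seq(&bss), 42);
}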
diff --git a/scheds/rust/scx_mitosis/test/cleanup_test_cgroups.sh b/scheds/rust/scx_mitosis/test/cleanup_test_cgroups.sh
new file mode 100755
index 0000000000..a6fc804022
--- /dev/null
+++ b/scheds/rust/scx_mitosis/test/cleanup_test_cgroups.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+# Cleanup all test cgroups created by test scripts
+
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m'
+
+if [ "$EUID" -ne 0 ]; then
+    echo -e "${RED}Must run as root${NC}"
+    exit 1
+fi
+
+echo -e "${YELLOW}========================================${NC}"
+echo -e "${YELLOW}Cleaning up all test cgroups${NC}"
+echo -e "${YELLOW}========================================${NC}\n"
+
+# Find all test-related cgroups with pattern matching
+TEST_PATTERNS=(
+    "test_mitosis*"
+    "test_cell*"
+    "test_simple*"
+    "test_brutal*"
+    "test_absolute*"
+    "test_working*"
+    "test_reuse*"
+    "test_verify*"
+    "test_cycle*"
+    "test_16*"
+    "test_15*"
+    "test_*"
+    "scx_mitosis_test*"
+)
+
+CLEANED=0
+FAILED=0
+
+# First pass: Find all matching cgroups at root level
+ALL_TEST_CGROUPS=()
+for pattern in "${TEST_PATTERNS[@]}"; do
+    shopt -s nullglob # Make glob return nothing if no matches
+    for cg in /sys/fs/cgroup/${pattern}; do
+        if [ -d "$cg" ]; then
+            ALL_TEST_CGROUPS+=("$cg")
+        fi
+    done
+    shopt -u nullglob
+done
+
+# Remove duplicates and sort
+UNIQUE_CGROUPS=($(printf '%s\n' "${ALL_TEST_CGROUPS[@]}" | sort -u))
+
+if [ ${#UNIQUE_CGROUPS[@]} -eq 0 ]; then
+    echo -e "${GREEN}No test cgroups found to clean up${NC}"
+    exit 0
+fi
+
+echo -e "${YELLOW}Found ${#UNIQUE_CGROUPS[@]} test cgroups to clean up${NC}\n"
+
+# Clean each cgroup
+for test_root in "${UNIQUE_CGROUPS[@]}"; do
+    if [ ! -d "$test_root" ]; then
+        continue
+    fi
+
+    echo -e "${YELLOW}Cleaning $(basename $test_root)...${NC}"
+
+    # Find all child cgroups, deepest first
+    if [ -d "$test_root" ]; then
+        find "$test_root" -mindepth 1 -type d 2>/dev/null | sort -r | while read -r cg; do
+            # Kill all processes in the cgroup
+            if [ -f "$cg/cgroup.procs" ]; then
+                cat "$cg/cgroup.procs" 2>/dev/null | xargs -r kill -9 2>/dev/null || true
+            fi
+            sleep 0.05
+            # Remove the cgroup
+            rmdir "$cg" 2>/dev/null || true
+        done
+    fi
+
+    # Remove the root test cgroup
+    if [ -f "$test_root/cgroup.procs" ]; then
+        cat "$test_root/cgroup.procs" 2>/dev/null | xargs -r kill -9 2>/dev/null || true
+    fi
+    sleep 0.1
+    rmdir "$test_root" 2>/dev/null || true
+
+    if [ -d "$test_root" ]; then
+        echo -e "${RED} ✗ Failed to remove $(basename $test_root)${NC}"
+        FAILED=$((FAILED + 1))
+    else
+        echo -e "${GREEN} ✓ Removed $(basename $test_root)${NC}"
+        CLEANED=$((CLEANED + 1))
+    fi
+done
+
+echo -e "\n${YELLOW}========================================${NC}"
+echo -e "${GREEN}Cleaned: $CLEANED cgroups${NC}"
+if [ $FAILED -gt 0 ]; then
+    echo -e "${RED}Failed: $FAILED cgroups${NC}"
+    echo -e "${YELLOW}Try running the cleanup again if some cgroups are still active${NC}"
+else
+    echo -e "${GREEN}All test cgroups removed successfully!${NC}"
+fi
+echo -e "${YELLOW}========================================${NC}"
diff --git a/scheds/rust/scx_mitosis/test/test_exhaust_unpatched.sh b/scheds/rust/scx_mitosis/test/test_exhaust_unpatched.sh
new file mode 100755
index 0000000000..ab6ca0e13e
--- /dev/null
+++ b/scheds/rust/scx_mitosis/test/test_exhaust_unpatched.sh
@@ -0,0 +1,162 @@
+#!/bin/bash
+# Test that MUST fail on unpatched code and pass on patched code
+# Strategy: Fill all cells, remove cpusets (leak cells), try to allocate more
+
+set -e
+
+CGROUP_ROOT="/sys/fs/cgroup"
+MAX_CELLS=16
+INITIAL_CELLS=15 # Leave 1 slot for root cell
+
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m'
+
+if [ "$EUID" -ne 0 ]; then
+    echo -e "${RED}Must run as root${NC}"
+    exit 1
+fi
+
+if ! pgrep -x scx_mitosis > /dev/null; then
+    echo -e "${RED}scx_mitosis not running${NC}"
+    exit 1
+fi
+
+NUM_CPUS=$(nproc)
+if [ "$NUM_CPUS" -lt "$INITIAL_CELLS" ]; then
+    echo -e "${RED}Need at least $INITIAL_CELLS CPUs${NC}"
+    exit 1
+fi
+
+CREATED_CGROUPS=()
+
+cleanup() {
+    echo -e "\n${YELLOW}Cleanup...${NC}"
+    for cg in "${CREATED_CGROUPS[@]}"; do
+        if [ -d "$cg" ]; then
+            cat "$cg/cgroup.procs" 2>/dev/null | xargs -r kill -9 2>/dev/null || true
+            rmdir "$cg" 2>/dev/null || true
+        fi
+    done
+}
+
+trap cleanup EXIT INT TERM
+
+echo -e "${YELLOW}========================================${NC}"
+echo -e "${YELLOW}EXHAUSTION TEST (unpatched should FAIL)${NC}"
+echo -e "${YELLOW}========================================${NC}\n"
+
+echo "+cpuset" > "$CGROUP_ROOT/cgroup.subtree_control" 2>/dev/null || true
+dmesg -c > /dev/null
+initial_errors=$(dmesg | grep -c "No available cells to allocate" || true)
+
+# PHASE 1: Fill all cells (create 15 cgroups = 15 cells + 1 root = 16 total)
+echo -e "${YELLOW}PHASE 1: Creating $INITIAL_CELLS cgroups to fill all cells${NC}"
+for i in $(seq 1 $INITIAL_CELLS); do
+    cg="$CGROUP_ROOT/test_exhaust_batch1_$i"
+    CREATED_CGROUPS+=("$cg")
+
+    mkdir -p "$cg"
+    echo "0" > "$cg/cpuset.mems"
+    cpu=$((i - 1))
+    echo "$cpu" > "$cg/cpuset.cpus"
+
+    # Spawn a sleeper task in the cgroup: move the subshell in, then exec
+    # sleep ($$ would name the parent script, not the task we want moved)
+    (echo $BASHPID > "$cg/cgroup.procs" && exec sleep 600) 2>/dev/null &
+
+    echo -e "  Created cgroup $i: CPU $cpu"
+    sleep 0.12
+done
+
+echo -e "${GREEN}Created $INITIAL_CELLS cgroups - all cells allocated${NC}"
+sleep 2
+
+# Verify all cells allocated
+current_errors=$(dmesg | grep -c "No available cells to allocate" || true)
+if [ "$current_errors" -gt "$initial_errors" ]; then
+    echo -e "${RED}ERROR: Unexpected exhaustion during phase 1!${NC}"
+    exit 1
+fi
+
+# PHASE 2: Remove cpusets from HALF the cgroups (leak cells on unpatched)
+REMOVE_COUNT=$((INITIAL_CELLS / 2))
+echo -e "\n${YELLOW}PHASE 2: Removing cpusets from $REMOVE_COUNT cgroups${NC}"
+echo -e "${YELLOW}  On UNPATCHED: Cells stay allocated (leaked)${NC}"
+echo -e "${YELLOW}  On PATCHED: Cells freed when DSQ empty${NC}"
+
+for i in $(seq 1 $REMOVE_COUNT); do
+    cg="$CGROUP_ROOT/test_exhaust_batch1_$i"
+
+    # Kill tasks first to ensure DSQ is empty
+    cat "$cg/cgroup.procs" 2>/dev/null | xargs -r kill -9 2>/dev/null || true
+    sleep 0.05
+
+    # Remove cpuset restriction (assign to all CPUs)
+    echo "0-$((NUM_CPUS - 1))" > "$cg/cpuset.cpus" 2>/dev/null || true
+
+    echo -e "  Removed cpuset from cgroup $i"
+    sleep 0.1
+done
+
+echo -e "${GREEN}Removed cpusets from $REMOVE_COUNT cgroups${NC}"
+echo -e "${YELLOW}Waiting for timer to process (100ms intervals)...${NC}"
+sleep 3
+
+# PHASE 3: Try to create NEW cgroups with cpusets
+echo -e "\n${YELLOW}PHASE 3: Creating $REMOVE_COUNT NEW cgroups with cpusets${NC}"
+echo -e "${YELLOW}  On UNPATCHED: Should FAIL (cells leaked)${NC}"
+echo -e "${YELLOW}  On PATCHED: Should SUCCEED (cells freed)${NC}\n"
+
+success=0
+for i in $(seq 1 $REMOVE_COUNT); do
+    cg="$CGROUP_ROOT/test_exhaust_batch2_$i"
+    CREATED_CGROUPS+=("$cg")
+
+    mkdir -p "$cg"
+    echo "0" > "$cg/cpuset.mems"
+    cpu=$((i - 1))
+
+    echo -ne "${YELLOW}Creating NEW cgroup $i (CPU $cpu)...${NC}"
+
+    if echo "$cpu" > "$cg/cpuset.cpus" 2>/dev/null; then
+        (echo $BASHPID > "$cg/cgroup.procs" && exec sleep 600) 2>/dev/null &
+        echo -e " ${GREEN}OK${NC}"
+        success=$((success + 1))
+    else
+        echo -e " ${RED}FAILED${NC}"
+    fi
+
+    sleep 0.15
+
+    # Check for exhaustion immediately
+    current_errors=$(dmesg | grep -c "No available cells to allocate" || true)
"$initial_errors" ]; then + new_errors=$((current_errors - initial_errors)) + echo -e "\n${RED}========================================${NC}" + echo -e "${RED}CELL EXHAUSTION DETECTED!${NC}" + echo -e "${RED}========================================${NC}" + echo -e "${RED}Failed at NEW cgroup $i${NC}" + echo -e "${RED}Successfully created: $success/$REMOVE_COUNT${NC}" + echo -e "${RED}Exhaustion errors: $new_errors${NC}\n" + dmesg | grep "No available cells" | tail -10 + echo -e "\n${GREEN}TEST RESULT: BUG REPRODUCED (unpatched code)${NC}" + echo -e "${GREEN}This test would PASS with the patch applied${NC}" + exit 0 + fi +done + +# Check final errors +final_errors=$(dmesg | grep -c "No available cells to allocate" || true) +if [ "$final_errors" -gt "$initial_errors" ]; then + echo -e "\n${GREEN}BUG REPRODUCED! (unpatched code)${NC}" + exit 0 +else + echo -e "\n${YELLOW}========================================${NC}" + echo -e "${YELLOW}NO EXHAUSTION - Patch appears to be applied${NC}" + echo -e "${YELLOW}========================================${NC}" + echo -e "${GREEN}All $REMOVE_COUNT new cgroups created successfully${NC}" + echo -e "${GREEN}Cells were properly freed and reused${NC}" + echo -e "\n${GREEN}TEST RESULT: PASS (patched code)${NC}" + exit 0 +fi