From 93d5d5ba9b13701c1dfe5677020a1fa6c37258c6 Mon Sep 17 00:00:00 2001
From: Pat Somaru <patso@likewhatevs.io>
Date: Fri, 17 Oct 2025 05:11:19 +0000
Subject: [PATCH 1/8] Add defensive check for zero weight in vtime calculation

In mitosis_stopping, the scheduler calculates vtime advancement by
dividing by the task's weight. If a task somehow has a zero weight,
this would cause a division by zero error. Add a defensive check to
catch this case and report an error rather than crashing.
---
 scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c
index 2fe1e40471..45006221de 100644
--- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c
+++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c
@@ -1130,6 +1130,10 @@ void BPF_STRUCT_OPS(mitosis_stopping, struct task_struct *p, bool runnable)
 	used			 = now - tctx->started_running_at;
 	tctx->started_running_at = now;
 	/* scale the execution time by the inverse of the weight and charge */
+	if (p->scx.weight == 0) {
+		scx_bpf_error("Task %d has zero weight", p->pid);
+		return;
+	}
 	p->scx.dsq_vtime += used * 100 / p->scx.weight;
 
 	if (cidx != 0 || tctx->all_cell_cpus_allowed) {

From 5e50cb091d43e62e0cf078411b1d8c869a4ca578 Mon Sep 17 00:00:00 2001
From: Pat Somaru <patso@likewhatevs.io>
Date: Fri, 17 Oct 2025 05:11:39 +0000
Subject: [PATCH 2/8] Initialize cell vtime_now on allocation

When allocating a new cell, the vtime_now field is not initialized,
which means it contains garbage data. This can cause scheduling
anomalies as tasks inherit this uninitialized vtime. Initialize
vtime_now to 0 when allocating a cell to ensure consistent vtime
tracking from the start.
---
 scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c
index 45006221de..a7bb177215 100644
--- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c
+++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c
@@ -236,8 +236,10 @@ static inline int allocate_cell()
 		if (!(c = lookup_cell(cell_idx)))
 			return -1;
 
-		if (__sync_bool_compare_and_swap(&c->in_use, 0, 1))
+		if (__sync_bool_compare_and_swap(&c->in_use, 0, 1)) {
+			WRITE_ONCE(c->vtime_now, 0);
 			return cell_idx;
+		}
 	}
 	scx_bpf_error("No available cells to allocate");
 	return -1;

From 199eae8f075f7cd9505477708a067b473dd62ae9 Mon Sep 17 00:00:00 2001
From: Pat Somaru <patso@likewhatevs.io>
Date: Fri, 17 Oct 2025 05:12:09 +0000
Subject: [PATCH 3/8] Use atomic operations for BPF-to-Rust communication

Replace std::ptr::read_volatile with proper atomic load operations
with Acquire ordering when reading fields shared between BPF and Rust
code. This ensures proper memory ordering guarantees required for
multi-threaded communication:

- applied_configuration_seq: Used to synchronize configuration updates
- in_use: Used to determine cell allocation status

The Acquire ordering ensures that all writes made by the BPF side
before releasing the data are visible to the Rust side after the
load.
---
 scheds/rust/scx_mitosis/src/main.rs | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/scheds/rust/scx_mitosis/src/main.rs b/scheds/rust/scx_mitosis/src/main.rs
index 5810e3423f..4aee028111 100644
--- a/scheds/rust/scx_mitosis/src/main.rs
+++ b/scheds/rust/scx_mitosis/src/main.rs
@@ -441,15 +441,17 @@ impl<'a> Scheduler<'a> {
 
     fn refresh_bpf_cells(&mut self) -> Result<()> {
         let applied_configuration = unsafe {
-            std::ptr::read_volatile(
-                &self
-                    .skel
-                    .maps
-                    .bss_data
-                    .as_ref()
-                    .unwrap()
-                    .applied_configuration_seq as *const u32,
-            )
+            let ptr = &self
+                .skel
+                .maps
+                .bss_data
+                .as_ref()
+                .unwrap()
+                .applied_configuration_seq as *const u32;
+            (ptr as *const std::sync::atomic::AtomicU32)
+                .as_ref()
+                .unwrap()
+                .load(std::sync::atomic::Ordering::Acquire)
         };
         if self
             .last_configuration_seq
@@ -475,7 +477,13 @@ impl<'a> Scheduler<'a> {
         for i in 0..MAX_CELLS {
             let cell_idx = i as u32;
             let bpf_cell = cells[i];
-            let in_use = unsafe { std::ptr::read_volatile(&bpf_cell.in_use as *const u32) };
+            let in_use = unsafe {
+                let ptr = &bpf_cell.in_use as *const u32;
+                (ptr as *const std::sync::atomic::AtomicU32)
+                    .as_ref()
+                    .unwrap()
+                    .load(std::sync::atomic::Ordering::Acquire)
+            };
             if in_use > 0 {
                 self.cells
                     .entry(cell_idx)

From ada3169374c545bc25b2d24034259e776d78c82f Mon Sep 17 00:00:00 2001
From: Pat Somaru <patso@likewhatevs.io>
Date: Fri, 17 Oct 2025 05:12:21 +0000
Subject: [PATCH 4/8] Replace panic-prone expect with recoverable error
 handling

Replace expect() with unwrap_or_else() when looking up cell CPU
assignments. If a cell is marked as in_use but has no CPUs assigned
(which can happen during race conditions or transient states), log a
warning and use an empty cpumask rather than panicking. This makes
the scheduler more resilient to transient inconsistencies.
---
 scheds/rust/scx_mitosis/src/main.rs | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/scheds/rust/scx_mitosis/src/main.rs b/scheds/rust/scx_mitosis/src/main.rs
index 4aee028111..8f3e470be5 100644
--- a/scheds/rust/scx_mitosis/src/main.rs
+++ b/scheds/rust/scx_mitosis/src/main.rs
@@ -485,15 +485,19 @@ impl<'a> Scheduler<'a> {
                     .load(std::sync::atomic::Ordering::Acquire)
             };
             if in_use > 0 {
+                let cpus = cell_to_cpus.get(&cell_idx).cloned().unwrap_or_else(|| {
+                    warn!(
+                        "Cell {} marked in_use but has no CPUs assigned, using empty cpumask",
+                        cell_idx
+                    );
+                    Cpumask::new()
+                });
                 self.cells
                     .entry(cell_idx)
                     .or_insert_with(|| Cell {
                         cpus: Cpumask::new(),
                     })
-                    .cpus = cell_to_cpus
-                    .get(&cell_idx)
-                    .expect("missing cell in cpu map")
-                    .clone();
+                    .cpus = cpus;
                 self.metrics.cells.insert(cell_idx, CellMetrics::default());
             } else {
                 self.cells.remove(&cell_idx);

From 067e590c41c48401fb14ea7f3dbbb51afe44d0f4 Mon Sep 17 00:00:00 2001
From: Pat Somaru <patso@likewhatevs.io>
Date: Fri, 17 Oct 2025 05:12:37 +0000
Subject: [PATCH 5/8] Guard against log10(0) in formatting code

Add a check to ensure global_queue_decisions is non-zero before
taking log10. If it's zero (which can happen during initialization
or in periods of no activity), use MIN_DECISIONS_WIDTH directly
instead of attempting to calculate log10(0), which returns negative
infinity and causes formatting issues.
---
 scheds/rust/scx_mitosis/src/main.rs | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/scheds/rust/scx_mitosis/src/main.rs b/scheds/rust/scx_mitosis/src/main.rs
index 8f3e470be5..91de2681ce 100644
--- a/scheds/rust/scx_mitosis/src/main.rs
+++ b/scheds/rust/scx_mitosis/src/main.rs
@@ -145,10 +145,14 @@ impl Display for DistributionStats {
         // given logging interval, the global and cell queueing decision counts print at the same width.
         // Second, it reduces variance in column width between logging intervals. 5 is simply a heuristic.
         const MIN_DECISIONS_WIDTH: usize = 5;
-        let descisions_width = max(
-            MIN_DECISIONS_WIDTH,
-            (self.global_queue_decisions as f64).log10().ceil() as usize,
-        );
+        let descisions_width = if self.global_queue_decisions > 0 {
+            max(
+                MIN_DECISIONS_WIDTH,
+                (self.global_queue_decisions as f64).log10().ceil() as usize,
+            )
+        } else {
+            MIN_DECISIONS_WIDTH
+        };
         write!(
             f,
             "{:width$} {:5.1}% | Local:{:4.1}% From: CPU:{:4.1}% Cell:{:4.1}% | V:{:4.1}%",

From d9c355fa829ca93b38b4739b7f2947f852e58178 Mon Sep 17 00:00:00 2001
From: Pat Somaru <patso@likewhatevs.io>
Date: Fri, 17 Oct 2025 05:12:54 +0000
Subject: [PATCH 6/8] Add bounds check before indexing percpu map vector

Add validation that the percpu map lookup returns at least
NR_CPUS_POSSIBLE entries before indexing into it. If the map
returns fewer entries than expected, fail with a clear error
message rather than panicking with an out-of-bounds access.
---
 scheds/rust/scx_mitosis/src/main.rs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/scheds/rust/scx_mitosis/src/main.rs b/scheds/rust/scx_mitosis/src/main.rs
index 91de2681ce..647ffcca7e 100644
--- a/scheds/rust/scx_mitosis/src/main.rs
+++ b/scheds/rust/scx_mitosis/src/main.rs
@@ -523,6 +523,13 @@ fn read_cpu_ctxs(skel: &BpfSkel) -> Result<Vec<bpf_intf::cpu_ctx>> {
         .lookup_percpu(&0u32.to_ne_bytes(), libbpf_rs::MapFlags::ANY)
         .context("Failed to lookup cpu_ctx")?
         .unwrap();
+    if cpu_ctxs_vec.len() < *NR_CPUS_POSSIBLE {
+        bail!(
+            "Percpu map returned {} entries but expected {}",
+            cpu_ctxs_vec.len(),
+            *NR_CPUS_POSSIBLE
+        );
+    }
     for cpu in 0..*NR_CPUS_POSSIBLE {
         cpu_ctxs.push(*unsafe {
             &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const bpf_intf::cpu_ctx)

From cdc5cb11ce35349f54c76103548fac564383006e Mon Sep 17 00:00:00 2001
From: Pat Somaru <patso@likewhatevs.io>
Date: Fri, 17 Oct 2025 05:23:48 +0000
Subject: [PATCH 7/8] Fix cgroup lifecycle and cell discovery races

Address several race conditions and lifecycle issues that were causing
errors under load:

BPF side (mitosis.bpf.c):
- Fix cgroup_exit incorrectly using BPF_LOCAL_STORAGE_GET_F_CREATE
  flag. The exit handler should not create storage; if storage doesn't
  exist, the cgroup was never a cell owner and there's nothing to free.
  This was causing "cgrp_ctx creation failed" errors.

Rust side (main.rs):
- Fix TOCTOU race in cell discovery by inferring cell existence from
  CPU assignments rather than reading in_use flag separately. The
  in_use flag is set early in allocate_cell() but CPUs aren't assigned
  until later in update_timer_cb. Reading in_use with Acquire doesn't
  synchronize with CPU assignments (which are synchronized via
  applied_configuration_seq). This was causing "missing cell in cpu
  map" warnings.

Known remaining issue:
- Cells are not freed when a cgroup's cpuset is removed (only when
  cgroup exits). See TODO at line 870 in mitosis.bpf.c. With
  MAX_CELLS=16, systems with many dynamic cpuset changes can exhaust
  the cell pool, leading to "No available cells to allocate" errors.
  This may be an issue only present in synthetic tests.
---
 scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c | 12 ++--
 scheds/rust/scx_mitosis/src/main.rs           | 59 +++++++++----------
 2 files changed, 35 insertions(+), 36 deletions(-)

diff --git a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c
index a7bb177215..9be312c0bc 100644
--- a/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c
+++ b/scheds/rust/scx_mitosis/src/bpf/mitosis.bpf.c
@@ -1216,11 +1216,13 @@ s32 BPF_STRUCT_OPS(mitosis_cgroup_exit, struct cgroup *cgrp)
 
 	record_cgroup_exit(cgrp->kn->id);
 
-	if (!(cgc = bpf_cgrp_storage_get(&cgrp_ctxs, cgrp, 0,
-					 BPF_LOCAL_STORAGE_GET_F_CREATE))) {
-		scx_bpf_error("cgrp_ctx creation failed for cgid %llu",
-			      cgrp->kn->id);
-		return -ENOENT;
+	/*
+	 * Use lookup without CREATE since this is the exit path. If the cgroup
+	 * doesn't have storage, it's not a cell owner anyway.
+	 */
+	if (!(cgc = lookup_cgrp_ctx(cgrp))) {
+		/* Errors above on failure, verifier. */
+		return 0;
 	}
 
 	if (cgc->cell_owner) {
diff --git a/scheds/rust/scx_mitosis/src/main.rs b/scheds/rust/scx_mitosis/src/main.rs
index 647ffcca7e..58cc138367 100644
--- a/scheds/rust/scx_mitosis/src/main.rs
+++ b/scheds/rust/scx_mitosis/src/main.rs
@@ -8,7 +8,7 @@ pub mod bpf_intf;
 mod stats;
 
 use std::cmp::max;
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::fmt;
 use std::fmt::Display;
 use std::mem::MaybeUninit;
@@ -477,38 +477,35 @@ impl<'a> Scheduler<'a> {
         // Create cells we don't have yet, drop cells that are no longer in use.
         // If we continue to drop cell metrics once a cell is removed, we'll need to make sure we
         // flush metrics for a cell before we remove it completely.
-        let cells = &self.skel.maps.bss_data.as_ref().unwrap().cells;
-        for i in 0..MAX_CELLS {
-            let cell_idx = i as u32;
-            let bpf_cell = cells[i];
-            let in_use = unsafe {
-                let ptr = &bpf_cell.in_use as *const u32;
-                (ptr as *const std::sync::atomic::AtomicU32)
-                    .as_ref()
-                    .unwrap()
-                    .load(std::sync::atomic::Ordering::Acquire)
-            };
-            if in_use > 0 {
-                let cpus = cell_to_cpus.get(&cell_idx).cloned().unwrap_or_else(|| {
-                    warn!(
-                        "Cell {} marked in_use but has no CPUs assigned, using empty cpumask",
-                        cell_idx
-                    );
-                    Cpumask::new()
-                });
-                self.cells
-                    .entry(cell_idx)
-                    .or_insert_with(|| Cell {
-                        cpus: Cpumask::new(),
-                    })
-                    .cpus = cpus;
-                self.metrics.cells.insert(cell_idx, CellMetrics::default());
-            } else {
-                self.cells.remove(&cell_idx);
-                self.metrics.cells.remove(&cell_idx);
-            }
+        //
+        // IMPORTANT: We determine which cells exist based on CPU assignments (which are
+        // synchronized by applied_configuration_seq), NOT by reading the in_use field
+        // separately. This avoids a TOCTOU race where a cell's in_use is set before
+        // CPUs are assigned.
+
+        // Cell 0 (root cell) always exists even if it has no CPUs temporarily
+        let cells_with_cpus: HashSet<u32> = cell_to_cpus.keys().copied().collect();
+        let mut active_cells = cells_with_cpus.clone();
+        active_cells.insert(0);
+
+        for cell_idx in &active_cells {
+            let cpus = cell_to_cpus
+                .get(cell_idx)
+                .cloned()
+                .unwrap_or_else(|| Cpumask::new());
+            self.cells
+                .entry(*cell_idx)
+                .or_insert_with(|| Cell {
+                    cpus: Cpumask::new(),
+                })
+                .cpus = cpus;
+            self.metrics.cells.insert(*cell_idx, CellMetrics::default());
         }
 
+        // Remove cells that no longer have CPUs assigned
+        self.cells.retain(|&k, _| active_cells.contains(&k));
+        self.metrics.cells.retain(|&k, _| active_cells.contains(&k));
+
         self.last_configuration_seq = Some(applied_configuration);
 
         Ok(())

From 67be89cb69b50d58eb121134d2b75f719d66bf11 Mon Sep 17 00:00:00 2001
From: Pat Somaru <patso@likewhatevs.io>
Date: Fri, 17 Oct 2025 05:28:58 +0000
Subject: [PATCH 8/8] mitosis: Add a test and cleanup script

Add a script to stress mitosis by creating and destroying cells. This
has proven useful for identifying bugs.
---
 .../scx_mitosis/test/cleanup_test_cgroups.sh  | 105 ++++++++++++
 .../test/test_exhaust_unpatched.sh            | 162 ++++++++++++++++++
 2 files changed, 267 insertions(+)
 create mode 100755 scheds/rust/scx_mitosis/test/cleanup_test_cgroups.sh
 create mode 100755 scheds/rust/scx_mitosis/test/test_exhaust_unpatched.sh

diff --git a/scheds/rust/scx_mitosis/test/cleanup_test_cgroups.sh b/scheds/rust/scx_mitosis/test/cleanup_test_cgroups.sh
new file mode 100755
index 0000000000..a6fc804022
--- /dev/null
+++ b/scheds/rust/scx_mitosis/test/cleanup_test_cgroups.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+# Cleanup all test cgroups created by test scripts
+
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m'
+
+if [ "$EUID" -ne 0 ]; then
+    echo -e "${RED}Must run as root${NC}"
+    exit 1
+fi
+
+echo -e "${YELLOW}========================================${NC}"
+echo -e "${YELLOW}Cleaning up all test cgroups${NC}"
+echo -e "${YELLOW}========================================${NC}\n"
+
+# Find all test-related cgroups with pattern matching
+TEST_PATTERNS=(
+    "test_mitosis*"
+    "test_cell*"
+    "test_simple*"
+    "test_brutal*"
+    "test_absolute*"
+    "test_working*"
+    "test_reuse*"
+    "test_verify*"
+    "test_cycle*"
+    "test_16*"
+    "test_15*"
+    "test_*"
+    "scx_mitosis_test*"
+)
+
+CLEANED=0
+FAILED=0
+
+# First pass: Find all matching cgroups at root level
+ALL_TEST_CGROUPS=()
+for pattern in "${TEST_PATTERNS[@]}"; do
+    shopt -s nullglob  # Make glob return nothing if no matches
+    for cg in /sys/fs/cgroup/${pattern}; do
+        if [ -d "$cg" ]; then
+            ALL_TEST_CGROUPS+=("$cg")
+        fi
+    done
+    shopt -u nullglob
+done
+
+# Remove duplicates and sort
+UNIQUE_CGROUPS=($(printf '%s\n' "${ALL_TEST_CGROUPS[@]}" | sort -u))
+
+if [ ${#UNIQUE_CGROUPS[@]} -eq 0 ]; then
+    echo -e "${GREEN}No test cgroups found to clean up${NC}"
+    exit 0
+fi
+
+echo -e "${YELLOW}Found ${#UNIQUE_CGROUPS[@]} test cgroups to clean up${NC}\n"
+
+# Clean each cgroup
+for test_root in "${UNIQUE_CGROUPS[@]}"; do
+    if [ ! -d "$test_root" ]; then
+        continue
+    fi
+
+    echo -e "${YELLOW}Cleaning $(basename $test_root)...${NC}"
+
+    # Find all child cgroups, deepest first
+    if [ -d "$test_root" ]; then
+        find "$test_root" -mindepth 1 -type d 2>/dev/null | sort -r | while read -r cg; do
+            # Kill all processes in the cgroup
+            if [ -f "$cg/cgroup.procs" ]; then
+                cat "$cg/cgroup.procs" 2>/dev/null | xargs -r kill -9 2>/dev/null || true
+            fi
+            sleep 0.05
+            # Remove the cgroup
+            rmdir "$cg" 2>/dev/null || true
+        done
+    fi
+
+    # Remove the root test cgroup
+    if [ -f "$test_root/cgroup.procs" ]; then
+        cat "$test_root/cgroup.procs" 2>/dev/null | xargs -r kill -9 2>/dev/null || true
+    fi
+    sleep 0.1
+    rmdir "$test_root" 2>/dev/null || true
+
+    if [ -d "$test_root" ]; then
+        echo -e "${RED}  ✗ Failed to remove $(basename $test_root)${NC}"
+        FAILED=$((FAILED + 1))
+    else
+        echo -e "${GREEN}  ✓ Removed $(basename $test_root)${NC}"
+        CLEANED=$((CLEANED + 1))
+    fi
+done
+
+echo -e "\n${YELLOW}========================================${NC}"
+echo -e "${GREEN}Cleaned: $CLEANED cgroups${NC}"
+if [ $FAILED -gt 0 ]; then
+    echo -e "${RED}Failed: $FAILED cgroups${NC}"
+    echo -e "${YELLOW}Try running the cleanup again if some cgroups are still active${NC}"
+else
+    echo -e "${GREEN}All test cgroups removed successfully!${NC}"
+fi
+echo -e "${YELLOW}========================================${NC}"
diff --git a/scheds/rust/scx_mitosis/test/test_exhaust_unpatched.sh b/scheds/rust/scx_mitosis/test/test_exhaust_unpatched.sh
new file mode 100755
index 0000000000..ab6ca0e13e
--- /dev/null
+++ b/scheds/rust/scx_mitosis/test/test_exhaust_unpatched.sh
@@ -0,0 +1,162 @@
+#!/bin/bash
+# Test that MUST fail on unpatched code and pass on patched code
+# Strategy: Fill all cells, remove cpusets (leak cells), try to allocate more
+
+set -e
+
+CGROUP_ROOT="/sys/fs/cgroup"
+MAX_CELLS=16
+INITIAL_CELLS=15  # Leave 1 slot for root cell
+
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m'
+
+if [ "$EUID" -ne 0 ]; then
+    echo -e "${RED}Must run as root${NC}"
+    exit 1
+fi
+
+if ! pgrep -x scx_mitosis > /dev/null; then
+    echo -e "${RED}scx_mitosis not running${NC}"
+    exit 1
+fi
+
+NUM_CPUS=$(nproc)
+if [ "$NUM_CPUS" -lt "$INITIAL_CELLS" ]; then
+    echo -e "${RED}Need at least $INITIAL_CELLS CPUs${NC}"
+    exit 1
+fi
+
+CREATED_CGROUPS=()
+
+cleanup() {
+    echo -e "\n${YELLOW}Cleanup...${NC}"
+    for cg in "${CREATED_CGROUPS[@]}"; do
+        if [ -d "$cg" ]; then
+            cat "$cg/cgroup.procs" 2>/dev/null | xargs -r kill -9 2>/dev/null || true
+            rmdir "$cg" 2>/dev/null || true
+        fi
+    done
+}
+
+trap cleanup EXIT INT TERM
+
+echo -e "${YELLOW}========================================${NC}"
+echo -e "${YELLOW}EXHAUSTION TEST (unpatched should FAIL)${NC}"
+echo -e "${YELLOW}========================================${NC}\n"
+
+echo "+cpuset" > "$CGROUP_ROOT/cgroup.subtree_control" 2>/dev/null || true
+dmesg -c > /dev/null
+initial_errors=$(dmesg | grep -c "No available cells to allocate" || true)
+
+# PHASE 1: Fill all cells (create 15 cgroups = 15 cells + 1 root = 16 total)
+echo -e "${YELLOW}PHASE 1: Creating $INITIAL_CELLS cgroups to fill all cells${NC}"
+for i in $(seq 1 $INITIAL_CELLS); do
+    cg="$CGROUP_ROOT/test_exhaust_batch1_$i"
+    CREATED_CGROUPS+=("$cg")
+
+    mkdir -p "$cg"
+    echo "0" > "$cg/cpuset.mems"
+    cpu=$((i - 1))
+    echo "$cpu" > "$cg/cpuset.cpus"
+
+    # Spawn task
+    (echo $$ > "$cg/cgroup.procs"; sleep 600 &) 2>/dev/null || true
+
+    echo -e "  Created cgroup $i: CPU $cpu"
+    sleep 0.12
+done
+
+echo -e "${GREEN}Created $INITIAL_CELLS cgroups - all cells allocated${NC}"
+sleep 2
+
+# Verify all cells allocated
+current_errors=$(dmesg | grep -c "No available cells to allocate" || true)
+if [ "$current_errors" -gt "$initial_errors" ]; then
+    echo -e "${RED}ERROR: Unexpected exhaustion during phase 1!${NC}"
+    exit 1
+fi
+
+# PHASE 2: Remove cpusets from HALF the cgroups (leak cells on unpatched)
+REMOVE_COUNT=$((INITIAL_CELLS / 2))
+echo -e "\n${YELLOW}PHASE 2: Removing cpusets from $REMOVE_COUNT cgroups${NC}"
+echo -e "${YELLOW}  On UNPATCHED: Cells stay allocated (leaked)${NC}"
+echo -e "${YELLOW}  On PATCHED: Cells freed when DSQ empty${NC}"
+
+for i in $(seq 1 $REMOVE_COUNT); do
+    cg="$CGROUP_ROOT/test_exhaust_batch1_$i"
+
+    # Kill tasks first to ensure DSQ is empty
+    cat "$cg/cgroup.procs" 2>/dev/null | xargs -r kill -9 2>/dev/null || true
+    sleep 0.05
+
+    # Remove cpuset restriction (assign to all CPUs)
+    echo "0-$((NUM_CPUS - 1))" > "$cg/cpuset.cpus" 2>/dev/null || true
+
+    echo -e "  Removed cpuset from cgroup $i"
+    sleep 0.1
+done
+
+echo -e "${GREEN}Removed cpusets from $REMOVE_COUNT cgroups${NC}"
+echo -e "${YELLOW}Waiting for timer to process (100ms intervals)...${NC}"
+sleep 3
+
+# PHASE 3: Try to create NEW cgroups with cpusets
+echo -e "\n${YELLOW}PHASE 3: Creating $REMOVE_COUNT NEW cgroups with cpusets${NC}"
+echo -e "${YELLOW}  On UNPATCHED: Should FAIL (cells leaked)${NC}"
+echo -e "${YELLOW}  On PATCHED: Should SUCCEED (cells freed)${NC}\n"
+
+success=0
+for i in $(seq 1 $REMOVE_COUNT); do
+    cg="$CGROUP_ROOT/test_exhaust_batch2_$i"
+    CREATED_CGROUPS+=("$cg")
+
+    mkdir -p "$cg"
+    echo "0" > "$cg/cpuset.mems"
+    cpu=$((i - 1))
+
+    echo -ne "${YELLOW}Creating NEW cgroup $i (CPU $cpu)...${NC}"
+
+    if echo "$cpu" > "$cg/cpuset.cpus" 2>/dev/null; then
+        (echo $$ > "$cg/cgroup.procs"; sleep 600 &) 2>/dev/null || true
+        echo -e " ${GREEN}OK${NC}"
+        success=$((success + 1))
+    else
+        echo -e " ${RED}FAILED${NC}"
+    fi
+
+    sleep 0.15
+
+    # Check for exhaustion immediately
+    current_errors=$(dmesg | grep -c "No available cells to allocate" || true)
+    if [ "$current_errors" -gt "$initial_errors" ]; then
+        new_errors=$((current_errors - initial_errors))
+        echo -e "\n${RED}========================================${NC}"
+        echo -e "${RED}CELL EXHAUSTION DETECTED!${NC}"
+        echo -e "${RED}========================================${NC}"
+        echo -e "${RED}Failed at NEW cgroup $i${NC}"
+        echo -e "${RED}Successfully created: $success/$REMOVE_COUNT${NC}"
+        echo -e "${RED}Exhaustion errors: $new_errors${NC}\n"
+        dmesg | grep "No available cells" | tail -10
+        echo -e "\n${GREEN}TEST RESULT: BUG REPRODUCED (unpatched code)${NC}"
+        echo -e "${GREEN}This test would PASS with the patch applied${NC}"
+        exit 0
+    fi
+done
+
+# Check final errors
+final_errors=$(dmesg | grep -c "No available cells to allocate" || true)
+if [ "$final_errors" -gt "$initial_errors" ]; then
+    echo -e "\n${GREEN}BUG REPRODUCED! (unpatched code)${NC}"
+    exit 0
+else
+    echo -e "\n${YELLOW}========================================${NC}"
+    echo -e "${YELLOW}NO EXHAUSTION - Patch appears to be applied${NC}"
+    echo -e "${YELLOW}========================================${NC}"
+    echo -e "${GREEN}All $REMOVE_COUNT new cgroups created successfully${NC}"
+    echo -e "${GREEN}Cells were properly freed and reused${NC}"
+    echo -e "\n${GREEN}TEST RESULT: PASS (patched code)${NC}"
+    exit 0
+fi