diff --git a/CLAUDE.md b/CLAUDE.md index 223d673..2551e0f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -95,6 +95,7 @@ WiFi and BLE share the same 2.4 GHz radio. The Arduino-ESP32 default is `WIFI_PS - `LittleFS.begin()` is idempotent. - Multiple concurrent WS clients are supported (on-device web UI + a separate app). `cleanupClients()` caps at `DEFAULT_MAX_WS_CLIENTS` (8 on ESP32); shared session state (rate, events) resets only when the last client disconnects (`server->count() == 0`). - Broadcast with `websocket.printfAll(...)`, not a hand-rolled `getClients()` loop: `getClients()` doesn't take the library's client-list mutex, so iterating it on the loop task races a client disconnect on the AsyncTCP task (use-after-free). `printfAll` holds the lock and sends to each client. +- **`printfAll` allocates per client and throws `std::bad_alloc` when the heap is exhausted** — and Arduino-ESP32 builds `-fno-exceptions`, so the throw goes straight to `std::terminate()`→`abort()`→reboot. Connection churn (many/half-open WS clients lingering under the 30 s ack timeout) can collapse the heap, so every broadcast-to-all helper (`sendWebsocketWeightAll`, `sendWebsocketStatusAll`, button/power-off) is gated by `wsBroadcastHeapOk()` (`WS_BROADCAST_HEAP_FLOOR`, ~25 KB, above the 15 KB heap watchdog): when heap is below the floor the frame is **skipped**, not allocated. Per-client queues are capped via `-D WS_MAX_QUEUED_MESSAGES=8` (lib default 32) so a backed-up client can't hoard heap. Don't add a new `printfAll` broadcast without the `wsBroadcastHeapOk()` guard. WS frame parsing: only act on complete unfragmented text frames: @@ -135,6 +136,7 @@ Functions and locals are camelCase. Some legacy snake_case remains; don't churn | Boot logs show `LittleFS mount failed` | Run `pio run -t uploadfs` to write the filesystem image — firmware-only flashes don't touch it. | | `pio device monitor` hangs in a non-PTY shell | Use the pyserial snippet in Quick reference. | | `pio` flash takes >60s instead of ~15s | Bad firmware is choking the bootloader handshake. Symptom of a serious bug on the device (WiFi coex, OLED stuck, etc.), not a hardware fault. | +| `reset_reason=panic` / `abort()` + reboot under sustained multi-client WiFi load | Heap-exhaustion OOM in a WS broadcast: `printfAll` → `operator new` throws `bad_alloc`. Broadcasts are heap-gated (`wsBroadcastHeapOk`); look for `[ws] low heap … skip broadcast` on serial and a falling `[health] heap=`. Driven by WS connection churn (half-open clients lingering on the 30 s ack timeout). Not thermal. | ## Keeping this file fresh diff --git a/include/websocket.h b/include/websocket.h index 0816475..750af5d 100644 --- a/include/websocket.h +++ b/include/websocket.h @@ -156,8 +156,37 @@ void processWsPendingCmds() { } } +// --- Heap-floor gate for periodic WS broadcasts ------------------------------ +// printfAll() allocates an AsyncWebSocketMessage (a heap buffer) for EVERY +// connected client. Under WebSocket connection churn the heap can collapse, and +// that allocation then throws std::bad_alloc -> std::terminate() -> abort() +// (Arduino-ESP32 builds with -fno-exceptions, so the throw can't be caught) -> +// reboot. That OOM-reboot is the "weight stops being collected under sustained +// multi-client load" failure. Skipping a frame is invisible (the next weight +// frame is <=500 ms away, status <=5 s); crashing is not. The floor sits above +// the 15 KB heap watchdog (wifi_setup.cpp) so broadcasts back off well before a +// reboot is even considered. Every broadcast helper below runs on the main loop, +// so the skip counter needs no synchronization. +static const uint32_t WS_BROADCAST_HEAP_FLOOR = 25000; +static uint32_t g_wsBroadcastHeapSkips = 0; +static inline bool wsBroadcastHeapOk() { + if (ESP.getFreeHeap() >= WS_BROADCAST_HEAP_FLOOR) return true; + g_wsBroadcastHeapSkips++; + static unsigned long lastLog = 0; + unsigned long now = millis(); + if (now - lastLog >= 2000) { // rate-limit: broadcasts can be 10 Hz + lastLog = now; + Serial.printf("[ws] low heap %lu < %lu -> skip broadcast (total skips=%lu)\n", + (unsigned long)ESP.getFreeHeap(), + (unsigned long)WS_BROADCAST_HEAP_FLOOR, + (unsigned long)g_wsBroadcastHeapSkips); + } + return false; +} + void sendWebsocketButton(int buttonNumber, int buttonShortPress) { if (!b_wifiEnabled || !b_websocketEventsEnabled || websocket.count() == 0) return; + if (!wsBroadcastHeapOk()) return; websocket.printfAll("{\"type\":\"button\",\"button\":\"%s\",\"button_number\":%d,\"press\":\"%s\",\"press_code\":%d,\"ms\":%lu}", websocketButtonName(buttonNumber), buttonNumber, @@ -168,6 +197,7 @@ void sendWebsocketButton(int buttonNumber, int buttonShortPress) { void sendWebsocketPowerOff(int i_reason) { if (!b_wifiEnabled || !b_websocketEventsEnabled || websocket.count() == 0) return; + if (!wsBroadcastHeapOk()) return; websocket.printfAll("{\"type\":\"power\",\"event\":\"power_off\",\"reason\":\"%s\",\"reason_code\":%d,\"ms\":%lu}", websocketPowerOffReason(i_reason), i_reason, @@ -219,6 +249,7 @@ void sendWebsocketStatus(AsyncWebSocketClient *client, const char *status) { // without blocking the others. void sendWebsocketStatusAll(const char *status) { if (!b_wifiEnabled || !b_websocketEventsEnabled || websocket.count() == 0) return; + if (!wsBroadcastHeapOk()) return; websocket.printfAll("{\"type\":\"status\",\"status\":\"%s\",\"protocol_version\":1,\"firmware_version\":\"%s\",\"grams\":%.2f,\"ms\":%lu,\"battery_percent\":%d,\"battery_voltage\":%.2f,\"charging\":%s,\"timer_running\":%s,\"timer_seconds\":%lu,\"display_on\":%s,\"low_power\":%s,\"soft_sleep\":%s,\"events_enabled\":%s,\"rate_hz\":%lu,\"interval_ms\":%lu,\"soc_temp_c\":%.1f,\"soc_temp_max_c\":%.1f,\"weight_stalled\":%s,\"stall_count\":%lu,\"last_stall_ms\":%lu,\"last_stall_temp_c\":%.1f,\"adc_recovery_count\":%lu,\"reset_reason\":\"%s\"}", status, FIRMWARE_VER, @@ -247,6 +278,7 @@ void sendWebsocketStatusAll(const char *status) { void sendWebsocketWeightAll(float grams, unsigned long ms) { if (!b_wifiEnabled || websocket.count() == 0) return; + if (!wsBroadcastHeapOk()) return; websocket.printfAll("{\"grams\":%.2f,\"ms\":%lu}", grams, ms); } diff --git a/platformio.ini b/platformio.ini index 289a3f1..1192edc 100644 --- a/platformio.ini +++ b/platformio.ini @@ -27,6 +27,10 @@ build_flags = ; -DESP32 -D CONFIG_ASYNC_TCP_RUNNING_CORE=1 -DELEGANTOTA_USE_ASYNC_WEBSERVER=1 + ; Cap each WS client's outbound queue (lib default 32) so a backed-up or + ; half-open client (connection churn) can't hoard heap. Bounds aggregate heap + ; growth; complements the WS_BROADCAST_HEAP_FLOOR gate in include/websocket.h. + -D WS_MAX_QUEUED_MESSAGES=8 !python3 git_rev_macro.py # -D DEBUG