From 942662eaa2a020b87e0acc792a3afdd13a1e4a84 Mon Sep 17 00:00:00 2001 From: MasterPtato Date: Wed, 25 Jun 2025 17:09:36 +0000 Subject: [PATCH] feat: add actor queue --- Cargo.lock | 409 +--- .../dev-full/grafana/dashboards/pegboard.json | 2082 +++++++++++++++++ .../system-test-actor/src/managerClient.ts | 1 + packages/common/fdb-util/src/keys.rs | 12 +- packages/common/server-cli/src/util/fdb.rs | 2 + .../core/services/build/src/ops/create.rs | 2 +- packages/core/services/build/src/types.rs | 2 +- packages/edge/api/actor/src/route/actors.rs | 150 +- .../edge/infra/client/manager/src/main.rs | 12 +- .../20250619144812_add_runner_id.down.sql | 2 + .../20250625170301_add_alloc_ts.down.sql | 2 + .../20250625170301_add_alloc_ts.up.sql | 2 + .../services/pegboard/src/keys/datacenter.rs | 208 +- .../edge/services/pegboard/src/keys/runner.rs | 10 +- .../edge/services/pegboard/src/metrics.rs | 40 +- .../src/ops/client/update_allocation_idx.rs | 20 +- .../src/workflows/actor2/analytics.rs | 8 + .../pegboard/src/workflows/actor2/mod.rs | 69 +- .../pegboard/src/workflows/actor2/runtime.rs | 813 +++++-- .../pegboard/src/workflows/actor2/setup.rs | 97 +- .../pegboard/src/workflows/client/mod.rs | 562 ++++- .../usage-metrics-publish/src/lib.rs | 132 +- .../pegboard/standalone/ws/src/lib.rs | 25 +- 23 files changed, 3710 insertions(+), 952 deletions(-) create mode 100644 docker/dev-full/grafana/dashboards/pegboard.json create mode 100644 packages/edge/services/pegboard/db/analytics/migrations/20250625170301_add_alloc_ts.down.sql create mode 100644 packages/edge/services/pegboard/db/analytics/migrations/20250625170301_add_alloc_ts.up.sql diff --git a/Cargo.lock b/Cargo.lock index f2902f9c71..81c5e0f4b1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -27,12 +27,6 @@ dependencies = [ "gimli", ] -[[package]] -name = "adler" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" - 
[[package]] name = "adler2" version = "2.0.0" @@ -1805,7 +1799,7 @@ version = "1.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4fbd94a32b3a7d55d3806fe27d98d3ad393050439dd05eb53ece36ec5e3d3510" dependencies = [ - "base64-simd 0.8.0", + "base64-simd", "bytes", "bytes-utils", "futures-core", @@ -1844,7 +1838,7 @@ dependencies = [ "aws-smithy-async 1.2.1", "aws-smithy-runtime-api", "aws-smithy-types 1.2.9", - "rustc_version 0.4.1", + "rustc_version", "tracing", ] @@ -1895,12 +1889,6 @@ dependencies = [ "tower-service", ] -[[package]] -name = "az" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b7e4c2464d97fe331d41de9d5db0def0a96f4d823b8b32a2efd503578988973" - [[package]] name = "backtrace" version = "0.3.74" @@ -1910,7 +1898,7 @@ dependencies = [ "addr2line", "cfg-if", "libc", - "miniz_oxide 0.8.0", + "miniz_oxide", "object", "rustc-demangle", "windows-targets 0.52.6", @@ -1946,22 +1934,13 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" -[[package]] -name = "base64-simd" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "781dd20c3aff0bd194fe7d2a977dd92f21c173891f3a03b677359e5fa457e5d5" -dependencies = [ - "simd-abstraction", -] - [[package]] name = "base64-simd" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "339abbe78e73178762e23bea9dfd08e697eb3f3301cd4be981c0f78ba5859195" dependencies = [ - "outref 0.5.1", + "outref", "vsimd", ] @@ -1983,15 +1962,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "bincode" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" -dependencies = [ - "serde", -] - [[package]] name = "bindgen" version = "0.69.5" @@ -2035,15 +2005,6 @@ 
dependencies = [ "syn 2.0.90", ] -[[package]] -name = "bit-set" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" -dependencies = [ - "bit-vec", -] - [[package]] name = "bit-vec" version = "0.6.3" @@ -2065,18 +2026,6 @@ dependencies = [ "serde", ] -[[package]] -name = "bitvec" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" -dependencies = [ - "funty 2.0.0", - "radium", - "tap", - "wyz", -] - [[package]] name = "blake3" version = "1.5.5" @@ -2483,7 +2432,7 @@ checksum = "8769706aad5d996120af43197bf46ef6ad0fda35216b4505f926a365a232d924" dependencies = [ "camino", "cargo-platform", - "semver 1.0.23", + "semver", "serde", "serde_json", "thiserror 2.0.12", @@ -3682,12 +3631,6 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" -[[package]] -name = "cooked-waker" -version = "5.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "147be55d677052dabc6b22252d5dd0fd4c29c8c27aa4f2fbef0f94aa003b406f" - [[package]] name = "core-foundation" version = "0.9.4" @@ -3744,7 +3687,7 @@ version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47" dependencies = [ - "rustc_version 0.4.1", + "rustc_version", ] [[package]] @@ -3928,7 +3871,7 @@ dependencies = [ "curve25519-dalek-derive", "digest 0.10.7", "fiat-crypto", - "rustc_version 0.4.1", + "rustc_version", "subtle", ] @@ -4076,16 +4019,6 @@ dependencies = [ "sqlx", ] -[[package]] -name = "debugid" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef552e6f588e446098f6ba40d89ac146c8c7b64aade83c051ee00bb5d2bc18d" 
-dependencies = [ - "serde", - "uuid", -] - [[package]] name = "deno-embed" version = "25.5.2" @@ -4099,72 +4032,6 @@ dependencies = [ "zip", ] -[[package]] -name = "deno_core" -version = "0.323.0" -source = "git+https://github.com/rivet-gg/deno_core?rev=8a313913fa73d58f4f9532565b0084e723bc34ad#8a313913fa73d58f4f9532565b0084e723bc34ad" -dependencies = [ - "anyhow", - "az", - "bincode", - "bit-set", - "bit-vec", - "bytes", - "cooked-waker", - "deno_core_icudata", - "deno_ops", - "deno_unsync", - "futures", - "indexmap 2.7.0", - "libc", - "memoffset", - "parking_lot 0.12.3", - "percent-encoding", - "pin-project", - "serde", - "serde_json", - "serde_v8", - "smallvec", - "sourcemap", - "static_assertions", - "tokio", - "url", - "v8", - "wasm_dep_analyzer", -] - -[[package]] -name = "deno_core_icudata" -version = "0.74.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe4dccb6147bb3f3ba0c7a48e993bfeb999d2c2e47a81badee80e2b370c8d695" - -[[package]] -name = "deno_ops" -version = "0.199.0" -source = "git+https://github.com/rivet-gg/deno_core?rev=8a313913fa73d58f4f9532565b0084e723bc34ad#8a313913fa73d58f4f9532565b0084e723bc34ad" -dependencies = [ - "proc-macro-rules", - "proc-macro2", - "quote", - "stringcase", - "strum 0.25.0", - "strum_macros 0.25.3", - "syn 2.0.90", - "thiserror 1.0.69", -] - -[[package]] -name = "deno_unsync" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d774fd83f26b24f0805a6ab8b26834a0d06ceac0db517b769b1e4633c96a2057" -dependencies = [ - "futures", - "parking_lot 0.12.3", - "tokio", -] - [[package]] name = "der" version = "0.6.1" @@ -4965,7 +4832,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" dependencies = [ "crc32fast", - "miniz_oxide 0.8.0", + "miniz_oxide", ] [[package]] @@ -5094,28 +4961,12 @@ dependencies = [ "libc", ] -[[package]] -name = "fslock" -version = 
"0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04412b8935272e3a9bae6f48c7bfff74c2911f60525404edfdd28e49884c3bfb" -dependencies = [ - "libc", - "winapi", -] - [[package]] name = "funty" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7" -[[package]] -name = "funty" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" - [[package]] name = "futures" version = "0.3.31" @@ -5709,15 +5560,6 @@ dependencies = [ "subtle", ] -[[package]] -name = "gzip-header" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95cc527b92e6029a62960ad99aa8a6660faa4555fe5f731aab13aa6a921795a2" -dependencies = [ - "crc32fast", -] - [[package]] name = "h2" version = "0.3.26" @@ -6376,12 +6218,6 @@ dependencies = [ "icu_properties", ] -[[package]] -name = "if_chain" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb56e1aa765b4b4f3aadfab769793b7087bb03a4ea4920644a6d238e2df5b9ed" - [[package]] name = "ignore" version = "0.4.23" @@ -7224,15 +7060,6 @@ version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" -[[package]] -name = "memoffset" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" -dependencies = [ - "autocfg", -] - [[package]] name = "merkle_hash" version = "3.7.0" @@ -7266,15 +7093,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" -[[package]] -name = "miniz_oxide" -version = "0.7.4" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" -dependencies = [ - "adler", -] - [[package]] name = "miniz_oxide" version = "0.8.0" @@ -7788,7 +7606,7 @@ dependencies = [ "loom", "parking_lot 0.12.3", "portable-atomic", - "rustc_version 0.4.1", + "rustc_version", "smallvec", "tagptr", "thiserror 1.0.69", @@ -8120,7 +7938,6 @@ checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ "num-integer", "num-traits", - "rand 0.8.5", ] [[package]] @@ -8440,12 +8257,6 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" -[[package]] -name = "outref" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f222829ae9293e33a9f5e9f440c6760a3d450a64affe1846486b140db81c1f4" - [[package]] name = "outref" version = "0.5.1" @@ -8670,12 +8481,12 @@ name = "pegboard-actor-kv" version = "0.1.0" dependencies = [ "anyhow", - "deno_core", "fdb-util", "foundationdb", "futures-util", "indexmap 2.7.0", "prost 0.13.4", + "rivet-util-id", "serde", "serde_json", "tokio", @@ -8691,9 +8502,11 @@ name = "pegboard-config" version = "25.5.2" dependencies = [ "anyhow", + "indexmap 2.7.0", "ipnet", "pegboard", - "rivet-util", + "pegboard-actor-kv", + "rivet-util-id", "schemars", "serde", "serde_json", @@ -8740,6 +8553,8 @@ version = "25.5.2" dependencies = [ "anyhow", "bytes", + "fdb-util", + "foundationdb", "futures-util", "hyper 0.14.31", "indoc 2.0.5", @@ -8748,6 +8563,7 @@ dependencies = [ "nix 0.30.1", "notify", "pegboard", + "pegboard-actor-kv", "pegboard-config", "portpicker", "prometheus", @@ -9131,29 +8947,6 @@ version = "0.5.20+deprecated" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" -[[package]] -name = 
"proc-macro-rules" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07c277e4e643ef00c1233393c673f655e3672cf7eb3ba08a00bdd0ea59139b5f" -dependencies = [ - "proc-macro-rules-macros", - "proc-macro2", - "syn 2.0.90", -] - -[[package]] -name = "proc-macro-rules-macros" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "207fffb0fe655d1d47f6af98cc2793405e85929bdbc420d685554ff07be27ac7" -dependencies = [ - "once_cell", - "proc-macro2", - "quote", - "syn 2.0.90", -] - [[package]] name = "proc-macro2" version = "1.0.92" @@ -9441,12 +9234,6 @@ version = "5.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" -[[package]] -name = "radium" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" - [[package]] name = "radix_trie" version = "0.2.1" @@ -10547,7 +10334,7 @@ dependencies = [ "divan", "fdb-util", "foundationdb", - "funty 1.1.0", + "funty", "futures-util", "global-error", "governor", @@ -11026,22 +10813,13 @@ version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" -[[package]] -name = "rustc_version" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" -dependencies = [ - "semver 0.9.0", -] - [[package]] name = "rustc_version" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" dependencies = [ - "semver 1.0.23", + "semver", ] [[package]] @@ -11427,15 +11205,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "semver" -version = "0.9.0" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" -dependencies = [ - "semver-parser", -] - [[package]] name = "semver" version = "1.0.23" @@ -11445,12 +11214,6 @@ dependencies = [ "serde", ] -[[package]] -name = "semver-parser" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" - [[package]] name = "serde" version = "1.0.219" @@ -11517,7 +11280,6 @@ version = "1.0.140" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" dependencies = [ - "indexmap 2.7.0", "itoa 1.0.14", "memchr", "ryu", @@ -11565,18 +11327,6 @@ dependencies = [ "serde", ] -[[package]] -name = "serde_v8" -version = "0.232.0" -source = "git+https://github.com/rivet-gg/deno_core?rev=8a313913fa73d58f4f9532565b0084e723bc34ad#8a313913fa73d58f4f9532565b0084e723bc34ad" -dependencies = [ - "num-bigint", - "serde", - "smallvec", - "thiserror 1.0.69", - "v8", -] - [[package]] name = "serde_with" version = "2.3.3" @@ -11801,15 +11551,6 @@ dependencies = [ "wide", ] -[[package]] -name = "simd-abstraction" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cadb29c57caadc51ff8346233b5cec1d240b68ce55cf1afc764818791876987" -dependencies = [ - "outref 0.1.0", -] - [[package]] name = "simple_asn1" version = "0.6.2" @@ -11860,25 +11601,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "sourcemap" -version = "8.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "208d40b9e8cad9f93613778ea295ed8f3c2b1824217c6cfc7219d3f6f45b96d4" -dependencies = [ - "base64-simd 0.7.0", - "bitvec", - "data-encoding", - "debugid", - "if_chain", - "rustc-hash 1.1.0", - "rustc_version 0.2.3", - "serde", - "serde_json", - "unicode-id-start", - "url", 
-] - [[package]] name = "spin" version = "0.5.2" @@ -12190,12 +11912,6 @@ dependencies = [ "rand 0.8.5", ] -[[package]] -name = "stringcase" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04028eeb851ed08af6aba5caa29f2d59a13ed168cee4d6bd753aeefcf1d636b0" - [[package]] name = "stringprep" version = "0.1.5" @@ -12260,15 +11976,6 @@ dependencies = [ "strum_macros 0.24.3", ] -[[package]] -name = "strum" -version = "0.25.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125" -dependencies = [ - "strum_macros 0.25.3", -] - [[package]] name = "strum" version = "0.26.3" @@ -12291,19 +11998,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "strum_macros" -version = "0.25.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0" -dependencies = [ - "heck 0.4.1", - "proc-macro2", - "quote", - "rustversion", - "syn 2.0.90", -] - [[package]] name = "strum_macros" version = "0.26.4" @@ -12472,12 +12166,6 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" -[[package]] -name = "tap" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" - [[package]] name = "tar" version = "0.4.43" @@ -13606,12 +13294,6 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5ab17db44d7388991a428b2ee655ce0c212e862eff1768a455c58f9aad6e7893" -[[package]] -name = "unicode-id-start" -version = "1.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f322b60f6b9736017344fa0635d64be2f458fbc04eef65f6be22976dd1ffd5b" - [[package]] name = "unicode-ident" version = "1.0.14" @@ 
-14038,23 +13720,6 @@ dependencies = [ "serde", ] -[[package]] -name = "v8" -version = "130.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b61316a57fcd7e5f3840fe085f13e6dfd37e92d73b040033d2f598c7a1984c3" -dependencies = [ - "bindgen 0.70.1", - "bitflags 2.6.0", - "fslock", - "gzip-header", - "home", - "miniz_oxide 0.7.4", - "once_cell", - "paste", - "which 6.0.3", -] - [[package]] name = "valuable" version = "0.1.0" @@ -14077,7 +13742,7 @@ dependencies = [ "cargo_metadata", "derive_builder 0.20.2", "regex", - "rustc_version 0.4.1", + "rustc_version", "rustversion", "time 0.3.37", "vergen-lib", @@ -14301,15 +13966,6 @@ dependencies = [ "web-sys", ] -[[package]] -name = "wasm_dep_analyzer" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f270206a91783fd90625c8bb0d8fbd459d0b1d1bf209b656f713f01ae7c04b8" -dependencies = [ - "thiserror 1.0.69", -] - [[package]] name = "web-sys" version = "0.3.74" @@ -14417,18 +14073,6 @@ dependencies = [ "windows-sys 0.48.0", ] -[[package]] -name = "which" -version = "6.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4ee928febd44d98f2f459a4a79bd4d928591333a494a10a868418ac1b39cf1f" -dependencies = [ - "either", - "home", - "rustix", - "winsafe", -] - [[package]] name = "whoami" version = "1.5.2" @@ -14865,12 +14509,6 @@ dependencies = [ "windows-sys 0.48.0", ] -[[package]] -name = "winsafe" -version = "0.0.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d135d17ab770252ad95e9a872d365cf3090e3be864a34ab46f48555993efc904" - [[package]] name = "wit-bindgen-rt" version = "0.39.0" @@ -14892,15 +14530,6 @@ version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" -[[package]] -name = "wyz" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed" -dependencies = [ - "tap", -] - [[package]] name = "xattr" version = "1.3.1" diff --git a/docker/dev-full/grafana/dashboards/pegboard.json b/docker/dev-full/grafana/dashboards/pegboard.json new file mode 100644 index 0000000000..dd4c17092d --- /dev/null +++ b/docker/dev-full/grafana/dashboards/pegboard.json @@ -0,0 +1,2082 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 7, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 56, + "panels": [], + "title": "Usage", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "fixed" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 8, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "cores" + }, + "overrides": [ + { + "matcher": { + "id": "byName", 
+ "options": "Allocated" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Draining" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Pending" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 33, + "interval": "15s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(\n sum without (state) (\n max by (client_id) (\n rivet_pegboard_client_cpu_total{flavor!=\"isolate\", state=\"active\"}\n )\n ) - sum without(state) (\n max by (client_id) (\n rivet_pegboard_client_cpu_allocated{flavor!=\"isolate\"}\n )\n )\n) / 1000\n- sum(rivet_pegboard_actor_cpu_pending_allocation) / 1000", + "format": "heatmap", + "legendFormat": "Available", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(\n max by (client_id) (\n rivet_pegboard_client_cpu_allocated{flavor!=\"isolate\", state=\"draining\"}\n )\n) / 1000", + "hide": false, + "instant": false, + "legendFormat": "Draining", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rivet_pegboard_actor_cpu_pending_allocation) / 1000", + "hide": false, + "instant": false, + "legendFormat": "Pending", + "range": true, + "refId": "D" + }, + { + "datasource": { + "type": 
"prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(\n max by (client_id) (\n rivet_pegboard_client_cpu_allocated{flavor!=\"isolate\", state=\"active\"}\n )\n) / 1000", + "hide": false, + "instant": false, + "legendFormat": "Allocated", + "range": true, + "refId": "C" + } + ], + "title": "CPU", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "fixed" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 8, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "cores" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Allocated" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Draining" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Draining Capacity" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#a26e3b", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 47, + "interval": "15s", + "options": { + "legend": { + "calcs": [], + 
"displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(\n max by (client_id) (\n rivet_pegboard_client_cpu_allocated{flavor!=\"isolate\", state=\"draining\"}\n )\n) / 1000", + "hide": false, + "instant": false, + "legendFormat": "Draining", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(\n max by (client_id) (\n rivet_pegboard_client_cpu_total{flavor!=\"isolate\", state=\"draining\"}\n )\n) / 1000", + "hide": false, + "instant": false, + "legendFormat": "Draining Capacity", + "range": true, + "refId": "C" + } + ], + "title": "Draining CPU", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "fixed" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 8, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "mbytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Allocated" + }, + "properties": [ + { + "id": "color", + "value": { 
+ "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Draining" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Pending" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 10 + }, + "id": 49, + "interval": "15s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(\n sum without (state) (\n max by (client_id) (\n rivet_pegboard_client_memory_total{flavor!=\"isolate\", state=\"active\"}\n )\n ) - sum without(state) (\n max by (client_id) (\n rivet_pegboard_client_memory_allocated{flavor!=\"isolate\"}\n )\n )\n)\n- sum (rivet_pegboard_actor_memory_pending_allocation)", + "format": "heatmap", + "legendFormat": "Available", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(\n max by (client_id) (\n rivet_pegboard_client_memory_allocated{flavor!=\"isolate\", state=\"draining\"} \n )\n)", + "hide": false, + "instant": false, + "legendFormat": "Draining", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum (rivet_pegboard_actor_memory_pending_allocation)", + "hide": false, + "instant": false, + "legendFormat": "Pending", + "range": true, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(\n max by 
(client_id) (\n rivet_pegboard_client_memory_allocated{flavor!=\"isolate\", state=\"active\"}\n )\n)", + "hide": false, + "instant": false, + "legendFormat": "Allocated", + "range": true, + "refId": "C" + } + ], + "title": "Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "fixed" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 8, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "mbytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Allocated" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Draining" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Draining Capacity" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#a26e3b", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 10 + }, + "id": 48, + "interval": "15s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + 
"hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(\n max by(client_id) (\n rivet_pegboard_client_memory_allocated{flavor!=\"isolate\", state=\"draining\"}\n )\n)", + "hide": false, + "instant": false, + "legendFormat": "Draining", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(\n max by (client_id) (\n rivet_pegboard_client_memory_total{flavor!=\"isolate\", state=\"draining\"}\n )\n)", + "hide": false, + "instant": false, + "legendFormat": "Draining Capacity", + "range": true, + "refId": "D" + } + ], + "title": "Draining Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 19 + }, + "id": 37, + "interval": "15s", + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + 
"expr": "sum(increase(rivet_pegboard_actor_allocate_duration_bucket [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Actor Allocate Duration", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 19 + }, + "id": 38, + "interval": "15s", + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(rivet_pegboard_actor_start_duration_bucket [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Actor Start Duration", + "type": "heatmap" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 27 + }, + "id": 57, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + 
"axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pkt/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 39, + "interval": "15s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum (irate(pegboard_packet_recv_total [$__rate_interval])) / 1000", + "format": "heatmap", + "legendFormat": "{{datacenter_id}} – {{server_id}}", + "range": true, + "refId": "A" + } + ], + "title": "Packet Receive", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + 
"lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pkt/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 40, + "interval": "15s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum (irate(pegboard_packet_send_total [$__rate_interval])) / 1000", + "format": "heatmap", + "legendFormat": "{{datacenter_id}} – {{server_id}}", + "range": true, + "refId": "A" + } + ], + "title": "Packet Send", + "type": "timeseries" + } + ], + "title": "Network", + "type": "row" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 28 + }, + "id": 59, + "panels": [], + "title": "Images", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + 
"spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 29 + }, + "id": 62, + "interval": "15s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (datacenter_id, server_id) (\n max by (datacenter_id, server_id) (\n pegboard_image_cache_count\n )\n)", + "format": "heatmap", + "legendFormat": "{{datacenter_id}} – {{server_id}}", + "range": true, + "refId": "A" + } + ], + "title": "Image Cache Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": 
"bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 29 + }, + "id": 63, + "interval": "15s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (datacenter_id, server_id) (\n max by (datacenter_id, server_id) (\n pegboard_image_cache_size\n )\n)", + "format": "heatmap", + "legendFormat": "{{datacenter_id}} – {{server_id}}", + "range": true, + "refId": "A" + } + ], + "title": "Image Cache Size", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "req/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 38 + }, + "id": 60, + "interval": "15s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + 
"hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum by (datacenter_id, server_id) (\n max by (datacenter_id, server_id) (\n rate(\n pegboard_image_download_request_total [$__rate_interval]\n )\n )\n)", + "format": "heatmap", + "legendFormat": "{{datacenter_id}} – {{server_id}}", + "range": true, + "refId": "A" + } + ], + "title": "Image Download Request Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "req/s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 38 + }, + "id": 61, + "interval": "15s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum 
by (datacenter_id, server_id) (\n max by (datacenter_id, server_id) (\n rate(\n pegboard_image_download_cache_miss_total [$__rate_interval]\n )\n )\n)", + "format": "heatmap", + "legendFormat": "{{datacenter_id}} – {{server_id}}", + "range": true, + "refId": "A" + } + ], + "title": "Image Download Cache Miss Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 47 + }, + "id": 58, + "interval": "15s", + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(pegboard_download_image_duration_bucket [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Image Download Duration", + "type": "heatmap" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 56 + }, + "id": 55, + "panels": [], + "title": "Setup", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + 
"defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 57 + }, + "id": 50, + "interval": "15s", + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(pegboard_actor_setup_download_image_duration_bucket [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Actor Setup Download Image Duration", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 57 + }, + "id": 52, + "interval": "15s", + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { 
+ "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(pegboard_actor_setup_bind_ports_duration_bucket [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Bind Ports Duration", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 65 + }, + "id": 51, + "interval": "15s", + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(pegboard_actor_setup_make_fs_duration_bucket [$__rate_interval])) by (le)", + 
"format": "heatmap", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Make FS Duration", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 65 + }, + "id": 54, + "interval": "15s", + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(pegboard_actor_setup_cni_network_duration_bucket [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Setup CNI Network Duration", + "type": "heatmap" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 73 + }, + "id": 53, + "interval": "15s", + "options": { + "calculate": false, + "calculation": { + 
"xBuckets": { + "mode": "size" + } + }, + "cellGap": 0, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "RdBu", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "max": "60", + "min": 0, + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(pegboard_actor_setup_oci_bundle_duration_bucket [$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Setup OCI Bundle Duration", + "type": "heatmap" + } + ], + "preload": false, + "refresh": "1m", + "schemaVersion": 40, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Pegboard", + "uid": "0108bf74-ba6c-46d2-a4c7-75cbbf324f40", + "version": 11, + "weekStart": "" +} \ No newline at end of file diff --git a/examples/system-test-actor/src/managerClient.ts b/examples/system-test-actor/src/managerClient.ts index 62fbdcebac..8314751473 100644 --- a/examples/system-test-actor/src/managerClient.ts +++ b/examples/system-test-actor/src/managerClient.ts @@ -119,6 +119,7 @@ function encodeFrame(payload: any): Buffer { payloadLength.writeUInt32BE(json.length, 0); const header = Buffer.alloc(4); // All zeros for now + return Buffer.concat([payloadLength, header, Buffer.from(json)]); } diff --git a/packages/common/fdb-util/src/keys.rs b/packages/common/fdb-util/src/keys.rs index dc9be26364..fadcafa55d 100644 --- a/packages/common/fdb-util/src/keys.rs +++ 
b/packages/common/fdb-util/src/keys.rs @@ -41,17 +41,19 @@ pub const ENV: usize = 39; pub const PORT: usize = 40; pub const INGRESS: usize = 41; pub const PROXIED: usize = 42; -pub const CLIENTS_BY_REMAINING_MEM: usize = 43; +pub const CLIENT_BY_REMAINING_MEM: usize = 43; pub const SQLITE: usize = 44; pub const INTERNAL: usize = 45; pub const METADATA: usize = 46; pub const COMPRESSED_DATA: usize = 47; pub const RUNNER: usize = 48; -pub const RUNNERS_BY_REMAINING_SLOTS: usize = 49; +pub const RUNNER_BY_REMAINING_SLOTS: usize = 49; pub const REMAINING_SLOTS: usize = 50; pub const TOTAL_SLOTS: usize = 51; pub const IMAGE_ID: usize = 52; pub const ACTOR2: usize = 53; +pub const PENDING_ACTOR: usize = 54; +pub const PENDING_ACTOR_BY_IMAGE_ID: usize = 55; // Directories with fdbrs must use string paths instead of tuples pub mod dir { @@ -105,17 +107,19 @@ pub fn key_from_str(key: &str) -> Option { "port" => Some(PORT), "ingress" => Some(INGRESS), "proxied" => Some(PROXIED), - "clients_by_remaining_mem" => Some(CLIENTS_BY_REMAINING_MEM), + "client_by_remaining_mem" => Some(CLIENT_BY_REMAINING_MEM), "sqlite" => Some(SQLITE), "internal" => Some(INTERNAL), "metadata" => Some(METADATA), "compressed_data" => Some(COMPRESSED_DATA), "runner" => Some(RUNNER), - "runners_by_remaining_slots" => Some(RUNNERS_BY_REMAINING_SLOTS), + "runner_by_remaining_slots" => Some(RUNNER_BY_REMAINING_SLOTS), "remaining_slots" => Some(REMAINING_SLOTS), "total_slots" => Some(TOTAL_SLOTS), "image_id" => Some(IMAGE_ID), "actor2" => Some(ACTOR2), + "pending_actor" => Some(PENDING_ACTOR), + "pending_actor_by_image_id" => Some(PENDING_ACTOR_BY_IMAGE_ID), _ => None, } } diff --git a/packages/common/server-cli/src/util/fdb.rs b/packages/common/server-cli/src/util/fdb.rs index 4f34b65127..a13fbb9d35 100644 --- a/packages/common/server-cli/src/util/fdb.rs +++ b/packages/common/server-cli/src/util/fdb.rs @@ -36,6 +36,8 @@ impl SimpleTupleValue { SimpleTupleValue::F64(v) } else if let Ok(v) = 
Uuid::from_str(value) { SimpleTupleValue::Uuid(v) + } else if let Ok(v) = rivet_util::Id::from_str(value) { + SimpleTupleValue::Id(v) } else { SimpleTupleValue::String(unescape(value)) } diff --git a/packages/core/services/build/src/ops/create.rs b/packages/core/services/build/src/ops/create.rs index 74a7360633..438db4f7ef 100644 --- a/packages/core/services/build/src/ops/create.rs +++ b/packages/core/services/build/src/ops/create.rs @@ -19,7 +19,7 @@ pub struct Input { pub kind: BuildKind, pub compression: BuildCompression, pub allocation_type: BuildAllocationType, - pub allocation_total_slots: u64, + pub allocation_total_slots: u32, pub resources: Option, } diff --git a/packages/core/services/build/src/types.rs b/packages/core/services/build/src/types.rs index 305589146d..9eb635aa42 100644 --- a/packages/core/services/build/src/types.rs +++ b/packages/core/services/build/src/types.rs @@ -51,7 +51,7 @@ pub struct Build { pub kind: BuildKind, pub compression: BuildCompression, pub allocation_type: BuildAllocationType, - pub allocation_total_slots: u64, + pub allocation_total_slots: u32, pub resources: Option, pub tags: HashMap, } diff --git a/packages/edge/api/actor/src/route/actors.rs b/packages/edge/api/actor/src/route/actors.rs index 11970f77ba..9bfdce2f2e 100644 --- a/packages/edge/api/actor/src/route/actors.rs +++ b/packages/edge/api/actor/src/route/actors.rs @@ -1,10 +1,10 @@ use std::collections::HashMap; use api_helper::{anchor::WatchIndexQuery, ctx::Ctx}; +use chirp_workflow::prelude::*; use futures_util::{FutureExt, StreamExt, TryStreamExt}; use rivet_api::models; use rivet_convert::{ApiInto, ApiTryInto}; -use rivet_operation::prelude::*; use serde::Deserialize; use serde_json::json; use util::serde::AsHashableExt; @@ -152,14 +152,14 @@ pub async fn create( } }; - let allocated_fut = if network.wait_ready.unwrap_or_default() { + let created_fut = if network.wait_ready.unwrap_or_default() { std::future::pending().boxed() } else { - let mut allocated_sub = 
ctx - .subscribe::(("actor_id", actor_id)) + let mut created_sub = ctx + .subscribe::(("actor_id", actor_id)) .await?; - async move { allocated_sub.next().await }.boxed() + async move { created_sub.next().await }.boxed() }; let mut ready_sub = ctx .subscribe::(("actor_id", actor_id)) @@ -239,9 +239,9 @@ pub async fn create( .tag("actor_id", actor_id) .dispatch() .await?; - // Wait for allocated/ready, fail, or destroy + // Wait for create/ready, fail, or destroy tokio::select! { - res = allocated_fut => { res?; }, + res = created_fut => { res?; }, res = ready_sub.next() => { res?; }, res = fail_sub.next() => { let msg = res?; @@ -258,14 +258,14 @@ pub async fn create( let actor_id = util::Id::new_v1(ctx.config().server()?.rivet.edge()?.datacenter_label()); tracing::info!(?actor_id, ?tags, "creating actor with tags"); - let allocated_fut = if network.wait_ready.unwrap_or_default() { + let created_fut = if network.wait_ready.unwrap_or_default() { std::future::pending().boxed() } else { - let mut allocated_sub = ctx - .subscribe::(("actor_id", actor_id)) + let mut created_sub = ctx + .subscribe::(("actor_id", actor_id)) .await?; - async move { allocated_sub.next().await }.boxed() + async move { created_sub.next().await }.boxed() }; let mut ready_sub = ctx .subscribe::(("actor_id", actor_id)) @@ -348,7 +348,7 @@ pub async fn create( // Wait for create/ready, fail, or destroy tokio::select! 
{ - res = allocated_fut => { res?; }, + res = created_fut => { res?; }, res = ready_sub.next() => { res?; }, res = fail_sub.next() => { let msg = res?; @@ -425,6 +425,9 @@ pub async fn destroy( ); let mut sub = ctx + .subscribe::(("actor_id", actor_id)) + .await?; + let mut old_sub = ctx .subscribe::(("actor_id", actor_id)) .await?; @@ -436,15 +439,33 @@ pub async fn destroy( return Ok(json!({})); } - ctx.signal(pegboard::workflows::actor::Destroy { - override_kill_timeout_ms: query.override_kill_timeout, - }) - .to_workflow::() - .tag("actor_id", actor_id) - .send() - .await?; + // Try actor2 first + let res = ctx + .signal(pegboard::workflows::actor2::Destroy { + override_kill_timeout_ms: query.override_kill_timeout, + }) + .to_workflow::() + .tag("actor_id", actor_id) + .send() + .await; + + if let Some(WorkflowError::WorkflowNotFound) = res.as_workflow_error() { + // Try old actors + ctx + .signal(pegboard::workflows::actor::Destroy { + override_kill_timeout_ms: query.override_kill_timeout, + }) + .to_workflow::() + .tag("actor_id", actor_id) + .send() + .await?; + + old_sub.next().await?; + } else { + res?; - sub.next().await?; + sub.next().await?; + } Ok(json!({})) } @@ -481,21 +502,29 @@ pub async fn upgrade( ) .await?; - // TODO: Add back once we figure out how to cleanly handle if a wf is already complete when - // upgrading - // let mut sub = ctx - // .subscribe::(("actor_id", actor_id)) - // .await?; - - ctx.signal(pegboard::workflows::actor::Upgrade { - image_id: build.build_id, - }) - .to_workflow::() - .tag("actor_id", actor_id) - .send() - .await?; - - // sub.next().await?; + // Try actor2 first + let res = ctx + .signal(pegboard::workflows::actor2::Upgrade { + image_id: build.build_id, + }) + .to_workflow::() + .tag("actor_id", actor_id) + .send() + .await; + + if let Some(WorkflowError::WorkflowNotFound) = res.as_workflow_error() { + // Try old actors + ctx + .signal(pegboard::workflows::actor::Upgrade { + image_id: build.build_id, + }) + 
.to_workflow::() + .tag("actor_id", actor_id) + .send() + .await?; + } else { + res?; + } Ok(json!({})) } @@ -589,35 +618,42 @@ pub async fn upgrade_all( // cursor of [created_at, actor_id] that we pass to the fdb range created_before = list_res.actors.last().map(|x| x.create_ts - 1); - // TODO: Add back once we figure out how to cleanly handle if a wf is already complete when - // upgrading - // let subs = futures_util::stream::iter(list_res.actor_ids.clone()) - // .map(|actor_id| { - // ctx.subscribe::(("actor_id", actor_id)) - // }) - // .buffer_unordered(32) - // .try_collect::>() - // .await?; - + let ctx = (*ctx).clone(); futures_util::stream::iter(list_res.actors) .map(|actor| { - ctx.signal(pegboard::workflows::actor::Upgrade { - image_id: build.build_id, - }) - .to_workflow::() - .tag("actor_id", actor.actor_id) - .send() + let ctx = ctx.clone(); + async move { + // Try actor2 first + let res = ctx + .signal(pegboard::workflows::actor2::Upgrade { + image_id: build.build_id, + }) + .to_workflow::() + .tag("actor_id", actor.actor_id) + .send() + .await; + + if let Some(WorkflowError::WorkflowNotFound) = res.as_workflow_error() { + // Try old actors + ctx + .signal(pegboard::workflows::actor::Upgrade { + image_id: build.build_id, + }) + .to_workflow::() + .tag("actor_id", actor.actor_id) + .send() + .await?; + } else { + res?; + } + + GlobalResult::Ok(()) + } }) .buffer_unordered(32) .try_collect::>() .await?; - // futures_util::stream::iter(subs) - // .map(|mut sub| async move { sub.next().await }) - // .buffer_unordered(32) - // .try_collect::>() - // .await?; - if count < 10_000 { break; } diff --git a/packages/edge/infra/client/manager/src/main.rs b/packages/edge/infra/client/manager/src/main.rs index e2381d593a..5e6cbe28ac 100644 --- a/packages/edge/infra/client/manager/src/main.rs +++ b/packages/edge/infra/client/manager/src/main.rs @@ -128,6 +128,12 @@ async fn init() -> Result { ), }; + // SAFETY: No other task has spawned yet. 
+ // Set client_id env var so it can be read by the prometheus registry + unsafe { + std::env::set_var("CLIENT_ID", config.client.cluster.client_id.to_string()); + } + if config.client.logs.redirect_logs() { rivet_logs::Logs::new( config.client.data_dir().join("logs"), @@ -137,12 +143,6 @@ async fn init() -> Result { .await?; } - // SAFETY: No other task has spawned yet. - // Set client_id env var so it can be read by the prometheus registry - unsafe { - std::env::set_var("CLIENT_ID", config.client.cluster.client_id.to_string()); - } - // Read system metrics let system = crate::system_info::fetch().await?; diff --git a/packages/edge/services/pegboard/db/analytics/migrations/20250619144812_add_runner_id.down.sql b/packages/edge/services/pegboard/db/analytics/migrations/20250619144812_add_runner_id.down.sql index e69de29bb2..374ffe451b 100644 --- a/packages/edge/services/pegboard/db/analytics/migrations/20250619144812_add_runner_id.down.sql +++ b/packages/edge/services/pegboard/db/analytics/migrations/20250619144812_add_runner_id.down.sql @@ -0,0 +1,2 @@ +ALTER TABLE actors +DROP COLUMN runner_id; diff --git a/packages/edge/services/pegboard/db/analytics/migrations/20250625170301_add_alloc_ts.down.sql b/packages/edge/services/pegboard/db/analytics/migrations/20250625170301_add_alloc_ts.down.sql new file mode 100644 index 0000000000..cfa35503ec --- /dev/null +++ b/packages/edge/services/pegboard/db/analytics/migrations/20250625170301_add_alloc_ts.down.sql @@ -0,0 +1,2 @@ +ALTER TABLE actors +DROP COLUMN pending_allocation_at; diff --git a/packages/edge/services/pegboard/db/analytics/migrations/20250625170301_add_alloc_ts.up.sql b/packages/edge/services/pegboard/db/analytics/migrations/20250625170301_add_alloc_ts.up.sql new file mode 100644 index 0000000000..0458db7760 --- /dev/null +++ b/packages/edge/services/pegboard/db/analytics/migrations/20250625170301_add_alloc_ts.up.sql @@ -0,0 +1,2 @@ +ALTER TABLE actors +ADD COLUMN pending_allocation_at DateTime64(9); diff 
--git a/packages/edge/services/pegboard/src/keys/datacenter.rs b/packages/edge/services/pegboard/src/keys/datacenter.rs index 789d4a8f63..e00e270a82 100644 --- a/packages/edge/services/pegboard/src/keys/datacenter.rs +++ b/packages/edge/services/pegboard/src/keys/datacenter.rs @@ -1,6 +1,7 @@ use std::result::Result::Ok; use anyhow::*; +use build::types::BuildAllocationType; use chirp_workflow::prelude::*; use fdb_util::prelude::*; use serde::{Deserialize, Serialize}; @@ -68,7 +69,7 @@ impl TuplePack for ClientsByRemainingMemKey { ) -> std::io::Result { let t = ( DATACENTER, - CLIENTS_BY_REMAINING_MEM, + CLIENT_BY_REMAINING_MEM, self.flavor as usize, self.remaining_mem, self.last_ping_ts, @@ -133,7 +134,7 @@ impl TuplePack for ClientsByRemainingMemSubspaceKey { ) -> std::io::Result { let mut offset = VersionstampOffset::None { size: 0 }; - let t = (DATACENTER, CLIENTS_BY_REMAINING_MEM); + let t = (DATACENTER, CLIENT_BY_REMAINING_MEM); offset += t.pack(w, tuple_depth)?; if let Some(flavor) = &self.flavor { @@ -151,12 +152,12 @@ impl TuplePack for ClientsByRemainingMemSubspaceKey { #[derive(Debug)] pub struct RunnersByRemainingSlotsKey { pub build_id: Uuid, - pub remaining_slots: u64, + pub remaining_slots: u32, pub runner_id: Uuid, } impl RunnersByRemainingSlotsKey { - pub fn new(build_id: Uuid, remaining_slots: u64, runner_id: Uuid) -> Self { + pub fn new(build_id: Uuid, remaining_slots: u32, runner_id: Uuid) -> Self { RunnersByRemainingSlotsKey { build_id, remaining_slots, @@ -170,7 +171,7 @@ impl RunnersByRemainingSlotsKey { pub fn subspace_with_slots( build_id: Uuid, - remaining_slots: u64, + remaining_slots: u32, ) -> RunnersByRemainingSlotsSubspaceKey { RunnersByRemainingSlotsSubspaceKey::new_with_slots(build_id, remaining_slots) } @@ -196,7 +197,7 @@ impl TuplePack for RunnersByRemainingSlotsKey { ) -> std::io::Result { let t = ( DATACENTER, - RUNNERS_BY_REMAINING_SLOTS, + RUNNER_BY_REMAINING_SLOTS, self.build_id, self.remaining_slots, self.runner_id, @@ 
-208,7 +209,7 @@ impl TuplePack for RunnersByRemainingSlotsKey { impl<'de> TupleUnpack<'de> for RunnersByRemainingSlotsKey { fn unpack(input: &[u8], tuple_depth: TupleDepth) -> PackResult<(&[u8], Self)> { let (input, (_, _, build_id, remaining_slots, runner_id)) = - <(usize, usize, Uuid, u64, Uuid)>::unpack(input, tuple_depth)?; + <(usize, usize, Uuid, u32, Uuid)>::unpack(input, tuple_depth)?; let v = RunnersByRemainingSlotsKey { build_id, @@ -228,7 +229,7 @@ pub struct RunnersByRemainingSlotsKeyData { pub struct RunnersByRemainingSlotsSubspaceKey { pub build_id: Uuid, - pub remaining_slots: Option, + pub remaining_slots: Option, } impl RunnersByRemainingSlotsSubspaceKey { @@ -239,7 +240,7 @@ impl RunnersByRemainingSlotsSubspaceKey { } } - pub fn new_with_slots(build_id: Uuid, remaining_slots: u64) -> Self { + pub fn new_with_slots(build_id: Uuid, remaining_slots: u32) -> Self { RunnersByRemainingSlotsSubspaceKey { build_id, remaining_slots: Some(remaining_slots), @@ -255,7 +256,7 @@ impl TuplePack for RunnersByRemainingSlotsSubspaceKey { ) -> std::io::Result { let mut offset = VersionstampOffset::None { size: 0 }; - let t = (DATACENTER, RUNNERS_BY_REMAINING_SLOTS, self.build_id); + let t = (DATACENTER, RUNNER_BY_REMAINING_SLOTS, self.build_id); offset += t.pack(w, tuple_depth)?; if let Some(remaining_slots) = &self.remaining_slots { @@ -265,3 +266,190 @@ impl TuplePack for RunnersByRemainingSlotsSubspaceKey { Ok(offset) } } + +#[derive(Debug)] +pub struct PendingActorByImageIdKey { + pub image_id: Uuid, + pub ts: i64, + pub actor_id: util::Id, +} + +impl PendingActorByImageIdKey { + pub fn new(image_id: Uuid, ts: i64, actor_id: util::Id) -> Self { + PendingActorByImageIdKey { + image_id, + ts, + actor_id, + } + } + + pub fn sister(&self) -> PendingActorKey { + PendingActorKey { + ts: self.ts, + actor_id: self.actor_id, + } + } + + pub fn subspace(image_id: Uuid) -> PendingActorByImageIdSubspaceKey { + PendingActorByImageIdSubspaceKey::new(image_id) + } +} + +impl 
FormalKey for PendingActorByImageIdKey { + type Value = PendingActorByImageIdKeyData; + + fn deserialize(&self, raw: &[u8]) -> Result { + serde_json::from_slice(raw).map_err(Into::into) + } + + fn serialize(&self, value: Self::Value) -> Result> { + serde_json::to_vec(&value).map_err(Into::into) + } +} + +impl TuplePack for PendingActorByImageIdKey { + fn pack( + &self, + w: &mut W, + tuple_depth: TupleDepth, + ) -> std::io::Result { + let t = ( + DATACENTER, + PENDING_ACTOR_BY_IMAGE_ID, + self.image_id, + self.ts, + self.actor_id, + ); + t.pack(w, tuple_depth) + } +} + +impl<'de> TupleUnpack<'de> for PendingActorByImageIdKey { + fn unpack(input: &[u8], tuple_depth: TupleDepth) -> PackResult<(&[u8], Self)> { + let (input, (_, _, _, image_id, ts, actor_id)) = + <(usize, usize, usize, Uuid, i64, util::Id)>::unpack(input, tuple_depth)?; + + let v = PendingActorByImageIdKey { + image_id, + ts, + actor_id, + }; + + Ok((input, v)) + } +} + +#[derive(Debug, Deserialize, Serialize)] +pub struct PendingActorByImageIdKeyData { + pub generation: u32, + pub build_allocation_type: BuildAllocationType, + pub build_allocation_total_slots: u32, + /// Millicore (1/1000 of a core). + pub cpu: u64, + /// Bytes. 
+ pub memory: u64, +} + +pub struct PendingActorByImageIdSubspaceKey { + pub image_id: Uuid, +} + +impl PendingActorByImageIdSubspaceKey { + pub fn new(image_id: Uuid) -> Self { + PendingActorByImageIdSubspaceKey { image_id } + } +} + +impl TuplePack for PendingActorByImageIdSubspaceKey { + fn pack( + &self, + w: &mut W, + tuple_depth: TupleDepth, + ) -> std::io::Result { + let t = (DATACENTER, PENDING_ACTOR_BY_IMAGE_ID, self.image_id); + t.pack(w, tuple_depth) + } +} + +#[derive(Debug)] +pub struct PendingActorKey { + pub ts: i64, + pub actor_id: util::Id, +} + +impl PendingActorKey { + pub fn new(ts: i64, actor_id: util::Id) -> Self { + PendingActorKey { ts, actor_id } + } + + pub fn sister(&self, image_id: Uuid) -> PendingActorByImageIdKey { + PendingActorByImageIdKey { + image_id, + ts: self.ts, + actor_id: self.actor_id, + } + } + + pub fn subspace() -> PendingActorSubspaceKey { + PendingActorSubspaceKey {} + } +} + +impl FormalKey for PendingActorKey { + type Value = PendingActorKeyData; + + fn deserialize(&self, raw: &[u8]) -> Result { + serde_json::from_slice(raw).map_err(Into::into) + } + + fn serialize(&self, value: Self::Value) -> Result> { + serde_json::to_vec(&value).map_err(Into::into) + } +} + +impl TuplePack for PendingActorKey { + fn pack( + &self, + w: &mut W, + tuple_depth: TupleDepth, + ) -> std::io::Result { + let t = (DATACENTER, PENDING_ACTOR, self.ts, self.actor_id); + t.pack(w, tuple_depth) + } +} + +impl<'de> TupleUnpack<'de> for PendingActorKey { + fn unpack(input: &[u8], tuple_depth: TupleDepth) -> PackResult<(&[u8], Self)> { + let (input, (_, _, _, ts, actor_id)) = + <(usize, usize, usize, i64, util::Id)>::unpack(input, tuple_depth)?; + + let v = PendingActorKey { ts, actor_id }; + + Ok((input, v)) + } +} + +#[derive(Debug, Deserialize, Serialize)] +pub struct PendingActorKeyData { + pub generation: u32, + pub image_id: Uuid, + pub build_allocation_type: BuildAllocationType, + pub build_allocation_total_slots: u32, + /// Millicore 
(1/1000 of a core). + pub cpu: u64, + /// Bytes. + pub memory: u64, +} + +pub struct PendingActorSubspaceKey {} + +impl TuplePack for PendingActorSubspaceKey { + fn pack( + &self, + w: &mut W, + tuple_depth: TupleDepth, + ) -> std::io::Result { + let t = (DATACENTER, PENDING_ACTOR); + t.pack(w, tuple_depth) + } +} diff --git a/packages/edge/services/pegboard/src/keys/runner.rs b/packages/edge/services/pegboard/src/keys/runner.rs index 408b0cbede..f734368cc7 100644 --- a/packages/edge/services/pegboard/src/keys/runner.rs +++ b/packages/edge/services/pegboard/src/keys/runner.rs @@ -62,10 +62,10 @@ impl RemainingSlotsKey { impl FormalKey for RemainingSlotsKey { /// Slots. - type Value = u64; + type Value = u32; fn deserialize(&self, raw: &[u8]) -> Result { - Ok(u64::from_be_bytes(raw.try_into()?)) + Ok(u32::from_be_bytes(raw.try_into()?)) } fn serialize(&self, value: Self::Value) -> Result> { @@ -106,11 +106,11 @@ impl TotalSlotsKey { } impl FormalKey for TotalSlotsKey { - /// MiB. - type Value = u64; + /// Slots. + type Value = u32; fn deserialize(&self, raw: &[u8]) -> Result { - Ok(u64::from_be_bytes(raw.try_into()?)) + Ok(u32::from_be_bytes(raw.try_into()?)) } fn serialize(&self, value: Self::Value) -> Result> { diff --git a/packages/edge/services/pegboard/src/metrics.rs b/packages/edge/services/pegboard/src/metrics.rs index 0eb40d908e..ead912946e 100644 --- a/packages/edge/services/pegboard/src/metrics.rs +++ b/packages/edge/services/pegboard/src/metrics.rs @@ -8,6 +8,13 @@ lazy_static::lazy_static! { *REGISTRY ).unwrap(); + pub static ref CLIENT_CPU_TOTAL: IntGaugeVec = register_int_gauge_vec_with_registry!( + "pegboard_client_cpu_total", + "Total millicores of cpu available on a client.", + &["client_id", "flavor", "state"], + *REGISTRY + ).unwrap(); + pub static ref CLIENT_MEMORY_TOTAL: IntGaugeVec = register_int_gauge_vec_with_registry!( "pegboard_client_memory_total", "Total MiB of memory available on a client.", @@ -15,9 +22,9 @@ lazy_static::lazy_static! 
{ *REGISTRY ).unwrap(); - pub static ref CLIENT_CPU_TOTAL: IntGaugeVec = register_int_gauge_vec_with_registry!( - "pegboard_client_cpu_total", - "Total millicores of cpu available on a client.", + pub static ref CLIENT_CPU_ALLOCATED: IntGaugeVec = register_int_gauge_vec_with_registry!( + "pegboard_client_cpu_allocated", + "Total millicores of cpu allocated on a client.", &["client_id", "flavor", "state"], *REGISTRY ).unwrap(); @@ -29,10 +36,17 @@ lazy_static::lazy_static! { *REGISTRY ).unwrap(); - pub static ref CLIENT_CPU_ALLOCATED: IntGaugeVec = register_int_gauge_vec_with_registry!( - "pegboard_client_cpu_allocated", - "Total millicores of cpu allocated on a client.", - &["client_id", "flavor", "state"], + pub static ref ACTOR_CPU_PENDING_ALLOCATION: IntGaugeVec = register_int_gauge_vec_with_registry!( + "pegboard_actor_cpu_pending_allocation", + "Total actor cpu waiting for availability.", + &[], + *REGISTRY + ).unwrap(); + + pub static ref ACTOR_MEMORY_PENDING_ALLOCATION: IntGaugeVec = register_int_gauge_vec_with_registry!( + "pegboard_actor_memory_pending_allocation", + "Total actor memory waiting for availability.", + &[], *REGISTRY ).unwrap(); @@ -52,16 +66,16 @@ lazy_static::lazy_static! 
{ *REGISTRY, ).unwrap(); - pub static ref ENV_MEMORY_USAGE: IntGaugeVec = register_int_gauge_vec_with_registry!( - "pegboard_env_memory_usage", - "Total MiB of memory used by an environment.", + pub static ref ENV_CPU_USAGE: IntGaugeVec = register_int_gauge_vec_with_registry!( + "pegboard_env_cpu_usage", + "Total millicores used by an environment.", &["env_id", "flavor"], *REGISTRY, ).unwrap(); - pub static ref ENV_CPU_USAGE: IntGaugeVec = register_int_gauge_vec_with_registry!( - "pegboard_env_cpu_usage", - "Total millicores used by an environment.", + pub static ref ENV_MEMORY_USAGE: IntGaugeVec = register_int_gauge_vec_with_registry!( + "pegboard_env_memory_usage", + "Total MiB of memory used by an environment.", &["env_id", "flavor"], *REGISTRY, ).unwrap(); diff --git a/packages/edge/services/pegboard/src/ops/client/update_allocation_idx.rs b/packages/edge/services/pegboard/src/ops/client/update_allocation_idx.rs index 7bc99c674f..26815be4e0 100644 --- a/packages/edge/services/pegboard/src/ops/client/update_allocation_idx.rs +++ b/packages/edge/services/pegboard/src/ops/client/update_allocation_idx.rs @@ -2,7 +2,7 @@ use chirp_workflow::prelude::*; use fdb_util::{end_of_key_range, FormalKey, SERIALIZABLE}; use foundationdb::{self as fdb, options::ConflictRangeType}; -use crate::{keys, protocol}; +use crate::{keys, protocol, workflows::client::CLIENT_ELIGIBLE_THRESHOLD_MS}; #[derive(Debug)] pub enum Action { @@ -19,11 +19,12 @@ pub struct Input { pub action: Action, } +// Returns true if the ping was updated and the last ping was over the eligibility period ago #[operation] pub async fn pegboard_client_update_allocation_idx( ctx: &OperationCtx, input: &Input, -) -> GlobalResult<()> { +) -> GlobalResult { ctx.fdb() .await? 
.run(|tx, _mc| async move { @@ -54,7 +55,7 @@ pub async fn pegboard_client_update_allocation_idx( // ))?, // ) // .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; - let last_ping_ts = last_ping_ts_key + let old_last_ping_ts = last_ping_ts_key .deserialize(&last_ping_ts_entry.ok_or(fdb::FdbBindingError::CustomError( format!("key should exist: {last_ping_ts_key:?}").into(), ))?) @@ -63,7 +64,7 @@ pub async fn pegboard_client_update_allocation_idx( let old_allocation_key = keys::datacenter::ClientsByRemainingMemKey::new( input.flavor, remaining_mem, - last_ping_ts, + old_last_ping_ts, input.client_id, ); let old_allocation_key_buf = keys::subspace().pack(&old_allocation_key); @@ -78,6 +79,8 @@ pub async fn pegboard_client_update_allocation_idx( match input.action { Action::ClearIdx => { tx.clear(&old_allocation_key_buf); + + Ok(false) } Action::AddIdx => { tx.set( @@ -86,6 +89,8 @@ pub async fn pegboard_client_update_allocation_idx( .serialize(input.client_workflow_id) .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, ); + + Ok(false) } // TODO: Could be improved somehow to not require another `.get` Action::UpdatePing => { @@ -122,11 +127,14 @@ pub async fn pegboard_client_update_allocation_idx( .serialize(input.client_workflow_id) .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, ); + + Ok(last_ping_ts.saturating_sub(old_last_ping_ts) + > CLIENT_ELIGIBLE_THRESHOLD_MS) + } else { + Ok(false) } } } - - Ok(()) }) .custom_instrument(tracing::info_span!("client_update_alloc_idx_tx")) .await diff --git a/packages/edge/services/pegboard/src/workflows/actor2/analytics.rs b/packages/edge/services/pegboard/src/workflows/actor2/analytics.rs index 00ed54cc5f..a0e349eca8 100644 --- a/packages/edge/services/pegboard/src/workflows/actor2/analytics.rs +++ b/packages/edge/services/pegboard/src/workflows/actor2/analytics.rs @@ -45,6 +45,8 @@ pub struct ActorClickHouseRow { /// 0 = not set started_at: i64, /// See `started_at`. 
+ pending_allocation_at: i64, + /// See `started_at`. connectable_at: i64, /// See `started_at`. finished_at: i64, @@ -100,6 +102,7 @@ struct StateRow { lifecycle_kill_timeout_ms: i64, lifecycle_durable: bool, create_ts: i64, + pending_allocation_ts: Option, start_ts: Option, connectable_ts: Option, finish_ts: Option, @@ -148,6 +151,7 @@ pub async fn insert_clickhouse( lifecycle_kill_timeout_ms, lifecycle_durable, create_ts, + pending_allocation_ts, start_ts, connectable_ts, finish_ts, @@ -291,6 +295,10 @@ pub async fn insert_clickhouse( cpu_millicores: state_row.resources_cpu_millicores, memory_mib: state_row.resources_memory_mib, created_at: state_row.create_ts * 1_000_000, // Convert ms to ns for ClickHouse DateTime64(9) + pending_allocation_at: state_row + .pending_allocation_ts + .map(|ts| ts * 1_000_000) + .unwrap_or_default(), started_at: state_row .start_ts .map(|ts| ts * 1_000_000) diff --git a/packages/edge/services/pegboard/src/workflows/actor2/mod.rs b/packages/edge/services/pegboard/src/workflows/actor2/mod.rs index b043da1902..7f6673581c 100644 --- a/packages/edge/services/pegboard/src/workflows/actor2/mod.rs +++ b/packages/edge/services/pegboard/src/workflows/actor2/mod.rs @@ -6,6 +6,7 @@ use futures_util::FutureExt; use crate::{ protocol, types::{ActorLifecycle, ActorResources, EndpointType, NetworkMode, Routing}, + workflows::client::AllocatePendingActorsInput, }; mod analytics; @@ -137,13 +138,7 @@ pub async fn pegboard_actor2(ctx: &mut WorkflowCtx, input: &Input) -> GlobalResu let Some(allocate_res) = runtime::spawn_actor(ctx, input, &initial_actor_setup, 0).await? 
else { - ctx.msg(Failed { - message: "Failed to allocate (no availability).".into(), - }) - .tag("actor_id", input.actor_id) - .send() - .await?; - + // Destroyed early ctx.workflow(destroy::Input { actor_id: input.actor_id, generation: 0, @@ -157,14 +152,6 @@ pub async fn pegboard_actor2(ctx: &mut WorkflowCtx, input: &Input) -> GlobalResu return Ok(()); }; - ctx.v(2) - .msg(Allocated { - client_id: allocate_res.client_id, - }) - .tag("actor_id", input.actor_id) - .send() - .await?; - let lifecycle_res = ctx .loope( runtime::State::new( @@ -198,18 +185,14 @@ pub async fn pegboard_actor2(ctx: &mut WorkflowCtx, input: &Input) -> GlobalResu ) .await?; - if let Some(sig) = - runtime::reschedule_actor(ctx, &input, state, state.image_id) - .await? + if runtime::reschedule_actor(ctx, &input, state, state.image_id).await? { // Destroyed early return Ok(Loop::Break(runtime::LifecycleRes { generation: state.generation, image_id: state.image_id, kill: Some(KillCtx { - kill_timeout_ms: sig - .override_kill_timeout_ms - .unwrap_or(input.lifecycle.kill_timeout_ms), + kill_timeout_ms: input.lifecycle.kill_timeout_ms, }), })); } else { @@ -349,7 +332,6 @@ pub async fn pegboard_actor2(ctx: &mut WorkflowCtx, input: &Input) -> GlobalResu state.image_id, ) .await? - .is_some() { // Destroyed early return Ok(Loop::Break(runtime::LifecycleRes { @@ -411,18 +393,14 @@ pub async fn pegboard_actor2(ctx: &mut WorkflowCtx, input: &Input) -> GlobalResu .await?; state.image_id = sig.image_id; - if let Some(sig) = - runtime::reschedule_actor(ctx, &input, state, state.image_id) - .await? + if runtime::reschedule_actor(ctx, &input, state, state.image_id).await? 
{ // Destroyed early return Ok(Loop::Break(runtime::LifecycleRes { generation: state.generation, image_id: input.image_id, kill: Some(KillCtx { - kill_timeout_ms: sig - .override_kill_timeout_ms - .unwrap_or(input.lifecycle.kill_timeout_ms), + kill_timeout_ms: input.lifecycle.kill_timeout_ms, }), })); } @@ -471,17 +449,27 @@ pub async fn pegboard_actor2(ctx: &mut WorkflowCtx, input: &Input) -> GlobalResu .output() .await?; + // NOTE: The reason we allocate other actors from this actor workflow is because if we instead sent a + // signal to the client wf here it would incur a heavy throughput hit and we need the client wf to be as + // lightweight as possible; processing as few signals that aren't events/commands as possible + // Allocate other pending actors from queue + let res = ctx.activity(AllocatePendingActorsInput {}).await?; + + // Dispatch pending allocs + for alloc in res.allocations { + ctx.signal(alloc.signal) + .to_workflow::() + .tag("actor_id", alloc.actor_id) + .send() + .await?; + } + Ok(()) } #[message("pegboard_actor_create_complete")] pub struct CreateComplete {} -#[message("pegboard_actor_allocated")] -pub struct Allocated { - pub client_id: Uuid, -} - #[message("pegboard_actor_failed")] pub struct Failed { pub message: String, @@ -490,6 +478,15 @@ pub struct Failed { #[message("pegboard_actor_ready")] pub struct Ready {} +#[signal("pegboard_actor_allocate")] +#[derive(Debug)] +pub struct Allocate { + pub runner_id: Uuid, + pub new_runner: bool, + pub client_id: Uuid, + pub client_workflow_id: Uuid, +} + #[signal("pegboard_actor_destroy")] pub struct Destroy { pub override_kill_timeout_ms: Option, @@ -527,6 +524,12 @@ pub struct UpgradeStarted {} #[message("pegboard_actor_upgrade_complete")] pub struct UpgradeComplete {} +join_signal!(PendingAllocation { + Allocate, + Destroy, + // +}); + join_signal!(Main { StateUpdate, Upgrade, diff --git a/packages/edge/services/pegboard/src/workflows/actor2/runtime.rs 
b/packages/edge/services/pegboard/src/workflows/actor2/runtime.rs index f55b9235ae..8b19bf4a96 100644 --- a/packages/edge/services/pegboard/src/workflows/actor2/runtime.rs +++ b/packages/edge/services/pegboard/src/workflows/actor2/runtime.rs @@ -2,19 +2,21 @@ use std::time::Instant; use build::types::{BuildAllocationType, BuildKind}; use chirp_workflow::prelude::*; +use cluster::types::BuildDeliveryMethod; use fdb_util::{end_of_key_range, FormalKey, SERIALIZABLE, SNAPSHOT}; use foundationdb::{ self as fdb, options::{ConflictRangeType, StreamingMode}, }; +use futures_util::StreamExt; use futures_util::{FutureExt, TryStreamExt}; use nix::sys::signal::Signal; use sqlx::Acquire; use super::{ destroy::{self, KillCtx}, - setup, Destroy, Input, ACTOR_START_THRESHOLD_MS, BASE_RETRY_TIMEOUT_MS, - RETRY_RESET_DURATION_MS, + setup, Allocate, Destroy, Input, PendingAllocation, ACTOR_START_THRESHOLD_MS, + BASE_RETRY_TIMEOUT_MS, RETRY_RESET_DURATION_MS, }; use crate::{ keys, metrics, @@ -114,6 +116,77 @@ async fn update_client_and_runner( Ok(()) } +#[derive(Debug, Serialize, Deserialize, Hash)] +struct ResolveArtifactsInput { + build_upload_id: Uuid, + build_file_name: String, + dc_build_delivery_method: BuildDeliveryMethod, +} + +#[derive(Debug, Serialize, Deserialize, Hash)] +struct ResolveArtifactsOutput { + artifact_url_stub: String, + fallback_artifact_url: String, + /// Bytes. 
+ artifact_size: u64, +} + +#[activity(ResolveArtifacts)] +async fn resolve_artifacts( + ctx: &ActivityCtx, + input: &ResolveArtifactsInput, +) -> GlobalResult { + // Get the fallback URL + let fallback_artifact_url = { + tracing::debug!("using s3 direct delivery"); + + // Build client + let s3_client = s3_util::Client::with_bucket_and_endpoint( + ctx.config(), + "bucket-build", + s3_util::EndpointKind::EdgeInternal, + ) + .await?; + + let presigned_req = s3_client + .get_object() + .bucket(s3_client.bucket()) + .key(format!( + "{upload_id}/{file_name}", + upload_id = input.build_upload_id, + file_name = input.build_file_name, + )) + .presigned( + s3_util::aws_sdk_s3::presigning::PresigningConfig::builder() + .expires_in(std::time::Duration::from_secs(15 * 60)) + .build()?, + ) + .await?; + + let addr_str = presigned_req.uri().to_string(); + tracing::debug!(addr = %addr_str, "resolved artifact s3 presigned request"); + + addr_str + }; + + // Get the artifact size + let uploads_res = op!([ctx] upload_get { + upload_ids: vec![input.build_upload_id.into()], + }) + .await?; + let upload = unwrap!(uploads_res.uploads.first()); + + Ok(ResolveArtifactsOutput { + artifact_url_stub: crate::util::image_artifact_url_stub( + ctx.config(), + input.build_upload_id, + &input.build_file_name, + )?, + fallback_artifact_url, + artifact_size: upload.content_length, + }) +} + #[derive(Debug, Serialize, Deserialize, Hash)] struct FetchPortsInput { actor_id: util::Id, @@ -221,7 +294,7 @@ struct AllocateActorInput { generation: u32, image_id: Uuid, build_allocation_type: BuildAllocationType, - build_allocation_total_slots: u64, + build_allocation_total_slots: u32, resources: protocol::Resources, } @@ -233,59 +306,218 @@ pub struct AllocateActorOutput { pub client_workflow_id: Uuid, } +// If no availability, returns the timestamp of the actor's queue key #[activity(AllocateActor)] async fn allocate_actor( ctx: &ActivityCtx, input: &AllocateActorInput, -) -> GlobalResult> { +) -> 
GlobalResult> { let client_flavor = protocol::ClientFlavor::Multi; let memory_mib = input.resources.memory / 1024 / 1024; let start_instant = Instant::now(); + // NOTE: This txn should closely resemble the one found in the allocate_pending_actors activity of the + // client wf let res = ctx .fdb() .await? .run(|tx, _mc| async move { - // Select a range that only includes runners that have enough remaining slots to allocate this actor - let start = keys::subspace().pack( - &keys::datacenter::RunnersByRemainingSlotsKey::subspace_with_slots( - input.image_id, - 1, - ), - ); - let runner_allocation_subspace = - keys::datacenter::RunnersByRemainingSlotsKey::subspace(input.image_id); - let end = keys::subspace() - .subspace(&runner_allocation_subspace) - .range() - .1; - - let mut stream = tx.get_ranges_keyvalues( - fdb::RangeOption { - mode: StreamingMode::Iterator, - // Containers bin pack so we reverse the order - reverse: true, - ..(start, end).into() - }, - // NOTE: This is not SERIALIZABLE because we don't want to conflict with all of the keys, just - // the one we choose - SNAPSHOT, - ); + // Check for availability amongst existing runners + let image_queue_exists = if let BuildAllocationType::Multi = input.build_allocation_type + { + // Check if a queue for this image exists + let pending_actor_by_image_subspace = keys::subspace().subspace( + &keys::datacenter::PendingActorByImageIdKey::subspace(input.image_id), + ); + let queue_exists = tx + .get_ranges_keyvalues( + fdb::RangeOption { + mode: StreamingMode::Exact, + limit: Some(1), + ..(&pending_actor_by_image_subspace).into() + }, + // NOTE: This is not SERIALIZABLE because we don't want to conflict with other + // inserts/clears to this range + // queue + SNAPSHOT, + ) + .try_next() + .await? 
+ .is_some(); + + if !queue_exists { + // Select a range that only includes runners that have enough remaining slots to allocate + // this actor + let start = keys::subspace().pack( + &keys::datacenter::RunnersByRemainingSlotsKey::subspace_with_slots( + input.image_id, + 1, + ), + ); + let runner_allocation_subspace = + keys::datacenter::RunnersByRemainingSlotsKey::subspace(input.image_id); + let end = keys::subspace() + .subspace(&runner_allocation_subspace) + .range() + .1; + + let mut stream = tx.get_ranges_keyvalues( + fdb::RangeOption { + mode: StreamingMode::Iterator, + // Containers bin pack so we reverse the order + reverse: true, + ..(start, end).into() + }, + // NOTE: This is not SERIALIZABLE because we don't want to conflict with all of the keys, just + // the one we choose + SNAPSHOT, + ); + + loop { + let Some(entry) = stream.try_next().await? else { + break; + }; + + let old_runner_allocation_key = keys::subspace() + .unpack::(entry.key()) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + + let data = old_runner_allocation_key + .deserialize(entry.value()) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + + // Add read conflict only for this key + tx.add_conflict_range( + entry.key(), + &end_of_key_range(entry.key()), + ConflictRangeType::Read, + )?; + + // Clear old entry + tx.clear(entry.key()); + + let new_remaining_slots = + old_runner_allocation_key.remaining_slots.saturating_sub(1); + + // Write new allocation key with 1 less slot + let new_allocation_key = keys::datacenter::RunnersByRemainingSlotsKey::new( + input.image_id, + new_remaining_slots, + old_runner_allocation_key.runner_id, + ); + tx.set(&keys::subspace().pack(&new_allocation_key), entry.value()); + + // Update runner record + let remaining_slots_key = keys::runner::RemainingSlotsKey::new( + old_runner_allocation_key.runner_id, + ); + tx.set( + &keys::subspace().pack(&remaining_slots_key), + &remaining_slots_key + .serialize(new_remaining_slots) + 
.map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + // Insert actor index key + let client_actor_key = + keys::client::Actor2Key::new(data.client_id, input.actor_id); + tx.set( + &keys::subspace().pack(&client_actor_key), + &client_actor_key + .serialize(input.generation) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + return Ok(Ok(AllocateActorOutput { + runner_id: old_runner_allocation_key.runner_id, + new_runner: false, + client_id: data.client_id, + client_workflow_id: data.client_workflow_id, + })); + } + } + + queue_exists + } else { + false + }; + + // No available runner found, create a new one + + // Check if a queue exists + let pending_actor_subspace = + keys::subspace().subspace(&keys::datacenter::PendingActorKey::subspace()); + let queue_exists = if image_queue_exists { + // We don't have to check the range if the image queue exists, its guaranteed that this one + // exists too + true + } else { + tx.get_ranges_keyvalues( + fdb::RangeOption { + mode: StreamingMode::Exact, + limit: Some(1), + ..(&pending_actor_subspace).into() + }, + // NOTE: This is not SERIALIZABLE because we don't want to conflict with other + // inserts/clears to this range + // queue + SNAPSHOT, + ) + .next() + .await + .is_some() + }; + + if !queue_exists { + let runner_id = Uuid::new_v4(); + + let ping_threshold_ts = util::timestamp::now() - CLIENT_ELIGIBLE_THRESHOLD_MS; + + // Select a range that only includes clients that have enough remaining mem to allocate this actor + let start = keys::subspace().pack( + &keys::datacenter::ClientsByRemainingMemKey::subspace_with_mem( + client_flavor, + memory_mib, + ), + ); + let client_allocation_subspace = + keys::datacenter::ClientsByRemainingMemKey::subspace(client_flavor); + let end = keys::subspace() + .subspace(&client_allocation_subspace) + .range() + .1; + + let mut stream = tx.get_ranges_keyvalues( + fdb::RangeOption { + mode: StreamingMode::Iterator, + // Containers bin pack so we reverse 
the order + reverse: true, + ..(start, end).into() + }, + // NOTE: This is not SERIALIZABLE because we don't want to conflict with all of the keys, just + // the one we choose + SNAPSHOT, + ); - if let BuildAllocationType::Multi = input.build_allocation_type { loop { let Some(entry) = stream.try_next().await? else { break; }; - let old_runner_allocation_key = keys::subspace() - .unpack::(entry.key()) + let old_client_allocation_key = keys::subspace() + .unpack::(entry.key()) .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; - let data = old_runner_allocation_key - .deserialize(entry.value()) - .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + // Scan by last ping + if old_client_allocation_key.last_ping_ts < ping_threshold_ts { + continue; + } + + let client_workflow_id = + old_client_allocation_key + .deserialize(entry.value()) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; // Add read conflict only for this key tx.add_conflict_range( @@ -297,30 +529,107 @@ async fn allocate_actor( // Clear old entry tx.clear(entry.key()); - let new_remaining_slots = - old_runner_allocation_key.remaining_slots.saturating_sub(1); + // Read old cpu + let remaining_cpu_key = + keys::client::RemainingCpuKey::new(old_client_allocation_key.client_id); + let remaining_cpu_key_buf = keys::subspace().pack(&remaining_cpu_key); + let remaining_cpu_entry = tx.get(&remaining_cpu_key_buf, SERIALIZABLE).await?; + let old_remaining_cpu = remaining_cpu_key + .deserialize(&remaining_cpu_entry.ok_or( + fdb::FdbBindingError::CustomError( + format!("key should exist: {remaining_cpu_key:?}").into(), + ), + )?) 
+ .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; - // Write new allocation key with 1 less slot - let new_allocation_key = keys::datacenter::RunnersByRemainingSlotsKey::new( - input.image_id, - new_remaining_slots, - old_runner_allocation_key.runner_id, + // Update allocated amount + let new_remaining_mem = old_client_allocation_key.remaining_mem - memory_mib; + let new_remaining_cpu = old_remaining_cpu - input.resources.cpu; + let new_allocation_key = keys::datacenter::ClientsByRemainingMemKey::new( + client_flavor, + new_remaining_mem, + old_client_allocation_key.last_ping_ts, + old_client_allocation_key.client_id, ); tx.set(&keys::subspace().pack(&new_allocation_key), entry.value()); - // Update runner record - let remaining_slots_key = - keys::runner::RemainingSlotsKey::new(old_runner_allocation_key.runner_id); + tracing::debug!( + old_mem=%old_client_allocation_key.remaining_mem, + old_cpu=%old_remaining_cpu, + new_mem=%new_remaining_mem, + new_cpu=%new_remaining_cpu, + "allocating runner resources" + ); + + // Update client record + let remaining_mem_key = + keys::client::RemainingMemoryKey::new(old_client_allocation_key.client_id); + tx.set( + &keys::subspace().pack(&remaining_mem_key), + &remaining_mem_key + .serialize(new_remaining_mem) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + tx.set( + &remaining_cpu_key_buf, + &remaining_cpu_key + .serialize(new_remaining_cpu) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + let remaining_slots = input.build_allocation_total_slots.saturating_sub(1); + let total_slots = input.build_allocation_total_slots; + + // Insert runner records + let remaining_slots_key = keys::runner::RemainingSlotsKey::new(runner_id); tx.set( &keys::subspace().pack(&remaining_slots_key), &remaining_slots_key - .serialize(new_remaining_slots) + .serialize(remaining_slots) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + let total_slots_key = 
keys::runner::TotalSlotsKey::new(runner_id); + tx.set( + &keys::subspace().pack(&total_slots_key), + &total_slots_key + .serialize(total_slots) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + let image_id_key = keys::runner::ImageIdKey::new(runner_id); + tx.set( + &keys::subspace().pack(&image_id_key), + &image_id_key + .serialize(input.image_id) .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, ); + // Insert runner index key if multi. Single allocation per container runners don't need to be + // in the alloc idx because they only have 1 slot + if let BuildAllocationType::Multi = input.build_allocation_type { + let runner_idx_key = keys::datacenter::RunnersByRemainingSlotsKey::new( + input.image_id, + remaining_slots, + runner_id, + ); + tx.set( + &keys::subspace().pack(&runner_idx_key), + &runner_idx_key + .serialize(keys::datacenter::RunnersByRemainingSlotsKeyData { + client_id: old_client_allocation_key.client_id, + client_workflow_id, + }) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + } + // Insert actor index key - let client_actor_key = - keys::client::Actor2Key::new(data.client_id, input.actor_id); + let client_actor_key = keys::client::Actor2Key::new( + old_client_allocation_key.client_id, + input.actor_id, + ); tx.set( &keys::subspace().pack(&client_actor_key), &client_actor_key @@ -328,196 +637,75 @@ async fn allocate_actor( .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, ); - return Ok(Some(AllocateActorOutput { - runner_id: old_runner_allocation_key.runner_id, - new_runner: false, - client_id: data.client_id, - client_workflow_id: data.client_workflow_id, + return Ok(Ok(AllocateActorOutput { + runner_id, + new_runner: true, + client_id: old_client_allocation_key.client_id, + client_workflow_id, })); } } - // No available runner found, create a new one - let runner_id = Uuid::new_v4(); - - let ping_threshold_ts = util::timestamp::now() - CLIENT_ELIGIBLE_THRESHOLD_MS; + // At this point 
in the txn there is no availability. Write the actor to the alloc queue to wait. - // Select a range that only includes clients that have enough remaining mem to allocate this actor - let start = keys::subspace().pack( - &keys::datacenter::ClientsByRemainingMemKey::subspace_with_mem( - client_flavor, - memory_mib, - ), - ); - let client_allocation_subspace = - keys::datacenter::ClientsByRemainingMemKey::subspace(client_flavor); - let end = keys::subspace() - .subspace(&client_allocation_subspace) - .range() - .1; - - let mut stream = tx.get_ranges_keyvalues( - fdb::RangeOption { - mode: StreamingMode::Iterator, - // Containers bin pack so we reverse the order - reverse: true, - ..(start, end).into() - }, - // NOTE: This is not SERIALIZABLE because we don't want to conflict with all of the keys, just - // the one we choose - SNAPSHOT, - ); + let pending_ts = util::timestamp::now(); - loop { - let Some(entry) = stream.try_next().await? else { - return Ok(None); - }; - - let old_client_allocation_key = keys::subspace() - .unpack::(entry.key()) - .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; - - // Scan by last ping - if old_client_allocation_key.last_ping_ts < ping_threshold_ts { - continue; - } - - let client_workflow_id = old_client_allocation_key - .deserialize(entry.value()) - .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; - - // Add read conflict only for this key - tx.add_conflict_range( - entry.key(), - &end_of_key_range(entry.key()), - ConflictRangeType::Read, - )?; - - // Clear old entry - tx.clear(entry.key()); - - // Read old cpu - let remaining_cpu_key = - keys::client::RemainingCpuKey::new(old_client_allocation_key.client_id); - let remaining_cpu_key_buf = keys::subspace().pack(&remaining_cpu_key); - let remaining_cpu_entry = tx.get(&remaining_cpu_key_buf, SERIALIZABLE).await?; - let old_remaining_cpu = remaining_cpu_key - .deserialize( - &remaining_cpu_entry.ok_or(fdb::FdbBindingError::CustomError( - format!("key should exist: 
{remaining_cpu_key:?}").into(), - ))?, - ) - .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; - - // Update allocated amount - let new_remaining_mem = old_client_allocation_key.remaining_mem - memory_mib; - let new_remaining_cpu = old_remaining_cpu - input.resources.cpu; - let new_allocation_key = keys::datacenter::ClientsByRemainingMemKey::new( - client_flavor, - new_remaining_mem, - old_client_allocation_key.last_ping_ts, - old_client_allocation_key.client_id, - ); - tx.set(&keys::subspace().pack(&new_allocation_key), entry.value()); - - tracing::debug!( - old_mem=%old_client_allocation_key.remaining_mem, - old_cpu=%old_remaining_cpu, - new_mem=%new_remaining_mem, - new_cpu=%new_remaining_cpu, - "allocating runner resources" - ); - - // Update client record - let remaining_mem_key = - keys::client::RemainingMemoryKey::new(old_client_allocation_key.client_id); - tx.set( - &keys::subspace().pack(&remaining_mem_key), - &remaining_mem_key - .serialize(new_remaining_mem) - .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, - ); - - tx.set( - &remaining_cpu_key_buf, - &remaining_cpu_key - .serialize(new_remaining_cpu) - .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, - ); - - let remaining_slots = input.build_allocation_total_slots.saturating_sub(1); - let total_slots = input.build_allocation_total_slots; - - // Insert runner records - let remaining_slots_key = keys::runner::RemainingSlotsKey::new(runner_id); - tx.set( - &keys::subspace().pack(&remaining_slots_key), - &remaining_slots_key - .serialize(remaining_slots) - .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, - ); - - let total_slots_key = keys::runner::TotalSlotsKey::new(runner_id); - tx.set( - &keys::subspace().pack(&total_slots_key), - &total_slots_key - .serialize(total_slots) - .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + // Write self to image alloc queue + if let BuildAllocationType::Multi = input.build_allocation_type { + let 
image_pending_alloc_key = keys::datacenter::PendingActorByImageIdKey::new( + input.image_id, + pending_ts, + input.actor_id, ); + let image_pending_alloc_data = keys::datacenter::PendingActorByImageIdKeyData { + generation: input.generation, + build_allocation_type: input.build_allocation_type, + build_allocation_total_slots: input.build_allocation_total_slots, + cpu: input.resources.cpu, + memory: input.resources.memory, + }; - let image_id_key = keys::runner::ImageIdKey::new(runner_id); + // NOTE: This will conflict with serializable reads to the alloc queue, which is the behavior we + // want. If a client reads from the queue while this is being inserted, one of the two txns will + // retry and we ensure the actor does not end up in queue limbo. tx.set( - &keys::subspace().pack(&image_id_key), - &image_id_key - .serialize(input.image_id) + &keys::subspace().pack(&image_pending_alloc_key), + &image_pending_alloc_key + .serialize(image_pending_alloc_data) .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, ); + } - // Insert runner index key if multi. 
Single allocation per container runners don't need to be - // in the alloc idx because they only have 1 slot - if let BuildAllocationType::Multi = input.build_allocation_type { - let runner_idx_key = keys::datacenter::RunnersByRemainingSlotsKey::new( - input.image_id, - remaining_slots, - runner_id, - ); - tx.set( - &keys::subspace().pack(&runner_idx_key), - &runner_idx_key - .serialize(keys::datacenter::RunnersByRemainingSlotsKeyData { - client_id: old_client_allocation_key.client_id, - client_workflow_id, - }) - .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, - ); - } - - // Insert actor index key - let client_actor_key = keys::client::Actor2Key::new( - old_client_allocation_key.client_id, - input.actor_id, - ); - tx.set( - &keys::subspace().pack(&client_actor_key), - &client_actor_key - .serialize(input.generation) - .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, - ); + // Write self to global alloc queue + let pending_alloc_key = + keys::datacenter::PendingActorKey::new(pending_ts, input.actor_id); + let pending_alloc_data = keys::datacenter::PendingActorKeyData { + generation: input.generation, + image_id: input.image_id, + build_allocation_type: input.build_allocation_type, + build_allocation_total_slots: input.build_allocation_total_slots, + cpu: input.resources.cpu, + memory: input.resources.memory, + }; + + // NOTE: This will conflict with serializable reads to the alloc queue, which is the behavior we + // want. If a client reads from the queue while this is being inserted, one of the two txns will + // retry and we ensure the actor does not end up in queue limbo. 
+ tx.set( + &keys::subspace().pack(&pending_alloc_key), + &pending_alloc_key + .serialize(pending_alloc_data) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); - return Ok(Some(AllocateActorOutput { - runner_id, - new_runner: true, - client_id: old_client_allocation_key.client_id, - client_workflow_id, - })); - } + return Ok(Err(pending_ts)); }) .custom_instrument(tracing::info_span!("actor_allocate_tx")) .await?; let dt = start_instant.elapsed().as_secs_f64(); metrics::ACTOR_ALLOCATE_DURATION - .with_label_values(&[&res.is_some().to_string()]) + .with_label_values(&[&res.is_ok().to_string()]) .observe(dt); Ok(res) @@ -812,14 +1000,15 @@ async fn compare_retry(ctx: &ActivityCtx, input: &CompareRetryInput) -> GlobalRe Ok((now, input.last_retry_ts < now - RETRY_RESET_DURATION_MS)) } -/// Returns whether or not there was availability to spawn the actor. +/// Returns None if a destroy signal was received while pending for allocation. pub async fn spawn_actor( ctx: &mut WorkflowCtx, input: &Input, actor_setup: &setup::ActorSetupCtx, generation: u32, ) -> GlobalResult> { - let res = ctx + // Attempt allocation + let allocate_res = ctx .activity(AllocateActorInput { actor_id: input.actor_id, generation, @@ -830,18 +1019,74 @@ pub async fn spawn_actor( }) .await?; - let Some(res) = res else { - return Ok(None); + let allocate_res = match allocate_res { + Ok(x) => x, + Err(pending_allocation_ts) => { + tracing::warn!( + actor_id=?input.actor_id, + "failed to allocate (no availability), waiting for allocation", + ); + + ctx.activity(SetPendingAllocationInput { + pending_allocation_ts, + }) + .await?; + + // If allocation fails, the allocate txn already inserted this actor into the queue. Now we wait for + // an `Allocate` signal + match ctx.listen::().await? 
{ + PendingAllocation::Allocate(sig) => AllocateActorOutput { + runner_id: sig.runner_id, + new_runner: sig.new_runner, + client_id: sig.client_id, + client_workflow_id: sig.client_workflow_id, + }, + // We ignore the signal's override_kill_timeout_ms because the actor isn't allocated + PendingAllocation::Destroy(_sig) => { + tracing::debug!("destroying before actor allocated"); + + let cleared = ctx + .activity(ClearPendingAllocationInput { + actor_id: input.actor_id, + pending_allocation_ts, + }) + .await?; + + // If this actor was no longer present in the queue it means it was allocated. We must now + // wait for the allocated signal to prevent a race condition. + if !cleared { + let sig = ctx.listen::().await?; + + ctx.activity(UpdateClientAndRunnerInput { + client_id: sig.client_id, + client_workflow_id: sig.client_workflow_id, + runner_id: sig.runner_id, + }) + .await?; + } + + return Ok(None); + } + } + } }; - let (_, ports_res) = ctx + let (_, artifacts_res, ports_res) = ctx .join(( activity(UpdateClientAndRunnerInput { - client_id: res.client_id, - client_workflow_id: res.client_workflow_id, - runner_id: res.runner_id, + client_id: allocate_res.client_id, + client_workflow_id: allocate_res.client_workflow_id, + runner_id: allocate_res.runner_id, + }), + // NOTE: We resolve the artifacts here instead of in setup::setup because we don't know how + // long it will be after setup until an actor is allocated so the presigned artifact url might + // expire. 
+ activity(ResolveArtifactsInput { + build_upload_id: actor_setup.meta.build_upload_id, + build_file_name: actor_setup.meta.build_file_name.clone(), + dc_build_delivery_method: actor_setup.meta.dc_build_delivery_method, }), - v(2).activity(FetchPortsInput { + activity(FetchPortsInput { actor_id: input.actor_id, endpoint_type: input.endpoint_type, }), @@ -852,9 +1097,9 @@ pub async fn spawn_actor( let image = protocol::Image { id: actor_setup.image_id, - artifact_url_stub: actor_setup.artifact_url_stub.clone(), - fallback_artifact_url: Some(actor_setup.fallback_artifact_url.clone()), - artifact_size: actor_setup.artifact_size, + artifact_url_stub: artifacts_res.artifact_url_stub.clone(), + fallback_artifact_url: Some(artifacts_res.fallback_artifact_url.clone()), + artifact_size: artifacts_res.artifact_size, kind: match actor_setup.meta.build_kind { BuildKind::DockerImage => protocol::ImageKind::DockerImage, BuildKind::OciBundle => protocol::ImageKind::OciBundle, @@ -907,9 +1152,9 @@ pub async fn spawn_actor( actor_id: input.actor_id, generation, config: Box::new(protocol::ActorConfig { - runner: if res.new_runner { + runner: if allocate_res.new_runner { Some(protocol::ActorRunner::New { - runner_id: res.runner_id, + runner_id: allocate_res.runner_id, config: protocol::RunnerConfig { image: image.clone(), root_user_enabled: input.root_user_enabled, @@ -921,7 +1166,7 @@ pub async fn spawn_actor( }) } else { Some(protocol::ActorRunner::Existing { - runner_id: res.runner_id, + runner_id: allocate_res.runner_id, }) }, env: input.environment.clone(), @@ -964,19 +1209,20 @@ pub async fn spawn_actor( network_mode, }), }) - .to_workflow_id(res.client_workflow_id) + .to_workflow_id(allocate_res.client_workflow_id) .send() .await?; - Ok(Some(res)) + Ok(Some(allocate_res)) } +/// Returns true if the actor should be destroyed. 
pub async fn reschedule_actor( ctx: &mut WorkflowCtx, input: &Input, state: &mut State, image_id: Uuid, -) -> GlobalResult> { +) -> GlobalResult { tracing::debug!(actor_id=?input.actor_id, "rescheduling actor"); let res = ctx @@ -1031,22 +1277,21 @@ pub async fn reschedule_actor( let next = backoff.step().expect("should not have max retry"); // Sleep for backoff or destroy early - if let Some(sig) = ctx + if let Some(_sig) = ctx .listen_with_timeout::(Instant::from(next) - Instant::now()) .await? { tracing::debug!("destroying before actor start"); - return Ok(Loop::Break(Err(sig))); + return Ok(Loop::Break(None)); } } if let Some(res) = spawn_actor(ctx, &input, &actor_setup, next_generation).await? { - Ok(Loop::Break(Ok((state.clone(), res)))) + Ok(Loop::Break(Some((state.clone(), res)))) } else { - tracing::debug!(actor_id=?input.actor_id, "failed to reschedule actor, retrying"); - - Ok(Loop::Continue) + // Destroyed early + Ok(Loop::Break(None)) } } .boxed() @@ -1054,25 +1299,81 @@ pub async fn reschedule_actor( .await?; // Update loop state - match res { - Ok((reschedule_state, res)) => { - state.generation = next_generation; - state.runner_id = res.runner_id; - state.client_id = res.client_id; - state.client_workflow_id = res.client_workflow_id; + if let Some((reschedule_state, res)) = res { + state.generation = next_generation; + state.runner_id = res.runner_id; + state.client_id = res.client_id; + state.client_workflow_id = res.client_workflow_id; - // Save reschedule state in global state - state.reschedule_state = reschedule_state; + // Save reschedule state in global state + state.reschedule_state = reschedule_state; - // Reset gc timeout once allocated - state.gc_timeout_ts = Some(util::timestamp::now() + ACTOR_START_THRESHOLD_MS); + // Reset gc timeout once allocated + state.gc_timeout_ts = Some(util::timestamp::now() + ACTOR_START_THRESHOLD_MS); - Ok(None) - } - Err(sig) => Ok(Some(sig)), + Ok(false) + } else { + Ok(true) } } +#[derive(Debug, Serialize, 
Deserialize, Hash)] +pub struct SetPendingAllocationInput { + pending_allocation_ts: i64, +} + +#[activity(SetPendingAllocation)] +pub async fn set_pending_allocation( + ctx: &ActivityCtx, + input: &SetPendingAllocationInput, +) -> GlobalResult<()> { + let pool = ctx.sqlite().await?; + + sql_execute!( + [ctx, pool] + " + UPDATE state + SET pending_allocation_ts = ? + ", + input.pending_allocation_ts, + ) + .await?; + + Ok(()) +} + +#[derive(Debug, Serialize, Deserialize, Hash)] +pub struct ClearPendingAllocationInput { + actor_id: util::Id, + pending_allocation_ts: i64, +} + +#[activity(ClearPendingAllocation)] +pub async fn clear_pending_allocation( + ctx: &ActivityCtx, + input: &ClearPendingAllocationInput, +) -> GlobalResult { + // Clear self from alloc queue + let cleared = ctx + .fdb() + .await? + .run(|tx, _mc| async move { + let pending_alloc_key = keys::subspace().pack(&keys::datacenter::PendingActorKey::new( + input.pending_allocation_ts, + input.actor_id, + )); + + let exists = tx.get(&pending_alloc_key, SERIALIZABLE).await?.is_some(); + + tx.clear(&pending_alloc_key); + + Ok(exists) + }) + .await?; + + Ok(cleared) +} + #[derive(Debug, Serialize, Deserialize, Hash)] struct ClearPortsAndResourcesInput { actor_id: util::Id, diff --git a/packages/edge/services/pegboard/src/workflows/actor2/setup.rs b/packages/edge/services/pegboard/src/workflows/actor2/setup.rs index 946f943751..48bc0bf874 100644 --- a/packages/edge/services/pegboard/src/workflows/actor2/setup.rs +++ b/packages/edge/services/pegboard/src/workflows/actor2/setup.rs @@ -685,7 +685,7 @@ pub struct GetMetaOutput { pub build_kind: BuildKind, pub build_compression: BuildCompression, pub build_allocation_type: BuildAllocationType, - pub build_allocation_total_slots: u64, + pub build_allocation_total_slots: u32, pub build_resources: Option, pub dc_name_id: String, pub dc_display_name: String, @@ -782,10 +782,6 @@ pub struct ActorSetupCtx { pub image_id: Uuid, pub meta: GetMetaOutput, pub resources: 
protocol::Resources, - pub artifact_url_stub: String, - pub fallback_artifact_url: String, - /// Bytes. - pub artifact_size: u64, } pub async fn setup( @@ -871,27 +867,17 @@ pub async fn setup( } }; - let (resources, artifacts_res) = ctx - .join(( - activity(SelectResourcesInput { - cpu_millicores: resources.cpu_millicores, - memory_mib: resources.memory_mib, - }), - activity(ResolveArtifactsInput { - build_upload_id: meta.build_upload_id, - build_file_name: meta.build_file_name.clone(), - dc_build_delivery_method: meta.dc_build_delivery_method, - }), - )) + let resources = ctx + .activity(SelectResourcesInput { + cpu_millicores: resources.cpu_millicores, + memory_mib: resources.memory_mib, + }) .await?; Ok(ActorSetupCtx { image_id, meta, resources, - artifact_url_stub: artifacts_res.artifact_url_stub, - fallback_artifact_url: artifacts_res.fallback_artifact_url, - artifact_size: artifacts_res.artifact_size, }) } @@ -957,74 +943,3 @@ async fn select_resources( disk: tier.disk, }) } - -#[derive(Debug, Serialize, Deserialize, Hash)] -struct ResolveArtifactsInput { - build_upload_id: Uuid, - build_file_name: String, - dc_build_delivery_method: BuildDeliveryMethod, -} - -#[derive(Debug, Serialize, Deserialize, Hash)] -struct ResolveArtifactsOutput { - artifact_url_stub: String, - fallback_artifact_url: String, - /// Bytes. 
- artifact_size: u64, -} - -#[activity(ResolveArtifacts)] -async fn resolve_artifacts( - ctx: &ActivityCtx, - input: &ResolveArtifactsInput, -) -> GlobalResult { - // Get the fallback URL - let fallback_artifact_url = { - tracing::debug!("using s3 direct delivery"); - - // Build client - let s3_client = s3_util::Client::with_bucket_and_endpoint( - ctx.config(), - "bucket-build", - s3_util::EndpointKind::EdgeInternal, - ) - .await?; - - let presigned_req = s3_client - .get_object() - .bucket(s3_client.bucket()) - .key(format!( - "{upload_id}/{file_name}", - upload_id = input.build_upload_id, - file_name = input.build_file_name, - )) - .presigned( - s3_util::aws_sdk_s3::presigning::PresigningConfig::builder() - .expires_in(std::time::Duration::from_secs(15 * 60)) - .build()?, - ) - .await?; - - let addr_str = presigned_req.uri().to_string(); - tracing::debug!(addr = %addr_str, "resolved artifact s3 presigned request"); - - addr_str - }; - - // Get the artifact size - let uploads_res = op!([ctx] upload_get { - upload_ids: vec![input.build_upload_id.into()], - }) - .await?; - let upload = unwrap!(uploads_res.uploads.first()); - - Ok(ResolveArtifactsOutput { - artifact_url_stub: crate::util::image_artifact_url_stub( - ctx.config(), - input.build_upload_id, - &input.build_file_name, - )?, - fallback_artifact_url, - artifact_size: upload.content_length, - }) -} diff --git a/packages/edge/services/pegboard/src/workflows/client/mod.rs b/packages/edge/services/pegboard/src/workflows/client/mod.rs index aeff8090c2..90f2d1afe3 100644 --- a/packages/edge/services/pegboard/src/workflows/client/mod.rs +++ b/packages/edge/services/pegboard/src/workflows/client/mod.rs @@ -1,8 +1,12 @@ use std::convert::TryInto; +use ::build::types::BuildAllocationType; use chirp_workflow::prelude::*; -use fdb_util::{FormalKey, SERIALIZABLE, SNAPSHOT}; -use foundationdb::{self as fdb, options::StreamingMode}; +use fdb_util::{end_of_key_range, FormalKey, SERIALIZABLE, SNAPSHOT}; +use foundationdb::{ 
+ self as fdb, + options::{ConflictRangeType, StreamingMode}, +}; use futures_util::{FutureExt, StreamExt, TryStreamExt}; use nix::sys::signal::Signal; use rivet_api::{ @@ -15,7 +19,10 @@ use rivet_api::{ use rivet_operation::prelude::proto::{self, backend::pkg::*}; use sqlx::Acquire; -use crate::{client_config, keys, metrics, protocol, protocol::ClientFlavor, system_info}; +use crate::{ + client_config, keys, metrics, protocol, protocol::ClientFlavor, system_info, + workflows::actor2::Allocate, +}; mod migrations; @@ -99,21 +106,36 @@ pub async fn pegboard_client(ctx: &mut WorkflowCtx, input: &Input) -> GlobalResu .await?; } - ctx.join(( - activity(InsertFdbInput { - client_id, - flavor, - config, - system, - }), - activity(UpdateMetricsInput { - client_id, - flavor, - draining: state.drain_timeout_ts.is_some(), - clear: false, - }), - )) + ctx.activity(InsertFdbInput { + client_id, + flavor, + config, + system, + }) .await?; + + let (res, _) = ctx + .join(( + // Check for pending actors as soon as connected + v(2).activity(AllocatePendingActorsInput {}), + activity(UpdateMetricsInput { + client_id, + flavor, + draining: state.drain_timeout_ts.is_some(), + clear: false, + }), + )) + .await?; + + // Dispatch pending allocs + for alloc in res.allocations { + ctx.v(2) + .signal(alloc.signal) + .to_workflow::() + .tag("actor_id", alloc.actor_id) + .send() + .await?; + } } // Events are in order by index protocol::ToServer::Events(events) => { @@ -144,7 +166,7 @@ pub async fn pegboard_client(ctx: &mut WorkflowCtx, input: &Input) -> GlobalResu { // Try actor2 first let res = ctx - .signal(crate::workflows::actor::StateUpdate { + .signal(crate::workflows::actor2::StateUpdate { generation, state: state.clone(), }) @@ -248,22 +270,62 @@ pub async fn pegboard_client(ctx: &mut WorkflowCtx, input: &Input) -> GlobalResu // Undrain all remaining actors for actor_id in actor_ids { + // Try actor2 first let res = ctx - .signal(crate::workflows::actor::Undrain {}) - 
.to_workflow::() + .signal(crate::workflows::actor2::Undrain {}) + .to_workflow::() .tag("actor_id", actor_id) .send() .await; if let Some(WorkflowError::WorkflowNotFound) = res.as_workflow_error() { - tracing::debug!( - ?actor_id, - "actor workflow not found to undrain, likely already stopped" - ); + // Try old actors + let res = ctx + .signal(crate::workflows::actor::Undrain {}) + .to_workflow::() + .tag("actor_id", actor_id) + .send() + .await; + + if let Some(WorkflowError::WorkflowNotFound) = res.as_workflow_error() { + tracing::debug!( + ?actor_id, + "actor workflow not found to undrain, likely already stopped" + ); + } else { + res?; + } } else { res?; } } + + // Check for pending actors + let res = ctx.v(2).activity(AllocatePendingActorsInput {}).await?; + + // Dispatch pending allocs + for alloc in res.allocations { + ctx.v(2) + .signal(alloc.signal) + .to_workflow::() + .tag("actor_id", alloc.actor_id) + .send() + .await?; + } + } + Some(Main::CheckQueue(_)) => { + // Check for pending actors + let res = ctx.v(2).activity(AllocatePendingActorsInput {}).await?; + + // Dispatch pending allocs + for alloc in res.allocations { + ctx.v(2) + .signal(alloc.signal) + .to_workflow::() + .tag("actor_id", alloc.actor_id) + .send() + .await?; + } } None => { if ctx.activity(CheckExpiredInput { client_id }).await? 
{ @@ -300,21 +362,37 @@ pub async fn pegboard_client(ctx: &mut WorkflowCtx, input: &Input) -> GlobalResu // Set all remaining actors as lost for (actor_id, generation) in actors { + // Try actor2 first let res = ctx - .signal(crate::workflows::actor::StateUpdate { + .signal(crate::workflows::actor2::StateUpdate { generation, state: protocol::ActorState::Lost, }) - .to_workflow::() + .to_workflow::() .tag("actor_id", actor_id) .send() .await; if let Some(WorkflowError::WorkflowNotFound) = res.as_workflow_error() { - tracing::warn!( - ?actor_id, - "actor workflow not found, likely already stopped" - ); + // Try old actors + let res = ctx + .signal(crate::workflows::actor::StateUpdate { + generation, + state: protocol::ActorState::Lost, + }) + .to_workflow::() + .tag("actor_id", actor_id) + .send() + .await; + + if let Some(WorkflowError::WorkflowNotFound) = res.as_workflow_error() { + tracing::warn!( + ?actor_id, + "actor workflow not found, likely already stopped" + ); + } else { + res?; + } } else { res?; } @@ -756,11 +834,24 @@ pub async fn handle_commands( // If this start actor command was received after the client started draining, immediately // inform the actor wf that it is draining if let Some(drain_timeout_ts) = drain_timeout_ts { - ctx.signal(crate::workflows::actor::Drain { drain_timeout_ts }) - .to_workflow::() + // Try actor2 first + let res = ctx + .signal(crate::workflows::actor2::Drain { drain_timeout_ts }) + .to_workflow::() .tag("actor_id", actor_id) .send() - .await?; + .await; + + if let Some(WorkflowError::WorkflowNotFound) = res.as_workflow_error() { + // Try old actors + ctx.signal(crate::workflows::actor::Drain { drain_timeout_ts }) + .to_workflow::() + .tag("actor_id", actor_id) + .send() + .await?; + } else { + res?; + } } } protocol::Command::SignalActor { @@ -770,21 +861,37 @@ pub async fn handle_commands( .. 
} => { if matches!(signal.try_into()?, Signal::SIGTERM | Signal::SIGKILL) { + // Try actor2 first let res = ctx - .signal(crate::workflows::actor::StateUpdate { + .signal(crate::workflows::actor2::StateUpdate { generation, state: protocol::ActorState::Stopping, }) - .to_workflow::() + .to_workflow::() .tag("actor_id", actor_id) .send() .await; if let Some(WorkflowError::WorkflowNotFound) = res.as_workflow_error() { - tracing::warn!( - ?actor_id, - "actor workflow not found, likely already stopped" - ); + // Try old actors + let res = ctx + .signal(crate::workflows::actor::StateUpdate { + generation, + state: protocol::ActorState::Stopping, + }) + .to_workflow::() + .tag("actor_id", actor_id) + .send() + .await; + + if let Some(WorkflowError::WorkflowNotFound) = res.as_workflow_error() { + tracing::warn!( + ?actor_id, + "actor workflow not found, likely already stopped" + ); + } else { + res?; + } } else { res?; } @@ -925,8 +1032,8 @@ async fn fetch_remaining_actors( )) .map(|res| match res { Ok(entry) => { - if let Ok(key) = keys::subspace() - .unpack::(entry.key()) { + if let Ok(key) = keys::subspace().unpack::(entry.key()) + { let generation = key .deserialize(entry.value()) .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; @@ -984,6 +1091,374 @@ async fn check_expired(ctx: &ActivityCtx, input: &CheckExpiredInput) -> GlobalRe Ok(last_ping_ts < util::timestamp::now() - CLIENT_LOST_THRESHOLD_MS) } +#[derive(Debug, Serialize, Deserialize, Hash)] +pub(crate) struct AllocatePendingActorsInput {} + +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct AllocatePendingActorsOutput { + pub allocations: Vec, +} + +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct ActorAllocation { + pub actor_id: util::Id, + pub signal: Allocate, +} + +#[activity(AllocatePendingActors)] +pub(crate) async fn allocate_pending_actors( + ctx: &ActivityCtx, + input: &AllocatePendingActorsInput, +) -> GlobalResult { + let client_flavor = protocol::ClientFlavor::Multi; 
+ + // NOTE: This txn should closely resemble the one found in the allocate_actor activity of the actor2 wf + let res = ctx + .fdb() + .await? + .run(|tx, _mc| async move { + let mut results = Vec::new(); + + let pending_actor_subspace = + keys::subspace().subspace(&keys::datacenter::PendingActorKey::subspace()); + let mut queue_stream = tx.get_ranges_keyvalues( + fdb::RangeOption { + mode: StreamingMode::Iterator, + ..(&pending_actor_subspace).into() + }, + // NOTE: This is not SERIALIZABLE because we don't want to conflict with all of the keys, just + // the one we choose + SNAPSHOT, + ); + + 'queue_loop: loop { + let Some(queue_entry) = queue_stream.try_next().await? else { + break; + }; + + let queue_key = keys::subspace() + .unpack::(queue_entry.key()) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + + let queue_value = queue_key + .deserialize(queue_entry.value()) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + let memory_mib = queue_value.memory / 1024 / 1024; + + // Check for availability amongst existing runners + if let BuildAllocationType::Multi = queue_value.build_allocation_type { + // Select a range that only includes runners that have enough remaining slots to allocate + // this actor + let start = keys::subspace().pack( + &keys::datacenter::RunnersByRemainingSlotsKey::subspace_with_slots( + queue_value.image_id, + 1, + ), + ); + let runner_allocation_subspace = + keys::datacenter::RunnersByRemainingSlotsKey::subspace( + queue_value.image_id, + ); + let end = keys::subspace() + .subspace(&runner_allocation_subspace) + .range() + .1; + + // NOTE: This range read will include runners that were just inserted by the below code + // because fdb supports read-your-writes by default. This is the behavior we want. 
+ let mut stream = tx.get_ranges_keyvalues( + fdb::RangeOption { + mode: StreamingMode::Iterator, + // Containers bin pack so we reverse the order + reverse: true, + ..(start, end).into() + }, + // NOTE: This is not SERIALIZABLE because we don't want to conflict with all of the + // keys, just the one we choose + SNAPSHOT, + ); + + loop { + let Some(entry) = stream.try_next().await? else { + break; + }; + + let old_runner_allocation_key = keys::subspace() + .unpack::(entry.key()) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + + let data = old_runner_allocation_key + .deserialize(entry.value()) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + + // Add read conflict only for this key + tx.add_conflict_range( + entry.key(), + &end_of_key_range(entry.key()), + ConflictRangeType::Read, + )?; + + // Clear old entry + tx.clear(entry.key()); + + let new_remaining_slots = + old_runner_allocation_key.remaining_slots.saturating_sub(1); + + // Write new allocation key with 1 less slot + let new_allocation_key = keys::datacenter::RunnersByRemainingSlotsKey::new( + queue_value.image_id, + new_remaining_slots, + old_runner_allocation_key.runner_id, + ); + tx.set(&keys::subspace().pack(&new_allocation_key), entry.value()); + + // Update runner record + let remaining_slots_key = keys::runner::RemainingSlotsKey::new( + old_runner_allocation_key.runner_id, + ); + tx.set( + &keys::subspace().pack(&remaining_slots_key), + &remaining_slots_key + .serialize(new_remaining_slots) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + // Insert actor index key + let client_actor_key = + keys::client::Actor2Key::new(data.client_id, queue_key.actor_id); + tx.set( + &keys::subspace().pack(&client_actor_key), + &client_actor_key + .serialize(queue_value.generation) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + // Add read conflict for the queue key + tx.add_conflict_range( + queue_entry.key(), + 
&end_of_key_range(queue_entry.key()), + ConflictRangeType::Read, + )?; + tx.clear(queue_entry.key()); + // Clear sister queue key + tx.clear(&keys::subspace().pack(&queue_key.sister(queue_value.image_id))); + + results.push(ActorAllocation { + actor_id: queue_key.actor_id, + signal: Allocate { + runner_id: old_runner_allocation_key.runner_id, + new_runner: false, + client_id: data.client_id, + client_workflow_id: data.client_workflow_id, + }, + }); + continue 'queue_loop; + } + } + + // No available runner found, create a new one + + let runner_id = Uuid::new_v4(); + + let ping_threshold_ts = util::timestamp::now() - CLIENT_ELIGIBLE_THRESHOLD_MS; + + // Select a range that only includes clients that have enough remaining mem to allocate this + // actor + let start = keys::subspace().pack( + &keys::datacenter::ClientsByRemainingMemKey::subspace_with_mem( + client_flavor, + memory_mib, + ), + ); + let client_allocation_subspace = + keys::datacenter::ClientsByRemainingMemKey::subspace(client_flavor); + let end = keys::subspace() + .subspace(&client_allocation_subspace) + .range() + .1; + + let mut stream = tx.get_ranges_keyvalues( + fdb::RangeOption { + mode: StreamingMode::Iterator, + // Containers bin pack so we reverse the order + reverse: true, + ..(start, end).into() + }, + // NOTE: This is not SERIALIZABLE because we don't want to conflict with all of the keys, + // just the one we choose + SNAPSHOT, + ); + + loop { + let Some(entry) = stream.try_next().await? 
else { + break; + }; + + let old_client_allocation_key = keys::subspace() + .unpack::(entry.key()) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + + // Scan by last ping + if old_client_allocation_key.last_ping_ts < ping_threshold_ts { + continue; + } + + let client_workflow_id = + old_client_allocation_key + .deserialize(entry.value()) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + + // Add read conflict only for this key + tx.add_conflict_range( + entry.key(), + &end_of_key_range(entry.key()), + ConflictRangeType::Read, + )?; + + // Clear old entry + tx.clear(entry.key()); + + // Read old cpu + let remaining_cpu_key = + keys::client::RemainingCpuKey::new(old_client_allocation_key.client_id); + let remaining_cpu_key_buf = keys::subspace().pack(&remaining_cpu_key); + let remaining_cpu_entry = tx.get(&remaining_cpu_key_buf, SERIALIZABLE).await?; + let old_remaining_cpu = remaining_cpu_key + .deserialize(&remaining_cpu_entry.ok_or( + fdb::FdbBindingError::CustomError( + format!("key should exist: {remaining_cpu_key:?}").into(), + ), + )?) 
+ .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + + // Update allocated amount + let new_remaining_mem = old_client_allocation_key.remaining_mem - memory_mib; + let new_remaining_cpu = old_remaining_cpu - queue_value.cpu; + let new_allocation_key = keys::datacenter::ClientsByRemainingMemKey::new( + client_flavor, + new_remaining_mem, + old_client_allocation_key.last_ping_ts, + old_client_allocation_key.client_id, + ); + tx.set(&keys::subspace().pack(&new_allocation_key), entry.value()); + + tracing::debug!( + old_mem=%old_client_allocation_key.remaining_mem, + old_cpu=%old_remaining_cpu, + new_mem=%new_remaining_mem, + new_cpu=%new_remaining_cpu, + "allocating runner resources" + ); + + // Update client record + let remaining_mem_key = + keys::client::RemainingMemoryKey::new(old_client_allocation_key.client_id); + tx.set( + &keys::subspace().pack(&remaining_mem_key), + &remaining_mem_key + .serialize(new_remaining_mem) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + tx.set( + &remaining_cpu_key_buf, + &remaining_cpu_key + .serialize(new_remaining_cpu) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + let remaining_slots = + queue_value.build_allocation_total_slots.saturating_sub(1); + let total_slots = queue_value.build_allocation_total_slots; + + // Insert runner records + let remaining_slots_key = keys::runner::RemainingSlotsKey::new(runner_id); + tx.set( + &keys::subspace().pack(&remaining_slots_key), + &remaining_slots_key + .serialize(remaining_slots) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + let total_slots_key = keys::runner::TotalSlotsKey::new(runner_id); + tx.set( + &keys::subspace().pack(&total_slots_key), + &total_slots_key + .serialize(total_slots) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + let image_id_key = keys::runner::ImageIdKey::new(runner_id); + tx.set( + &keys::subspace().pack(&image_id_key), + &image_id_key + 
.serialize(queue_value.image_id) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + // Insert runner index key if multi. Single allocation per container runners don't need to be + // in the alloc idx because they only have 1 slot + if let BuildAllocationType::Multi = queue_value.build_allocation_type { + let runner_idx_key = keys::datacenter::RunnersByRemainingSlotsKey::new( + queue_value.image_id, + remaining_slots, + runner_id, + ); + tx.set( + &keys::subspace().pack(&runner_idx_key), + &runner_idx_key + .serialize(keys::datacenter::RunnersByRemainingSlotsKeyData { + client_id: old_client_allocation_key.client_id, + client_workflow_id, + }) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + } + + // Insert actor index key + let client_actor_key = keys::client::Actor2Key::new( + old_client_allocation_key.client_id, + queue_key.actor_id, + ); + tx.set( + &keys::subspace().pack(&client_actor_key), + &client_actor_key + .serialize(queue_value.generation) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?, + ); + + // Add read conflict for the queue key + tx.add_conflict_range( + queue_entry.key(), + &end_of_key_range(queue_entry.key()), + ConflictRangeType::Read, + )?; + tx.clear(queue_entry.key()); + // Clear sister queue key + tx.clear(&keys::subspace().pack(&queue_key.sister(queue_value.image_id))); + + results.push(ActorAllocation { + actor_id: queue_key.actor_id, + signal: Allocate { + runner_id, + new_runner: true, + client_id: old_client_allocation_key.client_id, + client_workflow_id, + }, + }); + + continue 'queue_loop; + } + } + + Ok(results) + }) + .custom_instrument(tracing::info_span!("client_allocate_pending_actors_tx")) + .await?; + + Ok(AllocatePendingActorsOutput { allocations: res }) +} + // TODO: This is called fairly frequently #[derive(Debug, Serialize, Deserialize, Hash)] struct UpdateMetricsInput { @@ -1186,6 +1661,10 @@ pub struct ToWs { pub struct PrewarmImage2 { pub image: protocol::Image, } + 
+#[signal("pegboard_client_check_queue")] +pub struct CheckQueue {} + #[message("pegboard_client_close_ws")] pub struct CloseWs { pub client_id: Uuid, @@ -1204,6 +1683,7 @@ join_signal!(Main { // Forwarded from the ws to this workflow Forward(protocol::ToServer), PrewarmImage2, + CheckQueue, Drain, Undrain, }); diff --git a/packages/edge/services/pegboard/standalone/usage-metrics-publish/src/lib.rs b/packages/edge/services/pegboard/standalone/usage-metrics-publish/src/lib.rs index 636cc46e8b..be5ea9ba25 100644 --- a/packages/edge/services/pegboard/standalone/usage-metrics-publish/src/lib.rs +++ b/packages/edge/services/pegboard/standalone/usage-metrics-publish/src/lib.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use build::types::{BuildAllocationType, BuildKind}; use chirp_workflow::prelude::*; -use fdb_util::SNAPSHOT; +use fdb_util::{FormalKey, SNAPSHOT}; use foundationdb::{self as fdb, options::StreamingMode}; use futures_util::{StreamExt, TryStreamExt}; use pegboard::{keys, protocol}; @@ -14,6 +14,13 @@ struct Usage { pub memory: u64, } +struct ImagePendingStats { + actors: u32, + slots: u32, + cpu: u64, + memory: u64, +} + pub async fn start(config: rivet_config::Config, pools: rivet_pools::Pools) -> GlobalResult<()> { let mut interval = tokio::time::interval(std::time::Duration::from_secs(7)); loop { @@ -51,7 +58,7 @@ pub async fn run_from_env( } // List all actor ids that are currently running - let actor_ids = ctx + let (actor_ids, pending_cpu, pending_mem) = ctx .fdb() .await? 
.run(|tx, _mc| async move { @@ -60,43 +67,106 @@ pub async fn run_from_env( let actor2_subspace = keys::subspace().subspace(&keys::client::Actor2Key::entire_subspace()); - tx.get_ranges_keyvalues( - fdb::RangeOption { - mode: StreamingMode::WantAll, - ..(&actor2_subspace).into() - }, - // Not serializable because we don't want to interfere with normal operations - SNAPSHOT, - ) - .chain(tx.get_ranges_keyvalues( + let actor_ids = tx + .get_ranges_keyvalues( + fdb::RangeOption { + mode: StreamingMode::WantAll, + ..(&actor2_subspace).into() + }, + // Not serializable because we don't want to interfere with normal operations + SNAPSHOT, + ) + .chain(tx.get_ranges_keyvalues( + fdb::RangeOption { + mode: StreamingMode::WantAll, + ..(&actor_subspace).into() + }, + // Not serializable because we don't want to interfere with normal operations + SNAPSHOT, + )) + .map(|res| match res { + Ok(entry) => { + if let Ok(key) = + keys::subspace().unpack::(entry.key()) + { + Ok(key.actor_id) + } else { + let key = keys::subspace() + .unpack::(entry.key()) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + + Ok(util::Id::from(key.actor_id)) + } + } + Err(err) => Err(Into::::into(err)), + }) + .try_collect::>() + .await?; + + let pending_actor_subspace = + keys::subspace().subspace(&keys::datacenter::PendingActorKey::subspace()); + let mut pending_resources_by_image_id = HashMap::new(); + + // Read all pending actor keys + let mut stream = tx.get_ranges_keyvalues( fdb::RangeOption { mode: StreamingMode::WantAll, - ..(&actor_subspace).into() + ..(&pending_actor_subspace).into() }, // Not serializable because we don't want to interfere with normal operations SNAPSHOT, - )) - .map(|res| match res { - Ok(entry) => { - if let Ok(key) = keys::subspace().unpack::(entry.key()) - { - Ok(key.actor_id) - } else { - let key = keys::subspace() - .unpack::(entry.key()) - .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; - - Ok(util::Id::from(key.actor_id)) - } - } - Err(err) 
=> Err(Into::::into(err)), - }) - .try_collect::>() - .await + ); + + loop { + let Some(entry) = stream.try_next().await? else { + break; + }; + + let key = keys::subspace() + .unpack::(entry.key()) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + + let value = key + .deserialize(entry.value()) + .map_err(|x| fdb::FdbBindingError::CustomError(x.into()))?; + + let entry = pending_resources_by_image_id + .entry(value.image_id) + .or_insert_with(|| ImagePendingStats { + actors: 0, + slots: value.build_allocation_total_slots, + cpu: value.cpu, + memory: value.memory / 1024 / 1024, + }); + + entry.actors += 1; + } + + // Calculate how many runners we need based on image + let (pending_cpu, pending_mem) = + pending_resources_by_image_id + .into_iter() + .fold((0, 0), |s, (_, stats)| { + let runner_count = stats.actors.div_ceil(stats.slots) as u64; + + ( + s.0 + stats.cpu * runner_count, + s.1 + stats.memory * runner_count, + ) + }); + + Ok((actor_ids, pending_cpu, pending_mem)) }) .custom_instrument(tracing::info_span!("fetch_running_actors_tx")) .await?; + pegboard::metrics::ACTOR_CPU_PENDING_ALLOCATION + .with_label_values(&[]) + .set(pending_cpu.try_into()?); + pegboard::metrics::ACTOR_MEMORY_PENDING_ALLOCATION + .with_label_values(&[]) + .set(pending_mem.try_into()?); + let actors_res = ctx .op(pegboard::ops::actor::get::Input { actor_ids, @@ -146,7 +216,7 @@ pub async fn run_from_env( .entry((actor.env_id, client_flavor)) .or_insert(Usage { cpu: 0, memory: 0 }); - let resources = match build.allocation_type { + match build.allocation_type { BuildAllocationType::None | BuildAllocationType::Single => { if let Some(resources) = &actor.resources { env_usage.cpu += resources.cpu_millicores as u64; diff --git a/packages/edge/services/pegboard/standalone/ws/src/lib.rs b/packages/edge/services/pegboard/standalone/ws/src/lib.rs index 9464c428a7..3f01760adb 100644 --- a/packages/edge/services/pegboard/standalone/ws/src/lib.rs +++ 
b/packages/edge/services/pegboard/standalone/ws/src/lib.rs @@ -350,13 +350,24 @@ async fn update_ping_thread_inner( // Only update ping if the workflow is not dead if wf.has_wake_condition { - ctx.op(pegboard::ops::client::update_allocation_idx::Input { - client_id, - client_workflow_id: workflow_id, - flavor, - action: pegboard::ops::client::update_allocation_idx::Action::UpdatePing, - }) - .await?; + let re_eligible = ctx + .op(pegboard::ops::client::update_allocation_idx::Input { + client_id, + client_workflow_id: workflow_id, + flavor, + action: pegboard::ops::client::update_allocation_idx::Action::UpdatePing, + }) + .await?; + + // If the client's last ping was over the eligibility threshold, tell it to check the pending + // actors queue + if re_eligible { + tracing::debug!(?client_id, "client has become eligible again"); + ctx.signal(pegboard::workflows::client::CheckQueue {}) + .to_workflow_id(workflow_id) + .send() + .await?; + } } } }