8 changes: 0 additions & 8 deletions .github/actions/pre-init/action.yaml
@@ -39,14 +39,6 @@ runs:
        echo 'AWS_SECRET_ACCESS_KEY=${{ inputs.SCCACHE_AWS_SECRET_ACCESS_KEY }}' >> $GITHUB_ENV
        echo 'AWS_ACCESS_KEY_ID=${{ inputs.SCCACHE_AWS_ACCESS_KEY_ID }}' >> $GITHUB_ENV

    # Cache generated Bolt files in order to prevent needless rebuilding
    - name: Bolt Cache
      uses: actions/cache@v3
      with:
        key: ${{ runner.os }}-bolt-gen
        path: |
          svc/pkg/region/ops/config-get/gen

    # MARK: Nix
    - uses: cachix/install-nix-action@v22
      with:
2 changes: 2 additions & 0 deletions .gitignore
@@ -29,11 +29,13 @@ Bolt.local.toml
!/secrets/README.md

# Generated code
gen/hash.txt
gen/build_script.sh
gen/svc/
gen/tf/
gen/docker/
gen/k8s/
svc/pkg/cluster/util/gen/hash.txt

# Rust
lib/**/Cargo.lock
6 changes: 6 additions & 0 deletions .rivet/config.yaml
@@ -0,0 +1,6 @@
cluster:
  api_endpoint: https://api.eg.rivet.gg
telemetry:
  disabled: false
tokens:
  cloud: null
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -91,6 +91,7 @@ and this project adheres to [Calendar Versioning](https://calver.org/).
- **Bolt** `bolt.confirm_commands` to namespace to confirm before running commands on a namespace
- `watch-requests` load test
- `mm-sustain` load test
- **Infra** Automatic server provisioning system ([Read more](/docs/packages/cluster/SERVER_PROVISIONING.md)).

### Changed

138 changes: 138 additions & 0 deletions docs/packages/cluster/AUTOSCALING.md
@@ -0,0 +1,138 @@
# Autoscaling

The autoscaler service runs every 15 seconds.

## Why memory?

The autoscaler uses CPU usage for GG nodes and memory usage for Job nodes. This is because certain cloud providers, like Linode, do not report the actual CPU clock speed, only the number of cores. This is problematic because we use Nomad's API to determine the usage on any given node, and it reports its stats in MHz.

## Hardware failover

Before a job server is provisioned, we don't know for sure what its specs will be because of the hardware failover system in `cluster-server-provision`. In the autoscaling process, all servers that aren't provisioned yet are assumed to have the specs of the first hardware option in the list.

### Failover has lower specs

In the event that the hardware that ends up being provisioned has lower specs than the first hardware option in the list, the autoscaler calculates the error between how much memory was expected and how much was actually provisioned. This error corresponds to how many more servers might be needed to reach the desired capacity.

Here is an example of the process in action:

| time since start | desired count | expected total memory | actual total memory |
| ---------------- | ------------- | --------------------- | ------------------- |
| 0s | 2 | 2000MB | 0MB |

We start with 0 servers provisioned and 2 desired. Our config consists of two hardware options, the first having 1000MB of memory and the second having 500MB. With our failover system, if the first option fails to provision, the second is provisioned instead.

| time since start | desired count | expected total memory | actual total memory |
| ---------------- | ------------- | --------------------- | ------------------- |
| 0s | 2 | 2000MB | 0MB |
| 15s | 3 | 2000MB | 1000MB |

After the first iteration, the autoscaler provisioned 2 servers, both of which failed over and together provided only 1000MB of memory. The autoscaler then calculates the error like so:

```rust
ceil((expected - actual) / expected_memory_per_server)

ceil((2000 - 1000) / 1000) = 1
```

So an extra server was added to the desired count.

Now, if the next server to be provisioned ends up having 1000MB like it should, we end up with the originally desired amount of memory.

| time since start | desired count | expected total memory | actual total memory |
| ---------------- | ------------- | --------------------- | ------------------- |
| 0s | 2 | 2000MB | 0MB |
| 15s | 3 | 2000MB | 1000MB |
| 30s | 3 | 2000MB | 2000MB |

The error calculation would now be:

```rust
ceil((3000 - 2000) / 1000) = 1
```

So the error count stays the same and we stay at 3 desired servers.

However, if the server provisioned was again a failover server, we would have this scenario:

| time since start | desired count | expected total memory | actual total memory |
| ---------------- | ------------- | --------------------- | ------------------- |
| 0s | 2 | 2000MB | 0MB |
| 15s | 3 | 2000MB | 1000MB |
| 30s | 4 | 2000MB | 1500MB |

We end up with two extra servers to provision atop our original 2.

```rust
ceil((3000 - 1500) / 1000) = 2
```

| time since start | desired count | expected total memory | actual total memory |
| ---------------- | ------------- | --------------------- | ------------------- |
| 0s | 2 | 2000MB | 0MB |
| 15s | 3 | 2000MB | 1000MB |
| 30s | 4 | 2000MB | 1500MB |
| 45s | 4 | 2000MB | 2000MB |

And finally we reach the desired capacity.

### Failover has higher specs

In the event that the failover hardware has higher specs than expected, there is no error system that reduces the desired count to account for the difference. This is because there is no direct correlation between the desired count and which hardware gets provisioned and destroyed. Thus, if hardware with higher-than-expected specs is provisioned, that extra capacity is not taken into account.

If it were taken into account with an error system similar to the one for lower-spec failover, it would look like this:

| time since start | desired count | expected total memory | actual total memory |
| ---------------- | ------------- | --------------------- | ------------------- |
| 0s | 1 | 1000MB | 2000MB |

Error:

```rust
ceil((expected - actual) / expected_memory_per_server)

ceil((1000 - 2000) / 1000) = -1
```

The original desired count plus the error would be 0, destroying the only server and dropping the capacity to 0. If the higher-spec failover kept getting provisioned, this would turn into a provision/destroy loop.
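
Putting both cases together, here is a minimal sketch of the error calculation, with negative error clamped to zero as described above. The function and parameter names are illustrative, not the actual service code:

```rust
/// Illustrative sketch of the provisioning error calculation; names are
/// hypothetical and memory is assumed to be tracked in MB.
fn provisioning_error(
    desired_count: u64,
    actual_total_memory_mb: u64,
    expected_memory_per_server_mb: u64,
) -> u64 {
    let expected_total_mb = desired_count * expected_memory_per_server_mb;
    // saturating_sub clamps the higher-spec case to 0 instead of going negative
    let shortfall_mb = expected_total_mb.saturating_sub(actual_total_memory_mb);
    // ceil((expected - actual) / expected_memory_per_server)
    shortfall_mb.div_ceil(expected_memory_per_server_mb)
}

// provisioning_error(2, 1000, 1000) == 1 and provisioning_error(3, 2000, 1000) == 1,
// matching the tables above.
```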

## Job server autoscaling

The Nomad topology for each Job server in a datacenter is fetched and the memory usage is aggregated. This value is then divided by the expected memory capacity per server (the capacity of the first hardware option in the config), which determines the minimum number of servers required to accommodate the current usage. We then add the error value (discussed above) and the margin value configured in the namespace config.
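
As a rough sketch of that formula (illustrative names only, not the actual service code):

```rust
/// Illustrative sketch of the Job server desired count; names are hypothetical.
fn job_desired_count(
    nomad_memory_usage_mb: u64,         // aggregated from the Nomad topology
    expected_memory_per_server_mb: u64, // first hardware option in the config
    error: u64,                         // from the failover error calculation above
    margin: u64,                        // configured in the namespace config
) -> u64 {
    // Minimum servers required to accommodate current usage, rounded up,
    // plus the failover error and the configured margin.
    nomad_memory_usage_mb.div_ceil(expected_memory_per_server_mb) + error + margin
}
```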

### Autoscaling via machine learning

Coming soon

## GG server autoscaling

Because we do not need to be preemptive with GG servers, the autoscaling logic is simpler:

- If the current CPU usage leaves less than 20% of a single server's CPU capacity unused, add a server (see the sketch below).
- If the current CPU usage leaves more than 130% of a single server's CPU capacity unused, remove a server.
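
A minimal sketch of this rule, assuming the 20%/130% thresholds are fractions of a single server's CPU capacity (an assumption inferred from the examples below):

```rust
/// Illustrative sketch of the GG scaling decision; threshold semantics are an
/// assumption inferred from the examples below.
enum GgScaleAction {
    Add,
    Remove,
    Nothing,
}

fn gg_scale_action(total_cpu: f64, cpu_usage: f64, cpu_per_server: f64) -> GgScaleAction {
    let headroom = total_cpu - cpu_usage;
    if headroom < 0.2 * cpu_per_server {
        GgScaleAction::Add // almost no spare CPU left
    } else if headroom > 1.3 * cpu_per_server {
        GgScaleAction::Remove // more than a full server's worth of spare CPU
    } else {
        GgScaleAction::Nothing
    }
}
```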

Examples:

```rust
// 3 servers
total_cpu = 300
cpu_usage = 285

// result: add a server
```

```rust
// 1 server
total_cpu = 100
cpu_usage = 70

// result: do nothing
```

```rust
// 4 servers
total_cpu = 400
cpu_usage = 250

// result: remove a server
```
43 changes: 43 additions & 0 deletions docs/packages/cluster/SERVER_PROVISIONING.md
@@ -0,0 +1,43 @@
# Automatic Server Provisioning

Server provisioning handles everything required to get servers running and installed so that game lobbies can run on them. Server provisioning occurs in the `cluster` package, and servers are automatically brought up and down to the desired levels via `cluster-datacenter-scale`.

## Motivation

Server provisioning was created to allow quick and stateful configuration of the game server topology on Rivet. This system was also written with the intention of allowing clients to choose their own hardware options and server providers.

In the future, an autoscaling system will be hooked up to the provisioning system so that it can scale up to meet spikes in demand and scale down when load decreases to save on costs.

## Basic structure

There are currently three types of servers that work together to host game lobbies:

- ### ATS

ATS servers host game images via Apache Traffic Server. The caching provided by ATS, along with the ATS node being in the same datacenter as the Job node, allows for very quick lobby start times.

- ### Job

Job servers run Nomad which handles the orchestration of the game lobbies themselves.

- ### GG

GameGuard nodes serve as a proxy for all incoming game connections and provide DoS protection.

## Why are servers in the same availability zone (aka datacenter or region)?

Servers are placed in the same region for two reasons:

1. ### VLAN + Network Constraints

Servers rely on a VLAN to communicate with each other.

2. ### Latency

Having all of the required components to run a Job server on the edge (i.e. in the same datacenter) allows for very quick lobby start times.

## Prior art

- https://console.aiven.io/project/rivet-3143/new-service?serviceType=pg
- https://karpenter.sh/docs/concepts/nodepools/
- Nomad autoscaler
66 changes: 66 additions & 0 deletions docs/packages/cluster/TLS_AND_DNS.md
@@ -0,0 +1,66 @@
# [rivet.run](http://rivet.run) DNS & TLS Configuration

## Moving parts

#### TLS Cert

- Can only have 1 wildcard
  - i.e. `*.lobby.{dc_id}.rivet.run`
- Takes a long time to issue
- Prone to Let's Encrypt downtime and [rate limits](https://letsencrypt.org/docs/rate-limits/)
  - Nathan requested a rate limit increase for when this is needed

#### DNS record

- Must point to the IP of the datacenter we need
  - i.e. `*.lobby.{dc_id}.rivet.run` goes to the GG node for the given datacenter
- `*.rivet.run` will not work as a static DNS record because you can’t point it at a single datacenter

#### GG host resolution

- When a request hits the GG server for HTTP(S) or TCP+TLS requests, we need to be able to resolve the lobby to send it to
- This is why the lobby ID needs to be in the DNS name

#### GG autoscaling

- The IPs that the DNS records point to change frequently as GG nodes scale up and down

## Design

#### DNS records

Dynamically create a DNS record for each GG node formatted like `*.lobby.{dc_id}.rivet.run`. Example:

```bash
A *.lobby.51f3d45e-693f-4470-b86d-66980edd87ec.rivet.run 1.2.3.4 # DC foo, GG node 1
A *.lobby.51f3d45e-693f-4470-b86d-66980edd87ec.rivet.run 5.6.7.8 # DC foo, GG node 2
A *.lobby.51f3d45e-693f-4470-b86d-66980edd87ec.rivet.run 9.10.11.12 # DC bar, GG node 1
```

The IPs of these records change as the GG nodes scale up and down, but the record names stay the same.

#### TLS certs

Each datacenter needs its own TLS cert. For the example above, we need a wildcard cert covering `*.lobby.{dc_id}.rivet.run` for each datacenter (e.g. `*.lobby.51f3d45e-693f-4470-b86d-66980edd87ec.rivet.run`).

## TLS

#### TLS cert provider

Currently we use Let's Encrypt as our TLS certificate provider.

Alternatives:

- ZeroSSL

#### TLS cert refreshing

Right now, the TLS certs are issued in the Terraform plan. Eventually, TLS certs should renew on their own automatically.

## TLS Alternatives

#### Use `*.rivet.run` TLS cert with custom DNS server

Create an `NS` record for `*.rivet.run` pointed at our custom DNS server.

This would let us use a single static TLS cert.
23 changes: 23 additions & 0 deletions fern/definition/admin/cluster/__package__.yml
@@ -0,0 +1,23 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/fern-api/fern/main/fern.schema.json

imports:
  localCommons: ../common.yml

service:
  auth: true
  base-path: /cluster
  endpoints:
    getServerIps:
      path: /server_ips
      method: GET
      request:
        name: GetServerIpsRequest
        query-parameters:
          server_id: optional<uuid>
          pool: optional<localCommons.PoolType>
      response: GetServerIpsResponse

types:
  GetServerIpsResponse:
    properties:
      ips: list<string>
7 changes: 7 additions & 0 deletions fern/definition/admin/common.yml
@@ -0,0 +1,7 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/fern-api/fern/main/fern.schema.json
types:
  PoolType:
    enum:
      - job
      - gg
      - ats
34 changes: 0 additions & 34 deletions fern/definition/cloud/common.yml
@@ -328,9 +328,6 @@ types:
      provider:
        docs: The server provider of this region.
        type: string
      universal_region:
        docs: A universal region label given to this region.
        type: UniversalRegion
      provider_display_name:
        docs: Represent a resource's readable display name.
        type: string
@@ -434,37 +431,6 @@ types:
          USD, 1,000,000,000,000 = $1.00).
        type: integer

  UniversalRegion:
    enum:
      - unknown
      - local
      - amsterdam
      - atlanta
      - bangalore
      - dallas
      - frankfurt
      - london
      - mumbai
      - newark
      - new_york_city
      - san_francisco
      - singapore
      - sydney
      - tokyo
      - toronto
      - washington_dc
      - chicago
      - paris
      - seattle
      - sao_paulo
      - stockholm
      - chennai
      - osaka
      - milan
      - miami
      - jakarta
      - los_angeles

  NamespaceFull:
    docs: A full namespace.
    properties:
4 changes: 3 additions & 1 deletion fern/definition/cloud/games/versions.yml
@@ -23,7 +23,9 @@ service:
    reserveVersionName:
      path: /reserve-name
      method: POST
      docs: Reserves a display name for the next version. Used to generate a monotonically increasing build number without causing a race condition with multiple versions getting created at the same time.
      docs: >-
        Reserves a display name for the next version. Used to generate a monotonically increasing build
        number without causing a race condition with multiple versions getting created at the same time.
      response: ReserveVersionNameResponse

    validateGameVersion: