Dev #45 (Merged)

16 changes: 10 additions & 6 deletions .claude/CLAUDE.md
@@ -24,12 +24,12 @@ The test infrastructure (`tests/helpers/ref_instance.py`) automatically sets thi
./ctrl.sh build

# Start services
# For development, always use --debug and --hot-reloading:
# --debug enables Flask debug mode and verbose logging
# --hot-reloading enables Flask auto-reload and runs the spa-frontend
# under `vite dev` (Vite HMR) instead of a static build
./ctrl.sh up --debug --hot-reloading
./ctrl.sh up # production-style start, no HMR
./ctrl.sh up --debug # debug mode, no HMR

# Rebuild and recreate specific services (e.g. after code changes):
./ctrl.sh recreate frontend-proxy # SPA changes
./ctrl.sh recreate web # webapp changes
./ctrl.sh recreate frontend-proxy web # both

# Stop services
./ctrl.sh stop # Keep containers
@@ -209,6 +209,10 @@ Client (ssh exercise@host -p 2222)
- `/data/ssh-proxy/` - SSH proxy state
- `/data/log/` - Application logs

## Data Directory

Never create, edit, or delete files under `data/` unless explicitly asked by the user. This directory contains live exercise definitions, student data, and database files. The exercise import path is configured in `settings.yaml` (default: `ref/exercises`).

## Code Comments

- Do not reference line numbers in comments (e.g., "see ssh.py lines 397-404"). Line numbers change frequently and become outdated. Reference functions, classes, or use direct code references instead.
127 changes: 103 additions & 24 deletions EXERCISES.md
@@ -67,45 +67,32 @@ To ease your work and aid the students during solving the exercises, REF allows
> [!NOTE]
> Automated solution checking requires setting `submission-test: True` in `settings.yml`

The automated tests to run are described as a Python file called `submission_tests`. An exemplary file looks like this:
The automated tests to run are described as a Python file called `submission_tests`. The simplest form returns a boolean indicating pass/fail:

```Python
#!/usr/bin/env python3

# custom imports for this task
from pathlib import Path
from typing import List, Optional

import subprocess


# REQUIRED IMPORTS
import ref_utils as rf
rf.ref_util_install_global_exception_hook()
from ref_utils import print_ok, print_warn, print_err, assert_is_file, assert_is_exec, add_environment_test, add_submission_test, drop_privileges



rf.ref_util_install_global_exception_hook()
from ref_utils import print_ok, print_err, assert_is_file, assert_is_exec, environment_test, submission_test

################################################################

TARGET_BIN = Path("/home/user/shellcode")

@add_environment_test() # type: ignore
@environment_test() # type: ignore
def test_environment() -> bool:
    """
    Test whether all files that should be submitted are in place.
    """
    """Check whether all required files are in place."""
    tests_passed = True
    tests_passed &= assert_is_exec(TARGET_BIN)
    return tests_passed


@add_submission_test() # type: ignore
@submission_test() # type: ignore
def test_submission() -> bool:
    """
    Test if the submitted code successfully solves the exercise.
    """
    """Check if the submitted code successfully solves the exercise."""
    ret, out = rf.run_with_payload(['make', '-B'])
    if ret != 0:
        print_err(f'[!] Failed to build! {out}')
@@ -116,17 +103,109 @@ def test_submission() -> bool:
    return True

rf.run_tests()
```

The Python file imports `ref_utils`, which provides two types of tests and various convenience functions.

Functions are converted into either an `environment test` or a `submission test` by using the respective decorator, which registers them. When testing a submission, first all environment tests are run. If one fails, testing is aborted (and the student informed about the failure). In the example above, the environment test checks whether an executable called `shellcode` exists. Once all environment tests pass, the submission test is executed. This two-stage design lets you first verify prerequisites (e.g., that specific binaries have been compiled) before checking whether their behavior matches the expected one.

When needed, both decorators accept an optional `task_name` argument (e.g., `@submission_test(task_name="part_one")`) by which specific tests can be grouped into independent tasks. A failure in task `"part_one"` will not abort the running of `"part_two"`. Each task can have multiple `@environment_test` functions but only one `@submission_test`.

Finally, `submission_tests` needs to call `rf.run_tests()` to execute all registered tests. To avoid leaking critical information (when hitting unexpected conditions in the submission tests themselves), `ref_utils` suppresses error output using `rf.ref_util_install_global_exception_hook()`. The `ref_utils` module provides various convenience functions, such as colored printing (`print_err`, `print_warn`, `print_ok`) or executing binaries with a specific payload (`rf.run_with_payload(..)`).

### Scored Exercises

The example above uses a boolean return value — the submission either passes or fails. For exercises that need a numeric score (e.g., code coverage percentage, number of tests passed, performance benchmarks), the `@submission_test` function can return a `TestResult` instead of a `bool`.

A `TestResult` carries two fields:

- `success` (`bool`) — whether the submission is considered successful.
- `score` (`float | None`) — the numeric score achieved. This value is recorded per task and displayed on the scoreboard.

Here is a minimal scored example:

```Python
#!/usr/bin/env python3

from pathlib import Path
import ref_utils as rf

rf.ref_util_install_global_exception_hook()
from ref_utils import (
    print_ok,
    print_err,
    assert_is_file,
    environment_test,
    submission_test,
    TestResult,
)

################################################################

SO_PATH = Path("/home/user/libgenerator.so")

@environment_test() # type: ignore
def test_environment() -> bool:
    return assert_is_file(SO_PATH) # type: ignore


@submission_test() # type: ignore
def test_submission() -> TestResult:
    coverage = run_coverage_measurement() # your scoring logic here
    print_ok(f"[+] You got {coverage:.02f}% coverage")
    return TestResult(success=True, score=coverage)

rf.run_tests()
```

There's a lot going on, so let's dissect this step-by-step. The Python file needs to import ref_utils, which provides two types of tests and various convenience functions.
The key differences from a pass/fail test:

Functions are converted into either an `environment test` or a `submission test` by using the respective decorator, which registers them. Conceptually, these two types are similar. When testing a submission, first all environment tests are run. If one fails, testing is aborted (and the user informed about the failure). In our example code above, the environment test merely checks whether the student created an executable called `shellcode`. Once all environment tests pass, the submission test(s) will be executed. This two-stage design enables to first test whether all prerequisites are in-place (for example, specific binaries have been compiled) via the environment tests before then checking whether their behavior matches the expected one via the submission tests. A failure in any test will abort the execution of subsequent ones.
1. Import `TestResult` from `ref_utils`.
2. Annotate the `@submission_test` function to return `TestResult` instead of `bool`.
3. Return `TestResult(success=..., score=...)` where `score` is the raw numeric value.

When needed, both decorators accept an optional `group: str` argument (e.g., `@add_submission_test(group="task_part_one")`) by which specific tests can be grouped. Grouping allows to run multiple, independent test groups; in particular, a failure in test group "task_part_one" will not abort the running of "task_part_two".
The raw score is stored in the database and shown on the scoreboard. Admins can optionally configure per-task **scoring policies** in the web interface to transform raw scores into final points. The available scoring modes are:

| Mode | Description | Parameters |
|------|-------------|------------|
| `none` (default) | Pass raw score through unchanged | — |
| `linear` | Linearly map a raw score range to points: `(raw - min_raw) / (max_raw - min_raw) * max_points`, clamped to `[0, max_points]` | `min_raw`, `max_raw`, `max_points` |
| `threshold` | Award fixed points if raw score meets a threshold, otherwise 0 | `threshold`, `points` |
| `tiered` | Multiple threshold tiers; the highest matching tier's points are awarded | `tiers` (list of `{above, points}`) |
| `discard` | Omit the task from scoring entirely (contributes 0, hidden from breakdown) | — |
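
The `linear`, `threshold`, and `tiered` rules can be sketched in plain Python. The following is an illustrative reimplementation of the semantics in the table above, not REF's actual server-side code; in particular, the `apply_policy` helper name and the inclusive (`>=`) comparisons for threshold and tier bounds are assumptions:

```python
from typing import Optional

def apply_policy(raw: float, policy: dict) -> Optional[float]:
    """Illustrative sketch: map a raw score to final points.

    Hypothetical helper, not part of ref_utils; REF applies these
    policies server-side based on the admin configuration.
    """
    mode = policy.get("mode", "none")
    if mode == "none":
        return raw  # raw score passes through unchanged
    if mode == "linear":
        span = policy["max_raw"] - policy["min_raw"]
        points = (raw - policy["min_raw"]) / span * policy["max_points"]
        # Clamp into [0, max_points] as the table describes.
        return max(0.0, min(points, policy["max_points"]))
    if mode == "threshold":
        # Assumes "meets a threshold" is an inclusive comparison.
        return policy["points"] if raw >= policy["threshold"] else 0.0
    if mode == "tiered":
        # Award the points of the highest tier whose bound is met.
        matching = [t for t in policy["tiers"] if raw >= t["above"]]
        if not matching:
            return 0.0
        return max(matching, key=lambda t: t["above"])["points"]
    if mode == "discard":
        return None  # task contributes nothing to scoring
    raise ValueError(f"unknown scoring mode: {mode}")

linear = {"mode": "linear", "min_raw": 50.0, "max_raw": 100.0, "max_points": 10.0}
print(apply_policy(75.0, linear))  # 75 raw in a 50-100 range -> 5.0 points
```

Under the same linear policy, a raw score of 120 clamps to 10.0 points and a raw score below `min_raw` clamps to 0.0.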

#### Adapting behavior based on check vs. submit

Students can run `task check` (quick feedback loop) or `task submit` (final graded submission). Use `rf.test_result_will_be_submitted()` to detect which mode the test is running in and adjust accordingly (e.g., run a shorter measurement during check, full measurement during submit):

```Python
@submission_test() # type: ignore
def test_submission() -> TestResult:
    if rf.test_result_will_be_submitted():
        duration = 1800 # 30 minutes for final submission
    else:
        duration = 10 # quick check
    score = run_measurement(duration)
    return TestResult(success=True, score=score)
```

#### Multi-task scored exercises

Exercises with multiple independently scored parts combine `task_name` with `TestResult`:

```Python
@submission_test(task_name="correctness") # type: ignore
def test_correctness() -> TestResult:
    passed = run_correctness_checks()
    return TestResult(success=passed > 0, score=passed)

@submission_test(task_name="performance") # type: ignore
def test_performance() -> TestResult:
    throughput = measure_throughput()
    return TestResult(success=True, score=throughput)
```

Finally, `submission_tests` needs to call `rf.run_tests()` to execute all registered tests. To avoid leaking critical information (when hitting unexpected conditions in the submission tests themselves), `ref_utils` suppresses error output using `rf.ref_util_install_global_exception_hook()`. The `ref_utils` module provides various convenience functions, such as colored printing (print_err, print_warn, print_ok) or executing binaries with a specific payload (rf.run_with_payload(..)).
Each task produces its own `TestResult` and can have its own scoring policy configured in the admin interface. Tasks are independent — a failure in one does not affect the others.



15 changes: 14 additions & 1 deletion ctrl.sh
@@ -75,6 +75,11 @@ Commands:
restart
Restart all services (disconnects currently connected users).

recreate [SERVICE...]
Rebuild and recreate containers from the new images. Use this
instead of restart when image contents changed (e.g. SPA or
webapp code).

restart-web
Restart only the web interface (users stay connected via SSH).

@@ -349,7 +354,7 @@ function build {
(
info "=> Building docker base image"
cd 'ref-docker-base'
./build.sh "$@"
./build.sh
)
(
info "=> Building release container"
@@ -454,6 +459,11 @@ function restart {
execute_cmd $DOCKER_COMPOSE --env-file $ENV_SETTINGS_FILE -p ref restart "$@"
}

function recreate {
execute_cmd $DOCKER_COMPOSE --env-file $ENV_SETTINGS_FILE -p ref build "$@"
execute_cmd $DOCKER_COMPOSE --env-file $ENV_SETTINGS_FILE -p ref up -d --force-recreate "$@"
}

function ps {
execute_cmd $DOCKER_COMPOSE --env-file $ENV_SETTINGS_FILE -p ref ps "$@"
}
@@ -503,6 +513,9 @@ case "$cmd" in
restart)
restart "$@"
;;
recreate)
recreate "$@"
;;
restart-web)
restart web "$@"
;;
2 changes: 2 additions & 0 deletions docker-compose.template.yml
@@ -92,6 +92,8 @@ services:
- {{ exercises_path }}:/exercises
#Make docker available inside the container
- /var/run/docker.sock:/var/run/docker.sock
#Container SSH public keys, bind-mounted into student containers at runtime
- ./container-keys:/container-keys:ro
#Source for ref-utils, bind-mounted read-only into student
#instances so edits on the host apply without rebuilding images.
- type: bind
14 changes: 6 additions & 8 deletions ref-docker-base/Dockerfile
@@ -59,9 +59,7 @@ RUN groupadd -g 9999 user && useradd -g 9999 -u 9999 -d /home/user -m -s /bin/ba

WORKDIR /root

COPY container-keys/root_key.pub .ssh/authorized_keys
RUN chown root:root .ssh/authorized_keys \
&& chmod 644 .ssh/authorized_keys
RUN mkdir -p .ssh && chmod 700 .ssh

WORKDIR /home/user

@@ -77,11 +75,11 @@ set tabsize 4
set tabstospaces
EOF

# Deploy the default ssh-key that is used for authentication by the ssh entry server as user "user".
RUN mkdir .ssh
COPY container-keys/user_key.pub .ssh/authorized_keys
RUN chown root:root .ssh/authorized_keys \
&& chmod 644 .ssh/authorized_keys
RUN mkdir .ssh && chmod 700 .ssh

# Directory for master keys volume-mounted from the host at runtime.
# sshd_config references /etc/ssh/master_keys/%u for key lookup.
RUN mkdir -p /etc/ssh/master_keys

COPY sshd_config /etc/ssh/sshd_config

38 changes: 19 additions & 19 deletions ref-docker-base/pyproject.toml
@@ -5,29 +5,29 @@ description = "Docker base image dependencies for REF"
requires-python = ">=3.10"
dependencies = [
"backcall==0.2.0",
"blinker==1.7.0",
"cerberus==1.3.7",
"chardet==5.2.0",
"blinker==1.9.0",
"cerberus==1.3.8",
"chardet==7.4.3",
"distro==1.9.0",
"importlib-resources==6.5.2",
"ipython==8.31.0",
"importlib-resources==7.1.0",
"ipython==8.39.0",
"itsdangerous==2.2.0",
"lazy-object-proxy==1.10.0",
"matplotlib==3.10.0",
"mypy==1.14.1",
"oauthlib==3.2.2",
"opencv-python==4.11.0.86",
"lazy-object-proxy==1.12.0",
"matplotlib==3.10.8",
"mypy==1.20.1",
"oauthlib==3.3.1",
"opencv-python==4.13.0.92",
"pathlib2==2.3.7.post1",
"pickleshare==0.7.5",
"pwntools==4.14.0",
"pyjwt==2.7.0",
"pylint==3.3.4",
"pyyaml==6.0.1",
"requests-unixsocket==0.3.0",
"tomli==2.2.1",
"tqdm==4.67.1",
"wrapt==1.17.2",
"zipp==3.21.0",
"pwntools==4.15.0",
"pyjwt==2.12.1",
"pylint==4.0.5",
"pyyaml==6.0.3",
"requests-unixsocket==0.4.1",
"tomli==2.4.1",
"tqdm==4.67.3",
"wrapt==2.1.2",
"zipp==3.23.1",
]

[tool.uv]
2 changes: 1 addition & 1 deletion ref-docker-base/ref-utils
Submodule ref-utils updated 2 files
+22 −12 README.md
+356 −176 uv.lock
6 changes: 4 additions & 2 deletions ref-docker-base/sshd_config
@@ -36,8 +36,10 @@ StrictModes no

#PubkeyAuthentication yes

# Expect .ssh/authorized_keys2 to be disregarded by default in future.
#AuthorizedKeysFile .ssh/authorized_keys .ssh/authorized_keys2
# Check both the per-user authorized_keys (student personal key) and the
# master keys volume-mounted from the host. sshd expands %u to the
# connecting username ("root" or "user").
AuthorizedKeysFile .ssh/authorized_keys /etc/ssh/master_keys/%u

#AuthorizedPrincipalsFile none
