Skip to content

Commit

Permalink
start of work to add flux
Browse files Browse the repository at this point in the history
Flux has a race condition where after submit (when we get
back a jobid) the file might still be needed. This means
that we need custom logic to not delete the temporary
file until cancel / worker completion. Flux also does
better when given an executable file and a full path,
so the submit function is modified accordingly.
Finally, Flux does not support the concepts of a memory
limit or an account.

Signed-off-by: vsoch <vsoch@users.noreply.github.com>
  • Loading branch information
vsoch committed Apr 22, 2023
1 parent af044b4 commit 43229c7
Show file tree
Hide file tree
Showing 20 changed files with 711 additions and 30 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
strategy:
fail-fast: false
matrix:
jobqueue: ["htcondor", "pbs", "sge", "slurm", "none"]
jobqueue: ["flux", "htcondor", "pbs", "sge", "slurm", "none"]

steps:
- name: Cancel previous runs
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ ci/slurm/environment.yml
ci/pbs/environment.yml
ci/sge/environment.yml
ci/htcondor/environment.yml
ci/flux/environment.yml
ci/flux/conf/tmp
.vscode/
ca.pem
key.pem
45 changes: 45 additions & 0 deletions ci/flux.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env bash

# Build the flux images, start the compose cluster, and prepare the
# shared space used by the tests. Runs before dependency installation.
function jobqueue_before_install {
  docker version
  docker-compose version

  # Build images and start flux cluster.
  # Guard each cd so a missing directory fails loudly instead of letting
  # docker-compose run from the wrong place (shellcheck SC2164).
  cd ./ci/flux || return 1
  cp ../environment.yml ./environment.yml
  docker-compose build node-1
  docker-compose up -d
  cd - || return 1

  # Set shared space permissions (use sudo-equivalent via root exec: the
  # directory is owned by root and the tests run as the flux user)
  docker exec node-1 /bin/bash -c "chmod -R 777 /shared_space"

  docker ps -a
  docker images
  show_network_interfaces
}

# Print the network interfaces visible inside each cluster container,
# as reported by psutil from within the container's python.
function show_network_interfaces {
  local container
  for container in node-1 node-2 node-3; do
    printf '%s\n' '------------------------------------------------------------'
    printf 'docker container: %s\n' "$container"
    docker exec "$container" python -c 'import psutil; print(psutil.net_if_addrs().keys())'
    printf '%s\n' '------------------------------------------------------------'
  done
}

# Install dask-jobqueue (editable mode) inside the primary container.
function jobqueue_install {
  docker exec node-1 /bin/bash -c 'cd /dask-jobqueue; pip install -e .'
}

# Run the flux-marked portion of the test suite inside the primary container.
function jobqueue_script {
  docker exec node-1 /bin/bash -c 'cd; pytest /dask-jobqueue/dask_jobqueue --verbose -E flux -s'
}

# Dump flux job state for post-mortem debugging, then tear down the
# compose cluster.
function jobqueue_after_script {
  docker exec node-1 bash -c 'flux jobs -a'
  # Guard the cd so teardown never runs docker-compose from the wrong
  # directory (shellcheck SC2164).
  cd ./ci/flux || return 1
  docker-compose stop
  docker-compose rm --force
  cd - || return 1
}
39 changes: 39 additions & 0 deletions ci/flux/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
FROM fluxrm/flux-sched:el8
# Number of cluster nodes, including the broker (node-1)
ARG replicas=3
ENV workers=${replicas}
USER root

# These are the STATE_DIR, system, and resources directories
ENV STATE_DIR=/var/lib/flux
ENV LC_ALL en_US.UTF-8
RUN mkdir -p ${STATE_DIR} /etc/flux/system /etc/flux/system/cron.d /etc/flux/config /run/flux && \
    mkdir -p /etc/flux/system/cron.d && \
    mkdir -p /mnt/curve && \
    flux keygen /mnt/curve/curve.cert && \
    # Resource set spans the broker plus all workers: node-1..node-N
    flux R encode --hosts="node-[1-${workers}]" > /etc/flux/system/R

WORKDIR /home/fluxuser
RUN pip3 install --upgrade pip && \
    pip3 install pika --upgrade

# bind-utils provides nslookup (used by the entrypoint to derive hostnames)
RUN yum install -y iproute bind-utils

# Use mamba for slightly faster install.
# NOTE: mambaforge is installed to /opt/anaconda, so PATH must point at
# /opt/anaconda/bin (the previous /opt/conda/bin was a typo).
RUN /bin/bash -c "curl -L https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh > mambaforge.sh && \
    bash mambaforge.sh -b -p /opt/anaconda && \
    rm mambaforge.sh" && \
    export PATH=/opt/anaconda/bin:$PATH && \
    /opt/anaconda/bin/conda clean -tipy
ENV PATH /opt/anaconda/bin:$PATH

# environment.yml file is copied by CI script. If manually building, you should copy it too from parent directory
COPY environment.yml .
RUN mamba env update -n base --file environment.yml

# Important! In production flux should not be run as root
# USER fluxuser
WORKDIR /home/fluxuser/
COPY ./conf/entrypoint.sh ./
ENTRYPOINT /bin/bash /home/fluxuser/entrypoint.sh
22 changes: 22 additions & 0 deletions ci/flux/conf/broker.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Flux system-instance configuration shared by all CI cluster containers.

# Flux needs to know the path to the IMP executable
[exec]
imp = "/usr/libexec/flux/flux-imp"

# Allow both guest users and root owners to access the instance
# (convenient for CI; not a production configuration).
[access]
allow-guest-user = true
allow-root-owner = true

# Point to resource definition generated with flux-R(1).
# noverify skips validating the declared resources against the live host.
[resource]
path = "/etc/flux/system/R"
noverify = true

# Static bootstrap: brokers discover each other over the compose network
# using the shared curve certificate baked into the image.
[bootstrap]
curve_cert = "/mnt/curve/curve.cert"
default_port = 8050
default_bind = "tcp://eth0:%%p"
default_connect = "tcp://%%h:%%p"
# Hosts are numbered from 1 because docker-compose numbers replicas from 1.
hosts = [
    { host="node-[1-3]"},
]
64 changes: 64 additions & 0 deletions ci/flux/conf/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/bin/bash
# Entrypoint for the CI flux containers. Must run under bash: the
# hostname derivation below uses arrays and ${var//} substitution,
# which are not POSIX sh features (the previous '#!/bin/sh' shebang
# would break wherever sh is dash/ash).

# Broker Options: important!
# The local-uri setting places the unix domain socket in rundir
# if FLUX_URI is not set, tools know where to connect.
# -Slog-stderr-level= can be set to 7 for larger debug level
# or exposed as a variable
brokerOptions="-Scron.directory=/etc/flux/system/cron.d \
  -Stbon.fanout=256 \
  -Srundir=/run/flux \
  -Sstatedir=${STATE_DIRECTORY:-/var/lib/flux} \
  -Slocal-uri=local:///run/flux/local \
  -Slog-stderr-level=6 \
  -Slog-stderr-mode=local"

# quorum settings influence how the instance treats missing ranks
# by default all ranks must be online before work is run, but
# we want it to be OK to run when a few are down
# These are currently removed because we want the main rank to
# wait for all the others, and then they clean up nicely
# -Sbroker.quorum=0 \
# -Sbroker.quorum-timeout=none \

# This should be added to keep running as a service
# -Sbroker.rc2_none \

# Derive hostname (this is a hack to recover the name defined by the
# docker-compose network from this container's IP via reverse lookup)
address=$(nslookup "$(hostname -i)" | head -n 1)
# shellcheck disable=SC2206 -- word splitting is intentional here
parts=(${address//=/ })
hostName=${parts[2]}
# Keep only the short name (strip the domain components)
thisHost=(${hostName//./ })
thisHost=${thisHost[0]}
echo "${thisHost}"

# Export this hostname so flux reports it instead of the container id
export FLUX_FAKE_HOSTNAME=${thisHost}

cd "${workdir}" || exit 1
printf "\n👋 Hello, I'm %s\n" "${thisHost}"
printf 'The main host is %s\n\n' "${mainHost}"
printf '🔍️ Here is what I found in the working directory, %s\n' "${workdir}"
ls "${workdir}"

# --cores=IDS Assign cores with IDS to each rank in R, so we assign 1-N to 0
printf '\n📦 Resources\n'
sudo cat /etc/flux/system/R

printf '\n🦊 Independent Minister of Privilege\n'
cat /etc/flux/imp/conf.d/imp.toml

# The curve cert is generated on container build
# We assume the munge.key is the same also since we use the same base container!
# located at /etc/munge/munge.key

# Give broker time to start before workers.
# NOTE: ${brokerOptions} is deliberately unquoted below — it must
# word-split into individual -S options.
if [ "${thisHost}" != "${mainHost}" ]; then
  printf '\n😪 Sleeping to give broker time to start...\n'
  sleep 15
  FLUX_FAKE_HOSTNAME=${thisHost} flux start -o --config /etc/flux/config ${brokerOptions} sleep inf
else
  echo "Extra arguments are: $@"
  printf 'flux start -o --config /etc/flux/config %s sleep inf\n' "${brokerOptions}"
  FLUX_FAKE_HOSTNAME=${thisHost} flux start -o --config /etc/flux/config ${brokerOptions} sleep inf
fi
3 changes: 3 additions & 0 deletions ci/flux/conf/imp.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Flux IMP (Independent Minister of Privilege) configuration: restricts
# which users may invoke the privileged helper and which job shells it
# is allowed to exec.
[exec]
allowed-users = [ "flux", "root" ]
allowed-shells = [ "/usr/libexec/flux/flux-shell" ]
88 changes: 88 additions & 0 deletions ci/flux/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
version: "2.2"

# Shared number of replicas (workers) for build and runtime
# This includes the broker (node-1)
x-shared-workers:
  &workers
  replicas: 3

# Build args that go into building container
# NOTE(review): the Dockerfile in this directory only declares ARG
# replicas; flux_sched_version is currently unconsumed — confirm before
# relying on it.
x-shared-build-args: &shared-build-args
  flux_sched_version: focal
  <<: *workers

# Shared environment for runtime
x-shared-environment: &shared-environment
  SPL_BROKER_URL: amqp://fluxuser:fluxrabbit@rabbit:5672//
  mainHost: node-1
  workdir: /code/workdir
  CI_SHARED_SPACE: /shared_space
  <<: *workers

# Mounts shared by every node: flux config, a shared /tmp, the working
# directory, the dask-jobqueue checkout, and the shared test space.
x-shared-volumes: &shared-volumes
  - ./conf/imp.toml:/etc/flux/imp/conf.d/imp.toml
  - ./conf/broker.toml:/etc/flux/config/broker.toml
  - ./conf/tmp:/tmp
  - ./:/code/workdir
  - slurm_jobdir:/data
  - ../..:/dask-jobqueue
  - shared_space:/shared_space

services:
  # node-1 is the broker (mainHost); node-2/node-3 are workers.
  node-1:
    build:
      context: ./
      args: *shared-build-args
    hostname: node-1
    container_name: node-1
    environment: *shared-environment
    volumes: *shared-volumes
    networks:
      common-network:
        ipv4_address: 10.1.1.10
    cap_add:
      - NET_ADMIN

  node-2:
    build:
      context: ./
      args: *shared-build-args
    hostname: node-2
    container_name: node-2
    environment: *shared-environment
    volumes: *shared-volumes
    networks:
      common-network:
        ipv4_address: 10.1.1.11
    cap_add:
      - NET_ADMIN

  node-3:
    build:
      context: ./
      args: *shared-build-args
    hostname: node-3
    container_name: node-3
    environment: *shared-environment
    volumes: *shared-volumes
    networks:
      common-network:
        ipv4_address: 10.1.1.12
    cap_add:
      - NET_ADMIN

# Only volumes actually mounted by the services above are declared;
# the unused slurm leftovers (etc_munge, etc_slurm, var_lib_mysql,
# var_log_slurm) were removed.
volumes:
  slurm_jobdir:
  shared_space:

networks:
  common-network:
    driver: bridge
    ipam:
      driver: default
      config:
        - subnet: 10.1.1.0/24
1 change: 1 addition & 0 deletions dask_jobqueue/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# flake8: noqa
from . import config
from .core import JobQueueCluster
from .flux import FluxCluster
from .moab import MoabCluster
from .pbs import PBSCluster
from .slurm import SLURMCluster
Expand Down
2 changes: 1 addition & 1 deletion dask_jobqueue/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ class Job(ProcessInterface, abc.ABC):
See Also
--------
PBSCluster
FluxCluster
SLURMCluster
SGECluster
OARCluster
Expand Down Expand Up @@ -745,7 +746,6 @@ def _get_worker_security(self, security):
for key, value in worker_security_dict.items():
# dump worker in-memory keys for use in job_script
if value is not None and "\n" in value:

try:
f = tempfile.NamedTemporaryFile(
mode="wt",
Expand Down

0 comments on commit 43229c7

Please sign in to comment.