forked from dask/dask-jobqueue
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Flux has a race condition where after submit (when we get back a jobid) the file might still be needed. This means that we need custom logic to not delete the temporary file until cancel / worker completion. Flux also does better getting an executable file, and a full path, and so the submit function is modified for that. Finally, flux does not support the concept of mem or an account. Signed-off-by: vsoch <vsoch@users.noreply.github.com>
- Loading branch information
Showing
20 changed files
with
711 additions
and
30 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
#!/usr/bin/env bash | ||
|
||
# Print tool versions, build the Flux node image, start the cluster described
# by ci/flux/docker-compose.yml, then print container/image/network diagnostics.
# All paths are relative to the current directory (./ci/flux must exist).
# Returns 1 if the compose directory cannot be entered or left.
function jobqueue_before_install {
    docker version
    docker-compose version

    # build images and start flux cluster
    # Stop here if the directory is missing instead of running docker-compose
    # against whatever directory we happen to be in.
    cd ./ci/flux || return 1
    # The Dockerfile COPYs environment.yml from its build context
    cp ../environment.yml ./environment.yml
    docker-compose build node-1
    docker-compose up -d
    cd - || return 1

    # Open the shared space to every user. The command runs as the container's
    # default user; no sudo is involved.
    docker exec node-1 /bin/bash -c "chmod -R 777 /shared_space"

    docker ps -a
    docker images
    show_network_interfaces
}
|
||
# For each of the three cluster containers, print the names of the network
# interfaces reported by psutil, framed by separator lines.
function show_network_interfaces {
    local rule='------------------------------------------------------------'
    local node
    for node in node-1 node-2 node-3; do
        echo "${rule}"
        echo "docker container: ${node}"
        docker exec "${node}" python -c 'import psutil; print(psutil.net_if_addrs().keys())'
        echo "${rule}"
    done
}
|
||
# Install dask-jobqueue in editable mode inside the node-1 container, from the
# /dask-jobqueue directory.
function jobqueue_install {
    # "&&" rather than ";": do not run pip in some other directory if the
    # source tree is not where we expect it.
    docker exec node-1 /bin/bash -c "cd /dask-jobqueue && pip install -e ."
}
|
||
# Run the dask_jobqueue test suite inside node-1, starting from the container
# user's home directory, with the "flux" environment selected via -E.
function jobqueue_script {
    local test_command="cd; pytest /dask-jobqueue/dask_jobqueue --verbose -E flux -s"
    docker exec node-1 /bin/bash -c "${test_command}"
}
|
||
# List every Flux job known to node-1 (useful when debugging a failed run),
# then stop and remove the containers of the compose project in ./ci/flux.
# Returns 1 if the compose directory cannot be entered or left.
function jobqueue_after_script {
    docker exec node-1 bash -c 'flux jobs -a'
    # Do not run docker-compose stop/rm from an unrelated directory
    cd ./ci/flux || return 1
    docker-compose stop
    docker-compose rm --force
    cd - || return 1
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
# Image for one node of the Flux test cluster started by docker-compose.
FROM fluxrm/flux-sched:el8

# Number of nodes in the cluster; passed as a build arg by docker-compose and
# kept in the runtime environment as "workers".
ARG replicas=3
ENV workers=${replicas}
USER root

# STATE_DIR is the broker state directory; the other directories created below
# hold the system config, cron entries, broker config and run-time sockets.
ENV STATE_DIR=/var/lib/flux
ENV LC_ALL=en_US.UTF-8

# Create the directories, generate the CURVE certificate shared by all nodes
# (every node is built from this same image) and write the resource set R
# describing hosts node-1 .. node-${workers}.
RUN mkdir -p ${STATE_DIR} /etc/flux/system/cron.d /etc/flux/config /run/flux /mnt/curve && \
    flux keygen /mnt/curve/curve.cert && \
    flux R encode --hosts="node-[1-${workers}]" > /etc/flux/system/R

WORKDIR /home/fluxuser
RUN pip3 install --no-cache-dir --upgrade pip && \
    pip3 install --no-cache-dir --upgrade pika

# bind-utils provides nslookup (used by the entrypoint), iproute provides ip
RUN yum install -y bind-utils iproute && \
    yum clean all

# Use mamba for slightly faster install.
# The Mambaforge installer is no longer published; Miniforge3 ships mamba too.
# -f makes curl fail on an HTTP error instead of saving the error page.
RUN curl -fsSL https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -o miniforge.sh && \
    bash miniforge.sh -b -p /opt/anaconda && \
    rm miniforge.sh && \
    /opt/anaconda/bin/conda clean -tipy
ENV PATH=/opt/anaconda/bin:$PATH

# environment.yml file is copied by CI script. If manually building, you should copy it too from parent directory
COPY environment.yml .
RUN mamba env update -n base --file environment.yml

# Important! In production flux should not be run as root
# USER fluxuser
WORKDIR /home/fluxuser/
COPY ./conf/entrypoint.sh ./
# Exec form: no intermediate "sh -c", and extra arguments reach the script
ENTRYPOINT ["/bin/bash", "/home/fluxuser/entrypoint.sh"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Broker configuration for the docker-compose Flux test cluster.

# Flux needs to know the path to the IMP executable
[exec]
imp = "/usr/libexec/flux/flux-imp"

[access]
allow-guest-user = true
allow-root-owner = true

# Point to resource definition generated with flux-R(1).
[resource]
path = "/etc/flux/system/R"
noverify = true

[bootstrap]
curve_cert = "/mnt/curve/curve.cert"
default_port = 8050
# %h and %p are replaced with the host name and port of each entry in "hosts".
# They are written with a single "%", as documented in flux-config-bootstrap(5).
default_bind = "tcp://eth0:%p"
default_connect = "tcp://%h:%p"
# docker-compose starts counting at 1, what a monster
hosts = [
    { host = "node-[1-3]" },
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
#!/bin/bash

# Entrypoint for every node of the Flux test cluster.
# Works out this container's host name, prints some diagnostics and starts a
# Flux broker that stays alive by running "sleep inf".
# Reads from the environment: mainHost (name of the lead node), workdir
# (directory to list at start-up) and, optionally, STATE_DIRECTORY / STATE_DIR.
# This script uses bash arrays, hence the bash shebang.

# Broker Options: important!
# The local-uri setting places the unix domain socket in rundir
# if FLUX_URI is not set, tools know where to connect.
# -Slog-stderr-level= can be set to 7 for larger debug level
# or exposed as a variable
# STATE_DIRECTORY is honoured first for backward compatibility; STATE_DIR is
# the name the image actually exports.
brokerOptions=(
    -Scron.directory=/etc/flux/system/cron.d
    -Stbon.fanout=256
    -Srundir=/run/flux
    "-Sstatedir=${STATE_DIRECTORY:-${STATE_DIR:-/var/lib/flux}}"
    -Slocal-uri=local:///run/flux/local
    -Slog-stderr-level=6
    -Slog-stderr-mode=local
)

# quorum settings influence how the instance treats missing ranks
# by default all ranks must be online before work is run, but
# we want it to be OK to run when a few are down
# These are currently removed because we want the main rank to
# wait for all the others, and then they clean up nicely
# -Sbroker.quorum=0 \
# -Sbroker.quorum-timeout=none \

# This should be added to keep running as a service
# -Sbroker.rc2_none \

# Derive hostname (this is a hack to get the one defined by the docker-compose network)
# Take the first line of the reverse lookup of our own address, split it on
# "=" and whitespace, keep the third word, then keep the part before the first dot.
address=$(echo $( nslookup "$( hostname -i )" | head -n 1 ))
parts=(${address//=/ })
hostName=${parts[2]}
thisHost=(${hostName//./ })
thisHost=${thisHost[0]}
echo "$thisHost"

# Export this hostname
export FLUX_FAKE_HOSTNAME=$thisHost

# Listing the working directory is informational only, so keep going on failure
cd "${workdir}" || echo "Could not change to working directory '${workdir}'" >&2
printf "\n👋 Hello, I'm %s\n" "${thisHost}"
printf "The main host is %s\n\n" "${mainHost}"
printf "🔍️ Here is what I found in the working directory, %s\n" "${workdir}"
ls "${workdir}"

# --cores=IDS Assign cores with IDS to each rank in R, so we assign 1-N to 0
printf "\n📦 Resources\n"
sudo cat /etc/flux/system/R

printf "\n🦊 Independent Minister of Privilege\n"
cat /etc/flux/imp/conf.d/imp.toml

# The curve cert is generated on container build
# We assume the munge.key is the same also since we use the same base container!
# located at /etc/munge/munge.key

# Give broker time to start before workers
# Both sides are quoted so an empty host name cannot break the test.
if [ "${thisHost}" != "${mainHost}" ]; then
    printf "\n😪 Sleeping to give broker time to start...\n"
    sleep 15
else
    echo "Extra arguments are: $*"
    printf "flux start -o --config /etc/flux/config %s sleep inf\n" "${brokerOptions[*]}"
fi

# Same command on every node; FLUX_FAKE_HOSTNAME is already exported above.
# exec replaces this shell so the broker receives container signals directly.
exec flux start -o --config /etc/flux/config "${brokerOptions[@]}" sleep inf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Flux IMP configuration, mounted by docker-compose at
# /etc/flux/imp/conf.d/imp.toml and printed by the entrypoint at start-up.
# NOTE(review): key meanings below follow the flux-security IMP documentation;
# confirm against the installed version.
[exec]
# Users allowed to invoke the IMP exec path
allowed-users = ["flux", "root"]
# Job shells the IMP is allowed to execute
allowed-shells = ["/usr/libexec/flux/flux-shell"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
version: "2.2"

# Shared number of replicas (workers) for build and runtime
# This includes the broker (node-1)
x-shared-workers: &workers
  replicas: 3

# Build args that go into building container.
# Only args declared by the Dockerfile (replicas) are passed.
x-shared-build-args: &shared-build-args
  <<: *workers

# Shared environment for runtime
x-shared-environment: &shared-environment
  SPL_BROKER_URL: amqp://fluxuser:fluxrabbit@rabbit:5672//
  mainHost: node-1
  workdir: /code/workdir
  CI_SHARED_SPACE: /shared_space
  <<: *workers

x-shared-volumes: &shared-volumes
  - ./conf/imp.toml:/etc/flux/imp/conf.d/imp.toml
  - ./conf/broker.toml:/etc/flux/config/broker.toml
  - ./conf/tmp:/tmp
  - ./:/code/workdir
  - slurm_jobdir:/data
  - ../..:/dask-jobqueue
  - shared_space:/shared_space

# Settings common to every node. YAML merge is shallow, so each service still
# spells out its own hostname, container name and network address.
x-shared-node: &shared-node
  build:
    context: ./
    args: *shared-build-args
  environment: *shared-environment
  volumes: *shared-volumes
  cap_add:
    - NET_ADMIN

services:
  node-1:
    <<: *shared-node
    hostname: node-1
    container_name: node-1
    networks:
      common-network:
        ipv4_address: 10.1.1.10

  node-2:
    <<: *shared-node
    hostname: node-2
    container_name: node-2
    networks:
      common-network:
        ipv4_address: 10.1.1.11

  node-3:
    <<: *shared-node
    hostname: node-3
    container_name: node-3
    networks:
      common-network:
        ipv4_address: 10.1.1.12

# Only the named volumes that are mounted above are declared
volumes:
  slurm_jobdir:
  shared_space:

networks:
  common-network:
    driver: bridge
    ipam:
      driver: default
      config:
        - subnet: 10.1.1.0/24
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.