From 21503aaf6c6cd15e7cb25b2137d662a3bc1ac49b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 1 Aug 2023 11:08:59 +0000 Subject: [PATCH 01/14] fix opensearch grafana plugin at last working version --- environments/common/inventory/group_vars/all/grafana.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml index ce83e3c9..64c19d84 100644 --- a/environments/common/inventory/group_vars/all/grafana.yml +++ b/environments/common/inventory/group_vars/all/grafana.yml @@ -79,7 +79,7 @@ grafana_datasources: # readOnly: false grafana_plugins: - - grafana-opensearch-datasource + - grafana-opensearch-datasource 2.6.2 # want to set grafana_server.serve_from_sub_path if have Open Ondemand to proxy: grafana_server: From a9f5d332d3a0eacf41f748fbcf068a98bcbec7a0 Mon Sep 17 00:00:00 2001 From: Mariusz Karpiarz Date: Tue, 1 Aug 2023 12:35:14 +0100 Subject: [PATCH 02/14] Fix query type in the Slurm jobs Grafana dashboard Set the query type to "lucene" to fix the `Slurm jobs` dashboard. The original query causes a `SIGSEGV: segmentation violation` in versions above 2.6.2 of the OpenSearch plugin due to this change: https://github.com/grafana/opensearch-datasource/commit/24fcd47ea212f058ecf8227554c2d16a93f60a57#diff-804a5756e358074cb0ba27b1932cca77d39b90694fc1d4d122f46348c8653259 This then makes Grafana return a 500 error with the `TypeError: pa[t] is undefined` message. CC: https://github.com/stackhpc/ansible-slurm-appliance/pull/292 --- ansible/roles/grafana-dashboards/files/slurm-jobs.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/grafana-dashboards/files/slurm-jobs.json b/ansible/roles/grafana-dashboards/files/slurm-jobs.json index 40afb37a..37a1cff1 100644 --- a/ansible/roles/grafana-dashboards/files/slurm-jobs.json +++ b/ansible/roles/grafana-dashboards/files/slurm-jobs.json @@ -150,7 +150,7 @@ } ], "query": "*", - "queryType": "randomWalk", + "queryType": "lucene", "refId": "A", "timeField": "@timestamp" } From 048e2220d4a5dc7f25427da4db9580c4b2f31d9d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 1 Aug 2023 12:43:45 +0000 Subject: [PATCH 03/14] bump grafana opensearch plugin prior to fixing query type (i.e. this won't work) --- environments/common/inventory/group_vars/all/grafana.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml index 64c19d84..23f23c9a 100644 --- a/environments/common/inventory/group_vars/all/grafana.yml +++ b/environments/common/inventory/group_vars/all/grafana.yml @@ -79,7 +79,7 @@ grafana_datasources: # readOnly: false grafana_plugins: - - grafana-opensearch-datasource 2.6.2 + - grafana-opensearch-datasource 2.8.1 # want to set grafana_server.serve_from_sub_path if have Open Ondemand to proxy: grafana_server: From 7f40e32ae1f07b9802121b602e06af707edff882 Mon Sep 17 00:00:00 2001 From: Matt Anson Date: Tue, 1 Aug 2023 17:02:37 +0100 Subject: [PATCH 04/14] Increment iteration in slurm-jobs.json This should mean that existing Grafanas pick up the fix to this provisioned dashboard --- ansible/roles/grafana-dashboards/files/slurm-jobs.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/roles/grafana-dashboards/files/slurm-jobs.json b/ansible/roles/grafana-dashboards/files/slurm-jobs.json index 37a1cff1..8cbc4438 100644 --- a/ansible/roles/grafana-dashboards/files/slurm-jobs.json +++ b/ansible/roles/grafana-dashboards/files/slurm-jobs.json @@ -46,7 +46,7 @@ "gnetId": 13535, "graphTooltip": 0, "id": null, - "iteration": 1607441312744, + "iteration": 1607441312745, "links": [], "panels": [ { @@ -302,4 +302,4 @@ "uid": "jYPt7MTGk", "version": 2, "description": "Requires https://github.com/stackhpc/slurm-openstack-tools" -} \ No newline at end of file +} From eb10a25723ff7c38dca99bc3b4538a87ab544195 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 2 Aug 2023 09:25:21 +0000 Subject: [PATCH 05/14] fix slurmstats/opensearch datasource version configuration --- environments/common/inventory/group_vars/all/grafana.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml index 23f23c9a..8222a3cc 100644 --- a/environments/common/inventory/group_vars/all/grafana.yml +++ b/environments/common/inventory/group_vars/all/grafana.yml @@ -63,6 +63,7 @@ grafana_datasources: url: "http://{{ prometheus_address }}:9090" # default prometheus port editable: true - name: slurmstats + # see https://github.com/grafana/opensearch-datasource#configure-the-data-source-with-provisioning type: grafana-opensearch-datasource url: "https://{{ opensearch_address }}:9200" basicAuth: true @@ -74,7 +75,10 @@ grafana_datasources: tlsSkipVerify: true database: filebeat-* timeField: "@timestamp" - flavor: opensearch + # Have to set flavor and version, but ansible/roles/opensearch/templates/opensearch.yml.j2 fakes version for filebeat + # so need to set to fake version here: + version: '7.10.2' + flavor: elasticsearch editable: true # readOnly: false From b4c02794c3746d30d4114e213a1f5839b551331a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 2 Aug 2023 09:37:30 +0000 Subject: [PATCH 06/14] use python3.9 for jupyter OOD app --- .../files/jupyter_requirements.txt | 101 ++++++++++++++++++ .../openondemand/tasks/jupyter_compute.yml | 21 +++- .../inventory/group_vars/all/openondemand.yml | 2 +- 3 files changed, 118 insertions(+), 6 deletions(-) create mode 100644 ansible/roles/openondemand/files/jupyter_requirements.txt diff --git a/ansible/roles/openondemand/files/jupyter_requirements.txt b/ansible/roles/openondemand/files/jupyter_requirements.txt new file mode 100644 index 00000000..ec28503c --- /dev/null +++ b/ansible/roles/openondemand/files/jupyter_requirements.txt @@ -0,0 +1,101 @@ +# Python3.9, pip 23.2.1 +anyio==3.7.1 +argon2-cffi==21.3.0 +argon2-cffi-bindings==21.2.0 +arrow==1.2.3 +asttokens==2.2.1 +async-lru==2.0.4 +attrs==23.1.0 +Babel==2.12.1 +backcall==0.2.0 +beautifulsoup4==4.12.2 +bleach==6.0.0 +certifi==2023.7.22 +cffi==1.15.1 +charset-normalizer==3.2.0 +comm==0.1.3 +debugpy==1.6.7 +decorator==5.1.1 +defusedxml==0.7.1 +exceptiongroup==1.1.2 +executing==1.2.0 +fastjsonschema==2.18.0 +fqdn==1.5.1 +idna==3.4 +importlib-metadata==6.8.0 +ipykernel==6.25.0 +ipython==8.14.0 +ipython-genutils==0.2.0 +ipywidgets==8.1.0 +isoduration==20.11.0 +jedi==0.19.0 +Jinja2==3.1.2 +json5==0.9.14 +jsonpointer==2.4 +jsonschema==4.18.4 +jsonschema-specifications==2023.7.1 +jupyter==1.0.0 +jupyter-console==6.6.3 +jupyter-events==0.7.0 +jupyter-lsp==2.2.0 +jupyter_client==8.3.0 +jupyter_core==5.3.1 +jupyter_server==2.7.0 +jupyter_server_terminals==0.4.4 +jupyterlab==4.0.3 +jupyterlab-pygments==0.2.2 +jupyterlab-widgets==3.0.8 +jupyterlab_server==2.24.0 +MarkupSafe==2.1.3 +matplotlib-inline==0.1.6 +mistune==3.0.1 +nbclient==0.8.0 +nbconvert==7.7.3 +nbformat==5.9.2 +nest-asyncio==1.5.7 +notebook==7.0.1 +notebook_shim==0.2.3 +overrides==7.3.1 +packaging==23.1 +pandocfilters==1.5.0 +parso==0.8.3 +pexpect==4.8.0 +pickleshare==0.7.5 +platformdirs==3.10.0 +prometheus-client==0.17.1 +prompt-toolkit==3.0.39 +psutil==5.9.5 +ptyprocess==0.7.0 +pure-eval==0.2.2 +pycparser==2.21 +Pygments==2.15.1 +python-dateutil==2.8.2 +python-json-logger==2.0.7 +PyYAML==6.0.1 +pyzmq==25.1.0 +qtconsole==5.4.3 +QtPy==2.3.1 +referencing==0.30.0 +requests==2.31.0 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rpds-py==0.9.2 +Send2Trash==1.8.2 +six==1.16.0 +sniffio==1.3.0 +soupsieve==2.4.1 +stack-data==0.6.2 +terminado==0.17.1 +tinycss2==1.2.1 +tomli==2.0.1 +tornado==6.3.2 +traitlets==5.9.0 +typing_extensions==4.7.1 +uri-template==1.3.0 +urllib3==2.0.4 +wcwidth==0.2.6 +webcolors==1.13 +webencodings==0.5.1 +websocket-client==1.6.1 +widgetsnbextension==4.0.8 +zipp==3.16.2 diff --git a/ansible/roles/openondemand/tasks/jupyter_compute.yml b/ansible/roles/openondemand/tasks/jupyter_compute.yml index 4aa7e7e9..a87d07da 100644 --- a/ansible/roles/openondemand/tasks/jupyter_compute.yml +++ b/ansible/roles/openondemand/tasks/jupyter_compute.yml @@ -2,20 +2,31 @@ # See https://osc.github.io/ood-documentation/latest/app-development/tutorials-interactive-apps/add-jupyter/software-requirements.html # - Will already have openssl and lmod +- name: Ensure python3.9 installed + dnf: + name: python39 + tags: install + - name: Install jupyter venv # Requires separate step so that the upgraded pip is used to install packages pip: name: pip state: latest - virtualenv: /opt/jupyter - virtualenv_command: python3 -m venv + virtualenv: /opt/jupyter-py39 + virtualenv_command: python3.9 -m venv + tags: install + +- name: Copy jupyter requirements file + copy: + src: jupyter_requirements.txt + dest: /opt/jupyter-py39/jupyter_requirements.txt tags: install - name: Install jupyter package in venv pip: - name: jupyter - virtualenv: /opt/jupyter - virtualenv_command: python3 -m venv + virtualenv: /opt/jupyter-py39 + virtualenv_command: python3.9 -m venv + requirements: /opt/jupyter-py39/jupyter_requirements.txt tags: install diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml index 4652b375..c29b6a55 100644 --- a/environments/common/inventory/group_vars/all/openondemand.yml +++ b/environments/common/inventory/group_vars/all/openondemand.yml @@ -37,7 +37,7 @@ openondemand_clusters: basic: script_wrapper: | module purge - export PATH=/opt/jupyter/bin/:$PATH + export PATH=/opt/jupyter-py39/bin/:$PATH %s set_host: host=$(hostname -s) vnc: From f0db29232b6d2125893747e4e997ec56cf50b199 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 2 Aug 2023 12:45:41 +0000 Subject: [PATCH 07/14] make eessi test async to avoid ansible timeouts --- ansible/ci/check_eessi.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ansible/ci/check_eessi.yml b/ansible/ci/check_eessi.yml index 0112509a..2d37a9d5 100644 --- a/ansible/ci/check_eessi.yml +++ b/ansible/ci/check_eessi.yml @@ -25,6 +25,8 @@ chdir: "{{ eessi_test_rootdir }}/eessi-demo/TensorFlow" executable: /bin/bash register: job_output + async: "{{ 6 * 60 }}" # wait for up to 6 minutes, expect it to take 4 + poll: 15 # check every 15 seconds - name: Fail if job output contains error fail: From 854e4918e5a382112b7101ebe5bb7ef2ef97aa0a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 2 Aug 2023 14:25:30 +0000 Subject: [PATCH 08/14] extend eessi timeout --- ansible/ci/check_eessi.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/ci/check_eessi.yml b/ansible/ci/check_eessi.yml index 2d37a9d5..0ced21a9 100644 --- a/ansible/ci/check_eessi.yml +++ b/ansible/ci/check_eessi.yml @@ -25,7 +25,7 @@ chdir: "{{ eessi_test_rootdir }}/eessi-demo/TensorFlow" executable: /bin/bash register: job_output - async: "{{ 6 * 60 }}" # wait for up to 6 minutes, expect it to take 4 + async: "{{ 10 * 60 }}" # wait for up to 10 minutes poll: 15 # check every 15 seconds - name: Fail if job output contains error From 307f5ba625d82b7c9d4ac35eecd5dfbed7509f98 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 2 Aug 2023 20:27:14 +0000 Subject: [PATCH 09/14] capture tensorflow EESSI test output --- ansible/ci/check_eessi.yml | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/ansible/ci/check_eessi.yml b/ansible/ci/check_eessi.yml index 0ced21a9..280f8658 100644 --- a/ansible/ci/check_eessi.yml +++ b/ansible/ci/check_eessi.yml @@ -17,20 +17,34 @@ repo: "https://github.com/eessi/eessi-demo.git" dest: "{{ eessi_test_rootdir }}/eessi-demo" - - name: Run test job - ansible.builtin.shell: - cmd: | + - name: Create batch script + copy: + dest: "{{ eessi_test_rootdir }}/eessi-demo/TensorFlow/tensorflow.sh" + content: | + #!/usr/bin/env bash + #SBATCH --output=%x.out + #SBATCH --error=%x.out source /cvmfs/pilot.eessi-hpc.org/latest/init/bash srun ./run.sh + + - name: Run test job + ansible.builtin.shell: + cmd: sbatch --wait tensorflow.sh chdir: "{{ eessi_test_rootdir }}/eessi-demo/TensorFlow" - executable: /bin/bash register: job_output - async: "{{ 10 * 60 }}" # wait for up to 10 minutes - poll: 15 # check every 15 seconds + + - name: Retrieve job output + slurp: + src: "{{ eessi_test_rootdir }}/eessi-demo/TensorFlow/tensorflow.sh.out" + register: _tensorflow_out + no_log: true # as its base64 encoded so useless + + - name: Show job output + debug: + msg: "{{ _tensorflow_out.content | b64decode }}" - name: Fail if job output contains error fail: # Note: Job prints live progress bar to terminal, so use regex filter to remove this from stdout msg: "Test job using EESSI modules failed. Job output was: {{ job_output.stdout | regex_replace('\b', '') }}" - when: '"Epoch 5/5" not in job_output.stdout' - \ No newline at end of file + when: '"Epoch 5/5" not in _tensorflow_out.content | b64decode' From 8cea013c7c0912c779112f2004ffc70d68560970 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 3 Aug 2023 08:27:59 +0000 Subject: [PATCH 10/14] disable EESSI tests in CI for now --- .github/workflows/stackhpc.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 787374e9..ea18b310 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -90,11 +90,11 @@ jobs: . environments/.stackhpc/activate ansible-playbook -vv ansible/adhoc/hpctests.yml - - name: Run EESSI tests - run: | - . venv/bin/activate - . environments/.stackhpc/activate - ansible-playbook -vv ansible/ci/check_eessi.yml + # - name: Run EESSI tests + # run: | + # . venv/bin/activate + # . environments/.stackhpc/activate + # ansible-playbook -vv ansible/ci/check_eessi.yml - name: Confirm Open Ondemand is up (via SOCKS proxy) run: | From c91c8ede2c7b6ad3529b8869a61b1ddde34a0110 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 3 Aug 2023 09:45:49 +0000 Subject: [PATCH 11/14] disable ssh session sharing for stackhpc --- environments/.stackhpc/ansible.cfg | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/environments/.stackhpc/ansible.cfg b/environments/.stackhpc/ansible.cfg index d7a3783f..6aca9ef1 100644 --- a/environments/.stackhpc/ansible.cfg +++ b/environments/.stackhpc/ansible.cfg @@ -11,5 +11,6 @@ roles_path = ../../ansible/roles filter_plugins = ../../ansible/filter_plugins [ssh_connection] -ssh_args = -o ControlMaster=auto -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null +#ssh_args = -o ControlMaster=auto -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null +ssh_args = -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null pipelining = True From 054d287a1895403832fa963e8de34a84ef1feada Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 3 Aug 2023 15:19:37 +0000 Subject: [PATCH 12/14] Revert "disable EESSI tests in CI for now" This reverts commit 8cea013c7c0912c779112f2004ffc70d68560970. --- .github/workflows/stackhpc.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index ea18b310..787374e9 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -90,11 +90,11 @@ jobs: . environments/.stackhpc/activate ansible-playbook -vv ansible/adhoc/hpctests.yml - # - name: Run EESSI tests - # run: | - # . venv/bin/activate - # . environments/.stackhpc/activate - # ansible-playbook -vv ansible/ci/check_eessi.yml + - name: Run EESSI tests + run: | + . venv/bin/activate + . environments/.stackhpc/activate + ansible-playbook -vv ansible/ci/check_eessi.yml - name: Confirm Open Ondemand is up (via SOCKS proxy) run: | From aa6fb9c5f0a0d149dc1e6a89e340d0be6a08782a Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 3 Aug 2023 17:03:18 +0000 Subject: [PATCH 13/14] Re-disable EESSI tests in CI for now This reverts commit 054d287a1895403832fa963e8de34a84ef1feada. --- .github/workflows/stackhpc.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 787374e9..ea18b310 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -90,11 +90,11 @@ jobs: . environments/.stackhpc/activate ansible-playbook -vv ansible/adhoc/hpctests.yml - - name: Run EESSI tests - run: | - . venv/bin/activate - . environments/.stackhpc/activate - ansible-playbook -vv ansible/ci/check_eessi.yml + # - name: Run EESSI tests + # run: | + # . venv/bin/activate + # . environments/.stackhpc/activate + # ansible-playbook -vv ansible/ci/check_eessi.yml - name: Confirm Open Ondemand is up (via SOCKS proxy) run: | From bae72557f6e29984643f26f609473ba3e65cfd1f Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 4 Aug 2023 09:19:04 +0000 Subject: [PATCH 14/14] reenable ControlMaster, adding ControlPath --- environments/.stackhpc/ansible.cfg | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/environments/.stackhpc/ansible.cfg b/environments/.stackhpc/ansible.cfg index 6aca9ef1..2a12e06b 100644 --- a/environments/.stackhpc/ansible.cfg +++ b/environments/.stackhpc/ansible.cfg @@ -11,6 +11,5 @@ roles_path = ../../ansible/roles filter_plugins = ../../ansible/filter_plugins [ssh_connection] -#ssh_args = -o ControlMaster=auto -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null -ssh_args = -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null +ssh_args = -o ControlMaster=auto -o ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null pipelining = True