Skip to content

Commit

Permalink
Merge pull request #79 from silx-kit/separate-resource-display-custom
Browse files Browse the repository at this point in the history
Change the way to customize displayed SLURM resources
  • Loading branch information
t20100 authored Jan 26, 2023
2 parents ac94633 + c14e0b7 commit fbcf07c
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 108 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
graft demo
graft jupyterhub_moss
10 changes: 4 additions & 6 deletions demo/templates/option_form.html
Original file line number Diff line number Diff line change
@@ -1,18 +1,16 @@
{% extends 'templates/option_form.html' %}
{% macro resource_table(partitions, simple_only=false) -%}
<h4 style="text-align: center">Available resources</h4>
{% macro resource_table(partitions, simple_only=false) %}
<h4 style="text-align: left">Currently available resources</h4>
<table class="table">
<tr class="active">
<th>Partition</th>
<th></th>
<th>CPU cores</th>
<th>Nodes</th>
</tr>
{% for name, partition in partitions.items() %}
{% if partition.simple or not simple_only %}
<tr>
<th>{{ name }}</th>
<th>{{ partition['available_counts'][0] }}<small>/{{ partition['available_counts'][1] }}</small></th>
<th>{{ partition['available_counts'][2] }}</th>
<th>{{ partition['ncores_idle'] }}</th>
</tr>
{% endif %}
{% endfor %}
Expand Down
106 changes: 40 additions & 66 deletions jupyterhub_moss/spawner.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import json
import os.path
import re
from collections import defaultdict
from copy import deepcopy
from typing import Dict, List

Expand All @@ -21,14 +20,7 @@
}

# Required resources per partition
RESOURCES_COUNTS = [
"max_nprocs",
"max_mem",
"gpu",
"max_ngpus",
"max_runtime",
"available_counts",
]
REQUIRED_RESOURCES_COUNTS = "max_nprocs", "max_mem", "gpu", "max_ngpus", "max_runtime"

with open(local_path("batch_script.sh")) as f:
BATCH_SCRIPT = f.read()
Expand Down Expand Up @@ -99,47 +91,43 @@ def _validate_partitions(self, proposal):
slurm_info_resources = traitlets.Callable(
help="""Provides information about resources in Slurm cluster.
It will be called with the output of ``slurm_info_cmd`` as argument and should return a tuple:
- list of resource labels to be displayed in table of available resources
- dictionary mapping each partition name to resources defined in ``RESOURCES_COUNTS``""",
It will be called with the output of ``slurm_info_cmd`` as argument and should return a
dictionary mapping each partition name to resources defined in ``REQUIRED_RESOURCES_COUNTS``
and resources used in option_form template.
""",
).tag(config=True)

@traitlets.default("slurm_info_resources")
def _get_slurm_info_resources_default(self):
"""Returns default for `slurm_info_resources` traitlet."""
return self._slurm_info_resources

def _slurm_info_resources(self, slurm_info_out):
"""
Parses output from Slurm command: sinfo -a --noheader -o '%R %D %C %G %m'
Returns information about partition resources listed in RESOURCES_COUNTS: number of cores,
max memory, gpus and resource counts to be shown in table of available resources
:param slurm_info_out: string with output of slurm_info_cmd
:rtype: tuple with:
- list of resource labels to be displayed in table of available resources
- dict with mapping per partition: {
def _slurm_info_resources(self, slurm_info_out: str) -> Dict[str, dict]:
"""Parses output from Slurm command: sinfo -a --noheader -o '%R %D %c %C %G %m %l'
Returns information about partition resources listed in ``REQUIRED_RESOURCES_COUNTS``:
number of cores, max memory, gpus and resource counts to be shown in table of available resources.
:param slurm_info_out: Output of slurm_info_cmd
:rtype: Mapping of partition information:
{
partition: {max_nprocs, max_ngpus, max_mem, max_runtime, ...},
}
}
"""
# Resources displayed in table of available resources (column labels in display order)
resources_display = ["Idle Cores", "Total Cores", "Total Nodes"]
partitions_info = {}

# Parse output
resources_count = defaultdict(
lambda: {resource: 0 for resource in RESOURCES_COUNTS + resources_display}
)
for line in slurm_info_out.splitlines():
(
partition,
nodes,
nnodes_total,
ncores_per_node,
cores,
gpus,
memory,
timelimit,
) = line.split()
# core count - allocated/idle/other/total
_, cores_idle, _, cores_total = cores.split("/")
_, ncores_idle, _, ncores_total = cores.split("/")
# gpu count - gpu:name:total(indexes)
try:
gpus_gres = gpus.replace("(", ":").split(":")
Expand All @@ -157,35 +145,25 @@ def _slurm_info_resources(self, slurm_info_out):
)
max_runtime = datetime.timedelta(days=1)

count = resources_count[partition]
resources = {}
try:
# display resource counts
count["Total Nodes"] = int(nodes)
count["Total Cores"] = int(cores_total)
count["Idle Cores"] = int(cores_idle)
resources["nnodes_total"] = int(nnodes_total)
resources["ncores_total"] = int(ncores_total)
resources["ncores_idle"] = int(ncores_idle)
# required resource counts
count["max_nprocs"] = int(ncores_per_node.rstrip("+"))
count["max_mem"] = int(memory.rstrip("+"))
count["gpu"] = gpu
count["max_ngpus"] = int(gpus_total)
count["max_runtime"] = int(max_runtime.total_seconds())
resources["max_nprocs"] = int(ncores_per_node.rstrip("+"))
resources["max_mem"] = int(memory.rstrip("+"))
resources["gpu"] = gpu
resources["max_ngpus"] = int(gpus_total)
resources["max_runtime"] = int(max_runtime.total_seconds())
except ValueError as err:
self.log.error("Error parsing output of slurm_info_cmd: %s", err)
raise
else:
count["available_counts"] = [
count[resource] for resource in resources_display
]

resources_info = {
partition: {
resource: resources_count[partition][resource]
for resource in RESOURCES_COUNTS
}
for partition in resources_count
}
partitions_info[partition] = resources

return (resources_display, resources_info)
return partitions_info

singularity_cmd = traitlets.List(
trait=traitlets.Unicode(),
Expand Down Expand Up @@ -240,31 +218,29 @@ async def _get_partitions_info(self):
out = await self.run_command(cmd)

# Parse command output
resources_display, resources_info = self.slurm_info_resources(out)
dbgmsg = "Slurm partition resources displayed as available resources: %s"
self.log.debug(dbgmsg, resources_display)
resources_info = self.slurm_info_resources(out)
self.log.debug("Slurm partition resources: %s", resources_info)

for partition in resources_info:
if not all(
counter in resources_info[partition] for counter in RESOURCES_COUNTS
):
errmsg = "Missing required resource counter in Slurm partition: {}"
raise KeyError(errmsg.format(partition))

# use data from Slurm as base and overwrite with manual configuration settings
partitions_info = {
partition: {**resources_info[partition], **config_partition_info}
for partition, config_partition_info in self.partitions.items()
}

for partition, info in partitions_info.items():
for key in REQUIRED_RESOURCES_COUNTS:
if key not in info:
raise KeyError(
f"Missing required resource '{key}' for partition '{partition}'"
)

# Ensure returning a dict that can be modified by the callers
return (resources_display, deepcopy(partitions_info))
return deepcopy(partitions_info)

@staticmethod
async def create_options_form(spawner):
"""Create a form for the user to choose the configuration for the SLURM job"""
resources_display, partitions_info = await spawner._get_partitions_info()
partitions_info = await spawner._get_partitions_info()

simple_partitions = [
partition for partition, info in partitions_info.items() if info["simple"]
Expand All @@ -284,7 +260,6 @@ async def create_options_form(spawner):
{
"partitions": partitions_info,
"default_partition": default_partition,
"resources_display": resources_display,
}
)

Expand All @@ -293,7 +268,6 @@ async def create_options_form(spawner):
hash_option_form_js=RESOURCES_HASH["option_form.js"],
partitions=partitions_info,
default_partition=default_partition,
resources_display=resources_display,
batchspawner_version=BATCHSPAWNER_VERSION,
jupyterhub_version=JUPYTERHUB_VERSION,
jsondata=jsondata,
Expand Down Expand Up @@ -459,7 +433,7 @@ def __update_spawn_commands(self, cmd_path):
self.cmd = [os.path.join(cmd_path, "jupyterhub-singleuser")]

async def start(self):
_, partitions_info = await self._get_partitions_info()
partitions_info = await self._get_partitions_info()
partition_info = partitions_info[self.user_options["partition"]]

# Exceptions raised by the checks are caught by the caller, and
Expand Down
58 changes: 22 additions & 36 deletions jupyterhub_moss/templates/option_form.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,23 @@
{#
  Render the "Available resources" table used as the footer of a form tab.

  partitions:  mapping of partition name -> partition info. Each entry is read
               for 'simple', 'ncores_idle', 'ncores_total' and 'nnodes_total'.
  simple_only: when true, only partitions whose 'simple' flag is set are
               listed; when false, every partition is listed.

  The node column shows 'nnodes_total' (the total node count of the
  partition), so it is labelled "Total nodes" rather than "Idle nodes".
#}
{% macro resource_tab_footer(partitions, simple_only) %}
<h4 style="text-align: center">Available resources</h4>
<table class="table">
  <tr class="active">
    <th>Partition</th>
    <th>Idle CPU cores</th>
    <th>Total nodes</th>
  </tr>
  {% for name, partition in partitions.items() %}
  {% if partition.simple or not simple_only %}
  <tr>
    <th>{{ name }}</th>
    <th>{{ partition['ncores_idle'] }} <small>/ {{ partition['ncores_total'] }}</small></th>
    <th>{{ partition['nnodes_total'] }}</th>
  </tr>
  {% endif %}
  {% endfor %}
</table>
{% endmacro %}

<link href="/hub/form/option_form.css?v={{hash_option_form_css}}" rel="stylesheet" />
<script>
window.SLURM_DATA = JSON.parse('{{ jsondata }}');
Expand Down Expand Up @@ -125,25 +145,7 @@
</select>
</div>
{% block simple_tab_footer %}
<h4 style="text-align: left">Available resources at current time</h4>
<table class="table">
<tr class="active">
<th>Partition</th>
{% for res_name in resources_display %}
<th>{{ res_name }}</th>
{% endfor %}
</tr>
{% for name, partition in partitions.items() %}
{% if partition.simple %}
<tr>
<th>{{ name }}</th>
{% for res in partition['available_counts'] %}
<td>{{ res }}</td>
{% endfor %}
</tr>
{% endif %}
{% endfor %}
</table>
{{ resource_tab_footer(partitions, simple_only=true) }}
{% endblock simple_tab_footer %}
</div>
<div id="menu1" class="tab-pane fade indent-right" align="right">
Expand Down Expand Up @@ -288,23 +290,7 @@ <h4 style="text-align: left">Available resources at current time</h4>
/>
</div>
{% block advanced_tab_footer %}
<h4 style="text-align: left">Available resources at current time</h4>
<table class="table">
<tr class="active">
<th>Partition</th>
{% for res_name in resources_display %}
<th>{{ res_name }}</th>
{% endfor %}
</tr>
{% for name, partition in partitions.items() %}
<tr>
<th>{{ name }}</th>
{% for res in partition['available_counts'] %}
<td>{{ res }}</td>
{% endfor %}
</tr>
{% endfor %}
</table>
{{ resource_tab_footer(partitions, simple_only=false) }}
{% endblock advanced_tab_footer %}
</div>
</div>

0 comments on commit fbcf07c

Please sign in to comment.