Skip to content

Commit

Permalink
Merge pull request #30 from stackhpc/feat/autoscaler-service-account
Browse files Browse the repository at this point in the history
Replaced mounted kubeconfig with service account
  • Loading branch information
sjpb committed Aug 17, 2023
2 parents 057651a + a2ca5e3 commit 96d933f
Show file tree
Hide file tree
Showing 9 changed files with 51 additions and 33 deletions.
3 changes: 0 additions & 3 deletions generate-secrets.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,3 @@ kubectl create secret generic munge-key-secret \
--from-literal=munge.key=$(dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64 -w 0) \
-o yaml | \
kubectl apply -f -

cp $KUBECONFIG slurm-cluster-chart/files/kubeconfig
echo "copied $KUBECONFIG into slurm-cluster-chart/files/"
6 changes: 3 additions & 3 deletions image/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ LABEL org.opencontainers.image.source="https://github.com/stackhpc/slurm-docker-
ARG SLURM_TAG=slurm-23.02
ARG GOSU_VERSION=1.11

COPY kubernetes.repo /etc/yum.repos.d/kubernetes.repo

RUN set -ex \
&& yum makecache \
&& yum -y update \
Expand Down Expand Up @@ -42,6 +44,7 @@ RUN set -ex \
hwloc-devel \
openssh-server \
apptainer \
kubectl \
&& yum clean all \
&& rm -rf /var/cache/yum

Expand Down Expand Up @@ -91,9 +94,6 @@ RUN mkdir /etc/sysconfig/slurm \
&& useradd -u 1000 rocky \
&& usermod -p '*' rocky # unlocks account but sets no password

COPY kubernetes.repo /etc/yum.repos.d/kubernetes.repo
RUN dnf install -y kubectl

VOLUME /etc/slurm
COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
COPY --chown=slurm:slurm --chmod=744 k8s-slurmd-* /usr/local/bin/
Expand Down
4 changes: 0 additions & 4 deletions image/docker-entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,6 @@ then
echo "---> Setting ownership for state directory ..."
chown slurm:slurm /var/spool/slurmctld

echo "---> Copying Kubeconfig ..."
install -o slurm -g slurm -m u=rwX,go= -d /var/lib/slurmctld/
install -o slurm -g slurm -m u=r,go= /tmp/kubeconfig /var/lib/slurmctld/

echo "---> Starting the Slurm Controller Daemon (slurmctld) ..."
if /usr/sbin/slurmctld -V | grep -q '17.02' ; then
exec gosu slurm /usr/sbin/slurmctld -D "${@:2}"
Expand Down
13 changes: 9 additions & 4 deletions image/k8s-slurmd-create
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
#!/usr/bin/bash
# Resume handler: logs its invocation to /var/log/slurm/power_save.log, then
# creates one pod per hostname expanded from $1, using the template at
# /etc/slurm/slurmd-pod-template.yml.
# NOTE(review): this listing is a flattened diff (its header reports 9 additions
# and 4 deletions), so pre- and post-commit lines appear together below - e.g.
# both echo lines and both `done` lines. Confirm against the repository before
# treating it as a runnable script.

# Point kubectl at the kubeconfig stored under /var/lib/slurmctld.
export KUBECONFIG=/var/lib/slurmctld/kubeconfig
# Record the script path and arguments in the power-save log (&>> appends both stdout and stderr).
echo "$(date) Resume invoked $0 $*" &>> /var/log/slurm/power_save.log

# Same log entry, appending stdout only.
echo "$(date) Resume invoked $0 $*" >> /var/log/slurm/power_save.log
# API server address, plus the namespace, token and CA certificate path taken
# from the service account directory.
APISERVER=https://kubernetes.default.svc
SERVICEACCOUNT=/var/run/secrets/kubernetes.io/serviceaccount
# NOTE(review): NAMESPACE is assigned but not referenced by any later line in this listing.
NAMESPACE=$(cat ${SERVICEACCOUNT}/namespace)
TOKEN=$(cat ${SERVICEACCOUNT}/token)
CACERT=${SERVICEACCOUNT}/ca.crt

# Expand the host expression in $1 into individual hostnames.
hosts=$(scontrol show hostnames $1) # this is purely a textual expansion, doesn't depend on defined nodes
for host in $hosts
do
# Replace the SLURMD_NODENAME placeholder with the hostname and pipe the manifest to kubectl create.
sed s/SLURMD_NODENAME/$host/ /etc/slurm/slurmd-pod-template.yml | kubectl create -f -
done
# Same pipeline run in a subshell, with the server, token and CA certificate
# passed to kubectl explicitly.
( sed s/SLURMD_NODENAME/$host/ /etc/slurm/slurmd-pod-template.yml | \
kubectl --server $APISERVER --token $TOKEN --certificate-authority $CACERT create -f - )
done
10 changes: 7 additions & 3 deletions image/k8s-slurmd-delete
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
#!/usr/bin/bash
# Suspend handler: logs its invocation to /var/log/slurm/power_save.log, then
# deletes the pod named after each hostname expanded from $1.
# NOTE(review): this listing is a flattened diff (its header reports 7 additions
# and 3 deletions), so pre- and post-commit lines appear together below - e.g.
# both `kubectl ... delete pod` lines. Confirm against the repository before
# treating it as a runnable script.

# Point kubectl at the kubeconfig stored under /var/lib/slurmctld.
export KUBECONFIG=/var/lib/slurmctld/kubeconfig

# Record the script path and arguments in the power-save log.
echo "$(date) Suspend invoked $0 $*" >> /var/log/slurm/power_save.log

# API server address, plus the namespace, token and CA certificate path taken
# from the service account directory.
APISERVER=https://kubernetes.default.svc
SERVICEACCOUNT=/var/run/secrets/kubernetes.io/serviceaccount
# NOTE(review): NAMESPACE is assigned but not referenced by any later line in this listing.
NAMESPACE=$(cat ${SERVICEACCOUNT}/namespace)
TOKEN=$(cat ${SERVICEACCOUNT}/token)
CACERT=${SERVICEACCOUNT}/ca.crt

# Expand the host expression in $1 into individual hostnames.
hosts=$(scontrol show hostnames $1) # this is purely a textual expansion, doesn't depend on defined nodes
for host in $hosts
do
# Delete the pod whose name equals the hostname, relying on KUBECONFIG for access.
kubectl delete pod $host
# Same deletion with the server, token and CA certificate passed to kubectl explicitly.
kubectl --server $APISERVER --token $TOKEN --certificate-authority $CACERT delete pod $host
done
8 changes: 0 additions & 8 deletions slurm-cluster-chart/templates/kubeconfig.yml

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Service account, Role and RoleBinding that grant pod-management permissions
# to workloads running as slurm-autoscaler-account (the slurmctld StatefulSet
# selects it via serviceAccountName).
apiVersion: v1
kind: ServiceAccount
metadata:
  name: slurm-autoscaler-account
# Mount the account's API credentials into pods that use this account.
automountServiceAccountToken: true

---

apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: slurm-autoscaler-role
rules:
  - apiGroups: [""] # "" indicates the core API group
    resources: ["pods"]
    # "apply" is not an API request verb (kubectl apply is authorised as
    # get/create/patch), so it is not listed here.
    verbs: ["get", "create", "patch", "delete", "list", "watch"]

---

# Bind the role to the service account. No namespace is given on the subject,
# so both objects are resolved in the namespace this manifest is installed into.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: slurm-autoscaler-rolebinding
subjects:
  - kind: ServiceAccount
    name: slurm-autoscaler-account
roleRef:
  kind: Role
  name: slurm-autoscaler-role
  apiGroup: rbac.authorization.k8s.io
8 changes: 1 addition & 7 deletions slurm-cluster-chart/templates/slurmctld-statefulset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ spec:
app.kubernetes.io/name: slurm
app.kubernetes.io/component: slurmctld
spec:
serviceAccountName: slurm-autoscaler-account
containers:
- args:
- slurmctld
Expand All @@ -38,9 +39,6 @@ spec:
subPath: munge.key
- mountPath: /var/spool/slurmctld
name: slurmctld-state
- mountPath: /tmp/kubeconfig
name: kubeconfig-secret
subPath: kubeconfig
dnsConfig:
searches:
- slurmd.default.svc.cluster.local
Expand All @@ -63,7 +61,3 @@ spec:
secret:
secretName: {{ .Values.secrets.mungeKey }}
defaultMode: 0400
- name: kubeconfig-secret
secret:
secretName: kubeconfig-secret
defaultMode: 0400
2 changes: 1 addition & 1 deletion slurm-cluster-chart/values.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:9e4598e
slurmImage: ghcr.io/stackhpc/slurm-docker-cluster:a731c60

replicas:
slurmd: 2
Expand Down

0 comments on commit 96d933f

Please sign in to comment.