-
Notifications
You must be signed in to change notification settings - Fork 35
/
kubernetes_job_manager.py
322 lines (287 loc) · 12.7 KB
/
kubernetes_job_manager.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
# -*- coding: utf-8 -*-
#
# This file is part of REANA.
# Copyright (C) 2019 CERN.
#
# REANA is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.
"""Kubernetes Job Manager."""
import ast
import logging
import os
import traceback
import uuid
from flask import current_app
from kubernetes import client
from kubernetes.client.models.v1_delete_options import V1DeleteOptions
from kubernetes.client.rest import ApiException
from reana_commons.config import (CVMFS_REPOSITORIES, K8S_CERN_EOS_AVAILABLE,
K8S_CERN_EOS_MOUNT_CONFIGURATION,
K8S_DEFAULT_NAMESPACE,
WORKFLOW_RUNTIME_USER_GID,
WORKFLOW_RUNTIME_USER_UID)
from reana_commons.k8s.api_client import current_k8s_batchv1_api_client
from reana_commons.k8s.secrets import REANAUserSecretsStore
from reana_commons.k8s.volumes import get_k8s_cvmfs_volume, get_shared_volume
from retrying import retry
from reana_job_controller.errors import ComputingBackendSubmissionError
from reana_job_controller.job_manager import JobManager
class KubernetesJobManager(JobManager):
    """Kubernetes job management."""

    MAX_NUM_RESUBMISSIONS = 3
    """Maximum number of job submission/creation tries."""
    MAX_NUM_JOB_RESTARTS = 0
    """Maximum number of job restarts in case of internal failures."""

    def __init__(self, docker_img=None, cmd=None, prettified_cmd=None,
                 env_vars=None, workflow_uuid=None, workflow_workspace=None,
                 cvmfs_mounts='false', shared_file_system=False, job_name=None,
                 kerberos=False, kubernetes_uid=None):
        """Instantiate kubernetes job manager.

        :param docker_img: Docker image.
        :type docker_img: str
        :param cmd: Command to execute.
        :type cmd: list
        :param prettified_cmd: Prettified version of command to execute.
        :type prettified_cmd: str
        :param env_vars: Environment variables.
        :type env_vars: dict
        :param workflow_uuid: Unique workflow id.
        :type workflow_uuid: str
        :param workflow_workspace: Workflow workspace path.
        :type workflow_workspace: str
        :param cvmfs_mounts: list of CVMFS mounts as a string.
        :type cvmfs_mounts: str
        :param shared_file_system: if shared file system is available.
        :type shared_file_system: bool
        :param job_name: Name of the job.
        :type job_name: str
        :param kerberos: Decides if kerberos should be provided for job.
        :type kerberos: bool
        :param kubernetes_uid: User ID for job container.
        :type kubernetes_uid: int
        """
        super(KubernetesJobManager, self).__init__(
            docker_img=docker_img,
            cmd=cmd,
            prettified_cmd=prettified_cmd,
            env_vars=env_vars,
            workflow_uuid=workflow_uuid,
            workflow_workspace=workflow_workspace,
            job_name=job_name)
        self.compute_backend = "Kubernetes"
        self.cvmfs_mounts = cvmfs_mounts
        self.shared_file_system = shared_file_system
        self.kerberos = kerberos
        # Validates the requested UID; UIDs < 100 fall back to the
        # default workflow runtime UID (see ``set_user_id``).
        self.set_user_id(kubernetes_uid)

    @JobManager.execution_hook
    def execute(self):
        """Execute a job in Kubernetes.

        Builds the Kubernetes Job manifest, attaches the user's secrets,
        environment variables, volumes (shared, EOS, CVMFS, hostPath),
        image pull secrets and, optionally, a Kerberos init container,
        then submits the manifest to the cluster.

        :returns: Backend job id of the created Kubernetes job.
        :rtype: str
        """
        backend_job_id = str(uuid.uuid4())
        self.job = {
            'kind': 'Job',
            'apiVersion': 'batch/v1',
            'metadata': {
                'name': backend_job_id,
                'namespace': K8S_DEFAULT_NAMESPACE
            },
            'spec': {
                'backoffLimit': KubernetesJobManager.MAX_NUM_JOB_RESTARTS,
                'autoSelector': True,
                'template': {
                    'metadata': {
                        'name': backend_job_id
                    },
                    'spec': {
                        'containers': [
                            {
                                'image': self.docker_img,
                                'command': self.cmd,
                                'name': 'job',
                                'env': [],
                                'volumeMounts': [],
                            }
                        ],
                        'initContainers': [],
                        'volumes': [],
                        'restartPolicy': 'Never'
                    }
                }
            }
        }
        # Expose the user's secrets to the job container both as
        # environment variables and as a mounted volume.
        user_id = os.getenv('REANA_USER_ID')
        secrets_store = REANAUserSecretsStore(user_id)
        secret_env_vars = secrets_store.get_env_secrets_as_k8s_spec()
        self.job['spec']['template']['spec']['containers'][0]['env'].extend(
            secret_env_vars
        )
        self.job['spec']['template']['spec']['volumes'].append(
            secrets_store.get_file_secrets_volume_as_k8s_specs()
        )
        secrets_volume_mount = \
            secrets_store.get_secrets_volume_mount_as_k8s_spec()
        self.job['spec']['template']['spec']['containers'][0]['volumeMounts'] \
            .append(secrets_volume_mount)
        if self.env_vars:
            for var, value in self.env_vars.items():
                self.job['spec']['template']['spec'][
                    'containers'][0]['env'].append({'name': var,
                                                    'value': value})
        self.add_hostpath_volumes()
        self.add_shared_volume()
        self.add_eos_volume()
        self.add_image_pull_secrets()
        if self.cvmfs_mounts != 'false':
            # ``cvmfs_mounts`` arrives as the string representation of a
            # Python list, hence the ``ast.literal_eval``. Only known
            # repositories (keys of CVMFS_REPOSITORIES) are mounted.
            cvmfs_map = {}
            for cvmfs_mount_path in ast.literal_eval(self.cvmfs_mounts):
                if cvmfs_mount_path in CVMFS_REPOSITORIES:
                    cvmfs_map[
                        CVMFS_REPOSITORIES[cvmfs_mount_path]] = \
                        cvmfs_mount_path
            for repository, mount_path in cvmfs_map.items():
                volume = get_k8s_cvmfs_volume(repository)
                (self.job['spec']['template']['spec']['containers'][0]
                 ['volumeMounts'].append(
                     {'name': volume['name'],
                      'mountPath': '/cvmfs/{}'.format(mount_path),
                      'readOnly': volume['readOnly']}
                 ))
                self.job['spec']['template']['spec']['volumes'].append(volume)
        # Run the job pod with the configured group and the validated UID.
        self.job['spec']['template']['spec']['securityContext'] = \
            client.V1PodSecurityContext(
                run_as_group=WORKFLOW_RUNTIME_USER_GID,
                run_as_user=self.kubernetes_uid)
        if self.kerberos:
            self._add_krb5_init_container(secrets_volume_mount)
        backend_job_id = self._submit()
        return backend_job_id

    @retry(stop_max_attempt_number=MAX_NUM_RESUBMISSIONS)
    def _submit(self):
        """Submit the built job manifest and return its backend id.

        :returns: Name of the created Kubernetes job.
        :rtype: str
        :raises ApiException: If the Kubernetes API call still fails after
            ``MAX_NUM_RESUBMISSIONS`` attempts.
        """
        try:
            current_k8s_batchv1_api_client.create_namespaced_job(
                namespace=K8S_DEFAULT_NAMESPACE,
                body=self.job)
            return self.job['metadata']['name']
        except ApiException as e:
            logging.error("Error while connecting to Kubernetes"
                          " API: {}".format(e))
            # Re-raise so that ``@retry`` can actually resubmit the job:
            # swallowing the exception here disabled the retry mechanism
            # and made ``execute`` return ``None`` as the backend job id.
            raise
        except Exception as e:
            logging.error(traceback.format_exc())
            logging.debug("Unexpected error: {}".format(e))
            raise

    @staticmethod
    def stop(backend_job_id, asynchronous=True):
        """Stop Kubernetes job execution.

        Declared as a static method: the original signature took no
        ``self``, so ``@staticmethod`` keeps class-level calls unchanged
        while fixing instance-level calls (which would otherwise bind the
        instance to ``backend_job_id``).

        :param backend_job_id: Kubernetes job id.
        :param asynchronous: Whether the function waits for the action to be
            performed or does it asynchronously.
        :raises ComputingBackendSubmissionError: If the deletion request to
            the Kubernetes API fails.
        """
        try:
            # 'Background' returns immediately and lets Kubernetes garbage
            # collect the pods; 'Foreground' blocks until dependents are gone.
            propagation_policy = 'Background' if asynchronous else 'Foreground'
            delete_options = V1DeleteOptions(
                propagation_policy=propagation_policy)
            current_k8s_batchv1_api_client.delete_namespaced_job(
                backend_job_id, K8S_DEFAULT_NAMESPACE,
                body=delete_options)
        except ApiException as e:
            logging.error(
                'An error has occurred while connecting to Kubernetes API '
                'Server \n {}'.format(e))
            raise ComputingBackendSubmissionError(e.reason)

    def add_shared_volume(self):
        """Add shared CephFS volume to a given job spec."""
        if self.shared_file_system:
            volume_mount, volume = get_shared_volume(
                self.workflow_workspace)
            self.add_volumes([(volume_mount, volume)])

    def add_eos_volume(self):
        """Add EOS volume to a given job spec (CERN deployments only)."""
        if K8S_CERN_EOS_AVAILABLE:
            self.add_volumes([(
                K8S_CERN_EOS_MOUNT_CONFIGURATION['volumeMounts'],
                K8S_CERN_EOS_MOUNT_CONFIGURATION['volume'])])

    def add_image_pull_secrets(self):
        """Attach to the container the configured image pull secrets."""
        # Empty names in the configuration are filtered out.
        image_pull_secrets = []
        for secret_name in current_app.config['IMAGE_PULL_SECRETS']:
            if secret_name:
                image_pull_secrets.append({'name': secret_name})
        self.job['spec']['template']['spec']['imagePullSecrets'] = \
            image_pull_secrets

    def add_hostpath_volumes(self):
        """Add hostPath mounts from configuration to job."""
        volumes_to_mount = []
        for name, path in current_app.config['JOB_HOSTPATH_MOUNTS']:
            volume_mount = {'name': name,
                            'mountPath': path}
            volume = {
                'name': name,
                'hostPath': {'path': path}}
            volumes_to_mount.append((volume_mount, volume))
        self.add_volumes(volumes_to_mount)

    def add_volumes(self, volumes):
        """Add provided volumes to job.

        :param volumes: A list of tuple composed 1st of a Kubernetes
            volumeMount spec and 2nd of Kubernetes volume spec.
        """
        for volume_mount, volume in volumes:
            self.job['spec']['template']['spec']['containers'][0][
                'volumeMounts'].append(volume_mount)
            self.job['spec']['template']['spec']['volumes'].append(volume)

    def _add_krb5_init_container(self, secrets_volume_mount):
        """Add Kerberos init container and shared ticket cache to the job.

        The init container runs ``kinit`` with the user's keytab and writes
        the ticket cache into an ``emptyDir`` volume that is also mounted
        into the job container.

        :param secrets_volume_mount: Kubernetes volumeMount spec of the
            user secrets volume (provides the keytab file).
        """
        krb5_config_map_name = current_app.config['KRB5_CONFIGMAP_NAME']
        # Shared in-memory volume holding the Kerberos ticket cache.
        ticket_cache_volume = {
            'name': 'krb5-cache',
            'emptyDir': {}
        }
        # krb5.conf is delivered via a ConfigMap.
        krb5_config_volume = {
            'name': 'krb5-conf',
            'configMap': {'name': krb5_config_map_name}
        }
        volume_mounts = [
            {
                'name': ticket_cache_volume['name'],
                'mountPath': current_app.config['KRB5_TOKEN_CACHE_LOCATION']
            },
            {
                'name': krb5_config_volume['name'],
                'mountPath': '/etc/krb5.conf',
                'subPath': 'krb5.conf'
            }
        ]
        keytab_file = os.environ.get('CERN_KEYTAB')
        cern_user = os.environ.get('CERN_USER')
        krb5_container = {
            'image': current_app.config['KRB5_CONTAINER_IMAGE'],
            'command': ['kinit', '-kt',
                        '/etc/reana/secrets/{}'.format(keytab_file),
                        '{}@CERN.CH'.format(cern_user)],
            'name': current_app.config['KRB5_CONTAINER_NAME'],
            'imagePullPolicy': 'IfNotPresent',
            'volumeMounts': [secrets_volume_mount] + volume_mounts,
            'security_context': client.V1PodSecurityContext(
                run_as_group=WORKFLOW_RUNTIME_USER_GID,
                run_as_user=self.kubernetes_uid)
        }
        self.job['spec']['template']['spec']['volumes'].extend(
            [ticket_cache_volume, krb5_config_volume])
        self.job['spec']['template']['spec']['containers'][0][
            'volumeMounts'].extend(volume_mounts)
        # Add the Kerberos token cache file location to the job container
        # so every instance of Kerberos picks it up even if it doesn't read
        # the configuration file.
        self.job['spec']['template']['spec']['containers'][0][
            'env'].append({'name': 'KRB5CCNAME',
                           'value': os.path.join(
                               current_app.config['KRB5_TOKEN_CACHE_LOCATION'],
                               current_app.config['KRB5_TOKEN_CACHE_FILENAME']
                               .format(self.kubernetes_uid)
                           )})
        self.job['spec']['template']['spec']['initContainers'].append(
            krb5_container)

    def set_user_id(self, kubernetes_uid):
        """Set user id for job pods. UIDs < 100 are refused for security."""
        if kubernetes_uid and kubernetes_uid >= 100:
            self.kubernetes_uid = kubernetes_uid
        else:
            self.kubernetes_uid = WORKFLOW_RUNTIME_USER_UID