Skip to content
Permalink
Browse files

Add ability for dumping YAML file of the ElasticDL job (#1526)

* Support for dumping pod as yaml

* a

* Add parameter for yaml dump

* Add --yaml parameter supporting YAML dump

* Clean format

* Remove unused pkg
  • Loading branch information
ywskycn committed Nov 27, 2019
1 parent 536fafe commit 18bf0785290d5c238352a36237e0128af98e5d4a
@@ -1,5 +1,5 @@
[settings]
multi_line_output=3
line_length=79
known_third_party = PIL,docker,google,grpc,kubernetes,numpy,odps,pyspark,recordio,requests,setuptools,tensorflow
known_third_party = PIL,docker,google,grpc,kubernetes,numpy,odps,pyspark,recordio,requests,setuptools,tensorflow,yaml
include_trailing_comma=True
@@ -171,7 +171,9 @@ def add_common_params(parser):
"default to worker_resource_request",
)
parser.add_argument(
"--ps_pod_priority", help="The requested priority of PS pod"
"--ps_pod_priority",
default="",
help="The requested priority of PS pod",
)
parser.add_argument(
"--volume",
@@ -200,6 +202,7 @@ def add_common_params(parser):
parser.add_argument(
"--envs",
type=str,
default="",
help="Runtime environment variables. (key1=value1,key2=value2), "
"comma is supported in value field",
)
@@ -237,6 +240,15 @@ def add_common_params(parser):
parser.add_argument(
"--docker_tlskey", help="Path to Docker client key", default=""
)
parser.add_argument(
"--yaml",
type=str,
default="",
help="File path for dumping ElasticDL job YAML specification. "
"Note that, if users specify --yaml, the client wouldn't submit "
"the job automatically, and users need to launch the job through "
"command `kubectl create -f path_to_yaml_file`.",
)


def add_train_params(parser):
@@ -2,6 +2,7 @@
import threading
import traceback

import yaml
from kubernetes import client, config, watch
from kubernetes.client import V1EnvVar, V1EnvVarSource, V1ObjectFieldSelector

@@ -268,6 +269,17 @@ def _create_pod(self, **kargs):
return pod

def create_master(self, **kargs):
pod = self._create_master_pod_obj(**kargs)
self.client.create_namespaced_pod(self.namespace, pod)
logger.info("Master launched.")

def dump_master_yaml(self, **kargs):
pod = self._create_master_pod_obj(**kargs)
pod_dict = self.client.api_client.sanitize_for_serialization(pod)
with open(kargs["yaml"], "w") as f:
yaml.safe_dump(pod_dict, f)

def _create_master_pod_obj(self, **kargs):
env = [
V1EnvVar(
name="MY_POD_IP",
@@ -298,8 +310,9 @@ def create_master(self, **kargs):
# Add replica type and index
pod.metadata.labels[ELASTICDL_REPLICA_TYPE_KEY] = "master"
pod.metadata.labels[ELASTICDL_REPLICA_INDEX_KEY] = "0"
self.client.create_namespaced_pod(self.namespace, pod)
logger.info("Master launched.")
pod.api_version = "v1"
pod.kind = "Pod"
return pod

def _create_ps_worker_pod(self, pod_name, type_key, index_key, **kargs):
# Find that master pod that will be used as the owner reference
@@ -138,20 +138,38 @@ def _submit_job(image_name, client_args, container_args):
cluster_spec=client_args.cluster_spec,
)

client.create_master(
resource_requests=client_args.master_resource_request,
resource_limits=client_args.master_resource_limit,
args=container_args,
pod_priority=client_args.master_pod_priority,
image_pull_policy=client_args.image_pull_policy,
restart_policy=client_args.restart_policy,
volume=client_args.volume,
envs=parse_envs(client_args.envs),
)
logger.info(
"ElasticDL job %s was successfully submitted. The master pod is: %s."
% (client_args.job_name, client.get_master_pod_name())
)
if client_args.yaml:
client.dump_master_yaml(
resource_requests=client_args.master_resource_request,
resource_limits=client_args.master_resource_limit,
args=container_args,
pod_priority=client_args.master_pod_priority,
image_pull_policy=client_args.image_pull_policy,
restart_policy=client_args.restart_policy,
volume=client_args.volume,
envs=parse_envs(client_args.envs),
yaml=client_args.yaml,
)
logger.info(
"ElasticDL job %s YAML has been dumped into file %s."
% (client_args.job_name, client_args.yaml)
)
else:
client.create_master(
resource_requests=client_args.master_resource_request,
resource_limits=client_args.master_resource_limit,
args=container_args,
pod_priority=client_args.master_pod_priority,
image_pull_policy=client_args.image_pull_policy,
restart_policy=client_args.restart_policy,
volume=client_args.volume,
envs=parse_envs(client_args.envs),
)
logger.info(
"ElasticDL job %s was successfully submitted. "
"The master pod is: %s."
% (client_args.job_name, client.get_master_pod_name())
)


def _model_zoo_in_docker(model_zoo):
@@ -3,4 +3,4 @@ kubernetes
docker
pyrecordio>=0.0.6
odps
redis-py-cluster
redis-py-cluster

0 comments on commit 18bf078

Please sign in to comment.
You can’t perform that action at this time.