Skip to content
Permalink
Browse files

Check the true status for master pod when TensorBoard is enabled (#1429)

Signed-off-by: terrytangyuan <terrytangyuan@gmail.com>
  • Loading branch information...
terrytangyuan committed Nov 7, 2019
1 parent f3f9b60 commit c7f678e2617d1e0f20683a84275558e5adf8f452
Showing with 17 additions and 0 deletions.
  1. +17 −0 scripts/validate_job_status.sh
@@ -16,8 +16,18 @@ function get_pod_status {
echo ${pod_status}
}

#######################################
# Print the value of the master pod's `metadata.labels.status` label.
#
# If the TensorBoard service keeps running after the tasks are finished,
# the master pod phase stays "Running" and therefore cannot reflect the
# true job status. When TensorBoard is enabled, the true status is found
# under the master pod's `metadata.labels.status` instead.
#
# Globals:   MASTER_POD_NAME (read) - name of the master pod to query
# Arguments: none
# Outputs:   the label value on stdout (an empty line if kubectl fails
#            or prints nothing)
# Returns:   0, even when kubectl fails, so a polling caller keeps going
#######################################
function get_master_pod_label_status {
  # Declare separately from the assignment so `local` does not mask the
  # exit status of the command substitution.
  local master_pod_status
  # Deliberately best-effort: a failed lookup yields whatever kubectl
  # printed (normally nothing) and the caller simply polls again.
  master_pod_status=$(kubectl get pod "${MASTER_POD_NAME}" \
    -o jsonpath='{.metadata.labels.status}') || true
  # printf with a quoted expansion emits the value verbatim (no
  # word-splitting, globbing, or echo option parsing).
  printf '%s\n' "${master_pod_status}"
}

for i in {1..200}; do
MASTER_POD_STATUS=$(get_pod_status ${MASTER_POD_NAME})
MASTER_POD_LABEL_STATUS=$(get_master_pod_label_status)
WORKER_0_POD_STATUS=$(get_pod_status ${WORKER_0_POD_NAME})
WORKER_1_POD_STATUS=$(get_pod_status ${WORKER_1_POD_NAME})

@@ -27,6 +37,13 @@ for i in {1..200}; do
echo "ElasticDL job succeeded."
kubectl delete pod ${MASTER_POD_NAME}
exit 0
elif [[ "$MASTER_POD_STATUS" == "Running" ]] &&
[[ "$MASTER_POD_LABEL_STATUS" == "Finished" ]] &&
[[ "$WORKER_0_POD_STATUS" == "Succeeded" ]] &&
[[ "$WORKER_1_POD_STATUS" == "Succeeded" ]]; then
echo "ElasticDL job succeeded (master pod keeps running for TensorBoard service)."
kubectl delete pod ${MASTER_POD_NAME}
exit 0
elif [[ "$MASTER_POD_STATUS" == "Failed" ]] ||
[[ "$WORKER_0_POD_STATUS" == "Failed" ]] ||
[[ "$WORKER_1_POD_STATUS" == "Failed" ]]; then

0 comments on commit c7f678e

Please sign in to comment.
You can’t perform that action at this time.