From f6ca89967fd63d6f3d38d7904bb33d6b98f7545c Mon Sep 17 00:00:00 2001
From: workingloong
Date: Wed, 12 Aug 2020 11:41:33 +0800
Subject: [PATCH 1/3] Install Horovod and dependencies in the dev image

---
 elasticdl/docker/Dockerfile | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/elasticdl/docker/Dockerfile b/elasticdl/docker/Dockerfile
index 91da12b6e..f9a344c4f 100644
--- a/elasticdl/docker/Dockerfile
+++ b/elasticdl/docker/Dockerfile
@@ -7,9 +7,25 @@ ARG EXTRA_PYPI_INDEX=https://pypi.org/simple
 COPY elasticdl/docker/bashrc /etc/bash.bashrc
 RUN chmod a+rx /etc/bash.bashrc
 
-RUN apt-get -qq update && \
-    apt-get -qq install -y unzip curl git software-properties-common g++ wget \
-    shellcheck libeigen3-dev clang-format > /dev/null && \
+RUN apt-get -qq update && apt-get -qq install -y \
+    unzip \
+    curl \
+    git \
+    software-properties-common \
+    g++ \
+    wget \
+    build-essential \
+    cmake \
+    vim \
+    ca-certificates \
+    libjpeg-dev \
+    libpng-dev \
+    librdmacm1 \
+    libibverbs1 \
+    ibverbs-providers \
+    shellcheck \
+    libeigen3-dev \
+    clang-format > /dev/null && \
     python -m pip install --quiet --upgrade pip
 
 COPY elasticdl_client/requirements.txt /requirements.txt
@@ -61,6 +77,8 @@ COPY elasticdl/python/data/recordio_gen/heart_recordio_gen.py /scripts/heart_rec
 
 FROM dev as allreduce
 
+RUN pip install future typing
+
 # Note that pip is having issue downloading PyTorch on manylinux so we use curl
 # to download it instead
 RUN curl -sLo torch-1.4.0-cp36-cp36m-manylinux1_x86_64.whl \
@@ -72,3 +90,10 @@ RUN cd /root && git clone --depth=1 https://github.com/caicloud/ftlib.git
 RUN cd /root/ftlib && python -m pip install --quiet -r requirements.txt
 RUN cd /root/ftlib/ftlib/consensus/gossip && bash ./gen_shared_lib.sh
 RUN cp -r /root/ftlib/ftlib /usr/local/lib/python3.6/dist-packages/ftlib
+
+ENV HOROVOD_PATH /tmp/horovod
+RUN cd /tmp \
+    && git clone --recursive https://github.com/horovod/horovod.git
+
+RUN cd ${HOROVOD_PATH} && HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 \
+    python setup.py install
\ No newline at end of file

From f54f63de43a04d2e8d7e5767193f27b985034f1b Mon Sep 17 00:00:00 2001
From: workingloong
Date: Wed, 12 Aug 2020 11:50:13 +0800
Subject: [PATCH 2/3] Add a blank line at the end

---
 elasticdl/docker/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/elasticdl/docker/Dockerfile b/elasticdl/docker/Dockerfile
index f9a344c4f..dcbdbe1de 100644
--- a/elasticdl/docker/Dockerfile
+++ b/elasticdl/docker/Dockerfile
@@ -96,4 +96,4 @@ RUN cd /tmp \
     && git clone --recursive https://github.com/horovod/horovod.git
 
 RUN cd ${HOROVOD_PATH} && HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 \
-    python setup.py install
\ No newline at end of file
+    python setup.py install

From 1927a6b4b6db57dd5c6e77f96e0c31b462db1d3b Mon Sep 17 00:00:00 2001
From: workingloong
Date: Wed, 12 Aug 2020 16:02:02 +0800
Subject: [PATCH 3/3] Set commit id to clone

---
Review notes (text in this section is ignored by `git am`):

* `git clone <url> -b master ${HOROVOD_COMMIT_ID}` treats the trailing
  argument as the target directory, so it checks out the tip of master
  into a directory that is only *named* after the commit. The hunk below
  therefore adds an explicit `git checkout ${HOROVOD_COMMIT_ID}` followed
  by `git submodule update --init --recursive`, so the pinned commit is
  what actually gets built.
* The hunk header and diffstat were recomputed for that change. The commit
  id in the `From` line and the post-image blob id in the `index` line
  still describe the original change; regenerate the patch with
  `git format-patch` if they need to match.
* Continuation-line indentation is shown as 4 spaces; confirm it against
  the target file before applying.

 elasticdl/docker/Dockerfile | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/elasticdl/docker/Dockerfile b/elasticdl/docker/Dockerfile
index dcbdbe1de..2d22605c4 100644
--- a/elasticdl/docker/Dockerfile
+++ b/elasticdl/docker/Dockerfile
@@ -3,6 +3,7 @@ ARG BASE_IMAGE
 FROM ${BASE_IMAGE} as dev
 
 ARG EXTRA_PYPI_INDEX=https://pypi.org/simple
+ARG HOROVOD_COMMIT_ID="3108a24"
 
 COPY elasticdl/docker/bashrc /etc/bash.bashrc
 RUN chmod a+rx /etc/bash.bashrc
@@ -91,9 +92,19 @@ RUN cd /root/ftlib && python -m pip install --quiet -r requirements.txt
 RUN cd /root/ftlib/ftlib/consensus/gossip && bash ./gen_shared_lib.sh
 RUN cp -r /root/ftlib/ftlib /usr/local/lib/python3.6/dist-packages/ftlib
 
-ENV HOROVOD_PATH /tmp/horovod
+# The latest package of Horovod does not support elastic training,
+# so we need to git clone and install it from source. The clone is
+# checked out at HOROVOD_COMMIT_ID so the build does not follow master.
+ENV HOROVOD_PATH /tmp/${HOROVOD_COMMIT_ID}
 RUN cd /tmp \
-    && git clone --recursive https://github.com/horovod/horovod.git
-
-RUN cd ${HOROVOD_PATH} && HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 \
-    python setup.py install
+    && git clone https://github.com/horovod/horovod.git \
+    -b master ${HOROVOD_COMMIT_ID} \
+    && cd ${HOROVOD_PATH} \
+    && git checkout ${HOROVOD_COMMIT_ID} \
+    && git submodule update --init --recursive
+
+RUN cd ${HOROVOD_PATH} && HOROVOD_WITHOUT_MPI=1 \
+    HOROVOD_WITHOUT_MXNET=1 \
+    HOROVOD_WITH_TENSORFLOW=1 \
+    HOROVOD_WITH_PYTORCH=1 \
+    python setup.py install