Update base for Update on "[Gradient Compression] Allow BatchedPowerSGD to run vanilla allreduce for the first K iterations"

Similar to #50973, allow the batched version to run vanilla allreduce for the first K iterations.

This may be useful when the batched version is applied to use cases where the accuracy requirement is not very strict.
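As an illustration only (not part of this commit's diff), the sketch below registers the batched PowerSGD hook with a vanilla-allreduce warm-up. It assumes the torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook module and a start_powerSGD_iter argument on PowerSGDState, as exposed in later PyTorch releases; the exact signature at this revision may differ.

import torch
import torch.distributed as dist
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed.algorithms.ddp_comm_hooks import powerSGD_hook as powerSGD

# Single-process gloo group, only to keep the sketch self-contained.
dist.init_process_group(
    backend="gloo", init_method="tcp://127.0.0.1:29500", rank=0, world_size=1
)
model = DDP(nn.Linear(128, 128))

K = 1_000  # warm-up iterations that use vanilla allreduce before compression
state = powerSGD.PowerSGDState(
    process_group=None,            # default process group
    matrix_approximation_rank=1,
    start_powerSGD_iter=K,         # assumed name of the knob added by this stack
)
model.register_comm_hook(state, powerSGD.batched_powerSGD_hook)

# Training proceeds as usual; gradients are allreduced verbatim for the first
# K iterations and compressed with batched PowerSGD afterwards.
loss = model(torch.randn(4, 128)).sum()
loss.backward()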

Original PR issue: Investigate Applying PowerSGD to Communication Hook for Gradient Compression #47202

Differential Revision: [D26077709](https://our.internmc.facebook.com/intern/diff/D26077709/)

[ghstack-poisoned]
wayi committed Jan 31, 2021
2 parents be1af96 + 95a8a14 commit 04be512
Showing 364 changed files with 11,484 additions and 4,356 deletions.
1 change: 0 additions & 1 deletion .circleci/cimodel/data/simple/android_definitions.py
@@ -79,7 +79,6 @@ def gen_tree(self):
AndroidJob(["x86_64"], "pytorch_linux_build"),
AndroidJob(["arm", "v7a"], "pytorch_linux_build"),
AndroidJob(["arm", "v8a"], "pytorch_linux_build"),
AndroidJob(["vulkan", "x86_32"], "pytorch_linux_build", is_master_only=False),
AndroidGradleJob(
"pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32",
"pytorch_android_gradle_build-x86_32",
8 changes: 1 addition & 7 deletions .circleci/config.yml
@@ -574,7 +574,7 @@ jobs:
hostname
export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=8g --ipc=host --device /dev/kfd --device /dev/dri --group-add video -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
else
export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=1g --ipc=host -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
fi
echo "id=${id}" >> "${BASH_ENV}"
@@ -7103,12 +7103,6 @@ workflows:
name: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v8a_build
requires:
- docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c
- pytorch_linux_build:
build_environment: "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-vulkan-x86_32-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c"
name: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_vulkan_x86_32_build
requires:
- docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c
- pytorch_android_gradle_build-x86_32:
filters:
branches:
31 changes: 27 additions & 4 deletions .circleci/scripts/python_doc_push_script.sh
@@ -42,7 +42,28 @@ fi

echo "install_path: $install_path version: $version"

git clone https://github.com/pytorch/pytorch.github.io -b $branch

build_docs () {
set +e
set -o pipefail
make $1 2>&1 | tee /tmp/docs_build.txt
code=$?
if [ $code -ne 0 ]; then
set +x
echo =========================
grep "WARNING:" /tmp/docs_build.txt
echo =========================
echo Docs build failed. If the failure is not clear, scan back in the log
echo for any WARNINGS or for the line "build finished with problems"
echo "(tried to echo the WARNINGS above the ==== line)"
echo =========================
fi
set -ex
return $code
}


git clone https://github.com/pytorch/pytorch.github.io -b $branch --depth 1
pushd pytorch.github.io

export LC_ALL=C
@@ -57,7 +78,8 @@ pushd docs
# Build the docs
pip -q install -r requirements.txt
if [ "$is_master_doc" = true ]; then
make html
build_docs html
[ $? -eq 0 ] || exit $?
make coverage
# Now we have the coverage report, we need to make sure it is empty.
# Count the number of lines in the file and turn that number into a variable
@@ -78,8 +100,9 @@ if [ "$is_master_doc" = true ]; then
exit 1
fi
else
# Don't fail the build on coverage problems
make html-stable
# skip coverage, format for stable or tags
build_docs html-stable
[ $? -eq 0 ] || exit $?
fi

# Move them into the docs repo
2 changes: 1 addition & 1 deletion .circleci/verbatim-sources/job-specs/pytorch-job-specs.yml
@@ -133,7 +133,7 @@ jobs:
hostname
export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=8g --ipc=host --device /dev/kfd --device /dev/dri --group-add video -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
else
export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=1g --ipc=host -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
fi
echo "id=${id}" >> "${BASH_ENV}"
86 changes: 86 additions & 0 deletions .github/scripts/generate_binary_build_matrix.py
@@ -0,0 +1,86 @@
#!/usr/bin/env python3

"""Generates a matrix to be utilized through github actions
Will output a condensed version of the matrix if on a pull request that only
includes the latest version of python we support built on three different
architectures:
* CPU
* Latest CUDA
* Latest ROCM
"""

import json
import os
import itertools

CUDA_ARCHES = [
"10.1",
"10.2",
"11.0"
]

ROCM_ARCHES = [
"3.10",
"4.0"
]

FULL_ARCHES = [
"cpu",
*CUDA_ARCHES,
*ROCM_ARCHES
]

CONTAINER_IMAGES = {
**{
# TODO: Re-do manylinux CUDA image tagging scheme to be similar to
# ROCM so we don't have to do this replacement
gpu_arch: f"pytorch/manylinux-cuda{gpu_arch.replace('.', '')}"
for gpu_arch in CUDA_ARCHES
},
**{
gpu_arch: f"pytorch/manylinux-rocm:{gpu_arch}"
for gpu_arch in ROCM_ARCHES
},
"cpu": "pytorch/manylinux-cpu"
}

FULL_PYTHON_VERSIONS = [
"3.6",
"3.7",
"3.8",
"3.9",
]


def is_pull_request():
return os.environ.get("GITHUB_HEAD_REF")

def generate_matrix():
python_versions = FULL_PYTHON_VERSIONS
arches = FULL_ARCHES
if is_pull_request():
python_versions = [python_versions[-1]]
arches = ["cpu", CUDA_ARCHES[-1], ROCM_ARCHES[-1]]
matrix = []
for item in itertools.product(python_versions, arches):
python_version, arch_version = item
# Not my favorite code here
gpu_arch_type = "cuda"
if "rocm" in CONTAINER_IMAGES[arch_version]:
gpu_arch_type = "rocm"
elif "cpu" in CONTAINER_IMAGES[arch_version]:
gpu_arch_type = "cpu"
matrix.append({
"python_version": python_version,
"gpu_arch_type": gpu_arch_type,
"gpu_arch_version": arch_version,
"container_image": CONTAINER_IMAGES[arch_version]
})
return json.dumps({"include": matrix})

def main():
print(generate_matrix())

if __name__ == "__main__":
main()
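For illustration only (not part of the diff), a hypothetical local check of the new matrix script, run from the repository root; the condensed pull-request behaviour and the output shape follow from is_pull_request() and generate_matrix() above.

import json
import os
import subprocess
import sys

# Setting GITHUB_HEAD_REF simulates a pull-request run, which condenses the
# matrix to the newest Python version on cpu, the latest CUDA, and latest ROCm.
env = {**os.environ, "GITHUB_HEAD_REF": "my-feature-branch"}  # hypothetical branch
out = subprocess.check_output(
    [sys.executable, ".github/scripts/generate_binary_build_matrix.py"], env=env
)
for entry in json.loads(out)["include"]:
    # e.g. 3.9 cuda 11.0 pytorch/manylinux-cuda110
    print(entry["python_version"], entry["gpu_arch_type"],
          entry["gpu_arch_version"], entry["container_image"])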
118 changes: 118 additions & 0 deletions .github/scripts/generate_pytorch_version.py
@@ -0,0 +1,118 @@
#!/usr/bin/env python3

import argparse
import os
import subprocess
import re

from datetime import datetime
from distutils.util import strtobool
from pathlib import Path

LEADING_V_PATTERN = re.compile("^v")
TRAILING_RC_PATTERN = re.compile("-rc[0-9]*$")
LEGACY_BASE_VERSION_SUFFIX_PATTERN = re.compile("a0$")

class NoGitTagException(Exception):
pass

def get_pytorch_root():
return Path(subprocess.check_output(
['git', 'rev-parse', '--show-toplevel']
).decode('ascii').strip())

def get_tag():
root = get_pytorch_root()
# We're on a tag
am_on_tag = (
subprocess.run(
['git', 'describe', '--tags', '--exact'],
cwd=root,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL
).returncode == 0
)
tag = ""
if am_on_tag:
dirty_tag = subprocess.check_output(
['git', 'describe'],
cwd=root
).decode('ascii').strip()
# Strip leading v that we typically do when we tag branches
# ie: v1.7.1 -> 1.7.1
tag = re.sub(LEADING_V_PATTERN, "", dirty_tag)
# Strip trailing rc pattern
# ie: 1.7.1-rc1 -> 1.7.1
tag = re.sub(TRAILING_RC_PATTERN, "", tag)
return tag

def get_base_version():
root = get_pytorch_root()
dirty_version = open(root / 'version.txt', 'r').read().strip()
# Strips trailing a0 from version.txt, not too sure why it's there in the
# first place
return re.sub(LEGACY_BASE_VERSION_SUFFIX_PATTERN, "", dirty_version)

class PytorchVersion:
def __init__(self, gpu_arch_type, gpu_arch_version, no_build_suffix):
self.gpu_arch_type = gpu_arch_type
self.gpu_arch_version = gpu_arch_version
self.no_build_suffix = no_build_suffix

def get_post_build_suffix(self):
# CUDA 10.2 is the version to be uploaded to PyPI so it doesn't have a
# version suffix
if ((self.gpu_arch_type == "cuda" and self.gpu_arch_version == "10.2")
or self.no_build_suffix):
return ""
if self.gpu_arch_type == "cuda":
return f"+cu{self.gpu_arch_version.replace('.', '')}"
return f"+{self.gpu_arch_type}{self.gpu_arch_version}"

def get_release_version(self):
if not get_tag():
raise NoGitTagException(
"Not on a git tag, are you sure you want a release version?"
)
return f"{get_tag()}{self.get_post_build_suffix()}"

def get_nightly_version(self):
date_str = datetime.today().strftime('%Y%m%d')
build_suffix = self.get_post_build_suffix()
return f"{get_base_version()}.dev{date_str}{build_suffix}"

def main():
parser = argparse.ArgumentParser(
description="Generate pytorch version for binary builds"
)
parser.add_argument(
"--no-build-suffix",
type=strtobool,
help="Whether or not to add a build suffix typically (+cpu)",
default=os.environ.get("NO_BUILD_SUFFIX", False)
)
parser.add_argument(
"--gpu-arch-type",
type=str,
help="GPU arch you are building for, typically (cpu, cuda, rocm)",
default=os.environ.get("GPU_ARCH_TYPE", "cpu")
)
parser.add_argument(
"--gpu-arch-version",
type=str,
help="GPU arch version, typically (10.2, 4.0), leave blank for CPU",
default=os.environ.get("GPU_ARCH_VERSION", "")
)
args = parser.parse_args()
version_obj = PytorchVersion(
args.gpu_arch_type,
args.gpu_arch_version,
args.no_build_suffix
)
try:
print(version_obj.get_release_version())
except NoGitTagException:
print(version_obj.get_nightly_version())

if __name__ == "__main__":
main()
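Again for illustration only, a hypothetical invocation of the version script from the repository root; the nightly fallback mirrors the try/except in main() above.

import subprocess
import sys

cmd = [
    sys.executable, ".github/scripts/generate_pytorch_version.py",
    "--gpu-arch-type", "cuda",
    "--gpu-arch-version", "11.0",
]
# Off a release tag this prints a nightly string such as
# "<version.txt minus trailing a0>.dev<YYYYMMDD>+cu110"; on a tag like v1.7.1
# it would print "1.7.1+cu110" instead.
print(subprocess.check_output(cmd, text=True).strip())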
86 changes: 86 additions & 0 deletions .github/workflows/build_linux_binaries.yml
@@ -0,0 +1,86 @@
name: Build Linux Wheels

on:
# TODO: These are only runnable from workflow_dispatch, we need to eventually add
# a cron
# TODO: Add an on_release trigger to build on tags
workflow_dispatch:

jobs:
generate-build-matrix:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
container:
image: python:3.9
steps:
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
- name: Generating build matrix
id: set-matrix
run: |
# outputting for debugging purposes
python .github/scripts/generate_binary_build_matrix.py
MATRIX=$(python .github/scripts/generate_binary_build_matrix.py)
echo "::set-output name=matrix::${MATRIX}"
build-wheel:
if: ${{ github.repository_owner == 'pytorch' }}
needs: generate-build-matrix
runs-on: linux.2xlarge
strategy:
matrix:
${{ fromJson(needs.generate-build-matrix.outputs.matrix) }}
container:
image: ${{ matrix.container_image }}
env:
DESIRED_PYTHON: ${{ matrix.python_version }}
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: ${{ matrix.gpu_arch_version }}
GPU_ARCH_VERSION: ${{ matrix.GPU_ARCH_VERSION }}
GPU_ARCH_TYPE: ${{ matrix.gpu_arch_type }}
PYTORCH_BUILD_NUMBER: 1
SKIP_ALL_TESTS: 1
steps:
- name: Clone pytorch/pytorch
uses: actions/checkout@v2
with:
path: pytorch
submodules: recursive
- name: Clone pytorch/builder
uses: actions/checkout@v2
with:
repository: pytorch/builder
path: builder
- name: Generate version string
working-directory: pytorch/
run: |
version=$(.github/scripts/generate_pytorch_version.py)
echo "Generated version: ${version}"
echo "PYTORCH_BUILD_VERSION=${version}" >> $GITHUB_ENV
# TODO: Remove this once we remove the need for the directories to be
# in specific locations
- name: Symlink repositories to root directory (for legacy scripts purposes)
run: |
ln -s $(pwd)/pytorch /pytorch
ln -s $(pwd)/builder /builder
# TODO: Bundle the correct build script in the base container image so
# that we don't have to do this type of specification
- name: Build PyTorch binary (CUDA specific)
if: ${{ matrix.gpu_arch_type == 'cuda' }}
run: |
/builder/manywheel/build.sh
- name: Build PyTorch binary (ROCM specific)
if: ${{ matrix.gpu_arch_type == 'rocm' }}
run: |
/builder/manywheel/build_rocm.sh
- name: Build PyTorch binary (CPU specific)
if: ${{ matrix.gpu_arch_type == 'cpu' }}
run: |
/builder/manywheel/build_cpu.sh
- uses: actions/upload-artifact@v2
with:
name: pytorch-wheel-py${{ matrix.python_version }}-${{matrix.gpu_arch_type}}-${{ matrix.gpu_arch_version }}
path: /remote/**/*.whl
# TODO: Add a step here for uploading binaries
