
Update on "Allow more inserts before reIndexTopology"
Summary:
Currently, if you insert into the JIT IR repeatedly at the same point in the middle of a graph,
only about 40 inserts are possible before the graph has to reindex. Repeated reindexing is O(N^2)
behavior, which can lead to slow load times. This change tracks how many insertions happen
at a single point (as when a function is being inlined) to predict how many future insertions will happen
there, and then adjusts how topology values are assigned so there is enough room for the predicted insertions.
In practice this allows around 2M inserts at a single point before a reindex is needed.
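The scheme above can be sketched in a few lines. This is an illustrative model, not PyTorch's actual implementation: `TopoOrder`, `SPACING`, `STEP`, and `inserts_before_first_reindex` are invented names, and the constants are assumptions chosen to reproduce the "about 40 inserts" behavior described in the summary. Plain bisection of the gap between two positions supports only about log2(gap) inserts at one point before reindexing; reserving a fixed predicted step per insert raises that to roughly gap/step inserts.

```python
SPACING = 1 << 40   # gap left between consecutive nodes after (re)indexing
STEP    = 1 << 20   # room reserved per insert once a run of inserts is predicted

class TopoOrder:
    """Integer positions give O(1) "is a before b?" checks; inserting a node
    picks a position inside the gap, falling back to an O(n) reindex when
    the gap is exhausted."""

    def __init__(self, n, predictive=False):
        self.pos = [i * SPACING for i in range(n)]
        self.predictive = predictive
        self.reindexes = 0

    def _reindex(self):
        # O(n): re-space every node evenly; this is the cost we want to avoid
        self.pos = [i * SPACING for i in range(len(self.pos))]
        self.reindexes += 1

    def insert_after(self, i):
        lo = self.pos[i]
        hi = self.pos[i + 1] if i + 1 < len(self.pos) else lo + 2 * SPACING
        if hi - lo < 2:              # no integer strictly between lo and hi
            self._reindex()
            return self.insert_after(i)
        if self.predictive:
            # leave most of the gap free for the predicted future inserts
            new = min(lo + STEP, (lo + hi) // 2)
        else:
            new = (lo + hi) // 2     # plain bisection: ~log2(SPACING) inserts
        self.pos.insert(i + 1, new)
        return i + 1

def inserts_before_first_reindex(order, limit=5000):
    """Simulate inlining: keep inserting right after the node just added."""
    at = 0
    for k in range(limit):
        at = order.insert_after(at)
        if order.reindexes:
            return k + 1
    return limit
```

With bisection the gap halves on every insert, so a 2^40 gap is exhausted after roughly 40 inserts at one point; with the predictive placement the same gap supports about 2^20 inserts, on the order of the ~2M claimed above.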

Test Plan: test_jit.py

Differential Revision: [D46206617](https://our.internmc.facebook.com/intern/diff/D46206617)

[ghstack-poisoned]
zdevito committed May 31, 2023
2 parents dffa7a2 + 86c4d16 commit 91ce1e3
Showing 316 changed files with 9,556 additions and 5,162 deletions.
18 changes: 16 additions & 2 deletions .ci/docker/build.sh
@@ -88,11 +88,25 @@ _UCC_COMMIT=7cb07a76ccedad7e56ceb136b865eb9319c258ea
# configuration, so we hardcode everything here rather than do it
# from scratch
case "$image" in
pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc7)
pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9)
CUDA_VERSION=12.1.0
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=7
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc9)
CUDA_VERSION=11.8.0
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
2 changes: 1 addition & 1 deletion .ci/docker/ci_commit_pins/triton.txt
@@ -1 +1 @@
7d1a95b04654ff9c216afe08a454ad0822f05370
9820899b3845e461d9031dba66062efade65d420
4 changes: 2 additions & 2 deletions .ci/docker/common/install_onnx.sh
@@ -16,15 +16,15 @@ pip_install \
onnx==1.14.0

pip_install \
onnxruntime==1.14.0 \
onnxruntime==1.15.0 \
parameterized==0.8.1 \
pytest-cov==4.0.0 \
pytest-subtests==0.10.0 \
tabulate==0.9.0 \
transformers==4.25.1

# TODO: change this when onnx-script is on testPypi
pip_install "onnxscript@git+https://github.com/microsoft/onnxscript@bf502680231e4b134a71f74e812c84ddd7efffbe"
pip_install "onnxscript@git+https://github.com/microsoft/onnxscript@68adea42fb9b7353148e7ab289b76f9b89890e1c"

# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
34 changes: 34 additions & 0 deletions .devcontainer/Dockerfile
@@ -0,0 +1,34 @@
FROM mcr.microsoft.com/vscode/devcontainers/miniconda:0-3

# I am surprised this is needed
RUN conda init

# Copy environment.yml (if found) to a temp location so we update the environment. Also
# copy "noop.txt" so the COPY instruction does not fail if no environment.yml exists.
COPY .devcontainer/cuda/environment.yml .devcontainer/noop.txt /tmp/conda-tmp/
RUN if [ -f "/tmp/conda-tmp/environment.yml" ]; then umask 0002 && /opt/conda/bin/conda env update -n base -f /tmp/conda-tmp/environment.yml; fi \
&& sudo rm -rf /tmp/conda-tmp

# Tools needed for llvm
RUN sudo apt-get -y update
RUN sudo apt install -y lsb-release wget software-properties-common gnupg

# Install CLANG if version is specified
ARG CLANG_VERSION
RUN if [ -n "$CLANG_VERSION" ]; then \
sudo wget https://apt.llvm.org/llvm.sh; \
chmod +x llvm.sh; \
sudo ./llvm.sh "${CLANG_VERSION}"; \
echo 'export CC=clang' >> ~/.bashrc; \
echo 'export CXX=clang++' >> ~/.bashrc; \
sudo apt update; \
sudo apt install -y clang; \
sudo apt install -y libomp-dev; \
fi


# Install cuda if version is specified
ARG CUDA_VERSION
RUN if [ -n "$CUDA_VERSION" ]; then \
conda install cuda -c "nvidia/label/cuda-${CUDA_VERSION}"; \
fi
37 changes: 37 additions & 0 deletions .devcontainer/cpu/devcontainer.json
@@ -0,0 +1,37 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/anaconda
{
"name": "PyTorch - CPU",
"build": {
"context": "../..",
"dockerfile": "../Dockerfile",
"args": {
"USERNAME": "vscode",
"BUILDKIT_INLINE_CACHE": "0",
"CLANG_VERSION": ""
}
},

// Features to add to the dev container. More info: https://containers.dev/features.
"features": {
// This is needed for lintrunner
"ghcr.io/devcontainers/features/rust:1" : {}
},

// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],

// Use 'postCreateCommand' to run commands after the container is created.
"postCreateCommand": "bash .devcontainer/scripts/install-dev-tools.sh",

// Configure tool-specific properties.
// "customizations": {},
"customizations": {
"vscode": {
"extensions": ["streetsidesoftware.code-spell-checker"]
}
}

// Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
// "remoteUser": "root"
}
6 changes: 6 additions & 0 deletions .devcontainer/cpu/environment.yml
@@ -0,0 +1,6 @@
# This environment is specific to Debian
name: PyTorch
dependencies:
- cmake
- ninja
- libopenblas
37 changes: 37 additions & 0 deletions .devcontainer/cuda/devcontainer.json
@@ -0,0 +1,37 @@
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/anaconda
{
"name": "PyTorch - CUDA",
"build": {
"context": "../..",
"dockerfile": "../Dockerfile",
"args": {
"USERNAME": "vscode",
"BUILDKIT_INLINE_CACHE": "0",
"CUDA_VERSION": "11.8.0",
"CLANG_VERSION": ""
}
},
"runArgs": ["--gpus", "all"],
// Use 'forwardPorts' to make a list of ports inside the container available locally.
// "forwardPorts": [],

// Use 'postCreateCommand' to run commands after the container is created.
"postCreateCommand": "bash .devcontainer/scripts/install-dev-tools.sh",

// Configure tool-specific properties.
// "customizations": {},
"customizations": {
"vscode": {
"extensions": ["streetsidesoftware.code-spell-checker"]
}
},

// Features to add to the dev container. More info: https://containers.dev/features.
"features": {
// This is needed for lintrunner
"ghcr.io/devcontainers/features/rust:1" : {}
}
// Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
// "remoteUser": "root"
}
6 changes: 6 additions & 0 deletions .devcontainer/cuda/environment.yml
@@ -0,0 +1,6 @@
# This environment is specific to Debian
name: PyTorch
dependencies:
- cmake
- ninja
- libopenblas
3 changes: 3 additions & 0 deletions .devcontainer/noop.txt
@@ -0,0 +1,3 @@
This file is copied into the container along with environment.yml* from the parent
folder. It is included to prevent the Dockerfile COPY instruction from
failing if no environment.yml is found.
11 changes: 11 additions & 0 deletions .devcontainer/scripts/install-dev-tools.sh
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
# Run this command from the PyTorch directory after cloning the source code using the "Get the PyTorch Source" section below
pip install -r requirements.txt
git submodule sync
git submodule update --init --recursive

# This takes some time
make setup_lint

# Add CMAKE_PREFIX_PATH to bashrc
echo 'export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}' >> ~/.bashrc
38 changes: 38 additions & 0 deletions .devcontainer/scripts/update_alternatives_clang.sh
@@ -0,0 +1,38 @@
#!/usr/bin/env bash
# update_alternatives_clang.sh
# chmod u+x update_alternatives_clang.sh
#

update_alternatives() {
local version=${1}
local priority=${2}
local z=${3}
local slaves=${4}
local path=${5}
local cmdln

cmdln="--verbose --install ${path}${master} ${master} ${path}${master}-${version} ${priority}"
for slave in ${slaves}; do
cmdln="${cmdln} --slave ${path}${slave} ${slave} ${path}${slave}-${version}"
done
sudo update-alternatives ${cmdln}
}

if [[ ${#} -ne 2 ]]; then
echo usage: "${0}" clang_version priority
exit 1
fi

version=${1}
priority=${2}
path="/usr/bin/"

master="llvm-config"
slaves="llvm-addr2line llvm-ar llvm-as llvm-bcanalyzer llvm-bitcode-strip llvm-cat llvm-cfi-verify llvm-cov llvm-c-test llvm-cvtres llvm-cxxdump llvm-cxxfilt llvm-cxxmap llvm-debuginfod llvm-debuginfod-find llvm-diff llvm-dis llvm-dlltool llvm-dwarfdump llvm-dwarfutil llvm-dwp llvm-exegesis llvm-extract llvm-gsymutil llvm-ifs llvm-install-name-tool llvm-jitlink llvm-jitlink-executor llvm-lib llvm-libtool-darwin llvm-link llvm-lipo llvm-lto llvm-lto2 llvm-mc llvm-mca llvm-ml llvm-modextract llvm-mt llvm-nm llvm-objcopy llvm-objdump llvm-omp-device-info llvm-opt-report llvm-otool llvm-pdbutil llvm-PerfectShuffle llvm-profdata llvm-profgen llvm-ranlib llvm-rc llvm-readelf llvm-readobj llvm-reduce llvm-remark-size-diff llvm-rtdyld llvm-sim llvm-size llvm-split llvm-stress llvm-strings llvm-strip llvm-symbolizer llvm-tapi-diff llvm-tblgen llvm-tli-checker llvm-undname llvm-windres llvm-xray"

update_alternatives "${version}" "${priority}" "${master}" "${slaves}" "${path}"

master="clang"
slaves="analyze-build asan_symbolize bugpoint c-index-test clang++ clang-apply-replacements clang-change-namespace clang-check clang-cl clang-cpp clangd clang-doc clang-extdef-mapping clang-format clang-format-diff clang-include-fixer clang-linker-wrapper clang-move clang-nvlink-wrapper clang-offload-bundler clang-offload-packager clang-offload-wrapper clang-pseudo clang-query clang-refactor clang-rename clang-reorder-fields clang-repl clang-scan-deps clang-tidy count diagtool dsymutil FileCheck find-all-symbols git-clang-format hmaptool hwasan_symbolize intercept-build ld64.lld ld.lld llc lld lldb lldb-argdumper lldb-instr lldb-server lldb-vscode lld-link lli lli-child-target modularize not obj2yaml opt pp-trace run-clang-tidy sancov sanstats scan-build scan-build-py scan-view split-file UnicodeNameMappingGenerator verify-uselistorder wasm-ld yaml2obj yaml-bench"

update_alternatives "${version}" "${priority}" "${master}" "${slaves}" "${path}"
2 changes: 1 addition & 1 deletion .github/ci_commit_pins/vision.txt
@@ -1 +1 @@
4125d3a02b15faf4b19767a91797320151ce8bc6
01b9faa16cfeacbb70aa33bd18534de50891786b
2 changes: 1 addition & 1 deletion .github/requirements-gha-cache.txt
@@ -11,6 +11,6 @@ lintrunner==0.10.7
ninja==1.10.0.post1
nvidia-ml-py==11.525.84
pyyaml==6.0
requests==2.26
requests==2.31.0
rich==10.9.0
rockset==1.0.3
2 changes: 1 addition & 1 deletion .github/scripts/filter_test_configs.py
@@ -387,7 +387,7 @@ def main() -> None:
filtered_test_matrix = test_matrix

if args.event_name == "schedule" and args.schedule == "29 8 * * *":
# we don't want to run the mem leack check or disabled tests on normal
# we don't want to run the mem leak check or disabled tests on normal
# periodically scheduled jobs, only the ones at this time
filtered_test_matrix = set_periodic_modes(filtered_test_matrix, args.job_name)

3 changes: 2 additions & 1 deletion .github/workflows/docker-builds.yml
@@ -33,7 +33,8 @@ jobs:
fail-fast: false
matrix:
include:
- docker-image-name: pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc7
- docker-image-name: pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9
- docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc9
- docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7
- docker-image-name: pytorch-linux-bionic-py3.8-clang9
- docker-image-name: pytorch-linux-bionic-py3.11-clang9
20 changes: 10 additions & 10 deletions .github/workflows/inductor-periodic.yml
@@ -15,12 +15,12 @@ concurrency:
cancel-in-progress: true

jobs:
linux-bionic-cuda12_1-py3_10-gcc7-periodic-dynamo-benchmarks-build:
name: cuda12.1-py3.10-gcc7-sm86-periodic-dynamo-benchmarks
linux-bionic-cuda12_1-py3_10-gcc9-periodic-dynamo-benchmarks-build:
name: cuda12.1-py3.10-gcc9-sm86-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-bionic-cuda12.1-py3.10-gcc7-sm86
docker-image-name: pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc7
build-environment: linux-bionic-cuda12.1-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
@@ -38,11 +38,11 @@ jobs:
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
linux-bionic-cuda12_1-py3_10-gcc7-periodic-dynamo-benchmarks-test:
name: cuda12.1-py3.10-gcc7-sm86-periodic-dynamo-benchmarks
linux-bionic-cuda12_1-py3_10-gcc9-periodic-dynamo-benchmarks-test:
name: cuda12.1-py3.10-gcc9-sm86-periodic-dynamo-benchmarks
uses: ./.github/workflows/_linux-test.yml
needs: linux-bionic-cuda12_1-py3_10-gcc7-periodic-dynamo-benchmarks-build
needs: linux-bionic-cuda12_1-py3_10-gcc9-periodic-dynamo-benchmarks-build
with:
build-environment: linux-bionic-cuda12.1-py3.10-gcc7-sm86
docker-image: ${{ needs.linux-bionic-cuda12_1-py3_10-gcc7-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-bionic-cuda12_1-py3_10-gcc7-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
build-environment: linux-bionic-cuda12.1-py3.10-gcc9-sm86
docker-image: ${{ needs.linux-bionic-cuda12_1-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-bionic-cuda12_1-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
49 changes: 25 additions & 24 deletions .github/workflows/slow.yml
@@ -17,51 +17,52 @@ concurrency:
cancel-in-progress: true

jobs:
linux-bionic-cuda12_1-py3-gcc7-slow-gradcheck-build:
name: linux-bionic-cuda12.1-py3-gcc7-slow-gradcheck
linux-bionic-cuda12_1-py3-gcc9-slow-gradcheck-build:
name: linux-bionic-cuda12.1-py3-gcc9-slow-gradcheck
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-bionic-cuda12.1-py3-gcc7-slow-gradcheck
docker-image-name: pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc7
build-environment: linux-bionic-cuda12.1-py3-gcc9-slow-gradcheck
docker-image-name: pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9
cuda-arch-list: 8.6
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 1, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
linux-bionic-cuda12_1-py3-gcc7-slow-gradcheck-test:
name: linux-bionic-cuda12.1-py3-gcc7-slow-gradcheck
linux-bionic-cuda12_1-py3-gcc9-slow-gradcheck-test:
name: linux-bionic-cuda12.1-py3-gcc9-slow-gradcheck
uses: ./.github/workflows/_linux-test.yml
needs: linux-bionic-cuda12_1-py3-gcc7-slow-gradcheck-build
needs: linux-bionic-cuda12_1-py3-gcc9-slow-gradcheck-build
with:
build-environment: linux-bionic-cuda12.1-py3-gcc7-slow-gradcheck
docker-image: ${{ needs.linux-bionic-cuda12_1-py3-gcc7-slow-gradcheck-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-bionic-cuda12_1-py3-gcc7-slow-gradcheck-build.outputs.test-matrix }}
build-environment: linux-bionic-cuda12.1-py3-gcc9-slow-gradcheck
docker-image: ${{ needs.linux-bionic-cuda12_1-py3-gcc9-slow-gradcheck-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-bionic-cuda12_1-py3-gcc9-slow-gradcheck-build.outputs.test-matrix }}
timeout-minutes: 300

linux-bionic-cuda12_1-py3_10-gcc7-sm86-build:
name: linux-bionic-cuda12.1-py3.10-gcc7-sm86
linux-bionic-cuda12_1-py3_10-gcc9-sm86-build:
name: linux-bionic-cuda12.1-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-bionic-cuda12.1-py3.10-gcc7-sm86
docker-image-name: pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc7
build-environment: linux-bionic-cuda12.1-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9
cuda-arch-list: 8.6
test-matrix: |
{ include: [
{ config: "slow", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "slow", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
linux-bionic-cuda12_1-py3_10-gcc7-sm86-test:
name: linux-bionic-cuda12.1-py3.10-gcc7-sm86
linux-bionic-cuda12_1-py3_10-gcc9-sm86-test:
name: linux-bionic-cuda12.1-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-bionic-cuda12_1-py3_10-gcc7-sm86-build
needs: linux-bionic-cuda12_1-py3_10-gcc9-sm86-build
with:
build-environment: linux-bionic-cuda12.1-py3.10-gcc7-sm86
docker-image: ${{ needs.linux-bionic-cuda12_1-py3_10-gcc7-sm86-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-bionic-cuda12_1-py3_10-gcc7-sm86-build.outputs.test-matrix }}
build-environment: linux-bionic-cuda12.1-py3.10-gcc9-sm86
docker-image: ${{ needs.linux-bionic-cuda12_1-py3_10-gcc9-sm86-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-bionic-cuda12_1-py3_10-gcc9-sm86-build.outputs.test-matrix }}

linux-bionic-py3_8-clang9-build:
name: linux-bionic-py3.8-clang9
8 changes: 4 additions & 4 deletions .github/workflows/trunk.yml
@@ -51,12 +51,12 @@ jobs:
docker-image: ${{ needs.linux-bionic-cuda11_8-py3_10-gcc7-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-bionic-cuda11_8-py3_10-gcc7-build.outputs.test-matrix }}

libtorch-linux-bionic-cuda11_8-py3_7-gcc7-build:
name: libtorch-linux-bionic-cuda11.8-py3.7-gcc7
libtorch-linux-bionic-cuda11_8-py3_7-gcc9-build:
name: libtorch-linux-bionic-cuda11.8-py3.7-gcc9
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: libtorch-linux-bionic-cuda11.8-py3.7-gcc7
docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7
build-environment: libtorch-linux-bionic-cuda11.8-py3.7-gcc9
docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc9
build-generates-artifacts: false
runner: linux.4xlarge
test-matrix: |
