Update on "Fix node provenance tracking"
Before:
```
triton_fused_add_83_add_84_convolution_15_relu_12_relu_13_squeeze_46_var_mean_15_14
```

After:
```
triton_fused_add_83_add_84_relu_13_squeeze_46_var_mean_15_14
```
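
For illustration, here is a minimal sketch of the idea behind the fix (a hypothetical helper, not the actual Inductor code): only origin nodes whose computation is actually emitted by a kernel should contribute to that kernel's name.

```
# Hypothetical sketch of the naming fix, not the actual Inductor implementation.
def fused_kernel_name(origins: set, kernel_origins: set) -> str:
    # Keep only origin FX nodes whose computation lands in this kernel;
    # previously, origins realized in other kernels (convolution_15,
    # relu_12, ...) leaked into the name.
    relevant = origins & kernel_origins
    return "triton_fused_" + "_".join(sorted(node.name for node in relevant))
```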

This change applies to the following kernel (a fused batch-norm-style kernel: per-channel mean and variance, normalization, ReLU, and running-statistics updates):
```
@persistent_reduction(
    size_hints=[512, 64],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]}
)
@triton.jit
def triton_(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, out_ptr3, out_ptr4, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 512
    rnumel = 49
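    # Persistent reduction: the whole 49-element reduction dimension fits in one RBLOCK, so no reduction loop is needed.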
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (r1 + (49*x0)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp22 = tl.load(in_ptr2 + (x0), xmask)
    tmp24 = tl.load(in_ptr3 + (x0), xmask)
    tmp30 = tl.load(in_ptr4 + (x0), xmask)
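    # Mean over the reduction dimension (49 elements per row).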
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 49.0
    tmp5 = tmp3 / tmp4
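    # Running-mean-style exponential update with momentum 0.1.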
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
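    # Biased batch variance: mean of squared deviations from the mean.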
    tmp12 = tmp0 - tmp5
    tmp13 = tmp12 * tmp12
    tmp15 = tl.where(rmask & xmask, tmp13, 0)
    tmp16 = tl.sum(tmp15, 1)[:, None]
    tmp17 = tmp16 / tmp4
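    # Normalize with rsqrt(var + eps), then scale (in_ptr2) and shift (in_ptr3).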
    tmp18 = 1e-05
    tmp19 = tmp17 + tmp18
    tmp20 = tl.libdevice.rsqrt(tmp19)
    tmp21 = tmp12 * tmp20
    tmp23 = tmp21 * tmp22
    tmp25 = tmp23 + tmp24
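    # ReLU on the normalized output.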
    tmp26 = tl.where(0 != 0, 0, tl.where(0 > tmp25, 0, tmp25))
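    # Unbiased variance (factor 49/48) for the running-variance update.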
    tmp27 = 1.0208333333333333
    tmp28 = tmp17 * tmp27
    tmp29 = tmp28 * tmp6
    tmp31 = tmp30 * tmp9
    tmp32 = tmp29 + tmp31
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)
    tl.store(out_ptr2 + (r1 + (49*x0) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp26, rmask & xmask)
    tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
    tl.store(out_ptr4 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp32, xmask)
```

Tbh this still isn't super great provenance tracking, since ops like layer norms are decomposed into primitive ops before fusion, so the fused kernel name only reflects the post-decomposition nodes. I might add some extra provenance tracking during decompositions.
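
One possible shape for that, sketched below (`source_op` is a made-up meta key for illustration, not an existing Inductor field): after a decomposition runs, stamp each resulting FX node with the name of the op it came from, so downstream naming can surface the original op.

```
# Hypothetical sketch of provenance through decompositions.
import torch.fx as fx

def stamp_decomposition(original: fx.Node, decomposed: list[fx.Node]) -> None:
    # fx.Node.meta is a free-form dict; "source_op" is invented for this sketch.
    for node in decomposed:
        node.meta.setdefault("source_op", original.name)
```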



cc soumith voznesenskym yanboliang penguinwu anijain2305 EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng Xia-Weiwen wenzhe-nrv jiayisunx peterbell10 desertfire

[ghstack-poisoned]
Chillee committed Mar 5, 2023
2 parents 6f3c63d + 192ac60 commit d26cdcf
Showing 561 changed files with 21,981 additions and 7,392 deletions.
2 changes: 1 addition & 1 deletion .ci/docker/android/build.gradle
@@ -53,7 +53,7 @@ dependencies {
implementation 'androidx.appcompat:appcompat:1.0.0'
implementation 'com.facebook.fbjni:fbjni-java-only:0.2.2'
implementation 'com.google.code.findbugs:jsr305:3.0.1'
-implementation 'com.facebook.soloader:nativeloader:0.10.4'
+implementation 'com.facebook.soloader:nativeloader:0.10.5'

implementation 'junit:junit:' + rootProject.junitVersion
implementation 'androidx.test:core:' + rootProject.coreVersion
12 changes: 12 additions & 0 deletions .ci/docker/build.sh
@@ -100,6 +100,7 @@ case "$image" in
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
+TRITON=yes
;;
pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7)
CUDA_VERSION=11.7.0
@@ -113,6 +114,7 @@ case "$image" in
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
+TRITON=yes
;;
pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7)
CUDA_VERSION=11.8.0
@@ -126,6 +128,7 @@ case "$image" in
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
+TRITON=yes
;;
pytorch-linux-focal-py3-clang7-asan)
ANACONDA_PYTHON_VERSION=3.9
@@ -134,6 +137,7 @@ case "$image" in
DB=yes
VISION=yes
CONDA_CMAKE=yes
+TRITON=yes
;;
pytorch-linux-focal-py3-clang10-onnx)
ANACONDA_PYTHON_VERSION=3.8
@@ -162,6 +166,7 @@ case "$image" in
VULKAN_SDK_VERSION=1.2.162.1
SWIFTSHADER=yes
CONDA_CMAKE=yes
+TRITON=yes
;;
pytorch-linux-bionic-py3.11-clang9)
ANACONDA_PYTHON_VERSION=3.11
@@ -172,6 +177,7 @@ case "$image" in
VULKAN_SDK_VERSION=1.2.162.1
SWIFTSHADER=yes
CONDA_CMAKE=yes
+TRITON=yes
;;
pytorch-linux-bionic-py3.8-gcc9)
ANACONDA_PYTHON_VERSION=3.8
@@ -180,6 +186,7 @@ case "$image" in
DB=yes
VISION=yes
CONDA_CMAKE=yes
+TRITON=yes
;;
pytorch-linux-focal-rocm-n-1-py3)
ANACONDA_PYTHON_VERSION=3.8
@@ -209,6 +216,7 @@ case "$image" in
VISION=yes
KATEX=yes
CONDA_CMAKE=yes
+TRITON=yes
;;
pytorch-linux-jammy-cuda11.6-cudnn8-py3.8-clang12)
ANACONDA_PYTHON_VERSION=3.8
@@ -218,6 +226,7 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
+TRITON=yes
;;
pytorch-linux-jammy-cuda11.7-cudnn8-py3.8-clang12)
ANACONDA_PYTHON_VERSION=3.8
@@ -227,6 +236,7 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
+TRITON=yes
;;
pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12)
ANACONDA_PYTHON_VERSION=3.8
@@ -236,6 +246,7 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
+TRITON=yes
;;
pytorch-linux-focal-linter)
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
@@ -328,6 +339,7 @@ docker build \
--build-arg "UCX_COMMIT=${UCX_COMMIT}" \
--build-arg "UCC_COMMIT=${UCC_COMMIT}" \
--build-arg "CONDA_CMAKE=${CONDA_CMAKE}" \
--build-arg "TRITON=${TRITON}" \
-f $(dirname ${DOCKERFILE})/Dockerfile \
-t "$tmp_tag" \
"$@" \
1 change: 1 addition & 0 deletions .ci/docker/ci_commit_pins/triton.txt
@@ -0,0 +1 @@
b8b470bc597c1c5bd03682c09fe3e6b7c53787fd
6 changes: 5 additions & 1 deletion .ci/docker/common/common_utils.sh
@@ -13,7 +13,7 @@ as_jenkins() {
# NB: Pass on PATH and LD_LIBRARY_PATH to sudo invocation
# NB: This must be run from a directory that jenkins has access to,
# works around https://github.com/conda/conda-package-handling/pull/34
-$SUDO -H -u jenkins env -u SUDO_UID -u SUDO_GID -u SUDO_COMMAND -u SUDO_USER env "PATH=$PATH" "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" $*
+$SUDO -E -H -u jenkins env -u SUDO_UID -u SUDO_GID -u SUDO_COMMAND -u SUDO_USER env "PATH=$PATH" "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" $*
}

conda_install() {
@@ -30,3 +30,7 @@ conda_run() {
pip_install() {
as_jenkins conda run -n py_$ANACONDA_PYTHON_VERSION pip install --progress-bar off $*
}

+get_pinned_commit() {
+  cat "${1}".txt
+}
54 changes: 54 additions & 0 deletions .ci/docker/common/install_triton.sh
@@ -0,0 +1,54 @@
#!/bin/bash

set -ex

source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"

get_conda_version() {
as_jenkins conda list -n py_$ANACONDA_PYTHON_VERSION | grep -w $* | head -n 1 | awk '{print $2}'
}

conda_reinstall() {
as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y --force-reinstall $*
}

# The logic here is copied from .ci/pytorch/common_utils.sh
TRITON_PINNED_COMMIT=$(get_pinned_commit triton)

apt update
apt-get install -y gpg-agent

if [ -n "${CONDA_CMAKE}" ]; then
# Keep the current cmake and numpy version here, so we can reinstall them later
CMAKE_VERSION=$(get_conda_version cmake)
NUMPY_VERSION=$(get_conda_version numpy)
fi

if [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}" == "7" ]]; then
# Triton needs at least gcc-9 to build
apt-get install -y g++-9

CXX=g++-9 pip_install "git+https://github.com/openai/triton@${TRITON_PINNED_COMMIT}#subdirectory=python"
elif [ -n "${CLANG_VERSION}" ]; then
# Triton needs <filesystem> which surprisingly is not available with clang-9 toolchain
add-apt-repository -y ppa:ubuntu-toolchain-r/test
apt-get install -y g++-9

CXX=g++-9 pip_install "git+https://github.com/openai/triton@${TRITON_PINNED_COMMIT}#subdirectory=python"
else
pip_install "git+https://github.com/openai/triton@${TRITON_PINNED_COMMIT}#subdirectory=python"
fi

if [ -n "${CONDA_CMAKE}" ]; then
# TODO: This is to make sure that the same cmake and numpy version from install conda
# script is used. Without this step, the newer cmake version (3.25.2) downloaded by
# triton build step via pip will fail to detect conda MKL. Once that issue is fixed,
# this can be removed.
#
# The correct numpy version also needs to be set here because conda claims that it
# causes inconsistent environment. Without this, conda will attempt to install the
# latest numpy version, which fails ASAN tests with the following import error: Numba
# needs NumPy 1.20 or less.
conda_reinstall cmake="${CMAKE_VERSION}"
conda_reinstall numpy="${NUMPY_VERSION}"
fi
5 changes: 5 additions & 0 deletions .ci/docker/requirements-ci.txt
@@ -258,3 +258,8 @@ ghstack==0.7.1
#Description: ghstack tool
#Pinned versions: 0.7.1
#test that import:

+jinja2==3.1.2
+#Description: jinja2 template engine
+#Pinned versions: 3.1.2
+#test that import:
9 changes: 9 additions & 0 deletions .ci/docker/ubuntu-cuda/Dockerfile
@@ -85,6 +85,15 @@ COPY ./common/install_cmake.sh install_cmake.sh
RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
RUN rm install_cmake.sh

+ARG TRITON
+# Install triton, this needs to be done before sccache because the latter will
+# try to reach out to S3, which docker build runners don't have access
+COPY ./common/install_triton.sh install_triton.sh
+COPY ./common/common_utils.sh common_utils.sh
+COPY ci_commit_pins/triton.txt triton.txt
+RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
+RUN rm install_triton.sh common_utils.sh triton.txt

# Install ccache/sccache (do this last, so we get priority in PATH)
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH
9 changes: 9 additions & 0 deletions .ci/docker/ubuntu/Dockerfile
@@ -134,6 +134,15 @@ ENV OPENSSL_ROOT_DIR /opt/openssl
ENV OPENSSL_DIR /opt/openssl
RUN rm install_openssl.sh

+ARG TRITON
+# Install triton, this needs to be done before sccache because the latter will
+# try to reach out to S3, which docker build runners don't have access
+COPY ./common/install_triton.sh install_triton.sh
+COPY ./common/common_utils.sh common_utils.sh
+COPY ci_commit_pins/triton.txt triton.txt
+RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
+RUN rm install_triton.sh common_utils.sh triton.txt

# Install ccache/sccache (do this last, so we get priority in PATH)
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH
11 changes: 7 additions & 4 deletions .ci/pytorch/build.sh
@@ -191,16 +191,19 @@ if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then
set -e

get_bazel
+install_sccache_nvcc_for_bazel

# Leave 1 CPU free and use only up to 80% of memory to reduce the chance of crashing
# the runner
BAZEL_MEM_LIMIT="--local_ram_resources=HOST_RAM*.8"
BAZEL_CPU_LIMIT="--local_cpu_resources=HOST_CPUS-1"

-tools/bazel build --config=no-tty "${BAZEL_MEM_LIMIT}" "${BAZEL_CPU_LIMIT}" //...
-# Build torch, the Python module, and tests for CPU-only
-tools/bazel build --config=no-tty "${BAZEL_MEM_LIMIT}" "${BAZEL_CPU_LIMIT}" --config=cpu-only :torch :_C.so :all_tests

+if [[ "$CUDA_VERSION" == "cpu" ]]; then
+  # Build torch, the Python module, and tests for CPU-only
+  tools/bazel build --config=no-tty "${BAZEL_MEM_LIMIT}" "${BAZEL_CPU_LIMIT}" --config=cpu-only :torch :_C.so :all_tests
+else
+  tools/bazel build --config=no-tty "${BAZEL_MEM_LIMIT}" "${BAZEL_CPU_LIMIT}" //...
+fi
else
# check that setup.py would fail with bad arguments
echo "The next three invocations are expected to fail with invalid command error messages."
46 changes: 20 additions & 26 deletions .ci/pytorch/common_utils.sh
@@ -95,6 +95,26 @@ function get_bazel() {
chmod +x tools/bazel
}

+# This function is bazel specific because of the bug
+# in the bazel that requires some special paths massaging
+# as a workaround. See
+# https://github.com/bazelbuild/bazel/issues/10167
+function install_sccache_nvcc_for_bazel() {
+  sudo mv /usr/local/cuda/bin/nvcc /usr/local/cuda/bin/nvcc-real
+
+  # Write the `/usr/local/cuda/bin/nvcc`
+  cat << EOF | sudo tee /usr/local/cuda/bin/nvcc
+#!/bin/sh
+if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
+  exec sccache /usr/local/cuda/bin/nvcc "\$@"
+else
+  exec external/local_cuda/cuda/bin/nvcc-real "\$@"
+fi
+EOF
+
+  sudo chmod +x /usr/local/cuda/bin/nvcc
+}

function install_monkeytype {
# Install MonkeyType
pip_install MonkeyType
@@ -129,32 +149,6 @@ function clone_pytorch_xla() {
fi
}

-function install_filelock() {
-  pip_install filelock
-}
-
-function install_triton() {
-  local commit
-  if [[ "${TEST_CONFIG}" == *rocm* ]]; then
-    echo "skipping triton due to rocm"
-  else
-    commit=$(get_pinned_commit triton)
-    if [[ "${BUILD_ENVIRONMENT}" == *gcc7* ]]; then
-      # Trition needs gcc-9 to build
-      sudo apt-get install -y g++-9
-      CXX=g++-9 pip_install --user "git+https://github.com/openai/triton@${commit}#subdirectory=python"
-    elif [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then
-      # Trition needs <filesystem> which surprisingly is not available with clang-9 toolchain
-      sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
-      sudo apt-get install -y g++-9
-      CXX=g++-9 pip_install --user "git+https://github.com/openai/triton@${commit}#subdirectory=python"
-    else
-      pip_install --user "git+https://github.com/openai/triton@${commit}#subdirectory=python"
-    fi
-    pip_install --user jinja2
-  fi
-}

function setup_torchdeploy_deps(){
conda install -y -n "py_${ANACONDA_PYTHON_VERSION}" "libpython-static=${ANACONDA_PYTHON_VERSION}"
local CC
16 changes: 15 additions & 1 deletion .ci/pytorch/macos-build.sh
@@ -40,6 +40,16 @@ cross_compile_arm64() {
USE_DISTRIBUTED=0 CMAKE_OSX_ARCHITECTURES=arm64 MACOSX_DEPLOYMENT_TARGET=11.0 USE_MKLDNN=OFF USE_QNNPACK=OFF WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
}

+compile_arm64() {
+  # Compilation for arm64
+  # TODO: Compile with OpenMP support (but this causes CI regressions as cross-compilation were done with OpenMP disabled)
+  USE_DISTRIBUTED=0 USE_OPENMP=0 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
+}
+
+compile_x86_64() {
+  USE_DISTRIBUTED=0 WERROR=1 python setup.py bdist_wheel
+}

compile_x86_64() {
USE_DISTRIBUTED=0 WERROR=1 python setup.py bdist_wheel
}
@@ -63,7 +73,11 @@ build_lite_interpreter() {
}

if [[ ${BUILD_ENVIRONMENT} = *arm64* ]]; then
-cross_compile_arm64
+if [[ $(uname -m) == "arm64" ]]; then
+  compile_arm64
+else
+  cross_compile_arm64
+fi
elif [[ ${BUILD_ENVIRONMENT} = *lite-interpreter* ]]; then
export BUILD_LITE_INTERPRETER=1
build_lite_interpreter
1 change: 0 additions & 1 deletion .ci/pytorch/multigpu-test.sh
@@ -42,7 +42,6 @@ time python test/run_test.py --verbose -i distributed/_shard/sharded_tensor/ops/
time python test/run_test.py --verbose -i distributed/_shard/sharded_tensor/ops/test_softmax
time python test/run_test.py --verbose -i distributed/_shard/sharded_optim/test_sharded_optim
time python test/run_test.py --verbose -i distributed/_shard/test_partial_tensor
-time python test/run_test.py --verbose -i distributed/_shard/test_replicated_tensor
# Other tests
time python test/run_test.py --verbose -i test_cuda_primary_ctx
time python test/run_test.py --verbose -i test_optim -- -k optimizers_with_varying_tensors