Update on "Fix node provenance tracking"
Before:
```
triton_fused_add_83_add_84_convolution_15_relu_12_relu_13_squeeze_46_var_mean_15_14
```

After:
```
triton_fused_add_83_add_84_relu_13_squeeze_46_var_mean_15_14
```
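
For illustration, here is a minimal sketch of the idea behind the fix (a hypothetical helper, not the actual Inductor code): only origin nodes whose computation is actually emitted by a kernel should contribute to that kernel's name.

```
# Hypothetical sketch of the naming fix, not the actual Inductor implementation.
def fused_kernel_name(origins: set, kernel_origins: set) -> str:
    # Keep only origin FX nodes whose computation lands in this kernel;
    # previously, origins realized in other kernels (convolution_15,
    # relu_12, ...) leaked into the name.
    relevant = origins & kernel_origins
    return "triton_fused_" + "_".join(sorted(node.name for node in relevant))
```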

This change applies to the following kernel (a fused batch-norm-style kernel: per-channel mean and variance, normalization, ReLU, and running-statistics updates):
```
@persistent_reduction(
    size_hints=[512, 64],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: 'i32', 11: 'i32'}, 'device': 0, 'constants': {}, 'mutated_arg_names': ['in_out_ptr0'], 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10), equal_to_1=())]}
)
@triton.jit
def triton_(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr0, out_ptr2, out_ptr3, out_ptr4, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 512
    rnumel = 49
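    # Persistent reduction: the whole 49-element reduction dimension fits in one RBLOCK, so no reduction loop is needed.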
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    rindex = tl.arange(0, RBLOCK)[None, :]
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (r1 + (49*x0)), rmask & xmask, other=0)
    tmp8 = tl.load(in_ptr1 + (x0), xmask)
    tmp22 = tl.load(in_ptr2 + (x0), xmask)
    tmp24 = tl.load(in_ptr3 + (x0), xmask)
    tmp30 = tl.load(in_ptr4 + (x0), xmask)
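    # Mean over the reduction dimension (49 elements per row).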
    tmp2 = tl.where(rmask & xmask, tmp0, 0)
    tmp3 = tl.sum(tmp2, 1)[:, None]
    tmp4 = 49.0
    tmp5 = tmp3 / tmp4
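    # Running-mean-style exponential update with momentum 0.1.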
    tmp6 = 0.1
    tmp7 = tmp5 * tmp6
    tmp9 = 0.9
    tmp10 = tmp8 * tmp9
    tmp11 = tmp7 + tmp10
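    # Biased batch variance: mean of squared deviations from the mean.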
    tmp12 = tmp0 - tmp5
    tmp13 = tmp12 * tmp12
    tmp15 = tl.where(rmask & xmask, tmp13, 0)
    tmp16 = tl.sum(tmp15, 1)[:, None]
    tmp17 = tmp16 / tmp4
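    # Normalize with rsqrt(var + eps), then scale (in_ptr2) and shift (in_ptr3).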
    tmp18 = 1e-05
    tmp19 = tmp17 + tmp18
    tmp20 = tl.libdevice.rsqrt(tmp19)
    tmp21 = tmp12 * tmp20
    tmp23 = tmp21 * tmp22
    tmp25 = tmp23 + tmp24
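    # ReLU on the normalized output.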
    tmp26 = tl.where(0 != 0, 0, tl.where(0 > tmp25, 0, tmp25))
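    # Unbiased variance (factor 49/48) for the running-variance update.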
    tmp27 = 1.0208333333333333
    tmp28 = tmp17 * tmp27
    tmp29 = tmp28 * tmp6
    tmp31 = tmp30 * tmp9
    tmp32 = tmp29 + tmp31
    tl.store(in_out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp5, xmask)
    tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp11, xmask)
    tl.store(out_ptr2 + (r1 + (49*x0) + tl.zeros([XBLOCK, RBLOCK], tl.int32)), tmp26, rmask & xmask)
    tl.store(out_ptr3 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp20, xmask)
    tl.store(out_ptr4 + (x0 + tl.zeros([XBLOCK, 1], tl.int32)), tmp32, xmask)
```

Tbh this still isn't super great provenance tracking, since ops like layer norms are decomposed into primitive ops before fusion, so the fused kernel name only reflects the post-decomposition nodes. I might add some extra provenance tracking during decompositions.
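
One possible shape for that, sketched below (`source_op` is a made-up meta key for illustration, not an existing Inductor field): after a decomposition runs, stamp each resulting FX node with the name of the op it came from, so downstream naming can surface the original op.

```
# Hypothetical sketch of provenance through decompositions.
import torch.fx as fx

def stamp_decomposition(original: fx.Node, decomposed: list[fx.Node]) -> None:
    # fx.Node.meta is a free-form dict; "source_op" is invented for this sketch.
    for node in decomposed:
        node.meta.setdefault("source_op", original.name)
```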



cc soumith voznesenskym yanboliang penguinwu anijain2305 EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng Xia-Weiwen wenzhe-nrv jiayisunx peterbell10 desertfire

[ghstack-poisoned]
Chillee committed Mar 5, 2023
2 parents 6f3c63d + 192ac60 commit d26cdcf
Showing 561 changed files with 21,981 additions and 7,392 deletions.
2 changes: 1 addition & 1 deletion .ci/docker/android/build.gradle
@@ -53,7 +53,7 @@ dependencies {
implementation 'androidx.appcompat:appcompat:1.0.0'
implementation 'com.facebook.fbjni:fbjni-java-only:0.2.2'
implementation 'com.google.code.findbugs:jsr305:3.0.1'
-implementation 'com.facebook.soloader:nativeloader:0.10.4'
+implementation 'com.facebook.soloader:nativeloader:0.10.5'

implementation 'junit:junit:' + rootProject.junitVersion
implementation 'androidx.test:core:' + rootProject.coreVersion
12 changes: 12 additions & 0 deletions .ci/docker/build.sh
@@ -100,6 +100,7 @@ case "$image" in
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
+TRITON=yes
;;
pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7)
CUDA_VERSION=11.7.0
@@ -113,6 +114,7 @@ case "$image" in
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
+TRITON=yes
;;
pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7)
CUDA_VERSION=11.8.0
@@ -126,6 +128,7 @@ case "$image" in
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
+TRITON=yes
;;
pytorch-linux-focal-py3-clang7-asan)
ANACONDA_PYTHON_VERSION=3.9
@@ -134,6 +137,7 @@ case "$image" in
DB=yes
VISION=yes
CONDA_CMAKE=yes
+TRITON=yes
;;
pytorch-linux-focal-py3-clang10-onnx)
ANACONDA_PYTHON_VERSION=3.8
@@ -162,6 +166,7 @@ case "$image" in
VULKAN_SDK_VERSION=1.2.162.1
SWIFTSHADER=yes
CONDA_CMAKE=yes
+TRITON=yes
;;
pytorch-linux-bionic-py3.11-clang9)
ANACONDA_PYTHON_VERSION=3.11
@@ -172,6 +177,7 @@ case "$image" in
VULKAN_SDK_VERSION=1.2.162.1
SWIFTSHADER=yes
CONDA_CMAKE=yes
+TRITON=yes
;;
pytorch-linux-bionic-py3.8-gcc9)
ANACONDA_PYTHON_VERSION=3.8
@@ -180,6 +186,7 @@ case "$image" in
DB=yes
VISION=yes
CONDA_CMAKE=yes
+TRITON=yes
;;
pytorch-linux-focal-rocm-n-1-py3)
ANACONDA_PYTHON_VERSION=3.8
@@ -209,6 +216,7 @@ case "$image" in
VISION=yes
KATEX=yes
CONDA_CMAKE=yes
+TRITON=yes
;;
pytorch-linux-jammy-cuda11.6-cudnn8-py3.8-clang12)
ANACONDA_PYTHON_VERSION=3.8
@@ -218,6 +226,7 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
+TRITON=yes
;;
pytorch-linux-jammy-cuda11.7-cudnn8-py3.8-clang12)
ANACONDA_PYTHON_VERSION=3.8
@@ -227,6 +236,7 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
+TRITON=yes
;;
pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12)
ANACONDA_PYTHON_VERSION=3.8
@@ -236,6 +246,7 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
+TRITON=yes
;;
pytorch-linux-focal-linter)
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
@@ -328,6 +339,7 @@ docker build \
--build-arg "UCX_COMMIT=${UCX_COMMIT}" \
--build-arg "UCC_COMMIT=${UCC_COMMIT}" \
--build-arg "CONDA_CMAKE=${CONDA_CMAKE}" \
--build-arg "TRITON=${TRITON}" \
-f $(dirname ${DOCKERFILE})/Dockerfile \
-t "$tmp_tag" \
"$@" \
1 change: 1 addition & 0 deletions .ci/docker/ci_commit_pins/triton.txt
@@ -0,0 +1 @@
b8b470bc597c1c5bd03682c09fe3e6b7c53787fd
6 changes: 5 additions & 1 deletion .ci/docker/common/common_utils.sh
@@ -13,7 +13,7 @@ as_jenkins() {
# NB: Pass on PATH and LD_LIBRARY_PATH to sudo invocation
# NB: This must be run from a directory that jenkins has access to,
# works around https://github.com/conda/conda-package-handling/pull/34
-$SUDO -H -u jenkins env -u SUDO_UID -u SUDO_GID -u SUDO_COMMAND -u SUDO_USER env "PATH=$PATH" "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" $*
+$SUDO -E -H -u jenkins env -u SUDO_UID -u SUDO_GID -u SUDO_COMMAND -u SUDO_USER env "PATH=$PATH" "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" $*
}

conda_install() {
@@ -30,3 +30,7 @@ conda_run() {
pip_install() {
as_jenkins conda run -n py_$ANACONDA_PYTHON_VERSION pip install --progress-bar off $*
}

+get_pinned_commit() {
+  cat "${1}".txt
+}
54 changes: 54 additions & 0 deletions .ci/docker/common/install_triton.sh
@@ -0,0 +1,54 @@
#!/bin/bash

set -ex

source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"

get_conda_version() {
as_jenkins conda list -n py_$ANACONDA_PYTHON_VERSION | grep -w $* | head -n 1 | awk '{print $2}'
}

conda_reinstall() {
as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y --force-reinstall $*
}

# The logic here is copied from .ci/pytorch/common_utils.sh
TRITON_PINNED_COMMIT=$(get_pinned_commit triton)

apt update
apt-get install -y gpg-agent

if [ -n "${CONDA_CMAKE}" ]; then
# Keep the current cmake and numpy version here, so we can reinstall them later
CMAKE_VERSION=$(get_conda_version cmake)
NUMPY_VERSION=$(get_conda_version numpy)
fi

if [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}" == "7" ]]; then
# Triton needs at least gcc-9 to build
apt-get install -y g++-9

CXX=g++-9 pip_install "git+https://github.com/openai/triton@${TRITON_PINNED_COMMIT}#subdirectory=python"
elif [ -n "${CLANG_VERSION}" ]; then
# Triton needs <filesystem> which surprisingly is not available with clang-9 toolchain
add-apt-repository -y ppa:ubuntu-toolchain-r/test
apt-get install -y g++-9

CXX=g++-9 pip_install "git+https://github.com/openai/triton@${TRITON_PINNED_COMMIT}#subdirectory=python"
else
pip_install "git+https://github.com/openai/triton@${TRITON_PINNED_COMMIT}#subdirectory=python"
fi

if [ -n "${CONDA_CMAKE}" ]; then
# TODO: This is to make sure that the same cmake and numpy version from install conda
# script is used. Without this step, the newer cmake version (3.25.2) downloaded by
# triton build step via pip will fail to detect conda MKL. Once that issue is fixed,
# this can be removed.
#
# The correct numpy version also needs to be set here because conda claims that it
# causes inconsistent environment. Without this, conda will attempt to install the
# latest numpy version, which fails ASAN tests with the following import error: Numba
# needs NumPy 1.20 or less.
conda_reinstall cmake="${CMAKE_VERSION}"
conda_reinstall numpy="${NUMPY_VERSION}"
fi
5 changes: 5 additions & 0 deletions .ci/docker/requirements-ci.txt
@@ -258,3 +258,8 @@ ghstack==0.7.1
#Description: ghstack tool
#Pinned versions: 0.7.1
#test that import:

+jinja2==3.1.2
+#Description: jinja2 template engine
+#Pinned versions: 3.1.2
+#test that import:
9 changes: 9 additions & 0 deletions .ci/docker/ubuntu-cuda/Dockerfile
@@ -85,6 +85,15 @@ COPY ./common/install_cmake.sh install_cmake.sh
RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
RUN rm install_cmake.sh

+ARG TRITON
+# Install triton, this needs to be done before sccache because the latter will
+# try to reach out to S3, which docker build runners don't have access
+COPY ./common/install_triton.sh install_triton.sh
+COPY ./common/common_utils.sh common_utils.sh
+COPY ci_commit_pins/triton.txt triton.txt
+RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
+RUN rm install_triton.sh common_utils.sh triton.txt

# Install ccache/sccache (do this last, so we get priority in PATH)
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH
9 changes: 9 additions & 0 deletions .ci/docker/ubuntu/Dockerfile
@@ -134,6 +134,15 @@ ENV OPENSSL_ROOT_DIR /opt/openssl
ENV OPENSSL_DIR /opt/openssl
RUN rm install_openssl.sh

+ARG TRITON
+# Install triton, this needs to be done before sccache because the latter will
+# try to reach out to S3, which docker build runners don't have access
+COPY ./common/install_triton.sh install_triton.sh
+COPY ./common/common_utils.sh common_utils.sh
+COPY ci_commit_pins/triton.txt triton.txt
+RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
+RUN rm install_triton.sh common_utils.sh triton.txt

# Install ccache/sccache (do this last, so we get priority in PATH)
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH
11 changes: 7 additions & 4 deletions .ci/pytorch/build.sh
@@ -191,16 +191,19 @@ if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then
set -e

get_bazel
+install_sccache_nvcc_for_bazel

# Leave 1 CPU free and use only up to 80% of memory to reduce the chance of crashing
# the runner
BAZEL_MEM_LIMIT="--local_ram_resources=HOST_RAM*.8"
BAZEL_CPU_LIMIT="--local_cpu_resources=HOST_CPUS-1"

-tools/bazel build --config=no-tty "${BAZEL_MEM_LIMIT}" "${BAZEL_CPU_LIMIT}" //...
-# Build torch, the Python module, and tests for CPU-only
-tools/bazel build --config=no-tty "${BAZEL_MEM_LIMIT}" "${BAZEL_CPU_LIMIT}" --config=cpu-only :torch :_C.so :all_tests

+if [[ "$CUDA_VERSION" == "cpu" ]]; then
+  # Build torch, the Python module, and tests for CPU-only
+  tools/bazel build --config=no-tty "${BAZEL_MEM_LIMIT}" "${BAZEL_CPU_LIMIT}" --config=cpu-only :torch :_C.so :all_tests
+else
+  tools/bazel build --config=no-tty "${BAZEL_MEM_LIMIT}" "${BAZEL_CPU_LIMIT}" //...
+fi
else
# check that setup.py would fail with bad arguments
echo "The next three invocations are expected to fail with invalid command error messages."
46 changes: 20 additions & 26 deletions .ci/pytorch/common_utils.sh
@@ -95,6 +95,26 @@ function get_bazel() {
chmod +x tools/bazel
}

+# This function is bazel specific because of the bug
+# in the bazel that requires some special paths massaging
+# as a workaround. See
+# https://github.com/bazelbuild/bazel/issues/10167
+function install_sccache_nvcc_for_bazel() {
+  sudo mv /usr/local/cuda/bin/nvcc /usr/local/cuda/bin/nvcc-real
+
+  # Write the `/usr/local/cuda/bin/nvcc`
+  cat << EOF | sudo tee /usr/local/cuda/bin/nvcc
+#!/bin/sh
+if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
+  exec sccache /usr/local/cuda/bin/nvcc "\$@"
+else
+  exec external/local_cuda/cuda/bin/nvcc-real "\$@"
+fi
+EOF
+
+  sudo chmod +x /usr/local/cuda/bin/nvcc
+}

function install_monkeytype {
# Install MonkeyType
pip_install MonkeyType
@@ -129,32 +149,6 @@ function clone_pytorch_xla() {
fi
}

-function install_filelock() {
-  pip_install filelock
-}
-
-function install_triton() {
-  local commit
-  if [[ "${TEST_CONFIG}" == *rocm* ]]; then
-    echo "skipping triton due to rocm"
-  else
-    commit=$(get_pinned_commit triton)
-    if [[ "${BUILD_ENVIRONMENT}" == *gcc7* ]]; then
-      # Trition needs gcc-9 to build
-      sudo apt-get install -y g++-9
-      CXX=g++-9 pip_install --user "git+https://github.com/openai/triton@${commit}#subdirectory=python"
-    elif [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then
-      # Trition needs <filesystem> which surprisingly is not available with clang-9 toolchain
-      sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
-      sudo apt-get install -y g++-9
-      CXX=g++-9 pip_install --user "git+https://github.com/openai/triton@${commit}#subdirectory=python"
-    else
-      pip_install --user "git+https://github.com/openai/triton@${commit}#subdirectory=python"
-    fi
-    pip_install --user jinja2
-  fi
-}

function setup_torchdeploy_deps(){
conda install -y -n "py_${ANACONDA_PYTHON_VERSION}" "libpython-static=${ANACONDA_PYTHON_VERSION}"
local CC
16 changes: 15 additions & 1 deletion .ci/pytorch/macos-build.sh
@@ -40,6 +40,16 @@ cross_compile_arm64() {
USE_DISTRIBUTED=0 CMAKE_OSX_ARCHITECTURES=arm64 MACOSX_DEPLOYMENT_TARGET=11.0 USE_MKLDNN=OFF USE_QNNPACK=OFF WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
}

+compile_arm64() {
+  # Compilation for arm64
+  # TODO: Compile with OpenMP support (but this causes CI regressions as cross-compilation were done with OpenMP disabled)
+  USE_DISTRIBUTED=0 USE_OPENMP=0 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
+}
+
+compile_x86_64() {
+  USE_DISTRIBUTED=0 WERROR=1 python setup.py bdist_wheel
+}

compile_x86_64() {
USE_DISTRIBUTED=0 WERROR=1 python setup.py bdist_wheel
}
@@ -63,7 +73,11 @@ build_lite_interpreter() {
}

if [[ ${BUILD_ENVIRONMENT} = *arm64* ]]; then
-cross_compile_arm64
+if [[ $(uname -m) == "arm64" ]]; then
+  compile_arm64
+else
+  cross_compile_arm64
+fi
elif [[ ${BUILD_ENVIRONMENT} = *lite-interpreter* ]]; then
export BUILD_LITE_INTERPRETER=1
build_lite_interpreter
1 change: 0 additions & 1 deletion .ci/pytorch/multigpu-test.sh
@@ -42,7 +42,6 @@ time python test/run_test.py --verbose -i distributed/_shard/sharded_tensor/ops/
time python test/run_test.py --verbose -i distributed/_shard/sharded_tensor/ops/test_softmax
time python test/run_test.py --verbose -i distributed/_shard/sharded_optim/test_sharded_optim
time python test/run_test.py --verbose -i distributed/_shard/test_partial_tensor
-time python test/run_test.py --verbose -i distributed/_shard/test_replicated_tensor
# Other tests
time python test/run_test.py --verbose -i test_cuda_primary_ctx
time python test/run_test.py --verbose -i test_optim -- -k optimizers_with_varying_tensors