Skip to content

Commit

Permalink
Update on "[vulkan] Add mean.dim op for vulkan"
Browse files Browse the repository at this point in the history
[ghstack-poisoned]
  • Loading branch information
SS-JIA committed Nov 3, 2020
2 parents 1800c75 + c68c3d0 commit 9185eb4
Show file tree
Hide file tree
Showing 34 changed files with 570 additions and 254 deletions.
1 change: 0 additions & 1 deletion .circleci/docker/centos-rocm/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ ENV PATH /opt/rocm/hcc/bin:$PATH
ENV PATH /opt/rocm/hip/bin:$PATH
ENV PATH /opt/rocm/opencl/bin:$PATH
ENV PATH /opt/rocm/llvm/bin:$PATH
ENV HIP_PLATFORM hcc
ENV LANG en_US.utf8
ENV LC_ALL en_US.utf8

Expand Down
1 change: 0 additions & 1 deletion .circleci/docker/ubuntu-rocm/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ ENV PATH /opt/rocm/hcc/bin:$PATH
ENV PATH /opt/rocm/hip/bin:$PATH
ENV PATH /opt/rocm/opencl/bin:$PATH
ENV PATH /opt/rocm/llvm/bin:$PATH
ENV HIP_PLATFORM hcc
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8

Expand Down
2 changes: 2 additions & 0 deletions .jenkins/caffe2/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then
fi
if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
# HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors
unset HIP_PLATFORM
if which sccache > /dev/null; then
# Save sccache logs to file
sccache --stop-server || true
Expand Down
2 changes: 2 additions & 0 deletions .jenkins/pytorch/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ SCRIPT_DIR="$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )"

# Figure out which Python to use for ROCm
if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]] && [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then
# HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors
unset HIP_PLATFORM
PYTHON=$(which "python${BASH_REMATCH[1]}")
# non-interactive bash shells do not expand aliases by default
shopt -s expand_aliases
Expand Down
28 changes: 3 additions & 25 deletions .jenkins/pytorch/macos-build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,8 @@ git submodule update --init --recursive
export CMAKE_PREFIX_PATH=${WORKSPACE_DIR}/miniconda3/

# Build PyTorch
if [[ "${BUILD_ENVIRONMENT}" == *cuda9.2* ]]; then
export CUDA_VERSION=9.2
export TORCH_CUDA_ARCH_LIST=5.2
export PATH=/Developer/NVIDIA/CUDA-${CUDA_VERSION}/bin${PATH:+:${PATH}}
export DYLD_LIBRARY_PATH=/Developer/NVIDIA/CUDA-${CUDA_VERSION}/lib${DYLD_LIBRARY_PATH:+:${DYLD_LIBRARY_PATH}}
export CUDA_HOME=/Developer/NVIDIA/CUDA-${CUDA_VERSION}
export USE_CUDA=1

if [ -z "${IN_CI}" ]; then
# Eigen gives "explicit specialization of class must precede its first use" error
# when compiling with Xcode 9.1 toolchain, so we have to use Xcode 8.2 toolchain instead.
export DEVELOPER_DIR=/Library/Developer/CommandLineTools
fi
else
if [ -z "${IN_CI}" ]; then
export DEVELOPER_DIR=/Applications/Xcode9.app/Contents/Developer
fi
if [ -z "${IN_CI}" ]; then
export DEVELOPER_DIR=/Applications/Xcode9.app/Contents/Developer
fi

if which sccache > /dev/null; then
Expand All @@ -34,17 +19,10 @@ if which sccache > /dev/null; then
printf "#!/bin/sh\nexec sccache $(which clang) \$*" > "${WORKSPACE_DIR}/clang"
chmod a+x "${WORKSPACE_DIR}/clang"

if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then
printf "#!/bin/sh\nexec sccache $(which nvcc) \$*" > "${WORKSPACE_DIR}/nvcc"
chmod a+x "${WORKSPACE_DIR}/nvcc"
export CUDA_NVCC_EXECUTABLE="${WORKSPACE_DIR}/nvcc"
fi

export PATH="${WORKSPACE_DIR}:$PATH"
fi

# If we run too many parallel jobs, we will OOM
MAX_JOBS=2 USE_DISTRIBUTED=1 python setup.py install
USE_DISTRIBUTED=1 python setup.py install

assert_git_not_dirty

Expand Down
22 changes: 12 additions & 10 deletions .jenkins/pytorch/win-test.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash -ex

#!/bin/bash
set -ex
# shellcheck disable=SC2034
COMPACT_JOB_NAME=pytorch-win-ws2019-cuda10-cudnn7-py3-test

Expand Down Expand Up @@ -42,28 +42,30 @@ fi

run_tests() {
if [ -z "${JOB_BASE_NAME}" ] || [[ "${JOB_BASE_NAME}" == *-test ]]; then
$SCRIPT_HELPERS_DIR/test_python_nn.bat "$DETERMINE_FROM" && \
$SCRIPT_HELPERS_DIR/test_python_all_except_nn.bat "$DETERMINE_FROM" && \
$SCRIPT_HELPERS_DIR/test_custom_script_ops.bat && \
$SCRIPT_HELPERS_DIR/test_custom_backend.bat && \
$SCRIPT_HELPERS_DIR/test_python_nn.bat "$DETERMINE_FROM"
$SCRIPT_HELPERS_DIR/test_python_all_except_nn.bat "$DETERMINE_FROM"
$SCRIPT_HELPERS_DIR/test_custom_script_ops.bat
$SCRIPT_HELPERS_DIR/test_custom_backend.bat
$SCRIPT_HELPERS_DIR/test_libtorch.bat
else
if [[ "${JOB_BASE_NAME}" == *-test1 ]]; then
export PYTORCH_COLLECT_COVERAGE=1
$SCRIPT_HELPERS_DIR/test_python_nn.bat "$DETERMINE_FROM" && \
$SCRIPT_HELPERS_DIR/test_python_nn.bat "$DETERMINE_FROM"
$SCRIPT_HELPERS_DIR/test_libtorch.bat
if [[ "${USE_CUDA}" == "1" ]]; then
$SCRIPT_HELPERS_DIR/test_python_jit_legacy.bat "$DETERMINE_FROM"
fi
elif [[ "${JOB_BASE_NAME}" == *-test2 ]]; then
$SCRIPT_HELPERS_DIR/test_python_all_except_nn.bat "$DETERMINE_FROM" && \
$SCRIPT_HELPERS_DIR/test_custom_backend.bat && \
$SCRIPT_HELPERS_DIR/test_python_all_except_nn.bat "$DETERMINE_FROM"
$SCRIPT_HELPERS_DIR/test_custom_backend.bat
$SCRIPT_HELPERS_DIR/test_custom_script_ops.bat
fi
fi
}

run_tests && assert_git_not_dirty && echo "TEST PASSED"
run_tests
assert_git_not_dirty
echo "TEST PASSED"

if [[ "${BUILD_ENVIRONMENT}" == "pytorch-win-vs2019-cuda10-cudnn7-py3" ]] && [[ "${JOB_BASE_NAME}" == *-test1 ]]; then
pushd $TEST_DIR
Expand Down
66 changes: 66 additions & 0 deletions aten/src/ATen/native/LinearAlgebra.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1738,5 +1738,71 @@ Tensor chain_matmul(TensorList matrices) {
}
}

/*
Computes the Kronecker product of `self` and `other`.

The result has max(self.dim(), other.dim()) dimensions. The trailing
dimensions of the two inputs are paired up and each pair contributes the
product of the two sizes; the leading dimensions of the higher-rank input,
which have no partner, are carried over unchanged.
*/
Tensor kron(const Tensor& self, const Tensor& other) {
  /*
  Strategy: take the outer product with tensordot (no contracted dims), then
  permute and reshape.

  Example in einsum notation, with `self` of dim 4 and `other` of dim 2:
  the tensordot result is 0123, 45 -> 012345. Walking from the right, every
  trailing axis of `self` has to sit immediately before its partner axis of
  `other`, which gives the permutation 012435. Collapsing each adjacent pair
  yields the kron shape
    (shape_self[0], shape_self[1],
     shape_self[2] * shape_other[0], shape_self[3] * shape_other[1]).
  */
  const std::vector<int64_t> lhs_shape = self.sizes().vec();
  const std::vector<int64_t> rhs_shape = other.sizes().vec();
  const int64_t lhs_rank = self.dim();
  const int64_t rhs_rank = other.dim();

  const bool lhs_is_longer = lhs_rank >= rhs_rank;
  // Number of trailing axes that have a partner in the other tensor.
  const int64_t paired_rank = lhs_is_longer ? rhs_rank : lhs_rank;
  // Number of leading axes (of the higher-rank input) without a partner.
  const int64_t unpaired_rank =
      lhs_is_longer ? lhs_rank - rhs_rank : rhs_rank - lhs_rank;

  // In the outer product, axis k of `self` is axis k and axis k of `other`
  // is axis lhs_rank + k.
  std::vector<int64_t> axis_order(lhs_rank + rhs_rank);
  std::vector<int64_t> out_shape(lhs_is_longer ? lhs_rank : rhs_rank);

  // Leading, unpaired axes keep their relative order and their sizes.
  for (int64_t k = 0; k < unpaired_rank; ++k) {
    axis_order[k] = lhs_is_longer ? k : lhs_rank + k;
    out_shape[k] = lhs_is_longer ? lhs_shape[k] : rhs_shape[k];
  }

  // Trailing axes: interleave (self axis, other axis) so that the reshape
  // below merges each pair into a single dimension.
  const int64_t lhs_first_paired = lhs_rank - paired_rank;
  const int64_t rhs_first_paired = rhs_rank - paired_rank;
  for (int64_t k = 0; k < paired_rank; ++k) {
    const int64_t lhs_axis = lhs_first_paired + k;
    const int64_t rhs_axis = rhs_first_paired + k;
    axis_order[unpaired_rank + 2 * k] = lhs_axis;
    axis_order[unpaired_rank + 2 * k + 1] = lhs_rank + rhs_axis;
    out_shape[unpaired_rank + k] = lhs_shape[lhs_axis] * rhs_shape[rhs_axis];
  }

  // Step 1: outer product (no dimensions are contracted).
  Tensor result = at::tensordot(self, other, {}, {});
  // Step 2: bring partner axes next to each other.
  result = result.permute(axis_order);
  // Step 3: merge each adjacent pair of axes.
  result = result.reshape(out_shape);

  return result;
}

/*
Out-variant of kron: computes the Kronecker product of `self` and `other`
into `result` and returns `result`.

`result` must already have the same dtype as `self`; it is resized to the
shape of the product before the values are copied in.
*/
Tensor& kron_out(Tensor& result, const Tensor& self, const Tensor& other) {
  const auto out_dtype = result.scalar_type();
  const auto in_dtype = self.scalar_type();
  TORCH_CHECK(out_dtype == in_dtype,
      "result dtype ", out_dtype, " does not match self dtype ", in_dtype);

  // Compute into a temporary, then resize the output and copy the values.
  const Tensor product = at::kron(self, other);
  at::native::resize_output(result, product.sizes());
  result.copy_(product);
  return result;
}

} // namespace native
} // namespace at
9 changes: 9 additions & 0 deletions aten/src/ATen/native/native_functions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2142,6 +2142,15 @@
CPU: kl_div_backward_cpu
CUDA: kl_div_backward_cuda

- func: kron(Tensor self, Tensor other) -> Tensor
variants: function, method
dispatch:
Math: kron

- func: kron.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
Math: kron_out

- func: kthvalue(Tensor self, int k, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
use_c10_dispatcher: full
variants: function, method
Expand Down
8 changes: 4 additions & 4 deletions aten/src/ATen/native/vulkan/glsl/mean.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ layout(std430) buffer;
layout(std430) uniform;
layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput;
layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
layout(set = 0, binding = 2) uniform Block {
layout(set = 0, binding = 2) uniform PRECISION restrict Block {
int W;
int H;
} uBlock;
Expand All @@ -13,11 +13,11 @@ layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;

void main() {
ivec3 pos = ivec3(gl_GlobalInvocationID);
vec4 r = vec4(1.0) / float(uBlock.W) / float(uBlock.H);
vec4 r = vec4(1.0) / (float(uBlock.W) * float(uBlock.H));
vec4 acc = vec4(0);
int xi, yi;
for (xi = 0; xi < uBlock.W; ++xi) {
for (yi = 0; yi < uBlock.H; ++yi) {
for (yi = 0; yi < uBlock.H; ++yi) {
for (xi = 0; xi < uBlock.W; ++xi) {
acc += texelFetch(uInput, ivec3(xi, yi, pos.z), 0);
}
}
Expand Down
9 changes: 4 additions & 5 deletions aten/src/ATen/native/vulkan/glsl/mean2d.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ layout(std430) buffer;
layout(std430) uniform;
layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput;
layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput;
layout(set = 0, binding = 2) uniform Block {
layout(set = 0, binding = 2) uniform PRECISION restrict Block {
int W;
int H;
} uBlock;
Expand All @@ -13,18 +13,17 @@ layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;

void main() {
ivec3 pos = ivec3(gl_GlobalInvocationID);
vec4 r = vec4(1.0) / float(uBlock.W) / float(uBlock.H);
vec4 r = vec4(1.0) / (float(uBlock.W) * float(uBlock.H));
vec4 acc = vec4(0);
int xi, yi;
int zi = (imageSize(uOutput).x*pos.y + pos.x)/4;
int zo = (imageSize(uOutput).x*pos.y + pos.x)%4;
for (xi = 0; xi < uBlock.W; ++xi) {
for (yi = 0; yi < uBlock.H; ++yi) {
for (yi = 0; yi < uBlock.H; ++yi) {
for (xi = 0; xi < uBlock.W; ++xi) {
acc += texelFetch(uInput, ivec3(xi, yi, zi), 0);
}
}
vec4 outValue = r * acc;

int test = (imageSize(uOutput).x*pos.x + pos.x);
imageStore(uOutput, pos, vec4(outValue[zo], 0,0,0));
}

0 comments on commit 9185eb4

Please sign in to comment.