Update on "[vulkan] Distribute weight prepacking along y dimension for conv2d"

Differential Revision: [D25222752](https://our.internmc.facebook.com/intern/diff/D25222752)

[ghstack-poisoned]
SS-JIA committed Dec 1, 2020
2 parents adb309c + 5f181e2 commit f419c73
Showing 99 changed files with 2,030 additions and 1,066 deletions.
2 changes: 1 addition & 1 deletion .circleci/cimodel/data/pytorch_build_data.py
@@ -93,7 +93,7 @@
]),
]),
("rocm", [
("3.7", [
("3.9", [
("3.6", [
('build_only', [XImportant(True)]),
]),
1 change: 0 additions & 1 deletion .circleci/cimodel/data/simple/docker_definitions.py
@@ -29,7 +29,6 @@
"pytorch-linux-xenial-py3.6-gcc5.4", # this one is used in doc builds
"pytorch-linux-xenial-py3.6-gcc7.2",
"pytorch-linux-xenial-py3.6-gcc7",
"pytorch-linux-bionic-rocm3.7-py3.6",
"pytorch-linux-bionic-rocm3.8-py3.6",
"pytorch-linux-bionic-rocm3.9-py3.6",
]
27 changes: 8 additions & 19 deletions .circleci/config.yml
@@ -453,12 +453,8 @@ jobs:
no_output_timeout: "1h"
command: |
set -e
# TODO: Remove this after we figure out why rocm tests are failing
if [[ "${DOCKER_IMAGE}" == *rocm3.5* ]]; then
export DOCKER_TAG="ab1632df-fa59-40e6-8c23-98e004f61148"
fi
if [[ "${DOCKER_IMAGE}" == *rocm3.7* ]]; then
export DOCKER_TAG="1045c7b891104cb4fd23399eab413b6213e48aeb"
if [[ "${DOCKER_IMAGE}" == *rocm3.9* ]]; then
export DOCKER_TAG="f3d89a32912f62815e4feaeed47e564e887dffd6"
fi
if [[ ${BUILD_ENVIRONMENT} == *"pure_torch"* ]]; then
echo 'BUILD_CAFFE2=OFF' >> "${BASH_ENV}"
@@ -538,12 +534,8 @@ jobs:
command: |
set -e
export PYTHONUNBUFFERED=1
# TODO: Remove this after we figure out why rocm tests are failing
if [[ "${DOCKER_IMAGE}" == *rocm3.5* ]]; then
export DOCKER_TAG="ab1632df-fa59-40e6-8c23-98e004f61148"
fi
if [[ "${DOCKER_IMAGE}" == *rocm3.7* ]]; then
export DOCKER_TAG="1045c7b891104cb4fd23399eab413b6213e48aeb"
if [[ "${DOCKER_IMAGE}" == *rocm3.9* ]]; then
export DOCKER_TAG="f3d89a32912f62815e4feaeed47e564e887dffd6"
fi
# See Note [Special build images]
output_image=${DOCKER_IMAGE}:${DOCKER_TAG}-${CIRCLE_SHA1}
@@ -7280,9 +7272,6 @@ workflows:
- docker_build_job:
name: "docker-pytorch-linux-xenial-py3.6-gcc7"
image_name: "pytorch-linux-xenial-py3.6-gcc7"
- docker_build_job:
name: "docker-pytorch-linux-bionic-rocm3.7-py3.6"
image_name: "pytorch-linux-bionic-rocm3.7-py3.6"
- docker_build_job:
name: "docker-pytorch-linux-bionic-rocm3.8-py3.6"
image_name: "pytorch-linux-bionic-rocm3.8-py3.6"
@@ -7713,11 +7702,11 @@ workflows:
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.8-gcc9"
resource_class: large
- pytorch_linux_build:
name: pytorch_linux_bionic_rocm3_7_py3_6_build
name: pytorch_linux_bionic_rocm3_9_py3_6_build
requires:
- "docker-pytorch-linux-bionic-rocm3.7-py3.6"
build_environment: "pytorch-linux-bionic-rocm3.7-py3.6-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm3.7-py3.6"
- "docker-pytorch-linux-bionic-rocm3.9-py3.6"
build_environment: "pytorch-linux-bionic-rocm3.9-py3.6-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm3.9-py3.6"
resource_class: xlarge
- pytorch_macos_10_13_py3_build:
name: pytorch_macos_10_13_py3_build
7 changes: 0 additions & 7 deletions .circleci/docker/build.sh
@@ -274,13 +274,6 @@ case "$image" in
VISION=yes
KATEX=yes
;;
pytorch-linux-bionic-rocm3.7-py3.6)
ANACONDA_PYTHON_VERSION=3.6
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=3.7
;;
pytorch-linux-bionic-rocm3.8-py3.6)
ANACONDA_PYTHON_VERSION=3.6
PROTOBUF=yes
16 changes: 4 additions & 12 deletions .circleci/verbatim-sources/job-specs/pytorch-job-specs.yml
@@ -15,12 +15,8 @@ jobs:
no_output_timeout: "1h"
command: |
set -e
# TODO: Remove this after we figure out why rocm tests are failing
if [[ "${DOCKER_IMAGE}" == *rocm3.5* ]]; then
export DOCKER_TAG="ab1632df-fa59-40e6-8c23-98e004f61148"
fi
if [[ "${DOCKER_IMAGE}" == *rocm3.7* ]]; then
export DOCKER_TAG="1045c7b891104cb4fd23399eab413b6213e48aeb"
if [[ "${DOCKER_IMAGE}" == *rocm3.9* ]]; then
export DOCKER_TAG="f3d89a32912f62815e4feaeed47e564e887dffd6"
fi
if [[ ${BUILD_ENVIRONMENT} == *"pure_torch"* ]]; then
echo 'BUILD_CAFFE2=OFF' >> "${BASH_ENV}"
@@ -100,12 +96,8 @@ jobs:
command: |
set -e
export PYTHONUNBUFFERED=1
# TODO: Remove this after we figure out why rocm tests are failing
if [[ "${DOCKER_IMAGE}" == *rocm3.5* ]]; then
export DOCKER_TAG="ab1632df-fa59-40e6-8c23-98e004f61148"
fi
if [[ "${DOCKER_IMAGE}" == *rocm3.7* ]]; then
export DOCKER_TAG="1045c7b891104cb4fd23399eab413b6213e48aeb"
if [[ "${DOCKER_IMAGE}" == *rocm3.9* ]]; then
export DOCKER_TAG="f3d89a32912f62815e4feaeed47e564e887dffd6"
fi
# See Note [Special build images]
output_image=${DOCKER_IMAGE}:${DOCKER_TAG}-${CIRCLE_SHA1}
5 changes: 5 additions & 0 deletions .jenkins/pytorch/common.sh
@@ -129,13 +129,18 @@ fi
if [[ "$BUILD_ENVIRONMENT" == *pytorch-xla-linux-bionic* ]] || \
[[ "$BUILD_ENVIRONMENT" == *pytorch-linux-xenial-cuda9-cudnn7-py2* ]] || \
[[ "$BUILD_ENVIRONMENT" == *pytorch-linux-xenial-cuda10.1-cudnn7-py3* ]] || \
[[ "$BUILD_ENVIRONMENT" == *pytorch-*centos* ]] || \
[[ "$BUILD_ENVIRONMENT" == *pytorch-linux-bionic* ]]; then
if ! which conda; then
echo "Expected ${BUILD_ENVIRONMENT} to use conda, but 'which conda' returns empty"
exit 1
else
conda install -q -y cmake
fi
if [[ "$BUILD_ENVIRONMENT" == *pytorch-*centos* ]]; then
# cmake3 package will conflict with conda cmake
sudo yum -y remove cmake3 || true
fi
fi
retry () {
1 change: 0 additions & 1 deletion BUILD.bazel
@@ -457,7 +457,6 @@ filegroup(
name = "aten_srcs_cu",
srcs = [
"aten/src/ATen/cuda/detail/IndexUtils.cu.cc",
"aten/src/ATen/cuda/detail/CUDAGraphsUtils.cu.cc",
"aten/src/ATen/native/cuda/Activation.cu.cc",
"aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu.cc",
"aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu.cc",
2 changes: 1 addition & 1 deletion android/libs/fbjni
17 changes: 15 additions & 2 deletions android/pytorch_android/CMakeLists.txt
@@ -1,5 +1,8 @@
cmake_minimum_required(VERSION 3.4.1)
project(pytorch_jni CXX)

include(GNUInstallDirs)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_VERBOSE_MAKEFILE ON)
message(STATUS "ANDROID_STL:${ANDROID_STL}")
@@ -68,8 +71,8 @@ target_compile_options(pytorch_jni PRIVATE
-fexceptions
)

target_include_directories(pytorch_jni PUBLIC
${libtorch_include_DIR}
target_include_directories(pytorch_jni BEFORE
PUBLIC $<BUILD_INTERFACE:${libtorch_include_DIR}>
)

set(fbjni_DIR ${CMAKE_CURRENT_LIST_DIR}/../libs/fbjni/)
@@ -155,3 +158,13 @@ if(USE_VULKAN)
endif()

target_link_libraries(pytorch_jni ${pytorch_jni_LIBS})

install(TARGETS pytorch_jni
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) #For windows

if(MSVC)
install(FILES $<TARGET_PDB_FILE:pytorch_jni> DESTINATION ${CMAKE_INSTALL_LIBDIR} OPTIONAL)
install(TARGETS pytorch_jni DESTINATION ${CMAKE_INSTALL_LIBDIR})
endif()
121 changes: 0 additions & 121 deletions aten/src/ATen/CUDAGeneratorImpl.h
@@ -2,122 +2,10 @@

#include <c10/core/GeneratorImpl.h>
#include <ATen/core/Generator.h>
#include <ATen/Tensor.h>
#include <ATen/Context.h>
#include <limits>

// TODO: this file should be in ATen/cuda, not top level

namespace at {
/**
* Note [CUDA Graph-safe RNG states]
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
* Strategy:
* ~~~~~~~~~
* A CUDA graph containing multiple RNG ops behaves like a
* single giant kernel from the perspective of ops external
* to the graph. During graph capture, logic below records
* the total of all offset increments that occur in the graphed
* region, and records the final total as the offset for the
* entire graph.
*
* When the graph reruns, the logic that reruns it
* increments this device's CUDA generator's offset
* by that total.
*
* Meanwhile, within the graph, at capture time, instead of
* populating PhiloxCudaStates with the uint64_t offset pulled
* directly from the global state, PhiloxCudaState instead
* holds a pointer to one-element stream-local int64_t device tensor
* holding an initial offset value, and a uint64_t holding an
* intra-graph offset. (The intra-graph offset starts from zero
* when capture begins.) In each consumer kernel,
* at::cuda::philox::unpack computes the offset to use for this kernel
* as intra-graph offset + *initial offset.
*
* When the graph reruns, the logic that reruns it first
* fill_s the initial offset tensor with this device's
* CUDA generator's current offset.
*
* The control flow above ensures graphed execution is bitwise
* identical to eager execution as long as RNG ops are enqueued
* from a single thread, even if RNG ops and graphs containing
* RNG ops are enqueued and run simultaneously on multiple streams.
*
* Usage:
* ~~~~~~
* PhiloxCudaState in this file, and unpack() in
* cuda/CUDAGraphsUtils.cuh allow non-divergent use of
* CUDAGeneratorImpl whether graph capture is underway or not.
*
* Each PhiloxCudaState instance should be used for one and only one
* consumer kernel.
*
* Example (see e.g. native/cuda/Dropout.cu):
*
* #include <ATen/cuda/CUDAGeneratorImpl.h>
* #include <ATen/cuda/CUDAGraphsUtils.cuh>
*
* __global__ void kernel(..., PhiloxCudaState philox_args) {
* auto seeds = at::cuda::philox::unpack(philox_args);
* IndexType idx = blockIdx.x * blockDim.x + threadIdx.x;
* curandStatePhilox4_32_10_t state;
* curand_init(std::get<0>(seeds), // seed
* idx, // per-thread subsequence
* std::get<1>(seeds), // offset in subsequence
* &state);
* ...
* }
*
* host_caller(...) {
* PhiloxCudaState rng_engine_inputs;
* {
* // See Note [Acquire lock when using random generators]
* std::lock_guard<std::mutex> lock(gen->mutex_);
*
* // gen could be HostState or DevState here! No divergent code needed!
* rng_engine_inputs = gen->philox_cuda_state(offset_increment);
* }
* kernel<<<...>>>(..., rng_engine_inputs);
* }
*
*/


// Stores state values. Passed as a kernel argument. See "Usage:" above.
struct PhiloxCudaState {
PhiloxCudaState() = default;
PhiloxCudaState(const PhiloxCudaState&) = default;
// Called if graph capture is not underway
PhiloxCudaState(uint64_t seed,
uint64_t offset) {
seed_ = seed;
offset_.val = offset;
}
// Called if graph capture is underway
PhiloxCudaState(uint64_t seed,
int64_t* offset_extragraph,
uint32_t offset_intragraph) {
seed_ = seed;
offset_.ptr = offset_extragraph;
offset_intragraph_ = offset_intragraph;
captured_ = true;
}

// Public members, directly accessible by at::cuda::philox::unpack.
// If we made them private with getters/setters, the getters/setters
// would have to be __device__, and we can't declare __device__ in ATen.
union Payload {
uint64_t val;
int64_t* ptr;
};

uint64_t seed_;
Payload offset_;
uint32_t offset_intragraph_;
bool captured_ = false;
};

struct TORCH_CUDA_API CUDAGeneratorImpl : public c10::GeneratorImpl {
// Constructors
@@ -131,22 +19,13 @@ struct TORCH_CUDA_API CUDAGeneratorImpl : public c10::GeneratorImpl {
uint64_t seed() override;
void set_philox_offset_per_thread(uint64_t offset);
uint64_t philox_offset_per_thread();
void graph_prologue(int64_t* offset_extragraph_);
uint64_t graph_epilogue();
PhiloxCudaState philox_cuda_state(uint64_t increment);

// Temporarily accommodates call sites that use philox_engine_inputs.
// Allows incremental refactor of call sites to use philox_cuda_state.
std::pair<uint64_t, uint64_t> philox_engine_inputs(uint64_t increment);

static DeviceType device_type();

private:
CUDAGeneratorImpl* clone_impl() const override;
uint64_t seed_ = default_rng_seed_val;
uint64_t philox_offset_per_thread_ = 0;
int64_t* offset_extragraph_;
uint32_t offset_intragraph_ = 0;
};

namespace cuda {
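
With the graph-capture plumbing above removed, CUDAGeneratorImpl exposes philox_engine_inputs as the interface RNG kernels consume. The sketch below is not part of this diff; it only illustrates, under the declarations shown in the header above, the call-site pattern from the removed comment block. The function, parameter name, and kernel launch are hypothetical placeholders, and the include path assumes the header's current top-level location.

// Minimal sketch of a host-side caller of philox_engine_inputs (hypothetical).
#include <ATen/CUDAGeneratorImpl.h>
#include <mutex>
#include <utility>

void host_caller(at::CUDAGeneratorImpl* gen, uint64_t counter_offset) {
  std::pair<uint64_t, uint64_t> rng_engine_inputs;
  {
    // See Note [Acquire lock when using random generators]
    std::lock_guard<std::mutex> lock(gen->mutex_);
    // Returns {seed, offset} and advances the generator's per-thread Philox offset.
    rng_engine_inputs = gen->philox_engine_inputs(counter_offset);
  }
  // Both values are then passed by value to the consumer CUDA kernel, e.g.
  // kernel<<<grid, block, 0, stream>>>(..., rng_engine_inputs);
}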
5 changes: 2 additions & 3 deletions aten/src/ATen/ScalarOps.cpp
@@ -29,11 +29,10 @@ Tensor& scalar_fill(Tensor& self, Scalar value) {
return self;
}

Tensor scalar_tensor_static(Scalar s, c10::optional<ScalarType> dtype_opt, c10::optional<Layout> layout_opt,
c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt, c10::optional<c10::MemoryFormat> memory_format_opt) {
Tensor scalar_tensor_static(Scalar s, c10::optional<ScalarType> dtype_opt, c10::optional<Device> device_opt) {
at::tracer::impl::NoTracerDispatchMode tracer_guard;
at::AutoNonVariableTypeMode non_var_type_mode(true);
auto result = at::detail::empty_cpu({}, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt);
auto result = at::detail::empty_cpu({}, dtype_opt, c10::nullopt, device_opt, c10::nullopt, c10::nullopt);
scalar_fill(result, s);
return result;
}
10 changes: 4 additions & 6 deletions aten/src/ATen/ScalarOps.h
@@ -12,9 +12,7 @@ namespace detail {
// but we also want to skip compute_types which in not avoidable
// in TensorIterator for now.
Tensor& scalar_fill(Tensor& self, Scalar value);
TORCH_API Tensor scalar_tensor_static(Scalar s, c10::optional<ScalarType> dtype_opt, c10::optional<Layout> layout_opt,
c10::optional<Device> device_opt, c10::optional<bool> pin_memory_opt,
c10::optional<c10::MemoryFormat> memory_format_opt);
TORCH_API Tensor scalar_tensor_static(Scalar s, c10::optional<ScalarType> dtype_opt, c10::optional<Device> device_opt);
} // namespace detail
} // namespace at

@@ -27,12 +25,12 @@ inline at::Tensor scalar_to_tensor(Scalar s, const Device device = at::kCPU) {
// This is the fast track we have for CPU scalar tensors.
if (device == at::kCPU && !s.isComplex()) {
if (s.isFloatingPoint()) {
return at::detail::scalar_tensor_static(s, at::kDouble, c10::nullopt, at::kCPU, c10::nullopt, c10::nullopt);
return at::detail::scalar_tensor_static(s, at::kDouble, at::kCPU);
} else if (s.isBoolean()) {
return at::detail::scalar_tensor_static(s, at::kBool, c10::nullopt, at::kCPU, c10::nullopt, c10::nullopt);
return at::detail::scalar_tensor_static(s, at::kBool, at::kCPU);
} else {
AT_ASSERT(s.isIntegral(false));
return at::detail::scalar_tensor_static(s, at::kLong, c10::nullopt, at::kCPU, c10::nullopt, c10::nullopt);
return at::detail::scalar_tensor_static(s, at::kLong, at::kCPU);
}
}
if (s.isFloatingPoint()) {
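
For reference, a hypothetical usage sketch (not part of this diff) of the simplified scalar_tensor_static signature declared above; the wrapper function is made up, and the call mirrors the fast-path calls inside scalar_to_tensor.

// Hypothetical: build a 0-dim CPU scalar tensor via the fast path shown above.
#include <ATen/ScalarOps.h>

at::Tensor make_scalar_double(double v) {
  // empty_cpu({}) followed by scalar_fill, as in ScalarOps.cpp above.
  return at::detail::scalar_tensor_static(v, at::kDouble, at::kCPU);
}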
6 changes: 0 additions & 6 deletions aten/src/ATen/core/Generator.h
@@ -13,12 +13,6 @@
#include <c10/util/intrusive_ptr.h>
#include <c10/core/Device.h>
#include <c10/core/DispatchKeySet.h>

// For the record I don't think this is a correct pimpl idiom.
// Including Impl header in interface header defeats the purpose
// because you can't change Impl private members without forcing
// everything that included the interface to rebuild.
// Impl should be forward-declared in the interface header instead.
#include <c10/core/GeneratorImpl.h>

/**
1 change: 1 addition & 0 deletions aten/src/ATen/core/aten_interned_strings.h
@@ -531,6 +531,7 @@ _(aten, nll_loss2d_forward) \
_(aten, nll_loss_backward) \
_(aten, nll_loss_forward) \
_(aten, nonzero) \
_(aten, nonzero_numpy) \
_(aten, norm) \
_(aten, norm_except_dim) \
_(aten, normal) \
