From 4c9eb57914fb538e21c46b63cfb5c2e9d5bc2f20 Mon Sep 17 00:00:00 2001
From: Scott Wolchok
Date: Wed, 18 Nov 2020 19:35:20 -0800
Subject: [PATCH] [PyTorch] Narrow Device to 2 bytes by narrowing DeviceType
 and DeviceIndex (#47023)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/47023

DeviceType pretty clearly only needs 1 byte. DeviceIndex only needs 1 byte
given that machines don't have anywhere near 255 GPUs in them as far as I
know.

ghstack-source-id: 116901430

Test Plan: Existing tests; added an assertion to catch the case where my
assumption about DeviceIndex is incorrect.

Reviewed By: dzhulgakov

Differential Revision: D24605460

fbshipit-source-id: 7c9a89027fcf8eebd623b7cdbf6302162c981cd2
---
 aten/src/ATen/cuda/CUDADevice.h                    |  2 +-
 c10/core/Device.h                                  | 14 +++++++-------
 c10/core/DeviceType.h                              |  3 +--
 c10/core/Stream.h                                  |  8 ++++----
 c10/core/TensorOptions.h                           |  2 +-
 c10/core/impl/DeviceGuardImplInterface.h           |  4 ++--
 c10/cuda/CUDAFunctions.cpp                         |  4 ++++
 caffe2/proto/caffe2.proto                          |  1 -
 caffe2/proto/caffe2_pb.h                           |  6 ------
 caffe2/python/__init__.py                          |  1 -
 caffe2/utils/proto_utils.cc                        |  1 -
 test/cpp/api/tensor_options.cpp                    |  4 ++--
 test/test_torch.py                                 |  5 -----
 torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp |  4 ++--
 torch/csrc/jit/codegen/fuser/cuda/fused_kernel.h   |  4 ++--
 15 files changed, 26 insertions(+), 37 deletions(-)

diff --git a/aten/src/ATen/cuda/CUDADevice.h b/aten/src/ATen/cuda/CUDADevice.h
index 9d14ab1627e0..2d0a682488fd 100644
--- a/aten/src/ATen/cuda/CUDADevice.h
+++ b/aten/src/ATen/cuda/CUDADevice.h
@@ -11,7 +11,7 @@ namespace cuda {
 inline Device getDeviceFromPtr(void* ptr) {
   cudaPointerAttributes attr;
   AT_CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr));
-  return {DeviceType::CUDA, static_cast<int16_t>(attr.device)};
+  return {DeviceType::CUDA, static_cast<DeviceIndex>(attr.device)};
 }
 
 }} // namespace at::cuda
diff --git a/c10/core/Device.h b/c10/core/Device.h
index f1249e865f8b..7827119bb0ac 100644
--- a/c10/core/Device.h
+++ b/c10/core/Device.h
@@ -15,7 +15,7 @@ namespace c10 {
 /// A DeviceIndex is not independently meaningful without knowing
 /// the DeviceType it is associated with; try to use Device rather than
 /// DeviceIndex directly.
-using DeviceIndex = int16_t;
+using DeviceIndex = int8_t;
 
 /// Represents a compute device on which a tensor is located. A device is
 /// uniquely identified by a type, which specifies the type of machine it is
@@ -94,9 +94,9 @@
   DeviceIndex index_ = -1;
   void validate() {
     TORCH_CHECK(index_ == -1 || index_ >= 0,
-                "Device index must be -1 or non-negative, got ", index_);
+                "Device index must be -1 or non-negative, got ", (int)index_);
     TORCH_CHECK(!is_cpu() || index_ <= 0,
-                "CPU device index must be -1 or zero, got ", index_);
+                "CPU device index must be -1 or zero, got ", (int)index_);
   }
 };
 
@@ -112,8 +112,8 @@ struct hash<c10::Device> {
   size_t operator()(c10::Device d) const noexcept {
     // Are you here because this static assert failed? Make sure you ensure
     // that the bitmasking code below is updated accordingly!
-    static_assert(sizeof(c10::DeviceType) == 2, "DeviceType is not 16-bit");
-    static_assert(sizeof(c10::DeviceIndex) == 2, "DeviceIndex is not 16-bit");
+    static_assert(sizeof(c10::DeviceType) == 1, "DeviceType is not 8-bit");
+    static_assert(sizeof(c10::DeviceIndex) == 1, "DeviceIndex is not 8-bit");
     // Note [Hazard when concatenating signed integers]
     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     // We must first convert to a same-sized unsigned type, before promoting to
@@ -124,8 +124,8 @@ struct hash<c10::Device> {
     // Technically, by C/C++ integer promotion rules, we only need one of the
     // uint32_t casts to the result type, but we put in both for explicitness's sake.
     uint32_t bits =
-        static_cast<uint32_t>(static_cast<uint16_t>(d.type())) << 16
-      | static_cast<uint32_t>(static_cast<uint16_t>(d.index()));
+        static_cast<uint32_t>(static_cast<uint8_t>(d.type())) << 16
+      | static_cast<uint32_t>(static_cast<uint8_t>(d.index()));
     return std::hash<uint32_t>{}(bits);
   }
 };
diff --git a/c10/core/DeviceType.h b/c10/core/DeviceType.h
index b4037f664561..86935436ae1c 100644
--- a/c10/core/DeviceType.h
+++ b/c10/core/DeviceType.h
@@ -12,7 +12,7 @@
 
 namespace c10 {
 
-enum class DeviceType : int16_t {
+enum class DeviceType : int8_t {
   CPU = 0,
   CUDA = 1, // CUDA.
   MKLDNN = 2, // Reserved for explicit MKLDNN
@@ -30,7 +30,6 @@ enum class DeviceType : int16_t {
   // in DeviceType.cpp
   //  - Change the number below
   COMPILE_TIME_MAX_DEVICE_TYPES = 12,
-  ONLY_FOR_TEST = 20901, // This device type is only for test.
 };
 
 constexpr DeviceType kCPU = DeviceType::CPU;
diff --git a/c10/core/Stream.h b/c10/core/Stream.h
index 5baac5325af7..62d5261534ee 100644
--- a/c10/core/Stream.h
+++ b/c10/core/Stream.h
@@ -111,14 +111,14 @@ class Stream final {
   uint64_t pack() const noexcept {
     // Are you here because this static assert failed? Make sure you ensure
     // that the bitmasking code below is updated accordingly!
-    static_assert(sizeof(DeviceType) == 2, "DeviceType is not 16-bit");
-    static_assert(sizeof(DeviceIndex) == 2, "DeviceIndex is not 16-bit");
+    static_assert(sizeof(DeviceType) == 1, "DeviceType is not 8-bit");
+    static_assert(sizeof(DeviceIndex) == 1, "DeviceIndex is not 8-bit");
     static_assert(sizeof(StreamId) == 4, "DeviceIndex is not 32-bit");
     // Concat these together into a 64-bit integer
     // See Note [Hazard when concatenating signed integers]
     uint64_t bits =
-        static_cast<uint64_t>(static_cast<uint16_t>(device_type())) << 48
-      | static_cast<uint64_t>(static_cast<uint16_t>(device_index())) << 32
+        static_cast<uint64_t>(static_cast<uint8_t>(device_type())) << 48
+      | static_cast<uint64_t>(static_cast<uint8_t>(device_index())) << 32
       | static_cast<uint64_t>(static_cast<uint32_t>(id()));
     return bits;
   }
diff --git a/c10/core/TensorOptions.h b/c10/core/TensorOptions.h
index 420e0a984b77..347df066cc90 100644
--- a/c10/core/TensorOptions.h
+++ b/c10/core/TensorOptions.h
@@ -504,7 +504,7 @@ struct C10_API TensorOptions {
   // NB: We didn't use c10::optional here, because then we can't pack
   // the has_***_ boolean fields.
 
-  Device device_ = at::kCPU; // 32-bit
+  Device device_ = at::kCPU; // 16-bit
   caffe2::TypeMeta dtype_ = caffe2::TypeMeta::Make<float>(); // 16-bit
   Layout layout_ = at::kStrided; // 8-bit
   MemoryFormat memory_format_ = MemoryFormat::Contiguous; // 8-bit
diff --git a/c10/core/impl/DeviceGuardImplInterface.h b/c10/core/impl/DeviceGuardImplInterface.h
index f7f5b4f867a9..2ef02b57d3be 100644
--- a/c10/core/impl/DeviceGuardImplInterface.h
+++ b/c10/core/impl/DeviceGuardImplInterface.h
@@ -215,8 +215,8 @@ inline const DeviceGuardImplInterface* getDeviceGuardImpl(DeviceType type) {
   // FB employees can see
   // https://fb.workplace.com/groups/llvm.gcc/permalink/4053565044692080/
   // for more details
-  static_assert(sizeof(DeviceType) == 2, "DeviceType is not 16-bit");
-  auto p = device_guard_impl_registry[static_cast<size_t>(type) & 0xFFFF].load();
+  static_assert(sizeof(DeviceType) == 1, "DeviceType is not 8-bit");
+  auto p = device_guard_impl_registry[static_cast<size_t>(type) & 0xFF].load();
 
   // This seems to be the first place where you make use of a device
   // when you pass devices to factory functions. Give a nicer error
diff --git a/c10/cuda/CUDAFunctions.cpp b/c10/cuda/CUDAFunctions.cpp
index 99bb721798d9..4c87cb9ec8c6 100644
--- a/c10/cuda/CUDAFunctions.cpp
+++ b/c10/cuda/CUDAFunctions.cpp
@@ -1,5 +1,7 @@
 #include <c10/cuda/CUDAFunctions.h>
 
+#include <limits>
+
 namespace c10 {
 namespace cuda {
 
@@ -93,6 +95,8 @@ DeviceIndex device_count() noexcept {
   // initialize number of devices only once
   static int count = []() {
     try {
+      auto result = device_count_impl();
+      TORCH_INTERNAL_ASSERT(result <= std::numeric_limits<DeviceIndex>::max(), "Too many CUDA devices, DeviceIndex overflowed");
       return device_count_impl();
     } catch (const c10::Error& ex) {
       // We don't want to fail, but still log the warning
diff --git a/caffe2/proto/caffe2.proto b/caffe2/proto/caffe2.proto
index dd8f365b4d85..76acbf201f71 100644
--- a/caffe2/proto/caffe2.proto
+++ b/caffe2/proto/caffe2.proto
@@ -239,7 +239,6 @@
   PROTO_XLA = 9; // XLA / TPU
   // Change the following number if you add more devices in the code.
   PROTO_COMPILE_TIME_MAX_DEVICE_TYPES = 10;
-  PROTO_ONLY_FOR_TEST = 20901; // This device type is only for test.
 }
 
 // Device-specific options. We do not distinguish DeviceOption protos for
diff --git a/caffe2/proto/caffe2_pb.h b/caffe2/proto/caffe2_pb.h
index 6bd886b559c1..23af2be32219 100644
--- a/caffe2/proto/caffe2_pb.h
+++ b/caffe2/proto/caffe2_pb.h
@@ -15,7 +15,6 @@ constexpr DeviceType IDEEP = DeviceType::IDEEP;
 constexpr DeviceType HIP = DeviceType::HIP;
 constexpr DeviceType COMPILE_TIME_MAX_DEVICE_TYPES =
     DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES;
-constexpr DeviceType ONLY_FOR_TEST = DeviceType::ONLY_FOR_TEST;
 
 inline CAFFE2_API DeviceType ProtoToType(const caffe2::DeviceTypeProto p) {
   switch (p) {
@@ -35,8 +34,6 @@ inline CAFFE2_API DeviceType ProtoToType(const caffe2::DeviceTypeProto p) {
       return DeviceType::HIP;
     case caffe2::PROTO_COMPILE_TIME_MAX_DEVICE_TYPES:
       return DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES;
-    case caffe2::PROTO_ONLY_FOR_TEST:
-      return DeviceType::ONLY_FOR_TEST;
     default:
       AT_ERROR(
           "Unknown device:",
@@ -69,8 +66,6 @@ inline CAFFE2_API DeviceTypeProto TypeToProto(const DeviceType& t) {
       return caffe2::PROTO_HIP;
     case DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES:
       return caffe2::PROTO_COMPILE_TIME_MAX_DEVICE_TYPES;
-    case DeviceType::ONLY_FOR_TEST:
-      return caffe2::PROTO_ONLY_FOR_TEST;
     default:
       AT_ERROR(
           "Unknown device:",
@@ -102,7 +97,6 @@ inline CAFFE2_API caffe2::DeviceOption DeviceToOption(
     case DeviceType::MKLDNN:
     case DeviceType::IDEEP:
     case DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES:
-    case DeviceType::ONLY_FOR_TEST:
       break;
     default:
       AT_ERROR(
diff --git a/caffe2/python/__init__.py b/caffe2/python/__init__.py
index 8582eff9ce19..1262f97588ad 100644
--- a/caffe2/python/__init__.py
+++ b/caffe2/python/__init__.py
@@ -12,7 +12,6 @@
 caffe2_pb2.IDEEP = caffe2_pb2.PROTO_IDEEP
 caffe2_pb2.HIP = caffe2_pb2.PROTO_HIP
 caffe2_pb2.COMPILE_TIME_MAX_DEVICE_TYPES = caffe2_pb2.PROTO_COMPILE_TIME_MAX_DEVICE_TYPES
-caffe2_pb2.ONLY_FOR_TEST = caffe2_pb2.PROTO_ONLY_FOR_TEST
 
 if platform.system() == 'Windows':
     is_conda = os.path.exists(os.path.join(sys.prefix, 'conda-meta'))
diff --git a/caffe2/utils/proto_utils.cc b/caffe2/utils/proto_utils.cc
index 5fc34d631c4f..70484643cded 100644
--- a/caffe2/utils/proto_utils.cc
+++ b/caffe2/utils/proto_utils.cc
@@ -53,7 +53,6 @@ C10_EXPORT bool IsCPUDeviceType(int device_type) {
       PROTO_CPU,
       PROTO_MKLDNN,
       PROTO_IDEEP,
-      PROTO_ONLY_FOR_TEST,
   };
   return cpu_types.count(device_type);
 }
diff --git a/test/cpp/api/tensor_options.cpp b/test/cpp/api/tensor_options.cpp
index 5de56139702a..d6b347a5d754 100644
--- a/test/cpp/api/tensor_options.cpp
+++ b/test/cpp/api/tensor_options.cpp
@@ -111,8 +111,8 @@ TEST(DeviceTest, ParsesCorrectlyFromString) {
   device = Device("hip");
   ASSERT_EQ(device, Device(DeviceType::HIP));
 
-  device = Device("hip:321");
-  ASSERT_EQ(device, Device(DeviceType::HIP, 321));
+  device = Device("hip:123");
+  ASSERT_EQ(device, Device(DeviceType::HIP, 123));
 
   std::vector<std::string> badnesses = {
       "", "cud:1", "cuda:", "cpu::1", ":1", "3", "tpu:4", "??"};
diff --git a/test/test_torch.py b/test/test_torch.py
index 6682ba53fa72..15ebcb10c367 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -691,11 +691,6 @@ def test_device(self):
         self.assertEqual('cuda', cuda90.type)
         self.assertEqual(90, cuda90.index)
 
-        cuda23333 = torch.device('cuda', 23333)
-        self.assertEqual('cuda:23333', str(cuda23333))
-        self.assertEqual('cuda', cuda23333.type)
-        self.assertEqual(23333, cuda23333.index)
-
         self.assertRaises(RuntimeError, lambda: torch.device('cpu:-1'))
         self.assertRaises(RuntimeError, lambda: torch.device('cpu:1'))
         self.assertRaises(RuntimeError, lambda: torch.device('cpu', -1))
diff --git a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp
index 27315ee47527..0e49f865388c 100644
--- a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp
+++ b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp
@@ -66,7 +66,7 @@ static void getMajorMinor(
 
 // Compiles the specified kernel and stores the metadata required to run it
 FusedKernelCUDA::FusedKernelCUDA(
-    int16_t device,
+    at::DeviceIndex device,
     std::string name,
     std::string code,
     std::vector<TensorDesc> input_desc,
@@ -222,7 +222,7 @@ static std::shared_ptr<FusedKernel> createFusionKernel(
     std::vector<PartitionDesc> concat_desc,
     bool has_random) {
   return std::make_shared<FusedKernelCUDA>(
-      device,
+      static_cast<at::DeviceIndex>(device),
      std::move(name),
      std::move(code),
      std::move(input_desc),
diff --git a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.h b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.h
index 5dba48dabfc7..96a4a4a0f76c 100644
--- a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.h
+++ b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.h
@@ -22,7 +22,7 @@ namespace cuda {
 struct TORCH_CUDA_API FusedKernelCUDA
     : public ::torch::jit::fuser::FusedKernel {
   FusedKernelCUDA(
-      int16_t device,
+      at::DeviceIndex device,
       std::string name,
       std::string code,
       std::vector<TensorDesc> input_desc,
@@ -45,7 +45,7 @@ struct TORCH_CUDA_API FusedKernelCUDA
 
   // Note: per device to store device properties and compute launch heuristics
  // Acquiring these values at launch time would be too slow
-  int16_t device_;
+  at::DeviceIndex device_;
   int maxBlocks_;
   cudaDeviceProp* prop_;
   std::vector<char> ptx_;
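
For readers who want to sanity-check the packing this diff relies on, below is a minimal standalone C++ sketch (not part of the patch). The Toy* names are made-up stand-ins for c10::DeviceType, c10::DeviceIndex, and c10::Device; the sketch mirrors the hash packing in c10/core/Device.h after the change and shows why Note [Hazard when concatenating signed integers] requires casting each 1-byte field to a same-sized unsigned type before widening.

#include <cassert>
#include <cstdint>
#include <functional>

// Hypothetical stand-ins for c10::DeviceType and c10::DeviceIndex after this
// patch: both are one byte, so the Device-like struct packs into 2 bytes.
enum class ToyDeviceType : std::int8_t { CPU = 0, CUDA = 1 };
using ToyDeviceIndex = std::int8_t;

struct ToyDevice {
  ToyDeviceType type;
  ToyDeviceIndex index; // -1 means "unspecified", as in c10::Device
};

// Mirrors Note [Hazard when concatenating signed integers]: each signed 1-byte
// field is first cast to uint8_t; otherwise an index of -1 would sign-extend
// to 0xFFFFFFFF and clobber the type bits placed above it.
std::uint32_t pack(ToyDevice d) {
  return static_cast<std::uint32_t>(static_cast<std::uint8_t>(d.type)) << 16 |
         static_cast<std::uint32_t>(static_cast<std::uint8_t>(d.index));
}

int main() {
  static_assert(sizeof(ToyDevice) == 2, "Device-like struct should be 2 bytes");
  ToyDevice unspecified_cuda{ToyDeviceType::CUDA, -1};
  // With the inner uint8_t cast, index -1 contributes 0xFF, not 0xFFFFFFFF.
  assert(pack(unspecified_cuda) == ((1u << 16) | 0xFFu));
  // Device.h then feeds the packed value to std::hash<uint32_t>.
  (void)std::hash<std::uint32_t>{}(pack(unspecified_cuda));
  return 0;
}

Compiling this with any C++11-or-later compiler and running it should exit cleanly; the same reasoning scales up to the 64-bit packing in Stream::pack(), which additionally appends the 32-bit StreamId.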