Skip to content

Commit

Permalink
[PyTorch] Narrow Device to 2 bytes by narrowing DeviceType and Device…
Browse files Browse the repository at this point in the history
…Index (#47023)

Summary:
Pull Request resolved: #47023

DeviceType pretty clearly only needs 1 byte. DeviceIndex only needs 1 byte given that machines don't have anywhere near 255 GPUs in them as far as I know.
ghstack-source-id: 116901430

Test Plan: Existing tests, added assertion to catch if my assumption about DeviceIndex is incorrect

Reviewed By: dzhulgakov

Differential Revision: D24605460

fbshipit-source-id: 7c9a89027fcf8eebd623b7cdbf6302162c981cd2
  • Loading branch information
swolchok authored and facebook-github-bot committed Nov 19, 2020
1 parent 72918e4 commit 4c9eb57
Show file tree
Hide file tree
Showing 15 changed files with 26 additions and 37 deletions.
2 changes: 1 addition & 1 deletion aten/src/ATen/cuda/CUDADevice.h
Expand Up @@ -11,7 +11,7 @@ namespace cuda {
/// Returns the Device (CUDA + device index) that owns the allocation
/// behind `ptr`, as reported by the CUDA runtime.
inline Device getDeviceFromPtr(void* ptr) {
  cudaPointerAttributes attr;
  AT_CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr));
  // attr.device is a plain int; DeviceIndex is narrower (int8_t), so the
  // cast must be explicit. The device count is checked against the
  // DeviceIndex range at enumeration time, so this narrowing is safe
  // in practice — TODO confirm against c10::cuda::device_count().
  return {DeviceType::CUDA, static_cast<DeviceIndex>(attr.device)};
}

}} // namespace at::cuda
14 changes: 7 additions & 7 deletions c10/core/Device.h
Expand Up @@ -15,7 +15,7 @@ namespace c10 {
/// A DeviceIndex is not independently meaningful without knowing
the DeviceType it is associated with; try to use Device rather than
/// DeviceIndex directly.
using DeviceIndex = int16_t;
using DeviceIndex = int8_t;

/// Represents a compute device on which a tensor is located. A device is
/// uniquely identified by a type, which specifies the type of machine it is
Expand Down Expand Up @@ -94,9 +94,9 @@ struct C10_API Device final {
DeviceIndex index_ = -1;
// Checks the invariants on index_: it is either the wildcard -1 or a
// non-negative device index, and CPU devices only admit -1 or 0.
// Note: index_ is printed via (int) because DeviceIndex is int8_t,
// which ostreams format as a character rather than a number.
void validate() {
  TORCH_CHECK(index_ == -1 || index_ >= 0,
    "Device index must be -1 or non-negative, got ", (int)index_);
  TORCH_CHECK(!is_cpu() || index_ <= 0,
    "CPU device index must be -1 or zero, got ", (int)index_);
}
};

Expand All @@ -112,8 +112,8 @@ struct hash<c10::Device> {
size_t operator()(c10::Device d) const noexcept {
// Are you here because this static assert failed? Make sure you ensure
// that the bitmasking code below is updated accordingly!
static_assert(sizeof(c10::DeviceType) == 2, "DeviceType is not 16-bit");
static_assert(sizeof(c10::DeviceIndex) == 2, "DeviceIndex is not 16-bit");
static_assert(sizeof(c10::DeviceType) == 1, "DeviceType is not 8-bit");
static_assert(sizeof(c10::DeviceIndex) == 1, "DeviceIndex is not 8-bit");
// Note [Hazard when concatenating signed integers]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// We must first convert to a same-sized unsigned type, before promoting to
Expand All @@ -124,8 +124,8 @@ struct hash<c10::Device> {
// Technically, by C/C++ integer promotion rules, we only need one of the
// uint32_t casts to the result type, but we put in both for explicitness's sake.
uint32_t bits =
static_cast<uint32_t>(static_cast<uint16_t>(d.type())) << 16
| static_cast<uint32_t>(static_cast<uint16_t>(d.index()));
static_cast<uint32_t>(static_cast<uint8_t>(d.type())) << 16
| static_cast<uint32_t>(static_cast<uint8_t>(d.index()));
return std::hash<uint32_t>{}(bits);
}
};
Expand Down
3 changes: 1 addition & 2 deletions c10/core/DeviceType.h
Expand Up @@ -12,7 +12,7 @@

namespace c10 {

enum class DeviceType : int16_t {
enum class DeviceType : int8_t {
CPU = 0,
CUDA = 1, // CUDA.
MKLDNN = 2, // Reserved for explicit MKLDNN
Expand All @@ -30,7 +30,6 @@ enum class DeviceType : int16_t {
// in DeviceType.cpp
// - Change the number below
COMPILE_TIME_MAX_DEVICE_TYPES = 12,
ONLY_FOR_TEST = 20901, // This device type is only for test.
};

constexpr DeviceType kCPU = DeviceType::CPU;
Expand Down
8 changes: 4 additions & 4 deletions c10/core/Stream.h
Expand Up @@ -111,14 +111,14 @@ class Stream final {
uint64_t pack() const noexcept {
// Are you here because this static assert failed? Make sure you ensure
// that the bitmasking code below is updated accordingly!
static_assert(sizeof(DeviceType) == 2, "DeviceType is not 16-bit");
static_assert(sizeof(DeviceIndex) == 2, "DeviceIndex is not 16-bit");
static_assert(sizeof(DeviceType) == 1, "DeviceType is not 8-bit");
static_assert(sizeof(DeviceIndex) == 1, "DeviceIndex is not 8-bit");
static_assert(sizeof(StreamId) == 4, "DeviceIndex is not 32-bit");
// Concat these together into a 64-bit integer
// See Note [Hazard when concatenating signed integers]
uint64_t bits =
static_cast<uint64_t>(static_cast<uint16_t>(device_type())) << 48
| static_cast<uint64_t>(static_cast<uint16_t>(device_index())) << 32
static_cast<uint64_t>(static_cast<uint8_t>(device_type())) << 48
| static_cast<uint64_t>(static_cast<uint8_t>(device_index())) << 32
| static_cast<uint64_t>(static_cast<uint32_t>(id()));
return bits;
}
Expand Down
2 changes: 1 addition & 1 deletion c10/core/TensorOptions.h
Expand Up @@ -504,7 +504,7 @@ struct C10_API TensorOptions {
// NB: We didn't use c10::optional here, because then we can't pack
// the has_***_ boolean fields.

Device device_ = at::kCPU; // 32-bit
Device device_ = at::kCPU; // 16-bit
caffe2::TypeMeta dtype_ = caffe2::TypeMeta::Make<float>(); // 16-bit
Layout layout_ = at::kStrided; // 8-bit
MemoryFormat memory_format_ = MemoryFormat::Contiguous; // 8-bit
Expand Down
4 changes: 2 additions & 2 deletions c10/core/impl/DeviceGuardImplInterface.h
Expand Up @@ -215,8 +215,8 @@ inline const DeviceGuardImplInterface* getDeviceGuardImpl(DeviceType type) {
// FB employees can see
// https://fb.workplace.com/groups/llvm.gcc/permalink/4053565044692080/
// for more details
static_assert(sizeof(DeviceType) == 2, "DeviceType is not 16-bit");
auto p = device_guard_impl_registry[static_cast<size_t>(type) & 0xFFFF].load();
static_assert(sizeof(DeviceType) == 1, "DeviceType is not 8-bit");
auto p = device_guard_impl_registry[static_cast<size_t>(type) & 0xFF].load();

// This seems to be the first place where you make use of a device
// when you pass devices to factory functions. Give a nicer error
Expand Down
4 changes: 4 additions & 0 deletions c10/cuda/CUDAFunctions.cpp
@@ -1,5 +1,7 @@
#include <c10/cuda/CUDAFunctions.h>

#include <limits>

namespace c10 {
namespace cuda {

Expand Down Expand Up @@ -93,6 +95,8 @@ DeviceIndex device_count() noexcept {
// initialize number of devices only once
static int count = []() {
try {
auto result = device_count_impl();
TORCH_INTERNAL_ASSERT(result <= std::numeric_limits<DeviceIndex>::max(), "Too many CUDA devices, DeviceIndex overflowed");
return device_count_impl();
} catch (const c10::Error& ex) {
// We don't want to fail, but still log the warning
Expand Down
1 change: 0 additions & 1 deletion caffe2/proto/caffe2.proto
Expand Up @@ -239,7 +239,6 @@ enum DeviceTypeProto {
PROTO_XLA = 9; // XLA / TPU
// Change the following number if you add more devices in the code.
PROTO_COMPILE_TIME_MAX_DEVICE_TYPES = 10;
PROTO_ONLY_FOR_TEST = 20901; // This device type is only for test.
}

// Device-specific options. We do not distinguish DeviceOption protos for
Expand Down
6 changes: 0 additions & 6 deletions caffe2/proto/caffe2_pb.h
Expand Up @@ -15,7 +15,6 @@ constexpr DeviceType IDEEP = DeviceType::IDEEP;
constexpr DeviceType HIP = DeviceType::HIP;
constexpr DeviceType COMPILE_TIME_MAX_DEVICE_TYPES =
DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES;
constexpr DeviceType ONLY_FOR_TEST = DeviceType::ONLY_FOR_TEST;

inline CAFFE2_API DeviceType ProtoToType(const caffe2::DeviceTypeProto p) {
switch (p) {
Expand All @@ -35,8 +34,6 @@ inline CAFFE2_API DeviceType ProtoToType(const caffe2::DeviceTypeProto p) {
return DeviceType::HIP;
case caffe2::PROTO_COMPILE_TIME_MAX_DEVICE_TYPES:
return DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES;
case caffe2::PROTO_ONLY_FOR_TEST:
return DeviceType::ONLY_FOR_TEST;
default:
AT_ERROR(
"Unknown device:",
Expand Down Expand Up @@ -69,8 +66,6 @@ inline CAFFE2_API DeviceTypeProto TypeToProto(const DeviceType& t) {
return caffe2::PROTO_HIP;
case DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES:
return caffe2::PROTO_COMPILE_TIME_MAX_DEVICE_TYPES;
case DeviceType::ONLY_FOR_TEST:
return caffe2::PROTO_ONLY_FOR_TEST;
default:
AT_ERROR(
"Unknown device:",
Expand Down Expand Up @@ -102,7 +97,6 @@ inline CAFFE2_API caffe2::DeviceOption DeviceToOption(
case DeviceType::MKLDNN:
case DeviceType::IDEEP:
case DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES:
case DeviceType::ONLY_FOR_TEST:
break;
default:
AT_ERROR(
Expand Down
1 change: 0 additions & 1 deletion caffe2/python/__init__.py
Expand Up @@ -12,7 +12,6 @@
caffe2_pb2.IDEEP = caffe2_pb2.PROTO_IDEEP
caffe2_pb2.HIP = caffe2_pb2.PROTO_HIP
caffe2_pb2.COMPILE_TIME_MAX_DEVICE_TYPES = caffe2_pb2.PROTO_COMPILE_TIME_MAX_DEVICE_TYPES
caffe2_pb2.ONLY_FOR_TEST = caffe2_pb2.PROTO_ONLY_FOR_TEST

if platform.system() == 'Windows':
is_conda = os.path.exists(os.path.join(sys.prefix, 'conda-meta'))
Expand Down
1 change: 0 additions & 1 deletion caffe2/utils/proto_utils.cc
Expand Up @@ -53,7 +53,6 @@ C10_EXPORT bool IsCPUDeviceType(int device_type) {
PROTO_CPU,
PROTO_MKLDNN,
PROTO_IDEEP,
PROTO_ONLY_FOR_TEST,
};
return cpu_types.count(device_type);
}
Expand Down
4 changes: 2 additions & 2 deletions test/cpp/api/tensor_options.cpp
Expand Up @@ -111,8 +111,8 @@ TEST(DeviceTest, ParsesCorrectlyFromString) {
device = Device("hip");
ASSERT_EQ(device, Device(DeviceType::HIP));

device = Device("hip:321");
ASSERT_EQ(device, Device(DeviceType::HIP, 321));
device = Device("hip:123");
ASSERT_EQ(device, Device(DeviceType::HIP, 123));

std::vector<std::string> badnesses = {
"", "cud:1", "cuda:", "cpu::1", ":1", "3", "tpu:4", "??"};
Expand Down
5 changes: 0 additions & 5 deletions test/test_torch.py
Expand Up @@ -691,11 +691,6 @@ def test_device(self):
self.assertEqual('cuda', cuda90.type)
self.assertEqual(90, cuda90.index)

cuda23333 = torch.device('cuda', 23333)
self.assertEqual('cuda:23333', str(cuda23333))
self.assertEqual('cuda', cuda23333.type)
self.assertEqual(23333, cuda23333.index)

self.assertRaises(RuntimeError, lambda: torch.device('cpu:-1'))
self.assertRaises(RuntimeError, lambda: torch.device('cpu:1'))
self.assertRaises(RuntimeError, lambda: torch.device('cpu', -1))
Expand Down
4 changes: 2 additions & 2 deletions torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp
Expand Up @@ -66,7 +66,7 @@ static void getMajorMinor(

// Compiles the specified kernel and stores the metadata required to run it
FusedKernelCUDA::FusedKernelCUDA(
int16_t device,
at::DeviceIndex device,
std::string name,
std::string code,
std::vector<TensorDesc> input_desc,
Expand Down Expand Up @@ -222,7 +222,7 @@ static std::shared_ptr<FusedKernel> createFusionKernel(
std::vector<PartitionDesc> concat_desc,
bool has_random) {
return std::make_shared<FusedKernelCUDA>(
device,
static_cast<at::DeviceIndex>(device),
std::move(name),
std::move(code),
std::move(input_desc),
Expand Down
4 changes: 2 additions & 2 deletions torch/csrc/jit/codegen/fuser/cuda/fused_kernel.h
Expand Up @@ -22,7 +22,7 @@ namespace cuda {
struct TORCH_CUDA_API FusedKernelCUDA
: public ::torch::jit::fuser::FusedKernel {
FusedKernelCUDA(
int16_t device,
at::DeviceIndex device,
std::string name,
std::string code,
std::vector<TensorDesc> input_desc,
Expand All @@ -45,7 +45,7 @@ struct TORCH_CUDA_API FusedKernelCUDA

// Note: per device to store device properties and compute launch heuristics
// Acquiring these values at launch time would be too slow
int16_t device_;
at::DeviceIndex device_;
int maxBlocks_;
cudaDeviceProp* prop_;
std::vector<char> ptx_;
Expand Down

0 comments on commit 4c9eb57

Please sign in to comment.