From 4c9eb57914fb538e21c46b63cfb5c2e9d5bc2f20 Mon Sep 17 00:00:00 2001
From: Scott Wolchok
Date: Wed, 18 Nov 2020 19:35:20 -0800
Subject: [PATCH] [PyTorch] Narrow Device to 2 bytes by narrowing DeviceType
 and DeviceIndex (#47023)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/47023

DeviceType pretty clearly only needs 1 byte. DeviceIndex only needs 1 byte
given that machines don't have anywhere near 255 GPUs in them as far as I
know.

ghstack-source-id: 116901430

Test Plan: Existing tests; added an assertion to catch the case where my
assumption about DeviceIndex is incorrect.

Reviewed By: dzhulgakov

Differential Revision: D24605460

fbshipit-source-id: 7c9a89027fcf8eebd623b7cdbf6302162c981cd2
---
 aten/src/ATen/cuda/CUDADevice.h                    |  2 +-
 c10/core/Device.h                                  | 14 +++++++-------
 c10/core/DeviceType.h                              |  3 +--
 c10/core/Stream.h                                  |  8 ++++----
 c10/core/TensorOptions.h                           |  2 +-
 c10/core/impl/DeviceGuardImplInterface.h           |  4 ++--
 c10/cuda/CUDAFunctions.cpp                         |  4 ++++
 caffe2/proto/caffe2.proto                          |  1 -
 caffe2/proto/caffe2_pb.h                           |  6 ------
 caffe2/python/__init__.py                          |  1 -
 caffe2/utils/proto_utils.cc                        |  1 -
 test/cpp/api/tensor_options.cpp                    |  4 ++--
 test/test_torch.py                                 |  5 -----
 torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp |  4 ++--
 torch/csrc/jit/codegen/fuser/cuda/fused_kernel.h   |  4 ++--
 15 files changed, 26 insertions(+), 37 deletions(-)

diff --git a/aten/src/ATen/cuda/CUDADevice.h b/aten/src/ATen/cuda/CUDADevice.h
index 9d14ab1627e0..2d0a682488fd 100644
--- a/aten/src/ATen/cuda/CUDADevice.h
+++ b/aten/src/ATen/cuda/CUDADevice.h
@@ -11,7 +11,7 @@ namespace cuda {
 inline Device getDeviceFromPtr(void* ptr) {
   cudaPointerAttributes attr;
   AT_CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr));
-  return {DeviceType::CUDA, static_cast<int16_t>(attr.device)};
+  return {DeviceType::CUDA, static_cast<DeviceIndex>(attr.device)};
 }
 
 }} // namespace at::cuda
diff --git a/c10/core/Device.h b/c10/core/Device.h
index f1249e865f8b..7827119bb0ac 100644
--- a/c10/core/Device.h
+++ b/c10/core/Device.h
@@ -15,7 +15,7 @@ namespace c10 {
 /// A DeviceIndex is not independently meaningful without knowing
 /// the DeviceType it is associated with; try to use Device rather than
 /// DeviceIndex directly.
-using DeviceIndex = int16_t;
+using DeviceIndex = int8_t;
 
 /// Represents a compute device on which a tensor is located. A device is
 /// uniquely identified by a type, which specifies the type of machine it is
@@ -94,9 +94,9 @@
   DeviceIndex index_ = -1;
   void validate() {
     TORCH_CHECK(index_ == -1 || index_ >= 0,
-                "Device index must be -1 or non-negative, got ", index_);
+                "Device index must be -1 or non-negative, got ", (int)index_);
     TORCH_CHECK(!is_cpu() || index_ <= 0,
-                "CPU device index must be -1 or zero, got ", index_);
+                "CPU device index must be -1 or zero, got ", (int)index_);
   }
 };
 
@@ -112,8 +112,8 @@ struct hash<c10::Device> {
   size_t operator()(c10::Device d) const noexcept {
     // Are you here because this static assert failed? Make sure you ensure
     // that the bitmasking code below is updated accordingly!
-    static_assert(sizeof(c10::DeviceType) == 2, "DeviceType is not 16-bit");
-    static_assert(sizeof(c10::DeviceIndex) == 2, "DeviceIndex is not 16-bit");
+    static_assert(sizeof(c10::DeviceType) == 1, "DeviceType is not 8-bit");
+    static_assert(sizeof(c10::DeviceIndex) == 1, "DeviceIndex is not 8-bit");
     // Note [Hazard when concatenating signed integers]
     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     // We must first convert to a same-sized unsigned type, before promoting to
@@ -124,8 +124,8 @@ struct hash<c10::Device> {
     // Technically, by C/C++ integer promotion rules, we only need one of the
     // uint32_t casts to the result type, but we put in both for explicitness's sake.
     uint32_t bits =
-        static_cast<uint32_t>(static_cast<uint16_t>(d.type())) << 16
-      | static_cast<uint32_t>(static_cast<uint16_t>(d.index()));
+        static_cast<uint32_t>(static_cast<uint8_t>(d.type())) << 16
+      | static_cast<uint32_t>(static_cast<uint8_t>(d.index()));
     return std::hash<uint32_t>{}(bits);
   }
 };
diff --git a/c10/core/DeviceType.h b/c10/core/DeviceType.h
index b4037f664561..86935436ae1c 100644
--- a/c10/core/DeviceType.h
+++ b/c10/core/DeviceType.h
@@ -12,7 +12,7 @@
 
 namespace c10 {
 
-enum class DeviceType : int16_t {
+enum class DeviceType : int8_t {
   CPU = 0,
   CUDA = 1, // CUDA.
   MKLDNN = 2, // Reserved for explicit MKLDNN
@@ -30,7 +30,6 @@ enum class DeviceType : int16_t {
   // in DeviceType.cpp
   //  - Change the number below
   COMPILE_TIME_MAX_DEVICE_TYPES = 12,
-  ONLY_FOR_TEST = 20901, // This device type is only for test.
 };
 
 constexpr DeviceType kCPU = DeviceType::CPU;
diff --git a/c10/core/Stream.h b/c10/core/Stream.h
index 5baac5325af7..62d5261534ee 100644
--- a/c10/core/Stream.h
+++ b/c10/core/Stream.h
@@ -111,14 +111,14 @@ class Stream final {
   uint64_t pack() const noexcept {
     // Are you here because this static assert failed? Make sure you ensure
     // that the bitmasking code below is updated accordingly!
-    static_assert(sizeof(DeviceType) == 2, "DeviceType is not 16-bit");
-    static_assert(sizeof(DeviceIndex) == 2, "DeviceIndex is not 16-bit");
+    static_assert(sizeof(DeviceType) == 1, "DeviceType is not 8-bit");
+    static_assert(sizeof(DeviceIndex) == 1, "DeviceIndex is not 8-bit");
     static_assert(sizeof(StreamId) == 4, "DeviceIndex is not 32-bit");
     // Concat these together into a 64-bit integer
     // See Note [Hazard when concatenating signed integers]
     uint64_t bits =
-        static_cast<uint64_t>(static_cast<uint16_t>(device_type())) << 48
-      | static_cast<uint64_t>(static_cast<uint16_t>(device_index())) << 32
+        static_cast<uint64_t>(static_cast<uint8_t>(device_type())) << 48
+      | static_cast<uint64_t>(static_cast<uint8_t>(device_index())) << 32
       | static_cast<uint64_t>(static_cast<uint32_t>(id()));
     return bits;
   }
diff --git a/c10/core/TensorOptions.h b/c10/core/TensorOptions.h
index 420e0a984b77..347df066cc90 100644
--- a/c10/core/TensorOptions.h
+++ b/c10/core/TensorOptions.h
@@ -504,7 +504,7 @@ struct C10_API TensorOptions {
   // NB: We didn't use c10::optional here, because then we can't pack
   // the has_***_ boolean fields.
 
-  Device device_ = at::kCPU; // 32-bit
+  Device device_ = at::kCPU; // 16-bit
   caffe2::TypeMeta dtype_ = caffe2::TypeMeta::Make<float>(); // 16-bit
   Layout layout_ = at::kStrided; // 8-bit
   MemoryFormat memory_format_ = MemoryFormat::Contiguous; // 8-bit
diff --git a/c10/core/impl/DeviceGuardImplInterface.h b/c10/core/impl/DeviceGuardImplInterface.h
index f7f5b4f867a9..2ef02b57d3be 100644
--- a/c10/core/impl/DeviceGuardImplInterface.h
+++ b/c10/core/impl/DeviceGuardImplInterface.h
@@ -215,8 +215,8 @@ inline const DeviceGuardImplInterface* getDeviceGuardImpl(DeviceType type) {
   // FB employees can see
   // https://fb.workplace.com/groups/llvm.gcc/permalink/4053565044692080/
   // for more details
-  static_assert(sizeof(DeviceType) == 2, "DeviceType is not 16-bit");
-  auto p = device_guard_impl_registry[static_cast<size_t>(type) & 0xFFFF].load();
+  static_assert(sizeof(DeviceType) == 1, "DeviceType is not 8-bit");
+  auto p = device_guard_impl_registry[static_cast<size_t>(type) & 0xFF].load();
 
   // This seems to be the first place where you make use of a device
   // when you pass devices to factory functions. Give a nicer error
diff --git a/c10/cuda/CUDAFunctions.cpp b/c10/cuda/CUDAFunctions.cpp
index 99bb721798d9..4c87cb9ec8c6 100644
--- a/c10/cuda/CUDAFunctions.cpp
+++ b/c10/cuda/CUDAFunctions.cpp
@@ -1,5 +1,7 @@
 #include <c10/cuda/CUDAFunctions.h>
 
+#include <limits>
+
 namespace c10 {
 namespace cuda {
 
@@ -93,6 +95,8 @@ DeviceIndex device_count() noexcept {
   // initialize number of devices only once
   static int count = []() {
     try {
+      auto result = device_count_impl();
+      TORCH_INTERNAL_ASSERT(result <= std::numeric_limits<DeviceIndex>::max(), "Too many CUDA devices, DeviceIndex overflowed");
       return device_count_impl();
     } catch (const c10::Error& ex) {
       // We don't want to fail, but still log the warning
diff --git a/caffe2/proto/caffe2.proto b/caffe2/proto/caffe2.proto
index dd8f365b4d85..76acbf201f71 100644
--- a/caffe2/proto/caffe2.proto
+++ b/caffe2/proto/caffe2.proto
@@ -239,7 +239,6 @@
   PROTO_XLA = 9; // XLA / TPU
   // Change the following number if you add more devices in the code.
   PROTO_COMPILE_TIME_MAX_DEVICE_TYPES = 10;
-  PROTO_ONLY_FOR_TEST = 20901; // This device type is only for test.
 }
 
 // Device-specific options. We do not distinguish DeviceOption protos for
diff --git a/caffe2/proto/caffe2_pb.h b/caffe2/proto/caffe2_pb.h
index 6bd886b559c1..23af2be32219 100644
--- a/caffe2/proto/caffe2_pb.h
+++ b/caffe2/proto/caffe2_pb.h
@@ -15,7 +15,6 @@ constexpr DeviceType IDEEP = DeviceType::IDEEP;
 constexpr DeviceType HIP = DeviceType::HIP;
 constexpr DeviceType COMPILE_TIME_MAX_DEVICE_TYPES =
     DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES;
-constexpr DeviceType ONLY_FOR_TEST = DeviceType::ONLY_FOR_TEST;
 
 inline CAFFE2_API DeviceType ProtoToType(const caffe2::DeviceTypeProto p) {
   switch (p) {
@@ -35,8 +34,6 @@ inline CAFFE2_API DeviceType ProtoToType(const caffe2::DeviceTypeProto p) {
       return DeviceType::HIP;
     case caffe2::PROTO_COMPILE_TIME_MAX_DEVICE_TYPES:
       return DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES;
-    case caffe2::PROTO_ONLY_FOR_TEST:
-      return DeviceType::ONLY_FOR_TEST;
     default:
       AT_ERROR(
           "Unknown device:",
@@ -69,8 +66,6 @@ inline CAFFE2_API DeviceTypeProto TypeToProto(const DeviceType& t) {
       return caffe2::PROTO_HIP;
     case DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES:
       return caffe2::PROTO_COMPILE_TIME_MAX_DEVICE_TYPES;
-    case DeviceType::ONLY_FOR_TEST:
-      return caffe2::PROTO_ONLY_FOR_TEST;
     default:
       AT_ERROR(
           "Unknown device:",
@@ -102,7 +97,6 @@ inline CAFFE2_API caffe2::DeviceOption DeviceToOption(
     case DeviceType::MKLDNN:
     case DeviceType::IDEEP:
     case DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES:
-    case DeviceType::ONLY_FOR_TEST:
       break;
     default:
       AT_ERROR(
diff --git a/caffe2/python/__init__.py b/caffe2/python/__init__.py
index 8582eff9ce19..1262f97588ad 100644
--- a/caffe2/python/__init__.py
+++ b/caffe2/python/__init__.py
@@ -12,7 +12,6 @@
 caffe2_pb2.IDEEP = caffe2_pb2.PROTO_IDEEP
 caffe2_pb2.HIP = caffe2_pb2.PROTO_HIP
 caffe2_pb2.COMPILE_TIME_MAX_DEVICE_TYPES = caffe2_pb2.PROTO_COMPILE_TIME_MAX_DEVICE_TYPES
-caffe2_pb2.ONLY_FOR_TEST = caffe2_pb2.PROTO_ONLY_FOR_TEST
 
 if platform.system() == 'Windows':
     is_conda = os.path.exists(os.path.join(sys.prefix, 'conda-meta'))
diff --git a/caffe2/utils/proto_utils.cc b/caffe2/utils/proto_utils.cc
index 5fc34d631c4f..70484643cded 100644
--- a/caffe2/utils/proto_utils.cc
+++ b/caffe2/utils/proto_utils.cc
@@ -53,7 +53,6 @@ C10_EXPORT bool IsCPUDeviceType(int device_type) {
       PROTO_CPU,
       PROTO_MKLDNN,
       PROTO_IDEEP,
-      PROTO_ONLY_FOR_TEST,
   };
   return cpu_types.count(device_type);
 }
diff --git a/test/cpp/api/tensor_options.cpp b/test/cpp/api/tensor_options.cpp
index 5de56139702a..d6b347a5d754 100644
--- a/test/cpp/api/tensor_options.cpp
+++ b/test/cpp/api/tensor_options.cpp
@@ -111,8 +111,8 @@ TEST(DeviceTest, ParsesCorrectlyFromString) {
   device = Device("hip");
   ASSERT_EQ(device, Device(DeviceType::HIP));
 
-  device = Device("hip:321");
-  ASSERT_EQ(device, Device(DeviceType::HIP, 321));
+  device = Device("hip:123");
+  ASSERT_EQ(device, Device(DeviceType::HIP, 123));
 
   std::vector<std::string> badnesses = {
       "", "cud:1", "cuda:", "cpu::1", ":1", "3", "tpu:4", "??"};
diff --git a/test/test_torch.py b/test/test_torch.py
index 6682ba53fa72..15ebcb10c367 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -691,11 +691,6 @@ def test_device(self):
         self.assertEqual('cuda', cuda90.type)
         self.assertEqual(90, cuda90.index)
 
-        cuda23333 = torch.device('cuda', 23333)
-        self.assertEqual('cuda:23333', str(cuda23333))
-        self.assertEqual('cuda', cuda23333.type)
-        self.assertEqual(23333, cuda23333.index)
-
         self.assertRaises(RuntimeError, lambda: torch.device('cpu:-1'))
         self.assertRaises(RuntimeError, lambda: torch.device('cpu:1'))
         self.assertRaises(RuntimeError, lambda: torch.device('cpu', -1))
diff --git a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp
index 27315ee47527..0e49f865388c 100644
--- a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp
+++ b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp
@@ -66,7 +66,7 @@ static void getMajorMinor(
 
 // Compiles the specified kernel and stores the metadata required to run it
 FusedKernelCUDA::FusedKernelCUDA(
-    int16_t device,
+    at::DeviceIndex device,
     std::string name,
     std::string code,
     std::vector<TensorDesc> input_desc,
@@ -222,7 +222,7 @@ static std::shared_ptr<FusedKernel> createFusionKernel(
     std::vector<PartitionDesc> concat_desc,
     bool has_random) {
   return std::make_shared<FusedKernelCUDA>(
-      device,
+      static_cast<at::DeviceIndex>(device),
      std::move(name),
      std::move(code),
      std::move(input_desc),
diff --git a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.h b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.h
index 5dba48dabfc7..96a4a4a0f76c 100644
--- a/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.h
+++ b/torch/csrc/jit/codegen/fuser/cuda/fused_kernel.h
@@ -22,7 +22,7 @@ namespace cuda {
 struct TORCH_CUDA_API FusedKernelCUDA
     : public ::torch::jit::fuser::FusedKernel {
   FusedKernelCUDA(
-      int16_t device,
+      at::DeviceIndex device,
       std::string name,
       std::string code,
       std::vector<TensorDesc> input_desc,
@@ -45,7 +45,7 @@ struct TORCH_CUDA_API FusedKernelCUDA
 
   // Note: per device to store device properties and compute launch heuristics
  // Acquiring these values at launch time would be too slow
-  int16_t device_;
+  at::DeviceIndex device_;
   int maxBlocks_;
   cudaDeviceProp* prop_;
   std::vector<char> ptx_;
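
For readers who want to sanity-check the packing this diff relies on, below is a minimal standalone C++ sketch (not part of the patch). The Toy* names are made-up stand-ins for c10::DeviceType, c10::DeviceIndex, and c10::Device; the sketch mirrors the hash packing in c10/core/Device.h after the change and shows why Note [Hazard when concatenating signed integers] requires casting each 1-byte field to a same-sized unsigned type before widening.

#include <cassert>
#include <cstdint>
#include <functional>

// Hypothetical stand-ins for c10::DeviceType and c10::DeviceIndex after this
// patch: both are one byte, so the Device-like struct packs into 2 bytes.
enum class ToyDeviceType : std::int8_t { CPU = 0, CUDA = 1 };
using ToyDeviceIndex = std::int8_t;

struct ToyDevice {
  ToyDeviceType type;
  ToyDeviceIndex index; // -1 means "unspecified", as in c10::Device
};

// Mirrors Note [Hazard when concatenating signed integers]: each signed 1-byte
// field is first cast to uint8_t; otherwise an index of -1 would sign-extend
// to 0xFFFFFFFF and clobber the type bits placed above it.
std::uint32_t pack(ToyDevice d) {
  return static_cast<std::uint32_t>(static_cast<std::uint8_t>(d.type)) << 16 |
         static_cast<std::uint32_t>(static_cast<std::uint8_t>(d.index));
}

int main() {
  static_assert(sizeof(ToyDevice) == 2, "Device-like struct should be 2 bytes");
  ToyDevice unspecified_cuda{ToyDeviceType::CUDA, -1};
  // With the inner uint8_t cast, index -1 contributes 0xFF, not 0xFFFFFFFF.
  assert(pack(unspecified_cuda) == ((1u << 16) | 0xFFu));
  // Device.h then feeds the packed value to std::hash<uint32_t>.
  (void)std::hash<std::uint32_t>{}(pack(unspecified_cuda));
  return 0;
}

Compiling this with any C++11-or-later compiler and running it should exit cleanly; the same reasoning scales up to the 64-bit packing in Stream::pack(), which additionally appends the 32-bit StreamId.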