Skip to content

Commit

Permalink
[PyTorch] Narrow Device to 2 bytes by narrowing DeviceType and Device…
Browse files Browse the repository at this point in the history
…Index (#47023)

Summary:
Pull Request resolved: #47023

DeviceType pretty clearly only needs 1 byte. DeviceIndex only needs 1 byte given that machines don't have anywhere near 255 GPUs in them as far as I know.
ghstack-source-id: 116901430

Test Plan: Existing tests, added assertion to catch if my assumption about DeviceIndex is incorrect

Reviewed By: dzhulgakov

Differential Revision: D24605460

fbshipit-source-id: 7c9a89027fcf8eebd623b7cdbf6302162c981cd2
  • Loading branch information
swolchok authored and facebook-github-bot committed Nov 19, 2020
1 parent 72918e4 commit 4c9eb57
Show file tree
Hide file tree
Showing 15 changed files with 26 additions and 37 deletions.
2 changes: 1 addition & 1 deletion aten/src/ATen/cuda/CUDADevice.h
Expand Up @@ -11,7 +11,7 @@ namespace cuda {
/// Returns the Device (CUDA + device index) that owns the allocation
/// behind `ptr`, as reported by the CUDA runtime.
inline Device getDeviceFromPtr(void* ptr) {
  cudaPointerAttributes attr;
  AT_CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr));
  // attr.device is a plain int; DeviceIndex is narrower (int8_t), so the
  // cast must be explicit. The device count is checked against the
  // DeviceIndex range at enumeration time, so this narrowing is safe
  // in practice — TODO confirm against c10::cuda::device_count().
  return {DeviceType::CUDA, static_cast<DeviceIndex>(attr.device)};
}

}} // namespace at::cuda
14 changes: 7 additions & 7 deletions c10/core/Device.h
Expand Up @@ -15,7 +15,7 @@ namespace c10 {
/// A DeviceIndex is not independently meaningful without knowing
the DeviceType it is associated with; try to use Device rather than
/// DeviceIndex directly.
using DeviceIndex = int16_t;
using DeviceIndex = int8_t;

/// Represents a compute device on which a tensor is located. A device is
/// uniquely identified by a type, which specifies the type of machine it is
Expand Down Expand Up @@ -94,9 +94,9 @@ struct C10_API Device final {
DeviceIndex index_ = -1;
// Checks the invariants on index_: it is either the wildcard -1 or a
// non-negative device index, and CPU devices only admit -1 or 0.
// Note: index_ is printed via (int) because DeviceIndex is int8_t,
// which ostreams format as a character rather than a number.
void validate() {
  TORCH_CHECK(index_ == -1 || index_ >= 0,
    "Device index must be -1 or non-negative, got ", (int)index_);
  TORCH_CHECK(!is_cpu() || index_ <= 0,
    "CPU device index must be -1 or zero, got ", (int)index_);
}
};

Expand All @@ -112,8 +112,8 @@ struct hash<c10::Device> {
size_t operator()(c10::Device d) const noexcept {
// Are you here because this static assert failed? Make sure you ensure
// that the bitmasking code below is updated accordingly!
static_assert(sizeof(c10::DeviceType) == 2, "DeviceType is not 16-bit");
static_assert(sizeof(c10::DeviceIndex) == 2, "DeviceIndex is not 16-bit");
static_assert(sizeof(c10::DeviceType) == 1, "DeviceType is not 8-bit");
static_assert(sizeof(c10::DeviceIndex) == 1, "DeviceIndex is not 8-bit");
// Note [Hazard when concatenating signed integers]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// We must first convert to a same-sized unsigned type, before promoting to
Expand All @@ -124,8 +124,8 @@ struct hash<c10::Device> {
// Technically, by C/C++ integer promotion rules, we only need one of the
// uint32_t casts to the result type, but we put in both for explicitness's sake.
uint32_t bits =
static_cast<uint32_t>(static_cast<uint16_t>(d.type())) << 16
| static_cast<uint32_t>(static_cast<uint16_t>(d.index()));
static_cast<uint32_t>(static_cast<uint8_t>(d.type())) << 16
| static_cast<uint32_t>(static_cast<uint8_t>(d.index()));
return std::hash<uint32_t>{}(bits);
}
};
Expand Down
3 changes: 1 addition & 2 deletions c10/core/DeviceType.h
Expand Up @@ -12,7 +12,7 @@

namespace c10 {

enum class DeviceType : int16_t {
enum class DeviceType : int8_t {
CPU = 0,
CUDA = 1, // CUDA.
MKLDNN = 2, // Reserved for explicit MKLDNN
Expand All @@ -30,7 +30,6 @@ enum class DeviceType : int16_t {
// in DeviceType.cpp
// - Change the number below
COMPILE_TIME_MAX_DEVICE_TYPES = 12,
ONLY_FOR_TEST = 20901, // This device type is only for test.
};

constexpr DeviceType kCPU = DeviceType::CPU;
Expand Down
8 changes: 4 additions & 4 deletions c10/core/Stream.h
Expand Up @@ -111,14 +111,14 @@ class Stream final {
uint64_t pack() const noexcept {
// Are you here because this static assert failed? Make sure you ensure
// that the bitmasking code below is updated accordingly!
static_assert(sizeof(DeviceType) == 2, "DeviceType is not 16-bit");
static_assert(sizeof(DeviceIndex) == 2, "DeviceIndex is not 16-bit");
static_assert(sizeof(DeviceType) == 1, "DeviceType is not 8-bit");
static_assert(sizeof(DeviceIndex) == 1, "DeviceIndex is not 8-bit");
static_assert(sizeof(StreamId) == 4, "DeviceIndex is not 32-bit");
// Concat these together into a 64-bit integer
// See Note [Hazard when concatenating signed integers]
uint64_t bits =
static_cast<uint64_t>(static_cast<uint16_t>(device_type())) << 48
| static_cast<uint64_t>(static_cast<uint16_t>(device_index())) << 32
static_cast<uint64_t>(static_cast<uint8_t>(device_type())) << 48
| static_cast<uint64_t>(static_cast<uint8_t>(device_index())) << 32
| static_cast<uint64_t>(static_cast<uint32_t>(id()));
return bits;
}
Expand Down
2 changes: 1 addition & 1 deletion c10/core/TensorOptions.h
Expand Up @@ -504,7 +504,7 @@ struct C10_API TensorOptions {
// NB: We didn't use c10::optional here, because then we can't pack
// the has_***_ boolean fields.

Device device_ = at::kCPU; // 32-bit
Device device_ = at::kCPU; // 16-bit
caffe2::TypeMeta dtype_ = caffe2::TypeMeta::Make<float>(); // 16-bit
Layout layout_ = at::kStrided; // 8-bit
MemoryFormat memory_format_ = MemoryFormat::Contiguous; // 8-bit
Expand Down
4 changes: 2 additions & 2 deletions c10/core/impl/DeviceGuardImplInterface.h
Expand Up @@ -215,8 +215,8 @@ inline const DeviceGuardImplInterface* getDeviceGuardImpl(DeviceType type) {
// FB employees can see
// https://fb.workplace.com/groups/llvm.gcc/permalink/4053565044692080/
// for more details
static_assert(sizeof(DeviceType) == 2, "DeviceType is not 16-bit");
auto p = device_guard_impl_registry[static_cast<size_t>(type) & 0xFFFF].load();
static_assert(sizeof(DeviceType) == 1, "DeviceType is not 8-bit");
auto p = device_guard_impl_registry[static_cast<size_t>(type) & 0xFF].load();

// This seems to be the first place where you make use of a device
// when you pass devices to factory functions. Give a nicer error
Expand Down
4 changes: 4 additions & 0 deletions c10/cuda/CUDAFunctions.cpp
@@ -1,5 +1,7 @@
#include <c10/cuda/CUDAFunctions.h>

#include <limits>

namespace c10 {
namespace cuda {

Expand Down Expand Up @@ -93,6 +95,8 @@ DeviceIndex device_count() noexcept {
// initialize number of devices only once
static int count = []() {
try {
auto result = device_count_impl();
TORCH_INTERNAL_ASSERT(result <= std::numeric_limits<DeviceIndex>::max(), "Too many CUDA devices, DeviceIndex overflowed");
return device_count_impl();
} catch (const c10::Error& ex) {
// We don't want to fail, but still log the warning
Expand Down
1 change: 0 additions & 1 deletion caffe2/proto/caffe2.proto
Expand Up @@ -239,7 +239,6 @@ enum DeviceTypeProto {
PROTO_XLA = 9; // XLA / TPU
// Change the following number if you add more devices in the code.
PROTO_COMPILE_TIME_MAX_DEVICE_TYPES = 10;
PROTO_ONLY_FOR_TEST = 20901; // This device type is only for test.
}

// Device-specific options. We do not distinguish DeviceOption protos for
Expand Down
6 changes: 0 additions & 6 deletions caffe2/proto/caffe2_pb.h
Expand Up @@ -15,7 +15,6 @@ constexpr DeviceType IDEEP = DeviceType::IDEEP;
constexpr DeviceType HIP = DeviceType::HIP;
constexpr DeviceType COMPILE_TIME_MAX_DEVICE_TYPES =
DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES;
constexpr DeviceType ONLY_FOR_TEST = DeviceType::ONLY_FOR_TEST;

inline CAFFE2_API DeviceType ProtoToType(const caffe2::DeviceTypeProto p) {
switch (p) {
Expand All @@ -35,8 +34,6 @@ inline CAFFE2_API DeviceType ProtoToType(const caffe2::DeviceTypeProto p) {
return DeviceType::HIP;
case caffe2::PROTO_COMPILE_TIME_MAX_DEVICE_TYPES:
return DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES;
case caffe2::PROTO_ONLY_FOR_TEST:
return DeviceType::ONLY_FOR_TEST;
default:
AT_ERROR(
"Unknown device:",
Expand Down Expand Up @@ -69,8 +66,6 @@ inline CAFFE2_API DeviceTypeProto TypeToProto(const DeviceType& t) {
return caffe2::PROTO_HIP;
case DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES:
return caffe2::PROTO_COMPILE_TIME_MAX_DEVICE_TYPES;
case DeviceType::ONLY_FOR_TEST:
return caffe2::PROTO_ONLY_FOR_TEST;
default:
AT_ERROR(
"Unknown device:",
Expand Down Expand Up @@ -102,7 +97,6 @@ inline CAFFE2_API caffe2::DeviceOption DeviceToOption(
case DeviceType::MKLDNN:
case DeviceType::IDEEP:
case DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES:
case DeviceType::ONLY_FOR_TEST:
break;
default:
AT_ERROR(
Expand Down
1 change: 0 additions & 1 deletion caffe2/python/__init__.py
Expand Up @@ -12,7 +12,6 @@
caffe2_pb2.IDEEP = caffe2_pb2.PROTO_IDEEP
caffe2_pb2.HIP = caffe2_pb2.PROTO_HIP
caffe2_pb2.COMPILE_TIME_MAX_DEVICE_TYPES = caffe2_pb2.PROTO_COMPILE_TIME_MAX_DEVICE_TYPES
caffe2_pb2.ONLY_FOR_TEST = caffe2_pb2.PROTO_ONLY_FOR_TEST

if platform.system() == 'Windows':
is_conda = os.path.exists(os.path.join(sys.prefix, 'conda-meta'))
Expand Down
1 change: 0 additions & 1 deletion caffe2/utils/proto_utils.cc
Expand Up @@ -53,7 +53,6 @@ C10_EXPORT bool IsCPUDeviceType(int device_type) {
PROTO_CPU,
PROTO_MKLDNN,
PROTO_IDEEP,
PROTO_ONLY_FOR_TEST,
};
return cpu_types.count(device_type);
}
Expand Down
4 changes: 2 additions & 2 deletions test/cpp/api/tensor_options.cpp
Expand Up @@ -111,8 +111,8 @@ TEST(DeviceTest, ParsesCorrectlyFromString) {
device = Device("hip");
ASSERT_EQ(device, Device(DeviceType::HIP));

device = Device("hip:321");
ASSERT_EQ(device, Device(DeviceType::HIP, 321));
device = Device("hip:123");
ASSERT_EQ(device, Device(DeviceType::HIP, 123));

std::vector<std::string> badnesses = {
"", "cud:1", "cuda:", "cpu::1", ":1", "3", "tpu:4", "??"};
Expand Down
5 changes: 0 additions & 5 deletions test/test_torch.py
Expand Up @@ -691,11 +691,6 @@ def test_device(self):
self.assertEqual('cuda', cuda90.type)
self.assertEqual(90, cuda90.index)

cuda23333 = torch.device('cuda', 23333)
self.assertEqual('cuda:23333', str(cuda23333))
self.assertEqual('cuda', cuda23333.type)
self.assertEqual(23333, cuda23333.index)

self.assertRaises(RuntimeError, lambda: torch.device('cpu:-1'))
self.assertRaises(RuntimeError, lambda: torch.device('cpu:1'))
self.assertRaises(RuntimeError, lambda: torch.device('cpu', -1))
Expand Down
4 changes: 2 additions & 2 deletions torch/csrc/jit/codegen/fuser/cuda/fused_kernel.cpp
Expand Up @@ -66,7 +66,7 @@ static void getMajorMinor(

// Compiles the specified kernel and stores the metadata required to run it
FusedKernelCUDA::FusedKernelCUDA(
int16_t device,
at::DeviceIndex device,
std::string name,
std::string code,
std::vector<TensorDesc> input_desc,
Expand Down Expand Up @@ -222,7 +222,7 @@ static std::shared_ptr<FusedKernel> createFusionKernel(
std::vector<PartitionDesc> concat_desc,
bool has_random) {
return std::make_shared<FusedKernelCUDA>(
device,
static_cast<at::DeviceIndex>(device),
std::move(name),
std::move(code),
std::move(input_desc),
Expand Down
4 changes: 2 additions & 2 deletions torch/csrc/jit/codegen/fuser/cuda/fused_kernel.h
Expand Up @@ -22,7 +22,7 @@ namespace cuda {
struct TORCH_CUDA_API FusedKernelCUDA
: public ::torch::jit::fuser::FusedKernel {
FusedKernelCUDA(
int16_t device,
at::DeviceIndex device,
std::string name,
std::string code,
std::vector<TensorDesc> input_desc,
Expand All @@ -45,7 +45,7 @@ struct TORCH_CUDA_API FusedKernelCUDA

// Note: per device to store device properties and compute launch heuristics
// Acquiring these values at launch time would be too slow
int16_t device_;
at::DeviceIndex device_;
int maxBlocks_;
cudaDeviceProp* prop_;
std::vector<char> ptx_;
Expand Down

0 comments on commit 4c9eb57

Please sign in to comment.