diff --git a/CMakeLists.txt b/CMakeLists.txt
index a388a4549d66..62ea0a64d6c0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -205,10 +205,10 @@ cmake_dependent_option(
     USE_VALGRIND "Use Valgrind. Only available on Linux." ON
     "LINUX" OFF)
 option(USE_VULKAN "Use Vulkan GPU backend" OFF)
-option(USE_VULKAN_API "Use Vulkan GPU backend v2" OFF)
-option(USE_VULKAN_WRAPPER "Use Vulkan wrapper" ON)
-option(USE_VULKAN_SHADERC_RUNTIME "Use Vulkan Shader compilation runtime(Needs shaderc lib)" OFF)
-option(USE_VULKAN_RELAXED_PRECISION "Use Vulkan relaxed precision(mediump)" OFF)
+option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference even on fp32 tensors" ON)
+option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF)
+option(USE_VULKAN_SHADERC_RUNTIME "Vulkan - Use runtime shader compilation (needs libshaderc)" OFF)
+option(USE_VULKAN_WRAPPER "Vulkan - Dynamically load Vulkan functions" ON)
 option(USE_XNNPACK "Use XNNPACK" ON)
 option(USE_ZMQ "Use ZMQ" OFF)
 option(USE_ZSTD "Use ZSTD" OFF)
@@ -554,22 +554,23 @@ endif()
 
 if(USE_VULKAN)
   string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN")
-endif()
-
-if(USE_VULKAN_API)
   string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN_API")
-endif()
 
-if(USE_VULKAN_WRAPPER)
-  string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN_WRAPPER")
-endif()
+  if(USE_VULKAN_FP16_INFERENCE)
+    string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN_FP16_INFERENCE")
+  endif()
 
-if(USE_VULKAN_SHADERC_RUNTIME)
-  string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN_SHADERC_RUNTIME")
-endif()
+  if(USE_VULKAN_RELAXED_PRECISION)
+    string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN_RELAXED_PRECISION")
+  endif()
 
-if(USE_VULKAN_RELAXED_PRECISION)
-  string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN_RELAXED_PRECISION")
+  if(USE_VULKAN_SHADERC_RUNTIME)
+    string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN_SHADERC_RUNTIME")
+  endif()
+
+  if(USE_VULKAN_WRAPPER)
+    string(APPEND CMAKE_CXX_FLAGS " -DUSE_VULKAN_WRAPPER")
+  endif()
 endif()
 
 if(USE_PYTORCH_METAL)
diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
index f7b27e5947da..fd3c95f2573b 100644
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@@ -65,8 +65,7 @@ file(GLOB native_cpp "native/*.cpp")
 file(GLOB native_mkl_cpp "native/mkl/*.cpp")
 file(GLOB native_mkldnn_cpp "native/mkldnn/*.cpp")
 file(GLOB vulkan_cpp "vulkan/*.cpp")
-file(GLOB native_vulkan_cpp "native/vulkan/*.cpp")
-file(GLOB native_vulkan_api_cpp "native/vulkan/api/*.cpp" "native/vulkan/ops/*.cpp")
+file(GLOB native_vulkan_cpp "native/vulkan/*.cpp" "native/vulkan/api/*.cpp" "native/vulkan/ops/*.cpp")
 
 # Metal
 file(GLOB metal_h "metal/*.h")
@@ -126,9 +125,6 @@ if(AT_MKLDNN_ENABLED)
 endif()
 if(USE_VULKAN)
   set(all_cpu_cpp ${all_cpu_cpp} ${vulkan_cpp} ${native_vulkan_cpp} ${vulkan_generated_cpp})
-  if(USE_VULKAN_API)
-    set(all_cpu_cpp ${all_cpu_cpp} ${native_vulkan_api_cpp})
-  endif()
 else()
   set(all_cpu_cpp ${all_cpu_cpp} ${vulkan_cpp})
 endif()
diff --git a/aten/src/ATen/native/vulkan/Vulkan.cpp b/aten/src/ATen/native/vulkan/Vulkan.cpp
index 90920a8e5a82..3646ae7e9496 100644
--- a/aten/src/ATen/native/vulkan/Vulkan.cpp
+++ b/aten/src/ATen/native/vulkan/Vulkan.cpp
@@ -779,17 +779,17 @@ void ComputeUnit::createComputePipeline(
   {
     uint32_t offset = 0;
     size_t size = sizeof(WorkGroupSize::x);
-    spMapEntries[0].constantID = 1;
+    spMapEntries[0].constantID = 0;
     spMapEntries[0].offset = offset;
     spMapEntries[0].size = size;
     offset += size;
    size = sizeof(WorkGroupSize::y);
-    spMapEntries[1].constantID = 2;
+    spMapEntries[1].constantID = 1;
spMapEntries[1].offset = offset; spMapEntries[1].size = size; offset += size; size = sizeof(WorkGroupSize::z); - spMapEntries[2].constantID = 3; + spMapEntries[2].constantID = 2; spMapEntries[2].offset = offset; spMapEntries[2].size = size; } diff --git a/aten/src/ATen/native/vulkan/api/Adapter.h b/aten/src/ATen/native/vulkan/api/Adapter.h index 4ba02a5e9926..b4203530f635 100644 --- a/aten/src/ATen/native/vulkan/api/Adapter.h +++ b/aten/src/ATen/native/vulkan/api/Adapter.h @@ -33,7 +33,7 @@ struct Adapter final { } inline Shader::WorkGroup local_work_group_size() const { - return { 8u, 8u, 1u, }; + return { 4u, 4u, 4u, }; } }; diff --git a/aten/src/ATen/native/vulkan/api/Allocator.h b/aten/src/ATen/native/vulkan/api/Allocator.h index 9a26fa8f48a7..b720608d844b 100644 --- a/aten/src/ATen/native/vulkan/api/Allocator.h +++ b/aten/src/ATen/native/vulkan/api/Allocator.h @@ -13,6 +13,8 @@ #ifdef USE_VULKAN_WRAPPER #define VMA_STATIC_VULKAN_FUNCTIONS 0 +#else + #define VMA_DYNAMIC_VULKAN_FUNCTIONS 0 #endif #ifdef DEBUG diff --git a/aten/src/ATen/native/vulkan/api/Cache.h b/aten/src/ATen/native/vulkan/api/Cache.h index 83ea3343aa83..b224adbbeeda 100644 --- a/aten/src/ATen/native/vulkan/api/Cache.h +++ b/aten/src/ATen/native/vulkan/api/Cache.h @@ -72,7 +72,7 @@ template inline auto Cache::retrieve( const Descriptor& descriptor) { auto iterator = cache_.find(descriptor); - if (cache_.cend() == iterator) { + if C10_UNLIKELY(cache_.cend() == iterator) { iterator = cache_.insert({descriptor, factory_(descriptor)}).first; } diff --git a/aten/src/ATen/native/vulkan/api/Command.cpp b/aten/src/ATen/native/vulkan/api/Command.cpp index e01691935d70..5aa3586d4683 100644 --- a/aten/src/ATen/native/vulkan/api/Command.cpp +++ b/aten/src/ATen/native/vulkan/api/Command.cpp @@ -36,9 +36,11 @@ VkCommandPool create_command_pool( return command_pool; } -VkCommandBuffer allocate_command_buffer( +void allocate_command_buffers( const VkDevice device, - const VkCommandPool command_pool) { + const VkCommandPool command_pool, + VkCommandBuffer* const command_buffers, + const uint32_t count) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( device, "Invalid Vulkan device!"); @@ -47,37 +49,28 @@ VkCommandBuffer allocate_command_buffer( command_pool, "Invalid Vulkan command pool!"); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + command_buffers && (count > 0u), + "Invalid usage!"); + const VkCommandBufferAllocateInfo command_buffer_allocate_info{ VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, nullptr, command_pool, VK_COMMAND_BUFFER_LEVEL_PRIMARY, - 1u, + count, }; - VkCommandBuffer command_buffer{}; VK_CHECK(vkAllocateCommandBuffers( device, &command_buffer_allocate_info, - &command_buffer)); - - TORCH_CHECK( - command_buffer, - "Invalid Vulkan command buffer!"); - - return command_buffer; + command_buffers)); } } // namespace -Command::Buffer::Buffer() - : command_buffer_(VK_NULL_HANDLE) { -} - -Command::Buffer::Buffer( - const VkDevice device, - const VkCommandPool command_pool) - : command_buffer_(allocate_command_buffer(device, command_pool)) { +Command::Buffer::Buffer(const VkCommandBuffer command_buffer) + : command_buffer_(command_buffer) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( command_buffer_, "Invalid Vulkan command buffer!"); @@ -99,6 +92,10 @@ void Command::Buffer::Buffer::begin() { VK_CHECK(vkBeginCommandBuffer( command_buffer_, &command_buffer_begin_info)); + + // Reset + bound_.reset(); + barriers_.reset(); } void Command::Buffer::Buffer::end() { @@ -110,74 +107,90 @@ void Command::Buffer::Buffer::end() { 
VK_CHECK(vkEndCommandBuffer(command_buffer_)); } -void Command::Buffer::barrier( - const Pipeline::Barrier& barrier) { +void Command::Buffer::barrier() { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( command_buffer_, "This command buffer is in an invalid state! " "Potential reason: This command buffer is moved from."); - c10::SmallVector global_memory_barriers; - c10::SmallVector image_memory_barriers; - - if (!barrier.buffers.empty()) { - // Using global memory barriers instead of buffer memory barriers for - // buffers. The consensus seems to be that there is no advantage in - // using the latter in favor of the former. - - VkMemoryBarrier global_memory_barrier{ - VK_STRUCTURE_TYPE_MEMORY_BARRIER, - nullptr, - 0u, - 0u, - }; - - // Coalesce all buffer memory barriers into one global memory barrier. + if (barriers_.stage) { + c10::SmallVector buffer_memory_barriers; + + for (const Resource::Buffer::Barrier& barrier : barriers_.buffers) { + buffer_memory_barriers.push_back({ + VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, + nullptr, + barrier.memory.src, + barrier.memory.dst, + VK_QUEUE_FAMILY_IGNORED, + VK_QUEUE_FAMILY_IGNORED, + barrier.object.handle, + barrier.object.offset, + barrier.object.range, + }); + } - for (const Resource::Buffer::Barrier& barrier : barrier.buffers) { - global_memory_barrier.srcAccessMask |= barrier.memory.src; - global_memory_barrier.dstAccessMask |= barrier.memory.dst; + c10::SmallVector image_memory_barriers; + + for (const Resource::Image::Barrier& barrier : barriers_.images) { + image_memory_barriers.push_back({ + VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + nullptr, + barrier.memory.src, + barrier.memory.dst, + barrier.layout.src, + barrier.layout.dst, + VK_QUEUE_FAMILY_IGNORED, + VK_QUEUE_FAMILY_IGNORED, + barrier.object.handle, + { + VK_IMAGE_ASPECT_COLOR_BIT, + 0u, + VK_REMAINING_MIP_LEVELS, + 0u, + VK_REMAINING_ARRAY_LAYERS, + }, + }); } - global_memory_barriers.push_back(global_memory_barrier); + vkCmdPipelineBarrier( + command_buffer_, + barriers_.stage.src, + barriers_.stage.dst, + 0u, + 0u, + nullptr, + buffer_memory_barriers.size(), + buffer_memory_barriers.data(), + image_memory_barriers.size(), + image_memory_barriers.data()); } - for (const Resource::Image::Barrier& barrier : barrier.images) { - image_memory_barriers.push_back({ - VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, - nullptr, - barrier.memory.src, - barrier.memory.dst, - barrier.layout.src, - barrier.layout.dst, - VK_QUEUE_FAMILY_IGNORED, - VK_QUEUE_FAMILY_IGNORED, - barrier.object.handle, - VkImageSubresourceRange{ - VK_IMAGE_ASPECT_COLOR_BIT, - 0u, - VK_REMAINING_MIP_LEVELS, - 0u, - VK_REMAINING_ARRAY_LAYERS, - }, - }); - } + // Reset + barriers_.reset(); +} - vkCmdPipelineBarrier( +void Command::Buffer::barrier(const Pipeline::Barrier& barrier) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( command_buffer_, - barrier.stage.src, - barrier.stage.dst, - 0u, - global_memory_barriers.size(), - global_memory_barriers.data(), - 0u, - nullptr, - image_memory_barriers.size(), - image_memory_barriers.data()); + "This command buffer is in an invalid state! 
" + "Potential reason: This command buffer is moved from."); + + barriers_.stage.src |= barrier.stage.src; + barriers_.stage.dst |= barrier.stage.dst; + + barriers_.buffers.insert( + barriers_.buffers.end(), + barrier.buffers.begin(), + barrier.buffers.end()); + + barriers_.images.insert( + barriers_.images.end(), + barrier.images.begin(), + barrier.images.end()); } -void Command::Buffer::bind( - const Pipeline::Object& pipeline) { +void Command::Buffer::bind(const Pipeline::Object& pipeline) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( command_buffer_, "This command buffer is in an invalid state! " @@ -197,8 +210,7 @@ void Command::Buffer::bind( } } -void Command::Buffer::bind( - const Descriptor::Set& set) { +void Command::Buffer::bind(const Descriptor::Set& set) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( command_buffer_, "This command buffer is in an invalid state! " @@ -241,6 +253,8 @@ void Command::Buffer::copy( destination, "Invalid Vulkan destination buffer!"); + barrier(); + const VkBufferCopy buffer_copy{ 0u, 0u, @@ -262,17 +276,19 @@ void Command::Buffer::dispatch( "This command buffer is in an invalid state! " "Potential reason: This command buffer is moved from."); + barrier(); + vkCmdDispatch( command_buffer_, utils::div_up( - global_work_group.width, - bound_.pipeline.local_work_group.width), + global_work_group.data[0u], + bound_.pipeline.local_work_group.data[0u]), utils::div_up( - global_work_group.height, - bound_.pipeline.local_work_group.height), + global_work_group.data[1u], + bound_.pipeline.local_work_group.data[1u]), utils::div_up( - global_work_group.depth, - bound_.pipeline.local_work_group.depth)); + global_work_group.data[2u], + bound_.pipeline.local_work_group.data[2u])); } void Command::Buffer::submit( @@ -306,7 +322,8 @@ Command::Pool::Pool(const GPU& gpu) : device_(gpu.device), command_pool_( create_command_pool(gpu.device, gpu.adapter->compute_queue_family_index), - VK_DELETER(CommandPool)(device_)) { + VK_DELETER(CommandPool)(device_)), + buffer_{} { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( device_, "Invalid Vulkan device!"); @@ -314,11 +331,14 @@ Command::Pool::Pool(const GPU& gpu) TORCH_INTERNAL_ASSERT_DEBUG_ONLY( command_pool_, "Invalid Vulkan command pool!"); + + buffer_.pool.reserve(Configuration::kReserve); } Command::Pool::Pool(Pool&& pool) : device_(std::move(pool.device_)), - command_pool_(std::move(pool.command_pool_)) { + command_pool_(std::move(pool.command_pool_)), + buffer_(std::move(pool.buffer_)) { pool.device_ = VK_NULL_HANDLE; } @@ -326,6 +346,7 @@ Command::Pool& Command::Pool::operator=(Pool&& pool) { if (&pool != this) { device_ = std::move(pool.device_); command_pool_ = std::move(pool.command_pool_); + buffer_ = std::move(pool.buffer_); pool.device_ = VK_NULL_HANDLE; }; @@ -333,13 +354,42 @@ Command::Pool& Command::Pool::operator=(Pool&& pool) { return *this; } +Command::Pool::~Pool() { + try { + if (device_ && command_pool_) { + purge(); + } + } + catch (const std::exception& e) { + LOG(WARNING) + << "Vulkan: Command pool destructor raised an exception! Error: " + << e.what(); + } + catch (...) { + LOG(WARNING) + << "Vulkan: Command pool destructor raised an unknown exception!"; + } +} + Command::Buffer Command::Pool::allocate() { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( device_ && command_pool_, "This command pool is in an invalid state! 
" "Potential reason: This command pool is moved from."); - return Buffer(device_, command_pool_.get()); + if (buffer_.pool.size() == buffer_.in_use) { + buffer_.pool.resize( + buffer_.pool.size() + + Configuration::kQuantum); + + allocate_command_buffers( + device_, + command_pool_.get(), + buffer_.pool.data() + buffer_.in_use, + Configuration::kQuantum); + } + + return Buffer(buffer_.pool[buffer_.in_use++]); } void Command::Pool::purge() { @@ -348,6 +398,7 @@ void Command::Pool::purge() { "This command pool is in an invalid state! " "Potential reason: This command pool is moved from."); + buffer_.in_use = 0u; VK_CHECK(vkResetCommandPool(device_, command_pool_.get(), 0u)); } diff --git a/aten/src/ATen/native/vulkan/api/Command.h b/aten/src/ATen/native/vulkan/api/Command.h index 8e2f235cfa27..42f073674be5 100644 --- a/aten/src/ATen/native/vulkan/api/Command.h +++ b/aten/src/ATen/native/vulkan/api/Command.h @@ -20,8 +20,7 @@ struct Command final { class Buffer final { public: - Buffer(); - Buffer(VkDevice device, VkCommandPool command_pool); + Buffer(VkCommandBuffer command_buffer = VK_NULL_HANDLE); Buffer(const Buffer&) = delete; Buffer& operator=(const Buffer&) = delete; Buffer(Buffer&&); @@ -39,12 +38,32 @@ struct Command final { void dispatch(const Shader::WorkGroup& global_work_group); void submit(VkQueue queue, Resource::Fence fence = {}); + private: + void barrier(); + private: VkCommandBuffer command_buffer_; - struct { + + struct Bound final { Pipeline::Object pipeline; VkDescriptorSet descriptor_set; + + void reset(); } bound_; + + struct Barrier final { + struct Stage final { + VkPipelineStageFlags src; + VkPipelineStageFlags dst; + + operator bool() const; + } stage; + + c10::SmallVector buffers; + c10::SmallVector images; + + void reset(); + } barriers_; }; // @@ -58,14 +77,24 @@ struct Command final { Pool& operator=(const Pool&) = delete; Pool(Pool&&); Pool& operator=(Pool&&); - ~Pool() = default; + ~Pool(); Buffer allocate(); void purge(); private: + struct Configuration final { + static constexpr uint32_t kQuantum = 64u; + static constexpr uint32_t kReserve = 1024u; + }; + VkDevice device_; Handle command_pool_; + + struct { + std::vector pool; + size_t in_use; + } buffer_; } pool /* [thread_count] */; explicit Command(const GPU& gpu) @@ -79,7 +108,8 @@ struct Command final { inline Command::Buffer::Buffer(Buffer&& buffer) : command_buffer_(std::move(buffer.command_buffer_)), - bound_(std::move(buffer.bound_)) { + bound_(std::move(buffer.bound_)), + barriers_(std::move(buffer.barriers_)) { buffer.command_buffer_ = VK_NULL_HANDLE; } @@ -87,6 +117,7 @@ inline Command::Buffer& Command::Buffer::operator=(Buffer&& buffer) { if (&buffer != this) { command_buffer_ = std::move(buffer.command_buffer_); bound_ = std::move(buffer.bound_); + barriers_ = std::move(buffer.barriers_); buffer.command_buffer_ = VK_NULL_HANDLE; }; @@ -98,6 +129,22 @@ inline Command::Buffer::operator bool() const { return VK_NULL_HANDLE != command_buffer_; } +inline void Command::Buffer::Bound::reset() { + pipeline = {}; + descriptor_set = VK_NULL_HANDLE; +} + +inline Command::Buffer::Barrier::Stage::operator bool() const { + return (0u != src) || + (0u != dst); +} + +inline void Command::Buffer::Barrier::reset() { + stage = {}; + buffers.clear(); + images.clear(); +} + } // namespace api } // namespace vulkan } // namespace native diff --git a/aten/src/ATen/native/vulkan/api/Common.h b/aten/src/ATen/native/vulkan/api/Common.h index be89073e90ba..d606f1d859a9 100644 --- 
a/aten/src/ATen/native/vulkan/api/Common.h +++ b/aten/src/ATen/native/vulkan/api/Common.h @@ -19,16 +19,16 @@ #endif /* USE_VULKAN_WRAPPER */ #define VK_CHECK(function) \ - { \ + do { \ const VkResult result = (function); \ TORCH_CHECK(VK_SUCCESS == result, "VkResult:", result); \ - } + } while (false) #define VK_CHECK_RELAXED(function) \ - { \ + do { \ const VkResult result = (function); \ TORCH_CHECK(VK_SUCCESS <= result, "VkResult:", result); \ - } + } while (false) #define VK_DELETER(Handle) \ at::native::vulkan::api::destroy_##Handle diff --git a/aten/src/ATen/native/vulkan/api/Context.cpp b/aten/src/ATen/native/vulkan/api/Context.cpp index ef638f917956..09dfa8fc1d77 100644 --- a/aten/src/ATen/native/vulkan/api/Context.cpp +++ b/aten/src/ATen/native/vulkan/api/Context.cpp @@ -111,7 +111,7 @@ Context::~Context() { } void Context::flush() { - VK_CHECK(vkDeviceWaitIdle(device())); + VK_CHECK(vkQueueWaitIdle(queue())); resource().pool.purge(); descriptor().pool.purge(); diff --git a/aten/src/ATen/native/vulkan/api/Context.h b/aten/src/ATen/native/vulkan/api/Context.h index 687ddbbfe931..41adfc5fb272 100644 --- a/aten/src/ATen/native/vulkan/api/Context.h +++ b/aten/src/ATen/native/vulkan/api/Context.h @@ -128,7 +128,8 @@ inline void bind( const std::index_sequence, Arguments&&...arguments) { C10_UNUSED const int _[]{ - (descriptor_set.bind(Indices, arguments), 0)..., + 0, + (descriptor_set.bind(Indices, std::forward(arguments)), 0)..., }; } diff --git a/aten/src/ATen/native/vulkan/api/Descriptor.cpp b/aten/src/ATen/native/vulkan/api/Descriptor.cpp index 037f793dfa2a..317536248987 100644 --- a/aten/src/ATen/native/vulkan/api/Descriptor.cpp +++ b/aten/src/ATen/native/vulkan/api/Descriptor.cpp @@ -6,50 +6,41 @@ namespace vulkan { namespace api { namespace { -VkDescriptorPool create_descriptor_pool( - const VkDevice device) { +VkDescriptorPool create_descriptor_pool(const VkDevice device) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( device, "Invalid Vulkan device!"); const struct { uint32_t capacity; - c10::SmallVector sizes; + c10::SmallVector sizes; } descriptor { 1024u, { - // Note: It is OK for the sum of descriptors per type, below, to exceed - // the max total figure above, but be concenious of memory consumption. - // Considering how the descriptor pool must be frequently purged anyway - // as a result of the impracticality of having enormous pools that - // persist through the execution of the program, there is diminishing - // return in increasing max counts. - { - /* - Buffers - */ + /* + Buffers + */ - { - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - 768u, - }, - { - VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - 768u, - }, + { + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + 1024u, + }, + { + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + 1024u, + }, - /* - Images - */ + /* + Images + */ - { - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - 768u, - }, - { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - 768u, - }, + { + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + 1024u, + }, + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + 1024u, }, }, }; @@ -57,7 +48,7 @@ VkDescriptorPool create_descriptor_pool( const VkDescriptorPoolCreateInfo descriptor_pool_create_info{ VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, nullptr, - 0u, /* Do not use VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT. 
*/ + 0u, descriptor.capacity, static_cast(descriptor.sizes.size()), descriptor.sizes.data(), @@ -77,10 +68,12 @@ VkDescriptorPool create_descriptor_pool( return descriptor_pool; } -VkDescriptorSet allocate_descriptor_set( +void allocate_descriptor_sets( const VkDevice device, const VkDescriptorPool descriptor_pool, - const VkDescriptorSetLayout descriptor_set_layout) { + const VkDescriptorSetLayout descriptor_set_layout, + VkDescriptorSet* const descriptor_sets, + const uint32_t count) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( device, "Invalid Vulkan device!"); @@ -93,41 +86,43 @@ VkDescriptorSet allocate_descriptor_set( descriptor_set_layout, "Invalid Vulkan descriptor set layout!"); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor_sets && (count > 0u), + "Invalid usage!"); + + const std::vector descriptor_set_layouts{ + count, + descriptor_set_layout, + }; + const VkDescriptorSetAllocateInfo descriptor_set_allocate_info{ VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, nullptr, descriptor_pool, - 1u, - &descriptor_set_layout, + descriptor_set_layouts.size(), + descriptor_set_layouts.data(), }; - VkDescriptorSet descriptor_set{}; VK_CHECK(vkAllocateDescriptorSets( device, &descriptor_set_allocate_info, - &descriptor_set)); - - TORCH_CHECK( - descriptor_set, - "Invalid Vulkan descriptor set!"); - - return descriptor_set; + descriptor_sets)); } } // namespace Descriptor::Set::Set( const VkDevice device, - const VkDescriptorPool descriptor_pool, - const Shader::Layout::Object& shader_layout) + VkDescriptorSet descriptor_set, + const Shader::Layout::Signature& shader_layout_signature) : device_(device), - descriptor_set_( - allocate_descriptor_set( - device_, - descriptor_pool, - shader_layout.handle)), - shader_layout_signature_(shader_layout.signature), + descriptor_set_(descriptor_set), + shader_layout_signature_(shader_layout_signature), bindings_{} { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device_, + "Invalid Vulkan device!"); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( descriptor_set_, "Invalid Vulkan descriptor set!"); @@ -135,7 +130,7 @@ Descriptor::Set::Set( void Descriptor::Set::update(const Item& item) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - device_, + device_ && descriptor_set_, "This descriptor set is in an invalid state! " "Potential reason: This descriptor set is moved from."); @@ -160,7 +155,7 @@ Descriptor::Set& Descriptor::Set::bind( const uint32_t binding, const Resource::Buffer::Object& buffer) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - device_, + device_ && descriptor_set_, "This descriptor set is in an invalid state! " "Potential reason: This descriptor set is moved from."); @@ -183,7 +178,7 @@ Descriptor::Set& Descriptor::Set::bind( const uint32_t binding, const Resource::Image::Object& image) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - device_, + device_ && descriptor_set_, "This descriptor set is in an invalid state! " "Potential reason: This descriptor set is moved from."); @@ -194,7 +189,10 @@ Descriptor::Set& Descriptor::Set::bind( .image = { image.sampler, image.view, - image.layout + [](const VkDescriptorType type, const VkImageLayout layout) { + return (VK_DESCRIPTOR_TYPE_STORAGE_IMAGE == type) ? + VK_IMAGE_LAYOUT_GENERAL : layout; + }(shader_layout_signature_[binding], image.layout), }, }, }); @@ -204,7 +202,7 @@ Descriptor::Set& Descriptor::Set::bind( VkDescriptorSet Descriptor::Set::handle() const { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - device_, + device_ && descriptor_set_, "This descriptor set is in an invalid state! 
" "Potential reason: This descriptor set is moved from."); @@ -238,7 +236,7 @@ VkDescriptorSet Descriptor::Set::handle() const { } }; - c10::SmallVector write_descriptor_sets; + c10::SmallVector write_descriptor_sets; for (const Item& item : bindings_.items) { VkWriteDescriptorSet write{ @@ -271,6 +269,7 @@ VkDescriptorSet Descriptor::Set::handle() const { 0u, nullptr); + // Reset bindings_.dirty = false; } @@ -281,7 +280,8 @@ Descriptor::Pool::Pool(const GPU& gpu) : device_(gpu.device), descriptor_pool_( create_descriptor_pool(gpu.device), - VK_DELETER(DescriptorPool)(device_)) { + VK_DELETER(DescriptorPool)(device_)), + set_{} { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( device_, "Invalid Vulkan device!"); @@ -293,7 +293,8 @@ Descriptor::Pool::Pool(const GPU& gpu) Descriptor::Pool::Pool(Pool&& pool) : device_(std::move(pool.device_)), - descriptor_pool_(std::move(pool.descriptor_pool_)) { + descriptor_pool_(std::move(pool.descriptor_pool_)), + set_(std::move(pool.set_)) { pool.device_ = VK_NULL_HANDLE; } @@ -301,6 +302,7 @@ Descriptor::Pool& Descriptor::Pool::operator=(Pool&& pool) { if (&pool != this) { device_ = std::move(pool.device_); descriptor_pool_ = std::move(pool.descriptor_pool_); + set_ = std::move(pool.set_); pool.device_ = VK_NULL_HANDLE; }; @@ -308,9 +310,25 @@ Descriptor::Pool& Descriptor::Pool::operator=(Pool&& pool) { return *this; } +Descriptor::Pool::~Pool() { + try { + if (device_ && descriptor_pool_) { + purge(); + } + } + catch (const std::exception& e) { + LOG(WARNING) + << "Vulkan: Descriptor pool destructor raised an exception! Error: " + << e.what(); + } + catch (...) { + LOG(WARNING) + << "Vulkan: Descriptor pool destructor raised an unknown exception!"; + } +} + Descriptor::Set Descriptor::Pool::allocate( - const Shader::Layout::Object& shader_layout) -{ + const Shader::Layout::Object& shader_layout) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY( device_ && descriptor_pool_, "This descriptor pool is in an invalid state! " @@ -320,10 +338,31 @@ Descriptor::Set Descriptor::Pool::allocate( shader_layout, "Invalid Vulkan shader layout!"); + auto iterator = set_.layouts.find(shader_layout.handle); + if (set_.layouts.cend() == iterator) { + iterator = set_.layouts.insert({shader_layout.handle, {}}).first; + iterator->second.pool.reserve(Configuration::kReserve); + } + + auto& layout = iterator->second; + + if (layout.pool.size() == layout.in_use) { + layout.pool.resize( + layout.pool.size() + + Configuration::kQuantum); + + allocate_descriptor_sets( + device_, + descriptor_pool_.get(), + shader_layout.handle, + layout.pool.data() + layout.in_use, + Configuration::kQuantum); + } + return Set( device_, - descriptor_pool_.get(), - shader_layout); + layout.pool[layout.in_use++], + shader_layout.signature); } void Descriptor::Pool::purge() { @@ -332,6 +371,7 @@ void Descriptor::Pool::purge() { "This descriptor pool is in an invalid state! 
" "Potential reason: This descriptor pool is moved from."); + set_.layouts.clear(); VK_CHECK(vkResetDescriptorPool(device_, descriptor_pool_.get(), 0u)); } diff --git a/aten/src/ATen/native/vulkan/api/Descriptor.h b/aten/src/ATen/native/vulkan/api/Descriptor.h index e268696d781b..440bb9aa4097 100644 --- a/aten/src/ATen/native/vulkan/api/Descriptor.h +++ b/aten/src/ATen/native/vulkan/api/Descriptor.h @@ -60,21 +60,16 @@ struct Descriptor final { public: Set( VkDevice device, - VkDescriptorPool descriptor_pool, - const Shader::Layout::Object& shader_layout); + VkDescriptorSet descriptor_set, + const Shader::Layout::Signature& shader_layout_signature); Set(const Set&) = delete; Set& operator=(const Set&) = delete; Set(Set&&); Set& operator=(Set&&); ~Set() = default; - Set& bind( - uint32_t binding, - const Resource::Buffer::Object& buffer); - - Set& bind( - uint32_t binding, - const Resource::Image::Object& image); + Set& bind(uint32_t binding, const Resource::Buffer::Object& buffer); + Set& bind(uint32_t binding, const Resource::Image::Object& image); VkDescriptorSet handle() const; @@ -82,6 +77,7 @@ struct Descriptor final { struct Item final { uint32_t binding; VkDescriptorType type; + union { VkDescriptorBufferInfo buffer; VkDescriptorImageInfo image; @@ -96,7 +92,7 @@ struct Descriptor final { Shader::Layout::Signature shader_layout_signature_; struct { - c10::SmallVector items; + c10::SmallVector items; mutable bool dirty; } bindings_; }; @@ -112,14 +108,28 @@ struct Descriptor final { Pool& operator=(const Pool&) = delete; Pool(Pool&&); Pool& operator=(Pool&&); - ~Pool() = default; + ~Pool(); Set allocate(const Shader::Layout::Object& shader_layout); void purge(); private: + struct Configuration final { + static constexpr uint32_t kQuantum = 16u; + static constexpr uint32_t kReserve = 64u; + }; + VkDevice device_; Handle descriptor_pool_; + + struct { + struct Layout final { + std::vector pool; + size_t in_use; + }; + + ska::flat_hash_map layouts; + } set_; } pool /* [thread_count] */; explicit Descriptor(const GPU& gpu) @@ -132,9 +142,10 @@ struct Descriptor final { // inline Descriptor::Set::Set(Set&& set) - : device_(set.device_), - descriptor_set_(set.descriptor_set_), - bindings_(set.bindings_) { + : device_(std::move(set.device_)), + descriptor_set_(std::move(set.descriptor_set_)), + shader_layout_signature_(std::move(set.shader_layout_signature_)), + bindings_(std::move(set.bindings_)) { set.device_ = VK_NULL_HANDLE; set.descriptor_set_ = VK_NULL_HANDLE; } @@ -143,6 +154,7 @@ inline Descriptor::Set& Descriptor::Set::operator=(Set&& set) { if (&set != this) { device_ = std::move(set.device_); descriptor_set_ = std::move(set.descriptor_set_); + shader_layout_signature_ = std::move(set.shader_layout_signature_); bindings_ = std::move(set.bindings_); set.device_ = VK_NULL_HANDLE; diff --git a/aten/src/ATen/native/vulkan/api/Pipeline.cpp b/aten/src/ATen/native/vulkan/api/Pipeline.cpp index 9facc3f49e0f..4b15203892ed 100644 --- a/aten/src/ATen/native/vulkan/api/Pipeline.cpp +++ b/aten/src/ATen/native/vulkan/api/Pipeline.cpp @@ -100,38 +100,31 @@ typename Pipeline::Factory::Handle Pipeline::Factory::operator()( descriptor.shader_module, "Invalid Vulkan shader module!"); - constexpr uint32_t x_offset = 0u; - constexpr uint32_t x_size = sizeof(Shader::WorkGroup::width); - constexpr uint32_t y_offset = x_offset + x_size; - constexpr uint32_t y_size = sizeof(Shader::WorkGroup::height); - constexpr uint32_t z_offset = y_offset + y_size; - constexpr uint32_t z_size = 
sizeof(Shader::WorkGroup::depth); - constexpr VkSpecializationMapEntry specialization_map_entires[3]{ // X { - 1u, - x_offset, - x_size, + 0u, + offsetof(Shader::WorkGroup, data[0u]), + sizeof(Shader::WorkGroup::data[0u]), }, // Y { - 2u, - y_offset, - y_size, + 1u, + offsetof(Shader::WorkGroup, data[1u]), + sizeof(Shader::WorkGroup::data[1u]), }, // Z { - 3u, - z_offset, - z_size, + 2u, + offsetof(Shader::WorkGroup, data[2u]), + sizeof(Shader::WorkGroup::data[2u]), }, }; const VkSpecializationInfo specialization_info{ 3u, specialization_map_entires, - sizeof(Shader::WorkGroup), + sizeof(descriptor.local_work_group), &descriptor.local_work_group, }; diff --git a/aten/src/ATen/native/vulkan/api/Pipeline.h b/aten/src/ATen/native/vulkan/api/Pipeline.h index 50893b709473..1d1966790dbf 100644 --- a/aten/src/ATen/native/vulkan/api/Pipeline.h +++ b/aten/src/ATen/native/vulkan/api/Pipeline.h @@ -96,6 +96,21 @@ struct Pipeline final { } } layout; + // + // Stage + // + + struct Stage final { + typedef uint8_t Flags; + + enum Type : Flags { + None = 0u << 0u, + Compute = 1u << 0u, + Host = 1u << 1u, + Transfer = 1u << 2u, + }; + }; + /* Descriptor */ @@ -202,9 +217,9 @@ inline size_t Pipeline::Factory::Hasher::operator()( return c10::get_hash( descriptor.pipeline_layout, descriptor.shader_module, - descriptor.local_work_group.width, - descriptor.local_work_group.height, - descriptor.local_work_group.depth); + descriptor.local_work_group.data[0u], + descriptor.local_work_group.data[1u], + descriptor.local_work_group.data[2u]); } inline Pipeline::Object::operator bool() const { diff --git a/aten/src/ATen/native/vulkan/api/Resource.cpp b/aten/src/ATen/native/vulkan/api/Resource.cpp index e239ba2f7763..a7177e379058 100644 --- a/aten/src/ATen/native/vulkan/api/Resource.cpp +++ b/aten/src/ATen/native/vulkan/api/Resource.cpp @@ -32,7 +32,7 @@ VmaAllocator create_allocator( nullptr, 1u, nullptr, - nullptr, // TODO (Ashkan): VULKAN_WRAPPER + nullptr, nullptr, instance, VK_API_VERSION_1_0, @@ -48,8 +48,9 @@ VmaAllocator create_allocator( VmaAllocationCreateInfo create_allocation_create_info( const Resource::Memory::Descriptor& descriptor) { return VmaAllocationCreateInfo{ - 0u, /* VMA_ALLOCATION_CREATE_MAPPED_BIT - MoltenVK Issue #175 */ - /* VMA_ALLOCATION_CREATE_STRATEGY_MIN_FRAGMENTATION_BIT */ + VMA_ALLOCATION_CREATE_STRATEGY_MIN_MEMORY_BIT | + /* VMA_ALLOCATION_CREATE_MAPPED_BIT - MoltenVK Issue #175 */ + 0, descriptor.usage, descriptor.required, descriptor.preferred, @@ -85,16 +86,20 @@ void release_image(const Resource::Image& image) { } // namespace -void* map(const Resource::Memory& memory) { - // Call will be ignored by implementation if the memory type this allocation - // belongs to is not HOST_VISIBLE or is HOST_COHERENT, which is the behavior - // we want. - VK_CHECK(vmaInvalidateAllocation( - memory.allocator, memory.allocation, 0u, VK_WHOLE_SIZE)); - +void* map( + const Resource::Memory& memory, + const Resource::Memory::Access::Flags access) { void* data = nullptr; VK_CHECK(vmaMapMemory(memory.allocator, memory.allocation, &data)); + if (access & Resource::Memory::Access::Read) { + // Call will be ignored by implementation if the memory type this allocation + // belongs to is not HOST_VISIBLE or is HOST_COHERENT, which is the behavior + // we want. 
+ VK_CHECK(vmaInvalidateAllocation( + memory.allocator, memory.allocation, 0u, VK_WHOLE_SIZE)); + } + return data; } @@ -119,14 +124,14 @@ void Resource::Memory::Scope::operator()(const void* const data) const { return; } - vmaUnmapMemory(allocator_, allocation_); - if (access_ & Access::Write) { // Call will be ignored by implementation if the memory type this allocation // belongs to is not HOST_VISIBLE or is HOST_COHERENT, which is the behavior // we want. VK_CHECK(vmaFlushAllocation(allocator_, allocation_, 0u, VK_WHOLE_SIZE)); } + + vmaUnmapMemory(allocator_, allocation_); } Resource::Image::Sampler::Factory::Factory(const GPU& gpu) @@ -151,11 +156,11 @@ Resource::Image::Sampler::Factory::operator()( descriptor.address_mode, 0.0f, VK_FALSE, - 0.0f, + 1.0f, VK_FALSE, VK_COMPARE_OP_NEVER, 0.0f, - 0.0f, + VK_LOD_CLAMP_NONE, descriptor.border, VK_FALSE, }; @@ -239,7 +244,9 @@ Resource::Pool::Pool(const GPU& gpu) Resource::Pool::~Pool() { try { - purge(); + if (device_ && allocator_) { + purge(); + } } catch (const std::exception& e) { LOG(WARNING) @@ -387,9 +394,9 @@ Resource::Image Resource::Pool::image( { VK_IMAGE_ASPECT_COLOR_BIT, 0u, - 1u, + VK_REMAINING_MIP_LEVELS, 0u, - 1u, + VK_REMAINING_ARRAY_LAYERS, }, }; diff --git a/aten/src/ATen/native/vulkan/api/Resource.h b/aten/src/ATen/native/vulkan/api/Resource.h index a9428d272782..340d926206ff 100644 --- a/aten/src/ATen/native/vulkan/api/Resource.h +++ b/aten/src/ATen/native/vulkan/api/Resource.h @@ -12,7 +12,6 @@ namespace native { namespace vulkan { namespace api { - struct Resource final { class Pool; @@ -47,6 +46,7 @@ struct Resource final { typedef uint8_t Flags; enum Type : Flags { + None = 0u << 0u, Read = 1u << 0u, Write = 1u << 1u, }; @@ -334,17 +334,17 @@ class Resource::Memory::Scope final { template inline Resource::Memory::Handle Resource::Memory::map() const & { - void* map(const Memory& memory); + void* map(const Memory& memory, Access::Flags); return Handle{ - reinterpret_cast(map(*this)), + reinterpret_cast(map(*this, Access::Read)), Scope(allocator, allocation, Access::Read), }; } template inline Resource::Memory::Handle Resource::Memory::map() & { - void* map(const Memory& memory); + void* map(const Memory& memory, Access::Flags); static_assert( (kAccess == Access::Read) || @@ -353,7 +353,7 @@ inline Resource::Memory::Handle Resource::Memory::map() & { "Invalid memory access!"); return Handle{ - reinterpret_cast(map(*this)), + reinterpret_cast(map(*this, kAccess)), Scope(allocator, allocation, kAccess), }; } diff --git a/aten/src/ATen/native/vulkan/api/Shader.cpp b/aten/src/ATen/native/vulkan/api/Shader.cpp index 2c090d073bdf..43d1a62ac201 100644 --- a/aten/src/ATen/native/vulkan/api/Shader.cpp +++ b/aten/src/ATen/native/vulkan/api/Shader.cpp @@ -18,7 +18,7 @@ Shader::Layout::Factory::Factory(const GPU& gpu) Shader::Layout::Factory::Handle Shader::Layout::Factory::operator()( const Descriptor& descriptor) const { - c10::SmallVector bindings; + c10::SmallVector bindings; uint32_t binding = 0u; for (const VkDescriptorType type : descriptor.signature) { diff --git a/aten/src/ATen/native/vulkan/api/Shader.h b/aten/src/ATen/native/vulkan/api/Shader.h index b2238a95de50..718504e69bd4 100644 --- a/aten/src/ATen/native/vulkan/api/Shader.h +++ b/aten/src/ATen/native/vulkan/api/Shader.h @@ -4,6 +4,7 @@ #include #include +#include #include namespace at { @@ -44,7 +45,7 @@ struct Shader final { Signature */ - typedef c10::SmallVector Signature; + typedef c10::SmallVector Signature; /* Descriptor @@ -112,7 +113,7 @@ struct Shader 
final { // Work Group // - typedef VkExtent3D WorkGroup; + typedef utils::uvec3 WorkGroup; /* Descriptor @@ -224,9 +225,9 @@ inline void Shader::Layout::Cache::purge() { inline bool operator==( const Shader::WorkGroup& _1, const Shader::WorkGroup& _2) { - return (_1.width == _2.width) && - (_1.height == _2.height) && - (_1.depth == _2.depth); + return (_1.data[0u] == _2.data[0u]) && + (_1.data[1u] == _2.data[1u]) && + (_1.data[2u] == _2.data[2u]); } inline Shader::Descriptor::Descriptor(const char* const glsl) diff --git a/aten/src/ATen/native/vulkan/api/Utils.h b/aten/src/ATen/native/vulkan/api/Utils.h index c46a652da856..1d261849a5e7 100644 --- a/aten/src/ATen/native/vulkan/api/Utils.h +++ b/aten/src/ATen/native/vulkan/api/Utils.h @@ -8,41 +8,34 @@ namespace vulkan { namespace api { namespace utils { -inline int64_t align_down( - const int64_t number, - const int64_t multiple) { +// +// Alignment +// + +template +inline constexpr Type align_down( + const Type number, + const Type multiple) { return (number / multiple) * multiple; } -inline int64_t align_up( - const int64_t number, - const int64_t multiple) { +template +inline constexpr Type align_up( + const Type number, + const Type multiple) { return align_down(number + multiple - 1, multiple); } -inline int64_t div_up( - const int64_t numerator, - const int64_t denominator) { +template +inline constexpr Type div_up( + const Type numerator, + const Type denominator) { return (numerator + denominator - 1) / denominator; } -inline VkFormat convert(const caffe2::TypeMeta dtype) { - switch (c10::typeMetaToScalarType(dtype)) { - case kFloat: -#ifdef VULKAN_FP16_INFERENCE - return VK_FORMAT_R16G16B16A16_SFLOAT; -#else - return VK_FORMAT_R32G32B32A32_SFLOAT; -#endif /* VULKAN_FP16_INFERENCE */ - - default: - TORCH_CHECK( - false, - "Vulkan tensor format not supported!"); - } - - return VK_FORMAT_UNDEFINED; -} +// +// Cast +// namespace detail { @@ -79,6 +72,37 @@ inline constexpr To safe_downcast(const From v) { return detail::safe_downcast(v); } +// +// Vector +// + +namespace detail { + +template +struct vec final { + Type data[N]; +}; + +} // namespace detail + +template +using ivec = detail::vec; +using ivec2 = ivec<2u>; +using ivec3 = ivec<3u>; +using ivec4 = ivec<4u>; + +template +using uvec = detail::vec; +using uvec2 = uvec<2u>; +using uvec3 = uvec<3u>; +using uvec4 = uvec<4u>; + +template +using vec = detail::vec; +using vec2 = vec<2u>; +using vec3 = vec<3u>; +using vec4 = vec<4u>; + } // namespace utils } // namespace api } // namespace vulkan diff --git a/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl b/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl index 3c9a50a267f9..58394dca19da 100644 --- a/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl +++ b/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl @@ -2,7 +2,7 @@ #define PRECISION $precision layout(std430) buffer; layout(std430) uniform; -layout(set = 0, rgba32f, binding = 0) writeonly PRECISION uniform image3D uOutput; +layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; layout(set = 0, binding = 1) readonly buffer kernel { vec4 data[]; } @@ -13,7 +13,7 @@ layout(set = 0, binding = 2) uniform constBlock { } uConstBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { ivec3 pos = ivec3(gl_GlobalInvocationID) * ivec3(4, 1, 1); diff --git a/aten/src/ATen/native/vulkan/glsl/adaptive_avg_pool2d.glsl 
b/aten/src/ATen/native/vulkan/glsl/adaptive_avg_pool2d.glsl index 99c6f54b919b..d5b9af843dbe 100644 --- a/aten/src/ATen/native/vulkan/glsl/adaptive_avg_pool2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/adaptive_avg_pool2d.glsl @@ -6,25 +6,24 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; +layout(set = 0, binding = 2) uniform PRECISION restrict Block { + ivec4 size; + vec2 stride; + vec2 kernel; +} uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - const vec3 isize = textureSize(uInput, 0); - const vec2 stride = isize.xy / size.xy; - const vec2 kernel = isize.xy - (size.xy - 1) * stride; - - if (all(lessThan(pos, size))) { - const vec2 ipos = pos.xy * stride; + if (all(lessThan(pos, uBlock.size.xyz))) { + const vec2 ipos = pos.xy * uBlock.stride; const ivec2 start = ivec2(ipos); - const ivec2 end = ivec2(ceil(ipos + kernel)); + const ivec2 end = ivec2(ceil(ipos + uBlock.kernel)); const ivec2 range = end - start; vec4 sum = vec4(0); diff --git a/aten/src/ATen/native/vulkan/glsl/add.glsl b/aten/src/ATen/native/vulkan/glsl/add.glsl index 771a2a1b9349..8dcff0476edf 100644 --- a/aten/src/ATen/native/vulkan/glsl/add.glsl +++ b/aten/src/ATen/native/vulkan/glsl/add.glsl @@ -6,22 +6,20 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput0; layout(set = 0, binding = 2) uniform PRECISION sampler3D uInput1; layout(set = 0, binding = 3) uniform PRECISION restrict Block { + ivec3 size; float alpha; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size))) { imageStore( uOutput, pos, diff --git a/aten/src/ATen/native/vulkan/glsl/add_.glsl b/aten/src/ATen/native/vulkan/glsl/add_.glsl index f8cdb8ea05e6..ed82d0cbe87b 100644 --- a/aten/src/ATen/native/vulkan/glsl/add_.glsl +++ b/aten/src/ATen/native/vulkan/glsl/add_.glsl @@ -6,21 +6,19 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput0; layout(set = 0, binding = 2) uniform PRECISION restrict Block { + ivec3 size; float alpha; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - - 
if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size))) { imageStore( uOutput, pos, diff --git a/aten/src/ATen/native/vulkan/glsl/add_scalar.glsl b/aten/src/ATen/native/vulkan/glsl/add_scalar.glsl index 1f49c9e9d475..8882ba0d8ff2 100644 --- a/aten/src/ATen/native/vulkan/glsl/add_scalar.glsl +++ b/aten/src/ATen/native/vulkan/glsl/add_scalar.glsl @@ -6,21 +6,19 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION restrict Block { + ivec3 size; float other; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size))) { imageStore( uOutput, pos, diff --git a/aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl b/aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl index fd6ec2953afb..bffd680669fb 100644 --- a/aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl +++ b/aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl @@ -6,20 +6,18 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION restrict Block { + ivec3 size; float other; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size))) { imageStore( uOutput, pos, diff --git a/aten/src/ATen/native/vulkan/glsl/addmm.glsl b/aten/src/ATen/native/vulkan/glsl/addmm.glsl index 7489a74a33f5..61f76fa8cf5d 100644 --- a/aten/src/ATen/native/vulkan/glsl/addmm.glsl +++ b/aten/src/ATen/native/vulkan/glsl/addmm.glsl @@ -6,28 +6,24 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uM1; layout(set = 0, binding = 2) uniform PRECISION sampler3D uM2; layout(set = 0, binding = 3) uniform PRECISION sampler3D uT; layout(set = 0, binding = 4) uniform PRECISION restrict Block { - float alpha; - float beta; + ivec4 size; + vec2 multiplier; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - const int dim = textureSize(uM1, 0).x; - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size.xyz))) { vec4 sum = vec4(0); - for (int k = 0; k < dim; ++k) { + for (int k = 0; k < uBlock.size.w; 
++k) { sum = fma( texelFetch(uM1, ivec3(k, pos.y, pos.z), 0), texelFetch(uM2, ivec3(pos.x, k, pos.z), 0), @@ -37,6 +33,6 @@ void main() { imageStore( uOutput, pos, - uBlock.alpha * sum + uBlock.beta * texelFetch(uT, pos, 0)); + uBlock.multiplier.x * sum + uBlock.multiplier.y * texelFetch(uT, pos, 0)); } } diff --git a/aten/src/ATen/native/vulkan/glsl/avg_pool2d.glsl b/aten/src/ATen/native/vulkan/glsl/avg_pool2d.glsl index 8036be567b65..df2bbcf18014 100644 --- a/aten/src/ATen/native/vulkan/glsl/avg_pool2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/avg_pool2d.glsl @@ -6,29 +6,26 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION restrict Block { - ivec2 kernel; + ivec4 size; + ivec2 isize; ivec2 stride; ivec2 padding; + ivec2 kernel; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - const ivec3 isize = textureSize(uInput, 0); - const float range = uBlock.kernel.x * uBlock.kernel.y; - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size.xyz))) { const ivec2 ipos = pos.xy * uBlock.stride - uBlock.padding; const ivec2 start = max(ivec2(0), ipos); - const ivec2 end = min(ipos + uBlock.kernel, isize.xy); + const ivec2 end = min(ipos + uBlock.kernel, uBlock.isize); vec4 sum = vec4(0); @@ -41,6 +38,6 @@ void main() { imageStore( uOutput, pos, - sum / range); + sum / uBlock.size.w); } } diff --git a/aten/src/ATen/native/vulkan/glsl/clamp.glsl b/aten/src/ATen/native/vulkan/glsl/clamp.glsl index 8482e27f48e4..c394dfd26627 100644 --- a/aten/src/ATen/native/vulkan/glsl/clamp.glsl +++ b/aten/src/ATen/native/vulkan/glsl/clamp.glsl @@ -6,25 +6,22 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION restrict Block { - float min; - float max; + ivec4 size; + vec2 clamp; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size.xyz))) { imageStore( uOutput, pos, - clamp(texelFetch(uInput, pos, 0), uBlock.min, uBlock.max)); + clamp(texelFetch(uInput, pos, 0), uBlock.clamp.x, uBlock.clamp.y)); } } diff --git a/aten/src/ATen/native/vulkan/glsl/clamp_.glsl b/aten/src/ATen/native/vulkan/glsl/clamp_.glsl index 9bfc77a44053..b16258685114 100644 --- a/aten/src/ATen/native/vulkan/glsl/clamp_.glsl +++ b/aten/src/ATen/native/vulkan/glsl/clamp_.glsl @@ -6,24 +6,21 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict image3D 
uOutput; -layout(set = 0, binding = 1) uniform PRECISION restrict Block { - float min; - float max; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict image3D uOutput; +layout(set = 0, binding = 1) uniform PRECISION restrict Block { + ivec4 size; + vec2 clamp; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size.xyz))) { imageStore( uOutput, pos, - clamp(imageLoad(uOutput, pos), uBlock.min, uBlock.max)); + clamp(imageLoad(uOutput, pos), uBlock.clamp.x, uBlock.clamp.y)); } } diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d.glsl index 0a58c5d0a2f6..fd54c2f38721 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d.glsl @@ -6,7 +6,7 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION sampler3D uKernel; layout(set = 0, binding = 3) buffer PRECISION restrict readonly Bias { @@ -20,7 +20,7 @@ layout(set = 0, binding = 4) uniform PRECISION restrict Block vec2 clamp; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl index cdff4560bfa8..fe50262f7d46 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl @@ -6,7 +6,7 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION sampler3D uKernel; layout(set = 0, binding = 3) buffer PRECISION restrict readonly Bias { @@ -20,7 +20,7 @@ layout(set = 0, binding = 4) uniform PRECISION restrict Block vec2 clamp; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl index c9f1b43ad4d5..37a5898b9f10 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl @@ -2,7 +2,7 @@ #define PRECISION $precision layout(std430) buffer; layout(std430) uniform; -layout(set = 0, rgba32f, binding = 0) writeonly PRECISION uniform image3D uOutput; +layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform 
PRECISION sampler3D uKernel; layout(set = 0, binding = 3) readonly buffer bias { @@ -23,7 +23,7 @@ uConstBlock; #define UP_DIV(x, y) (((x) + (y)-1) / (y)) -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { ivec3 pos = ivec3(gl_GlobalInvocationID); diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl index 9a48022b85f2..b73c58e0f54d 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl @@ -2,7 +2,7 @@ #define PRECISION $precision layout(std430) buffer; layout(std430) uniform; -layout(set = 0, rgba32f, binding = 0) writeonly PRECISION uniform image3D uOutput; +layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION sampler3D uKernel; layout(set = 0, binding = 3) readonly buffer bias { @@ -23,7 +23,7 @@ uConstBlock; #define UP_DIV(x, y) (((x) + (y)-1) / (y)) -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { ivec3 gpos = ivec3(gl_GlobalInvocationID); diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl index c2962844e0bc..bbc745ca8efd 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl @@ -6,7 +6,7 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION sampler3D uKernel; layout(set = 0, binding = 3) buffer PRECISION restrict readonly Bias { @@ -20,7 +20,7 @@ layout(set = 0, binding = 4) uniform PRECISION restrict Block int W; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); diff --git a/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl b/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl index 5c08399f765c..d19c370ec9bd 100644 --- a/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl +++ b/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl @@ -10,23 +10,21 @@ layout(set = 0, binding = 0) uniform PRECISION sampler3D uIma layout(set = 0, binding = 1) buffer PRECISION restrict writeonly Buffer { float data[]; } uBuffer; +layout(set = 0, binding = 2) uniform PRECISION restrict Block { + ivec4 size; + ivec4 offset; +} uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = textureSize(uImage, 0); - const int plane = size.x * size.y; - const int block = 4 * plane; - const ivec4 offset = plane * ivec4(0, 1, 2, 3); - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size.xyz))) { const vec4 texel = texelFetch(uImage, pos, 0); - const int base = 
pos.x + size.x * pos.y + block * pos.z; - const ivec4 index = base + offset; + const int base = pos.x + uBlock.size.x * pos.y + uBlock.size.w * pos.z; + const ivec4 index = base + uBlock.offset; uBuffer.data[index.x] = texel.r; uBuffer.data[index.y] = texel.g; diff --git a/aten/src/ATen/native/vulkan/glsl/max_pool2d.glsl b/aten/src/ATen/native/vulkan/glsl/max_pool2d.glsl index 5db8a53e1770..948b797a5207 100644 --- a/aten/src/ATen/native/vulkan/glsl/max_pool2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/max_pool2d.glsl @@ -2,7 +2,7 @@ #define PRECISION $precision layout(std430) buffer; layout(std430) uniform; -layout(set = 0, rgba32f, binding = 0) writeonly PRECISION uniform image3D uOutput; +layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform constBlock { ivec4 inputSize; @@ -17,7 +17,7 @@ uConstBlock; #define UP_DIV(x, y) (((x) + (y)-1) / (y)) #define FLT_MAX 3.402823466e+38 -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { ivec3 pos = ivec3(gl_GlobalInvocationID); diff --git a/aten/src/ATen/native/vulkan/glsl/mean.glsl b/aten/src/ATen/native/vulkan/glsl/mean.glsl index 7c5795a6e96a..130d716ca9e6 100644 --- a/aten/src/ATen/native/vulkan/glsl/mean.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mean.glsl @@ -6,24 +6,25 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; +layout(set = 0, binding = 2) uniform PRECISION restrict Block { + ivec4 size; + ivec2 isize; +} uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +// This implementation is suboptimal and should be revisted. 
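The rewritten mean shader above no longer queries imageSize()/textureSize() per invocation; it reads the output extents and the input plane dimensions from the uniform block and divides by size.w, which the host code later in this patch fills with W * H. A minimal host-side restatement of that reduction, using illustrative stand-in types that are not part of the patch:

#include <array>
#include <cstdint>
#include <vector>

// Mirrors the shader's uniform block: size.xyz = output extents,
// size.w = W * H, isize = input width / height.
struct MeanBlock {
  std::array<int32_t, 4> size;
  std::array<int32_t, 2> isize;
};

// Averages one W x H input plane, as each shader invocation does for its
// (x, y, z) output texel.
std::array<float, 4> mean_texel(
    const std::vector<std::array<float, 4>>& plane,  // row-major, W * H texels
    const MeanBlock& block) {
  std::array<float, 4> sum{0.f, 0.f, 0.f, 0.f};
  for (int32_t y = 0; y < block.isize[1]; ++y) {
    for (int32_t x = 0; x < block.isize[0]; ++x) {
      const auto& texel = plane[y * block.isize[0] + x];
      for (int c = 0; c < 4; ++c) {
        sum[c] += texel[c];
      }
    }
  }
  for (int c = 0; c < 4; ++c) {
    sum[c] /= static_cast<float>(block.size[3]);  // size.w carries W * H
  }
  return sum;
}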
void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - const ivec3 isize = textureSize(uInput, 0); - const float range = isize.x * isize.y; - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size.xyz))) { vec4 sum = vec4(0); - for (int y = 0; y < isize.y; ++y) { - for (int x = 0; x < isize.x; ++x) { + for (int y = 0; y < uBlock.isize.y; ++y) { + for (int x = 0; x < uBlock.isize.x; ++x) { sum += texelFetch(uInput, ivec3(x, y, pos.z), 0); } } @@ -31,6 +32,6 @@ void main() { imageStore( uOutput, pos, - sum / range); + sum / uBlock.size.w); } } diff --git a/aten/src/ATen/native/vulkan/glsl/mean2d.glsl b/aten/src/ATen/native/vulkan/glsl/mean2d.glsl index a2b137cef42e..266226aa708b 100644 --- a/aten/src/ATen/native/vulkan/glsl/mean2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mean2d.glsl @@ -1,29 +1,41 @@ #version 450 core #define PRECISION $precision + layout(std430) buffer; layout(std430) uniform; -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; + +/* Qualifiers: layout - storage - precision - memory */ + +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; -layout(set = 0, binding = 2) uniform PRECISION restrict Block { - int W; - int H; +layout(set = 0, binding = 2) uniform PRECISION restrict Block { + ivec4 size; + ivec2 isize; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +// This implementation is suboptimal and should be revisted. void main() { - ivec3 pos = ivec3(gl_GlobalInvocationID); - vec4 r = vec4(1.0) / (float(uBlock.W) * float(uBlock.H)); - vec4 acc = vec4(0); - int xi, yi; - int zi = (imageSize(uOutput).x*pos.y + pos.x)/4; - int zo = (imageSize(uOutput).x*pos.y + pos.x)%4; - for (yi = 0; yi < uBlock.H; ++yi) { - for (xi = 0; xi < uBlock.W; ++xi) { - acc += texelFetch(uInput, ivec3(xi, yi, zi), 0); + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (all(lessThan(pos, uBlock.size.xyz))) { + vec4 sum = vec4(0); + + const int z = pos.x + uBlock.size.x * pos.y; + const int zi = z / 4; + const int zo = z % 4; + + for (int y = 0; y < uBlock.isize.y; ++y) { + for (int x = 0; x < uBlock.isize.x; ++x) { + sum += texelFetch(uInput, ivec3(x, y, zi), 0); + } } - } - vec4 outValue = r * acc; - imageStore(uOutput, pos, vec4(outValue[zo], 0,0,0)); + imageStore( + uOutput, + pos, + vec4(sum[zo], 0, 0, 0) / uBlock.size.w); + } } diff --git a/aten/src/ATen/native/vulkan/glsl/mm.glsl b/aten/src/ATen/native/vulkan/glsl/mm.glsl index dfcb155e7515..00ab5f31e6db 100644 --- a/aten/src/ATen/native/vulkan/glsl/mm.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mm.glsl @@ -6,23 +6,22 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uM1; layout(set = 0, binding = 2) uniform PRECISION sampler3D uM2; +layout(set = 0, binding = 3) uniform PRECISION restrict Block { + ivec4 size; +} uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = 
ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - const int dim = textureSize(uM1, 0).x; - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size.xyz))) { vec4 sum = vec4(0); - for (int k = 0; k < dim; ++k) { + for (int k = 0; k < uBlock.size.w; ++k) { sum = fma( texelFetch(uM1, ivec3(k, pos.y, pos.z), 0), texelFetch(uM2, ivec3(pos.x, k, pos.z), 0), diff --git a/aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl b/aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl index b73e3180c725..d3a98ba30bea 100644 --- a/aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl @@ -6,21 +6,19 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION restrict Block { + ivec3 size; float other; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size))) { imageStore( uOutput, pos, diff --git a/aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl b/aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl index c259e5aa5a58..b49252e128cc 100644 --- a/aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl @@ -6,20 +6,18 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION restrict Block { + ivec3 size; float other; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size))) { imageStore( uOutput, pos, diff --git a/aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl b/aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl index 23407891d649..fb87b5a36918 100644 --- a/aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl +++ b/aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl @@ -6,25 +6,23 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uImage; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uImage; layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer { float data[]; } uBuffer; +layout(set = 0, binding = 2) uniform PRECISION restrict Block { + ivec4 size; + ivec4 offset; +} uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = 
imageSize(uImage); - const int plane = size.x * size.y; - const int block = 4 * plane; - const ivec4 offset = plane * ivec4(0, 1, 2, 3); - - if (all(lessThan(pos, size))) { - const int base = pos.x + size.x * pos.y + block * pos.z; - const ivec4 index = base + offset; + if (all(lessThan(pos, uBlock.size.xyz))) { + const int base = pos.x + uBlock.size.x * pos.y + uBlock.size.w * pos.z; + const ivec4 index = base + uBlock.offset; imageStore( uImage, diff --git a/aten/src/ATen/native/vulkan/glsl/permute.glsl b/aten/src/ATen/native/vulkan/glsl/permute.glsl index bd0b6637efae..af8e33588f78 100644 --- a/aten/src/ATen/native/vulkan/glsl/permute.glsl +++ b/aten/src/ATen/native/vulkan/glsl/permute.glsl @@ -17,7 +17,7 @@ layout(set = 0, binding = 2) uniform constBlock { } uConst; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { ivec3 pos = ivec3(gl_GlobalInvocationID); diff --git a/aten/src/ATen/native/vulkan/glsl/upsample_nearest2d.glsl b/aten/src/ATen/native/vulkan/glsl/upsample_nearest2d.glsl index a979cf275c21..efb1c5c7fc9a 100644 --- a/aten/src/ATen/native/vulkan/glsl/upsample_nearest2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/upsample_nearest2d.glsl @@ -6,26 +6,24 @@ layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; +layout(set = 0, binding = 0, rgba16f) uniform PRECISION restrict writeonly image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION restrict Block { + ivec4 size; + ivec2 isize; vec2 scale; } uBlock; -layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); - /* Dynamically Uniform */ - const ivec3 size = imageSize(uOutput); - const ivec3 isize = textureSize(uInput, 0); - - if (all(lessThan(pos, size))) { + if (all(lessThan(pos, uBlock.size.xyz))) { const ivec2 ipos = clamp( ivec2(pos.xy * uBlock.scale), ivec2(0), - isize.xy - 1); + uBlock.isize); imageStore( uOutput, diff --git a/aten/src/ATen/native/vulkan/ops/Add.cpp b/aten/src/ATen/native/vulkan/ops/Add.cpp index 24e9cd6dc021..270a1d5f8168 100644 --- a/aten/src/ATen/native/vulkan/ops/Add.cpp +++ b/aten/src/ATen/native/vulkan/ops/Add.cpp @@ -7,6 +7,8 @@ namespace vulkan { namespace ops { namespace { +using namespace api::utils; + Tensor add_scalar( const Tensor& self_arg, const Scalar other, @@ -18,8 +20,8 @@ Tensor add_scalar( vTensor v_output{ context, - self.sizes(), - self.options(), + v_self.sizes(), + v_self.options(), }; api::Command::Buffer command_buffer = context->command().pool.allocate(); @@ -27,8 +29,10 @@ Tensor add_scalar( { if (v_output.has_image() && v_self.has_image()) { const struct { + uvec3 extents; float other; } block { + v_self.extents(), other.to() * alpha.to(), }; @@ -43,10 +47,15 @@ Tensor add_scalar( v_output.extents(), // Write-only access bypasses synchronization but inserts appropriate // barriers if necessary. - v_output.image(command_buffer, vTensor::Access::Write), + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. 
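A pattern repeated across these kernels and the ops below: the output extents now ride along in the uniform block, and every invocation early-outs on all(lessThan(pos, uBlock.size.xyz)) instead of calling imageSize(). The guard is needed because the dispatch grid is rounded up to whole work groups (sized through the specialization constants 0/1/2 these shaders now declare). A small standalone sketch, with hypothetical helper names that only mirror the api::utils ones used elsewhere in the patch:

#include <cstdint>

struct extent3 { uint32_t x, y, z; };

constexpr uint32_t div_up(const uint32_t value, const uint32_t divisor) {
  return (value + divisor - 1u) / divisor;
}

// Number of work groups needed to cover `extents` with a given local size.
extent3 group_count(const extent3& extents, const extent3& local) {
  return {
    div_up(extents.x, local.x),
    div_up(extents.y, local.y),
    div_up(extents.z, local.z),
  };
}

// Example: extents {10, 7, 3} with a local size of {4, 4, 4} dispatches a
// 12 x 8 x 4 grid of invocations, so anything with pos.x >= 10, pos.y >= 7,
// or pos.z >= 3 must be masked out by the bounds check.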
- v_self.image(command_buffer), + v_self.image( + command_buffer, + vTensor::Stage::Compute), // Object lifetime is managed by the resource pool. // It is OK not to keep track of the handle. context->resource().pool.uniform(block).object); @@ -62,24 +71,26 @@ Tensor add_scalar( } Tensor& add_scalar_( - Tensor& self_arg, + Tensor& self, const Scalar other, const Scalar alpha) { api::Context* const context = api::context(); TORCH_CHECK( - self_arg.is_vulkan(), + self.is_vulkan(), "Vulkan: In-place add is only supported on Vulkan tensors."); - vTensor& v_self = convert(self_arg); + vTensor& v_self = convert(self); api::Command::Buffer command_buffer = context->command().pool.allocate(); command_buffer.begin(); { if (v_self.has_image()) { const struct { + uvec3 extents; float other; } block { + v_self.extents(), other.to() * alpha.to(), }; @@ -93,7 +104,10 @@ Tensor& add_scalar_( v_self.extents(), // Read-Write access triggers an async synchronization if necessory // and inserts appropriate barriers if hazards are detected. - v_self.image(command_buffer, vTensor::Access::Read | vTensor::Access::Write), + v_self.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Read | vTensor::Access::Write), // Object lifetime is managed by the resource pool. // It is OK not to keep track of the handle. context->resource().pool.uniform(block).object); @@ -105,7 +119,7 @@ Tensor& add_scalar_( command_buffer.end(); command_buffer.submit(context->gpu().queue); - return self_arg; + return self; } Tensor add_tensor( @@ -122,8 +136,8 @@ Tensor add_tensor( vTensor v_output{ context, - self.sizes(), - self.options(), + v_self.sizes(), + v_self.options(), }; api::Command::Buffer command_buffer = context->command().pool.allocate(); @@ -131,8 +145,10 @@ Tensor add_tensor( { if (v_self.has_image() && v_other.has_image()) { const struct { + uvec3 extents; float alpha; } block { + v_output.extents(), alpha.to(), }; @@ -148,13 +164,20 @@ Tensor add_tensor( v_output.extents(), // Write-only access bypasses synchronization but inserts appropriate // barriers if necessary. - v_output.image(command_buffer, vTensor::Access::Write), + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_self.image(command_buffer), + v_self.image( + command_buffer, + vTensor::Stage::Compute), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_other.image(command_buffer), + v_other.image( + command_buffer, + vTensor::Stage::Compute), // Object lifetime is managed by the resource pool. // It is OK not to keep track of the handle. context->resource().pool.uniform(block).object); @@ -170,16 +193,16 @@ Tensor add_tensor( } Tensor& add_tensor_( - Tensor& self_arg, + Tensor& self, const Tensor& other_arg, const Scalar alpha) { api::Context* const context = api::context(); TORCH_CHECK( - self_arg.is_vulkan(), + self.is_vulkan(), "Vulkan: In-place add is only supported on Vulkan tensors."); - vTensor& v_self = convert(self_arg); + vTensor& v_self = convert(self); const Tensor other = other_arg.is_vulkan() ? 
other_arg : other_arg.vulkan(); const vTensor& v_other = convert(other); @@ -187,10 +210,12 @@ Tensor& add_tensor_( api::Command::Buffer command_buffer = context->command().pool.allocate(); command_buffer.begin(); { - if (v_self.has_image() && v_other.has_image()) { + if (v_self.has_image() && v_other.has_image() && !self.is_same(other)) { const struct { + uvec3 extents; float alpha; } block { + v_self.extents(), alpha.to(), }; @@ -205,10 +230,15 @@ Tensor& add_tensor_( v_self.extents(), // Read-Write access triggers an async synchronization if necessory // and inserts appropriate barriers if hazards are detected. - v_self.image(command_buffer, vTensor::Access::Read | vTensor::Access::Write), + v_self.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Read | vTensor::Access::Write), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_other.image(command_buffer), + v_other.image( + command_buffer, + vTensor::Stage::Compute), // Object lifetime is managed by the resource pool. // It is OK not to keep track of the handle. context->resource().pool.uniform(block).object); @@ -220,7 +250,7 @@ Tensor& add_tensor_( command_buffer.end(); command_buffer.submit(context->gpu().queue); - return self_arg; + return self; } #ifdef USE_VULKAN_API diff --git a/aten/src/ATen/native/vulkan/ops/Clamp.cpp b/aten/src/ATen/native/vulkan/ops/Clamp.cpp index 6aec84d8b349..369a47fee93a 100644 --- a/aten/src/ATen/native/vulkan/ops/Clamp.cpp +++ b/aten/src/ATen/native/vulkan/ops/Clamp.cpp @@ -7,12 +7,14 @@ namespace vulkan { namespace ops { namespace { +using namespace api::utils; + Tensor clamp( const Tensor& self_arg, - const c10::optional min_value, - const c10::optional max_value) { + const c10::optional min, + const c10::optional max) { TORCH_CHECK( - min_value || max_value, + min || max, "At least one of 'min' or 'max' must not be None"); api::Context* const context = api::context(); @@ -22,8 +24,8 @@ Tensor clamp( vTensor v_output{ context, - self.sizes(), - self.options(), + v_self.sizes(), + v_self.options(), }; api::Command::Buffer command_buffer = context->command().pool.allocate(); @@ -31,11 +33,16 @@ Tensor clamp( { if (v_output.has_image() && v_self.has_image()) { const struct { - float min_value; - float max_value; + uvec3 extents; + uint32_t _; + vec2 clamp; } block { - min_value ? min_value->to() : -std::numeric_limits::infinity(), - max_value ? max_value->to() : std::numeric_limits::infinity(), + v_output.extents(), + 0u, + { + min ? min->to() : -std::numeric_limits::infinity(), + max ? max->to() : std::numeric_limits::infinity(), + }, }; context->dispatch( @@ -49,10 +56,15 @@ Tensor clamp( v_output.extents(), // Write-only access bypasses synchronization but inserts appropriate // barriers if necessary. - v_output.image(command_buffer, vTensor::Access::Write), + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_self.image(command_buffer), + v_self.image( + command_buffer, + vTensor::Stage::Compute), // Object lifetime is managed by the resource pool. // It is OK not to keep track of the handle. 
context->resource().pool.uniform(block).object); @@ -68,31 +80,36 @@ Tensor clamp( } Tensor& clamp_( - Tensor& self_arg, - const c10::optional min_value, - const c10::optional max_value) { + Tensor& self, + const c10::optional min, + const c10::optional max) { api::Context* const context = api::context(); TORCH_CHECK( - min_value || max_value, + min || max, "At least one of 'min' or 'max' must not be None"); TORCH_CHECK( - self_arg.is_vulkan(), + self.is_vulkan(), "Vulkan: In-place clamp is only supported on Vulkan tensors."); - vTensor& v_self = convert(self_arg); + vTensor& v_self = convert(self); api::Command::Buffer command_buffer = context->command().pool.allocate(); command_buffer.begin(); { if (v_self.has_image()) { const struct { - float min_value; - float max_value; + uvec3 extents; + uint32_t _; + vec2 clamp; } block { - min_value ? min_value->to() : -std::numeric_limits::infinity(), - max_value ? max_value->to() : std::numeric_limits::infinity(), + v_self.extents(), + 0u, + { + min ? min->to() : -std::numeric_limits::infinity(), + max ? max->to() : std::numeric_limits::infinity(), + }, }; context->dispatch( @@ -105,7 +122,10 @@ Tensor& clamp_( v_self.extents(), // Read-Write access triggers an async synchronization if necessory // and inserts appropriate barriers if hazards are detected. - v_self.image(command_buffer, vTensor::Access::Read | vTensor::Access::Write), + v_self.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Read | vTensor::Access::Write), // Object lifetime is managed by the resource pool. // It is OK not to keep track of the handle. context->resource().pool.uniform(block).object); @@ -117,7 +137,7 @@ Tensor& clamp_( command_buffer.end(); command_buffer.submit(context->gpu().queue); - return self_arg; + return self; } Tensor hardtanh( diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp index 7cf3b4fe5137..c549468aaec8 100644 --- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp +++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp @@ -9,6 +9,8 @@ namespace vulkan { namespace ops { namespace { +using namespace api::utils; + inline bool is_depthwise( const IntArrayRef filter, const int64_t groups) { @@ -64,14 +66,13 @@ vTensor pack_weights( // General // - using namespace api::utils; vTensor v_weight{ api::context(), &pool, { - div_up(src_filter[Layout::Filter::output], 4), - 4 * align_up(src_filter[Layout::Filter::input], 4), + div_up(src_filter[Layout::Filter::output], INT64_C(4)), + 4 * align_up(src_filter[Layout::Filter::input], INT64_C(4)), src_filter[Layout::Filter::height], src_filter[Layout::Filter::width], }, @@ -174,8 +175,8 @@ std::array pack_filter( }; return { - api::utils::align_up(filter[Layout::Filter::output], 4), - api::utils::align_up(filter[Layout::Filter::input], 4), + align_up(filter[Layout::Filter::output], INT64_C(4)), + align_up(filter[Layout::Filter::input], INT64_C(4)), effective( filter[Layout::Filter::height], dilation[Layout::Parameter::height]), @@ -270,8 +271,6 @@ void conv2d_depthwise( const IntArrayRef dilation, const float output_min, const float output_max) { - using namespace api::utils; - if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) { const struct { int32_t kernel_x, kernel_y; @@ -305,16 +304,25 @@ void conv2d_depthwise( v_output.extents(), // Write-only access bypasses synchronization but inserts appropriate // barriers if necessary. 
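For the Clamp.cpp blocks above, the otherwise unused `uint32_t _;` member is there purely for layout: the shader declares `ivec4 size; vec2 clamp;`, so the host struct needs four 32-bit lanes before the two clamp floats. A standalone sketch with stand-in types (not the api::utils ones) that makes the offsets explicit:

#include <cstddef>
#include <cstdint>

struct uvec3_t { uint32_t data[3]; };
struct vec2_t  { float    data[2]; };

struct ClampBlock {
  uvec3_t extents;  // bytes  0..11, read as size.xyz in the shader
  uint32_t _;       // bytes 12..15, the unused fourth lane of ivec4 size
  vec2_t clamp;     // bytes 16..23, read as clamp.x / clamp.y
};

static_assert(offsetof(ClampBlock, clamp) == 16,
              "clamp must start where the shader expects vec2 clamp");
static_assert(sizeof(ClampBlock) == 24,
              "ivec4 size + vec2 clamp is 24 bytes");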
- v_output.image(command_buffer, vTensor::Access::Write), + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_input.image(command_buffer), + v_input.image( + command_buffer, + vTensor::Stage::Compute), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_weight.image(command_buffer), + v_weight.image( + command_buffer, + vTensor::Stage::Compute), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_bias.buffer(command_buffer), + v_bias.buffer( + command_buffer, + vTensor::Stage::Compute), // Object lifetime is managed by the resource pool. // It is OK not to keep track of the handle. context->resource().pool.uniform(block).object); @@ -336,10 +344,8 @@ void conv2d_pointwise( const IntArrayRef padding, const float output_min, const float output_max) { - using namespace api::utils; - if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) { - + vTensor v_weight_reshaped{ context, {1,1, v_weight.sizes()[0], v_weight.sizes()[1]}, @@ -351,8 +357,13 @@ void conv2d_pointwise( temp_command_buffer.begin(); temp_command_buffer.copy( - v_weight.buffer(temp_command_buffer), - v_weight_reshaped.buffer(temp_command_buffer, vTensor::Access::Write) + v_weight.buffer( + temp_command_buffer, + vTensor::Stage::Transfer), + v_weight_reshaped.buffer( + temp_command_buffer, + vTensor::Stage::Transfer, + vTensor::Access::Write) ); temp_command_buffer.end(); @@ -389,16 +400,26 @@ void conv2d_pointwise( v_output.extents(), // Write-only access bypasses synchronization but inserts appropriate // barriers if necessary. - v_output.image(command_buffer, vTensor::Access::Write), + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_input.image(command_buffer), + v_input.image( + command_buffer, + vTensor::Stage::Compute), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_weight_reshaped.image(command_buffer, vTensor::Access::Read), + v_weight_reshaped.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Read), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_bias.buffer(command_buffer), + v_bias.buffer( + command_buffer, + vTensor::Stage::Compute), // Object lifetime is managed by the resource pool. // It is OK not to keep track of the handle. context->resource().pool.uniform(block).object); @@ -421,8 +442,6 @@ void conv2d( const IntArrayRef dilation, const float output_min, const float output_max) { - using namespace api::utils; - if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) { const struct { int32_t kernel_x, kernel_y, kernel_ic, kernel_oc; @@ -458,16 +477,25 @@ void conv2d( v_output.extents(), // Write-only access bypasses synchronization but inserts appropriate // barriers if necessary. - v_output.image(command_buffer, vTensor::Access::Write), + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. 
- v_input.image(command_buffer), + v_input.image( + command_buffer, + vTensor::Stage::Compute), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_weight.image(command_buffer), + v_weight.image( + command_buffer, + vTensor::Stage::Compute), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_bias.buffer(command_buffer), + v_bias.buffer( + command_buffer, + vTensor::Stage::Compute), // Object lifetime is managed by the resource pool. // It is OK not to keep track of the handle. context->resource().pool.uniform(block).object); diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.h b/aten/src/ATen/native/vulkan/ops/Convolution.h index 2bab7091d4ab..7bd27bb1942b 100644 --- a/aten/src/ATen/native/vulkan/ops/Convolution.h +++ b/aten/src/ATen/native/vulkan/ops/Convolution.h @@ -1,5 +1,6 @@ #pragma once -#ifdef USE_VULKAN + +#ifdef USE_VULKAN_API #include #include @@ -96,4 +97,4 @@ c10::intrusive_ptr conv2d_clamp_prepack( } // namespace native } // namespace at -#endif /* USE_VULKAN */ +#endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/ops/Copy.cpp b/aten/src/ATen/native/vulkan/ops/Copy.cpp index 2f74d1be00ab..bbd326b42ace 100644 --- a/aten/src/ATen/native/vulkan/ops/Copy.cpp +++ b/aten/src/ATen/native/vulkan/ops/Copy.cpp @@ -48,13 +48,18 @@ Tensor& copy_(Tensor& self, const Tensor& src) { // are automatically inserted if a RAW hazard is detected. // - Recording any potential pending sync operations into the same // command buffer prevents an expensive queue submission. - convert(src).buffer(command_buffer), + convert(src).buffer( + command_buffer, + vTensor::Stage::Transfer), // - Write-only access never triggers a sync as the contents will be // overwritten regardless. Having said that, appropriate barriers // are inserted automatically if WAR or WAW hazards are detected. // - Recording pending sync operations into the same command buffer // prevents an expensive queue submission. - v_self.buffer(command_buffer, vTensor::Access::Write)); + v_self.buffer( + command_buffer, + vTensor::Stage::Transfer, + vTensor::Access::Write)); command_buffer.end(); command_buffer.submit(api::context()->gpu().queue); diff --git a/aten/src/ATen/native/vulkan/ops/Mean.cpp b/aten/src/ATen/native/vulkan/ops/Mean.cpp index 05e96df722d0..f6d63c14f381 100644 --- a/aten/src/ATen/native/vulkan/ops/Mean.cpp +++ b/aten/src/ATen/native/vulkan/ops/Mean.cpp @@ -1,4 +1,5 @@ #include +#include #include namespace at { @@ -7,38 +8,48 @@ namespace vulkan { namespace ops { namespace { -int64_t normalize_dim(int64_t d, int64_t n) { - return (d % n + n) % n; -} +using namespace api::utils; Tensor mean( const at::Tensor& input_arg, const IntArrayRef dim, const bool keepdim, const optional dtype) { - TORCH_INTERNAL_ASSERT( - input_arg.dim() == 4, "vulkan_mean expects 4-dimensional input"); + TORCH_CHECK( + input_arg.dim() == 4, + "Vulkan mean expects 4-dimensional input!"); + static const std::unordered_set expected_dims_set({2, 3}); std::unordered_set dims_set; + for (const auto& d : dim) { - dims_set.insert(normalize_dim(d, 4)); + dims_set.insert(utils::normalize(d, 4)); } - TORCH_INTERNAL_ASSERT( + + TORCH_CHECK( dims_set == expected_dims_set, - "vulkan_mean currently only supported for image-wide reduction"); + "Vulkan mean currently only supports image-wide reduction!"); + + api::Context* const context = api::context(); + + const Tensor input = input_arg.is_vulkan() ? 
input_arg : input_arg.vulkan(); + const vTensor& v_input = convert(input); + const IntArrayRef v_input_sizes = v_input.sizes(); + + c10::SmallVector output_sizes{ + v_input_sizes[Layout::Activation4D::batch], + v_input_sizes[Layout::Activation4D::channels], + }; - std::vector output_dims{input_arg.sizes()[0], input_arg.sizes()[1]}; if (keepdim) { - output_dims.push_back(1); - output_dims.push_back(1); + output_sizes.push_back(1); + output_sizes.push_back(1); } - api::Context* const context = api::context(); - const vTensor& v_input = convert(input_arg); vTensor v_output{ - context, - output_dims, - input_arg.options(), + context, + output_sizes, + v_input.options(), }; api::Command::Buffer command_buffer = context->command().pool.allocate(); @@ -46,38 +57,72 @@ Tensor mean( { if (v_input.has_image()) { const struct { - uint32_t input_width, input_height; - } block{ - input_arg.sizes()[3], - input_arg.sizes()[2], + uvec3 extents; + int32_t range; + ivec2 iextents; + } block { + v_output.extents(), + safe_downcast( + v_input_sizes[Layout::Activation4D::width] * + v_input_sizes[Layout::Activation4D::height]), + { + safe_downcast(v_input_sizes[Layout::Activation4D::width]), + safe_downcast(v_input_sizes[Layout::Activation4D::height]), + }, }; if (keepdim) { context->dispatch( command_buffer, { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, }, VK_KERNEL(mean), v_output.extents(), - v_output.image(command_buffer, vTensor::Access::Write), - v_input.image(command_buffer)); - } else { + // Write-only access bypasses synchronization but inserts appropriate + // barriers if necessary. + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_input.image( + command_buffer, + vTensor::Stage::Compute), + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. + context->resource().pool.uniform(block).object); + } + else { context->dispatch( command_buffer, { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, }, VK_KERNEL(mean2d), v_output.extents(), - v_output.image(command_buffer, vTensor::Access::Write), - v_input.image(command_buffer), + // Write-only access bypasses synchronization but inserts appropriate + // barriers if necessary. + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_input.image( + command_buffer, + vTensor::Stage::Compute), + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. 
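To summarize the shape handling in this rewritten mean(): only an image-wide reduction over dims {2, 3} of a 4-D NCHW input is supported (negative dims are normalized first), the output is {N, C}, or {N, C, 1, 1} when keepdim is set, and the keepdim flag also selects between the mean and mean2d kernels. A compact restatement of the size rule, with an illustrative helper name:

#include <cstdint>
#include <vector>

std::vector<int64_t> mean_hw_output_sizes(
    const std::vector<int64_t>& input_sizes,  // {N, C, H, W}
    const bool keepdim) {
  std::vector<int64_t> output_sizes{input_sizes[0], input_sizes[1]};
  if (keepdim) {
    output_sizes.push_back(1);
    output_sizes.push_back(1);
  }
  return output_sizes;
}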
context->resource().pool.uniform(block).object); } - } else { + } + else { TORCH_CHECK(false, "Not implemented!"); } } diff --git a/aten/src/ATen/native/vulkan/ops/Mm.cpp b/aten/src/ATen/native/vulkan/ops/Mm.cpp index ca342e70a7b8..e3335b8fc760 100644 --- a/aten/src/ATen/native/vulkan/ops/Mm.cpp +++ b/aten/src/ATen/native/vulkan/ops/Mm.cpp @@ -7,137 +7,129 @@ namespace vulkan { namespace ops { namespace { -vTensor pack_weights(api::Resource::Pool& pool, const Tensor& weight_arg) { - return convert(weight_arg.vulkan()); +using namespace api::utils; + +vTensor pack_weights( + api::Resource::Pool& pool, + const Tensor& weight_arg) { + // Pending Stephen's fix. + const Tensor weight = weight_arg.is_vulkan() ? weight_arg : weight_arg.vulkan(); + return convert(weight); } vTensor pack_biases( api::Resource::Pool& pool, - const c10::optional& bias_arg, - const Tensor& weight_arg) { + const Tensor& weight_arg, + const c10::optional& bias_arg) { if (bias_arg) { - return convert(bias_arg->vulkan()); - } else { - vTensor v_bias{ - api::context(), - &pool, - {weight_arg.size(Layout::Parameter::width)}, - weight_arg.options(), - }; - - using Future = vTensor::Future; - Future v_bias_future = v_bias.host(); - Future::Payload v_bias_payload = v_bias_future.wait(); - - memset( - v_bias_payload.get(), - // 2's complement integers and IEEE-754 floating point numbers both - // have identical bit representations for 0, so can use memset which - // only accepts uint8_t parameter. - 0, - v_bias.nbytes()); - - return v_bias; + return convert( + bias_arg->is_vulkan() ? + *bias_arg : + bias_arg->vulkan()); } -} -bool available(const Tensor& weight, const c10::optional& bias) { - bool valid = true; - if (bias && bias->ndimension() > 1) { - valid = - (bias->sizes()[Layout::Parameter::width] == - weight.sizes()[Layout::Parameter::width]); - } - return api::available() && valid; + vTensor v_bias{ + api::context(), + &pool, + { + weight_arg.size(Layout::Parameter::width), + }, + weight_arg.options(), + }; + + using Future = vTensor::Future; + Future v_bias_future = v_bias.host(); + Future::Payload v_bias_payload = v_bias_future.wait(); + + memset( + v_bias_payload.get(), + // 2's complement integers and IEEE-754 floating point numbers both + // have identical bit representations for 0, so can use memset which + // only accepts uint8_t parameter. + 0, + v_bias.nbytes()); + + return v_bias; } -bool usable( - const Tensor& input, +bool available( const Tensor& weight, const c10::optional& bias) { - return (input.sizes()[Layout::Parameter::width] == - weight.sizes()[Layout::Parameter::height]); + return api::available() && + // Weight + (2 == weight.ndimension()) && + (weight.size(Layout::Parameter::height) > 0) && + (weight.size(Layout::Parameter::width) > 0) && + ((c10::DeviceType::CPU == weight.device().type()) || + (c10::DeviceType::Vulkan == weight.device().type())) && + (kFloat == weight.scalar_type()) && + !weight.requires_grad() && + // Bias + ((bias && bias->defined()) ? ((bias->ndimension() > 0) && + ((c10::DeviceType::CPU == bias->device().type()) || + (c10::DeviceType::Vulkan == bias->device().type())) && + (kFloat == bias->scalar_type()) && + ((bias->ndimension() > 1) ? 
+ (bias->size(Layout::Parameter::width) == + weight.size(Layout::Parameter::width)) + : true) && + !bias->requires_grad()) + : true) && + true; } -void addmm_impl( - api::Context* const context, - api::Command::Buffer& command_buffer, - vTensor& v_output, - const vTensor& v_self, - const vTensor& v_mat1, - const vTensor& v_mat2, - const float beta, - const float alpha) { - if (v_output.has_image() && v_self.has_image() && v_mat1.has_image() && - v_mat2.has_image()) { - const struct { - float alpha, beta; - } block{ - alpha, - beta, - }; - - context->dispatch( - command_buffer, - { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - }, - VK_KERNEL(addmm), - v_output.extents(), - // Write-only access bypasses synchronization but inserts appropriate - // barriers if necessary. - v_output.image(command_buffer, vTensor::Access::Write), - // Read-only access is implied on const tensors and triggers an async - // synchronization if necessary. - v_mat1.image(command_buffer), - // Read-only access is implied on const tensors and triggers an async - // synchronization if necessary. - v_mat2.image(command_buffer), - // Read-only access is implied on const tensors and triggers an async - // synchronization if necessary. - v_self.image(command_buffer), - context->resource().pool.uniform(block).object); - } else { - TORCH_CHECK(false, "Not implemented!"); - } +bool usable( + const Tensor& input, + const Tensor& weight, + const c10::optional& /* bias */) { + return (2 == input.ndimension()) && + (c10::DeviceType::Vulkan == input.device().type()) && + (kFloat == input.scalar_type()) && + (input.size(Layout::Parameter::width) == + weight.size(Layout::Parameter::height)) && + !input.requires_grad() && + true; } Tensor addmm( - const Tensor& self, - const Tensor& mat1, - const Tensor& mat2, + const Tensor& bias, + const Tensor& input, + const Tensor& weight, const Scalar beta, const Scalar alpha) { - return LinearOpContext::create(api::context()->resource().pool, mat2, self) - .run(mat1, beta.to(), alpha.to()); + return LinearOpContext::create( + api::context()->resource().pool, + weight, + bias).run( + input, + alpha.to(), + beta.to()); } -Tensor mm(const Tensor& self_arg, const Tensor& mat2_arg) { +Tensor mm( + const Tensor& mat1_arg, + const Tensor& mat2_arg) { api::Context* const context = api::context(); - const Tensor mat1 = self_arg.is_vulkan() ? self_arg : self_arg.vulkan(); + const Tensor mat1 = mat1_arg.is_vulkan() ? mat1_arg : mat1_arg.vulkan(); const vTensor& v_mat1 = convert(mat1); const Tensor mat2 = mat2_arg.is_vulkan() ? 
mat2_arg : mat2_arg.vulkan(); const vTensor& v_mat2 = convert(mat2); - const auto mat1_sizes = mat1.sizes(); - const auto mat2_sizes = mat2.sizes(); + const auto v_mat1_sizes = v_mat1.sizes(); + const auto v_mat2_sizes = v_mat2.sizes(); TORCH_CHECK( - mat1_sizes[Layout::Parameter::width] == - mat2_sizes[Layout::Parameter::height], + v_mat1_sizes[Layout::Parameter::width] == + v_mat2_sizes[Layout::Parameter::height], "Incompatible matrix dimensions!"); vTensor v_output{ context, { - mat1_sizes[Layout::Parameter::height], - mat2_sizes[Layout::Parameter::width], + v_mat1_sizes[Layout::Parameter::height], + v_mat2_sizes[Layout::Parameter::width], }, mat1.options(), }; @@ -146,24 +138,43 @@ Tensor mm(const Tensor& self_arg, const Tensor& mat2_arg) { command_buffer.begin(); { if (v_mat1.has_image() && v_mat2.has_image()) { + const struct { + uvec3 size; + int32_t K; + } block { + v_output.extents(), + safe_downcast(v_mat1_sizes[Layout::Parameter::width]), + }; + context->dispatch( command_buffer, { VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, }, VK_KERNEL(mm), v_output.extents(), // Write-only access bypasses synchronization but inserts appropriate // barriers if necessary. - v_output.image(command_buffer, vTensor::Access::Write), + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_mat1.image(command_buffer), + v_mat1.image( + command_buffer, + vTensor::Stage::Compute), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_mat2.image(command_buffer)); + v_mat2.image( + command_buffer, + vTensor::Stage::Compute), + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. + context->resource().pool.uniform(block).object); } else { TORCH_CHECK(false, "Not implemented!"); } @@ -191,7 +202,7 @@ LinearOpContext::LinearOpContext( const c10::optional& bias) : packed_{ pack_weights(pool, weight), - pack_biases(pool, bias, weight), + pack_biases(pool, weight, bias), }, unpacked_{ weight, @@ -203,7 +214,12 @@ LinearOpContext LinearOpContext::create( api::Resource::Pool& pool, const Tensor& weight, const c10::optional& bias) { - TORCH_CHECK(available(weight, bias)) + TORCH_CHECK( + available(weight, bias), + "Vulkan Linear not available! " + "Reason: The provided (weight, bias) parameters are either invalid " + "individually or their combination is not supported by Vulkan Impl."); + // Pass in the originals return LinearOpContext{ pool, @@ -212,8 +228,10 @@ LinearOpContext LinearOpContext::create( }; } -Tensor LinearOpContext::run(const Tensor& input_arg, float beta, float alpha) - const { +Tensor LinearOpContext::run( + const Tensor& input_arg, + const float alpha, + const float beta) const { api::Context* const context = api::context(); const Tensor input = input_arg.is_vulkan() ? input_arg : input_arg.vulkan(); @@ -222,12 +240,14 @@ Tensor LinearOpContext::run(const Tensor& input_arg, float beta, float alpha) TORCH_CHECK( usable(input, unpacked_.weight, unpacked_.bias), "Vulkan Linear not usable! 
" - "Reason: The provided input tensor is either invalid or unsupported by Vulkan impl."); + "Reason: The provided input tensor is either invalid on its own, or its " + "combination with the provided weight and bias tensors are unsupported by " + "Vulkan impl."); vTensor v_output{ context, { - input_arg.sizes()[Layout::Parameter::height], + v_input.sizes()[Layout::Parameter::height], packed_.v_weight.sizes()[Layout::Parameter::width], }, input.options(), @@ -236,19 +256,61 @@ Tensor LinearOpContext::run(const Tensor& input_arg, float beta, float alpha) api::Command::Buffer command_buffer = context->command().pool.allocate(); command_buffer.begin(); { - if (input_arg.ndimension() == 2) { - addmm_impl( - context, + if (v_output.has_image() && + v_input.has_image() && + packed_.v_weight.has_image() && + packed_.v_bias.has_image()) { + const struct { + uvec3 size; + int32_t K; + vec2 multiplier; + } block { + v_output.extents(), + safe_downcast(v_input.sizes()[Layout::Parameter::width]), + { + alpha, + beta, + }, + }; + + context->dispatch( command_buffer, - v_output, - packed_.v_bias, - v_input, - packed_.v_weight, - beta, - alpha); - } else { - TORCH_CHECK( - false, "linear_run does not yet support inputs with ndim > 2!") + { + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }, + VK_KERNEL(addmm), + v_output.extents(), + // Write-only access bypasses synchronization but inserts appropriate + // barriers if necessary. + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + v_input.image( + command_buffer, + vTensor::Stage::Compute), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + packed_.v_weight.image( + command_buffer, + vTensor::Stage::Compute), + // Read-only access is implied on const tensors and triggers an async + // synchronization if necessary. + packed_.v_bias.image( + command_buffer, + vTensor::Stage::Compute), + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. 
+ context->resource().pool.uniform(block).object); + } + else { + TORCH_CHECK(false, "Not implemented!"); } } command_buffer.end(); @@ -264,12 +326,14 @@ LinearOpContext::State LinearOpContext::unpack() const { }; } - c10::intrusive_ptr linear_prepack( Tensor&& weight, c10::optional&& bias) { - return c10::make_intrusive(LinearOpContext::create( - persistent()->pool, std::move(weight), std::move(bias))); + return c10::make_intrusive( + LinearOpContext::create( + persistent()->pool, + std::move(weight), + std::move(bias))); } Tensor linear_run( diff --git a/aten/src/ATen/native/vulkan/ops/Mm.h b/aten/src/ATen/native/vulkan/ops/Mm.h index 08c84967d00f..2c389c555a1a 100644 --- a/aten/src/ATen/native/vulkan/ops/Mm.h +++ b/aten/src/ATen/native/vulkan/ops/Mm.h @@ -1,5 +1,6 @@ #pragma once -#ifdef USE_VULKAN + +#ifdef USE_VULKAN_API #include #include @@ -52,4 +53,4 @@ Tensor linear_run( } // namespace native } // namespace at -#endif /* USE_VULKAN */ +#endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/ops/Mul.cpp b/aten/src/ATen/native/vulkan/ops/Mul.cpp index c22456808b82..84226135929a 100644 --- a/aten/src/ATen/native/vulkan/ops/Mul.cpp +++ b/aten/src/ATen/native/vulkan/ops/Mul.cpp @@ -7,6 +7,8 @@ namespace vulkan { namespace ops { namespace { +using namespace api::utils; + Tensor mul_scalar( const Tensor& self_arg, const Scalar other) { @@ -17,8 +19,8 @@ Tensor mul_scalar( vTensor v_output{ context, - self.sizes(), - self.options(), + v_self.sizes(), + v_self.options(), }; api::Command::Buffer command_buffer = context->command().pool.allocate(); @@ -26,8 +28,10 @@ Tensor mul_scalar( { if (v_output.has_image() && v_self.has_image()) { const struct { + uvec3 extents; float other; } block { + v_output.extents(), other.to(), }; @@ -42,10 +46,15 @@ Tensor mul_scalar( v_output.extents(), // Write-only access bypasses synchronization but inserts appropriate // barriers if necessary. - v_output.image(command_buffer, vTensor::Access::Write), + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_self.image(command_buffer), + v_self.image( + command_buffer, + vTensor::Stage::Compute), // Object lifetime is managed by the resource pool. // It is OK not to keep track of the handle. context->resource().pool.uniform(block).object); @@ -61,23 +70,25 @@ Tensor mul_scalar( } Tensor& mul_scalar_( - Tensor& self_arg, + Tensor& self, const Scalar other) { api::Context* const context = api::context(); TORCH_CHECK( - self_arg.is_vulkan(), + self.is_vulkan(), "Vulkan: In-place mul_scalar is only supported on Vulkan tensors."); - vTensor& v_self = convert(self_arg); + vTensor& v_self = convert(self); api::Command::Buffer command_buffer = context->command().pool.allocate(); command_buffer.begin(); { if (v_self.has_image()) { const struct { + uvec3 extents; float other; } block { + v_self.extents(), other.to(), }; @@ -91,7 +102,10 @@ Tensor& mul_scalar_( v_self.extents(), // Read-Write access triggers an async synchronization if necessory // and inserts appropriate barriers if hazards are detected. - v_self.image(command_buffer, vTensor::Access::Read | vTensor::Access::Write), + v_self.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Read | vTensor::Access::Write), // Object lifetime is managed by the resource pool. // It is OK not to keep track of the handle. 
context->resource().pool.uniform(block).object); @@ -103,7 +117,7 @@ Tensor& mul_scalar_( command_buffer.end(); command_buffer.submit(context->gpu().queue); - return self_arg; + return self; } #ifdef USE_VULKAN_API diff --git a/aten/src/ATen/native/vulkan/ops/Persistent.cpp b/aten/src/ATen/native/vulkan/ops/Persistent.cpp index 4b92198b1da2..bea5e97e5021 100644 --- a/aten/src/ATen/native/vulkan/ops/Persistent.cpp +++ b/aten/src/ATen/native/vulkan/ops/Persistent.cpp @@ -6,18 +6,19 @@ namespace vulkan { namespace ops { Persistent* persistent() { - static const std::unique_ptr persistent([]() -> Persistent* { - try { - return new Persistent{ - api::Resource::Pool{ - api::context()->gpu(), - }, - }; - } - catch (...) { - return nullptr; - } - }()); + static const std::unique_ptr persistent( + []() -> Persistent* { + try { + return new Persistent{ + api::Resource::Pool{ + api::context()->gpu(), + }, + }; + } + catch (...) { + return nullptr; + } + }()); TORCH_CHECK( persistent, diff --git a/aten/src/ATen/native/vulkan/ops/Pool.cpp b/aten/src/ATen/native/vulkan/ops/Pool.cpp index 8e853ee538f9..0bc97d6741bc 100644 --- a/aten/src/ATen/native/vulkan/ops/Pool.cpp +++ b/aten/src/ATen/native/vulkan/ops/Pool.cpp @@ -8,12 +8,14 @@ namespace vulkan { namespace ops { namespace { +using namespace api::utils; + Tensor adaptive_avg_pool2d( const at::Tensor& self_arg, const IntArrayRef output_size) { - TORCH_INTERNAL_ASSERT( + TORCH_CHECK( self_arg.dim() == 4, - "vulkan_adaptive_avg_pool2d expects 4-dimensional input!"); + "Vulkan adaptive_avg_pool2d expects 4-dimensional input!"); api::Context* const context = api::context(); @@ -28,27 +30,59 @@ Tensor adaptive_avg_pool2d( output_size[Layout::Activation4D::batch], output_size[Layout::Activation4D::channels], }, - self.options(), + v_self.options(), }; api::Command::Buffer command_buffer = context->command().pool.allocate(); command_buffer.begin(); { if (v_self.has_image()) { + const uvec3 v_output_size = v_output.extents(); + const uvec3 v_self_size = v_self.extents(); + + const vec2 stride { + static_cast(v_self_size.data[0u]) / v_output_size.data[0u], + static_cast(v_self_size.data[1u]) / v_output_size.data[1u], + }; + + const struct { + uvec3 size; + uint32_t _; + vec2 stride; + vec2 kernel; + } block { + v_output.extents(), + 0u, + stride, + { + v_self_size.data[0u] - (v_output_size.data[0u] - 1u) * stride.data[0u], + v_self_size.data[1u] - (v_output_size.data[1u] - 1u) * stride.data[1u], + }, + }; + context->dispatch( command_buffer, { VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, }, VK_KERNEL(adaptive_avg_pool2d), v_output.extents(), // Write-only access bypasses synchronization but inserts appropriate // barriers if necessary. - v_output.image(command_buffer, vTensor::Access::Write), + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_self.image(command_buffer)); + v_self.image( + command_buffer, + vTensor::Stage::Compute), + // Object lifetime is managed by the resource pool. + // It is OK not to keep track of the handle. 
+ context->resource().pool.uniform(block).object); } else { TORCH_CHECK(false, "Not implemented!"); @@ -134,26 +168,43 @@ Tensor avg_pool2d( output_height, output_width, }, - self.options(), + v_self.options(), }; api::Command::Buffer command_buffer = context->command().pool.allocate(); command_buffer.begin(); { - using namespace api::utils; + using namespace utils; if (v_self.has_image()) { const struct { - int32_t kernel_width, kernel_height; - int32_t stride_x, stride_y; - int32_t padding_x, padding_y; + uvec3 extents; + int32_t range; + ivec2 iextents; + ivec2 stride; + ivec2 padding; + ivec2 kernel; } block { - safe_downcast(kernel[Layout::Parameter::width]), - safe_downcast(kernel[Layout::Parameter::height]), - safe_downcast(stride[Layout::Parameter::width]), - safe_downcast(stride[Layout::Parameter::height]), - safe_downcast(padding[Layout::Parameter::width]), - safe_downcast(padding[Layout::Parameter::height]), + v_output.extents(), + safe_downcast( + kernel[Layout::Parameter::width] * + kernel[Layout::Parameter::height]), + { + safe_downcast(self.size(Layout::Activation4D::width)), + safe_downcast(self.size(Layout::Activation4D::height)), + }, + { + safe_downcast(stride[Layout::Parameter::width]), + safe_downcast(stride[Layout::Parameter::height]), + }, + { + safe_downcast(padding[Layout::Parameter::width]), + safe_downcast(padding[Layout::Parameter::height]), + }, + { + safe_downcast(kernel[Layout::Parameter::width]), + safe_downcast(kernel[Layout::Parameter::height]), + }, }; context->dispatch( @@ -167,10 +218,15 @@ Tensor avg_pool2d( v_output.extents(), // Write-only access bypasses synchronization but inserts appropriate // barriers if necessary. - v_output.image(command_buffer, vTensor::Access::Write), + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_self.image(command_buffer), + v_self.image( + command_buffer, + vTensor::Stage::Compute), // Object lifetime is managed by the resource pool. // It is OK not to keep track of the handle. context->resource().pool.uniform(block).object); diff --git a/aten/src/ATen/native/vulkan/ops/RegisterOpContextClass.cpp b/aten/src/ATen/native/vulkan/ops/Register.cpp similarity index 98% rename from aten/src/ATen/native/vulkan/ops/RegisterOpContextClass.cpp rename to aten/src/ATen/native/vulkan/ops/Register.cpp index 699944b7c48e..7b226654af01 100644 --- a/aten/src/ATen/native/vulkan/ops/RegisterOpContextClass.cpp +++ b/aten/src/ATen/native/vulkan/ops/Register.cpp @@ -1,10 +1,10 @@ -#ifdef USE_VULKAN +#ifdef USE_VULKAN_API -#include -#include #include #include #include +#include +#include namespace at { namespace native { @@ -77,4 +77,4 @@ TORCH_LIBRARY_IMPL(vulkan_prepack, Vulkan, m) { } // namespace native } // namespace at -#endif /* USE_VULKAN */ +#endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/ops/Shape.cpp b/aten/src/ATen/native/vulkan/ops/Shape.cpp index 10d6ac24198a..8edfda60b76f 100644 --- a/aten/src/ATen/native/vulkan/ops/Shape.cpp +++ b/aten/src/ATen/native/vulkan/ops/Shape.cpp @@ -9,7 +9,7 @@ namespace { Tensor view( const Tensor& self_arg, - IntArrayRef shape) { + const IntArrayRef shape) { api::Context* const context = api::context(); const Tensor self = self_arg.is_vulkan() ? self_arg : self_arg.vulkan(); @@ -27,10 +27,15 @@ Tensor view( command_buffer.copy( // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. 
- v_self.buffer(command_buffer), + v_self.buffer( + command_buffer, + vTensor::Stage::Transfer), // Write-only access bypasses synchronization but inserts appropriate // barriers if necessary. - v_output.buffer(command_buffer, vTensor::Access::Write)); + v_output.buffer( + command_buffer, + vTensor::Stage::Transfer, + vTensor::Access::Write)); } command_buffer.end(); command_buffer.submit(context->gpu().queue); diff --git a/aten/src/ATen/native/vulkan/ops/Tensor.cpp b/aten/src/ATen/native/vulkan/ops/Tensor.cpp index abd185c1ca07..3570834b8dd4 100644 --- a/aten/src/ATen/native/vulkan/ops/Tensor.cpp +++ b/aten/src/ATen/native/vulkan/ops/Tensor.cpp @@ -6,29 +6,36 @@ namespace vulkan { namespace ops { namespace { -VkDeviceSize bytes( - const IntArrayRef sizes, - const caffe2::TypeMeta dtype) { - VkDeviceSize size = c10::elementSize(c10::typeMetaToScalarType(dtype)); +using namespace api::utils; - // Forward declaration - bool requires_image(IntArrayRef); +VkFormat vk_format(const caffe2::TypeMeta dtype) { + switch (c10::typeMetaToScalarType(dtype)) { + case kFloat: + #ifdef USE_VULKAN_FP16_INFERENCE + return VK_FORMAT_R16G16B16A16_SFLOAT; + #else + return VK_FORMAT_R32G32B32A32_SFLOAT; + #endif /* USE_VULKAN_FP16_INFERENCE */ - if (requires_image(sizes)) { - // Forward declaration - VkExtent3D image_extents(IntArrayRef); - - const VkExtent3D extents = image_extents(sizes); - size *= extents.width * extents.height * (4u * extents.depth); - } - else { - size *= prod_intlist(sizes); + default: + TORCH_CHECK( + false, + "Vulkan tensor format not supported!"); } - return size; + return VK_FORMAT_UNDEFINED; } -vTensor::Access::Flags convert(const VkAccessFlags vk_access) { +VkExtent3D vk_extent(const uvec3& extent) { + return { + extent.data[0u], + extent.data[1u], + extent.data[2u], + }; +} + +vTensor::Access::Flags access( + const VkAccessFlags vk_access) { vTensor::Access::Flags access = 0u; constexpr VkAccessFlags kRead = @@ -55,6 +62,115 @@ vTensor::Access::Flags convert(const VkAccessFlags vk_access) { return access; } +VkAccessFlags vk_access( + const vTensor::Stage::Flags stage, + const vTensor::Access::Flags access) { + VkAccessFlags vk_access = 0u; + + if (access & vTensor::Access::Read) { + if (stage & vTensor::Stage::Compute) { + vk_access |= VK_ACCESS_SHADER_READ_BIT; + } + + if (stage & vTensor::Stage::Host) { + vk_access |= VK_ACCESS_HOST_READ_BIT; + } + + if (stage & vTensor::Stage::Transfer) { + vk_access |= VK_ACCESS_TRANSFER_READ_BIT; + } + } + + if (access & vTensor::Access::Write) { + if (stage & vTensor::Stage::Compute) { + vk_access |= VK_ACCESS_SHADER_WRITE_BIT; + } + + if (stage & vTensor::Stage::Host) { + vk_access |= VK_ACCESS_HOST_WRITE_BIT; + } + + if (stage & vTensor::Stage::Transfer) { + vk_access |= VK_ACCESS_TRANSFER_WRITE_BIT; + } + } + + return vk_access; +} + +VkImageLayout vk_layout( + const vTensor::Stage::Flags stage, + const vTensor::Access::Flags access) { + switch (stage) { + case vTensor::Stage::Compute: + switch (access) { + case vTensor::Access::Read: + return VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + + default: + return VK_IMAGE_LAYOUT_GENERAL; + } break; + + case vTensor::Stage::Transfer: + switch (access) { + case vTensor::Access::Read: + return VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; + + case vTensor::Access::Write: + return VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + + default: + TORCH_INTERNAL_ASSERT(false, "Invalid!"); + } break; + + default: + TORCH_INTERNAL_ASSERT(false, "Invalid!"); + } + + return VK_IMAGE_LAYOUT_UNDEFINED; +} + 
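Taken together, the vk_access and vk_layout helpers above (and the vk_stage helper that follows) provide everything needed to populate a Vulkan image barrier for a stage/access transition. A rough illustration, not part of the patch, assuming only those helpers and the standard Vulkan headers:

// e.g. transitioning an image last written by a transfer so a compute
// shader can read it: (Stage::Transfer, Access::Write) ->
// (Stage::Compute, Access::Read).
VkImageMemoryBarrier image_barrier_sketch(
    const VkImage image,
    const vTensor::Stage::Flags src_stage,
    const vTensor::Access::Flags src_access,
    const vTensor::Stage::Flags dst_stage,
    const vTensor::Access::Flags dst_access) {
  VkImageMemoryBarrier barrier{};
  barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
  barrier.srcAccessMask = vk_access(src_stage, src_access);
  barrier.dstAccessMask = vk_access(dst_stage, dst_access);
  barrier.oldLayout = vk_layout(src_stage, src_access);
  barrier.newLayout = vk_layout(dst_stage, dst_access);
  barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
  barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
  barrier.image = image;
  barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0u, 1u, 0u, 1u};
  // The matching vkCmdPipelineBarrier call would take vk_stage(src_stage)
  // and vk_stage(dst_stage) as its stage masks.
  return barrier;
}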
+VkPipelineStageFlags vk_stage( + const vTensor::Stage::Flags stage) { + VkPipelineStageFlags vk_stage = 0u; + + if (stage & vTensor::Stage::Compute) { + vk_stage |= VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + } + + if (stage & vTensor::Stage::Host) { + vk_stage |= VK_PIPELINE_STAGE_HOST_BIT; + } + + if (stage & vTensor::Stage::Transfer) { + vk_stage |= VK_PIPELINE_STAGE_TRANSFER_BIT; + } + + return vk_stage; +} + +VkDeviceSize buffer_bytes( + const IntArrayRef sizes, + const caffe2::TypeMeta dtype) { + VkDeviceSize size = c10::elementSize(c10::typeMetaToScalarType(dtype)); + + // Forward declaration + bool requires_image(IntArrayRef); + + if (requires_image(sizes)) { + // Forward declaration + uvec3 image_extents(IntArrayRef); + + const uvec3 extents = image_extents(sizes); + size *= extents.data[0u] * extents.data[1u] * (4u * extents.data[2u]); + } + else { + size *= prod_intlist(sizes); + } + + return size; +} + vTensor::Buffer allocate_buffer( const api::Adapter* const adapter, api::Resource::Pool* const pool, @@ -74,35 +190,29 @@ vTensor::Buffer allocate_buffer( // Forward declaration bool requires_staging(const api::Adapter*); - const VkFlags usage = [adapter]() { - VkFlags usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - - if (requires_staging(adapter)) { - usage |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT | - VK_BUFFER_USAGE_TRANSFER_DST_BIT; - } - - return usage; - }(); + const VkFlags usage = + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | + VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_TRANSFER_DST_BIT; const auto memory = [adapter]() -> api::Resource::Memory::Descriptor { if (requires_staging(adapter)) { return { VMA_MEMORY_USAGE_GPU_ONLY, 0u, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + 0u, }; } return { - VMA_MEMORY_USAGE_UNKNOWN, - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + VMA_MEMORY_USAGE_GPU_TO_CPU, + 0u, + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, }; }(); return pool->buffer({ - bytes(sizes, options.dtype()), + buffer_bytes(sizes, options.dtype()), // Usage { usage, @@ -115,7 +225,7 @@ bool requires_image(const IntArrayRef sizes) { return (1u <= sizes.size()) && (sizes.size() <= 4u); } -VkExtent3D image_extents(const IntArrayRef sizes) { +uvec3 image_extents(const IntArrayRef sizes) { int64_t width = 1; int64_t height = 1; int64_t depth = 1; @@ -151,7 +261,7 @@ VkExtent3D image_extents(const IntArrayRef sizes) { return { width, height, - api::utils::div_up(depth, 4), + div_up(depth, INT64_C(4)), }; } @@ -167,7 +277,7 @@ vTensor::Image allocate_image( return pool->image({ VK_IMAGE_TYPE_3D, - api::utils::convert(options.dtype()), + vk_format(options.dtype()), extents, // Usage { @@ -176,13 +286,20 @@ vTensor::Image allocate_image( { VMA_MEMORY_USAGE_GPU_ONLY, 0u, - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + 0u, }, }, // View { VK_IMAGE_VIEW_TYPE_3D, - api::utils::convert(options.dtype()), + vk_format(options.dtype()), + }, + // Sampler + { + VK_FILTER_NEAREST, + VK_SAMPLER_MIPMAP_MODE_NEAREST, + VK_SAMPLER_ADDRESS_MODE_REPEAT, + VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK, }, }); } @@ -212,13 +329,13 @@ vTensor::Buffer allocate_staging( verify(options); return pool->buffer({ - bytes(sizes, options.dtype()), + buffer_bytes(sizes, options.dtype()), // Usage { VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, { - VMA_MEMORY_USAGE_CPU_ONLY, + VMA_MEMORY_USAGE_CPU_COPY, 0u, 0u, }, @@ -247,11 +364,11 @@ Barrier categorize( return Barrier::None; } - const vTensor::Access::Flags src_access = convert(vk_src_access); - const vTensor::Access::Flags dst_access = 
convert(vk_dst_access); + const vTensor::Access::Flags src_access = access(vk_src_access); + const vTensor::Access::Flags dst_access = access(vk_dst_access); - if (vTensor::Access::Read == (src_access & vTensor::Access::Read)) { - if (vTensor::Access::Read == (dst_access & vTensor::Access::Read)) { + if ((src_access & vTensor::Access::Read) == src_access) { + if ((dst_access & vTensor::Access::Read) == dst_access) { // RAR (Read after Read) return Barrier::None; } @@ -303,53 +420,81 @@ vTensor::vTensor( } const vTensor* vTensor::host() const { - view_->staging(Access::Read); + view_->staging(Stage::Host, Access::Read); return this; } vTensor* vTensor::host(const Access::Flags access) { - view_->staging(access); + view_->staging(Stage::Host, access); return this; } -vTensor::Buffer::Object vTensor::buffer() const & { - return view_->buffer(Access::Read).object; +vTensor::Buffer::Object vTensor::buffer( + const Stage::Flags stage) const & { + return view_->buffer( + stage, + Access::Read).object; } vTensor::Buffer::Object vTensor::buffer( + const Stage::Flags stage, const Access::Flags access) & { - return view_->buffer(access).object; + return view_->buffer( + stage, + access).object; } vTensor::Buffer::Object vTensor::buffer( - api::Command::Buffer& command_buffer) const & { - return view_->buffer(command_buffer, Access::Read).object; + api::Command::Buffer& command_buffer, + const Stage::Flags stage) const & { + return view_->buffer( + command_buffer, + stage, + Access::Read).object; } vTensor::Buffer::Object vTensor::buffer( api::Command::Buffer& command_buffer, + const Stage::Flags stage, const Access::Flags access) & { - return view_->buffer(command_buffer, access).object; + return view_->buffer( + command_buffer, + stage, + access).object; } -vTensor::Image::Object vTensor::image() const & { - return view_->image(Access::Read).object; +vTensor::Image::Object vTensor::image( + const Stage::Flags stage) const & { + return view_->image( + stage, + Access::Read).object; } vTensor::Image::Object vTensor::image( + const Stage::Flags stage, const Access::Flags access) & { - return view_->image(access).object; + return view_->image( + stage, + access).object; } vTensor::Image::Object vTensor::image( - api::Command::Buffer& command_buffer) const & { - return view_->image(command_buffer, Access::Read).object; + api::Command::Buffer& command_buffer, + const Stage::Flags stage) const & { + return view_->image( + command_buffer, + stage, + Access::Read).object; } vTensor::Image::Object vTensor::image( api::Command::Buffer& command_buffer, + const Stage::Flags stage, const Access::Flags access) & { - return view_->image(command_buffer, access).object; + return view_->image( + command_buffer, + stage, + access).object; } vTensor::View::View() @@ -399,7 +544,7 @@ vTensor::View::View( class vTensor::View::CMD final { public: - CMD(const View&); + explicit CMD(const View&); CMD(const View&, api::Command::Buffer&); CMD(const CMD&) = delete; CMD& operator=(const CMD&) = delete; @@ -446,9 +591,10 @@ class vTensor::View::CMD final { External, } type; - union { + union _ final { api::Command::Buffer internal; api::Command::Buffer* external; + ~_() {} } command_buffer_; }; @@ -489,7 +635,7 @@ api::Command::Buffer& vTensor::View::CMD::command_buffer() { } void vTensor::View::CMD::barrier(State::Transition transition) { - // Buffer and Staging are just an alias for the same memory location on UMA. + // Buffer and Staging are just an alias for the same memory region on UMA. 
if (view_.state_.is_uma()) { transition.first.buffer.stage |= transition.first.staging.stage; @@ -615,8 +761,6 @@ void vTensor::View::CMD::barrier(State::Transition transition) { barrier.stage.src = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; } - // Optimization opportunity: delay and batch. - command_buffer().barrier(barrier); } } @@ -633,13 +777,13 @@ void vTensor::View::CMD::copy_buffer_to_staging( state.transition({ // Staging { - VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_ACCESS_TRANSFER_WRITE_BIT, + vk_stage(Stage::Transfer), + vk_access(Stage::Transfer, Access::Write), }, // Buffer { - VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_ACCESS_TRANSFER_READ_BIT, + vk_stage(Stage::Transfer), + vk_access(Stage::Transfer, Access::Read), }, // Image {}, @@ -660,13 +804,13 @@ void vTensor::View::CMD::copy_staging_to_buffer( state.transition({ // Staging { - VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_ACCESS_TRANSFER_READ_BIT, + vk_stage(Stage::Transfer), + vk_access(Stage::Transfer, Access::Read), }, // Buffer { - VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_ACCESS_TRANSFER_WRITE_BIT, + vk_stage(Stage::Transfer), + vk_access(Stage::Transfer, Access::Write), }, // Image {}, @@ -689,27 +833,47 @@ void vTensor::View::CMD::copy_buffer_to_image( {}, // Buffer { - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT, + vk_stage(Stage::Compute), + vk_access(Stage::Compute, Access::Read), }, // Image { - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_WRITE_BIT, - VK_IMAGE_LAYOUT_GENERAL, + vk_stage(Stage::Compute), + vk_access(Stage::Compute, Access::Write), + vk_layout(Stage::Compute, Access::Write), }, })); + const uvec3 extents = view_.extents(); + const uint32_t plane = extents.data[0u] * extents.data[1u]; + + const struct { + uvec3 extents; + uint32_t block; + uvec4 offset; + } block { + extents, + 4u * plane, + { + 0u * plane, + 1u * plane, + 2u * plane, + 3u * plane, + }, + }; + view_.context_->dispatch( command_buffer(), { VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, }, VK_KERNEL(nchw_to_image), - view_.extents(), + extents, image, - buffer); + buffer, + view_.context_->resource().pool.uniform(block).object); } void vTensor::View::CMD::copy_image_to_buffer( @@ -726,27 +890,47 @@ void vTensor::View::CMD::copy_image_to_buffer( {}, // Buffer { - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_WRITE_BIT, + vk_stage(Stage::Compute), + vk_access(Stage::Compute, Access::Write), }, // Image { - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_SHADER_READ_BIT, - VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + vk_stage(Stage::Compute), + vk_access(Stage::Compute, Access::Read), + vk_layout(Stage::Compute, Access::Read), }, })); + const uvec3 extents = view_.extents(); + const uint32_t plane = extents.data[0u] * extents.data[1u]; + + const struct { + uvec3 extents; + uint32_t block; + uvec4 offset; + } block { + extents, + 4u * plane, + { + 0u * plane, + 1u * plane, + 2u * plane, + 3u * plane, + }, + }; + view_.context_->dispatch( command_buffer(), { VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, }, VK_KERNEL(image_to_nchw), view_.extents(), image, - buffer); + buffer, + view_.context_->resource().pool.uniform(block).object); } void vTensor::View::CMD::submit(const api::Resource::Fence fence) { @@ -769,9 +953,10 @@ vTensor::Buffer& vTensor::View::buffer() const { } vTensor::Buffer& vTensor::View::buffer( + const Stage::Flags stage, const Access::Flags access) const { CMD 
command_buffer(*this); - Buffer& buffer = this->buffer(command_buffer, access); + Buffer& buffer = this->buffer(command_buffer, stage, access); command_buffer.submit(); return buffer; @@ -779,25 +964,27 @@ vTensor::Buffer& vTensor::View::buffer( vTensor::Buffer& vTensor::View::buffer( api::Command::Buffer& command_buffer_, + const Stage::Flags stage, const Access::Flags access) const { CMD command_buffer(*this, command_buffer_); - return buffer(command_buffer, access); + return buffer(command_buffer, stage, access); } vTensor::Buffer& vTensor::View::buffer( CMD& command_buffer, + const Stage::Flags stage, const Access::Flags access) const { if ((access & Access::Read) && state_.is_dirty(Component::Buffer)) { if (state_.is_clean(Component::Staging)) { command_buffer.copy_staging_to_buffer( state_, - staging(command_buffer, Access::Read).object, + staging(command_buffer, Stage::Transfer, Access::Read).object, buffer().object); } else if (state_.is_clean(Component::Image)) { command_buffer.copy_image_to_buffer( state_, - image(command_buffer, Access::Read).object, + image(command_buffer, Stage::Compute, Access::Read).object, buffer().object); } else { @@ -813,20 +1000,8 @@ vTensor::Buffer& vTensor::View::buffer( {}, // Buffer { - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - [access]() { - VkAccessFlags vk_access = 0u; - - if (access & Access::Read) { - vk_access |= VK_ACCESS_SHADER_READ_BIT; - } - - if (access & Access::Write) { - vk_access |= VK_ACCESS_SHADER_WRITE_BIT; - } - - return vk_access; - }(), + vk_stage(stage), + vk_access(stage, access), }, // Image {}, @@ -845,7 +1020,7 @@ vTensor::Image& vTensor::View::image() const { if (!image_ && state_.is_available(Component::Image)) { image_ = allocate_image( pool_, - extents(), + vk_extent(extents()), options()); } @@ -853,9 +1028,10 @@ vTensor::Image& vTensor::View::image() const { } vTensor::Image& vTensor::View::image( + const Stage::Flags stage, const Access::Flags access) const { CMD command_buffer(*this); - Image& image = this->image(command_buffer, access); + Image& image = this->image(command_buffer, stage, access); command_buffer.submit(); return image; @@ -863,18 +1039,20 @@ vTensor::Image& vTensor::View::image( vTensor::Image& vTensor::View::image( api::Command::Buffer& command_buffer_, + const Stage::Flags stage, const Access::Flags access) const { CMD command_buffer(*this, command_buffer_); - return image(command_buffer, access); + return image(command_buffer, stage, access); } vTensor::Image& vTensor::View::image( CMD& command_buffer, + const Stage::Flags stage, const Access::Flags access) const { if ((access & Access::Read) && state_.is_dirty(Component::Image)) { command_buffer.copy_buffer_to_image( state_, - buffer(command_buffer, Access::Read).object, + buffer(command_buffer, stage, Access::Read).object, image().object); } @@ -886,27 +1064,9 @@ vTensor::Image& vTensor::View::image( {}, // Image { - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - [access]() { - VkAccessFlags vk_access = 0u; - - if (access & Access::Read) { - vk_access |= VK_ACCESS_SHADER_READ_BIT; - } - - if (access & Access::Write) { - vk_access |= VK_ACCESS_SHADER_WRITE_BIT; - } - - return vk_access; - }(), - [access]() { - if (Access::Read == (access & Access::Read)) { - return VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - } - - return VK_IMAGE_LAYOUT_GENERAL; - }(), + vk_stage(stage), + vk_access(stage, access), + vk_layout(stage, access), }, })); @@ -935,9 +1095,11 @@ vTensor::Buffer& vTensor::View::staging() const { return staging_; } -vTensor::Buffer& 
vTensor::View::staging(const Access::Flags access) const { +vTensor::Buffer& vTensor::View::staging( + const Stage::Flags stage, + const Access::Flags access) const { CMD command_buffer(*this); - Buffer& staging = this->staging(command_buffer, access); + Buffer& staging = this->staging(command_buffer, stage, access); command_buffer.submit(fence()); return staging; @@ -945,11 +1107,12 @@ vTensor::Buffer& vTensor::View::staging(const Access::Flags access) const { vTensor::Buffer& vTensor::View::staging( CMD& command_buffer, + const Stage::Flags stage, const Access::Flags access) const { if ((access & Access::Read) && state_.is_dirty(Component::Staging)) { command_buffer.copy_buffer_to_staging( state_, - buffer(command_buffer, Access::Read).object, + buffer(command_buffer, Stage::Transfer, Access::Read).object, staging().object); } @@ -957,20 +1120,8 @@ vTensor::Buffer& vTensor::View::staging( state_.transition({ // Staging { - VK_PIPELINE_STAGE_HOST_BIT, - [access]() { - VkAccessFlags vk_access = 0u; - - if (access & Access::Read) { - vk_access |= VK_ACCESS_HOST_READ_BIT; - } - - if (access & Access::Write) { - vk_access |= VK_ACCESS_HOST_WRITE_BIT; - } - - return vk_access; - }(), + vk_stage(stage), + vk_access(stage, access), }, // Buffer {}, diff --git a/aten/src/ATen/native/vulkan/ops/Tensor.h b/aten/src/ATen/native/vulkan/ops/Tensor.h index ca4187d12b7d..48d4cca84dd4 100644 --- a/aten/src/ATen/native/vulkan/ops/Tensor.h +++ b/aten/src/ATen/native/vulkan/ops/Tensor.h @@ -85,6 +85,7 @@ class vTensor final { Types */ + typedef api::Pipeline::Stage Stage; typedef api::Resource::Memory::Access Access; typedef api::Resource::Buffer Buffer; typedef api::Resource::Fence Fence; @@ -132,6 +133,9 @@ class vTensor final { Payload wait() const &; private: + template + friend class Future; + // Intentionally disabed to enforce a usage pattern wherein the Future's // lifetime exceeds that of the Payload as we use the Future's destructor // to eagerly (as opposed to lazily and upon first use) upload the @@ -139,10 +143,6 @@ class vTensor final { Payload wait() const && = delete; - private: - template - friend class Future; - private: const vTensor* tensor_; }; @@ -178,22 +178,22 @@ class vTensor final { predictability of usage and efficiency. 
*/ - Buffer::Object buffer() const &; - Buffer::Object buffer(Access::Flags access) &; - Buffer::Object buffer(api::Command::Buffer&) const &; - Buffer::Object buffer(api::Command::Buffer&, Access::Flags) &; + Buffer::Object buffer(Stage::Flags) const &; + Buffer::Object buffer(Stage::Flags, Access::Flags) &; + Buffer::Object buffer(api::Command::Buffer&, Stage::Flags) const &; + Buffer::Object buffer(api::Command::Buffer&, Stage::Flags, Access::Flags) &; bool has_image() const; - Image::Object image() const &; - Image::Object image(Access::Flags access) &; - Image::Object image(api::Command::Buffer&) const &; - Image::Object image(api::Command::Buffer&, Access::Flags) &; + Image::Object image(Stage::Flags) const &; + Image::Object image(Stage::Flags, Access::Flags) &; + Image::Object image(api::Command::Buffer&, Stage::Flags) const &; + Image::Object image(api::Command::Buffer&, Stage::Flags, Access::Flags) &; /* Metadata */ - const VkExtent3D& extents() const; + const api::utils::uvec3& extents() const; const TensorOptions& options() const; IntArrayRef sizes() const; IntArrayRef strides() const; @@ -223,15 +223,15 @@ class vTensor final { Device */ - Buffer::Object buffer() const && = delete; - Buffer::Object buffer(Access::Flags) && = delete; - Buffer::Object buffer(api::Command::Buffer&) const && = delete; - Buffer::Object buffer(api::Command::Buffer&, Access::Flags) && = delete; + Buffer::Object buffer(Stage::Flags) const && = delete; + Buffer::Object buffer(Stage::Flags, Access::Flags) && = delete; + Buffer::Object buffer(api::Command::Buffer&, Stage::Flags) const && = delete; + Buffer::Object buffer(api::Command::Buffer&, Stage::Flags, Access::Flags) && = delete; - Image::Object image() const && = delete; - Image::Object image(Access::Flags) && = delete; - Image::Object image(api::Command::Buffer&) const && = delete; - Image::Object image(api::Command::Buffer&, Access::Flags) && = delete; + Image::Object image(Stage::Flags) const && = delete; + Image::Object image(Stage::Flags, Access::Flags) && = delete; + Image::Object image(api::Command::Buffer&, Stage::Flags) const && = delete; + Image::Object image(api::Command::Buffer&, Stage::Flags, Access::Flags) && = delete; private: class View final { @@ -248,18 +248,30 @@ class vTensor final { View operator=(View&&) = delete; ~View() = default; - Buffer& buffer(Access::Flags) const; - Buffer& buffer(api::Command::Buffer&, Access::Flags) const; + /* + Device + */ + + Buffer& buffer(Stage::Flags, Access::Flags) const; + Buffer& buffer(api::Command::Buffer&, Stage::Flags, Access::Flags) const; bool has_image() const; - Image& image(Access::Flags) const; - Image& image(api::Command::Buffer&, Access::Flags) const; + Image& image(Stage::Flags, Access::Flags) const; + Image& image(api::Command::Buffer&, Stage::Flags, Access::Flags) const; - Buffer& staging(Access::Flags) const; - Buffer& staging(api::Command::Buffer&, Access::Flags) const; + /* + Host + */ + + Buffer& staging(Stage::Flags, Access::Flags) const; + Buffer& staging(api::Command::Buffer&, Stage::Flags, Access::Flags) const; vTensor::Memory& wait() const; - const VkExtent3D& extents() const; + /* + Metadata + */ + + const api::utils::uvec3& extents() const; const TensorOptions& options() const; IntArrayRef sizes() const; IntArrayRef strides() const; @@ -326,11 +338,11 @@ class vTensor final { private: // Accessors / Lazy Allocation Buffer& buffer() const; - Buffer& buffer(CMD&, Access::Flags) const; + Buffer& buffer(CMD&, Stage::Flags, Access::Flags) const; Image& image() const; - 
Image& image(CMD&, Access::Flags) const; + Image& image(CMD&, Stage::Flags, Access::Flags) const; Buffer& staging() const; - Buffer& staging(CMD&, Access::Flags) const; + Buffer& staging(CMD&, Stage::Flags, Access::Flags) const; Fence& fence() const; // Validation @@ -351,7 +363,7 @@ class vTensor final { mutable State state_; // Metadata - VkExtent3D extents_; + api::utils::uvec3 extents_; TensorOptions options_; c10::SmallVector sizes_; c10::SmallVector strides_; @@ -486,7 +498,7 @@ inline bool vTensor::has_image() const { return view_->has_image(); } -inline const VkExtent3D& vTensor::extents() const { +inline const api::utils::uvec3& vTensor::extents() const { return view_->extents(); } @@ -511,7 +523,7 @@ inline bool vTensor::View::has_image() const { return state_.is_available(View::Component::Image); } -inline const VkExtent3D& vTensor::View::extents() const { +inline const api::utils::uvec3& vTensor::View::extents() const { return extents_; } diff --git a/aten/src/ATen/native/vulkan/ops/Upsample.cpp b/aten/src/ATen/native/vulkan/ops/Upsample.cpp index e38e367c7241..32508c01eec1 100644 --- a/aten/src/ATen/native/vulkan/ops/Upsample.cpp +++ b/aten/src/ATen/native/vulkan/ops/Upsample.cpp @@ -8,6 +8,8 @@ namespace vulkan { namespace ops { namespace { +using namespace api::utils; + Tensor upsample_nearest2d( const Tensor& input_arg, const IntArrayRef output_sizes, @@ -17,18 +19,17 @@ Tensor upsample_nearest2d( const Tensor input = input_arg.is_vulkan() ? input_arg : input_arg.vulkan(); const vTensor& v_input = convert(input); - - const auto input_sizes = input.sizes(); + const auto v_input_sizes = v_input.sizes(); TORCH_CHECK( - (4 == input_sizes.size()) && (2 == output_sizes.size()), + (4 == v_input_sizes.size()) && (2 == output_sizes.size()), "Invalid input!"); vTensor v_output{ context, { - input_sizes[Layout::Activation4D::batch], - input_sizes[Layout::Activation4D::channels], + v_input_sizes[Layout::Activation4D::batch], + v_input_sizes[Layout::Activation4D::channels], output_sizes[Layout::Parameter::height], output_sizes[Layout::Parameter::width], }, @@ -40,16 +41,27 @@ Tensor upsample_nearest2d( { if (v_input.has_image()) { const struct { - float scale_x, scale_y; + uvec3 extents; + uint32_t _; + ivec2 iextents; + vec2 scale; } block { - compute_scales_value( - scales_w, - input_sizes[Layout::Activation4D::width], - output_sizes[Layout::Parameter::width]), - compute_scales_value( - scales_h, - input_sizes[Layout::Activation4D::height], - output_sizes[Layout::Parameter::height]), + v_output.extents(), + 0u, + { + safe_downcast(input.size(Layout::Activation4D::width) - 1), + safe_downcast(input.size(Layout::Activation4D::height) - 1), + }, + { + compute_scales_value( + scales_w, + v_input_sizes[Layout::Activation4D::width], + output_sizes[Layout::Parameter::width]), + compute_scales_value( + scales_h, + v_input_sizes[Layout::Activation4D::height], + output_sizes[Layout::Parameter::height]), + }, }; context->dispatch( @@ -63,10 +75,15 @@ Tensor upsample_nearest2d( v_output.extents(), // Write-only access bypasses synchronization but inserts appropriate // barriers if necessary. - v_output.image(command_buffer, vTensor::Access::Write), + v_output.image( + command_buffer, + vTensor::Stage::Compute, + vTensor::Access::Write), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_input.image(command_buffer), + v_input.image( + command_buffer, + vTensor::Stage::Compute), // Object lifetime is managed by the resource pool. 
// It is OK not to keep track of the handle. context->resource().pool.uniform(block).object); diff --git a/aten/src/ATen/native/vulkan/ops/Utils.h b/aten/src/ATen/native/vulkan/ops/Utils.h new file mode 100644 index 000000000000..ffdc2b6e94eb --- /dev/null +++ b/aten/src/ATen/native/vulkan/ops/Utils.h @@ -0,0 +1,25 @@ +#pragma once + +#ifdef USE_VULKAN_API + +#include + +namespace at { +namespace native { +namespace vulkan { +namespace ops { +namespace utils { + +int64_t normalize( + const int64_t dimension, + const int64_t n) { + return (dimension % n + n) % n; +} + +} // namespace utils +} // namespace ops +} // namespace vulkan +} // namespace native +} // namespace at + +#endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index 70618dc8df84..4bf9bf46a965 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -77,12 +77,8 @@ list(APPEND ATen_HIP_TEST_SRCS # ${CMAKE_CURRENT_SOURCE_DIR}/hip/hip_stream_test.cpp list(APPEND ATen_VULKAN_TEST_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_test.cpp) - -if(USE_VULKAN_API) - list(APPEND ATen_VULKAN_TEST_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_api_test.cpp) -endif() + ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_api_test.cpp) list(APPEND ATen_MOBILE_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test_all_types.cpp diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index 03105eec90ea..73b221c81b9d 100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp +++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -14,7 +14,13 @@ bool checkRtol(const at::Tensor& diff, const std::vector& inputs) { maxValue = fmax(tensor.abs().max().item(), maxValue); } - return diff.abs().max().item() < (2e-6 * maxValue); +#ifdef USE_VULKAN_FP16_INFERENCE + constexpr float tolerance = 1e-2; +#else + constexpr float tolerance = 1e-5; +#endif + + return diff.abs().max().item() < (tolerance * maxValue); } bool almostEqual(const at::Tensor& a, const at::Tensor& b) { @@ -500,11 +506,11 @@ TEST(VulkanAPITest, empty) { } TEST(VulkanAPITest, mean) { - const auto in_cpu = at::rand({5, 3, 9, 9}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - const auto out_cpu = at::mean(in_cpu, {-1, -2}, false); + const auto in_cpu = at::rand({17, 3, 79, 53}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); + const auto out_cpu = at::mean(in_cpu, {-1, -2}, true); const auto in_vulkan = in_cpu.vulkan(); - const auto out_vulkan = at::mean(in_vulkan, {-1, -2}, false); + const auto out_vulkan = at::mean(in_vulkan, {-1, -2}, true); const auto check = almostEqual(out_cpu, out_vulkan.cpu()); if (!check) { @@ -515,12 +521,12 @@ TEST(VulkanAPITest, mean) { ASSERT_TRUE(check); } -TEST(VulkanAPITest, mean_keep_dim) { - const auto in_cpu = at::rand({10, 3, 21, 21}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - const auto out_cpu = at::mean(in_cpu, {-1, -2}, true); +TEST(VulkanAPITest, mean2d) { + const auto in_cpu = at::rand({11, 7, 173, 37}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); + const auto out_cpu = at::mean(in_cpu, {-1, -2}, false); const auto in_vulkan = in_cpu.vulkan(); - const auto out_vulkan = at::mean(in_vulkan, {-1, -2}, true); + const auto out_vulkan = at::mean(in_vulkan, {-1, -2}, false); const auto check = almostEqual(out_cpu, out_vulkan.cpu()); if (!check) { @@ -730,7 +736,7 @@ class Conv2d final : public BaseOp { stride_(stride), padding_(padding), w_(at::rand(wsizes, at::device(at::kCPU).dtype(at::kFloat))), - 
b_(at::zeros(wsizes[0], at::device(at::kCPU).dtype(at::kFloat))){ + b_(at::rand(wsizes[0], at::device(at::kCPU).dtype(at::kFloat))){ } at::Tensor run(at::Tensor& t) const override { @@ -850,7 +856,6 @@ class MobileNetV2 final : public OpsList { ops_.emplace_back(new Hardtanh_()); ops_.emplace_back(new Conv2d({384, 1, 3, 3}, 384, 1, 1)); ops_.emplace_back(new Hardtanh_()); - ops_.emplace_back(new Hardtanh_()); ops_.emplace_back(new Conv2d({64, 384, 1, 1}, 1, 1, 0)); ops_.emplace_back(new Conv2d({384, 64, 1, 1}, 1, 1, 0)); ops_.emplace_back(new Hardtanh_()); diff --git a/aten/src/ATen/test/vulkan_test.cpp b/aten/src/ATen/test/vulkan_test.cpp index d5483a7327b1..7c4e96f7f1a6 100644 --- a/aten/src/ATen/test/vulkan_test.cpp +++ b/aten/src/ATen/test/vulkan_test.cpp @@ -1,3 +1,5 @@ +#ifndef USE_VULKAN_API + #include #include @@ -938,3 +940,5 @@ TEST(VulkanTest, avg_pool2d) { } ASSERT_TRUE(check); } + +#endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/vulkan/Context.cpp b/aten/src/ATen/vulkan/Context.cpp index 8d2b5281d2ae..793c690a0c14 100644 --- a/aten/src/ATen/vulkan/Context.cpp +++ b/aten/src/ATen/vulkan/Context.cpp @@ -3,6 +3,10 @@ #include #include +#ifdef USE_VULKAN_API +#include +#endif /* USE_VULKAN_API */ + namespace at { namespace vulkan { @@ -23,8 +27,12 @@ at::Tensor& vulkan_copy_(at::Tensor& self, const at::Tensor& src) { namespace native { bool is_vulkan_available() { +#ifdef USE_VULKAN_API + return native::vulkan::api::available(); +#else auto p = at::vulkan::g_vulkan_impl_registry.load(); return p ? p->is_vulkan_available() : false; +#endif } } // namespace native diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 7c068bb1e842..92015c269083 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -137,6 +137,12 @@ function(caffe2_print_configuration_summary) message(STATUS " USE_OPENMP : ${USE_OPENMP}") message(STATUS " USE_TBB : ${USE_TBB}") message(STATUS " USE_VULKAN : ${USE_VULKAN}") + if(${USE_VULKAN}) + message(STATUS " USE_VULKAN_FP16_INFERENCE : ${USE_VULKAN_FP16_INFERENCE}") + message(STATUS " USE_VULKAN_RELAXED_PRECISION : ${USE_VULKAN_RELAXED_PRECISION}") + message(STATUS " USE_VULKAN_SHADERC_RUNTIME : ${USE_VULKAN_SHADERC_RUNTIME}") + message(STATUS " USE_VULKAN_WRAPPER : ${USE_VULKAN_WRAPPER}") + endif() message(STATUS " USE_PROF : ${USE_PROF}") message(STATUS " USE_QNNPACK : ${USE_QNNPACK}") message(STATUS " USE_PYTORCH_QNNPACK : ${USE_PYTORCH_QNNPACK}")
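
Reviewer note (not part of the patch): after this change every device-side vTensor accessor takes an explicit pipeline stage (Stage::Compute, Stage::Transfer, Stage::Host) alongside the access mask, and the new vk_stage / vk_access / vk_layout helpers derive the matching VkPipelineStageFlags, VkAccessFlags and VkImageLayout from that pair instead of hard-coding compute-shader bits. The sketch below shows the recording pattern an op follows under the new API, mirroring the avg_pool2d and upsample_nearest2d call sites above; the kernel name, descriptor-type list, uniform block layout and the function example_op itself are illustrative placeholders, not code from the patch.

    // Sketch only: assumes this lives inside at::native::vulkan::ops, that
    // v_input / v_output are existing vTensors, and that VK_KERNEL(example_op)
    // names a shader whose layout matches the descriptor-type list below.
    void example_op(api::Context* const context,
                    const vTensor& v_input,
                    vTensor& v_output) {
      api::Command::Buffer command_buffer = context->command().pool.allocate();
      command_buffer.begin();
      {
        const struct {
          api::utils::uvec3 extents;  // whatever parameters the shader expects
        } block {
          v_output.extents(),
        };

        context->dispatch(
            command_buffer,
            {
              VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,           // v_output
              VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,  // v_input
              VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER,          // block
            },
            VK_KERNEL(example_op),
            v_output.extents(),
            // Write-only access at the compute stage: no wait on prior work,
            // but barriers are recorded for later readers if necessary.
            v_output.image(
                command_buffer,
                vTensor::Stage::Compute,
                vTensor::Access::Write),
            // Read access at the compute stage: synchronizes against prior
            // writers (host, transfer or compute) before the dispatch runs.
            v_input.image(
                command_buffer,
                vTensor::Stage::Compute),
            // The uniform buffer's lifetime is owned by the resource pool.
            context->resource().pool.uniform(block).object);
      }
      command_buffer.end();
      command_buffer.submit(context->gpu().queue);
    }

Passing the stage explicitly is what lets the barrier logic in Tensor.cpp pick the correct source and destination pipeline stages for host reads, transfer copies and compute dispatches, rather than assuming everything happens at the compute-shader stage.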
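
Reviewer note (not part of the patch): the nchw_to_image and image_to_nchw dispatches now receive a third, uniform-buffer argument describing the channel packing. For image extents {W, H, D} (where the last extent is the div_up-by-4 of the plane count computed in image_extents), plane = W * H is the texel count of one feature-map plane, block = 4 * plane is the span covered by one RGBA slice, and offset = {0, 1, 2, 3} * plane are the starting positions of the four planes packed into that slice. The standalone snippet below merely recomputes those host-side values for a made-up extent, as a reading aid.

    #include <array>
    #include <cstdint>
    #include <iostream>

    int main() {
      // Example image extents {W, H, D}; D is the number of 4-channel slices.
      const std::array<uint32_t, 3> extents{3u, 5u, 2u};

      const uint32_t plane = extents[0] * extents[1];  // texels per plane
      const uint32_t block = 4u * plane;               // texels per RGBA slice
      const std::array<uint32_t, 4> offset{
          0u * plane, 1u * plane, 2u * plane, 3u * plane};

      std::cout << "plane=" << plane << " block=" << block << " offsets={"
                << offset[0] << "," << offset[1] << ","
                << offset[2] << "," << offset[3] << "}\n";
      return 0;
    }

This is also why buffer_bytes multiplies width * height * (4 * depth): the backing buffer stores four packed channels per texel of the 3D image.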