Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

vk: Batch query copy requests to reduce number of vulkan commands used #14032

Merged
merged 2 commits into from Jun 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
48 changes: 45 additions & 3 deletions rpcs3/Emu/RSX/VK/VKGSRender.cpp
Expand Up @@ -2894,7 +2894,7 @@ void VKGSRender::begin_conditional_rendering(const std::vector<rsx::reports::occ
if (!query_info.indices.empty())
{
const auto& index = query_info.indices.front();
m_occlusion_query_manager->get_query_result_indirect(*m_current_command_buffer, index, m_cond_render_buffer->value, 0);
m_occlusion_query_manager->get_query_result_indirect(*m_current_command_buffer, index, 1, m_cond_render_buffer->value, 0);

vk::insert_buffer_memory_barrier(*m_current_command_buffer, m_cond_render_buffer->value, 0, 4,
VK_PIPELINE_STAGE_TRANSFER_BIT, dst_stage,
Expand All @@ -2912,16 +2912,58 @@ void VKGSRender::begin_conditional_rendering(const std::vector<rsx::reports::occ
{
// We'll need to do some result aggregation using a compute shader.
auto scratch = vk::get_scratch_buffer(*m_current_command_buffer, num_hw_queries * 4);

// Range latching. Because of how the query pool manages allocations using a stack, we get an inverse sequential set of handles/indices that we can easily group together.
// This drastically boosts performance on some drivers like the NVIDIA proprietary one that seems to have a rather high cost for every individual query transfer command.
struct { u32 first, last; } query_range = { umax, 0 };

auto copy_query_range_impl = [&]()
{
const auto count = (query_range.last - query_range.first + 1);
m_occlusion_query_manager->get_query_result_indirect(*m_current_command_buffer, query_range.first, count, scratch->value, dst_offset);
dst_offset += count * 4;
};

for (usz i = first; i < last; ++i)
{
auto& query_info = m_occlusion_map[sources[i]->driver_handle];
for (const auto& index : query_info.indices)
{
m_occlusion_query_manager->get_query_result_indirect(*m_current_command_buffer, index, scratch->value, dst_offset);
dst_offset += 4;
// First iteration?
if (query_range.first == umax)
{
query_range = { index, index };
continue;
}

// Head?
if ((query_range.first - 1) == index)
{
query_range.first = index;
continue;
}

// Tail?
if ((query_range.last + 1) == index)
{
query_range.last = index;
continue;
}

// Flush pending queue. In practice, this is never reached and we fall out to the spill block outside the loops
copy_query_range_impl();

// Start a new range for the current index
query_range = { index, index };
}
}

if (query_range.first != umax)
{
// Dangling queries, flush
copy_query_range_impl();
}

// Sanity check
ensure(dst_offset <= scratch->size());

Expand Down
4 changes: 2 additions & 2 deletions rpcs3/Emu/RSX/VK/VKQueryPool.cpp
Expand Up @@ -168,11 +168,11 @@ namespace vk
return query_info.data;
}

void query_pool_manager::get_query_result_indirect(vk::command_buffer& cmd, u32 index, VkBuffer dst, VkDeviceSize dst_offset)
void query_pool_manager::get_query_result_indirect(vk::command_buffer& cmd, u32 index, u32 count, VkBuffer dst, VkDeviceSize dst_offset)
{
// We're technically supposed to stop any active renderpasses before streaming the results out, but that doesn't matter on IMR hw
// On TBDR setups like the apple M series, the stop is required (results are all 0 if you don't flush the RP), but this introduces a very heavy performance loss.
vkCmdCopyQueryPoolResults(cmd, *query_slot_status[index].pool, index, 1, dst, dst_offset, 4, VK_QUERY_RESULT_WAIT_BIT);
vkCmdCopyQueryPoolResults(cmd, *query_slot_status[index].pool, index, count, dst, dst_offset, 4, VK_QUERY_RESULT_WAIT_BIT);
}

void query_pool_manager::free_query(vk::command_buffer&/*cmd*/, u32 index)
Expand Down
2 changes: 1 addition & 1 deletion rpcs3/Emu/RSX/VK/VKQueryPool.h
Expand Up @@ -47,7 +47,7 @@ namespace vk

bool check_query_status(u32 index);
u32 get_query_result(u32 index);
void get_query_result_indirect(vk::command_buffer& cmd, u32 index, VkBuffer dst, VkDeviceSize dst_offset);
void get_query_result_indirect(vk::command_buffer& cmd, u32 index, u32 count, VkBuffer dst, VkDeviceSize dst_offset);

u32 allocate_query(vk::command_buffer& cmd);
void free_query(vk::command_buffer&/*cmd*/, u32 index);
Expand Down