Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

rsx: Vertex cache improvements #13985

Merged
merged 8 commits into from Jun 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2,551 changes: 2,551 additions & 0 deletions 3rdparty/robin_hood/include/robin_hood.h

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions rpcs3/Emu/RSX/Common/unordered_map.hpp
@@ -0,0 +1,19 @@
#pragma once

// Selects the hash-map implementation used by RSX code.
//
// Defining RSX_USE_STD_MAP before including this header makes
// rsx::unordered_map an alias of std::unordered_map; otherwise it is an
// alias of robin_hood::unordered_map from the bundled 3rdparty header.
#if defined(RSX_USE_STD_MAP)
#include <unordered_map>
#else
#include "3rdparty/robin_hood/include/robin_hood.h"
#endif

namespace rsx
{
	// Two-parameter map alias. Only the key and mapped types are forwarded,
	// so hashing and key comparison use the selected container's defaults.
#if defined(RSX_USE_STD_MAP)
	template <typename Key, typename Value>
	using unordered_map = std::unordered_map<Key, Value>;
#else
	template <typename Key, typename Value>
	using unordered_map = ::robin_hood::unordered_map<Key, Value>;
#endif
}
3 changes: 3 additions & 0 deletions rpcs3/Emu/RSX/Core/RSXDisplay.h
Expand Up @@ -16,6 +16,9 @@ namespace rsx
s64 textures_upload_time;
s64 draw_exec_time;
s64 flip_time;

u32 vertex_cache_request_count;
u32 vertex_cache_miss_count;
};

struct frame_time_t
Expand Down
2 changes: 1 addition & 1 deletion rpcs3/Emu/RSX/GL/GLGSRender.cpp
Expand Up @@ -38,7 +38,7 @@ GLGSRender::GLGSRender(utils::serial* ar) noexcept : GSRender(ar)
{
m_shaders_cache = std::make_unique<gl::shader_cache>(m_prog_buffer, "opengl", "v1.94");

if (g_cfg.video.disable_vertex_cache || g_cfg.video.multithreaded_rsx)
if (g_cfg.video.disable_vertex_cache)
m_vertex_cache = std::make_unique<gl::null_vertex_cache>();
else
m_vertex_cache = std::make_unique<gl::weak_vertex_cache>();
Expand Down
4 changes: 2 additions & 2 deletions rpcs3/Emu/RSX/GL/GLGSRender.h
Expand Up @@ -20,8 +20,8 @@

namespace gl
{
using vertex_cache = rsx::vertex_cache::default_vertex_cache<rsx::vertex_cache::uploaded_range<GLenum>, GLenum>;
using weak_vertex_cache = rsx::vertex_cache::weak_vertex_cache<GLenum>;
using vertex_cache = rsx::vertex_cache::default_vertex_cache<rsx::vertex_cache::uploaded_range>;
using weak_vertex_cache = rsx::vertex_cache::weak_vertex_cache;
using null_vertex_cache = vertex_cache;

using shader_cache = rsx::shaders_cache<void*, GLProgramBuffer>;
Expand Down
7 changes: 7 additions & 0 deletions rpcs3/Emu/RSX/GL/GLPresent.cpp
Expand Up @@ -364,10 +364,17 @@ void GLGSRender::flip(const rsx::display_flip_info_t& info)
const auto num_texture_upload = m_gl_texture_cache.get_texture_upload_calls_this_frame();
const auto num_texture_upload_miss = m_gl_texture_cache.get_texture_upload_misses_this_frame();
const auto texture_upload_miss_ratio = m_gl_texture_cache.get_texture_upload_miss_percentage();

println(fmt::format("Unreleased textures: %7d", num_dirty_textures));
println(fmt::format("Texture memory: %12dM", texture_memory_size));
println(fmt::format("Flush requests: %12d = %2d (%3d%%) hard faults, %2d unavoidable, %2d misprediction(s), %2d speculation(s)", num_flushes, num_misses, cache_miss_ratio, num_unavoidable, num_mispredict, num_speculate));
println(fmt::format("Texture uploads: %15u (%u from CPU - %02u%%)", num_texture_upload, num_texture_upload_miss, texture_upload_miss_ratio));

const auto vertex_cache_hit_count = (info.stats.vertex_cache_request_count - info.stats.vertex_cache_miss_count);
const auto vertex_cache_hit_ratio = info.stats.vertex_cache_request_count
? (vertex_cache_hit_count * 100) / info.stats.vertex_cache_request_count
: 0;
println(fmt::format("Vertex cache hits: %12u/%u (%u%%)", vertex_cache_hit_count, info.stats.vertex_cache_request_count, vertex_cache_hit_ratio));
}

if (gl::debug::g_vis_texture)
Expand Down
4 changes: 2 additions & 2 deletions rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp
Expand Up @@ -195,7 +195,7 @@ gl::vertex_upload_info GLGSRender::set_vertex_buffer()
const auto data_offset = (vertex_base * m_vertex_layout.interleaved_blocks[0]->attribute_stride);
storage_address = m_vertex_layout.interleaved_blocks[0]->real_offset_address + data_offset;

if (auto cached = m_vertex_cache->find_vertex_range(storage_address, GL_R8UI, required.first))
if (auto cached = m_vertex_cache->find_vertex_range(storage_address, required.first))
{
ensure(cached->local_address == storage_address);

Expand All @@ -216,7 +216,7 @@ gl::vertex_upload_info GLGSRender::set_vertex_buffer()
if (to_store)
{
//store ref in vertex cache
m_vertex_cache->store_range(storage_address, GL_R8UI, required.first, persistent_mapping.second);
m_vertex_cache->store_range(storage_address, required.first, persistent_mapping.second);
}
}

Expand Down
23 changes: 10 additions & 13 deletions rpcs3/Emu/RSX/RSXThread.cpp
Expand Up @@ -1218,10 +1218,10 @@ namespace rsx

std::span<const std::byte> thread::get_raw_index_array(const draw_clause& draw_indexed_clause) const
{
if (!element_push_buffer.empty())
if (!element_push_buffer.empty()) [[ unlikely ]]
{
//Indices provided via immediate mode
return{reinterpret_cast<const std::byte*>(element_push_buffer.data()), ::narrow<u32>(element_push_buffer.size() * sizeof(u32))};
// Indices provided via immediate mode
return {reinterpret_cast<const std::byte*>(element_push_buffer.data()), ::narrow<u32>(element_push_buffer.size() * sizeof(u32))};
}

const rsx::index_array_type type = rsx::method_registers.index_type();
Expand All @@ -1230,32 +1230,29 @@ namespace rsx
// Force aligned indices as realhw
const u32 address = (0 - type_size) & get_address(rsx::method_registers.index_array_address(), rsx::method_registers.index_array_location());

//const bool is_primitive_restart_enabled = rsx::method_registers.restart_index_enabled();
//const u32 primitive_restart_index = rsx::method_registers.restart_index();

const u32 first = draw_indexed_clause.min_index();
const u32 count = draw_indexed_clause.get_elements_count();

const auto ptr = vm::_ptr<const std::byte>(address);
return{ ptr + first * type_size, count * type_size };
return { ptr + first * type_size, count * type_size };
}

std::variant<draw_array_command, draw_indexed_array_command, draw_inlined_array>
thread::get_draw_command(const rsx::rsx_state& state) const
{
if (rsx::method_registers.current_draw_clause.command == rsx::draw_command::array)
{
return draw_array_command{};
}

if (rsx::method_registers.current_draw_clause.command == rsx::draw_command::indexed)
if (rsx::method_registers.current_draw_clause.command == rsx::draw_command::indexed) [[ likely ]]
{
return draw_indexed_array_command
{
get_raw_index_array(state.current_draw_clause)
};
}

if (rsx::method_registers.current_draw_clause.command == rsx::draw_command::array)
{
return draw_array_command{};
}

if (rsx::method_registers.current_draw_clause.command == rsx::draw_command::inlined_array)
{
return draw_inlined_array{};
Expand Down
2 changes: 1 addition & 1 deletion rpcs3/Emu/RSX/VK/VKGSRender.cpp
Expand Up @@ -696,7 +696,7 @@ VKGSRender::VKGSRender(utils::serial* ar) noexcept : GSRender(ar)
}
);

if (g_cfg.video.disable_vertex_cache || g_cfg.video.multithreaded_rsx)
if (g_cfg.video.disable_vertex_cache)
m_vertex_cache = std::make_unique<vk::null_vertex_cache>();
else
m_vertex_cache = std::make_unique<vk::weak_vertex_cache>();
Expand Down
4 changes: 2 additions & 2 deletions rpcs3/Emu/RSX/VK/VKGSRenderTypes.hpp
Expand Up @@ -31,8 +31,8 @@ namespace vk
struct program_cache;
struct pipeline_props;

using vertex_cache = rsx::vertex_cache::default_vertex_cache<rsx::vertex_cache::uploaded_range<VkFormat>, VkFormat>;
using weak_vertex_cache = rsx::vertex_cache::weak_vertex_cache<VkFormat>;
using vertex_cache = rsx::vertex_cache::default_vertex_cache<rsx::vertex_cache::uploaded_range>;
using weak_vertex_cache = rsx::vertex_cache::weak_vertex_cache;
using null_vertex_cache = vertex_cache;

using shader_cache = rsx::shaders_cache<vk::pipeline_props, vk::program_cache>;
Expand Down
7 changes: 7 additions & 0 deletions rpcs3/Emu/RSX/VK/VKPresent.cpp
Expand Up @@ -801,11 +801,18 @@ void VKGSRender::flip(const rsx::display_flip_info_t& info)
const auto num_texture_upload = m_texture_cache.get_texture_upload_calls_this_frame();
const auto num_texture_upload_miss = m_texture_cache.get_texture_upload_misses_this_frame();
const auto texture_upload_miss_ratio = m_texture_cache.get_texture_upload_miss_percentage();

println(fmt::format("Unreleased textures: %8d", num_dirty_textures));
println(fmt::format("Texture cache memory: %7dM", texture_memory_size));
println(fmt::format("Temporary texture memory: %3dM", tmp_texture_memory_size));
println(fmt::format("Flush requests: %13d = %2d (%3d%%) hard faults, %2d unavoidable, %2d misprediction(s), %2d speculation(s)", num_flushes, num_misses, cache_miss_ratio, num_unavoidable, num_mispredict, num_speculate));
println(fmt::format("Texture uploads: %14u (%u from CPU - %02u%%)", num_texture_upload, num_texture_upload_miss, texture_upload_miss_ratio));

const auto vertex_cache_hit_count = (info.stats.vertex_cache_request_count - info.stats.vertex_cache_miss_count);
const auto vertex_cache_hit_ratio = info.stats.vertex_cache_request_count
? (vertex_cache_hit_count * 100) / info.stats.vertex_cache_request_count
: 0;
println(fmt::format("Vertex cache hits: %12u/%u (%u%%)", vertex_cache_hit_count, info.stats.vertex_cache_request_count, vertex_cache_hit_ratio));
}

direct_fbo->release();
Expand Down
8 changes: 6 additions & 2 deletions rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp
Expand Up @@ -243,13 +243,15 @@ vk::vertex_upload_info VKGSRender::upload_vertex_data()
bool to_store = false;
u32 storage_address = -1;

m_frame_stats.vertex_cache_request_count++;

if (m_vertex_layout.interleaved_blocks.size() == 1 &&
rsx::method_registers.current_draw_clause.command != rsx::draw_command::inlined_array)
{
const auto data_offset = (vertex_base * m_vertex_layout.interleaved_blocks[0]->attribute_stride);
storage_address = m_vertex_layout.interleaved_blocks[0]->real_offset_address + data_offset;

if (auto cached = m_vertex_cache->find_vertex_range(storage_address, VK_FORMAT_R8_UINT, required.first))
if (auto cached = m_vertex_cache->find_vertex_range(storage_address, required.first))
{
ensure(cached->local_address == storage_address);

Expand All @@ -264,13 +266,15 @@ vk::vertex_upload_info VKGSRender::upload_vertex_data()

if (!in_cache)
{
m_frame_stats.vertex_cache_miss_count++;

persistent_offset = static_cast<u32>(m_attrib_ring_info.alloc<256>(required.first));
persistent_range_base = static_cast<u32>(persistent_offset);

if (to_store)
{
//store ref in vertex cache
m_vertex_cache->store_range(storage_address, VK_FORMAT_R8_UINT, required.first, static_cast<u32>(persistent_offset));
m_vertex_cache->store_range(storage_address, required.first, static_cast<u32>(persistent_offset));
}
}
}
Expand Down
49 changes: 25 additions & 24 deletions rpcs3/Emu/RSX/rsx_cache.h
Expand Up @@ -3,14 +3,14 @@
#include "Utilities/lockless.h"
#include "Utilities/Thread.h"
#include "Common/bitfield.hpp"
#include "Common/unordered_map.hpp"
#include "Emu/System.h"
#include "Emu/cache_utils.hpp"
#include "Program/ProgramStateCache.h"
#include "Common/texture_cache_checker.h"
#include "Overlays/Shaders/shader_loading_dialog.h"

#include <chrono>
#include <unordered_map>

#include "util/sysinfo.hpp"
#include "util/fnv_hash.hpp"
Expand Down Expand Up @@ -447,61 +447,62 @@ namespace rsx
namespace vertex_cache
{
// A null vertex cache
template <typename storage_type, typename upload_format>
template <typename storage_type>
class default_vertex_cache
{
public:
virtual ~default_vertex_cache() = default;
virtual storage_type* find_vertex_range(uptr /*local_addr*/, upload_format, u32 /*data_length*/) { return nullptr; }
virtual void store_range(uptr /*local_addr*/, upload_format, u32 /*data_length*/, u32 /*offset_in_heap*/) {}
virtual const storage_type* find_vertex_range(u32 /*local_addr*/, u32 /*data_length*/) { return nullptr; }
virtual void store_range(u32 /*local_addr*/, u32 /*data_length*/, u32 /*offset_in_heap*/) {}
virtual void purge() {}
};

// A weak vertex cache with no data checks or memory range locks
// Of limited use since contents are only guaranteed to be valid once per frame
// TODO: Strict vertex cache with range locks
template <typename upload_format>
struct uploaded_range
{
uptr local_address;
upload_format buffer_format;
u32 offset_in_heap;
u32 data_length;
};

template <typename upload_format>
class weak_vertex_cache : public default_vertex_cache<uploaded_range<upload_format>, upload_format>
// A weak vertex cache with no data checks or memory range locks
// Of limited use since contents are only guaranteed to be valid once per frame
// Supports up to 1GiB block lengths if typed and full 4GiB otherwise.
// Using a 1:1 hash-value with robin-hood is 2x faster than what we had before with std-map-of-arrays.
class weak_vertex_cache : public default_vertex_cache<uploaded_range>
{
using storage_type = uploaded_range<upload_format>;
using storage_type = uploaded_range;

private:
std::unordered_map<uptr, std::vector<storage_type>> vertex_ranges;
rsx::unordered_map<uptr, storage_type> vertex_ranges;

FORCE_INLINE u64 hash(u32 local_addr, u32 data_length) const
{
return u64(local_addr) | (u64(data_length) << 32);
}

public:

storage_type* find_vertex_range(uptr local_addr, upload_format fmt, u32 data_length) override
const storage_type* find_vertex_range(u32 local_addr, u32 data_length) override
{
//const auto data_end = local_addr + data_length;

for (auto &v : vertex_ranges[local_addr])
const auto key = hash(local_addr, data_length);
const auto found = vertex_ranges.find(key);
if (found == vertex_ranges.end())
{
// NOTE: This has to match exactly. Using sized shortcuts such as >= comparison causes artifacting in some applications (UC1)
if (v.buffer_format == fmt && v.data_length == data_length)
return &v;
return nullptr;
}

return nullptr;
return std::addressof(found->second);
}

void store_range(uptr local_addr, upload_format fmt, u32 data_length, u32 offset_in_heap) override
void store_range(u32 local_addr, u32 data_length, u32 offset_in_heap) override
{
storage_type v = {};
v.buffer_format = fmt;
v.data_length = data_length;
v.local_address = local_addr;
v.offset_in_heap = offset_in_heap;

vertex_ranges[local_addr].push_back(v);
const auto key = hash(local_addr, data_length);
vertex_ranges[key] = v;
}

void purge() override
Expand Down
1 change: 1 addition & 0 deletions rpcs3/emucore.vcxproj
Expand Up @@ -556,6 +556,7 @@
<ClInclude Include="Emu\RSX\Common\simple_array.hpp" />
<ClInclude Include="Emu\RSX\Common\surface_cache_dma.hpp" />
<ClInclude Include="Emu\RSX\Common\time.hpp" />
<ClInclude Include="Emu\RSX\Common\unordered_map.hpp" />
<ClInclude Include="Emu\RSX\Core\RSXEngLock.hpp" />
<ClInclude Include="Emu\RSX\Core\RSXFrameBuffer.h" />
<ClInclude Include="Emu\RSX\Core\RSXIOMap.hpp" />
Expand Down
3 changes: 3 additions & 0 deletions rpcs3/emucore.vcxproj.filters
Expand Up @@ -2365,6 +2365,9 @@
<ClInclude Include="Emu\Io\emulated_pad_config.h">
<Filter>Emu\Io</Filter>
</ClInclude>
<ClInclude Include="Emu\RSX\Common\unordered_map.hpp">
<Filter>Emu\GPU\RSX\Common</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="Emu\RSX\Program\GLSLSnippets\GPUDeswizzle.glsl">
Expand Down
5 changes: 0 additions & 5 deletions rpcs3/rpcs3qt/settings_dialog.cpp
Expand Up @@ -648,11 +648,6 @@ settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std

m_emu_settings->EnhanceCheckBox(ui->multithreadedRSX, emu_settings_type::MultithreadedRSX);
SubscribeTooltip(ui->multithreadedRSX, tooltips.settings.multithreaded_rsx);
connect(ui->multithreadedRSX, &QCheckBox::toggled, [this](bool checked)
{
ui->disableVertexCache->setEnabled(!checked);
});
ui->disableVertexCache->setEnabled(!ui->multithreadedRSX->isChecked());

m_emu_settings->EnhanceCheckBox(ui->strictModeRendering, emu_settings_type::StrictRenderingMode);
SubscribeTooltip(ui->strictModeRendering, tooltips.settings.strict_rendering_mode);
Expand Down