RPCS3 · kd-11 · Jun 9, 2023 · Jun 7, 2023 · Jun 8, 2023 · Jun 8, 2023
diff --git a/3rdparty/robin_hood/include/robin_hood.h b/3rdparty/robin_hood/include/robin_hood.h
diff --git a/rpcs3/Emu/RSX/Common/unordered_map.hpp b/rpcs3/Emu/RSX/Common/unordered_map.hpp
@@ -0,0 +1,19 @@
+#pragma once
+
+#ifdef RSX_USE_STD_MAP
+#include <unordered_map>
+
+namespace rsx
+{
+	template<typename T, typename U>
+	using unordered_map = std::unordered_map<T, U>;
+}
+#else
+#include "3rdparty/robin_hood/include/robin_hood.h"
+
+namespace rsx
+{
+	template<typename T, typename U>
+	using unordered_map = ::robin_hood::unordered_map<T, U>;
+}
+#endif
diff --git a/rpcs3/Emu/RSX/Core/RSXDisplay.h b/rpcs3/Emu/RSX/Core/RSXDisplay.h
@@ -16,6 +16,9 @@ namespace rsx
 		s64 textures_upload_time;
 		s64 draw_exec_time;
 		s64 flip_time;
+
+		u32 vertex_cache_request_count;
+		u32 vertex_cache_miss_count;
 	};
 
 	struct frame_time_t

diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
@@ -38,7 +38,7 @@ GLGSRender::GLGSRender(utils::serial* ar) noexcept : GSRender(ar)
 {
 	m_shaders_cache = std::make_unique<gl::shader_cache>(m_prog_buffer, "opengl", "v1.94");
 
-	if (g_cfg.video.disable_vertex_cache || g_cfg.video.multithreaded_rsx)
+	if (g_cfg.video.disable_vertex_cache)
 		m_vertex_cache = std::make_unique<gl::null_vertex_cache>();
 	else
 		m_vertex_cache = std::make_unique<gl::weak_vertex_cache>();

diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.h b/rpcs3/Emu/RSX/GL/GLGSRender.h
@@ -20,8 +20,8 @@
 
 namespace gl
 {
-	using vertex_cache = rsx::vertex_cache::default_vertex_cache<rsx::vertex_cache::uploaded_range<GLenum>, GLenum>;
-	using weak_vertex_cache = rsx::vertex_cache::weak_vertex_cache<GLenum>;
+	using vertex_cache = rsx::vertex_cache::default_vertex_cache<rsx::vertex_cache::uploaded_range>;
+	using weak_vertex_cache = rsx::vertex_cache::weak_vertex_cache;
 	using null_vertex_cache = vertex_cache;
 
 	using shader_cache = rsx::shaders_cache<void*, GLProgramBuffer>;

diff --git a/rpcs3/Emu/RSX/GL/GLPresent.cpp b/rpcs3/Emu/RSX/GL/GLPresent.cpp
@@ -364,10 +364,17 @@ void GLGSRender::flip(const rsx::display_flip_info_t& info)
 		const auto num_texture_upload = m_gl_texture_cache.get_texture_upload_calls_this_frame();
 		const auto num_texture_upload_miss = m_gl_texture_cache.get_texture_upload_misses_this_frame();
 		const auto texture_upload_miss_ratio = m_gl_texture_cache.get_texture_upload_miss_percentage();
+
 		println(fmt::format("Unreleased textures: %7d", num_dirty_textures));
 		println(fmt::format("Texture memory: %12dM", texture_memory_size));
 		println(fmt::format("Flush requests: %12d  = %2d (%3d%%) hard faults, %2d unavoidable, %2d misprediction(s), %2d speculation(s)", num_flushes, num_misses, cache_miss_ratio, num_unavoidable, num_mispredict, num_speculate));
 		println(fmt::format("Texture uploads: %15u (%u from CPU - %02u%%)", num_texture_upload, num_texture_upload_miss, texture_upload_miss_ratio));
+
+		const auto vertex_cache_hit_count = (info.stats.vertex_cache_request_count - info.stats.vertex_cache_miss_count);
+		const auto vertex_cache_hit_ratio = info.stats.vertex_cache_request_count
+			? (vertex_cache_hit_count * 100) / info.stats.vertex_cache_request_count
+			: 0;
+		println(fmt::format("Vertex cache hits: %12u/%u (%u%%)", vertex_cache_hit_count, info.stats.vertex_cache_request_count, vertex_cache_hit_ratio));
 	}
 
 	if (gl::debug::g_vis_texture)

diff --git a/rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp b/rpcs3/Emu/RSX/GL/GLVertexBuffers.cpp
@@ -195,7 +195,7 @@ gl::vertex_upload_info GLGSRender::set_vertex_buffer()
 			const auto data_offset = (vertex_base * m_vertex_layout.interleaved_blocks[0]->attribute_stride);
 			storage_address = m_vertex_layout.interleaved_blocks[0]->real_offset_address + data_offset;
 
-			if (auto cached = m_vertex_cache->find_vertex_range(storage_address, GL_R8UI, required.first))
+			if (auto cached = m_vertex_cache->find_vertex_range(storage_address, required.first))
 			{
 				ensure(cached->local_address == storage_address);
 
@@ -216,7 +216,7 @@ gl::vertex_upload_info GLGSRender::set_vertex_buffer()
 			if (to_store)
 			{
 				//store ref in vertex cache
-				m_vertex_cache->store_range(storage_address, GL_R8UI, required.first, persistent_mapping.second);
+				m_vertex_cache->store_range(storage_address, required.first, persistent_mapping.second);
 			}
 		}
 

diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp
@@ -1218,10 +1218,10 @@ namespace rsx
 
 	std::span<const std::byte> thread::get_raw_index_array(const draw_clause& draw_indexed_clause) const
 	{
-		if (!element_push_buffer.empty())
+		if (!element_push_buffer.empty()) [[ unlikely ]]
 		{
-			//Indices provided via immediate mode
-			return{reinterpret_cast<const std::byte*>(element_push_buffer.data()), ::narrow<u32>(element_push_buffer.size() * sizeof(u32))};
+			// Indices provided via immediate mode
+			return {reinterpret_cast<const std::byte*>(element_push_buffer.data()), ::narrow<u32>(element_push_buffer.size() * sizeof(u32))};
 		}
 
 		const rsx::index_array_type type = rsx::method_registers.index_type();
@@ -1230,32 +1230,29 @@ namespace rsx
 		// Force aligned indices as realhw
 		const u32 address = (0 - type_size) & get_address(rsx::method_registers.index_array_address(), rsx::method_registers.index_array_location());
 
-		//const bool is_primitive_restart_enabled = rsx::method_registers.restart_index_enabled();
-		//const u32 primitive_restart_index = rsx::method_registers.restart_index();
-
 		const u32 first = draw_indexed_clause.min_index();
 		const u32 count = draw_indexed_clause.get_elements_count();
 
 		const auto ptr = vm::_ptr<const std::byte>(address);
-		return{ ptr + first * type_size, count * type_size };
+		return { ptr + first * type_size, count * type_size };
 	}
 
 	std::variant<draw_array_command, draw_indexed_array_command, draw_inlined_array>
 	thread::get_draw_command(const rsx::rsx_state& state) const
 	{
-		if (rsx::method_registers.current_draw_clause.command == rsx::draw_command::array)
-		{
-			return draw_array_command{};
-		}
-
-		if (rsx::method_registers.current_draw_clause.command == rsx::draw_command::indexed)
+		if (rsx::method_registers.current_draw_clause.command == rsx::draw_command::indexed) [[ likely ]]
 		{
 			return draw_indexed_array_command
 			{
 				get_raw_index_array(state.current_draw_clause)
 			};
 		}
 
+		if (rsx::method_registers.current_draw_clause.command == rsx::draw_command::array)
+		{
+			return draw_array_command{};
+		}
+
 		if (rsx::method_registers.current_draw_clause.command == rsx::draw_command::inlined_array)
 		{
 			return draw_inlined_array{};

diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
@@ -696,7 +696,7 @@ VKGSRender::VKGSRender(utils::serial* ar) noexcept : GSRender(ar)
 		}
 	);
 
-	if (g_cfg.video.disable_vertex_cache || g_cfg.video.multithreaded_rsx)
+	if (g_cfg.video.disable_vertex_cache)
 		m_vertex_cache = std::make_unique<vk::null_vertex_cache>();
 	else
 		m_vertex_cache = std::make_unique<vk::weak_vertex_cache>();

diff --git a/rpcs3/Emu/RSX/VK/VKGSRenderTypes.hpp b/rpcs3/Emu/RSX/VK/VKGSRenderTypes.hpp
@@ -31,8 +31,8 @@ namespace vk
 	struct program_cache;
 	struct pipeline_props;
 
-	using vertex_cache = rsx::vertex_cache::default_vertex_cache<rsx::vertex_cache::uploaded_range<VkFormat>, VkFormat>;
-	using weak_vertex_cache = rsx::vertex_cache::weak_vertex_cache<VkFormat>;
+	using vertex_cache = rsx::vertex_cache::default_vertex_cache<rsx::vertex_cache::uploaded_range>;
+	using weak_vertex_cache = rsx::vertex_cache::weak_vertex_cache;
 	using null_vertex_cache = vertex_cache;
 
 	using shader_cache = rsx::shaders_cache<vk::pipeline_props, vk::program_cache>;

diff --git a/rpcs3/Emu/RSX/VK/VKPresent.cpp b/rpcs3/Emu/RSX/VK/VKPresent.cpp
@@ -801,11 +801,18 @@ void VKGSRender::flip(const rsx::display_flip_info_t& info)
 			const auto num_texture_upload = m_texture_cache.get_texture_upload_calls_this_frame();
 			const auto num_texture_upload_miss = m_texture_cache.get_texture_upload_misses_this_frame();
 			const auto texture_upload_miss_ratio = m_texture_cache.get_texture_upload_miss_percentage();
+
 			println(fmt::format("Unreleased textures: %8d", num_dirty_textures));
 			println(fmt::format("Texture cache memory: %7dM", texture_memory_size));
 			println(fmt::format("Temporary texture memory: %3dM", tmp_texture_memory_size));
 			println(fmt::format("Flush requests: %13d  = %2d (%3d%%) hard faults, %2d unavoidable, %2d misprediction(s), %2d speculation(s)", num_flushes, num_misses, cache_miss_ratio, num_unavoidable, num_mispredict, num_speculate));
 			println(fmt::format("Texture uploads: %14u (%u from CPU - %02u%%)", num_texture_upload, num_texture_upload_miss, texture_upload_miss_ratio));
+
+			const auto vertex_cache_hit_count = (info.stats.vertex_cache_request_count - info.stats.vertex_cache_miss_count);
+			const auto vertex_cache_hit_ratio = info.stats.vertex_cache_request_count
+				? (vertex_cache_hit_count * 100) / info.stats.vertex_cache_request_count
+				: 0;
+			println(fmt::format("Vertex cache hits: %12u/%u (%u%%)", vertex_cache_hit_count, info.stats.vertex_cache_request_count, vertex_cache_hit_ratio));
 		}
 
 		direct_fbo->release();

diff --git a/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp b/rpcs3/Emu/RSX/VK/VKVertexBuffers.cpp
@@ -243,13 +243,15 @@ vk::vertex_upload_info VKGSRender::upload_vertex_data()
 		bool to_store = false;
 		u32  storage_address = -1;
 
+		m_frame_stats.vertex_cache_request_count++;
+
 		if (m_vertex_layout.interleaved_blocks.size() == 1 &&
 			rsx::method_registers.current_draw_clause.command != rsx::draw_command::inlined_array)
 		{
 			const auto data_offset = (vertex_base * m_vertex_layout.interleaved_blocks[0]->attribute_stride);
 			storage_address = m_vertex_layout.interleaved_blocks[0]->real_offset_address + data_offset;
 
-			if (auto cached = m_vertex_cache->find_vertex_range(storage_address, VK_FORMAT_R8_UINT, required.first))
+			if (auto cached = m_vertex_cache->find_vertex_range(storage_address, required.first))
 			{
 				ensure(cached->local_address == storage_address);
 
@@ -264,13 +266,15 @@ vk::vertex_upload_info VKGSRender::upload_vertex_data()
 
 		if (!in_cache)
 		{
+			m_frame_stats.vertex_cache_miss_count++;
+
 			persistent_offset = static_cast<u32>(m_attrib_ring_info.alloc<256>(required.first));
 			persistent_range_base = static_cast<u32>(persistent_offset);
 
 			if (to_store)
 			{
 				//store ref in vertex cache
-				m_vertex_cache->store_range(storage_address, VK_FORMAT_R8_UINT, required.first, static_cast<u32>(persistent_offset));
+				m_vertex_cache->store_range(storage_address, required.first, static_cast<u32>(persistent_offset));
 			}
 		}
 	}

diff --git a/rpcs3/Emu/RSX/rsx_cache.h b/rpcs3/Emu/RSX/rsx_cache.h
@@ -3,14 +3,14 @@
 #include "Utilities/lockless.h"
 #include "Utilities/Thread.h"
 #include "Common/bitfield.hpp"
+#include "Common/unordered_map.hpp"
 #include "Emu/System.h"
 #include "Emu/cache_utils.hpp"
 #include "Program/ProgramStateCache.h"
 #include "Common/texture_cache_checker.h"
 #include "Overlays/Shaders/shader_loading_dialog.h"
 
 #include <chrono>
-#include <unordered_map>
 
 #include "util/sysinfo.hpp"
 #include "util/fnv_hash.hpp"
@@ -447,61 +447,62 @@ namespace rsx
 	namespace vertex_cache
 	{
 		// A null vertex cache
-		template <typename storage_type, typename upload_format>
+		template <typename storage_type>
 		class default_vertex_cache
 		{
 		public:
 			virtual ~default_vertex_cache() = default;
-			virtual storage_type* find_vertex_range(uptr /*local_addr*/, upload_format, u32 /*data_length*/) { return nullptr; }
-			virtual void store_range(uptr /*local_addr*/, upload_format, u32 /*data_length*/, u32 /*offset_in_heap*/) {}
+			virtual const storage_type* find_vertex_range(u32 /*local_addr*/, u32 /*data_length*/) { return nullptr; }
+			virtual void store_range(u32 /*local_addr*/, u32 /*data_length*/, u32 /*offset_in_heap*/) {}
 			virtual void purge() {}
 		};
 
-		// A weak vertex cache with no data checks or memory range locks
-		// Of limited use since contents are only guaranteed to be valid once per frame
-		// TODO: Strict vertex cache with range locks
-		template <typename upload_format>
 		struct uploaded_range
 		{
 			uptr local_address;
-			upload_format buffer_format;
 			u32 offset_in_heap;
 			u32 data_length;
 		};
 
-		template <typename upload_format>
-		class weak_vertex_cache : public default_vertex_cache<uploaded_range<upload_format>, upload_format>
+		// A weak vertex cache with no data checks or memory range locks
+		// Of limited use since contents are only guaranteed to be valid once per frame
+		// Supports up to 1GiB block lengths if typed and full 4GiB otherwise.
+		// Keying a robin_hood map 1:1 on the combined (address, length) hash is 2x faster than the previous std::unordered_map of per-address vectors.
+		class weak_vertex_cache : public default_vertex_cache<uploaded_range>
 		{
-			using storage_type = uploaded_range<upload_format>;
+			using storage_type = uploaded_range;
 
 		private:
-			std::unordered_map<uptr, std::vector<storage_type>> vertex_ranges;
+			rsx::unordered_map<uptr, storage_type> vertex_ranges;
+
+			FORCE_INLINE u64 hash(u32 local_addr, u32 data_length) const
+			{
+				return u64(local_addr) | (u64(data_length) << 32);
+			}
 
 		public:
 
-			storage_type* find_vertex_range(uptr local_addr, upload_format fmt, u32 data_length) override
+			const storage_type* find_vertex_range(u32 local_addr, u32 data_length) override
 			{
-				//const auto data_end = local_addr + data_length;
-
-				for (auto &v : vertex_ranges[local_addr])
+				const auto key = hash(local_addr, data_length);
+				const auto found = vertex_ranges.find(key);
+				if (found == vertex_ranges.end())
 				{
-					// NOTE: This has to match exactly. Using sized shortcuts such as >= comparison causes artifacting in some applications (UC1)
-					if (v.buffer_format == fmt && v.data_length == data_length)
-						return &v;
+					return nullptr;
 				}
 
-				return nullptr;
+				return std::addressof(found->second);
 			}
 
-			void store_range(uptr local_addr, upload_format fmt, u32 data_length, u32 offset_in_heap) override
+			void store_range(u32 local_addr, u32 data_length, u32 offset_in_heap) override
 			{
 				storage_type v = {};
-				v.buffer_format = fmt;
 				v.data_length = data_length;
 				v.local_address = local_addr;
 				v.offset_in_heap = offset_in_heap;
 
-				vertex_ranges[local_addr].push_back(v);
+				const auto key = hash(local_addr, data_length);
+				vertex_ranges[key] = v;
 			}
 
 			void purge() override

diff --git a/rpcs3/emucore.vcxproj b/rpcs3/emucore.vcxproj
@@ -556,6 +556,7 @@
     <ClInclude Include="Emu\RSX\Common\simple_array.hpp" />
     <ClInclude Include="Emu\RSX\Common\surface_cache_dma.hpp" />
     <ClInclude Include="Emu\RSX\Common\time.hpp" />
+    <ClInclude Include="Emu\RSX\Common\unordered_map.hpp" />
     <ClInclude Include="Emu\RSX\Core\RSXEngLock.hpp" />
     <ClInclude Include="Emu\RSX\Core\RSXFrameBuffer.h" />
     <ClInclude Include="Emu\RSX\Core\RSXIOMap.hpp" />

diff --git a/rpcs3/emucore.vcxproj.filters b/rpcs3/emucore.vcxproj.filters
@@ -2365,6 +2365,9 @@
     <ClInclude Include="Emu\Io\emulated_pad_config.h">
       <Filter>Emu\Io</Filter>
     </ClInclude>
+    <ClInclude Include="Emu\RSX\Common\unordered_map.hpp">
+      <Filter>Emu\GPU\RSX\Common</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <None Include="Emu\RSX\Program\GLSLSnippets\GPUDeswizzle.glsl">

diff --git a/rpcs3/rpcs3qt/settings_dialog.cpp b/rpcs3/rpcs3qt/settings_dialog.cpp
@@ -648,11 +648,6 @@ settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std
 
 	m_emu_settings->EnhanceCheckBox(ui->multithreadedRSX, emu_settings_type::MultithreadedRSX);
 	SubscribeTooltip(ui->multithreadedRSX, tooltips.settings.multithreaded_rsx);
-	connect(ui->multithreadedRSX, &QCheckBox::toggled, [this](bool checked)
-	{
-		ui->disableVertexCache->setEnabled(!checked);
-	});
-	ui->disableVertexCache->setEnabled(!ui->multithreadedRSX->isChecked());
 
 	m_emu_settings->EnhanceCheckBox(ui->strictModeRendering, emu_settings_type::StrictRenderingMode);
 	SubscribeTooltip(ui->strictModeRendering, tooltips.settings.strict_rendering_mode);