RPCS3 · Nekotekina · May 6, 2023 · Apr 26, 2023
diff --git a/rpcs3/Emu/Cell/SPURecompiler.cpp b/rpcs3/Emu/Cell/SPURecompiler.cpp
@@ -8860,19 +8860,24 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 			return;
 		}
 
-		// To avoid divergence in online play don't use divergent intel/amd intrinsics when online
-		if (g_cfg.net.net_active == np_internet_status::enabled)
+		if (g_cfg.core.spu_approx_xfloat)
 		{
 			register_intrinsic("spu_frest", [&](llvm::CallInst* ci)
 			{
-				return fsplat<f32[4]>(1.0) / value<f32[4]>(ci->getOperand(0));
+				const auto a = value<f32[4]>(ci->getOperand(0));
+				const auto acc_result = fsplat<f32[4]>(1.0) / a;
+				// Determines accuracy penalty, frest result is always slightly closer to 0 than actual value and provides ~12 bits accuracy
+				const auto acc_penalty = fsplat<f32[4]>(0x1p-13f) * acc_result;
+				// Zeroes the last 11 bits of the mantissa so FI calculations end up correct if needed
+				return bitcast<f32[4]>(bitcast<u32[4]>(acc_result - acc_penalty) & splat<u32[4]>(0xFFFFF800));
 			});
 		}
 		else
 		{
 			register_intrinsic("spu_frest", [&](llvm::CallInst* ci)
 			{
 				const auto a = value<f32[4]>(ci->getOperand(0));
+				// Fast but this makes the result vary per cpu
 				return fre(a);
 			});
 		}
@@ -8895,19 +8900,24 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 			return;
 		}
 
-		// To avoid divergence in online play don't use divergent intel/amd intrinsics when online
-		if (g_cfg.net.net_active == np_internet_status::enabled)
+		if (g_cfg.core.spu_approx_xfloat)
 		{
 			register_intrinsic("spu_frsqest", [&](llvm::CallInst* ci)
 			{
-				return fsplat<f32[4]>(1.0) / fsqrt(fabs(value<f32[4]>(ci->getOperand(0))));
+				const auto a = value<f32[4]>(ci->getOperand(0));
+				const auto acc_result = fsplat<f32[4]>(1.0) / fsqrt(fabs(a));
+				// Determines accuracy penalty, frsqest result is always slightly closer to 0 than actual value and provides ~12 bits accuracy
+				const auto acc_penalty = fsplat<f32[4]>(0x1p-13f) * acc_result;
+				// Zeroes the last 11 bits of the mantissa so FI calculations end up correct if needed
+				return bitcast<f32[4]>(bitcast<u32[4]>(acc_result - acc_penalty) & splat<u32[4]>(0xFFFFF800));
 			});
 		}
 		else
 		{
 			register_intrinsic("spu_frsqest", [&](llvm::CallInst* ci)
 			{
 				const auto a = value<f32[4]>(ci->getOperand(0));
+				// Fast but this makes the result vary per cpu
 				return frsqe(fabs(a));
 			});
 		}
@@ -9633,23 +9643,29 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
 			return bitcast<f32[4]>((b & 0xff800000u) | (bitcast<u32[4]>(fpcast<f32[4]>(bnew)) & ~0xff800000u)); // Inject old sign and exponent
 		});
 
-		// To avoid divergence in online play don't use divergent intel/amd intrinsics when online
-		if (g_cfg.net.net_active == np_internet_status::enabled)
+		if (g_cfg.core.spu_approx_xfloat)
 		{
 			register_intrinsic("spu_re", [&](llvm::CallInst* ci)
 			{
 				const auto a = value<f32[4]>(ci->getOperand(0));
-				return fsplat<f32[4]>(1.0) / a;
+				const auto acc_result = fsplat<f32[4]>(1.0) / a;
+				// Determines accuracy penalty, frest result is always slightly closer to 0 than actual value and provides ~12 bits accuracy
+				const auto acc_penalty = fsplat<f32[4]>(0x1p-13f) * acc_result;
+				return acc_result - acc_penalty;
 			});
 
 			register_intrinsic("spu_rsqrte", [&](llvm::CallInst* ci)
 			{
 				const auto a = value<f32[4]>(ci->getOperand(0));
-				return fsplat<f32[4]>(1.0) / fsqrt(fabs(a));
+				const auto acc_result = fsplat<f32[4]>(1.0) / fsqrt(fabs(a));
+				// Determines accuracy penalty, frsqest result is always slightly closer to 0 than actual value and provides ~12 bits accuracy
+				const auto acc_penalty = fsplat<f32[4]>(0x1p-13f) * acc_result;
+				return acc_result - acc_penalty;
 			});
 		}
 		else
 		{
+			// For relaxed mode, use intrinsics; these make the results vary per CPU
 			register_intrinsic("spu_re", [&](llvm::CallInst* ci)
 			{
 				const auto a = value<f32[4]>(ci->getOperand(0));

diff --git a/rpcs3/Emu/system_config.h b/rpcs3/Emu/system_config.h
@@ -68,7 +68,7 @@ struct cfg_root : cfg::node
 		cfg::_enum<tsx_usage> enable_TSX{ this, "Enable TSX", enable_tsx_by_default() ? tsx_usage::enabled : tsx_usage::disabled }; // Enable TSX. Forcing this on Haswell/Broadwell CPUs should be used carefully
 		cfg::_bool spu_accurate_xfloat{ this, "Accurate xfloat", false };
 		cfg::_bool spu_approx_xfloat{ this, "Approximate xfloat", true };
-		cfg::_bool spu_relaxed_xfloat{ this, "Relaxed xfloat", true }; // Approximate accuracy for only the "FCGT" and "FNMS" instructions
+		cfg::_bool spu_relaxed_xfloat{ this, "Relaxed xfloat", true }; // Approximate accuracy for only the "FCGT", "FNMS", "FREST" and "FRSQEST" instructions
 		cfg::_int<-1, 14> ppu_128_reservations_loop_max_length{ this, "Accurate PPU 128-byte Reservation Op Max Length", 0, true }; // -1: Always accurate, 0: Never accurate, 1-14: max accurate loop length
 		cfg::_int<-64, 64> stub_ppu_traps{ this, "Stub PPU Traps", 0, true }; // Hack, skip PPU traps for rare cases where the trap is continueable (specify relative instructions to skip)
 		cfg::_bool full_width_avx512{ this, "Full Width AVX-512", false };