RPCS3 · AniLeo · Jul 25, 2020 · Jul 23, 2020
diff --git a/rpcs3/Emu/Cell/PPUInterpreter.cpp b/rpcs3/Emu/Cell/PPUInterpreter.cpp
@@ -359,6 +359,8 @@ class ppu_scale_table_t
 }
 const g_ppu_scale_table;
 
+constexpr u32 ppu_inf_u32 = 0x7F800000u;
+static const f32 ppu_inf_f32 = std::bit_cast<f32>(ppu_inf_u32);
 constexpr u32 ppu_nan_u32 = 0x7FC00000u;
 static const f32 ppu_nan_f32 = std::bit_cast<f32>(ppu_nan_u32);
 static const v128 ppu_vec_nans = v128::from32p(ppu_nan_u32);
@@ -403,6 +405,14 @@ v128 vec_handle_nan(__m128 result, Args... args)
 	return vec_handle_nan(v128::fromF(result), v128::fromF(args)...);
 }
 
+// Flush denormals to zero if NJ is 1
+inline v128 vec_handle_denormal(ppu_thread& ppu, v128 a)
+{
+	const auto mask = v128::from32p(ppu.jm_mask);
+	const auto nz = v128::fromV(_mm_srli_epi32(v128::eq32(mask & a, v128{}).vi, 1));
+	return v128::andnot(nz, a);
+}
+
 bool ppu_interpreter::MFVSCR(ppu_thread& ppu, ppu_opcode_t op)
 {
 	ppu.vr[op.vd] = v128::from32(0, 0, 0, u32{ppu.sat} | (u32{ppu.nj} << 16));
@@ -414,6 +424,7 @@ bool ppu_interpreter::MTVSCR(ppu_thread& ppu, ppu_opcode_t op)
 	const u32 vscr = ppu.vr[op.vb]._u32[3];
 	ppu.sat = (vscr & 1) != 0;
 	ppu.nj  = (vscr & 0x10000) != 0;
+	ppu.jm_mask = ppu.nj ? ppu_inf_u32 : 0x7fff'ffff;
 	return true;
 }
 
@@ -427,10 +438,10 @@ bool ppu_interpreter::VADDCUW(ppu_thread& ppu, ppu_opcode_t op)
 
 bool ppu_interpreter::VADDFP(ppu_thread& ppu, ppu_opcode_t op)
 {
-	const auto a = ppu.vr[op.va];
-	const auto b = ppu.vr[op.vb];
+	const auto a = vec_handle_denormal(ppu, ppu.vr[op.va]);
+	const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]);
 	const auto result = v128::addfs(a, b);
-	ppu.vr[op.vd] = vec_handle_nan(result, a, b);
+	ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(result, a, b));
 	return true;
 }
 
@@ -958,26 +969,26 @@ bool ppu_interpreter::VLOGEFP(ppu_thread& ppu, ppu_opcode_t op)
 
 bool ppu_interpreter_fast::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
 {
-	const auto a = ppu.vr[op.va].vf;
-	const auto b = ppu.vr[op.vb].vf;
-	const auto c = ppu.vr[op.vc].vf;
+	const auto a = vec_handle_denormal(ppu, ppu.vr[op.va]).vf;
+	const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]).vf;
+	const auto c = vec_handle_denormal(ppu, ppu.vr[op.vc]).vf;
 	const auto result = _mm_add_ps(_mm_mul_ps(a, c), b);
-	ppu.vr[op.vd] = vec_handle_nan(result);
+	ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(result));
 	return true;
 }
 
 bool ppu_interpreter_precise::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
 {
-	const auto a = ppu.vr[op.va];
-	const auto b = ppu.vr[op.vb];
-	const auto c = ppu.vr[op.vc];
-	ppu.vr[op.rd] = vec_handle_nan(v128::fma32f(a, c, b), a, b, c);
+	const auto a = vec_handle_denormal(ppu, ppu.vr[op.va]);
+	const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]);
+	const auto c = vec_handle_denormal(ppu, ppu.vr[op.vc]);
+	ppu.vr[op.rd] = vec_handle_denormal(ppu, vec_handle_nan(v128::fma32f(a, c, b), a, b, c));
 	return true;
 }
 
 bool ppu_interpreter::VMAXFP(ppu_thread& ppu, ppu_opcode_t op)
 {
-	ppu.vr[op.vd] = vec_handle_nan(_mm_max_ps(ppu.vr[op.va].vf, ppu.vr[op.vb].vf));
+	ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(_mm_max_ps(ppu.vr[op.va].vf, ppu.vr[op.vb].vf)));
 	return true;
 }
 
@@ -1123,7 +1134,7 @@ bool ppu_interpreter::VMINFP(ppu_thread& ppu, ppu_opcode_t op)
 	const auto a = ppu.vr[op.va].vf;
 	const auto b = ppu.vr[op.vb].vf;
 	const auto result = _mm_or_ps(_mm_min_ps(a, b), _mm_min_ps(b, a));
-	ppu.vr[op.vd] = vec_handle_nan(result, a, b);
+	ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(result, a, b));
 	return true;
 }
 
@@ -1463,18 +1474,18 @@ bool ppu_interpreter_fast::VNMSUBFP(ppu_thread& ppu, ppu_opcode_t op)
 	const auto a = _mm_sub_ps(_mm_mul_ps(ppu.vr[op.va].vf, ppu.vr[op.vc].vf), ppu.vr[op.vb].vf);
 	const auto b = _mm_set1_ps(-0.0f);
 	const auto result = _mm_xor_ps(a, b);
-	ppu.vr[op.vd] = vec_handle_nan(result, a, b);
+	ppu.vr[op.vd] = vec_handle_nan(result);
 	return true;
 }
 
 bool ppu_interpreter_precise::VNMSUBFP(ppu_thread& ppu, ppu_opcode_t op)
 {
 	const auto m = _mm_set1_ps(-0.0f);
-	const auto a = ppu.vr[op.va];
-	const auto c = ppu.vr[op.vc];
+	const auto a = vec_handle_denormal(ppu, ppu.vr[op.va]);
+	const auto c = vec_handle_denormal(ppu, ppu.vr[op.vc]);
 	const auto b = v128::fromF(_mm_xor_ps(ppu.vr[op.vb].vf, m));
 	const auto r = v128::fromF(_mm_xor_ps(v128::fma32f(a, c, b).vf, m));
-	ppu.vr[op.rd] = vec_handle_nan(r, a, b, c);
+	ppu.vr[op.rd] = vec_handle_denormal(ppu, vec_handle_nan(r, a, b, c));
 	return true;
 }
 
@@ -1874,23 +1885,23 @@ bool ppu_interpreter_precise::VPKUWUS(ppu_thread& ppu, ppu_opcode_t op)
 bool ppu_interpreter::VREFP(ppu_thread& ppu, ppu_opcode_t op)
 {
 	const auto a = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f);
-	const auto b = ppu.vr[op.vb].vf;
+	const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]).vf;
 	const auto result = _mm_div_ps(a, b);
-	ppu.vr[op.vd] = vec_handle_nan(result, a, b);
+	ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(result, a, b));
 	return true;
 }
 
 bool ppu_interpreter::VRFIM(ppu_thread& ppu, ppu_opcode_t op)
 {
-	const auto b = ppu.vr[op.vb];
+	const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]);
 	v128 d;
 
 	for (uint w = 0; w < 4; w++)
 	{
 		d._f[w] = std::floor(b._f[w]);
 	}
 
-	ppu.vr[op.vd] = vec_handle_nan(d, b);
+	ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(d, b));
 	return true;
 }
 
@@ -1904,21 +1915,21 @@ bool ppu_interpreter::VRFIN(ppu_thread& ppu, ppu_opcode_t op)
 		d._f[w] = std::nearbyint(b._f[w]);
 	}
 
-	ppu.vr[op.vd] = vec_handle_nan(d, b);
+	ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(d, b));
 	return true;
 }
 
 bool ppu_interpreter::VRFIP(ppu_thread& ppu, ppu_opcode_t op)
 {
-	const auto b = ppu.vr[op.vb];
+	const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]);
 	v128 d;
 
 	for (uint w = 0; w < 4; w++)
 	{
 		d._f[w] = std::ceil(b._f[w]);
 	}
 
-	ppu.vr[op.vd] = vec_handle_nan(d, b);
+	ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(d, b));
 	return true;
 }
 
@@ -1932,7 +1943,7 @@ bool ppu_interpreter::VRFIZ(ppu_thread& ppu, ppu_opcode_t op)
 		d._f[w] = std::truncf(b._f[w]);
 	}
 
-	ppu.vr[op.vd] = vec_handle_nan(d, b);
+	ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(d, b));
 	return true;
 }
 
@@ -1978,9 +1989,9 @@ bool ppu_interpreter::VRLW(ppu_thread& ppu, ppu_opcode_t op)
 bool ppu_interpreter::VRSQRTEFP(ppu_thread& ppu, ppu_opcode_t op)
 {
 	const auto a = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f);
-	const auto b = ppu.vr[op.vb].vf;
+	const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]).vf;
 	const auto result = _mm_div_ps(a, _mm_sqrt_ps(b));
-	ppu.vr[op.vd] = vec_handle_nan(result, a, b);
+	ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(result, a, b));
 	return true;
 }
 
@@ -2277,10 +2288,10 @@ bool ppu_interpreter::VSUBCUW(ppu_thread& ppu, ppu_opcode_t op)
 
 bool ppu_interpreter::VSUBFP(ppu_thread& ppu, ppu_opcode_t op)
 {
-	const auto a = ppu.vr[op.va];
-	const auto b = ppu.vr[op.vb];
+	const auto a = vec_handle_denormal(ppu, ppu.vr[op.va]);
+	const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]);
 	const auto result = v128::subfs(a, b);
-	ppu.vr[op.vd] = vec_handle_nan(result, a, b);
+	ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(result, a, b));
 	return true;
 }
 

diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp
@@ -1613,6 +1613,7 @@ extern void ppu_initialize(const ppu_module& info)
 				non_win32,
 				accurate_fma,
 				accurate_ppu_vector_nan,
+				java_mode_handling,
 
 				__bitset_enum_max
 			};
@@ -1630,6 +1631,10 @@ extern void ppu_initialize(const ppu_module& info)
 			{
 				settings += ppu_settings::accurate_ppu_vector_nan;
 			}
+			if (g_cfg.core.llvm_ppu_jm_handling)
+			{
+				settings += ppu_settings::java_mode_handling;
+			}
 
 			// Write version, hash, CPU, settings
 			fmt::append(obj_name, "v3-tane-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu));

diff --git a/rpcs3/Emu/Cell/PPUThread.h b/rpcs3/Emu/Cell/PPUThread.h
@@ -186,7 +186,10 @@ class ppu_thread : public cpu_thread
 			exception, the corresponding element in the target vr is cleared to '0'. In both cases, the '0'
 			has the same sign as the denormalized or underflowing value.
 	*/
-	bool nj = false;
+	bool nj = true;
+
+	// Optimization: precomputed java-mode mask for handling denormals
+	u32 jm_mask = 0x7f80'0000;
 
 	u32 raddr{0}; // Reservation addr
 	u64 rtime{0};