Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PPU Precise/LLVM: Support NJ modes #8617

Merged
merged 1 commit into from Jul 25, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
71 changes: 41 additions & 30 deletions rpcs3/Emu/Cell/PPUInterpreter.cpp
Expand Up @@ -359,6 +359,8 @@ class ppu_scale_table_t
}
const g_ppu_scale_table;

// Bit patterns and float views of the special values used by the PPU vector helpers.
// 0x7F800000 has only the eight exponent bits set (IEEE 754 binary32 +infinity); the
// same value is used by MTVSCR as the NJ-mode denormal mask.
constexpr u32 ppu_inf_u32 = 0x7F800000u;
static const f32 ppu_inf_f32 = std::bit_cast<f32>(ppu_inf_u32);
// 0x7FC00000 is the exponent bits plus the top mantissa bit (a quiet NaN in binary32).
constexpr u32 ppu_nan_u32 = 0x7FC00000u;
static const f32 ppu_nan_f32 = std::bit_cast<f32>(ppu_nan_u32);
// The NaN pattern passed to v128::from32p — presumably broadcast to every 32-bit lane; confirm in v128.
static const v128 ppu_vec_nans = v128::from32p(ppu_nan_u32);
Expand Down Expand Up @@ -403,6 +405,14 @@ v128 vec_handle_nan(__m128 result, Args... args)
return vec_handle_nan(v128::fromF(result), v128::fromF(args)...);
}

// When NJ is 1, flush denormal lanes of `a` to zero, keeping each lane's top (sign) bit.
// ppu.jm_mask selects the bits that decide whether a lane is flushed: a lane whose
// masked bits are all zero has every bit except the top one cleared.
inline v128 vec_handle_denormal(ppu_thread& ppu, v128 a)
{
	// Per-thread mask handed to from32p (precomputed whenever MTVSCR updates NJ)
	const auto lane_mask = v128::from32p(ppu.jm_mask);

	// Lane-wise test of the masked value against zero
	// (assumes eq32 yields an all-ones lane on equality, as SSE compares do)
	const auto masked_is_zero = v128::eq32(lane_mask & a, v128{});

	// Logical shift right by one so the top bit of each lane is never in the clear pattern
	const auto clear_bits = v128::fromV(_mm_srli_epi32(masked_is_zero.vi, 1));

	return v128::andnot(clear_bits, a);
}

bool ppu_interpreter::MFVSCR(ppu_thread& ppu, ppu_opcode_t op)
{
ppu.vr[op.vd] = v128::from32(0, 0, 0, u32{ppu.sat} | (u32{ppu.nj} << 16));
Expand All @@ -414,6 +424,7 @@ bool ppu_interpreter::MTVSCR(ppu_thread& ppu, ppu_opcode_t op)
const u32 vscr = ppu.vr[op.vb]._u32[3];
ppu.sat = (vscr & 1) != 0;
ppu.nj = (vscr & 0x10000) != 0;
ppu.jm_mask = ppu.nj ? ppu_inf_u32 : 0x7fff'ffff;
return true;
}

Expand All @@ -427,10 +438,10 @@ bool ppu_interpreter::VADDCUW(ppu_thread& ppu, ppu_opcode_t op)

// VADDFP: computes v128::addfs(a, b) from vr[va] and vr[vb], passes the result through
// vec_handle_nan together with both operands, and stores it in vr[vd]. Always returns true.
// NOTE(review): `a`, `b` and the vr[vd] store each appear twice because this text looks like
// a rendered diff without +/- markers. The lines calling vec_handle_denormal appear to be the
// post-change versions (operands and result filtered for NJ mode) — confirm against the merged file.
bool ppu_interpreter::VADDFP(ppu_thread& ppu, ppu_opcode_t op)
{
const auto a = ppu.vr[op.va];
const auto b = ppu.vr[op.vb];
const auto a = vec_handle_denormal(ppu, ppu.vr[op.va]);
const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]);
const auto result = v128::addfs(a, b);
ppu.vr[op.vd] = vec_handle_nan(result, a, b);
ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(result, a, b));
return true;
}

Expand Down Expand Up @@ -958,26 +969,26 @@ bool ppu_interpreter::VLOGEFP(ppu_thread& ppu, ppu_opcode_t op)

// VMADDFP (fast): computes a * c + b with a separate _mm_mul_ps and _mm_add_ps (not a fused
// operation), passes the result alone through vec_handle_nan, and stores it in vr[vd].
// Always returns true.
// NOTE(review): `a`, `b`, `c` and the vr[vd] store each appear twice because this text looks
// like a rendered diff without +/- markers. The lines calling vec_handle_denormal appear to be
// the post-change versions — confirm against the merged file.
bool ppu_interpreter_fast::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
{
const auto a = ppu.vr[op.va].vf;
const auto b = ppu.vr[op.vb].vf;
const auto c = ppu.vr[op.vc].vf;
const auto a = vec_handle_denormal(ppu, ppu.vr[op.va]).vf;
const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]).vf;
const auto c = vec_handle_denormal(ppu, ppu.vr[op.vc]).vf;
const auto result = _mm_add_ps(_mm_mul_ps(a, c), b);
ppu.vr[op.vd] = vec_handle_nan(result);
ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(result));
return true;
}

// VMADDFP (precise): evaluates v128::fma32f(a, c, b), passes the result through vec_handle_nan
// together with all three operands, and stores it. Always returns true.
// NOTE(review): the destination is indexed with op.rd here while the fast variant uses op.vd —
// presumably two names for the same opcode field; verify in ppu_opcode_t.
// NOTE(review): `a`, `b`, `c` and the store each appear twice because this text looks like a
// rendered diff without +/- markers. The lines calling vec_handle_denormal appear to be the
// post-change versions — confirm against the merged file.
bool ppu_interpreter_precise::VMADDFP(ppu_thread& ppu, ppu_opcode_t op)
{
const auto a = ppu.vr[op.va];
const auto b = ppu.vr[op.vb];
const auto c = ppu.vr[op.vc];
ppu.vr[op.rd] = vec_handle_nan(v128::fma32f(a, c, b), a, b, c);
const auto a = vec_handle_denormal(ppu, ppu.vr[op.va]);
const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]);
const auto c = vec_handle_denormal(ppu, ppu.vr[op.vc]);
ppu.vr[op.rd] = vec_handle_denormal(ppu, vec_handle_nan(v128::fma32f(a, c, b), a, b, c));
return true;
}

// VMAXFP: applies _mm_max_ps to vr[va] and vr[vb], passes the result through vec_handle_nan,
// and stores it in vr[vd]. Always returns true.
// NOTE(review): the two stores look like the before/after lines of a rendered diff without
// +/- markers. In the vec_handle_denormal variant only the result is filtered; the operands
// reach _mm_max_ps unfiltered — confirm that is intended.
bool ppu_interpreter::VMAXFP(ppu_thread& ppu, ppu_opcode_t op)
{
ppu.vr[op.vd] = vec_handle_nan(_mm_max_ps(ppu.vr[op.va].vf, ppu.vr[op.vb].vf));
ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(_mm_max_ps(ppu.vr[op.va].vf, ppu.vr[op.vb].vf)));
return true;
}

Expand Down Expand Up @@ -1123,7 +1134,7 @@ bool ppu_interpreter::VMINFP(ppu_thread& ppu, ppu_opcode_t op)
const auto a = ppu.vr[op.va].vf;
const auto b = ppu.vr[op.vb].vf;
const auto result = _mm_or_ps(_mm_min_ps(a, b), _mm_min_ps(b, a));
ppu.vr[op.vd] = vec_handle_nan(result, a, b);
ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(result, a, b));
return true;
}

Expand Down Expand Up @@ -1463,18 +1474,18 @@ bool ppu_interpreter_fast::VNMSUBFP(ppu_thread& ppu, ppu_opcode_t op)
const auto a = _mm_sub_ps(_mm_mul_ps(ppu.vr[op.va].vf, ppu.vr[op.vc].vf), ppu.vr[op.vb].vf);
const auto b = _mm_set1_ps(-0.0f);
const auto result = _mm_xor_ps(a, b);
ppu.vr[op.vd] = vec_handle_nan(result, a, b);
ppu.vr[op.vd] = vec_handle_nan(result);
return true;
}

// VNMSUBFP (precise): computes -(a * c - b). `m` holds -0.0f in every lane and is XORed in to
// flip the sign: first on vr[vb] to form `b`, then on the v128::fma32f(a, c, b) result to
// form `r`. `r` is passed through vec_handle_nan with a, b, c and stored via op.rd.
// Always returns true.
// NOTE(review): `a`, `c` and the store each appear twice because this text looks like a
// rendered diff without +/- markers. The lines calling vec_handle_denormal appear to be the
// post-change versions — confirm against the merged file.
// NOTE(review): unlike `a` and `c`, `b` is built from the raw vr[vb] and is never passed
// through vec_handle_denormal — confirm whether a denormal vb should also be flushed in NJ mode.
bool ppu_interpreter_precise::VNMSUBFP(ppu_thread& ppu, ppu_opcode_t op)
{
const auto m = _mm_set1_ps(-0.0f);
const auto a = ppu.vr[op.va];
const auto c = ppu.vr[op.vc];
const auto a = vec_handle_denormal(ppu, ppu.vr[op.va]);
const auto c = vec_handle_denormal(ppu, ppu.vr[op.vc]);
const auto b = v128::fromF(_mm_xor_ps(ppu.vr[op.vb].vf, m));
const auto r = v128::fromF(_mm_xor_ps(v128::fma32f(a, c, b).vf, m));
ppu.vr[op.rd] = vec_handle_nan(r, a, b, c);
ppu.vr[op.rd] = vec_handle_denormal(ppu, vec_handle_nan(r, a, b, c));
return true;
}

Expand Down Expand Up @@ -1874,23 +1885,23 @@ bool ppu_interpreter_precise::VPKUWUS(ppu_thread& ppu, ppu_opcode_t op)
// VREFP: divides 1.0f (in all four lanes of `a`) by vr[vb] with _mm_div_ps, i.e. a full
// division rather than an approximate reciprocal instruction. The result is passed through
// vec_handle_nan and stored in vr[vd]. Always returns true.
// NOTE(review): `b` and the vr[vd] store each appear twice because this text looks like a
// rendered diff without +/- markers. The lines calling vec_handle_denormal appear to be the
// post-change versions — confirm against the merged file.
bool ppu_interpreter::VREFP(ppu_thread& ppu, ppu_opcode_t op)
{
const auto a = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f);
const auto b = ppu.vr[op.vb].vf;
const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]).vf;
const auto result = _mm_div_ps(a, b);
ppu.vr[op.vd] = vec_handle_nan(result, a, b);
ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(result, a, b));
return true;
}

// VRFIM: applies std::floor to each of the four float lanes of vr[vb], passes the result
// through vec_handle_nan with the operand, and stores it in vr[vd]. Always returns true.
// NOTE(review): `b` and the vr[vd] store each appear twice because this text looks like a
// rendered diff without +/- markers. The lines calling vec_handle_denormal appear to be the
// post-change versions — confirm against the merged file.
bool ppu_interpreter::VRFIM(ppu_thread& ppu, ppu_opcode_t op)
{
const auto b = ppu.vr[op.vb];
const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]);
v128 d;

// Scalar loop over the four 32-bit float lanes
for (uint w = 0; w < 4; w++)
{
d._f[w] = std::floor(b._f[w]);
}

ppu.vr[op.vd] = vec_handle_nan(d, b);
ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(d, b));
return true;
}

Expand All @@ -1904,21 +1915,21 @@ bool ppu_interpreter::VRFIN(ppu_thread& ppu, ppu_opcode_t op)
d._f[w] = std::nearbyint(b._f[w]);
}

ppu.vr[op.vd] = vec_handle_nan(d, b);
ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(d, b));
return true;
}

// VRFIP: applies std::ceil to each of the four float lanes of vr[vb], passes the result
// through vec_handle_nan with the operand, and stores it in vr[vd]. Always returns true.
// NOTE(review): `b` and the vr[vd] store each appear twice because this text looks like a
// rendered diff without +/- markers. The lines calling vec_handle_denormal appear to be the
// post-change versions — confirm against the merged file. Filtering the operand matters here:
// std::ceil of an unflushed positive denormal is 1.0f, whereas std::ceil of zero is zero.
bool ppu_interpreter::VRFIP(ppu_thread& ppu, ppu_opcode_t op)
{
const auto b = ppu.vr[op.vb];
const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]);
v128 d;

// Scalar loop over the four 32-bit float lanes
for (uint w = 0; w < 4; w++)
{
d._f[w] = std::ceil(b._f[w]);
}

ppu.vr[op.vd] = vec_handle_nan(d, b);
ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(d, b));
return true;
}

Expand All @@ -1932,7 +1943,7 @@ bool ppu_interpreter::VRFIZ(ppu_thread& ppu, ppu_opcode_t op)
d._f[w] = std::truncf(b._f[w]);
}

ppu.vr[op.vd] = vec_handle_nan(d, b);
ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(d, b));
return true;
}

Expand Down Expand Up @@ -1978,9 +1989,9 @@ bool ppu_interpreter::VRLW(ppu_thread& ppu, ppu_opcode_t op)
// VRSQRTEFP: computes 1.0f / sqrt(vr[vb]) per lane with _mm_sqrt_ps followed by _mm_div_ps
// (full-precision operations rather than an approximate rsqrt instruction). The result is
// passed through vec_handle_nan and stored in vr[vd]. Always returns true.
// NOTE(review): `b` and the vr[vd] store each appear twice because this text looks like a
// rendered diff without +/- markers. The lines calling vec_handle_denormal appear to be the
// post-change versions — confirm against the merged file.
bool ppu_interpreter::VRSQRTEFP(ppu_thread& ppu, ppu_opcode_t op)
{
const auto a = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f);
const auto b = ppu.vr[op.vb].vf;
const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]).vf;
const auto result = _mm_div_ps(a, _mm_sqrt_ps(b));
ppu.vr[op.vd] = vec_handle_nan(result, a, b);
ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(result, a, b));
return true;
}

Expand Down Expand Up @@ -2277,10 +2288,10 @@ bool ppu_interpreter::VSUBCUW(ppu_thread& ppu, ppu_opcode_t op)

// VSUBFP: computes v128::subfs(a, b) from vr[va] and vr[vb], passes the result through
// vec_handle_nan together with both operands, and stores it in vr[vd]. Always returns true.
// NOTE(review): `a`, `b` and the vr[vd] store each appear twice because this text looks like
// a rendered diff without +/- markers. The lines calling vec_handle_denormal appear to be the
// post-change versions (operands and result filtered for NJ mode) — confirm against the merged file.
bool ppu_interpreter::VSUBFP(ppu_thread& ppu, ppu_opcode_t op)
{
const auto a = ppu.vr[op.va];
const auto b = ppu.vr[op.vb];
const auto a = vec_handle_denormal(ppu, ppu.vr[op.va]);
const auto b = vec_handle_denormal(ppu, ppu.vr[op.vb]);
const auto result = v128::subfs(a, b);
ppu.vr[op.vd] = vec_handle_nan(result, a, b);
ppu.vr[op.vd] = vec_handle_denormal(ppu, vec_handle_nan(result, a, b));
return true;
}

Expand Down
5 changes: 5 additions & 0 deletions rpcs3/Emu/Cell/PPUThread.cpp
Expand Up @@ -1613,6 +1613,7 @@ extern void ppu_initialize(const ppu_module& info)
non_win32,
accurate_fma,
accurate_ppu_vector_nan,
java_mode_handling,

__bitset_enum_max
};
Expand All @@ -1630,6 +1631,10 @@ extern void ppu_initialize(const ppu_module& info)
{
settings += ppu_settings::accurate_ppu_vector_nan;
}
if (g_cfg.core.llvm_ppu_jm_handling)
{
settings += ppu_settings::java_mode_handling;
}

// Write version, hash, CPU, settings
fmt::append(obj_name, "v3-tane-%s-%s-%s.obj", fmt::base57(output, 16), fmt::base57(settings), jit_compiler::cpu(g_cfg.core.llvm_cpu));
Expand Down
5 changes: 4 additions & 1 deletion rpcs3/Emu/Cell/PPUThread.h
Expand Up @@ -186,7 +186,10 @@ class ppu_thread : public cpu_thread
exception, the corresponding element in the target vr is cleared to '0'. In both cases, the '0'
has the same sign as the denormalized or underflowing value.
*/
bool nj = false;
bool nj = true;

// Optimization: precomputed java-mode mask for handling denormals
u32 jm_mask = 0x7f80'0000;

u32 raddr{0}; // Reservation addr
u64 rtime{0};
Expand Down