PPU LLVM arm64+macOS port #12115

Merged (16 commits) on Jun 14, 2022
3 changes: 1 addition & 2 deletions 3rdparty/MoltenVK/CMakeLists.txt
@@ -7,9 +7,8 @@ ExternalProject_Add(moltenvk
GIT_TAG 1236d2f
BUILD_IN_SOURCE 1
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/MoltenVK
PATCH_COMMAND git apply "${CMAKE_CURRENT_SOURCE_DIR}/patches.patch"
CONFIGURE_COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/MoltenVK/fetchDependencies" --macos
BUILD_COMMAND xcodebuild build -quiet -project "${CMAKE_CURRENT_SOURCE_DIR}/MoltenVK/MoltenVKPackaging.xcodeproj" -scheme "MoltenVK Package \(macOS only\)" -configuration "Release" -arch "x86_64"
BUILD_COMMAND xcodebuild build -quiet -project "${CMAKE_CURRENT_SOURCE_DIR}/MoltenVK/MoltenVKPackaging.xcodeproj" -scheme "MoltenVK Package \(macOS only\)" -configuration "Release" -arch "${CMAKE_HOST_SYSTEM_PROCESSOR}"
COMMAND ln -f "${CMAKE_CURRENT_SOURCE_DIR}/MoltenVK/MoltenVK/dylib/macOS/libMoltenVK.dylib" "${CMAKE_CURRENT_SOURCE_DIR}/MoltenVK/Build/Products/Release/dynamic/libMoltenVK.dylib"
INSTALL_COMMAND ""
BUILD_BYPRODUCTS "${CMAKE_CURRENT_SOURCE_DIR}/MoltenVK/Build/Products/Release/dynamic/libMoltenVK.dylib"
34 changes: 33 additions & 1 deletion Utilities/JIT.cpp
@@ -196,7 +196,11 @@ void* jit_runtime_base::_add(asmjit::CodeHolder* code) noexcept
ensure(!code->relocateToBase(uptr(p)));

{
// We manage rw <-> rx transitions manually on Apple
// because it's easier to keep track of when and where we need to toggle W^X
#if !(defined(ARCH_ARM64) && defined(__APPLE__))
asmjit::VirtMem::ProtectJitReadWriteScope rwScope(p, codeSize);
#endif

for (asmjit::Section* section : code->_sections)
{
@@ -248,6 +252,9 @@ void jit_runtime::initialize()

void jit_runtime::finalize() noexcept
{
#ifdef __APPLE__
pthread_jit_write_protect_np(false);
#endif
// Reset JIT memory
#ifdef CAN_OVERCOMMIT
utils::memory_reset(get_jit_memory(), 0x80000000);
@@ -262,6 +269,15 @@ void jit_runtime::finalize() noexcept
// Restore code/data snapshot
std::memcpy(alloc(s_code_init.size(), 1, true), s_code_init.data(), s_code_init.size());
std::memcpy(alloc(s_data_init.size(), 1, false), s_data_init.data(), s_data_init.size());

#ifdef __APPLE__
pthread_jit_write_protect_np(true);
#endif
#ifdef ARCH_ARM64
// Flush all cache lines after potentially writing executable code
asm("ISB");
asm("DSB ISH");
#endif
}
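
The changes above follow Apple's MAP_JIT workflow for JIT memory on Apple Silicon: make the pages writable for the current thread, emit code, flip back to executable, then make sure the instruction stream is coherent. A minimal macOS-only sketch of that workflow, not RPCS3 code (pthread_jit_write_protect_np and sys_icache_invalidate are real Apple APIs; the code-emission step is a placeholder):

#include <cstddef>
#include <pthread.h>                 // pthread_jit_write_protect_np (macOS 11+)
#include <libkern/OSCacheControl.h>  // sys_icache_invalidate

void write_then_run(void* jit_region, std::size_t size)
{
    // Make MAP_JIT pages writable for the calling thread only.
    pthread_jit_write_protect_np(false);

    // ... emit or copy machine code into jit_region here ...

    // Flip the region back to executable for this thread.
    pthread_jit_write_protect_np(true);

    // Keep the instruction cache coherent with the freshly written code.
    // The PR issues explicit ISB / DSB ISH barriers after writes;
    // sys_icache_invalidate is the library-level call Apple documents for this.
    sys_icache_invalidate(jit_region, size);
}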

jit_runtime_base& asmjit::get_global_runtime()
@@ -432,6 +448,21 @@ static u64 make_null_function(const std::string& name)
c.db(ch);
c.db(0);
c.align(AlignMode::kData, 16);
#else
// AArch64 implementation
Label jmp_address = c.newLabel();
Label data = c.newLabel();
// Force absolute jump to prevent out of bounds PC-rel jmp
c.ldr(args[0], arm::ptr(jmp_address));
c.br(args[0]);
c.align(AlignMode::kCode, 16);

c.bind(data);
c.embed(name.c_str(), name.size());
c.embedUInt8(0U);
c.bind(jmp_address);
c.embedUInt64(reinterpret_cast<u64>(&null));
c.align(AlignMode::kData, 16);
#endif
});
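
For context: a direct AArch64 b/bl branch only reaches about +/-128 MiB from the branch site, so the stub above loads the full 64-bit target from an inline literal and branches through a register. A hedged standalone sketch of the same pattern (emit_absolute_jump is a hypothetical helper; it assumes an asmjit build with the AArch64 backend and mirrors the calls used in the hunk above):

#include <cstdint>
#include <asmjit/a64.h>

// Emits: ldr x16, <literal>; br x16; .quad target
void emit_absolute_jump(asmjit::a64::Assembler& c, std::uint64_t target)
{
    using namespace asmjit;
    Label lit = c.newLabel();
    c.ldr(a64::x16, arm::ptr(lit));   // x16 is the IP0 scratch register in the AArch64 ABI
    c.br(a64::x16);
    c.align(AlignMode::kData, 8);
    c.bind(lit);
    c.embedUInt64(target);            // absolute 64-bit address, reachable from anywhere
}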

@@ -840,6 +871,7 @@ jit_compiler::jit_compiler(const std::unordered_map<std::string, u64>& _link, co
std::string result;

auto null_mod = std::make_unique<llvm::Module> ("null_", *m_context);
null_mod->setTargetTriple(utils::c_llvm_default_triple);

if (_link.empty())
{
@@ -852,7 +884,7 @@ jit_compiler::jit_compiler(const std::unordered_map<std::string, u64>& _link, co
else
{
mem = std::make_unique<MemoryManager2>();
null_mod->setTargetTriple(llvm::Triple::normalize("x86_64-unknown-linux-gnu"));
null_mod->setTargetTriple(utils::c_llvm_default_triple);
}

// Auxiliary JIT (does not use custom memory manager, only writes the objects)
3 changes: 3 additions & 0 deletions Utilities/JIT.h
@@ -269,6 +269,9 @@ namespace asmjit
template <typename FT, typename Asm = native_asm, typename F>
inline FT build_function_asm(std::string_view name, F&& builder)
{
#ifdef __APPLE__
pthread_jit_write_protect_np(false);
#endif
using namespace asmjit;

auto& rt = get_global_runtime();
7 changes: 6 additions & 1 deletion rpcs3/CMakeLists.txt
@@ -138,6 +138,11 @@ find_program(MACDEPLOYQT_EXECUTABLE macdeployqt HINTS "${_qt_bin_dir}")

# Copy icons to executable directory
if(APPLE)
if (CMAKE_BUILD_TYPE MATCHES "Debug" OR CMAKE_BUILD_TYPE MATCHES "RelWithDebInfo")
set(QT_DEPLOY_FLAGS "-no-strip")
else()
set(QT_DEPLOY_FLAGS "")
endif()
add_custom_command(TARGET rpcs3 POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${RPCS3_SRC_DIR}/rpcs3.icns $<TARGET_FILE_DIR:rpcs3>/../Resources/rpcs3.icns
@@ -147,7 +152,7 @@ if(APPLE)
${CMAKE_SOURCE_DIR}/bin/GuiConfigs $<TARGET_FILE_DIR:rpcs3>/../Resources/GuiConfigs
COMMAND ${CMAKE_COMMAND} -E copy_directory
${CMAKE_SOURCE_DIR}/bin/git $<TARGET_FILE_DIR:rpcs3>/../Resources/git
COMMAND "${MACDEPLOYQT_EXECUTABLE}" "${PROJECT_BINARY_DIR}/bin/rpcs3.app")
COMMAND "${MACDEPLOYQT_EXECUTABLE}" "${PROJECT_BINARY_DIR}/bin/rpcs3.app" "${QT_DEPLOY_FLAGS}")
elseif(UNIX)
add_custom_command(TARGET rpcs3 POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_directory
24 changes: 23 additions & 1 deletion rpcs3/Emu/CPU/CPUTranslator.h
@@ -21,6 +21,8 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/IntrinsicsAArch64.h"

#ifdef _MSC_VER
#pragma warning(pop)
#else
@@ -2894,8 +2896,12 @@ class cpu_translator
bool m_is_be;

// Allow PSHUFB intrinsic
#ifdef ARCH_X64
bool m_use_ssse3 = true;

#else
// TODO: fix the pshufb arm64 native impl using TBL instruction
bool m_use_ssse3 = false;
#endif
// Allow FMA
bool m_use_fma = false;
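
The TBL-based replacement that the TODO above refers to can be expressed with NEON intrinsics. A hedged sketch, not the translator's LLVM IR (pshufb_neon is a hypothetical helper name):

#include <arm_neon.h>

uint8x16_t pshufb_neon(uint8x16_t src, uint8x16_t mask)
{
    // PSHUFB zeroes a lane when bit 7 of the index byte is set; TBL returns 0 for
    // any index >= 16, so keeping only bit 7 and the low 4 bits of each index byte
    // reproduces that behaviour.
    const uint8x16_t idx = vandq_u8(mask, vdupq_n_u8(0x8f));
    return vqtbl1q_u8(src, idx);
}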

@@ -3640,25 +3646,41 @@ class cpu_translator
template <typename T, typename = std::enable_if_t<std::is_same_v<llvm_common_t<T>, f32[4]>>>
static auto fre(T&& a)
{
#if defined(ARCH_X64)
return llvm_calli<f32[4], T>{"llvm.x86.sse.rcp.ps", {std::forward<T>(a)}};
#elif defined(ARCH_ARM64)
return llvm_calli<f32[4], T>{"llvm.aarch64.neon.frecpe.v4f32", {std::forward<T>(a)}};
#endif
}

template <typename T, typename = std::enable_if_t<std::is_same_v<llvm_common_t<T>, f32[4]>>>
static auto frsqe(T&& a)
{
#if defined(ARCH_X64)
return llvm_calli<f32[4], T>{"llvm.x86.sse.rsqrt.ps", {std::forward<T>(a)}};
#elif defined(ARCH_ARM64)
return llvm_calli<f32[4], T>{"llvm.aarch64.neon.frsqrte.v4f32", {std::forward<T>(a)}};
#endif
}

template <typename T, typename U, typename = std::enable_if_t<std::is_same_v<llvm_common_t<T, U>, f32[4]>>>
static auto fmax(T&& a, U&& b)
{
#if defined(ARCH_X64)
return llvm_calli<f32[4], T, U>{"llvm.x86.sse.max.ps", {std::forward<T>(a), std::forward<U>(b)}};
#elif defined(ARCH_ARM64)
return llvm_calli<f32[4], T, U>{"llvm.aarch64.neon.fmax.v4f32", {std::forward<T>(a), std::forward<U>(b)}};
#endif
}

template <typename T, typename U, typename = std::enable_if_t<std::is_same_v<llvm_common_t<T, U>, f32[4]>>>
static auto fmin(T&& a, U&& b)
{
#if defined(ARCH_X64)
return llvm_calli<f32[4], T, U>{"llvm.x86.sse.min.ps", {std::forward<T>(a), std::forward<U>(b)}};
#elif defined(ARCH_ARM64)
return llvm_calli<f32[4], T, U>{"llvm.aarch64.neon.fmin.v4f32", {std::forward<T>(a), std::forward<U>(b)}};
#endif
}

template <typename T, typename U, typename = std::enable_if_t<std::is_same_v<llvm_common_t<T, U>, u8[16]>>>
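
For reference, the AArch64 intrinsics chosen above have direct C-level counterparts in <arm_neon.h>. A sketch (function names are illustrative, not RPCS3 code):

#include <arm_neon.h>

float32x4_t recip_est(float32x4_t a)               { return vrecpeq_f32(a);  } // counterpart of llvm.aarch64.neon.frecpe.v4f32
float32x4_t rsqrt_est(float32x4_t a)               { return vrsqrteq_f32(a); } // counterpart of llvm.aarch64.neon.frsqrte.v4f32
float32x4_t vec_fmax(float32x4_t a, float32x4_t b) { return vmaxq_f32(a, b); } // counterpart of llvm.aarch64.neon.fmax.v4f32
float32x4_t vec_fmin(float32x4_t a, float32x4_t b) { return vminq_f32(a, b); } // counterpart of llvm.aarch64.neon.fmin.v4f32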
138 changes: 138 additions & 0 deletions rpcs3/Emu/Cell/PPUThread.cpp
@@ -65,6 +65,10 @@
#include "util/simd.hpp"
#include "util/sysinfo.hpp"

#ifdef __APPLE__
#include <libkern/OSCacheControl.h>
#endif

extern atomic_t<u64> g_watchdog_hold_ctr;

// Should be of the same type
@@ -247,7 +251,104 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",

c.ret();
#else
// See https://github.com/ghc/ghc/blob/master/rts/include/stg/MachRegs.h
// for GHC calling convention definitions on Aarch64
// and https://developer.arm.com/documentation/den0024/a/The-ABI-for-ARM-64-bit-Architecture/Register-use-in-the-AArch64-Procedure-Call-Standard/Parameters-in-general-purpose-registers
// for AArch64 calling convention

// Push callee saved registers to the stack
// We need to save x18-x30 = 13 x 8B each + 8 bytes for 16B alignment = 112B
c.sub(a64::sp, a64::sp, Imm(112));
c.stp(a64::x18, a64::x19, arm::Mem(a64::sp));
c.stp(a64::x20, a64::x21, arm::Mem(a64::sp, 16));
c.stp(a64::x22, a64::x23, arm::Mem(a64::sp, 32));
c.stp(a64::x24, a64::x25, arm::Mem(a64::sp, 48));
c.stp(a64::x26, a64::x27, arm::Mem(a64::sp, 64));
c.stp(a64::x28, a64::x29, arm::Mem(a64::sp, 80));
c.str(a64::x30, arm::Mem(a64::sp, 96));

// Save sp for native longjmp emulation
Label native_sp_offset = c.newLabel();
c.ldr(a64::x10, arm::Mem(native_sp_offset));
c.str(a64::sp, arm::Mem(args[0], a64::x10));

// Load REG_Base - use absolute jump target to bypass rel jmp range limits
Label exec_addr = c.newLabel();
c.ldr(a64::x19, arm::Mem(exec_addr));
c.ldr(a64::x19, arm::Mem(a64::x19));
// Load PPUThread struct base -> REG_Sp
const arm::GpX ppu_t_base = a64::x20;
c.mov(ppu_t_base, args[0]);
// Load PC
const arm::GpX pc = a64::x26;
Label cia_offset = c.newLabel();
const arm::GpX cia_addr_reg = a64::x11;
// Load offset value
c.ldr(cia_addr_reg, arm::Mem(cia_offset));
// Load cia
c.ldr(pc, arm::Mem(ppu_t_base, cia_addr_reg));
// Zero top 32 bits
c.mov(a64::w26, a64::w26);
// Multiply by 2 to index into ptr table
const arm::GpX index_shift = a64::x27;
c.mov(index_shift, Imm(2));
c.mul(pc, pc, index_shift);

// Load call target
const arm::GpX call_target = a64::x28;
c.ldr(call_target, arm::Mem(a64::x19, pc));
// Compute REG_Hp
const arm::GpX reg_hp = a64::x21;
c.mov(reg_hp, call_target);
c.lsr(reg_hp, reg_hp, 48);
c.lsl(reg_hp, reg_hp, 13);

// Zero top 16 bits of call target
c.lsl(call_target, call_target, Imm(16));
c.lsr(call_target, call_target, Imm(16));

// Load registers
Label base_addr = c.newLabel();
c.ldr(a64::x22, arm::Mem(base_addr));
c.ldr(a64::x22, arm::Mem(a64::x22));

Label gpr_addr_offset = c.newLabel();
const arm::GpX gpr_addr_reg = a64::x9;
c.ldr(gpr_addr_reg, arm::Mem(gpr_addr_offset));
c.add(gpr_addr_reg, gpr_addr_reg, ppu_t_base);
c.ldr(a64::x23, arm::Mem(gpr_addr_reg));
c.ldr(a64::x24, arm::Mem(gpr_addr_reg, 8));
c.ldr(a64::x25, arm::Mem(gpr_addr_reg, 16));

// Execute LLE call
c.blr(call_target);

// Restore stack ptr
c.ldr(a64::x10, arm::Mem(native_sp_offset));
c.ldr(a64::sp, arm::Mem(args[0], a64::x10));
// Restore registers from the stack
c.ldp(a64::x18, a64::x19, arm::Mem(a64::sp));
c.ldp(a64::x20, a64::x21, arm::Mem(a64::sp, 16));
c.ldp(a64::x22, a64::x23, arm::Mem(a64::sp, 32));
c.ldp(a64::x24, a64::x25, arm::Mem(a64::sp, 48));
c.ldp(a64::x26, a64::x27, arm::Mem(a64::sp, 64));
c.ldp(a64::x28, a64::x29, arm::Mem(a64::sp, 80));
c.ldr(a64::x30, arm::Mem(a64::sp, 96));
// Restore stack ptr
c.add(a64::sp, a64::sp, Imm(112));
// Return
c.ret(a64::x30);

c.bind(exec_addr);
c.embedUInt64(reinterpret_cast<u64>(&vm::g_exec_addr));
c.bind(base_addr);
c.embedUInt64(reinterpret_cast<u64>(&vm::g_base_addr));
c.bind(cia_offset);
c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::cia)));
c.bind(gpr_addr_offset);
c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::gpr)));
c.bind(native_sp_offset);
c.embedUInt64(static_cast<u64>(::offset32(&ppu_thread::saved_native_sp)));
#endif
});
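
To make the gateway's address arithmetic easier to follow, here is a hedged C-level sketch of the dispatch as read from the assembly above. The stand-in types are simplified, and the real gateway has to stay in assembly because it pins values into the GHC registers x19..x25 and saves and restores the native stack pointer itself:

#include <cstdint>

struct ppu_thread_sketch { std::uint32_t cia; std::uint64_t gpr[32]; std::uint64_t saved_native_sp; };

using ppu_function = void (*)();

// exec_table stands in for what vm::g_exec_addr points to: one 8-byte entry per
// 4-byte PPU instruction address.
ppu_function resolve_call_target(const std::uint64_t* exec_table, const ppu_thread_sketch& ppu,
                                 std::uint64_t& reg_hp_out)
{
    // "Multiply by 2 to index into ptr table": byte offset = cia * 2, i.e. entry index = cia / 4.
    const std::uint64_t entry = exec_table[ppu.cia / 4];

    // The top 16 bits of the entry carry auxiliary data; shifted left by 13 they form REG_Hp.
    reg_hp_out = (entry >> 48) << 13;

    // The low 48 bits hold the code address the gateway reaches with blr.
    return reinterpret_cast<ppu_function>(entry & 0xffff'ffff'ffffULL);
}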

@@ -1252,6 +1353,9 @@ void ppu_thread::cpu_task()
}
case ppu_cmd::initialize:
{
#ifdef __APPLE__
pthread_jit_write_protect_np(false);
#endif
cmd_pop();

while (!g_fxo->get<rsx::thread>().is_inited && !is_stopped())
@@ -1267,6 +1371,15 @@
thread_ctrl::wait_on<atomic_wait::op_ne>(g_progr_ptotal, 0);
g_fxo->get<progress_dialog_workaround>().skip_the_progress_dialog = true;

#ifdef __APPLE__
pthread_jit_write_protect_np(true);
#endif
#ifdef ARCH_ARM64
// Flush all cache lines after potentially writing executable code
asm("ISB");
asm("DSB ISH");
#endif

break;
}
case ppu_cmd::sleep:
@@ -1396,6 +1509,15 @@ ppu_thread::ppu_thread(const ppu_thread_params& param, std::string_view name, u3
{
call_history.data.resize(call_history_max_size);
}

#ifdef __APPLE__
pthread_jit_write_protect_np(true);
#endif
#ifdef ARCH_ARM64
// Flush all cache lines after potentially writing executable code
asm("ISB");
asm("DSB ISH");
#endif
}

ppu_thread::thread_name_t::operator std::string() const
@@ -1974,6 +2096,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime
#endif
c.ret();
#else
// Unimplemented should fail.
c.brk(Imm(0x42));
c.ret(a64::x30);
#endif
});
@@ -2552,6 +2676,9 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<lv2_

named_thread_group workers("SPRX Worker ", std::min<u32>(utils::get_thread_count(), ::size32(file_queue)), [&]
{
#ifdef __APPLE__
pthread_jit_write_protect_np(false);
#endif
// Set low priority
thread_ctrl::scoped_priority low_prio(-1);
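
Worth noting: pthread_jit_write_protect_np() is a per-thread switch, which is why each worker above disables write protection for itself rather than relying on the main thread having done so. A minimal sketch under that assumption (worker_entry is a placeholder, not RPCS3 code):

#include <pthread.h>
#include <thread>

void worker_entry()
{
#ifdef __APPLE__
    // Per-thread: MAP_JIT pages become writable only for the calling thread.
    pthread_jit_write_protect_np(false);
#endif
    // ... JIT-compile modules on this thread ...
}

// Usage: std::thread worker(worker_entry); worker.join();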

@@ -3226,6 +3353,9 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
// Set low priority
thread_ctrl::scoped_priority low_prio(-1);

#ifdef __APPLE__
pthread_jit_write_protect_np(false);
#endif
for (u32 i = work_cv++; i < workload.size(); i = work_cv++, g_progr_pdone++)
{
if (Emu.IsStopped())
Expand Down Expand Up @@ -3287,6 +3417,9 @@ bool ppu_initialize(const ppu_module& info, bool check_only)
}

// Jit can be null if the loop doesn't ever enter.
#ifdef __APPLE__
pthread_jit_write_protect_np(false);
#endif
if (jit && !jit_mod.init)
{
jit->fin();
@@ -3345,7 +3478,12 @@ static void ppu_initialize2(jit_compiler& jit, const ppu_module& module_part, co
std::unique_ptr<Module> _module = std::make_unique<Module>(obj_name, jit.get_context());

// Initialize target
#if defined(__APPLE__) && defined(ARCH_ARM64)
// Force target linux on macOS arm64 to bypass some 64-bit address space linking issues
_module->setTargetTriple(utils::c_llvm_default_triple);
#else
_module->setTargetTriple(Triple::normalize(sys::getProcessTriple()));
#endif
_module->setDataLayout(jit.get_engine().getTargetMachine()->createDataLayout());

// Initialize translator
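
A hedged sketch of the triple selection that the #if above performs, written against the plain LLVM C++ API (pin_module_triple is a hypothetical helper; the exact value of utils::c_llvm_default_triple is RPCS3-internal, and an ELF/Linux triple of this form is only an assumption):

#include <string>
#include <llvm/IR/Module.h>
#include <llvm/ADT/Triple.h>
#include <llvm/Support/Host.h>

void pin_module_triple(llvm::Module& m, bool force_linux_elf)
{
    // On macOS arm64 the PR keeps a Linux-style triple so objects are emitted and
    // linked with ELF semantics; elsewhere the host process triple is used.
    const std::string triple = force_linux_elf
        ? llvm::Triple::normalize("aarch64-unknown-linux-gnu")       // assumed ELF triple
        : llvm::Triple::normalize(llvm::sys::getProcessTriple());
    m.setTargetTriple(triple);
}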