YJIT: Add ability to exit to interpreter from stubs #5180

Merged (3 commits, Nov 26, 2021)
18 changes: 18 additions & 0 deletions bootstraptest/test_yjit.rb
@@ -2434,6 +2434,24 @@ def use(arg)
A.new.use 1
}

+assert_equal 'ok', %q{
+  # test hitting a branch stub when out of memory
+  def nimai(jita)
+    if jita
+      :ng
+    else
+      :ok
+    end
+  end
+
+  nimai(true)
+  nimai(true)
+
+  RubyVM::YJIT.simulate_oom! if defined?(RubyVM::YJIT)
+
+  nimai(false)
+} if false # disabled for now since OOM crashes in the test harness

# block invalidation while out of memory
assert_equal 'new', %q{
def foo
2 changes: 2 additions & 0 deletions yjit.c
@@ -123,6 +123,8 @@ YJIT_DECLARE_COUNTERS(
    compiled_iseq_count,
    compiled_block_count,

+    exit_from_branch_stub,
+
    invalidation_count,
    invalidate_method_lookup,
    invalidate_bop_redefined,
20 changes: 20 additions & 0 deletions yjit_codegen.c
@@ -382,6 +382,26 @@ yjit_gen_leave_exit(codeblock_t *cb)
    return code_ptr;
}

+// Fill code_for_exit_from_stub. This is used by branch_stub_hit() to exit
+// to the interpreter when it cannot service a stub by generating new code.
+// Before coming here, branch_stub_hit() takes care of fully reconstructing
+// interpreter state.
+static void
+gen_code_for_exit_from_stub(void)
+{
+    codeblock_t *cb = ocb;
+    code_for_exit_from_stub = cb_get_ptr(cb, cb->write_pos);
+
+    GEN_COUNTER_INC(cb, exit_from_branch_stub);
+
+    pop(cb, REG_SP);
+    pop(cb, REG_EC);
+    pop(cb, REG_CFP);
+
+    mov(cb, RAX, imm_opnd(Qundef));
+    ret(cb);
+}

// :side-exit:
// Get an exit for the current instruction in the outlined block. The code
// for each instruction often begins with several guards before proceeding
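A note on the pops in gen_code_for_exit_from_stub() above: they unwind the native frame that the entry prologue builds before generated code runs. The sketch below shows the assumed pairing; the push sequence is paraphrased from yjit_entry_prologue(), which this PR does not change.

// Entry (sketch): the prologue saves the interpreter registers on the
// C stack before jumping into generated code.
push(cb, REG_CFP);
push(cb, REG_EC);
push(cb, REG_SP);

// Exit (this PR): pop the saved registers in reverse order, then return
// Qundef, which tells the interpreter to resume at the cfp->pc that
// branch_stub_hit() reconstructed.
pop(cb, REG_SP);
pop(cb, REG_EC);
pop(cb, REG_CFP);
mov(cb, RAX, imm_opnd(Qundef));
ret(cb);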
2 changes: 2 additions & 0 deletions yjit_codegen.h
@@ -16,6 +16,8 @@ static uint8_t *yjit_entry_prologue(codeblock_t *cb, const rb_iseq_t *iseq);

static void yjit_gen_block(block_t *block, rb_execution_context_t *ec);

+static void gen_code_for_exit_from_stub(void);
+
static void yjit_init_codegen(void);

#endif // #ifndef YJIT_CODEGEN_H
152 changes: 107 additions & 45 deletions yjit_core.c
@@ -9,6 +9,10 @@
#include "yjit_core.h"
#include "yjit_codegen.h"

+// For exiting from a YJIT frame from branch_stub_hit().
+// Filled by gen_code_for_exit_from_stub().
+static uint8_t *code_for_exit_from_stub = NULL;
+
/*
Get an operand for the adjusted stack pointer address
*/
@@ -597,6 +601,52 @@ add_block_version(blockid_t blockid, block_t *block)
#endif
}

+static ptrdiff_t
+branch_code_size(const branch_t *branch)
+{
+    return branch->end_addr - branch->start_addr;
+}
+
+// Generate code for a branch, possibly rewriting and changing the size of it
+static void
+regenerate_branch(codeblock_t *cb, branch_t *branch)
Comment on lines +611 to +612
Contributor: Factoring this into its own function is a good idea 👍 🙂

+{
+    if (branch->start_addr < cb_get_ptr(cb, yjit_codepage_frozen_bytes)) {
+        // Generating this branch would modify frozen bytes. Do nothing.
+        return;
+    }
+
+    const uint32_t old_write_pos = cb->write_pos;
+    const bool branch_terminates_block = branch->end_addr == branch->block->end_addr;
+
+    RUBY_ASSERT(branch->dst_addrs[0] != NULL);
+
+    cb_set_write_ptr(cb, branch->start_addr);
+    branch->gen_fn(cb, branch->dst_addrs[0], branch->dst_addrs[1], branch->shape);
+    branch->end_addr = cb_get_write_ptr(cb);
+
+    if (branch_terminates_block) {
+        // Adjust block size
+        branch->block->end_addr = branch->end_addr;
+    }
+
+    // cb->write_pos is both a write cursor and a marker for the end of
+    // everything written out so far. Leave cb->write_pos at the end of the
+    // block before returning. This function only ever bumps or retains the
+    // end-of-block marker since that's what the majority of callers want.
+    // When the branch sits at the very end of the codeblock and it shrinks
+    // after regeneration, it's up to the caller to drop bytes off the end
+    // so as not to leave a gap and to implement branch->shape.
+    if (old_write_pos > cb->write_pos) {
+        // We rewound cb->write_pos to generate the branch, now restore it.
+        cb_set_pos(cb, old_write_pos);
+    }
Comment on lines +640 to +643
Contributor: I think you should add a comment explaining why this if statement is there. Otherwise it seems a little strange without detailed prior knowledge of BBV.

Member (author): Good call! How do you like the comment in f423be9? I feel like this is not a BBV issue per se; it's more that the code block abstraction is a bit tricky and has this gotcha. I feel like we may want a setup where you derive a short-lived write cursor from the code block to make this more natural. It would also be good for tracking which virtual memory pages we have written to if we want a more granular implementation of #5032. Totally out of scope for this PR, of course; I just wanted to get this idea out there.

Contributor: IMO the comment should explain that if the branch is at the end of the code block and shrinks, we want to recover that space instead of leaving a gap. You're right that the current setup, with the write position tracking the size of the generated code and also being adjusted when patching code, is not ideal. I also think you're right that it would be nice to have a more granular mechanism for handling the W^X issue, because otherwise we're sadly going to be leaving performance on the table. Very much open to it if you have ideas for what this better abstraction would look like. Any way that you could sketch the API?

Member (author): On second thought, it might be too early to go for heavier abstraction. I searched around, and this function seems to be the only place where we seek around in the code block. This if handles and documents the quirk, and it should be okay enough for now. Tracking writes probably involves passing around an array of pointers to the start of pages, like we currently pass around codeblock_t *. I haven't given it that much thought; #5032 probably needs benchmarking to see if it's already fast enough.

+    else {
+        // The branch sits at the end of cb and consumed some memory.
+        // Keep cb->write_pos.
+    }
+}
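A minimal sketch of the short-lived write-cursor idea from the review thread above. The names codecursor_t, cursor_begin, and cursor_end are invented for illustration; nothing like them exists in YJIT or in this PR, which manipulates cb->write_pos directly via cb_set_write_ptr() and cb_set_pos().

// Hypothetical cursor that seeks to a patch location and restores the
// end-of-everything marker when finished, so callers never have to
// juggle cb->write_pos by hand the way regenerate_branch() does.
typedef struct {
    codeblock_t *cb;
    uint32_t saved_pos; // cb->write_pos before seeking
} codecursor_t;

static codecursor_t
cursor_begin(codeblock_t *cb, uint8_t *patch_addr)
{
    codecursor_t cursor = { cb, cb->write_pos };
    cb_set_write_ptr(cb, patch_addr); // seek to the code being patched
    return cursor;
}

static void
cursor_end(codecursor_t *cursor)
{
    codeblock_t *cb = cursor->cb;
    // Only ever bump or retain the end marker, mirroring the rule
    // regenerate_branch() documents. A granular W^X scheme (#5032) could
    // also record which pages were touched between begin and end here.
    if (cursor->saved_pos > cb->write_pos) {
        cb_set_pos(cb, cursor->saved_pos);
    }
}

Under a scheme like this, the seek/generate/restore dance in regenerate_branch() would shrink to cursor_begin(), branch->gen_fn(), cursor_end().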

// Create a new outgoing branch entry for a block
static branch_t*
make_branch_entry(block_t *block, const ctx_t *src_ctx, branchgen_fn gen_fn)
@@ -777,13 +827,15 @@ gen_entry_point(const rb_iseq_t *iseq, uint32_t insn_idx, rb_execution_context_t
static uint8_t *
branch_stub_hit(branch_t *branch, const uint32_t target_idx, rb_execution_context_t *ec)
{
-    uint8_t *dst_addr;
+    uint8_t *dst_addr = NULL;

    // Stop other ractors since we are going to patch machine code.
    // This is how the GC does it.
    RB_VM_LOCK_ENTER();
    rb_vm_barrier();

+    const ptrdiff_t branch_size_on_entry = branch_code_size(branch);
+
    RUBY_ASSERT(branch != NULL);
    RUBY_ASSERT(target_idx < 2);
    blockid_t target = branch->targets[target_idx];
@@ -794,18 +846,13 @@ branch_stub_hit(branch_t *branch, const uint32_t target_idx, rb_execution_contex
    if (branch->blocks[target_idx]) {
        dst_addr = branch->dst_addrs[target_idx];
    }
-    else
-    {
-        //fprintf(stderr, "\nstub hit, branch: %p, target idx: %d\n", branch, target_idx);
-        //fprintf(stderr, "blockid.iseq=%p, blockid.idx=%d\n", target.iseq, target.idx);
-        //fprintf(stderr, "chain_depth=%d\n", target_ctx->chain_depth);
-
+    else {
        // :stub-sp-flush:
        // Generated code does stack operations without modifying cfp->sp, while the
        // cfp->sp tells the GC what values on the stack to root. Generated code
        // generally takes care of updating cfp->sp when it calls runtime routines that
-        // could trigger GC, but for the case of branch stubs, it's inconvenient. So
-        // we do it here.
+        // could trigger GC, but it's inconvenient to do it before calling this function.
+        // So we do it here instead.
        VALUE *const original_interp_sp = ec->cfp->sp;
        ec->cfp->sp += target_ctx->sp_offset;
@@ -818,52 +865,74 @@ branch_stub_hit(branch_t *branch, const uint32_t target_idx, rb_execution_contex

        // If this block hasn't yet been compiled
        if (!p_block) {
+            const uint8_t branch_old_shape = branch->shape;
+            bool branch_modified = false;
+
            // If the new block can be generated right after the branch (at cb->write_pos)
-            if (cb_get_write_ptr(cb) == branch->end_addr && branch->start_addr >= cb_get_ptr(cb, yjit_codepage_frozen_bytes)) {
+            if (cb_get_write_ptr(cb) == branch->end_addr) {
                // This branch should be terminating its block
                RUBY_ASSERT(branch->end_addr == branch->block->end_addr);

                // Change the branch shape to indicate the target block will be placed next
                branch->shape = (uint8_t)target_idx;

                // Rewrite the branch with the new, potentially more compact shape
-                cb_set_write_ptr(cb, branch->start_addr);
-                branch->gen_fn(cb, branch->dst_addrs[0], branch->dst_addrs[1], branch->shape);
-                RUBY_ASSERT(cb_get_write_ptr(cb) <= branch->end_addr && "can't enlarge branches");
-                branch->end_addr = cb_get_write_ptr(cb);
-                branch->block->end_addr = cb_get_write_ptr(cb);
+                regenerate_branch(cb, branch);
+                branch_modified = true;
+
+                // Ensure that the branch terminates the codeblock just like
+                // before entering this if block. This drops bytes off the end
+                // in case we shrank the branch when regenerating.
+                cb_set_write_ptr(cb, branch->end_addr);
            }

            // Compile the new block version
            p_block = gen_block_version(target, target_ctx, ec);
-            RUBY_ASSERT(p_block);
-            RUBY_ASSERT(!(branch->shape == (uint8_t)target_idx && p_block->start_addr != branch->end_addr));
+
+            if (!p_block && branch_modified) {
+                // We couldn't generate a new block for the branch, but we modified the branch.
+                // Restore the branch by regenerating it.
+                branch->shape = branch_old_shape;
+                regenerate_branch(cb, branch);
+            }
        }

-        // Add this branch to the list of incoming branches for the target
-        rb_darray_append(&p_block->incoming, branch);
+        if (p_block) {
+            // Branch shape should reflect layout
+            RUBY_ASSERT(!(branch->shape == (uint8_t)target_idx && p_block->start_addr != branch->end_addr));

-        // Update the branch target address
-        dst_addr = p_block->start_addr;
-        branch->dst_addrs[target_idx] = dst_addr;
+            // Add this branch to the list of incoming branches for the target
+            rb_darray_append(&p_block->incoming, branch);

-        // Rewrite the branch with the new jump target address
-        if (branch->start_addr >= cb_get_ptr(cb, yjit_codepage_frozen_bytes)) {
-            RUBY_ASSERT(branch->dst_addrs[0] != NULL);
-            uint32_t cur_pos = cb->write_pos;
-            cb_set_write_ptr(cb, branch->start_addr);
-            branch->gen_fn(cb, branch->dst_addrs[0], branch->dst_addrs[1], branch->shape);
-            RUBY_ASSERT(cb_get_write_ptr(cb) == branch->end_addr && "branch can't change size");
-            cb_set_pos(cb, cur_pos);
-        }
+            // Update the branch target address
+            dst_addr = p_block->start_addr;
+            branch->dst_addrs[target_idx] = dst_addr;

-        // Mark this branch target as patched (no longer a stub)
-        branch->blocks[target_idx] = p_block;
+            // Mark this branch target as patched (no longer a stub)
+            branch->blocks[target_idx] = p_block;

-        // Restore interpreter sp, since the code hitting the stub expects the original.
-        ec->cfp->sp = original_interp_sp;
+            // Rewrite the branch with the new jump target address
+            regenerate_branch(cb, branch);

+            // Restore interpreter sp, since the code hitting the stub expects the original.
+            ec->cfp->sp = original_interp_sp;
+        }
+        else {
+            // Failed to service the stub by generating a new block, so now we
+            // need to exit to the interpreter at the stubbed location. We are
+            // intentionally *not* restoring original_interp_sp. At the time of
+            // writing, reconstructing interpreter state only involves setting
+            // cfp->sp and cfp->pc. We set both before trying to generate the
+            // block. All there is left to do to exit is to pop the native
+            // frame. We do that in code_for_exit_from_stub.
+            dst_addr = code_for_exit_from_stub;
+        }
    }

+    const ptrdiff_t new_branch_size = branch_code_size(branch);
+    RUBY_ASSERT_ALWAYS(new_branch_size >= 0);
+    RUBY_ASSERT_ALWAYS(new_branch_size <= branch_size_on_entry && "branch stubs should not enlarge branches");
+
    RB_VM_LOCK_LEAVE();

    // Return a pointer to the compiled block version
@@ -942,8 +1011,7 @@ gen_branch(

    // Call the branch generation function
    branch->start_addr = cb_get_write_ptr(cb);
-    gen_fn(cb, branch->dst_addrs[0], branch->dst_addrs[1], SHAPE_DEFAULT);
-    branch->end_addr = cb_get_write_ptr(cb);
+    regenerate_branch(cb, branch);
}

static void
@@ -1191,13 +1259,7 @@ invalidate_block_version(block_t *block)
        }

        // Rewrite the branch with the new jump target address
-        RUBY_ASSERT(branch->dst_addrs[0] != NULL);
-        uint32_t cur_pos = cb->write_pos;
-        cb_set_write_ptr(cb, branch->start_addr);
-        branch->gen_fn(cb, branch->dst_addrs[0], branch->dst_addrs[1], branch->shape);
-        branch->end_addr = cb_get_write_ptr(cb);
-        branch->block->end_addr = cb_get_write_ptr(cb);
-        cb_set_pos(cb, cur_pos);
+        regenerate_branch(cb, branch);

        if (target_next && branch->end_addr > block->end_addr) {
            fprintf(stderr, "branch_block_idx=%u block_idx=%u over=%ld block_size=%ld\n",
@@ -1243,5 +1305,5 @@ invalidate_block_version(block_t *block)
static void
yjit_init_core(void)
{
-    // Nothing yet
+    gen_code_for_exit_from_stub();
}
6 changes: 3 additions & 3 deletions yjit_core.h
@@ -192,8 +192,8 @@ typedef struct yjit_branch_entry
    struct yjit_block_version *block;

    // Positions where the generated code starts and ends
-    uint8_t* start_addr;
-    uint8_t* end_addr;
+    uint8_t *start_addr;
+    uint8_t *end_addr;

    // Context right after the branch instruction
    ctx_t src_ctx;
@@ -204,7 +204,7 @@ typedef struct yjit_branch_entry
    struct yjit_block_version *blocks[2];

    // Jump target addresses
-    uint8_t* dst_addrs[2];
+    uint8_t *dst_addrs[2];

    // Branch code generation function
    branchgen_fn gen_fn;
5 changes: 2 additions & 3 deletions yjit_iface.c
@@ -1232,17 +1232,16 @@ rb_yjit_init(struct rb_yjit_options *options)
    }

    // If type propagation is disabled, max 1 version per block
-    if (rb_yjit_opts.no_type_prop)
-    {
+    if (rb_yjit_opts.no_type_prop) {
        rb_yjit_opts.max_versions = 1;
    }

    blocks_assuming_stable_global_constant_state = st_init_numtable();
    blocks_assuming_single_ractor_mode = st_init_numtable();
    blocks_assuming_bops = st_init_numtable();

-    yjit_init_core();
    yjit_init_codegen();
+    yjit_init_core();

    // YJIT Ruby module
    mYjit = rb_define_module_under(rb_cRubyVM, "YJIT");
4 changes: 0 additions & 4 deletions yjit_utils.c
@@ -101,13 +101,9 @@ print_str(codeblock_t *cb, const char *str)
        cb_write_byte(cb, (uint8_t)str[i]);
    cb_write_byte(cb, 0);

-    push(cb, RSP); // Alignment
-
    // Call the print function
    mov(cb, RAX, const_ptr_opnd((void*)&print_str_cfun));
    call(cb, RAX);

-    pop(cb, RSP); // Alignment
-
    pop_regs(cb);