add --yjit-dump-iseq-disasm param (Shopify#332)

ruby · Aug 24, 2022 · b4be3c0 · b4be3c0
1 parent 0ad9cc1
commit b4be3c0
Show file tree

Hide file tree

Showing 7 changed files with 170 additions and 41 deletions.
diff --git a/yjit.c b/yjit.c
@@ -399,6 +399,18 @@ rb_str_bytesize(VALUE str)
     return LONG2NUM(RSTRING_LEN(str));
 }
 
+unsigned long
+rb_RSTRING_LEN(VALUE str)
+{
+    return RSTRING_LEN(str); // exported as a function so the Rust bindings can link to it
+}
+
+char *
+rb_RSTRING_PTR(VALUE str)
+{
+    return RSTRING_PTR(str); // exported as a function so the Rust bindings can link to it
+}
+
 // This is defined only as a named struct inside rb_iseq_constant_body.
 // By giving it a separate typedef, we make it nameable by rust-bindgen.
 // Bindgen's temp/anon name isn't guaranteed stable.

diff --git a/yjit/bindgen/src/main.rs b/yjit/bindgen/src/main.rs
@@ -70,6 +70,9 @@ fn main() {
         .allowlist_function("rb_str_buf_append")
         .allowlist_function("rb_str_dup")
 
+        // From encindex.h
+        .allowlist_type("ruby_preserved_encindex")
+
         // This struct is public to Ruby C extensions
         // From include/ruby/internal/core/rbasic.h
         .allowlist_type("RBasic")
@@ -240,6 +243,7 @@ fn main() {
         .allowlist_var("VM_ENV_DATA_INDEX_SPECVAL")
         .allowlist_var("VM_ENV_DATA_INDEX_FLAGS")
         .allowlist_var("VM_ENV_DATA_SIZE")
+        .allowlist_function("rb_iseq_path")
 
         // From yjit.c
         .allowlist_function("rb_iseq_(get|set)_yjit_payload")
@@ -265,6 +269,8 @@ fn main() {
         .allowlist_function("rb_yjit_for_each_iseq")
         .allowlist_function("rb_yjit_obj_written")
         .allowlist_function("rb_yjit_str_simple_append")
+        .allowlist_function("rb_RSTRING_PTR")
+        .allowlist_function("rb_RSTRING_LEN")
         .allowlist_function("rb_ENCODING_GET")
         .allowlist_function("rb_yjit_exit_locations_dict")
 
@@ -282,6 +288,7 @@ fn main() {
         .allowlist_function("rb_vm_insn_addr2opcode")
         .allowlist_function("rb_iseqw_to_iseq")
         .allowlist_function("rb_iseq_each")
+        .allowlist_function("rb_iseq_method_name")
 
         // From builtin.h
         .allowlist_type("rb_builtin_function.*")

diff --git a/yjit/src/core.rs b/yjit/src/core.rs
@@ -6,6 +6,8 @@ use crate::cruby::*;
 use crate::options::*;
 use crate::stats::*;
 use crate::utils::*;
+#[cfg(feature="disasm")]
+use crate::disasm::*;
 use core::ffi::c_void;
 use std::cell::*;
 use std::hash::{Hash, Hasher};
@@ -1426,6 +1428,20 @@ fn gen_block_series_body(
         last_blockref = new_blockref;
     }
 
+    #[cfg(feature = "disasm")]
+    {
+        // If dump_iseq_disasm is active, see if this iseq's location matches the given substring.
+        // If so, we print the new blocks to the console.
+        if let Some(substr) = get_option_ref!(dump_iseq_disasm).as_ref() {
+            let iseq_location = iseq_get_location(blockid.iseq);
+            if iseq_location.contains(substr) {
+                let last_block = last_blockref.borrow();
+                println!("Compiling {} block(s) for {}, ISEQ offsets [{}, {})", batch.len(), iseq_location, blockid.idx, last_block.end_idx);
+                println!("{}", disasm_iseq_insn_range(blockid.iseq, blockid.idx, last_block.end_idx));
+            }
+        }
+    }
+
     Some(first_block)
 }
 
@@ -1956,6 +1972,17 @@ pub fn invalidate_block_version(blockref: &BlockRef) {
 
     verify_blockid(block.blockid);
 
+    #[cfg(feature = "disasm")]
+    {
+        // If dump_iseq_disasm is specified, print to console that blocks for matching ISEQ names were invalidated.
+        if let Some(substr) = get_option_ref!(dump_iseq_disasm).as_ref() {
+            let iseq_location = iseq_get_location(block.blockid.iseq);
+            if iseq_location.contains(substr) {
+                println!("Invalidating block from {}, ISEQ offsets [{}, {})", iseq_location, block.blockid.idx, block.end_idx);
+            }
+        }
+    }
+
     // Remove this block from the version array
     remove_block_version(blockref);
 

diff --git a/yjit/src/cruby_bindings.inc.rs b/yjit/src/cruby_bindings.inc.rs
@@ -246,6 +246,20 @@ pub const RUBY_ENCODING_SHIFT: ruby_encoding_consts = 22;
 pub const RUBY_ENCODING_MASK: ruby_encoding_consts = 532676608;
 pub const RUBY_ENCODING_MAXNAMELEN: ruby_encoding_consts = 42;
 pub type ruby_encoding_consts = u32;
+pub const RUBY_ENCINDEX_ASCII_8BIT: ruby_preserved_encindex = 0;
+pub const RUBY_ENCINDEX_UTF_8: ruby_preserved_encindex = 1;
+pub const RUBY_ENCINDEX_US_ASCII: ruby_preserved_encindex = 2;
+pub const RUBY_ENCINDEX_UTF_16BE: ruby_preserved_encindex = 3;
+pub const RUBY_ENCINDEX_UTF_16LE: ruby_preserved_encindex = 4;
+pub const RUBY_ENCINDEX_UTF_32BE: ruby_preserved_encindex = 5;
+pub const RUBY_ENCINDEX_UTF_32LE: ruby_preserved_encindex = 6;
+pub const RUBY_ENCINDEX_UTF_16: ruby_preserved_encindex = 7;
+pub const RUBY_ENCINDEX_UTF_32: ruby_preserved_encindex = 8;
+pub const RUBY_ENCINDEX_UTF8_MAC: ruby_preserved_encindex = 9;
+pub const RUBY_ENCINDEX_EUC_JP: ruby_preserved_encindex = 10;
+pub const RUBY_ENCINDEX_Windows_31J: ruby_preserved_encindex = 11;
+pub const RUBY_ENCINDEX_BUILTIN_MAX: ruby_preserved_encindex = 12;
+pub type ruby_preserved_encindex = u32;
 extern "C" {
     pub fn rb_obj_info_dump(obj: VALUE);
 }
@@ -649,6 +663,9 @@ pub const VM_ENV_FLAG_ESCAPED: vm_frame_env_flags = 4;
 pub const VM_ENV_FLAG_WB_REQUIRED: vm_frame_env_flags = 8;
 pub const VM_ENV_FLAG_ISOLATED: vm_frame_env_flags = 16;
 pub type vm_frame_env_flags = u32;
+extern "C" {
+    pub fn rb_iseq_path(iseq: *const rb_iseq_t) -> VALUE;
+}
 extern "C" {
     pub fn rb_vm_bh_to_procval(ec: *const rb_execution_context_t, block_handler: VALUE) -> VALUE;
 }
@@ -969,6 +986,9 @@ extern "C" {
 extern "C" {
     pub fn rb_iseqw_to_iseq(iseqw: VALUE) -> *const rb_iseq_t;
 }
+extern "C" {
+    pub fn rb_iseq_method_name(iseq: *const rb_iseq_t) -> VALUE;
+}
 extern "C" {
     pub fn rb_vm_barrier();
 }
@@ -1020,6 +1040,12 @@ extern "C" {
 extern "C" {
     pub fn rb_iseq_opcode_at_pc(iseq: *const rb_iseq_t, pc: *const VALUE) -> ::std::os::raw::c_int;
 }
+extern "C" {
+    pub fn rb_RSTRING_LEN(str_: VALUE) -> ::std::os::raw::c_ulong;
+}
+extern "C" {
+    pub fn rb_RSTRING_PTR(str_: VALUE) -> *mut ::std::os::raw::c_char;
+}
 pub type rb_seq_param_keyword_struct = rb_iseq_constant_body__bindgen_ty_1_rb_iseq_param_keyword;
 extern "C" {
     pub fn rb_leaf_invokebuiltin_iseq_p(iseq: *const rb_iseq_t) -> bool;

diff --git a/yjit/src/disasm.rs b/yjit/src/disasm.rs
@@ -26,15 +26,17 @@ pub extern "C" fn rb_yjit_disasm_iseq(_ec: EcPtr, _ruby_self: VALUE, iseqw: VALU
         // Get the iseq pointer from the wrapper
         let iseq = unsafe { rb_iseqw_to_iseq(iseqw) };
 
-        let out_string = disasm_iseq(iseq);
+        // This will truncate disassembly of methods with 10k+ bytecodes.
+        // That's a good thing - this prints to console.
+        let out_string = disasm_iseq_insn_range(iseq, 0, 9999);
 
         return rust_str_to_ruby(&out_string);
     }
 }
 
 #[cfg(feature = "disasm")]
-fn disasm_iseq(iseq: IseqPtr) -> String {
-    let mut out = String::from("");
+pub fn disasm_iseq_insn_range(iseq: IseqPtr, start_idx: u32, end_idx: u32) -> String {
+        let mut out = String::from("");
 
     // Get a list of block versions generated for this iseq
     let mut block_list = get_iseq_block_list(iseq);
@@ -84,47 +86,49 @@ fn disasm_iseq(iseq: IseqPtr) -> String {
     for block_idx in 0..block_list.len() {
         let block = block_list[block_idx].borrow();
         let blockid = block.get_blockid();
-        let end_idx = block.get_end_idx();
-        let start_addr = block.get_start_addr().unwrap().raw_ptr();
-        let end_addr = block.get_end_addr().unwrap().raw_ptr();
-        let code_size = block.code_size();
-
-        // Write some info about the current block
-        let block_ident = format!(
-            "BLOCK {}/{}, ISEQ RANGE [{},{}), {} bytes ",
-            block_idx + 1,
-            block_list.len(),
-            blockid.idx,
-            end_idx,
-            code_size
-        );
-        out.push_str(&format!("== {:=<60}\n", block_ident));
-
-        // Disassemble the instructions
-        let code_slice = unsafe { std::slice::from_raw_parts(start_addr, code_size) };
-        let insns = cs.disasm_all(code_slice, start_addr as u64).unwrap();
-
-        // For each instruction in this block
-        for insn in insns.as_ref() {
-            // Comments for this block
-            if let Some(comment_list) = global_cb.comments_at(insn.address() as usize) {
-                for comment in comment_list {
-                    out.push_str(&format!("  \x1b[1m# {}\x1b[0m\n", comment));
+        if blockid.idx >= start_idx && blockid.idx < end_idx {
+            let end_idx = block.get_end_idx();
+            let start_addr = block.get_start_addr().unwrap().raw_ptr();
+            let end_addr = block.get_end_addr().unwrap().raw_ptr();
+            let code_size = block.code_size();
+
+            // Write some info about the current block
+            let block_ident = format!(
+                "BLOCK {}/{}, ISEQ RANGE [{},{}), {} bytes ",
+                block_idx + 1,
+                block_list.len(),
+                blockid.idx,
+                end_idx,
+                code_size
+            );
+            out.push_str(&format!("== {:=<60}\n", block_ident));
+
+            // Disassemble the instructions
+            let code_slice = unsafe { std::slice::from_raw_parts(start_addr, code_size) };
+            let insns = cs.disasm_all(code_slice, start_addr as u64).unwrap();
+
+            // For each instruction in this block
+            for insn in insns.as_ref() {
+                // Comments for this block
+                if let Some(comment_list) = global_cb.comments_at(insn.address() as usize) {
+                    for comment in comment_list {
+                        out.push_str(&format!("  \x1b[1m# {}\x1b[0m\n", comment));
+                    }
                 }
+                out.push_str(&format!("  {}\n", insn));
             }
-            out.push_str(&format!("  {}\n", insn));
-        }
 
-        // If this is not the last block
-        if block_idx < block_list.len() - 1 {
-            // Compute the size of the gap between this block and the next
-            let next_block = block_list[block_idx + 1].borrow();
-            let next_start_addr = next_block.get_start_addr().unwrap().raw_ptr();
-            let gap_size = (next_start_addr as usize) - (end_addr as usize);
+            // If this is not the last block
+            if block_idx < block_list.len() - 1 {
+                // Compute the size of the gap between this block and the next
+                let next_block = block_list[block_idx + 1].borrow();
+                let next_start_addr = next_block.get_start_addr().unwrap().raw_ptr();
+                let gap_size = (next_start_addr as usize) - (end_addr as usize);
 
-            // Log the size of the gap between the blocks if nonzero
-            if gap_size > 0 {
-                out.push_str(&format!("... {} byte gap ...\n", gap_size));
+                // Log the size of the gap between the blocks if nonzero
+                if gap_size > 0 {
+                    out.push_str(&format!("... {} byte gap ...\n", gap_size));
+                }
             }
         }
     }

diff --git a/yjit/src/options.rs b/yjit/src/options.rs
@@ -1,7 +1,7 @@
 use std::ffi::CStr;
 
 // Command-line options
-#[derive(Copy, Clone, PartialEq, Eq, Debug)]
+#[derive(Clone, PartialEq, Eq, Debug)]
 #[repr(C)]
 pub struct Options {
     // Size of the executable memory block to allocate in MiB
@@ -30,6 +30,9 @@ pub struct Options {
     /// Dump compiled and executed instructions for debugging
     pub dump_insns: bool,
 
+    /// Print when specific ISEQ items are compiled or invalidated
+    pub dump_iseq_disasm: Option<String>,
+
     /// Verify context objects (debug mode only)
     pub verify_ctx: bool,
 
@@ -52,6 +55,7 @@ pub static mut OPTIONS: Options = Options {
     dump_insns: false,
     verify_ctx: false,
     global_constant_state: false,
+    dump_iseq_disasm: None,
 };
 
 /// Macro to get an option value by name
@@ -64,6 +68,16 @@ macro_rules! get_option {
 }
 pub(crate) use get_option;
 
+/// Macro to borrow an option value by name; yields `&OPTIONS.<name>`. Use it for options that are not `Copy`, such as `Option<String>`.
+macro_rules! get_option_ref {
+    // Unsafe is ok here because options are initialized
+    // once before any Ruby code executes
+    ($option_name:ident) => {
+        unsafe { &(OPTIONS.$option_name) }
+    };
+}
+pub(crate) use get_option_ref;
+
 /// Expected to receive what comes after the third dash in "--yjit-*".
 /// Empty string means user passed only "--yjit". C code rejects when
 /// they pass exact "--yjit-".
@@ -105,6 +119,10 @@ pub fn parse_option(str_ptr: *const std::os::raw::c_char) -> Option<()> {
             }
         },
 
+        ("dump-iseq-disasm", _) => unsafe {
+            OPTIONS.dump_iseq_disasm = Some(opt_val.to_string());
+        },
+
         ("greedy-versioning", "") => unsafe { OPTIONS.greedy_versioning = true },
         ("no-type-prop", "") => unsafe { OPTIONS.no_type_prop = true },
         ("stats", "") => unsafe { OPTIONS.gen_stats = true },

diff --git a/yjit/src/utils.rs b/yjit/src/utils.rs
@@ -71,6 +71,41 @@ macro_rules! offset_of {
 #[allow(unused)]
 pub(crate) use offset_of;
 
+// Convert a CRuby string (UTF-8, US-ASCII or ASCII-8BIT) into a Rust String.
+// Embedded nulls are preserved. ASCII-8BIT can hold arbitrary bytes, so any
+// sequence that is not legal UTF-8 becomes U+FFFD instead of causing a panic.
+fn ruby_str_to_rust(v: VALUE) -> String {
+    // Make sure the CRuby encoding is UTF-8 compatible
+    let encoding = unsafe { rb_ENCODING_GET(v) } as u32;
+    assert!(encoding == RUBY_ENCINDEX_ASCII_8BIT || encoding == RUBY_ENCINDEX_UTF_8 || encoding == RUBY_ENCINDEX_US_ASCII);
+
+    let str_ptr = unsafe { rb_RSTRING_PTR(v) } as *mut u8;
+    let str_len: usize = unsafe { rb_RSTRING_LEN(v) }.try_into().unwrap();
+    let str_slice: &[u8] = unsafe { slice::from_raw_parts(str_ptr, str_len) };
+    String::from_utf8_lossy(str_slice).into_owned() // never panics; bad bytes become U+FFFD
+}
+
+// Describe an ISEQ as "<path>:<method name>", where <path> is the file defining
+// the method. Either half is printed as "None" when CRuby reports nil. Paths are
+// sometimes internal strings supplied to eval, so be careful with them.
+pub fn iseq_get_location(iseq: IseqPtr) -> String {
+    let path_val = unsafe { rb_iseq_path(iseq) };
+    let method_val = unsafe { rb_iseq_method_name(iseq) };
+
+    // nil means there is no value for that half; print a placeholder instead
+    let describe = |val: VALUE| -> String {
+        if val == Qnil {
+            "None".to_string()
+        } else {
+            ruby_str_to_rust(val)
+        }
+    };
+
+    let path = describe(path_val);
+    let method = describe(method_val);
+    format!("{}:{}", path, method)
+}
+
 #[cfg(test)]
 mod tests {
     #[test]