diff --git a/.gitignore b/.gitignore index 212a5324..bf2724af 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,4 @@ build/path/ tests/**/*.elf tests/arch-test-target/config.ini __pycache__/ +src/rv32_jit_template.c diff --git a/Makefile b/Makefile index dc0ea912..ee473682 100644 --- a/Makefile +++ b/Makefile @@ -118,6 +118,21 @@ gdbstub-test: $(BIN) $(Q).ci/gdbstub-test.sh && $(call notice, [OK]) endif +ENABLE_JIT ?= 0 +$(call set-feature, JIT) +ifeq ($(call has, JIT), 1) +OBJS_EXT += jit_x64.o +ifneq ($(processor), x86_64) +$(error JIT mode currently supports only the x64 target.) +endif + +src/rv32_jit_template.c: + $(Q)tools/gen-jit-template.py $(CFLAGS) > $@ + +$(OUT)/jit_x64.o: src/jit_x64.c src/rv32_jit_template.c + $(VECHO) " CC\t$@\n" + $(Q)$(CC) -o $@ $(CFLAGS) -c -MMD -MF $@.d $< +endif # For tail-call elimination, we need a specific set of build flags applied. # FIXME: On macOS + Apple Silicon, -fno-stack-protector might have a negative impact. $(OUT)/emulate.o: CFLAGS += -foptimize-sibling-calls -fomit-frame-pointer -fno-stack-check -fno-stack-protector @@ -214,7 +229,7 @@ endif endif clean: - $(RM) $(BIN) $(OBJS) $(HIST_BIN) $(HIST_OBJS) $(deps) $(CACHE_OUT) + $(RM) $(BIN) $(OBJS) $(HIST_BIN) $(HIST_OBJS) $(deps) $(CACHE_OUT) src/rv32_jit_template.c distclean: clean -$(RM) $(DOOM_DATA) $(QUAKE_DATA) $(RM) -r $(OUT)/id1 diff --git a/mk/tools.mk b/mk/tools.mk index e066286f..65577483 100644 --- a/mk/tools.mk +++ b/mk/tools.mk @@ -3,6 +3,7 @@ HIST_BIN := $(OUT)/rv_histogram # FIXME: riscv.o and map.o are dependencies of 'elf.o', not 'rv_histogram'. HIST_OBJS := \ riscv.o \ + utils.o \ map.o \ elf.o \ decode.o \ diff --git a/src/cache.c b/src/cache.c index df4e5cc2..26dad3b4 100644 --- a/src/cache.c +++ b/src/cache.c @@ -13,10 +13,12 @@ #include "mpool.h" #include "utils.h" -/* THRESHOLD is set to identify hot spots. Once the frequency of use for a block - * exceeds the THRESHOLD, the JIT compiler flow is triggered. +/* Currently, THRESHOLD is set to identify hot spots. Once the usage frequency + * of a block exceeds THRESHOLD, the JIT compilation process is triggered. + * FIXME: Implement an effective profiler to detect hot spots, instead of + * simply relying on THRESHOLD.
*/ -#define THRESHOLD 1000 +#define THRESHOLD 4096 static uint32_t cache_size, cache_size_bits; static struct mpool *cache_mp; @@ -545,3 +547,62 @@ void cache_free(cache_t *cache, void (*callback)(void *)) free(cache->map); free(cache); } + +#if !RV32_HAS(ARC) +uint32_t cache_freq(struct cache *cache, uint32_t key) +{ + if (!cache->capacity || + hlist_empty(&cache->map->ht_list_head[cache_hash(key)])) + return 0; + lfu_entry_t *entry = NULL; +#ifdef __HAVE_TYPEOF + hlist_for_each_entry (entry, &cache->map->ht_list_head[cache_hash(key)], + ht_list) +#else + hlist_for_each_entry (entry, &cache->map->ht_list_head[cache_hash(key)], + ht_list, lfu_entry_t) +#endif + { + if (entry->key == key) + return entry->frequency; + } + return 0; +} +#endif + +#if RV32_HAS(JIT) +bool cache_hot(struct cache *cache, uint32_t key) +{ + if (!cache->capacity || + hlist_empty(&cache->map->ht_list_head[cache_hash(key)])) + return false; +#if RV32_HAS(ARC) + arc_entry_t *entry = NULL; +#ifdef __HAVE_TYPEOF + hlist_for_each_entry (entry, &cache->map->ht_list_head[cache_hash(key)], + ht_list) +#else + hlist_for_each_entry (entry, &cache->map->ht_list_head[cache_hash(key)], + ht_list, arc_entry_t) +#endif + { + if (entry->key == key && entry->frequency == THRESHOLD) + return true; + } +#else + lfu_entry_t *entry = NULL; +#ifdef __HAVE_TYPEOF + hlist_for_each_entry (entry, &cache->map->ht_list_head[cache_hash(key)], + ht_list) +#else + hlist_for_each_entry (entry, &cache->map->ht_list_head[cache_hash(key)], + ht_list, lfu_entry_t) +#endif + { + if (entry->key == key && entry->frequency == THRESHOLD) + return true; + } +#endif + return false; +} +#endif diff --git a/src/cache.h b/src/cache.h index 6460cd28..cdbc43d7 100644 --- a/src/cache.h +++ b/src/cache.h @@ -5,6 +5,7 @@ #pragma once +#include #include struct cache; @@ -38,3 +39,17 @@ void *cache_put(struct cache *cache, uint32_t key, void *value); * @callback: a function for freeing cache entry completely */ void cache_free(struct cache *cache, void (*callback)(void *)); + +#if RV32_HAS(JIT) +/** + * cache_hot - check whether the frequency of the cache entry exceeds the + * threshold or not + * @cache: a pointer points to target cache + * @key: the key of the specified entry + */ +bool cache_hot(struct cache *cache, uint32_t key); +#endif + +#if !RV32_HAS(ARC) +uint32_t cache_freq(struct cache *cache, uint32_t key); +#endif \ No newline at end of file diff --git a/src/decode.h b/src/decode.h index 2d552730..9861863c 100644 --- a/src/decode.h +++ b/src/decode.h @@ -179,13 +179,31 @@ enum op_field { ) /* clang-format on */ +/* Macro operation fusion */ + +/* macro operation fusion: convert specific RISC-V instruction patterns + * into faster and equivalent code + */ +#define FUSE_INSN_LIST \ + _(fuse1) \ + _(fuse2) \ + _(fuse3) \ + _(fuse4) \ + _(fuse5) \ + _(fuse6) \ + _(fuse7) + /* clang-format off */ /* IR list */ enum { #define _(inst, can_branch, insn_len, reg_mask) rv_insn_##inst, RV_INSN_LIST #undef _ - N_RV_INSNS + N_RV_INSNS, +#define _(inst) rv_insn_##inst, + FUSE_INSN_LIST +#undef _ + N_TOTAL_INSNS, }; /* clang-format on */ diff --git a/src/emulate.c b/src/emulate.c index c8d790c5..88759917 100644 --- a/src/emulate.c +++ b/src/emulate.c @@ -25,6 +25,10 @@ extern struct target_ops gdbstub_ops; #include "riscv_private.h" #include "state.h" #include "utils.h" +#if RV32_HAS(JIT) +#include "cache.h" +#include "jit_x64.h" +#endif /* Shortcuts for comparing each field of specified RISC-V instruction */ #define IF_insn(i, o) (i->opcode == rv_insn_##o) @@ -282,8 
+286,10 @@ void rv_debug(riscv_t *rv) } #endif /* RV32_HAS(GDBSTUB) */ +#if !RV32_HAS(JIT) /* hash function for the block map */ HASH_FUNC_IMPL(map_hash, BLOCK_MAP_CAPACITY_BITS, 1 << BLOCK_MAP_CAPACITY_BITS) +#endif /* allocate a basic block */ static block_t *block_alloc(riscv_t *rv) @@ -292,9 +298,14 @@ static block_t *block_alloc(riscv_t *rv) assert(block); block->n_insn = 0; block->predict = NULL; +#if RV32_HAS(JIT) + block->hot = false; + block->backward = false; +#endif return block; } +#if !RV32_HAS(JIT) /* insert a block into block map */ static void block_insert(block_map_t *map, const block_t *block) { @@ -330,6 +341,7 @@ static block_t *block_find(const block_map_t *map, const uint32_t addr) } return NULL; } +#endif FORCE_INLINE bool insn_is_misaligned(uint32_t pc) { @@ -370,8 +382,14 @@ static bool is_branch_taken = false; /* record the program counter of the previous block */ static uint32_t last_pc = 0; +#if RV32_HAS(JIT) +/* record whether the block is replaced by cache. If so, clear the EBB + * information */ +static bool clear_flag = false; +#endif + /* Interpreter-based execution path */ -#define RVOP(inst, code) \ +#define RVOP(inst, code, asm) \ static bool do_##inst(riscv_t *rv, rv_insn_t *ir, uint64_t cycle, \ uint32_t PC) \ { \ @@ -393,27 +411,6 @@ static uint32_t last_pc = 0; /* FIXME: Add JIT-based execution path */ -/* Macro operation fusion */ - -/* macro operation fusion: convert specific RISC-V instruction patterns - * into faster and equivalent code - */ -#define FUSE_INSN_LIST \ - _(fuse1) \ - _(fuse2) \ - _(fuse3) \ - _(fuse4) \ - _(fuse5) \ - _(fuse6) \ - _(fuse7) - -enum { - rv_insn_fuse0 = N_RV_INSNS, -#define _(inst) rv_insn_##inst, - FUSE_INSN_LIST -#undef _ -}; - /* multiple lui */ static bool do_fuse1(riscv_t *rv, rv_insn_t *ir, uint64_t cycle, uint32_t PC) { @@ -497,44 +494,26 @@ static bool do_fuse4(riscv_t *rv, rv_insn_t *ir, uint64_t cycle, uint32_t PC) /* memset */ static bool do_fuse5(riscv_t *rv, - const rv_insn_t *ir, + const rv_insn_t *ir UNUSED, uint64_t cycle, - uint32_t PC) + uint32_t PC UNUSED) { /* FIXME: specify the correct cycle count for memset routine */ cycle += 2; - memory_t *m = ((state_t *) rv->userdata)->mem; - memset((char *) m->mem_base + rv->X[rv_reg_a0], rv->X[rv_reg_a1], - rv->X[rv_reg_a2]); - PC = rv->X[rv_reg_ra] & ~1U; - if (unlikely(RVOP_NO_NEXT(ir))) { - rv->csr_cycle = cycle; - rv->PC = PC; - return true; - } - const rv_insn_t *next = ir->next; - MUST_TAIL return next->impl(rv, next, cycle, PC); + rv->io.on_memset(rv); + return true; } /* memcpy */ static bool do_fuse6(riscv_t *rv, - const rv_insn_t *ir, + const rv_insn_t *ir UNUSED, uint64_t cycle, - uint32_t PC) + uint32_t PC UNUSED) { /* FIXME: specify the correct cycle count for memcpy routine */ cycle += 2; - memory_t *m = ((state_t *) rv->userdata)->mem; - memcpy((char *) m->mem_base + rv->X[rv_reg_a0], - (char *) m->mem_base + rv->X[rv_reg_a1], rv->X[rv_reg_a2]); - PC = rv->X[rv_reg_ra] & ~1U; - if (unlikely(RVOP_NO_NEXT(ir))) { - rv->csr_cycle = cycle; - rv->PC = PC; - return true; - } - const rv_insn_t *next = ir->next; - MUST_TAIL return next->impl(rv, next, cycle, PC); + rv->io.on_memcpy(rv); + return true; } /* multiple shift immediate */ @@ -634,6 +613,8 @@ static void block_translate(riscv_t *rv, block_t *block) prev_ir = ir; /* stop on branch */ if (insn_is_branch(ir->opcode)) { + if (ir->imm < 0) + block->backward = true; if (ir->opcode == rv_insn_jalr #if RV32_HAS(EXT_C) || ir->opcode == rv_insn_cjalr || ir->opcode == rv_insn_cjr @@ -878,6 +859,8 @@ 
static void match_pattern(riscv_t *rv, block_t *block) /* TODO: mixture of SW and LW */ /* TODO: reorder insturction to match pattern */ case rv_insn_slli: + case rv_insn_srli: + case rv_insn_srai: count = 1; next_ir = ir->next; while (1) { @@ -905,6 +888,11 @@ static void match_pattern(riscv_t *rv, block_t *block) } } +/* The IR with the same PC in different basic blocks is considered the same IR + * in the JIT gencode. However, the IR in basic block 1 could be optimized by + * constant optimization, while the IR in basic block 2 might not. Given this + * situation, we disable constant optimization in JIT mode. + */ typedef struct { bool is_constant[N_RV_REGS]; uint32_t const_val[N_RV_REGS]; @@ -941,16 +929,22 @@ static void optimize_constant(riscv_t *rv UNUSED, block_t *block) static block_t *prev = NULL; static block_t *block_find_or_translate(riscv_t *rv) { +#if !RV32_HAS(JIT) block_map_t *map = &rv->block_map; /* lookup the next block in the block map */ block_t *next = block_find(map, rv->PC); +#else + /* lookup the next block in the block cache */ + block_t *next = (block_t *) cache_get(rv->block_cache, rv->PC); +#endif if (!next) { +#if !RV32_HAS(JIT) if (map->size * 1.25 > map->block_capacity) { block_map_clear(rv); prev = NULL; } - +#endif /* allocate a new block */ next = block_alloc(rv); block_translate(rv, next); @@ -963,9 +957,24 @@ static block_t *block_find_or_translate(riscv_t *rv) /* macro operation fusion */ match_pattern(rv, next); } +#if !RV32_HAS(JIT) /* insert the block into block map */ block_insert(&rv->block_map, next); - +#else + /* insert the block into block cache */ + block_t *delete_target = cache_put(rv->block_cache, rv->PC, &(*next)); + if (delete_target) { + uint32_t idx; + rv_insn_t *ir, *next; + for (idx = 0, ir = delete_target->ir_head; + idx < delete_target->n_insn; idx++, ir = next) { + free(ir->fuse); + next = ir->next; + mpool_free(rv->block_ir_mp, ir); + } + mpool_free(rv->block_mp, delete_target); + } +#endif /* update the block prediction. 
* When translating a new block, the block predictor may benefit, * but updating it after finding a particular block may penalize @@ -978,6 +987,10 @@ static block_t *block_find_or_translate(riscv_t *rv) return next; } +#if RV32_HAS(JIT) +typedef void (*exec_tired1_block_func_t)(riscv_t *rv, uintptr_t); +#endif + void rv_step(riscv_t *rv, int32_t cycles) { assert(rv); @@ -1009,15 +1022,28 @@ void rv_step(riscv_t *rv, int32_t cycles) if (prev) { /* update previous block */ if (prev->pc_start != last_pc) +#if !RV32_HAS(JIT) prev = block_find(&rv->block_map, last_pc); - +#else + prev = cache_get(rv->block_cache, last_pc); +#endif if (prev) { rv_insn_t *last_ir = prev->ir_tail; +#if RV32_HAS(JIT) + if (clear_flag) { + if (is_branch_taken) + last_ir->branch_taken = NULL; + else + last_ir->branch_untaken = NULL; + + clear_flag = false; + } +#endif /* chain block */ if (!insn_is_unconditional_branch(last_ir->opcode)) { if (is_branch_taken && !last_ir->branch_taken) last_ir->branch_taken = block->ir_head; - else if (!last_ir->branch_untaken) + else if (!is_branch_taken && !last_ir->branch_untaken) last_ir->branch_untaken = block->ir_head; } else if (IF_insn(last_ir, jal) #if RV32_HAS(EXT_C) @@ -1030,11 +1056,33 @@ void rv_step(riscv_t *rv, int32_t cycles) } } last_pc = rv->PC; - - /* execute the block */ +#if RV32_HAS(JIT) + /* execute by tier 1 JIT compiler */ + struct jit_state *state = rv->jit_state; + if (block->hot) { + ((exec_tired1_block_func_t) state->buf)( + rv, (uintptr_t) (state->buf + block->offset)); + prev = NULL; + continue; + } /* check whether the block's usage frequency exceeds the threshold */ + else if ((block->backward && + cache_freq(rv->block_cache, block->pc_start) >= 1024) || + (cache_hot(rv->block_cache, block->pc_start))) { + block->hot = true; + block->offset = translate_x64(rv, block); + ((exec_tired1_block_func_t) state->buf)( + rv, (uintptr_t) (state->buf + block->offset)); + prev = NULL; + continue; + } +#endif + /* execute the block by interpreter */ const rv_insn_t *ir = block->ir_head; - if (unlikely(!ir->impl(rv, ir, rv->csr_cycle, rv->PC))) + if (unlikely(!ir->impl(rv, ir, rv->csr_cycle, rv->PC))) { + /* the block should not be extended if the exception handler is invoked */ + prev = NULL; break; + } prev = block; } } @@ -1052,6 +1100,22 @@ void ecall_handler(riscv_t *rv) syscall_handler(rv); } +void memset_handler(riscv_t *rv) +{ + memory_t *m = ((state_t *) rv->userdata)->mem; + memset((char *) m->mem_base + rv->X[rv_reg_a0], rv->X[rv_reg_a1], + rv->X[rv_reg_a2]); + rv->PC = rv->X[rv_reg_ra] & ~1U; +} + +void memcpy_handler(riscv_t *rv) +{ + memory_t *m = ((state_t *) rv->userdata)->mem; + memcpy((char *) m->mem_base + rv->X[rv_reg_a0], + (char *) m->mem_base + rv->X[rv_reg_a1], rv->X[rv_reg_a2]); + rv->PC = rv->X[rv_reg_ra] & ~1U; +} + void dump_registers(riscv_t *rv, char *out_file_path) { FILE *f = out_file_path[0] == '-' ? stdout : fopen(out_file_path, "w"); diff --git a/src/feature.h b/src/feature.h index 36fb4ac4..7125d1ae 100644 --- a/src/feature.h +++ b/src/feature.h @@ -52,5 +52,10 @@ #define RV32_FEATURE_ARC 0 #endif +/* Experimental just-in-time compiler */ +#ifndef RV32_FEATURE_JIT +#define RV32_FEATURE_JIT 0 +#endif + /* Feature test macro */ #define RV32_HAS(x) RV32_FEATURE_##x diff --git a/src/jit_x64.c b/src/jit_x64.c new file mode 100644 index 00000000..a45bd65a --- /dev/null +++ b/src/jit_x64.c @@ -0,0 +1,593 @@ +/* + * rv32emu is freely redistributable under the MIT License. See the file + * "LICENSE" for information on usage and redistribution of this file.
+ */ + + +/* This JIT implementation has undergone extensive modifications, heavily + * relying on the ubpf_jit_x86_64.[ch] from ubpf. The original + * ubpf_jit_x86_64.[ch] file served as the foundation and source of inspiration + * for adapting and tailoring it specifically for this JIT implementation. + * Therefore, credit and sincere thanks are extended to ubpf for their + * invaluable work. + * Reference: + * https://github.com/iovisor/ubpf/blob/main/vm/ubpf_jit_x86_64.c + */ + +#if !defined(__x86_64__) +#error "This implementation is dedicated to x86-64." +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cache.h" +#include "decode.h" +#include "io.h" +#include "jit_x64.h" +#include "state.h" +#include "utils.h" + +enum VM_REG { + VM_REG_0 = 0, + VM_REG_1, + VM_REG_2, + VM_REG_3, + VM_REG_4, + VM_REG_5, + VM_REG_6, + VM_REG_7, + VM_REG_8, + VM_REG_9, + VM_REG_10, + _VM_REG_MAX, +}; + +#define X64_CLS_MASK 0x07 +#define X64_ALU_OP_MASK 0xf0 +#define X64_CLS_ALU 0x04 +#define X64_CLS_ALU64 0x07 +#define X64_SRC_IMM 0x00 +#define X64_SRC_REG 0x08 +#define X64_OP_MUL_IMM (X64_CLS_ALU | X64_SRC_IMM | 0x20) +#define X64_OP_MUL_REG (X64_CLS_ALU | X64_SRC_REG | 0x20) +#define X64_OP_DIV_IMM (X64_CLS_ALU | X64_SRC_IMM | 0x30) +#define X64_OP_DIV_REG (X64_CLS_ALU | X64_SRC_REG | 0x30) +#define X64_OP_MOD_IMM (X64_CLS_ALU | X64_SRC_IMM | 0x90) +#define X64_OP_MOD_REG (X64_CLS_ALU | X64_SRC_REG | 0x90) + +#define STACK_SIZE 512 +#define MAX_INSTS 1024 + +#if RV32_HAS(EXT_M) +static void muldivmod(struct jit_state *state, + uint8_t opcode, + int src, + int dst, + int32_t imm) +{ + bool mul = (opcode & X64_ALU_OP_MASK) == (X64_OP_MUL_IMM & X64_ALU_OP_MASK); + bool div = (opcode & X64_ALU_OP_MASK) == (X64_OP_DIV_IMM & X64_ALU_OP_MASK); + bool mod = (opcode & X64_ALU_OP_MASK) == (X64_OP_MOD_IMM & X64_ALU_OP_MASK); + bool is64 = (opcode & X64_CLS_MASK) == X64_CLS_ALU64; + bool reg = (opcode & X64_SRC_REG) == X64_SRC_REG; + + /* Short circuit for imm == 0. */ + if (!reg && imm == 0) { + assert(NULL); + if (div || mul) { + /* For division and multiplication, set result to zero. */ + emit_alu32(state, 0x31, dst, dst); + } else { + /* For modulo, set result to dividend. */ + emit_mov(state, dst, dst); + } + return; + } + + if (dst != RAX) { + emit_push(state, RAX); + } + + if (dst != RDX) { + emit_push(state, RDX); + } + + /* Load the divisor into RCX. */ + if (imm) { + emit_load_imm(state, RCX, imm); + } else { + emit_mov(state, src, RCX); + } + + /* Load the dividend into RAX. */ + emit_mov(state, dst, RAX); + + /* JIT has two different semantics for division and modulus. For division + * if the divisor is zero, the result is zero. For modulus, if the divisor + * is zero, the result is the dividend. To handle this we set the divisor + * to 1 if it is zero and then set the result to zero if the divisor was + * zero (for division) or set the result to the dividend if the divisor was + * zero (for modulo). + */ + + if (div || mod) { + /* Check if divisor is zero. */ + if (is64) { + emit_alu64(state, 0x85, RCX, RCX); + } else { + emit_alu32(state, 0x85, RCX, RCX); + } + + /* Save the dividend for the modulo case. */ + if (mod) { + emit_push(state, RAX); /* Save dividend. */ + } + + /* Save the result of the test. */ + emit1(state, 0x9c); /* pushfq */ + + /* Set the divisor to 1 if it is zero. 
*/ + emit_load_imm(state, RDX, 1); + emit1(state, 0x48); + emit1(state, 0x0f); + emit1(state, 0x44); + emit1(state, 0xca); /* cmove rcx,rdx */ + + /* xor %edx,%edx */ + emit_alu32(state, 0x31, RDX, RDX); + } + + if (is64) { + emit_rex(state, 1, 0, 0, 0); + } + + /* Multiply or divide. */ + emit_alu32(state, 0xf7, mul ? 4 : 6, RCX); + + /* Division operation stores the remainder in RDX and the quotient in + * RAX. + */ + if (div || mod) { + /* Restore the result of the test. */ + emit1(state, 0x9d); /* popfq */ + + /* If zero flag is set, then the divisor was zero. */ + + if (div) { + /* Set the dividend to zero if the divisor was zero. */ + emit_load_imm(state, RCX, 0); + + /* Store 0 in RAX if the divisor was zero. */ + /* Use conditional move to avoid a branch. */ + emit1(state, 0x48); + emit1(state, 0x0f); + emit1(state, 0x44); + emit1(state, 0xc1); /* cmove rax,rcx */ + } else { + /* Restore dividend to RCX. */ + emit_pop(state, RCX); + + /* Store the dividend in RAX if the divisor was zero. */ + /* Use conditional move to avoid a branch. */ + emit1(state, 0x48); + emit1(state, 0x0f); + emit1(state, 0x44); + emit1(state, 0xd1); /* cmove rdx,rcx */ + } + } + + if (dst != RDX) { + if (mod) { + emit_mov(state, RDX, dst); + } + emit_pop(state, RDX); + } + if (dst != RAX) { + if (div || mul) { + emit_mov(state, RAX, dst); + } + emit_pop(state, RAX); + } +} +#endif + +#define REGISTER_MAP_SIZE 11 + +/* + * There are two common x86-64 calling conventions, as discussed at + * https://en.wikipedia.org/wiki/X64_calling_conventions#x86-64_calling_conventions + * + * Please Note: R12 is special and we are *not* using it. As a result, it is + * omitted from the list of non-volatile registers for both platforms (even + * though it is, in fact, non-volatile). + */ + +#if defined(_WIN32) +static int nonvolatile_reg[] = {RBP, RBX, RDI, RSI, R13, R14, R15}; +static int parameter_reg[] = {RCX, RDX, R8, R9}; +#define RCX_ALT R10 +static int register_map[REGISTER_MAP_SIZE] = { + RAX, R10, RDX, R8, R9, R14, R15, RDI, RSI, RBX, RBP, +}; +#else +#define RCX_ALT R9 +static const int nonvolatile_reg[] = {RBP, RBX, R13, R14, R15}; +static const int parameter_reg[] = {RDI, RSI, RDX, RCX, R8, R9}; +static const int register_map[REGISTER_MAP_SIZE] = { + RAX, RDI, RSI, RDX, R9, R8, RBX, R13, R14, R15, RBP, +}; +#endif + +/* Return the x86 register for the given JIT register */ +static int map_register(int r) +{ + assert(r < _VM_REG_MAX); + return register_map[r % _VM_REG_MAX]; +} + +#define SET_SIZE_BITS 10 +#define SET_SIZE 1 << SET_SIZE_BITS +#define SET_SLOTS_SIZE 32 +HASH_FUNC_IMPL(set_hash, SET_SIZE_BITS, 1 << SET_SIZE_BITS); + +/* + * The set consists of SET_SIZE buckets, with each bucket containing + * SET_SLOTS_SIZE slots. 
+ */ +typedef struct { + uint32_t table[SET_SIZE][SET_SLOTS_SIZE]; +} set_t; + +/** + * set_reset - clear a set + * @set: a pointer points to target set + */ +static inline void set_reset(set_t *set) +{ + memset(set, 0, sizeof(set_t)); +} + +/** + * set_add - insert a new element into the set + * @set: a pointer points to target set + * @key: the key of the inserted entry + */ +static bool set_add(set_t *set, uint32_t key) +{ + const uint32_t index = set_hash(key); + uint8_t count = 0; + while (set->table[index][count]) { + if (set->table[index][count++] == key) + return false; + } + + set->table[index][count] = key; + return true; +} + +/** + * set_has - check whether the element exist in the set or not + * @set: a pointer points to target set + * @key: the key of the inserted entry + */ +static bool set_has(set_t *set, uint32_t key) +{ + const uint32_t index = set_hash(key); + for (uint8_t count = 0; set->table[index][count]; count++) { + if (set->table[index][count] == key) + return true; + } + return false; +} + +#define UPDATE_PC(pc) \ + emit_load_imm(state, RAX, (pc)); \ + emit_store(state, S32, RAX, parameter_reg[0], \ + offsetof(struct riscv_internal, PC)); + +static void prepare_translate(struct jit_state *state) +{ + /* Save platform non-volatile registers */ + for (uint32_t i = 0; i < ARRAYS_SIZE(nonvolatile_reg); i++) + emit_push(state, nonvolatile_reg[i]); + /* + * Assuming that the stack is 16-byte aligned right before + * the call insn that brought us to this code, when + * we start executing the jit'd code, we need to regain a 16-byte + * alignment. The STACK_SIZE is guaranteed to be + * divisible by 16. However, if we pushed an even number of + * registers on the stack when we are saving state (see above), + * then we have to add an additional 8 bytes to get back + * to a 16-byte alignment. + */ + if (!(ARRAYS_SIZE(nonvolatile_reg) % 2)) + emit_alu64_imm32(state, 0x81, 5, RSP, 0x8); + + /* Set JIT R10 (the way to access the frame in JIT) to match RSP. */ + + emit_mov(state, RSP, map_register(VM_REG_10)); + + /* Allocate stack space */ + emit_alu64_imm32(state, 0x81, 5, RSP, STACK_SIZE); + +#if defined(_WIN32) + /* Windows x64 ABI requires home register space */ + /* Allocate home register space - 4 registers */ + emit_alu64_imm32(state, 0x81, 5, RSP, 4 * sizeof(uint64_t)); +#endif + + /* Jump to the entry point, the entry point is stored in the second + * parameter. */ + emit1(state, 0xff); + emit1(state, 0xe6); + + /* Epilogue */ + state->exit_loc = state->offset; + /* Move register 0 into rax */ + if (map_register(VM_REG_0) != RAX) + emit_mov(state, map_register(VM_REG_0), RAX); + + /* Deallocate stack space by restoring RSP from JIT R10. 
*/ + emit_mov(state, map_register(VM_REG_10), RSP); + + if (!(ARRAYS_SIZE(nonvolatile_reg) % 2)) + emit_alu64_imm32(state, 0x81, 0, RSP, 0x8); + + /* Restore platform non-volatile registers */ + for (uint32_t i = 0; i < ARRAYS_SIZE(nonvolatile_reg); i++) { + emit_pop(state, nonvolatile_reg[ARRAYS_SIZE(nonvolatile_reg) - i - 1]); + } + /* Return */ + emit1(state, 0xc3); +} + +#define X64(inst, code) \ + static void do_##inst(struct jit_state *state UNUSED, riscv_t *rv UNUSED, \ + rv_insn_t *ir UNUSED) \ + { \ + code; \ + } + +#include "rv32_jit_template.c" +#undef X64 + +static void do_fuse1(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir) +{ + opcode_fuse_t *fuse = ir->fuse; + for (int i = 0; i < ir->imm2; i++) { + emit_load_imm(state, RAX, fuse[i].imm); + emit_store(state, S32, RAX, parameter_reg[0], + offsetof(struct riscv_internal, X) + 4 * fuse[i].rd); + } +} + +static void do_fuse2(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir) +{ + emit_load_imm(state, RAX, ir->imm); + emit_store(state, S32, RAX, parameter_reg[0], + offsetof(struct riscv_internal, X) + 4 * ir->rd); + emit_load(state, S32, parameter_reg[0], RBX, + offsetof(struct riscv_internal, X) + 4 * ir->rs1); + emit_alu32(state, 0x01, RBX, RAX); + emit_store(state, S32, RAX, parameter_reg[0], + offsetof(struct riscv_internal, X) + 4 * ir->rs2); +} + +static void do_fuse3(struct jit_state *state, riscv_t *rv, rv_insn_t *ir) +{ + memory_t *m = ((state_t *) rv->userdata)->mem; + opcode_fuse_t *fuse = ir->fuse; + for (int i = 0; i < ir->imm2; i++) { + emit_load(state, S32, parameter_reg[0], RAX, + offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1); + emit_load_imm(state, RBX, (intptr_t) (m->mem_base + fuse[i].imm)); + emit_alu64(state, 0x01, RBX, RAX); + emit_load(state, S32, parameter_reg[0], RBX, + offsetof(struct riscv_internal, X) + 4 * fuse[i].rs2); + emit_store(state, S32, RBX, RAX, 0); + } +} + +static void do_fuse4(struct jit_state *state, riscv_t *rv, rv_insn_t *ir) +{ + memory_t *m = ((state_t *) rv->userdata)->mem; + opcode_fuse_t *fuse = ir->fuse; + for (int i = 0; i < ir->imm2; i++) { + emit_load(state, S32, parameter_reg[0], RAX, + offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1); + emit_load_imm(state, RBX, (intptr_t) (m->mem_base + fuse[i].imm)); + emit_alu64(state, 0x01, RBX, RAX); + emit_load(state, S32, RAX, RBX, 0); + emit_store(state, S32, RBX, parameter_reg[0], + offsetof(struct riscv_internal, X) + 4 * fuse[i].rd); + } +} + +static void do_fuse5(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir) +{ + emit_load_imm(state, RAX, ir->pc + 4); + emit_store(state, S32, RAX, parameter_reg[0], + offsetof(struct riscv_internal, PC)); + emit_call(state, (intptr_t) rv->io.on_memset); + emit_exit(&(*state)); +} + +static void do_fuse6(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir) +{ + emit_load_imm(state, RAX, ir->pc + 4); + emit_store(state, S32, RAX, parameter_reg[0], + offsetof(struct riscv_internal, PC)); + emit_call(state, (intptr_t) rv->io.on_memcpy); + emit_exit(&(*state)); +} + +static void do_fuse7(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir) +{ + opcode_fuse_t *fuse = ir->fuse; + for (int i = 0; i < ir->imm2; i++) { + switch (fuse[i].opcode) { + case rv_insn_slli: + emit_load(state, S32, parameter_reg[0], RAX, + offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1); + emit_alu32_imm8(state, 0xc1, 4, RAX, fuse[i].imm & 0x1f); + emit_store(state, S32, RAX, parameter_reg[0], + offsetof(struct riscv_internal, X) + 4 * fuse[i].rd); + break; + case 
rv_insn_srli: + emit_load(state, S32, parameter_reg[0], RAX, + offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1); + emit_alu32_imm8(state, 0xc1, 5, RAX, fuse[i].imm & 0x1f); + emit_store(state, S32, RAX, parameter_reg[0], + offsetof(struct riscv_internal, X) + 4 * fuse[i].rd); + break; + case rv_insn_srai: + emit_load(state, S32, parameter_reg[0], RAX, + offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1); + emit_alu32_imm8(state, 0xc1, 7, RAX, fuse[i].imm & 0x1f); + emit_store(state, S32, RAX, parameter_reg[0], + offsetof(struct riscv_internal, X) + 4 * fuse[i].rd); + break; + default: + __UNREACHABLE; + break; + } + } +} + +/* clang-format off */ +static const void *dispatch_table[] = { + /* RV32 instructions */ +#define _(inst, can_branch, insn_len, reg_mask) [rv_insn_##inst] = do_##inst, + RV_INSN_LIST +#undef _ + /* Macro operation fusion instructions */ +#define _(inst) [rv_insn_##inst] = do_##inst, + FUSE_INSN_LIST +#undef _ +}; +/* clang-format on */ +typedef void (*codegen_tired1_block_func_t)(struct jit_state *, + riscv_t *, + rv_insn_t *); + +static void translate(struct jit_state *state, riscv_t *rv, block_t *block) +{ + uint32_t idx; + rv_insn_t *ir, *next; + for (idx = 0, ir = block->ir_head; idx < block->n_insn; idx++, ir = next) { + next = ir->next; + ((codegen_tired1_block_func_t) dispatch_table[ir->opcode])(state, rv, + ir); + } +} + +static void resolve_jumps(struct jit_state *state) +{ + int i; + for (i = 0; i < state->num_jumps; i++) { + struct jump jump = state->jumps[i]; + + int target_loc; + if (jump.target_offset != 0) + target_loc = jump.target_offset; + else if (jump.target_pc == TARGET_PC_EXIT) + target_loc = state->exit_loc; + else if (jump.target_pc == TARGET_PC_RETPOLINE) + target_loc = state->retpoline_loc; + else { + target_loc = jump.offset_loc + sizeof(uint32_t); + for (int i = 0; i < state->num_insn; i++) { + if (jump.target_pc == state->offset_map[i].PC) { + target_loc = state->offset_map[i].offset; + break; + } + } + } + /* Assumes jump offset is at end of instruction */ + uint32_t rel = target_loc - (jump.offset_loc + sizeof(uint32_t)); + + uint8_t *offset_ptr = &state->buf[jump.offset_loc]; + memcpy(offset_ptr, &rel, sizeof(uint32_t)); + } +} + +static void translate_chained_block(struct jit_state *state, + riscv_t *rv, + block_t *block, + set_t *set) +{ + if (set_has(set, block->pc_start)) + return; + set_add(set, block->pc_start); + offset_map_insert(state, block->pc_start); + translate(state, rv, block); + rv_insn_t *ir = block->ir_tail; + if (ir->branch_untaken && !set_has(set, ir->pc + 4)) { + block_t *block1 = cache_get(rv->block_cache, ir->pc + 4); + if (block1) + translate_chained_block(state, rv, block1, set); + } + if (ir->branch_taken && !set_has(set, ir->pc + ir->imm)) { + block_t *block1 = cache_get(rv->block_cache, ir->pc + ir->imm); + if (block1) + translate_chained_block(state, rv, block1, set); + } +} + +uint32_t translate_x64(riscv_t *rv, block_t *block) +{ + struct jit_state *state = rv->jit_state; + memset(state->offset_map, 0, MAX_INSTS * sizeof(struct offset_map)); + memset(state->jumps, 0, MAX_INSTS * sizeof(struct jump)); + state->num_insn = 0; + state->num_jumps = 0; + uint32_t entry_loc = state->offset; + set_t set; + set_reset(&set); + translate_chained_block(&(*state), rv, block, &set); + + if (state->offset == state->size) { + printf("Target buffer too small\n"); + goto out; + } + resolve_jumps(&(*state)); +out: + return entry_loc; +} + + +struct jit_state *init_state(size_t size) +{ + struct jit_state *state = 
malloc(sizeof(struct jit_state)); + state->offset = 0; + state->size = size; + state->buf = mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS +#if defined(__APPLE__) + | MAP_JIT +#endif + , + -1, 0); + assert(state->buf != MAP_FAILED); + prepare_translate(state); + state->offset_map = calloc(MAX_INSTS, sizeof(struct offset_map)); + state->jumps = calloc(MAX_INSTS, sizeof(struct jump)); + return state; +} + +void destroy_state(struct jit_state *state) +{ + munmap(state->buf, state->size); + free(state->offset_map); + free(state->jumps); + free(state); +} diff --git a/src/jit_x64.h b/src/jit_x64.h new file mode 100644 index 00000000..17f5d67b --- /dev/null +++ b/src/jit_x64.h @@ -0,0 +1,410 @@ +/* + * rv32emu is freely redistributable under the MIT License. See the file + * "LICENSE" for information on usage and redistribution of this file. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "riscv_private.h" + +enum X64_REG { + RAX, + RCX, + RDX, + RBX, + RSP, + RBP, + RIP = 5, + RSI, + RDI, + R8, + R9, + R10, + R11, + R12, + R13, + R14, + R15, +}; + +enum operand_size { + S8, + S16, + S32, +}; + +struct jump { + uint32_t offset_loc; + uint32_t target_pc; + uint32_t target_offset; +}; + +/* Special values for target_pc in struct jump */ +#define TARGET_PC_EXIT -1U +#define TARGET_PC_RETPOLINE -3U + +struct offset_map { + uint32_t PC; + uint32_t offset; +}; + +struct jit_state { + uint8_t *buf; + uint32_t offset; + uint32_t size; + uint32_t exit_loc; + uint32_t retpoline_loc; + struct offset_map *offset_map; + int num_insn; + struct jump *jumps; + int num_jumps; +}; + +struct jit_state *init_state(size_t size); +void destroy_state(struct jit_state *state); +uint32_t translate_x64(riscv_t *rv, block_t *block); + +static inline void offset_map_insert(struct jit_state *state, int32_t target_pc) +{ + struct offset_map *map_entry = &state->offset_map[state->num_insn++]; + map_entry->PC = target_pc; + map_entry->offset = state->offset; +} + +static inline void emit_bytes(struct jit_state *state, void *data, uint32_t len) +{ + assert(state->offset <= state->size - len); + if ((state->offset + len) > state->size) { + state->offset = state->size; + return; + } + memcpy(state->buf + state->offset, data, len); + state->offset += len; +} + +static inline void emit1(struct jit_state *state, uint8_t x) +{ + emit_bytes(state, &x, sizeof(x)); +} + +static inline void emit2(struct jit_state *state, uint16_t x) +{ + emit_bytes(state, &x, sizeof(x)); +} + +static inline void emit4(struct jit_state *state, uint32_t x) +{ + emit_bytes(state, &x, sizeof(x)); +} + +static inline void emit8(struct jit_state *state, uint64_t x) +{ + emit_bytes(state, &x, sizeof(x)); +} + +static inline void emit_jump_target_address(struct jit_state *state, + int32_t target_pc) +{ + struct jump *jump = &state->jumps[state->num_jumps++]; + jump->offset_loc = state->offset; + jump->target_pc = target_pc; + emit4(state, 0); +} + +static inline void emit_jump_target_offset(struct jit_state *state, + uint32_t jump_loc, + uint32_t jump_state_offset) +{ + struct jump *jump = &state->jumps[state->num_jumps++]; + jump->offset_loc = jump_loc; + jump->target_offset = jump_state_offset; +} + +static inline void emit_modrm(struct jit_state *state, int mod, int r, int m) +{ + assert(!(mod & ~0xc0)); + emit1(state, (mod & 0xc0) | ((r & 7) << 3) | (m & 7)); +} + +static inline void emit_modrm_reg2reg(struct jit_state *state, int r, int m) +{ + emit_modrm(state, 0xc0, r, m); +} + +static 
inline void emit_modrm_and_displacement(struct jit_state *state, + int r, + int m, + int32_t d) +{ + if (d == 0 && (m & 7) != RBP) { + emit_modrm(state, 0x00, r, m); + } else if (d >= -128 && d <= 127) { + emit_modrm(state, 0x40, r, m); + emit1(state, d); + } else { + emit_modrm(state, 0x80, r, m); + emit4(state, d); + } +} + +static inline void emit_rex(struct jit_state *state, int w, int r, int x, int b) +{ + assert(!(w & ~1)); + assert(!(r & ~1)); + assert(!(x & ~1)); + assert(!(b & ~1)); + emit1(state, 0x40 | (w << 3) | (r << 2) | (x << 1) | b); +} + +/* + * Emits a REX prefix with the top bit of src and dst. + * Skipped if no bits would be set. + */ +static inline void emit_basic_rex(struct jit_state *state, + int w, + int src, + int dst) +{ + if (w || (src & 8) || (dst & 8)) { + emit_rex(state, w, !!(src & 8), 0, !!(dst & 8)); + } +} + +static inline void emit_push(struct jit_state *state, int r) +{ + emit_basic_rex(state, 0, 0, r); + emit1(state, 0x50 | (r & 7)); +} + +static inline void emit_pop(struct jit_state *state, int r) +{ + emit_basic_rex(state, 0, 0, r); + emit1(state, 0x58 | (r & 7)); +} + +/* REX prefix and ModRM byte */ +/* We use the MR encoding when there is a choice */ +/* 'src' is often used as an opcode extension */ +static inline void emit_alu32(struct jit_state *state, int op, int src, int dst) +{ + emit_basic_rex(state, 0, src, dst); + emit1(state, op); + emit_modrm_reg2reg(state, src, dst); +} + +/* REX prefix, ModRM byte, and 32-bit immediate */ +static inline void emit_alu32_imm32(struct jit_state *state, + int op, + int src, + int dst, + int32_t imm) +{ + emit_alu32(state, op, src, dst); + emit4(state, imm); +} + +/* REX prefix, ModRM byte, and 8-bit immediate */ +static inline void emit_alu32_imm8(struct jit_state *state, + int op, + int src, + int dst, + int8_t imm) +{ + emit_alu32(state, op, src, dst); + emit1(state, imm); +} + +/* REX.W prefix and ModRM byte */ +/* We use the MR encoding when there is a choice */ +/* 'src' is often used as an opcode extension */ +static inline void emit_alu64(struct jit_state *state, int op, int src, int dst) +{ + emit_basic_rex(state, 1, src, dst); + emit1(state, op); + emit_modrm_reg2reg(state, src, dst); +} + +/* REX.W prefix, ModRM byte, and 32-bit immediate */ +static inline void emit_alu64_imm32(struct jit_state *state, + int op, + int src, + int dst, + int32_t imm) +{ + emit_alu64(state, op, src, dst); + emit4(state, imm); +} + +/* REX.W prefix, ModRM byte, and 8-bit immediate */ +static inline void emit_alu64_imm8(struct jit_state *state, + int op, + int src, + int dst, + int8_t imm) +{ + emit_alu64(state, op, src, dst); + emit1(state, imm); +} + +/* Register to register mov */ +static inline void emit_mov(struct jit_state *state, int src, int dst) +{ + emit_alu64(state, 0x89, src, dst); +} + +static inline void emit_cmp_imm32(struct jit_state *state, int dst, int32_t imm) +{ + emit_alu64_imm32(state, 0x81, 7, dst, imm); +} + +static inline void emit_cmp32_imm32(struct jit_state *state, + int dst, + int32_t imm) +{ + emit_alu32_imm32(state, 0x81, 7, dst, imm); +} + +static inline void emit_cmp(struct jit_state *state, int src, int dst) +{ + emit_alu64(state, 0x39, src, dst); +} + +static inline void emit_cmp32(struct jit_state *state, int src, int dst) +{ + emit_alu32(state, 0x39, src, dst); +} + +static inline void emit_jcc(struct jit_state *state, + int code, + int32_t target_pc) +{ + emit1(state, 0x0f); + emit1(state, code); + emit_jump_target_address(state, target_pc); +} + +static inline void 
emit_jcc_offset(struct jit_state *state, int code) +{ + emit1(state, 0x0f); + emit1(state, code); + emit4(state, 0); +} + + +/* Load [src + offset] into dst */ +static inline void emit_load(struct jit_state *state, + enum operand_size size, + int src, + int dst, + int32_t offset) +{ + if (size == S8 || size == S16) { + /* movzx */ + emit1(state, 0x0f); + emit1(state, size == S8 ? 0xb6 : 0xb7); + } else if (size == S32) { + /* mov */ + emit1(state, 0x8b); + } + + emit_modrm_and_displacement(state, dst, src, offset); +} + +static inline void emit_load_sext(struct jit_state *state, + enum operand_size size, + int src, + int dst, + int32_t offset) +{ + if (size == S8 || size == S16) { + /* movsx */ + emit1(state, 0x0f); + emit1(state, size == S8 ? 0xbe : 0xbf); + } else if (size == S32) { + emit_basic_rex(state, 1, dst, src); + emit1(state, 0x63); + } + + emit_modrm_and_displacement(state, dst, src, offset); +} + +/* Load sign-extended immediate into register */ +static inline void emit_load_imm(struct jit_state *state, int dst, int64_t imm) +{ + if (imm >= INT32_MIN && imm <= INT32_MAX) { + emit_alu64_imm32(state, 0xc7, 0, dst, imm); + } else { + /* movabs $imm,dst */ + emit_basic_rex(state, 1, 0, dst); + emit1(state, 0xb8 | (dst & 7)); + emit8(state, imm); + } +} + +/* Store register src to [dst + offset] */ +static inline void emit_store(struct jit_state *state, + enum operand_size size, + int src, + int dst, + int32_t offset) +{ + if (size == S16) { + emit1(state, 0x66); /* 16-bit override */ + } + emit1(state, size == S8 ? 0x88 : 0x89); + emit_modrm_and_displacement(state, src, dst, offset); +} + +/* Store immediate to [dst + offset] */ +static inline void emit_store_imm32(struct jit_state *state, + enum operand_size size, + int dst, + int32_t offset, + int32_t imm) +{ + if (size == S16) { + emit1(state, 0x66); /* 16-bit override */ + } + emit1(state, size == S8 ? 
0xc6 : 0xc7); + emit_modrm_and_displacement(state, 0, dst, offset); + if (size == S32) { + emit4(state, imm); + } else if (size == S16) { + emit2(state, imm); + } else if (size == S8) { + emit1(state, imm); + } +} + +static inline void emit_ret(struct jit_state *state) +{ + emit1(state, 0xc3); +} + +static inline void emit_jmp(struct jit_state *state, uint32_t target_pc) +{ + emit1(state, 0xe9); + emit_jump_target_address(state, target_pc); +} + +static inline void emit_call(struct jit_state *state, intptr_t target) +{ + emit_load_imm(state, RAX, (intptr_t) target); + /* callq *%rax */ + emit1(state, 0xff); + /* ModR/M byte: b11010000b = xd0, rax is register 0*/ + emit1(state, 0xd0); +} + +static inline void emit_exit(struct jit_state *state) +{ + emit1(state, 0xe9); + emit_jump_target_offset(state, state->offset, state->exit_loc); + emit4(state, 0); +} diff --git a/src/main.c b/src/main.c index 7f35034c..2ed8aa41 100644 --- a/src/main.c +++ b/src/main.c @@ -217,6 +217,8 @@ int main(int argc, char **args) /* system */ .on_ecall = ecall_handler, .on_ebreak = ebreak_handler, + .on_memcpy = memcpy_handler, + .on_memset = memset_handler, .allow_misalign = opt_misaligned, }; diff --git a/src/riscv.c b/src/riscv.c index f4807758..72f77219 100644 --- a/src/riscv.c +++ b/src/riscv.c @@ -10,9 +10,16 @@ #include "mpool.h" #include "riscv_private.h" #include "state.h" +#include "utils.h" +#if RV32_HAS(JIT) +#include "cache.h" +#include "jit_x64.h" +#define CODE_CACHE_SIZE (1024 * 1024) +#endif #define BLOCK_IR_MAP_CAPACITY_BITS 10 +#if !RV32_HAS(JIT) /* initialize the block map */ static void block_map_init(block_map_t *map, const uint8_t bits) { @@ -52,6 +59,7 @@ static void block_map_destroy(riscv_t *rv) mpool_destroy(rv->block_mp); mpool_destroy(rv->block_ir_mp); } +#endif riscv_user_t rv_userdata(riscv_t *rv) { @@ -119,9 +127,13 @@ riscv_t *rv_create(const riscv_io_t *io, rv->block_ir_mp = mpool_create( sizeof(rv_insn_t) << BLOCK_IR_MAP_CAPACITY_BITS, sizeof(rv_insn_t)); +#if !RV32_HAS(JIT) /* initialize the block map */ - block_map_init(&rv->block_map, 10); - + block_map_init(&rv->block_map, BLOCK_MAP_CAPACITY_BITS); +#else + rv->jit_state = init_state(CODE_CACHE_SIZE); + rv->block_cache = cache_create(BLOCK_MAP_CAPACITY_BITS); +#endif /* reset */ rv_reset(rv, 0U, argc, args); @@ -143,10 +155,16 @@ bool rv_enables_to_output_exit_code(riscv_t *rv) return rv->output_exit_code; } + void rv_delete(riscv_t *rv) { assert(rv); +#if !RV32_HAS(JIT) block_map_destroy(rv); +#else + destroy_state(rv->jit_state); + cache_free(rv->block_cache, NULL); +#endif free(rv); } diff --git a/src/riscv.h b/src/riscv.h index 1d5e45a3..b0538723 100644 --- a/src/riscv.h +++ b/src/riscv.h @@ -90,7 +90,7 @@ enum { #define MSTATUS_MPIE (1 << MSTATUS_MPIE_SHIFT) #define MSTATUS_MPP (3 << MSTATUS_MPP_SHIFT) -#define BLOCK_MAP_CAPACITY_BITS 10 +#define BLOCK_MAP_CAPACITY_BITS 11 /* forward declaration for internal structure */ typedef struct riscv_internal riscv_t; @@ -118,7 +118,8 @@ typedef void (*riscv_mem_write_b)(riscv_word_t addr, riscv_byte_t data); /* system instruction handlers */ typedef void (*riscv_on_ecall)(riscv_t *rv); typedef void (*riscv_on_ebreak)(riscv_t *rv); - +typedef void (*riscv_on_memset)(riscv_t *rv); +typedef void (*riscv_on_memcpy)(riscv_t *rv); /* RISC-V emulator I/O interface */ typedef struct { /* memory read interface */ @@ -135,7 +136,8 @@ typedef struct { /* system */ riscv_on_ecall on_ecall; riscv_on_ebreak on_ebreak; - + riscv_on_memset on_memset; + riscv_on_memcpy on_memcpy; /* enable 
misaligned memory access */ bool allow_misalign; } riscv_io_t; @@ -182,6 +184,12 @@ void syscall_handler(riscv_t *rv); /* environment call handler */ void ecall_handler(riscv_t *rv); +/* memset handler */ +void memset_handler(riscv_t *rv); + +/* memcpy handler */ +void memcpy_handler(riscv_t *rv); + /* dump registers as JSON to out_file_path */ void dump_registers(riscv_t *rv, char *out_file_path); diff --git a/src/riscv_private.h b/src/riscv_private.h index e0f1d1d0..fb15b277 100644 --- a/src/riscv_private.h +++ b/src/riscv_private.h @@ -12,6 +12,9 @@ #endif #include "decode.h" #include "riscv.h" +#if RV32_HAS(JIT) +#include "cache.h" +#endif /* CSRs */ enum { @@ -59,6 +62,11 @@ typedef struct block { struct block *predict; /**< block prediction */ rv_insn_t *ir_head, *ir_tail; /**< the first and last ir for this block */ + bool backward; +#if RV32_HAS(JIT) + bool hot; /**< Determine the block is hotspot or not */ + uint32_t offset; +#endif } block_t; typedef struct { @@ -83,20 +91,6 @@ struct riscv_internal { /* user provided data */ riscv_user_t userdata; -#if RV32_HAS(GDBSTUB) - /* gdbstub instance */ - gdbstub_t gdbstub; - - bool debug_mode; - - /* GDB instruction breakpoint */ - breakpoint_map_t breakpoint_map; - - /* The flag to notify interrupt from GDB client: it should - * be accessed by atomic operation when starting the GDBSTUB. */ - bool is_interrupted; -#endif - #if RV32_HAS(EXT_F) /* float registers */ riscv_float_t F[N_RV_REGS]; @@ -116,11 +110,29 @@ struct riscv_internal { uint32_t csr_mip; /* Machine interrupt pending */ uint32_t csr_mbadaddr; - bool compressed; /**< current instruction is compressed or not */ + bool compressed; /**< current instruction is compressed or not */ +#if !RV32_HAS(JIT) block_map_t block_map; /**< basic block map */ +#else + struct cache *block_cache; +#endif struct mpool *block_mp, *block_ir_mp; /* print exit code on syscall_exit */ bool output_exit_code; + void *jit_state; +#if RV32_HAS(GDBSTUB) + /* gdbstub instance */ + gdbstub_t gdbstub; + + bool debug_mode; + + /* GDB instruction breakpoint */ + breakpoint_map_t breakpoint_map; + + /* The flag to notify interrupt from GDB client: it should + * be accessed by atomic operation when starting the GDBSTUB. */ + bool is_interrupted; +#endif }; /* sign extend a 16 bit value */ diff --git a/src/rv32_template.c b/src/rv32_template.c index 4c5c5472..2a13d085 100644 --- a/src/rv32_template.c +++ b/src/rv32_template.c @@ -3,43 +3,86 @@ * specification version 20191213. */ +/* Please review the files gen-jit-template.py and jit_x64.[ch] to comprehend + * the custom domain-specific language used by the tier 1 JIT compiler. Each + * opcode is associated with a corresponding API. For instance, the API for + * ld_imm is named emit_load_imm. + */ + /* Internal */ -RVOP(nop, { rv->X[rv_reg_zero] = 0; }) +RVOP( + nop, + { rv->X[rv_reg_zero] = 0; }, + X64({})) /* LUI is used to build 32-bit constants and uses the U-type format. LUI * places the U-immediate value in the top 20 bits of the destination * register rd, filling in the lowest 12 bits with zeros. The 32-bit * result is sign-extended to 64 bits. */ -RVOP(lui, { rv->X[ir->rd] = ir->imm; }) +RVOP( + lui, + { rv->X[ir->rd] = ir->imm; }, + X64({ + ld_imm, RAX, imm; + st, S32, RAX, X, rd; + })) /* AUIPC is used to build pc-relative addresses and uses the U-type format. 
* AUIPC forms a 32-bit offset from the 20-bit U-immediate, filling in the * lowest 12 bits with zeros, adds this offset to the address of the AUIPC * instruction, then places the result in register rd. */ -RVOP(auipc, { rv->X[ir->rd] = ir->imm + PC; }) +RVOP( + auipc, + { rv->X[ir->rd] = ir->imm + PC; }, + X64({ + ld_imm, RAX, pc, imm; + st, S32, RAX, X, rd; + })) /* JAL: Jump and Link * store successor instruction address into rd. * add next J imm (offset) to pc. */ -RVOP(jal, { - const uint32_t pc = PC; - /* Jump */ - PC += ir->imm; - /* link with return address */ - if (ir->rd) - rv->X[ir->rd] = pc + 4; - /* check instruction misaligned */ - RV_EXC_MISALIGN_HANDLER(pc, insn, false, 0); - struct rv_insn *taken = ir->branch_taken; - if (taken) - MUST_TAIL return taken->impl(rv, taken, cycle, PC); - rv->csr_cycle = cycle; - rv->PC = PC; - return true; -}) +RVOP( + jal, + { + const uint32_t pc = PC; + /* Jump */ + PC += ir->imm; + /* link with return address */ + if (ir->rd) + rv->X[ir->rd] = pc + 4; + /* check instruction misaligned */ + RV_EXC_MISALIGN_HANDLER(pc, insn, false, 0); + struct rv_insn *taken = ir->branch_taken; + if (taken) { +#if RV32_HAS(JIT) + if (!cache_get(rv->block_cache, PC)) { + clear_flag = true; + goto end_insn; + } + if (cache_hot(rv->block_cache, PC)) + goto end_insn; +#endif + MUST_TAIL return taken->impl(rv, taken, cycle, PC); + } + end_insn: + rv->csr_cycle = cycle; + rv->PC = PC; + return true; + }, + X64({ + cond, rd; + ld_imm, RAX, pc, 4; + st, S32, RAX, X, rd; + end; + ld_imm, RAX, pc, imm; + st, S32, RAX, PC; + jmp, pc, imm; + exit; + })) /* The branch history table records historical data pertaining to indirect jump * targets. This functionality alleviates the need to invoke block_find() and @@ -71,20 +114,35 @@ RVOP(jal, { * register rd. Register x0 can be used as the destination if the result is * not required. 
*/ -RVOP(jalr, { - const uint32_t pc = PC; - /* jump */ - PC = (rv->X[ir->rs1] + ir->imm) & ~1U; - /* link */ - if (ir->rd) - rv->X[ir->rd] = pc + 4; - /* check instruction misaligned */ - RV_EXC_MISALIGN_HANDLER(pc, insn, false, 0); - LOOKUP_OR_UPDATE_BRANCH_HISTORY_TABLE(); - rv->csr_cycle = cycle; - rv->PC = PC; - return true; -}) +RVOP( + jalr, + { + const uint32_t pc = PC; + /* jump */ + PC = (rv->X[ir->rs1] + ir->imm) & ~1U; + /* link */ + if (ir->rd) + rv->X[ir->rd] = pc + 4; + /* check instruction misaligned */ + RV_EXC_MISALIGN_HANDLER(pc, insn, false, 0); +#if !RV32_HAS(JIT) + LOOKUP_OR_UPDATE_BRANCH_HISTORY_TABLE(); +#endif + rv->csr_cycle = cycle; + rv->PC = PC; + return true; + }, + X64({ + cond, rd; + ld_imm, RAX, pc, 4; + st, S32, RAX, X, rd; + end; + ld, S32, RAX, X, rs1; + alu32_imm, 32, 0x81, 0, RAX, imm; + alu32_imm, 32, 0x81, 4, RAX, ~1U; + st, S32, RAX, PC; + exit; + })) /* clang-format off */ #define BRANCH_FUNC(type, cond) \ @@ -94,6 +152,12 @@ RVOP(jalr, { struct rv_insn *untaken = ir->branch_untaken; \ if (!untaken) \ goto nextop; \ + IIF(RV32_HAS(JIT)) \ + ( \ + if (!cache_get(rv->block_cache, PC + 4)) { \ + clear_flag = true; \ + goto nextop; \ + }, ); \ PC += 4; \ last_pc = PC; \ MUST_TAIL return untaken->impl(rv, untaken, cycle, PC); \ @@ -104,9 +168,16 @@ RVOP(jalr, { RV_EXC_MISALIGN_HANDLER(pc, insn, false, 0); \ struct rv_insn *taken = ir->branch_taken; \ if (taken) { \ + IIF(RV32_HAS(JIT)) \ + ( \ + if (!cache_get(rv->block_cache, PC)) { \ + clear_flag = true; \ + goto end_insn; \ + }, ); \ last_pc = PC; \ MUST_TAIL return taken->impl(rv, taken, cycle, PC); \ } \ + end_insn: \ rv->csr_cycle = cycle; \ rv->PC = PC; \ return true; @@ -129,22 +200,154 @@ RVOP(jalr, { */ /* BEQ: Branch if Equal */ -RVOP(beq, { BRANCH_FUNC(uint32_t, !=); }) +RVOP( + beq, + { BRANCH_FUNC(uint32_t, !=); }, + X64({ + ld, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + cmp, RBX, RAX; + set_jmp_off; + jcc, 0x84; + cond, branch_untaken; + jmp, pc, 4; + end; + ld_imm, RAX, pc, 4; + st, S32, RAX, PC; + exit; + jmp_off; + cond, branch_taken; + jmp, pc, imm; + end; + ld_imm, RAX, pc, imm; + st, S32, RAX, PC; + exit; + })) /* BNE: Branch if Not Equal */ -RVOP(bne, { BRANCH_FUNC(uint32_t, ==); }) +RVOP( + bne, + { BRANCH_FUNC(uint32_t, ==); }, + X64({ + ld, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + cmp, RBX, RAX; + set_jmp_off; + jcc, 0x85; + cond, branch_untaken; + jmp, pc, 4; + end; + ld_imm, RAX, pc, 4; + st, S32, RAX, PC; + exit; + jmp_off; + cond, branch_taken; + jmp, pc, imm; + end; + ld_imm, RAX, pc, imm; + st, S32, RAX, PC; + exit; + })) /* BLT: Branch if Less Than */ -RVOP(blt, { BRANCH_FUNC(int32_t, >=); }) +RVOP( + blt, + { BRANCH_FUNC(int32_t, >=); }, + X64({ + ld, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + cmp, RBX, RAX; + set_jmp_off; + jcc, 0x8c; + cond, branch_untaken; + jmp, pc, 4; + end; + ld_imm, RAX, pc, 4; + st, S32, RAX, PC; + exit; + jmp_off; + cond, branch_taken; + jmp, pc, imm; + end; + ld_imm, RAX, pc, imm; + st, S32, RAX, PC; + exit; + })) /* BGE: Branch if Greater Than */ -RVOP(bge, { BRANCH_FUNC(int32_t, <); }) +RVOP( + bge, + { BRANCH_FUNC(int32_t, <); }, + X64({ + ld, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + cmp, RBX, RAX; + set_jmp_off; + jcc, 0x8d; + cond, branch_untaken; + jmp, pc, 4; + end; + ld_imm, RAX, pc, 4; + st, S32, RAX, PC; + exit; + jmp_off; + cond, branch_taken; + jmp, pc, imm; + end; + ld_imm, RAX, pc, imm; + st, S32, RAX, PC; + exit; + })) /* BLTU: Branch if Less Than Unsigned */ -RVOP(bltu, { BRANCH_FUNC(uint32_t, >=); }) +RVOP( + bltu, + { 
BRANCH_FUNC(uint32_t, >=); }, + X64({ + ld, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + cmp, RBX, RAX; + set_jmp_off; + jcc, 0x82; + cond, branch_untaken; + jmp, pc, 4; + end; + ld_imm, RAX, pc, 4; + st, S32, RAX, PC; + exit; + jmp_off; + cond, branch_taken; + jmp, pc, imm; + end; + ld_imm, RAX, pc, imm; + st, S32, RAX, PC; + exit; + })) /* BGEU: Branch if Greater Than Unsigned */ -RVOP(bgeu, { BRANCH_FUNC(uint32_t, <); }) +RVOP( + bgeu, + { BRANCH_FUNC(uint32_t, <); }, + X64({ + ld, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + cmp, RBX, RAX; + set_jmp_off; + jcc, 0x83; + cond, branch_untaken; + jmp, pc, 4; + end; + ld_imm, RAX, pc, 4; + st, S32, RAX, PC; + exit; + jmp_off; + cond, branch_taken; + jmp, pc, imm; + end; + ld_imm, RAX, pc, imm; + st, S32, RAX, PC; + exit; + })) /* There are 5 types of loads: two for byte and halfword sizes, and one for word * size. Two instructions are required for byte and halfword loads because they @@ -154,33 +357,84 @@ RVOP(bgeu, { BRANCH_FUNC(uint32_t, <); }) */ /* LB: Load Byte */ -RVOP(lb, { - rv->X[ir->rd] = sign_extend_b(rv->io.mem_read_b(rv->X[ir->rs1] + ir->imm)); -}) +RVOP( + lb, + { + rv->X[ir->rd] = + sign_extend_b(rv->io.mem_read_b(rv->X[ir->rs1] + ir->imm)); + }, + X64({ + mem; + ld, S32, RAX, X, rs1; + ld_imm, RBX, mem; + alu64, 0x01, RBX, RAX; + ld_sext, S8, RAX, RBX, 0; + st, S32, RBX, X, rd; + })) /* LH: Load Halfword */ -RVOP(lh, { - const uint32_t addr = rv->X[ir->rs1] + ir->imm; - RV_EXC_MISALIGN_HANDLER(1, load, false, 1); - rv->X[ir->rd] = sign_extend_h(rv->io.mem_read_s(addr)); -}) +RVOP( + lh, + { + const uint32_t addr = rv->X[ir->rs1] + ir->imm; + RV_EXC_MISALIGN_HANDLER(1, load, false, 1); + rv->X[ir->rd] = sign_extend_h(rv->io.mem_read_s(addr)); + }, + X64({ + mem; + ld, S32, RAX, X, rs1; + ld_imm, RBX, mem; + alu64, 0x01, RBX, RAX; + ld_sext, S16, RAX, RBX, 0; + st, S32, RBX, X, rd; + })) /* LW: Load Word */ -RVOP(lw, { - const uint32_t addr = rv->X[ir->rs1] + ir->imm; - RV_EXC_MISALIGN_HANDLER(3, load, false, 1); - rv->X[ir->rd] = rv->io.mem_read_w(addr); -}) +RVOP( + lw, + { + const uint32_t addr = rv->X[ir->rs1] + ir->imm; + RV_EXC_MISALIGN_HANDLER(3, load, false, 1); + rv->X[ir->rd] = rv->io.mem_read_w(addr); + }, + X64({ + mem; + ld, S32, RAX, X, rs1; + ld_imm, RBX, mem; + alu64, 0x01, RBX, RAX; + ld, S32, RAX, RBX, 0; + st, S32, RBX, X, rd; + })) /* LBU: Load Byte Unsigned */ -RVOP(lbu, { rv->X[ir->rd] = rv->io.mem_read_b(rv->X[ir->rs1] + ir->imm); }) +RVOP( + lbu, + { rv->X[ir->rd] = rv->io.mem_read_b(rv->X[ir->rs1] + ir->imm); }, + X64({ + mem; + ld, S32, RAX, X, rs1; + ld_imm, RBX, mem; + alu64, 0x01, RBX, RAX; + ld, S8, RAX, RBX, 0; + st, S32, RBX, X, rd; + })) /* LHU: Load Halfword Unsigned */ -RVOP(lhu, { - const uint32_t addr = rv->X[ir->rs1] + ir->imm; - RV_EXC_MISALIGN_HANDLER(1, load, false, 1); - rv->X[ir->rd] = rv->io.mem_read_s(addr); -}) +RVOP( + lhu, + { + const uint32_t addr = rv->X[ir->rs1] + ir->imm; + RV_EXC_MISALIGN_HANDLER(1, load, false, 1); + rv->X[ir->rd] = rv->io.mem_read_s(addr); + }, + X64({ + mem; + ld, S32, RAX, X, rs1; + ld_imm, RBX, mem; + alu64, 0x01, RBX, RAX; + ld, S16, RAX, RBX, 0; + st, S32, RBX, X, rd; + })) /* There are 3 types of stores: byte, halfword, and word-sized. 
Unlike loads, * there are no signed or unsigned variants, as stores to memory write exactly @@ -189,50 +443,130 @@ RVOP(lhu, { */ /* SB: Store Byte */ -RVOP(sb, { rv->io.mem_write_b(rv->X[ir->rs1] + ir->imm, rv->X[ir->rs2]); }) +RVOP( + sb, + { rv->io.mem_write_b(rv->X[ir->rs1] + ir->imm, rv->X[ir->rs2]); }, + X64({ + mem; + ld, S32, RAX, X, rs1; + ld_imm, RBX, mem; + alu64, 0x01, RBX, RAX; + ld, S8, RBX, X, rs2; + st, S8, RBX, RAX, 0; + })) /* SH: Store Halfword */ -RVOP(sh, { - const uint32_t addr = rv->X[ir->rs1] + ir->imm; - RV_EXC_MISALIGN_HANDLER(1, store, false, 1); - rv->io.mem_write_s(addr, rv->X[ir->rs2]); -}) +RVOP( + sh, + { + const uint32_t addr = rv->X[ir->rs1] + ir->imm; + RV_EXC_MISALIGN_HANDLER(1, store, false, 1); + rv->io.mem_write_s(addr, rv->X[ir->rs2]); + }, + X64({ + mem; + ld, S32, RAX, X, rs1; + ld_imm, RBX, mem; + alu64, 0x01, RBX, RAX; + ld, S16, RBX, X, rs2; + st, S16, RBX, RAX, 0; + })) /* SW: Store Word */ -RVOP(sw, { - const uint32_t addr = rv->X[ir->rs1] + ir->imm; - RV_EXC_MISALIGN_HANDLER(3, store, false, 1); - rv->io.mem_write_w(addr, rv->X[ir->rs2]); -}) +RVOP( + sw, + { + const uint32_t addr = rv->X[ir->rs1] + ir->imm; + RV_EXC_MISALIGN_HANDLER(3, store, false, 1); + rv->io.mem_write_w(addr, rv->X[ir->rs2]); + }, + X64({ + mem; + ld, S32, RAX, X, rs1; + ld_imm, RBX, mem; + alu64, 0x01, RBX, RAX; + ld, S32, RBX, X, rs2; + st, S32, RBX, RAX, 0; + })) /* ADDI adds the sign-extended 12-bit immediate to register rs1. Arithmetic * overflow is ignored and the result is simply the low XLEN bits of the * result. ADDI rd, rs1, 0 is used to implement the MV rd, rs1 assembler * pseudo-instruction. */ -RVOP(addi, { rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + ir->imm; }) +RVOP( + addi, + { rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + ir->imm; }, + X64({ + ld, S32, RAX, X, rs1; + alu32_imm, 32, 0x81, 0, RAX, imm; + st, S32, RAX, X, rd; + })) /* SLTI place the value 1 in register rd if register rs1 is less than the * signextended immediate when both are treated as signed numbers, else 0 is * written to rd. */ -RVOP(slti, { rv->X[ir->rd] = ((int32_t) (rv->X[ir->rs1]) < ir->imm) ? 1 : 0; }) +RVOP( + slti, + { rv->X[ir->rd] = ((int32_t) (rv->X[ir->rs1]) < ir->imm) ? 1 : 0; }, + X64({ + ld, S32, RAX, X, rs1; + cmp_imm, RAX, imm; + st_imm, S32, rd, 1; + set_jmp_off; + jcc, 0x82; + st_imm, S32, rd, 0; + jmp_off; + })) /* SLTIU places the value 1 in register rd if register rs1 is less than the * immediate when both are treated as unsigned numbers, else 0 is written to rd. */ -RVOP(sltiu, { rv->X[ir->rd] = (rv->X[ir->rs1] < (uint32_t) ir->imm) ? 1 : 0; }) +RVOP( + sltiu, + { rv->X[ir->rd] = (rv->X[ir->rs1] < (uint32_t) ir->imm) ? 1 : 0; }, + X64({ + ld, S32, RAX, X, rs1; + cmp_imm, RAX, imm; + st_imm, S32, rd, 1; + set_jmp_off; + jcc, 0x82; + st_imm, S32, rd, 0; + jmp_off; + })) /* XORI: Exclusive OR Immediate */ -RVOP(xori, { rv->X[ir->rd] = rv->X[ir->rs1] ^ ir->imm; }) +RVOP( + xori, + { rv->X[ir->rd] = rv->X[ir->rs1] ^ ir->imm; }, + X64({ + ld, S32, RAX, X, rs1; + alu32_imm, 32, 0x81, 6, RAX, imm; + st, S32, RAX, X, rd; + })) /* ORI: OR Immediate */ -RVOP(ori, { rv->X[ir->rd] = rv->X[ir->rs1] | ir->imm; }) +RVOP( + ori, + { rv->X[ir->rd] = rv->X[ir->rs1] | ir->imm; }, + X64({ + ld, S32, RAX, X, rs1; + alu32_imm, 32, 0x81, 1, RAX, imm; + st, S32, RAX, X, rd; + })) /* ANDI performs bitwise AND on register rs1 and the sign-extended 12-bit * immediate and place the result in rd. 
*/ -RVOP(andi, { rv->X[ir->rd] = rv->X[ir->rs1] & ir->imm; }) +RVOP( + andi, + { rv->X[ir->rd] = rv->X[ir->rs1] & ir->imm; }, + X64({ + ld, S32, RAX, X, rs1; + alu32_imm, 32, 0x81, 4, RAX, imm; + st, S32, RAX, X, rd; + })) FORCE_INLINE void shift_func(riscv_t *rv, const rv_insn_t *ir) { @@ -255,123 +589,286 @@ FORCE_INLINE void shift_func(riscv_t *rv, const rv_insn_t *ir) /* SLLI performs logical left shift on the value in register rs1 by the shift * amount held in the lower 5 bits of the immediate. */ -RVOP(slli, { shift_func(rv, ir); }) +RVOP( + slli, + { shift_func(rv, ir); }, + X64({ + ld, S32, RAX, X, rs1; + alu32_imm, 8, 0xc1, 4, RAX, imm, 0x1f; + st, S32, RAX, X, rd; + })) /* SRLI performs logical right shift on the value in register rs1 by the shift * amount held in the lower 5 bits of the immediate. */ -RVOP(srli, { shift_func(rv, ir); }) +RVOP( + srli, + { shift_func(rv, ir); }, + X64({ + ld, S32, RAX, X, rs1; + alu32_imm, 8, 0xc1, 5, RAX, imm, 0x1f; + st, S32, RAX, X, rd; + })) /* SRAI performs arithmetic right shift on the value in register rs1 by the * shift amount held in the lower 5 bits of the immediate. */ -RVOP(srai, { shift_func(rv, ir); }) +RVOP( + srai, + { shift_func(rv, ir); }, + X64({ + ld, S32, RAX, X, rs1; + alu32_imm, 8, 0xc1, 7, RAX, imm, 0x1f; + st, S32, RAX, X, rd; + })) /* ADD */ -RVOP(add, { - rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + (int32_t) (rv->X[ir->rs2]); -}) +RVOP( + add, + { + rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + (int32_t) (rv->X[ir->rs2]); + }, + X64({ + ld, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + alu32, 0x01, RBX, RAX; + st, S32, RAX, X, rd; + })) /* SUB: Substract */ -RVOP(sub, { - rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) - (int32_t) (rv->X[ir->rs2]); -}) +RVOP( + sub, + { + rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) - (int32_t) (rv->X[ir->rs2]); + }, + X64({ + ld, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + alu32, 0x29, RBX, RAX; + st, S32, RAX, X, rd; + })) /* SLL: Shift Left Logical */ -RVOP(sll, { rv->X[ir->rd] = rv->X[ir->rs1] << (rv->X[ir->rs2] & 0x1f); }) +RVOP( + sll, + { rv->X[ir->rd] = rv->X[ir->rs1] << (rv->X[ir->rs2] & 0x1f); }, + X64({ + ld, S32, RAX, X, rs1; + ld, S32, RCX, X, rs2; + alu32_imm, 32, 0x81, 4, RCX, 0x1f; + alu32, 0xd3, 4, RAX; + st, S32, RAX, X, rd; + })) /* SLT: Set on Less Than */ -RVOP(slt, { - rv->X[ir->rd] = - ((int32_t) (rv->X[ir->rs1]) < (int32_t) (rv->X[ir->rs2])) ? 1 : 0; -}) +RVOP( + slt, + { + rv->X[ir->rd] = + ((int32_t) (rv->X[ir->rs1]) < (int32_t) (rv->X[ir->rs2])) ? 1 : 0; + }, + X64({ + ld, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + cmp, RBX, RAX; + st_imm, S32, rd, 1; + set_jmp_off; + jcc, 0x82; + st_imm, S32, rd, 0; + jmp_off; + })) /* SLTU: Set on Less Than Unsigned */ -RVOP(sltu, { rv->X[ir->rd] = (rv->X[ir->rs1] < rv->X[ir->rs2]) ? 1 : 0; }) +RVOP( + sltu, + { rv->X[ir->rd] = (rv->X[ir->rs1] < rv->X[ir->rs2]) ? 
1 : 0; }, + X64({ + ld, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + cmp, RBX, RAX; + st_imm, S32, rd, 1; + set_jmp_off; + jcc, 0x82; + st_imm, S32, rd, 0; + jmp_off; + })) /* XOR: Exclusive OR */ -RVOP(xor, { - rv->X[ir->rd] = rv->X[ir->rs1] ^ rv->X[ir->rs2]; -}) +RVOP( + xor, + { + rv->X[ir->rd] = rv->X[ir->rs1] ^ rv->X[ir->rs2]; + }, + X64({ + ld, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + alu32, 0x31, RBX, RAX; + st, S32, RAX, X, rd; + })) /* SRL: Shift Right Logical */ -RVOP(srl, { rv->X[ir->rd] = rv->X[ir->rs1] >> (rv->X[ir->rs2] & 0x1f); }) +RVOP( + srl, + { rv->X[ir->rd] = rv->X[ir->rs1] >> (rv->X[ir->rs2] & 0x1f); }, + X64({ + ld, S32, RAX, X, rs1; + ld, S32, RCX, X, rs2; + alu32_imm, 32, 0x81, 4, RCX, 0x1f; + alu32, 0xd3, 5, RAX; + st, S32, RAX, X, rd; + })) /* SRA: Shift Right Arithmetic */ -RVOP(sra, - { rv->X[ir->rd] = ((int32_t) rv->X[ir->rs1]) >> (rv->X[ir->rs2] & 0x1f); }) +RVOP( + sra, + { rv->X[ir->rd] = ((int32_t) rv->X[ir->rs1]) >> (rv->X[ir->rs2] & 0x1f); }, + X64({ + ld, S32, RAX, X, rs1; + ld, S32, RCX, X, rs2; + alu32_imm, 32, 0x81, 4, RCX, 0x1f; + alu32, 0xd3, 7, RAX; + st, S32, RAX, X, rd; + })) /* OR */ -RVOP(or, { rv->X[ir->rd] = rv->X[ir->rs1] | rv->X[ir->rs2]; }) +RVOP( + or + , + { rv->X[ir->rd] = rv->X[ir->rs1] | rv->X[ir->rs2]; }, + X64({ + ld, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + alu32, 0x09, RBX, RAX; + st, S32, RAX, X, rd; + })) /* AND */ -RVOP(and, { rv->X[ir->rd] = rv->X[ir->rs1] & rv->X[ir->rs2]; }) +RVOP( + and, + { rv->X[ir->rd] = rv->X[ir->rs1] & rv->X[ir->rs2]; }, + X64({ + ld, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + alu32, 0x21, RBX, RAX; + st, S32, RAX, X, rd; + })) /* ECALL: Environment Call */ -RVOP(ecall, { - rv->compressed = false; - rv->csr_cycle = cycle; - rv->PC = PC; - rv->io.on_ecall(rv); - return true; -}) +RVOP( + ecall, + { + rv->compressed = false; + rv->csr_cycle = cycle; + rv->PC = PC; + rv->io.on_ecall(rv); + return true; + }, + X64({ + ld_imm, RAX, pc; + st, S32, RAX, PC; + call, ecall; + exit; + })) /* EBREAK: Environment Break */ -RVOP(ebreak, { - rv->compressed = false; - rv->csr_cycle = cycle; - rv->PC = PC; - rv->io.on_ebreak(rv); - return true; -}) +RVOP( + ebreak, + { + rv->compressed = false; + rv->csr_cycle = cycle; + rv->PC = PC; + rv->io.on_ebreak(rv); + return true; + }, + X64({ + ld_imm, RAX, pc; + st, S32, RAX, PC; + call, ebreak; + exit; + })) /* WFI: Wait for Interrupt */ -RVOP(wfi, { - /* FIXME: Implement */ - return false; -}) +RVOP( + wfi, + { + /* FIXME: Implement */ + return false; + }, + X64({ + assert; /* FIXME: Implement */ + })) /* URET: return from traps in U-mode */ -RVOP(uret, { - /* FIXME: Implement */ - return false; -}) +RVOP( + uret, + { + /* FIXME: Implement */ + return false; + }, + X64({ + assert; /* FIXME: Implement */ + })) /* SRET: return from traps in S-mode */ -RVOP(sret, { - /* FIXME: Implement */ - return false; -}) +RVOP( + sret, + { + /* FIXME: Implement */ + return false; + }, + X64({ + assert; /* FIXME: Implement */ + })) /* HRET: return from traps in H-mode */ -RVOP(hret, { - /* FIXME: Implement */ - return false; -}) +RVOP( + hret, + { + /* FIXME: Implement */ + return false; + }, + X64({ + assert; /* FIXME: Implement */ + })) /* MRET: return from traps in U-mode */ -RVOP(mret, { - rv->csr_mstatus = MSTATUS_MPIE; - rv->PC = rv->csr_mepc; - return true; -}) +RVOP( + mret, + { + rv->csr_mstatus = MSTATUS_MPIE; + rv->PC = rv->csr_mepc; + return true; + }, + X64({ + assert; /* FIXME: Implement */ + })) #if RV32_HAS(Zifencei) /* RV32 Zifencei Standard Extension */ 
-RVOP(fencei, { - PC += 4; - /* FIXME: fill real implementations */ - rv->csr_cycle = cycle; - rv->PC = PC; - return true; -}) +RVOP( + fencei, + { + PC += 4; + /* FIXME: fill real implementations */ + rv->csr_cycle = cycle; + rv->PC = PC; + return true; + }, + X64({ + assert; /* FIXME: Implement */ + })) #endif #if RV32_HAS(Zicsr) /* RV32 Zicsr Standard Extension */ /* CSRRW: Atomic Read/Write CSR */ -RVOP(csrrw, { - uint32_t tmp = csr_csrrw(rv, ir->imm, rv->X[ir->rs1]); - rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; -}) +RVOP( + csrrw, + { + uint32_t tmp = csr_csrrw(rv, ir->imm, rv->X[ir->rs1]); + rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; + }, + X64({ + assert; /* FIXME: Implement */ + })) /* CSRRS: Atomic Read and Set Bits in CSR */ /* The initial value in integer register rs1 is treated as a bit mask that @@ -382,71 +879,130 @@ RVOP(csrrw, { * * See Page 56 of the RISC-V Unprivileged Specification. */ -RVOP(csrrs, { - uint32_t tmp = - csr_csrrs(rv, ir->imm, (ir->rs1 == rv_reg_zero) ? 0U : rv->X[ir->rs1]); - rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; -}) +RVOP( + csrrs, + { + uint32_t tmp = csr_csrrs( + rv, ir->imm, (ir->rs1 == rv_reg_zero) ? 0U : rv->X[ir->rs1]); + rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; + }, + X64({ + assert; /* FIXME: Implement */ + })) /* CSRRC: Atomic Read and Clear Bits in CSR */ -RVOP(csrrc, { - uint32_t tmp = - csr_csrrc(rv, ir->imm, (ir->rs1 == rv_reg_zero) ? ~0U : rv->X[ir->rs1]); - rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; -}) +RVOP( + csrrc, + { + uint32_t tmp = csr_csrrc( + rv, ir->imm, (ir->rs1 == rv_reg_zero) ? ~0U : rv->X[ir->rs1]); + rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; + }, + X64({ + assert; /* FIXME: Implement */ + })) /* CSRRWI */ -RVOP(csrrwi, { - uint32_t tmp = csr_csrrw(rv, ir->imm, ir->rs1); - rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; -}) +RVOP( + csrrwi, + { + uint32_t tmp = csr_csrrw(rv, ir->imm, ir->rs1); + rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; + }, + X64({ + assert; /* FIXME: Implement */ + })) /* CSRRSI */ -RVOP(csrrsi, { - uint32_t tmp = csr_csrrs(rv, ir->imm, ir->rs1); - rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; -}) +RVOP( + csrrsi, + { + uint32_t tmp = csr_csrrs(rv, ir->imm, ir->rs1); + rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; + }, + X64({ + assert; /* FIXME: Implement */ + })) /* CSRRCI */ -RVOP(csrrci, { - uint32_t tmp = csr_csrrc(rv, ir->imm, ir->rs1); - rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; -}) +RVOP( + csrrci, + { + uint32_t tmp = csr_csrrc(rv, ir->imm, ir->rs1); + rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; + }, + X64({ + assert; /* FIXME: Implement */ + })) #endif /* RV32M Standard Extension */ #if RV32_HAS(EXT_M) /* MUL: Multiply */ -RVOP(mul, - { rv->X[ir->rd] = (int32_t) rv->X[ir->rs1] * (int32_t) rv->X[ir->rs2]; }) +RVOP( + mul, + { rv->X[ir->rd] = (int32_t) rv->X[ir->rs1] * (int32_t) rv->X[ir->rs2]; }, + X64({ + ld, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + mul, 0x28, RBX, RAX, 0; + st, S32, RAX, X, rd; + })) /* MULH: Multiply High Signed Signed */ /* It is important to first cast rs1 and rs2 to i32 so that the subsequent * cast to i64 sign-extends the register values. 
*/ -RVOP(mulh, { - const int64_t a = (int32_t) rv->X[ir->rs1]; - const int64_t b = (int32_t) rv->X[ir->rs2]; - rv->X[ir->rd] = ((uint64_t) (a * b)) >> 32; -}) +RVOP( + mulh, + { + const int64_t multiplicand = (int32_t) rv->X[ir->rs1]; + const int64_t multiplier = (int32_t) rv->X[ir->rs2]; + rv->X[ir->rd] = ((uint64_t) (multiplicand * multiplier)) >> 32; + }, + X64({ + ld_sext, S32, RAX, X, rs1; + ld_sext, S32, RBX, X, rs2; + mul, 0x2f, RBX, RAX, 0; + alu64_imm, 8, 0xc1, 5, RAX, 32; + st, S32, RAX, X, rd; + })) /* MULHSU: Multiply High Signed Unsigned */ /* It is essential to perform an initial cast of rs1 to i32, ensuring that the * subsequent cast to i64 results in sign extension of the register value. * Additionally, rs2 should not undergo sign extension. */ -RVOP(mulhsu, { - const int64_t a = (int32_t) rv->X[ir->rs1]; - const uint64_t b = rv->X[ir->rs2]; - rv->X[ir->rd] = ((uint64_t) (a * b)) >> 32; -}) +RVOP( + mulhsu, + { + const int64_t multiplicand = (int32_t) rv->X[ir->rs1]; + const uint64_t umultiplier = rv->X[ir->rs2]; + rv->X[ir->rd] = ((uint64_t) (multiplicand * umultiplier)) >> 32; + }, + X64({ + ld_sext, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + mul, 0x2f, RBX, RAX, 0; + alu64_imm, 8, 0xc1, 5, RAX, 32; + st, S32, RAX, X, rd; + })) /* MULHU: Multiply High Unsigned Unsigned */ -RVOP(mulhu, { - rv->X[ir->rd] = - ((uint64_t) rv->X[ir->rs1] * (uint64_t) rv->X[ir->rs2]) >> 32; -}) +RVOP( + mulhu, + { + rv->X[ir->rd] = + ((uint64_t) rv->X[ir->rs1] * (uint64_t) rv->X[ir->rs2]) >> 32; + }, + X64({ + ld, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + mul, 0x2f, RBX, RAX, 0; + alu64_imm, 8, 0xc1, 5, RAX, 32; + st, S32, RAX, X, rd; + })) /* DIV: Divide Signed */ /* +------------------------+-----------+----------+-----------+ @@ -456,14 +1012,28 @@ RVOP(mulhu, { * | Overflow (signed only) | −2^{L−1} | −1 | −2^{L−1} | * +------------------------+-----------+----------+-----------+ */ -RVOP(div, { - const int32_t dividend = (int32_t) rv->X[ir->rs1]; - const int32_t divisor = (int32_t) rv->X[ir->rs2]; - rv->X[ir->rd] = !divisor ? ~0U - : (divisor == -1 && rv->X[ir->rs1] == 0x80000000U) - ? rv->X[ir->rs1] /* overflow */ - : (unsigned int) (dividend / divisor); -}) +RVOP( + div, + { + const int32_t dividend = (int32_t) rv->X[ir->rs1]; + const int32_t divisor = (int32_t) rv->X[ir->rs2]; + rv->X[ir->rd] = !divisor ? ~0U + : (divisor == -1 && rv->X[ir->rs1] == 0x80000000U) + ? rv->X[ir->rs1] /* overflow */ + : (unsigned int) (dividend / divisor); + }, + X64({ + ld, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + div, 0x38, RBX, RAX, 0; + cmp_imm, RBX, 0; + set_jmp_off; + jcc, 0x85; + ld_imm, RAX, -1; + jmp_off; + st, S32, RAX, X, rd; + /* FIXME: handle overflow */ + })) /* DIVU: Divide Unsigned */ /* +------------------------+-----------+----------+----------+ @@ -472,12 +1042,26 @@ RVOP(div, { * | Division by zero | x | 0 | 2^L − 1 | * +------------------------+-----------+----------+----------+ */ -RVOP(divu, { - const uint32_t dividend = rv->X[ir->rs1]; - const uint32_t divisor = rv->X[ir->rs2]; - rv->X[ir->rd] = !divisor ? ~0U : dividend / divisor; -}) +RVOP( + divu, + { + const uint32_t udividend = rv->X[ir->rs1]; + const uint32_t udivisor = rv->X[ir->rs2]; + rv->X[ir->rd] = !udivisor ? 
~0U : udividend / udivisor; + }, + X64({ + ld, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + div, 0x38, RBX, RAX, 0; + cmp_imm, RBX, 0; + set_jmp_off; + jcc, 0x85; + ld_imm, RAX, ~0U; + jmp_off; + st, S32, RAX, X, rd; + })) +/* clang-format off */ /* REM: Remainder Signed */ /* +------------------------+-----------+----------+---------+ * | Condition | Dividend | Divisor | REM[W] | @@ -491,9 +1075,16 @@ RVOP(rem, { const int32_t divisor = rv->X[ir->rs2]; rv->X[ir->rd] = !divisor ? dividend : (divisor == -1 && rv->X[ir->rs1] == 0x80000000U) - ? 0 /* overflow */ - : (dividend % divisor); -}) + ? 0 : (dividend + % divisor); +}, +X64({ + ld, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + mod, 0x98, RBX, RAX, 0; + st, S32, RAX, X, rd; + /* FIXME: handle overflow */ +})) /* REMU: Remainder Unsigned */ /* +------------------------+-----------+----------+----------+ @@ -503,10 +1094,18 @@ RVOP(rem, { * +------------------------+-----------+----------+----------+ */ RVOP(remu, { - const uint32_t dividend = rv->X[ir->rs1]; - const uint32_t divisor = rv->X[ir->rs2]; - rv->X[ir->rd] = !divisor ? dividend : dividend % divisor; -}) + const uint32_t udividend = rv->X[ir->rs1]; + const uint32_t udivisor = rv->X[ir->rs2]; + rv->X[ir->rd] = !udivisor ? udividend : udividend + % udivisor; +}, +X64({ + ld, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + mod, 0x98, RBX, RAX, 0; + st, S32, RAX, X, rd; +})) +/* clang-format on */ #endif /* RV32A Standard Extension */ @@ -534,196 +1133,317 @@ RVOP(remu, { */ /* LR.W: Load Reserved */ -RVOP(lrw, { - rv->X[ir->rd] = rv->io.mem_read_w(rv->X[ir->rs1]); - /* skip registration of the 'reservation set' - * FIXME: uimplemented - */ -}) +RVOP( + lrw, + { + rv->X[ir->rd] = rv->io.mem_read_w(rv->X[ir->rs1]); + /* skip registration of the 'reservation set' + * FIXME: uimplemented + */ + }, + X64({ + assert; /* FIXME: Implement */ + })) /* SC.W: Store Conditional */ -RVOP(scw, { - /* assume the 'reservation set' is valid - * FIXME: unimplemented - */ - rv->io.mem_write_w(rv->X[ir->rs1], rv->X[ir->rs2]); - rv->X[ir->rd] = 0; -}) +RVOP( + scw, + { + /* assume the 'reservation set' is valid + * FIXME: unimplemented + */ + rv->io.mem_write_w(rv->X[ir->rs1], rv->X[ir->rs2]); + rv->X[ir->rd] = 0; + }, + X64({ + assert; /* FIXME: Implement */ + })) /* AMOSWAP.W: Atomic Swap */ -RVOP(amoswapw, { - rv->X[ir->rd] = rv->io.mem_read_w(ir->rs1); - rv->io.mem_write_s(ir->rs1, rv->X[ir->rs2]); -}) +RVOP( + amoswapw, + { + rv->X[ir->rd] = rv->io.mem_read_w(ir->rs1); + rv->io.mem_write_s(ir->rs1, rv->X[ir->rs2]); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* AMOADD.W: Atomic ADD */ -RVOP(amoaddw, { - rv->X[ir->rd] = rv->io.mem_read_w(ir->rs1); - const int32_t res = (int32_t) rv->X[ir->rd] + (int32_t) rv->X[ir->rs2]; - rv->io.mem_write_s(ir->rs1, res); -}) +RVOP( + amoaddw, + { + rv->X[ir->rd] = rv->io.mem_read_w(ir->rs1); + const int32_t res = (int32_t) rv->X[ir->rd] + (int32_t) rv->X[ir->rs2]; + rv->io.mem_write_s(ir->rs1, res); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* AMOXOR.W: Atomic XOR */ -RVOP(amoxorw, { - rv->X[ir->rd] = rv->io.mem_read_w(ir->rs1); - const int32_t res = rv->X[ir->rd] ^ rv->X[ir->rs2]; - rv->io.mem_write_s(ir->rs1, res); -}) +RVOP( + amoxorw, + { + rv->X[ir->rd] = rv->io.mem_read_w(ir->rs1); + const int32_t res = rv->X[ir->rd] ^ rv->X[ir->rs2]; + rv->io.mem_write_s(ir->rs1, res); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* AMOAND.W: Atomic AND */ -RVOP(amoandw, { - rv->X[ir->rd] = rv->io.mem_read_w(ir->rs1); - const int32_t res = rv->X[ir->rd] & 
rv->X[ir->rs2]; - rv->io.mem_write_s(ir->rs1, res); -}) +RVOP( + amoandw, + { + rv->X[ir->rd] = rv->io.mem_read_w(ir->rs1); + const int32_t res = rv->X[ir->rd] & rv->X[ir->rs2]; + rv->io.mem_write_s(ir->rs1, res); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* AMOOR.W: Atomic OR */ -RVOP(amoorw, { - rv->X[ir->rd] = rv->io.mem_read_w(ir->rs1); - const int32_t res = rv->X[ir->rd] | rv->X[ir->rs2]; - rv->io.mem_write_s(ir->rs1, res); -}) +RVOP( + amoorw, + { + rv->X[ir->rd] = rv->io.mem_read_w(ir->rs1); + const int32_t res = rv->X[ir->rd] | rv->X[ir->rs2]; + rv->io.mem_write_s(ir->rs1, res); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* AMOMIN.W: Atomic MIN */ -RVOP(amominw, { - rv->X[ir->rd] = rv->io.mem_read_w(ir->rs1); - const int32_t a = rv->X[ir->rd]; - const int32_t b = rv->X[ir->rs2]; - const int32_t res = a < b ? a : b; - rv->io.mem_write_s(ir->rs1, res); -}) +RVOP( + amominw, + { + rv->X[ir->rd] = rv->io.mem_read_w(ir->rs1); + const int32_t res = + rv->X[ir->rd] < rv->X[ir->rs2] ? rv->X[ir->rd] : rv->X[ir->rs2]; + rv->io.mem_write_s(ir->rs1, res); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* AMOMAX.W: Atomic MAX */ -RVOP(amomaxw, { - rv->X[ir->rd] = rv->io.mem_read_w(ir->rs1); - const int32_t a = rv->X[ir->rd]; - const int32_t b = rv->X[ir->rs2]; - const int32_t res = a > b ? a : b; - rv->io.mem_write_s(ir->rs1, res); -}) +RVOP( + amomaxw, + { + rv->X[ir->rd] = rv->io.mem_read_w(ir->rs1); + const int32_t res = + rv->X[ir->rd] > rv->X[ir->rs2] ? rv->X[ir->rd] : rv->X[ir->rs2]; + rv->io.mem_write_s(ir->rs1, res); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* AMOMINU.W */ -RVOP(amominuw, { - rv->X[ir->rd] = rv->io.mem_read_w(ir->rs1); - const uint32_t a = rv->X[ir->rd]; - const uint32_t b = rv->X[ir->rs2]; - const uint32_t res = a < b ? a : b; - rv->io.mem_write_s(ir->rs1, res); -}) +RVOP( + amominuw, + { + rv->X[ir->rd] = rv->io.mem_read_w(ir->rs1); + const uint32_t ures = + rv->X[ir->rd] < rv->X[ir->rs2] ? rv->X[ir->rd] : rv->X[ir->rs2]; + rv->io.mem_write_s(ir->rs1, ures); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* AMOMAXU.W */ -RVOP(amomaxuw, { - rv->X[ir->rd] = rv->io.mem_read_w(ir->rs1); - const uint32_t a = rv->X[ir->rd]; - const uint32_t b = rv->X[ir->rs2]; - const uint32_t res = a > b ? a : b; - rv->io.mem_write_s(ir->rs1, res); -}) +RVOP( + amomaxuw, + { + rv->X[ir->rd] = rv->io.mem_read_w(ir->rs1); + const uint32_t ures = + rv->X[ir->rd] > rv->X[ir->rs2] ? 
rv->X[ir->rd] : rv->X[ir->rs2]; + rv->io.mem_write_s(ir->rs1, ures); + }, + X64({ + assert; /* FIXME: Implement */ + })) #endif /* RV32_HAS(EXT_A) */ /* RV32F Standard Extension */ #if RV32_HAS(EXT_F) /* FLW */ -RVOP(flw, { - /* copy into the float register */ - const uint32_t data = rv->io.mem_read_w(rv->X[ir->rs1] + ir->imm); - rv->F[ir->rd].v = data; -}) +RVOP( + flw, + { + /* copy into the float register */ + const uint32_t data = rv->io.mem_read_w(rv->X[ir->rs1] + ir->imm); + rv->F[ir->rd].v = data; + }, + X64({ + assert; /* FIXME: Implement */ + })) /* FSW */ -RVOP(fsw, { - /* copy from float registers */ - uint32_t data = rv->F[ir->rs2].v; - rv->io.mem_write_w(rv->X[ir->rs1] + ir->imm, data); -}) +RVOP( + fsw, + { + /* copy from float registers */ + uint32_t data = rv->F[ir->rs2].v; + rv->io.mem_write_w(rv->X[ir->rs1] + ir->imm, data); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* FMADD.S */ -RVOP(fmadds, { - set_rounding_mode(rv); - rv->F[ir->rd] = f32_mulAdd(rv->F[ir->rs1], rv->F[ir->rs2], rv->F[ir->rs3]); - set_fflag(rv); -}) +RVOP( + fmadds, + { + set_rounding_mode(rv); + rv->F[ir->rd] = + f32_mulAdd(rv->F[ir->rs1], rv->F[ir->rs2], rv->F[ir->rs3]); + set_fflag(rv); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* FMSUB.S */ -RVOP(fmsubs, { - set_rounding_mode(rv); - riscv_float_t tmp = rv->F[ir->rs3]; - tmp.v ^= FMASK_SIGN; - rv->F[ir->rd] = f32_mulAdd(rv->F[ir->rs1], rv->F[ir->rs2], tmp); - set_fflag(rv); -}) +RVOP( + fmsubs, + { + set_rounding_mode(rv); + riscv_float_t tmp = rv->F[ir->rs3]; + tmp.v ^= FMASK_SIGN; + rv->F[ir->rd] = f32_mulAdd(rv->F[ir->rs1], rv->F[ir->rs2], tmp); + set_fflag(rv); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* FNMSUB.S */ -RVOP(fnmsubs, { - set_rounding_mode(rv); - riscv_float_t tmp = rv->F[ir->rs1]; - tmp.v ^= FMASK_SIGN; - rv->F[ir->rd] = f32_mulAdd(tmp, rv->F[ir->rs2], rv->F[ir->rs3]); - set_fflag(rv); -}) +RVOP( + fnmsubs, + { + set_rounding_mode(rv); + riscv_float_t tmp = rv->F[ir->rs1]; + tmp.v ^= FMASK_SIGN; + rv->F[ir->rd] = f32_mulAdd(tmp, rv->F[ir->rs2], rv->F[ir->rs3]); + set_fflag(rv); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* FNMADD.S */ -RVOP(fnmadds, { - set_rounding_mode(rv); - riscv_float_t tmp1 = rv->F[ir->rs1]; - riscv_float_t tmp2 = rv->F[ir->rs3]; - tmp1.v ^= FMASK_SIGN; - tmp2.v ^= FMASK_SIGN; - rv->F[ir->rd] = f32_mulAdd(tmp1, rv->F[ir->rs2], tmp2); - set_fflag(rv); -}) +RVOP( + fnmadds, + { + set_rounding_mode(rv); + riscv_float_t tmp1 = rv->F[ir->rs1]; + riscv_float_t tmp2 = rv->F[ir->rs3]; + tmp1.v ^= FMASK_SIGN; + tmp2.v ^= FMASK_SIGN; + rv->F[ir->rd] = f32_mulAdd(tmp1, rv->F[ir->rs2], tmp2); + set_fflag(rv); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* FADD.S */ -RVOP(fadds, { - set_rounding_mode(rv); - rv->F[ir->rd] = f32_add(rv->F[ir->rs1], rv->F[ir->rs2]); - set_fflag(rv); -}) +RVOP( + fadds, + { + set_rounding_mode(rv); + rv->F[ir->rd] = f32_add(rv->F[ir->rs1], rv->F[ir->rs2]); + set_fflag(rv); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* FSUB.S */ -RVOP(fsubs, { - set_rounding_mode(rv); - rv->F[ir->rd] = f32_sub(rv->F[ir->rs1], rv->F[ir->rs2]); - set_fflag(rv); -}) +RVOP( + fsubs, + { + set_rounding_mode(rv); + rv->F[ir->rd] = f32_sub(rv->F[ir->rs1], rv->F[ir->rs2]); + set_fflag(rv); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* FMUL.S */ -RVOP(fmuls, { - set_rounding_mode(rv); - rv->F[ir->rd] = f32_mul(rv->F[ir->rs1], rv->F[ir->rs2]); - set_fflag(rv); -}) +RVOP( + fmuls, + { + set_rounding_mode(rv); + rv->F[ir->rd] = f32_mul(rv->F[ir->rs1], 
rv->F[ir->rs2]); + set_fflag(rv); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* FDIV.S */ -RVOP(fdivs, { - set_rounding_mode(rv); - rv->F[ir->rd] = f32_div(rv->F[ir->rs1], rv->F[ir->rs2]); - set_fflag(rv); -}) +RVOP( + fdivs, + { + set_rounding_mode(rv); + rv->F[ir->rd] = f32_div(rv->F[ir->rs1], rv->F[ir->rs2]); + set_fflag(rv); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* FSQRT.S */ -RVOP(fsqrts, { - set_rounding_mode(rv); - rv->F[ir->rd] = f32_sqrt(rv->F[ir->rs1]); - set_fflag(rv); -}) +RVOP( + fsqrts, + { + set_rounding_mode(rv); + rv->F[ir->rd] = f32_sqrt(rv->F[ir->rs1]); + set_fflag(rv); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* FSGNJ.S */ -RVOP(fsgnjs, { - rv->F[ir->rd].v = - (rv->F[ir->rs1].v & ~FMASK_SIGN) | (rv->F[ir->rs2].v & FMASK_SIGN); -}) +RVOP( + fsgnjs, + { + rv->F[ir->rd].v = + (rv->F[ir->rs1].v & ~FMASK_SIGN) | (rv->F[ir->rs2].v & FMASK_SIGN); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* FSGNJN.S */ -RVOP(fsgnjns, { - rv->F[ir->rd].v = - (rv->F[ir->rs1].v & ~FMASK_SIGN) | (~rv->F[ir->rs2].v & FMASK_SIGN); -}) +RVOP( + fsgnjns, + { + rv->F[ir->rd].v = + (rv->F[ir->rs1].v & ~FMASK_SIGN) | (~rv->F[ir->rs2].v & FMASK_SIGN); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* FSGNJX.S */ -RVOP(fsgnjxs, - { rv->F[ir->rd].v = rv->F[ir->rs1].v ^ (rv->F[ir->rs2].v & FMASK_SIGN); }) +RVOP( + fsgnjxs, + { rv->F[ir->rd].v = rv->F[ir->rs1].v ^ (rv->F[ir->rs2].v & FMASK_SIGN); }, + X64({ + assert; /* FIXME: Implement */ + })) /* FMIN.S * In IEEE754-201x, fmin(x, y) return @@ -732,119 +1452,183 @@ RVOP(fsgnjxs, * - if both are NaN, return NaN * When input is signaling NaN, raise invalid operation */ -RVOP(fmins, { - if (f32_isSignalingNaN(rv->F[ir->rs1]) || - f32_isSignalingNaN(rv->F[ir->rs2])) - rv->csr_fcsr |= FFLAG_INVALID_OP; - bool less = f32_lt_quiet(rv->F[ir->rs1], rv->F[ir->rs2]) || - (f32_eq(rv->F[ir->rs1], rv->F[ir->rs2]) && - (rv->F[ir->rs1].v & FMASK_SIGN)); - if (is_nan(rv->F[ir->rs1].v) && is_nan(rv->F[ir->rs2].v)) - rv->F[ir->rd].v = RV_NAN; - else - rv->F[ir->rd] = (less || is_nan(rv->F[ir->rs2].v) ? rv->F[ir->rs1] - : rv->F[ir->rs2]); -}) +RVOP( + fmins, + { + if (f32_isSignalingNaN(rv->F[ir->rs1]) || + f32_isSignalingNaN(rv->F[ir->rs2])) + rv->csr_fcsr |= FFLAG_INVALID_OP; + bool less = f32_lt_quiet(rv->F[ir->rs1], rv->F[ir->rs2]) || + (f32_eq(rv->F[ir->rs1], rv->F[ir->rs2]) && + (rv->F[ir->rs1].v & FMASK_SIGN)); + if (is_nan(rv->F[ir->rs1].v) && is_nan(rv->F[ir->rs2].v)) + rv->F[ir->rd].v = RV_NAN; + else + rv->F[ir->rd] = (less || is_nan(rv->F[ir->rs2].v) ? rv->F[ir->rs1] + : rv->F[ir->rs2]); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* FMAX.S */ -RVOP(fmaxs, { - if (f32_isSignalingNaN(rv->F[ir->rs1]) || - f32_isSignalingNaN(rv->F[ir->rs2])) - rv->csr_fcsr |= FFLAG_INVALID_OP; - bool greater = f32_lt_quiet(rv->F[ir->rs2], rv->F[ir->rs1]) || - (f32_eq(rv->F[ir->rs1], rv->F[ir->rs2]) && - (rv->F[ir->rs2].v & FMASK_SIGN)); - if (is_nan(rv->F[ir->rs1].v) && is_nan(rv->F[ir->rs2].v)) - rv->F[ir->rd].v = RV_NAN; - else - rv->F[ir->rd] = (greater || is_nan(rv->F[ir->rs2].v) ? 
rv->F[ir->rs1] - : rv->F[ir->rs2]); -}) +RVOP( + fmaxs, + { + if (f32_isSignalingNaN(rv->F[ir->rs1]) || + f32_isSignalingNaN(rv->F[ir->rs2])) + rv->csr_fcsr |= FFLAG_INVALID_OP; + bool greater = f32_lt_quiet(rv->F[ir->rs2], rv->F[ir->rs1]) || + (f32_eq(rv->F[ir->rs1], rv->F[ir->rs2]) && + (rv->F[ir->rs2].v & FMASK_SIGN)); + if (is_nan(rv->F[ir->rs1].v) && is_nan(rv->F[ir->rs2].v)) + rv->F[ir->rd].v = RV_NAN; + else + rv->F[ir->rd] = + (greater || is_nan(rv->F[ir->rs2].v) ? rv->F[ir->rs1] + : rv->F[ir->rs2]); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* FCVT.W.S and FCVT.WU.S convert a floating point number to an integer, * the rounding mode is specified in rm field. */ /* FCVT.W.S */ -RVOP(fcvtws, { - set_rounding_mode(rv); - uint32_t ret = f32_to_i32(rv->F[ir->rs1], softfloat_roundingMode, true); - if (ir->rd) - rv->X[ir->rd] = ret; - set_fflag(rv); -}) +RVOP( + fcvtws, + { + set_rounding_mode(rv); + uint32_t ret = f32_to_i32(rv->F[ir->rs1], softfloat_roundingMode, true); + if (ir->rd) + rv->X[ir->rd] = ret; + set_fflag(rv); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* FCVT.WU.S */ -RVOP(fcvtwus, { - set_rounding_mode(rv); - uint32_t ret = f32_to_ui32(rv->F[ir->rs1], softfloat_roundingMode, true); - if (ir->rd) - rv->X[ir->rd] = ret; - set_fflag(rv); -}) +RVOP( + fcvtwus, + { + set_rounding_mode(rv); + uint32_t ret = + f32_to_ui32(rv->F[ir->rs1], softfloat_roundingMode, true); + if (ir->rd) + rv->X[ir->rd] = ret; + set_fflag(rv); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* FMV.X.W */ -RVOP(fmvxw, { - if (ir->rd) - rv->X[ir->rd] = rv->F[ir->rs1].v; -}) +RVOP( + fmvxw, + { + if (ir->rd) + rv->X[ir->rd] = rv->F[ir->rs1].v; + }, + X64({ + assert; /* FIXME: Implement */ + })) /* FEQ.S performs a quiet comparison: it only sets the invalid operation * exception flag if either input is a signaling NaN. */ -RVOP(feqs, { - set_rounding_mode(rv); - uint32_t ret = f32_eq(rv->F[ir->rs1], rv->F[ir->rs2]); - if (ir->rd) - rv->X[ir->rd] = ret; - set_fflag(rv); -}) +RVOP( + feqs, + { + set_rounding_mode(rv); + uint32_t ret = f32_eq(rv->F[ir->rs1], rv->F[ir->rs2]); + if (ir->rd) + rv->X[ir->rd] = ret; + set_fflag(rv); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* FLT.S and FLE.S perform what the IEEE 754-2008 standard refers to as * signaling comparisons: that is, they set the invalid operation exception * flag if either input is NaN. 
*/ -RVOP(flts, { - set_rounding_mode(rv); - uint32_t ret = f32_lt(rv->F[ir->rs1], rv->F[ir->rs2]); - if (ir->rd) - rv->X[ir->rd] = ret; - set_fflag(rv); -}) - -RVOP(fles, { - set_rounding_mode(rv); - uint32_t ret = f32_le(rv->F[ir->rs1], rv->F[ir->rs2]); - if (ir->rd) - rv->X[ir->rd] = ret; - set_fflag(rv); -}) +RVOP( + flts, + { + set_rounding_mode(rv); + uint32_t ret = f32_lt(rv->F[ir->rs1], rv->F[ir->rs2]); + if (ir->rd) + rv->X[ir->rd] = ret; + set_fflag(rv); + }, + X64({ + assert; /* FIXME: Implement */ + })) + +RVOP( + fles, + { + set_rounding_mode(rv); + uint32_t ret = f32_le(rv->F[ir->rs1], rv->F[ir->rs2]); + if (ir->rd) + rv->X[ir->rd] = ret; + set_fflag(rv); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* FCLASS.S */ -RVOP(fclasss, { - if (ir->rd) - rv->X[ir->rd] = calc_fclass(rv->F[ir->rs1].v); -}) +RVOP( + fclasss, + { + if (ir->rd) + rv->X[ir->rd] = calc_fclass(rv->F[ir->rs1].v); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* FCVT.S.W */ -RVOP(fcvtsw, { - set_rounding_mode(rv); - rv->F[ir->rd] = i32_to_f32(rv->X[ir->rs1]); - set_fflag(rv); -}) +RVOP( + fcvtsw, + { + set_rounding_mode(rv); + rv->F[ir->rd] = i32_to_f32(rv->X[ir->rs1]); + set_fflag(rv); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* FCVT.S.WU */ -RVOP(fcvtswu, { - set_rounding_mode(rv); - rv->F[ir->rd] = ui32_to_f32(rv->X[ir->rs1]); - set_fflag(rv); -}) +RVOP( + fcvtswu, + { + set_rounding_mode(rv); + rv->F[ir->rd] = ui32_to_f32(rv->X[ir->rs1]); + set_fflag(rv); + }, + X64({ + assert; /* FIXME: Implement */ + })) /* FMV.W.X */ -RVOP(fmvwx, { rv->F[ir->rd].v = rv->X[ir->rs1]; }) +RVOP( + fmvwx, + { rv->F[ir->rd].v = rv->X[ir->rs1]; }, + X64({ + assert; /* FIXME: Implement */ + })) #endif -/* RV32C Standard Extension */ + /* RV32C Standard Extension */ #if RV32_HAS(EXT_C) /* C.ADDI4SPN is a CIW-format instruction that adds a zero-extended non-zero @@ -853,31 +1637,56 @@ RVOP(fmvwx, { rv->F[ir->rd].v = rv->X[ir->rs1]; }) * This instruction is used to generate pointers to stack-allocated variables, * and expands to addi rd', x2, nzuimm[9:2]. */ -RVOP(caddi4spn, { rv->X[ir->rd] = rv->X[rv_reg_sp] + (uint16_t) ir->imm; }) +RVOP(caddi4spn, + { + rv->X[ir->rd] = rv->X[rv_reg_sp] + (uint16_t) ir->imm; }, + X64({ + ld, S32, RAX, X, rv_reg_sp; + alu32_imm, 32, 0x81, 0, RAX, uint, 16, imm; + st, S32, RAX, X, rd; + })) /* C.LW loads a 32-bit value from memory into register rd'. It computes an * effective address by adding the zero-extended offset, scaled by 4, to the * base address in register rs1'. It expands to lw rd', offset[6:2](rs1'). */ -RVOP(clw, { - const uint32_t addr = rv->X[ir->rs1] + (uint32_t) ir->imm; - RV_EXC_MISALIGN_HANDLER(3, load, true, 1); - rv->X[ir->rd] = rv->io.mem_read_w(addr); -}) +RVOP(clw, + { + const uint32_t addr = rv->X[ir->rs1] + (uint32_t) ir->imm; + RV_EXC_MISALIGN_HANDLER(3, load, true, 1); + rv->X[ir->rd] = rv->io.mem_read_w(addr); + }, + X64({ + mem; + ld, S32, RAX, X, rs1; + ld_imm, RBX, mem; + alu64, 0x01, RBX, RAX; + ld, S32, RAX, RBX, 0; + st, S32, RBX, X, rd; + })) /* C.SW stores a 32-bit value in register rs2' to memory. It computes an * effective address by adding the zero-extended offset, scaled by 4, to the * base address in register rs1'. * It expands to sw rs2', offset[6:2](rs1'). 
*/ -RVOP(csw, { - const uint32_t addr = rv->X[ir->rs1] + (uint32_t) ir->imm; - RV_EXC_MISALIGN_HANDLER(3, store, true, 1); - rv->io.mem_write_w(addr, rv->X[ir->rs2]); -}) +RVOP(csw, + { + const uint32_t addr = rv->X[ir->rs1] + (uint32_t) ir->imm; + RV_EXC_MISALIGN_HANDLER(3, store, true, 1); + rv->io.mem_write_w(addr, rv->X[ir->rs2]); + }, + X64({ + mem; + ld, S32, RAX, X, rs1; + ld_imm, RBX, mem; + alu64, 0x01, RBX, RAX; + ld, S32, RBX, X, rs2; + st, S32, RBX, RAX, 0; + })) /* C.NOP */ -RVOP(cnop, {/* no operation */}) +RVOP(cnop, {/* no operation */}, X64({/* no operation */})) /* C.ADDI adds the non-zero sign-extended 6-bit immediate to the value in * register rd then writes the result to rd. C.ADDI expands into @@ -885,33 +1694,66 @@ RVOP(cnop, {/* no operation */}) * with both rd=x0 and nzimm=0 encodes the C.NOP instruction; the remaining * code points with either rd=x0 or nzimm=0 encode HINTs. */ -RVOP(caddi, { rv->X[ir->rd] += (int16_t) ir->imm; }) +RVOP(caddi, { + rv->X[ir->rd] += (int16_t) ir->imm; }, X64({ + ld, S32, RAX, X, rd; + alu32_imm, 32, 0x81, 0, RAX, int, 16, imm; + st, S32, RAX, X, rd; + })) /* C.JAL */ -RVOP(cjal, { - rv->X[rv_reg_ra] = PC + 2; - PC += ir->imm; - RV_EXC_MISALIGN_HANDLER(PC, insn, true, 0); - struct rv_insn *taken = ir->branch_taken; - if (taken) - MUST_TAIL return taken->impl(rv, taken, cycle, PC); - rv->csr_cycle = cycle; - rv->PC = PC; - return true; -}) +RVOP(cjal, + { + rv->X[rv_reg_ra] = PC + 2; + PC += ir->imm; + RV_EXC_MISALIGN_HANDLER(PC, insn, true, 0); + struct rv_insn *taken = ir->branch_taken; + if (taken) { +#if RV32_HAS(JIT) + if (!cache_get(rv->block_cache, PC)) { + clear_flag = true; + goto end_insn; + } + if (cache_hot(rv->block_cache, PC)) + goto end_insn; +#endif + MUST_TAIL return taken->impl(rv, taken, cycle, PC); + } + end_insn: + rv->csr_cycle = cycle; + rv->PC = PC; + return true; + }, + X64({ + ld_imm, RAX, pc, 2; + st, S32, RAX, X, rv_reg_ra; + ld_imm, RAX, pc, imm; + st, S32, RAX, PC; + jmp, pc, imm; + exit; + })) /* C.LI loads the sign-extended 6-bit immediate, imm, into register rd. * C.LI expands into addi rd, x0, imm[5:0]. * C.LI is only valid when rd=x0; the code points with rd=x0 encode HINTs. */ -RVOP(cli, { rv->X[ir->rd] = ir->imm; }) +RVOP(cli, { + rv->X[ir->rd] = ir->imm; }, X64({ + ld_imm, RAX, imm; + st, S32, RAX, X, rd; + })) /* C.ADDI16SP is used to adjust the stack pointer in procedure prologues * and epilogues. It expands into addi x2, x2, nzimm[9:4]. * C.ADDI16SP is only valid when nzimm'=0; the code point with nzimm=0 is * reserved. */ -RVOP(caddi16sp, { rv->X[ir->rd] += ir->imm; }) +RVOP(caddi16sp, { + rv->X[ir->rd] += ir->imm; }, X64({ + ld, S32, RAX, X, rd; + alu32_imm, 32, 0x81, 0, RAX, imm; + st, S32, RAX, X, rd; + })) /* C.LUI loads the non-zero 6-bit immediate field into bits 17–12 of the * destination register, clears the bottom 12 bits, and sign-extends bit @@ -920,153 +1762,326 @@ RVOP(caddi16sp, { rv->X[ir->rd] += ir->imm; }) * C.LUI is only valid when rd'={x0, x2}, and when the immediate is not equal * to zero. */ -RVOP(clui, { rv->X[ir->rd] = ir->imm; }) +RVOP(clui, { + rv->X[ir->rd] = ir->imm; }, X64({ + ld_imm, RAX, imm; + st, S32, RAX, X, rd; + })) /* C.SRLI is a CB-format instruction that performs a logical right shift * of the value in register rd' then writes the result to rd'. The shift * amount is encoded in the shamt field. C.SRLI expands into srli rd', * rd', shamt[5:0]. 
*/ -RVOP(csrli, { rv->X[ir->rs1] >>= ir->shamt; }) +RVOP(csrli, { + rv->X[ir->rs1] >>= ir->shamt; }, X64({ + ld, S32, RAX, X, rs1; + alu32_imm, 8, 0xc1, 5, RAX, shamt; + st, S32, RAX, X, rs1; + })) /* C.SRAI is defined analogously to C.SRLI, but instead performs an * arithmetic right shift. C.SRAI expands to srai rd', rd', shamt[5:0]. */ -RVOP(csrai, { - const uint32_t mask = 0x80000000 & rv->X[ir->rs1]; - rv->X[ir->rs1] >>= ir->shamt; - for (unsigned int i = 0; i < ir->shamt; ++i) - rv->X[ir->rs1] |= mask >> i; -}) +RVOP(csrai, + { + const uint32_t mask = 0x80000000 & rv->X[ir->rs1]; + rv->X[ir->rs1] >>= ir->shamt; + for (unsigned int i = 0; i < ir->shamt; ++i) + rv->X[ir->rs1] |= mask >> i; + }, + X64({ + ld, S32, RAX, X, rs1; + alu32_imm, 8, 0xc1, 7, RAX, shamt; + st, S32, RAX, X, rs1; + /* FIXME: Incomplete */ + })) /* C.ANDI is a CB-format instruction that computes the bitwise AND of the * value in register rd' and the sign-extended 6-bit immediate, then writes * the result to rd'. C.ANDI expands to andi rd', rd', imm[5:0]. */ -RVOP(candi, { rv->X[ir->rs1] &= ir->imm; }) +RVOP(candi, { + rv->X[ir->rs1] &= ir->imm; }, X64({ + ld, S32, RAX, X, rs1; + alu32_imm, 32, 0x81, 4, RAX, imm; + st, S32, RAX, X, rs1; + })) /* C.SUB */ -RVOP(csub, { rv->X[ir->rd] = rv->X[ir->rs1] - rv->X[ir->rs2]; }) +RVOP(csub, { + rv->X[ir->rd] = rv->X[ir->rs1] - rv->X[ir->rs2]; }, X64({ + ld, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + alu32, 0x29, RBX, RAX; + st, S32, RAX, X, rd; + })) /* C.XOR */ -RVOP(cxor, { rv->X[ir->rd] = rv->X[ir->rs1] ^ rv->X[ir->rs2]; }) - -RVOP(cor, { rv->X[ir->rd] = rv->X[ir->rs1] | rv->X[ir->rs2]; }) - -RVOP(cand, { rv->X[ir->rd] = rv->X[ir->rs1] & rv->X[ir->rs2]; }) +RVOP(cxor, { + rv->X[ir->rd] = rv->X[ir->rs1] ^ rv->X[ir->rs2]; }, X64({ + ld, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + alu32, 0x31, RBX, RAX; + st, S32, RAX, X, rd; + })) + +RVOP(cor, { + rv->X[ir->rd] = rv->X[ir->rs1] | rv->X[ir->rs2]; }, X64({ + ld, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + alu32, 0x09, RBX, RAX; + st, S32, RAX, X, rd; + })) + +RVOP(cand, { + rv->X[ir->rd] = rv->X[ir->rs1] & rv->X[ir->rs2]; }, X64({ + ld, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + alu32, 0x21, RBX, RAX; + st, S32, RAX, X, rd; + })) /* C.J performs an unconditional control transfer. The offset is sign-extended * and added to the pc to form the jump target address. * C.J can therefore target a ±2 KiB range. * C.J expands to jal x0, offset[11:1]. */ -RVOP(cj, { - PC += ir->imm; - RV_EXC_MISALIGN_HANDLER(PC, insn, true, 0); - struct rv_insn *taken = ir->branch_taken; - if (taken) - MUST_TAIL return taken->impl(rv, taken, cycle, PC); - rv->csr_cycle = cycle; - rv->PC = PC; - return true; -}) +RVOP(cj, + { + PC += ir->imm; + RV_EXC_MISALIGN_HANDLER(PC, insn, true, 0); + struct rv_insn *taken = ir->branch_taken; + if (taken) { +#if RV32_HAS(JIT) + if (!cache_get(rv->block_cache, PC)) { + clear_flag = true; + goto end_insn; + } + if (cache_hot(rv->block_cache, PC)) + goto end_insn; +#endif + MUST_TAIL return taken->impl(rv, taken, cycle, PC); + } + end_insn: + rv->csr_cycle = cycle; + rv->PC = PC; + return true; + }, + X64({ + ld_imm, RAX, pc, imm; + st, S32, RAX, PC; + jmp, pc, imm; + exit; + })) /* C.BEQZ performs conditional control transfers. The offset is sign-extended * and added to the pc to form the branch target address. * It can therefore target a ±256 B range. C.BEQZ takes the branch if the * value in register rs1' is zero. It expands to beq rs1', x0, offset[8:1]. 
*/ -RVOP(cbeqz, { - if (rv->X[ir->rs1]) { - is_branch_taken = false; - struct rv_insn *untaken = ir->branch_untaken; - if (!untaken) - goto nextop; - PC += 2; - last_pc = PC; - MUST_TAIL return untaken->impl(rv, untaken, cycle, PC); - } - is_branch_taken = true; - PC += (uint32_t) ir->imm; - struct rv_insn *taken = ir->branch_taken; - if (taken) { - last_pc = PC; - MUST_TAIL return taken->impl(rv, taken, cycle, PC); - } - rv->csr_cycle = cycle; - rv->PC = PC; - return true; -}) +RVOP(cbeqz, + { + if (rv->X[ir->rs1]) { + is_branch_taken = false; + struct rv_insn *untaken = ir->branch_untaken; + if (!untaken) + goto nextop; +#if RV32_HAS(JIT) + if (!cache_get(rv->block_cache, PC + 2)) { + clear_flag = true; + goto nextop; + } +#endif + PC += 2; + last_pc = PC; + MUST_TAIL return untaken->impl(rv, untaken, cycle, PC); + } + is_branch_taken = true; + PC += ir->imm; + struct rv_insn *taken = ir->branch_taken; + if (taken) { +#if RV32_HAS(JIT) + if (!cache_get(rv->block_cache, PC)) { + clear_flag = true; + goto end_insn; + } +#endif + last_pc = PC; + MUST_TAIL return taken->impl(rv, taken, cycle, PC); + } + end_insn: + rv->csr_cycle = cycle; + rv->PC = PC; + return true; + }, + X64({ + ld, S32, RAX, X, rs1; + cmp_imm, RAX, 0; + set_jmp_off; + jcc, 0x84; + cond, branch_untaken; + jmp, pc, 2; + end; + ld_imm, RAX, pc, 2; + st, S32, RAX, PC; + exit; + jmp_off; + cond, branch_taken; + jmp, pc, imm; + end; + ld_imm, RAX, pc, imm; + st, S32, RAX, PC; + exit; + })) /* C.BEQZ */ -RVOP(cbnez, { - if (!rv->X[ir->rs1]) { - is_branch_taken = false; - struct rv_insn *untaken = ir->branch_untaken; - if (!untaken) - goto nextop; - PC += 2; - last_pc = PC; - MUST_TAIL return untaken->impl(rv, untaken, cycle, PC); - } - is_branch_taken = true; - PC += (uint32_t) ir->imm; - struct rv_insn *taken = ir->branch_taken; - if (taken) { - last_pc = PC; - MUST_TAIL return taken->impl(rv, taken, cycle, PC); - } - rv->csr_cycle = cycle; - rv->PC = PC; - return true; -}) +RVOP(cbnez, + { + if (!rv->X[ir->rs1]) { + is_branch_taken = false; + struct rv_insn *untaken = ir->branch_untaken; + if (!untaken) + goto nextop; +#if RV32_HAS(JIT) + if (!cache_get(rv->block_cache, PC + 2)) { + clear_flag = true; + goto nextop; + } +#endif + PC += 2; + last_pc = PC; + MUST_TAIL return untaken->impl(rv, untaken, cycle, PC); + } + is_branch_taken = true; + PC += ir->imm; + struct rv_insn *taken = ir->branch_taken; + if (taken) { +#if RV32_HAS(JIT) + if (!cache_get(rv->block_cache, PC)) { + clear_flag = true; + goto end_insn; + } +#endif + last_pc = PC; + MUST_TAIL return taken->impl(rv, taken, cycle, PC); + } + end_insn: + rv->csr_cycle = cycle; + rv->PC = PC; + return true; + }, + X64({ + ld, S32, RAX, X, rs1; + cmp_imm, RAX, 0; + set_jmp_off; + jcc, 0x85; + cond, branch_untaken; + jmp, pc, 2; + end; + ld_imm, RAX, pc, 2; + st, S32, RAX, PC; + exit; + jmp_off; + cond, branch_taken; + jmp, pc, imm; + end; + ld_imm, RAX, pc, imm; + st, S32, RAX, PC; + exit; + })) /* C.SLLI is a CI-format instruction that performs a logical left shift of * the value in register rd then writes the result to rd. The shift amount * is encoded in the shamt field. C.SLLI expands into slli rd, rd, shamt[5:0]. 
*/ -RVOP(cslli, { rv->X[ir->rd] <<= (uint8_t) ir->imm; }) +RVOP(cslli, { + rv->X[ir->rd] <<= (uint8_t) ir->imm; }, X64({ + ld, S32, RAX, X, rd; + alu32_imm, 8, 0xc1, 4, RAX, uint, 8, imm; + st, S32, RAX, X, rd; + })) /* C.LWSP */ -RVOP(clwsp, { - const uint32_t addr = rv->X[rv_reg_sp] + ir->imm; - RV_EXC_MISALIGN_HANDLER(3, load, true, 1); - rv->X[ir->rd] = rv->io.mem_read_w(addr); -}) +RVOP(clwsp, + { + const uint32_t addr = rv->X[rv_reg_sp] + ir->imm; + RV_EXC_MISALIGN_HANDLER(3, load, true, 1); + rv->X[ir->rd] = rv->io.mem_read_w(addr); + }, + X64({ + mem; + ld, S32, RAX, X, rv_reg_sp; + ld_imm, RBX, mem; + alu64, 0x01, RBX, RAX; + ld, S32, RAX, RBX, 0; + st, S32, RBX, X, rd; + })) /* C.JR */ -RVOP(cjr, { - PC = rv->X[ir->rs1]; - LOOKUP_OR_UPDATE_BRANCH_HISTORY_TABLE(); - rv->csr_cycle = cycle; - rv->PC = PC; - return true; -}) +RVOP(cjr, + { + PC = rv->X[ir->rs1]; +#if !RV32_HAS(JIT) + LOOKUP_OR_UPDATE_BRANCH_HISTORY_TABLE(); +#endif + rv->csr_cycle = cycle; + rv->PC = PC; + return true; + }, + X64({ + ld, S32, RAX, X, rs1; + st, S32, RAX, PC; + exit; + })) /* C.MV */ -RVOP(cmv, { rv->X[ir->rd] = rv->X[ir->rs2]; }) +RVOP(cmv, { + rv->X[ir->rd] = rv->X[ir->rs2]; }, X64({ + ld, S32, RAX, X, rs2; + st, S32, RAX, X, rd; + })) /* C.EBREAK */ -RVOP(cebreak, { - rv->compressed = true; - rv->csr_cycle = cycle; - rv->PC = PC; - rv->io.on_ebreak(rv); - return true; -}) +RVOP(cebreak, + { + rv->compressed = true; + rv->csr_cycle = cycle; + rv->PC = PC; + rv->io.on_ebreak(rv); + return true; + }, + X64({ + ld_imm, RAX, pc; + st, S32, RAX, PC; + ld_imm, RAX, 1; + st, S32, RAX, compressed; + call, ebreak; + exit; + })) /* C.JALR */ -RVOP(cjalr, { - /* Unconditional jump and store PC+2 to ra */ - const int32_t jump_to = rv->X[ir->rs1]; - rv->X[rv_reg_ra] = PC + 2; - PC = jump_to; - RV_EXC_MISALIGN_HANDLER(PC, insn, true, 0); - LOOKUP_OR_UPDATE_BRANCH_HISTORY_TABLE(); - rv->csr_cycle = cycle; - rv->PC = PC; - return true; -}) +RVOP(cjalr, + { + /* Unconditional jump and store PC+2 to ra */ + const int32_t jump_to = rv->X[ir->rs1]; + rv->X[rv_reg_ra] = PC + 2; + PC = jump_to; + RV_EXC_MISALIGN_HANDLER(PC, insn, true, 0); +#if !RV32_HAS(JIT) + LOOKUP_OR_UPDATE_BRANCH_HISTORY_TABLE(); +#endif + rv->csr_cycle = cycle; + rv->PC = PC; + return true; + }, + X64({ + ld_imm, RAX, pc, 2; + st, S32, RAX, X, rv_reg_ra; + ld, S32, RAX, X, rs1; + st, S32, RAX, PC; + exit; + })) /* C.ADD adds the values in registers rd and rs2 and writes the result to * register rd. @@ -1075,12 +2090,27 @@ RVOP(cjalr, { * the C.JALR and C.EBREAK instructions. The code points with rs2=x0 and rd=x0 * are HINTs. 
*/ -RVOP(cadd, { rv->X[ir->rd] = rv->X[ir->rs1] + rv->X[ir->rs2]; }) +RVOP(cadd, { + rv->X[ir->rd] = rv->X[ir->rs1] + rv->X[ir->rs2]; }, X64({ + ld, S32, RAX, X, rs1; + ld, S32, RBX, X, rs2; + alu32, 0x01, RBX, RAX; + st, S32, RAX, X, rd; + })) /* C.SWSP */ -RVOP(cswsp, { - const uint32_t addr = rv->X[rv_reg_sp] + ir->imm; - RV_EXC_MISALIGN_HANDLER(3, store, true, 1); - rv->io.mem_write_w(addr, rv->X[ir->rs2]); -}) +RVOP(cswsp, + { + const uint32_t addr = rv->X[rv_reg_sp] + ir->imm; + RV_EXC_MISALIGN_HANDLER(3, store, true, 1); + rv->io.mem_write_w(addr, rv->X[ir->rs2]); + }, + X64({ + mem; + ld, S32, RAX, X, rv_reg_sp; + ld_imm, RBX, mem; + alu64, 0x01, RBX, RAX; + ld, S32, RBX, X, rs2; + st, S32, RBX, RAX, 0; + })) #endif diff --git a/tools/gen-jit-template.py b/tools/gen-jit-template.py new file mode 100755 index 00000000..a35231b5 --- /dev/null +++ b/tools/gen-jit-template.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 + +''' +This script serves as a code generator for creating JIT code templates +based on existing code files in the 'src' directory, eliminating the need +for writing duplicated code. +''' + +import re +import sys + +INSN = { + "Zifencei": ["fencei"], + "Zicsr": [ + "csrrw", + "csrrs", + "csrrc", + "csrrw", + "csrrsi", + "csrrci"], + "EXT_M": [ + "mul", + "mulh", + "mulhsu", + "mulhu", + "div", + "divu", + "rem", + "remu"], + "EXT_A": [ + "lrw", + "scw", + "amoswapw", + "amoaddw", + "amoxorw", + "amoandw", + "amoorw", + "amominw", + "amomaxw", + "amominuw", + "amomaxuw"], + "EXT_F": [ + "flw", + "fsw", + "fmadds", + "fmsubs", + "fnmsubs", + "fnmadds", + "fadds", + "fsubs", + "fmuls", + "fdivs", + "fsqrts", + "fsgnjs", + "fsgnjns", + "fsgnjxs", + "fmins", + "fmaxs", + "fcvtws", + "fcvtwus", + "fmvxw", + "feqs", + "flts", + "fles", + "fclasss", + "fcvtsw", + "fcvtswu", + "fmvwx"], + "EXT_C": [ + "caddi4spn", + "clw", + "csw", + "cnop", + "caddi", + "cjal", + "cli", + "caddi16sp", + "clui", + "csrli", + "csrai", + "candi", + "csub", + "cxor", + "cor", + "cand", + "cj", + "cbeqz", + "cbnez", + "cslli", + "clwsp", + "cjr", + "cmv", + "cebreak", + "cjalr", + "cadd", + "cswsp", + ], +} +EXT_LIST = ["Zifencei", "Zicsr", "EXT_M", "EXT_A", "EXT_F", "EXT_C"] +SKIP_LIST = [] +# check enabled extension in Makefile + + +def parse_argv(EXT_LIST, SKIP_LIST): + for argv in sys.argv: + if argv.find("RV32_FEATURE_") != -1: + ext = argv[argv.find("RV32_FEATURE_") + 13:-2] + if argv[-1:] == "1" and EXT_LIST.count(ext): + EXT_LIST.remove(ext) + for ext in EXT_LIST: + SKIP_LIST += INSN[ext] + + +def remove_comment(str): + str = re.sub(r'//[\s|\S]+?\n', "", str) + return re.sub(r'/\*[\s|\S]+?\*/\n', "", str) + + +# parse_argv(EXT_LIST, SKIP_LIST) +# prepare PROLOGUE +output = "" +f = open('src/rv32_template.c', 'r') +lines = f.read() +# remove exception handler +lines = re.sub(r'RV_EXC[\S]+?\([\S|\s]+?\);\s', "", lines) +# collect functions +emulate_funcs = re.findall(r'RVOP\([\s|\S]+?}\)', lines) +codegen_funcs = re.findall(r'X64\([\s|\S]+?}\)', lines) + +op = [] +impl = [] +for i in range(len(emulate_funcs)): + op.append(emulate_funcs[i][5:emulate_funcs[i].find(',')]) + impl.append(codegen_funcs[i]) + +f.close() + +fields = {"imm", "pc", "rs1", "rs2", "rd", "shamt", "branch_taken", "branch_untaken"} +# generate jit template +for i in range(len(op)): + if (not SKIP_LIST.count(op[i])): + output += impl[i][0:4] + op[i] + ", {" + IRs = re.findall(r'[\s|\S]+?;', impl[i][5:]) + # parse_and_translate_IRs + for i in range(len(IRs)): + IR = IRs[i].strip()[:-1] + items = [s.strip() for s in 
IR.split(',')] + asm = "" + for i in range(len(items)): + if items[i] in fields: + items[i] = "ir->" + items[i] + # print(items) + match items[0]: + case "alu32_imm": + if len(items) == 8: + asm = "emit_alu32_imm{}(state, {}, {}, {}, ({}{}_t) {});".format( + items[1], items[2], items[3], items[4], items[5], items[6], items[7]) + elif len(items) == 7: + asm = "emit_alu32_imm{}(state, {}, {}, {}, {} & {});".format( + items[1], items[2], items[3], items[4], items[5], items[6]) + else: + asm = "emit_alu32_imm{}(state, {}, {}, {}, {});".format( + items[1], items[2], items[3], items[4], items[5]) + case "alu64_imm": + asm = "emit_alu64_imm{}(state, {}, {}, {}, {});".format( + items[1], items[2], items[3], items[4], items[5]) + case "alu64": + asm = "emit_alu64(state, {}, {}, {});".format( + items[1], items[2], items[3]) + case "alu32": + asm = "emit_alu32(state, {}, {}, {});".format( + items[1], items[2], items[3]) + case "ld_imm": + if items[2] == "mem": + asm = "emit_load_imm(state, {}, (intptr_t) (m->mem_base + ir->imm));".format( + items[1]) + elif len(items) == 4: + asm = "emit_load_imm(state, {}, {} + {});".format( + items[1], items[2], items[3]) + else: + asm = "emit_load_imm(state, {}, {});".format( + items[1], items[2]) + case "ld_sext": + if (items[3] == "X"): + asm = "emit_load_sext(state, {}, parameter_reg[0], {}, offsetof(struct riscv_internal, X) + 4 * {});".format( + items[1], items[2], items[4]) + else: + asm = "emit_load_sext(state, {}, {}, {}, {});".format( + items[1], items[2], items[3], items[4]) + case "ld": + if (items[3] == "X"): + asm = "emit_load(state, {}, parameter_reg[0], {}, offsetof(struct riscv_internal, X) + 4 * {});".format( + items[1], items[2], items[4]) + else: + asm = "emit_load(state, {}, {}, {}, {});".format( + items[1], items[2], items[3], items[4]) + case "st_imm": + asm = "emit_store_imm32(state, {}, parameter_reg[0], offsetof(struct riscv_internal, X) + 4 * {}, {});".format( + items[1], items[2], items[3]) + case "st": + if (items[3] == "X"): + asm = "emit_store(state, {}, {}, parameter_reg[0], offsetof(struct riscv_internal, X) + 4 * {});".format( + items[1], items[2], items[4]) + elif items[3] == "PC" or items[3] == "compressed": + asm = "emit_store(state, {}, {}, parameter_reg[0], offsetof(struct riscv_internal, {}));".format( + items[1], items[2], items[3]) + else: + asm = "emit_store(state, {}, {}, {}, {});".format( + items[1], items[2], items[3], items[4]) + case "cmp": + asm = "emit_cmp32(state, {}, {});".format( + items[1], items[2]) + case "cmp_imm": + asm = "emit_cmp_imm32(state, {}, {});".format( + items[1], items[2]) + case "jmp": + asm = "emit_jmp(state, {} + {});".format( + items[1], items[2]) + case "jcc": + asm = "emit_jcc_offset(state, {});".format(items[1]) + case "set_jmp_off": + asm = "uint32_t jump_loc = state->offset;" + case "jmp_off": + asm = "emit_jump_target_offset(state, jump_loc + 2, state->offset);" + case "mem": + asm = "memory_t *m = ((state_t *) rv->userdata)->mem;" + case "call": + asm = "emit_call(state, (intptr_t) rv->io.on_{});".format( + items[1]) + case "exit": + asm = "emit_exit(&(*state));" + case "mul": + asm = "muldivmod(state, {}, {}, {}, {});".format( + items[1], items[2], items[3], items[4]) + case "div": + asm = "muldivmod(state, {}, {}, {}, {});".format( + items[1], items[2], items[3], items[4]) + case "mod": + asm = "muldivmod(state, {}, {}, {}, {});".format( + items[1], items[2], items[3], items[4]) + case "cond": + asm = "if({})".format(items[1]) + "{" + case "end": + asm = "}" + case "assert": + asm = 
"assert(NULL);" + output += asm + "\n" + output += "})\n" + +sys.stdout.write(output)