Skip to content

Commit

Permalink
Introduce a tier-1 JIT compiler based on x86-64 architecture
Browse files Browse the repository at this point in the history
When the usage frequency of a block exceeds a predetermined threshold,
the tier-1 JIT compiler traces the chained block and generates
corresponding low-quality machine code. The resulting target machine
code is stored in the code cache for future utilization.

The primary objective of introducing the tier-1 JIT compiler is to
enhance the execution speed of RISC-V instructions. This implementation
requires two additional components: a tier-1 machine code generator,
and code cache. Furthermore, this tier-1 JIT compiler serves as the
foundational target for future improvements.

In addition, we have developed a Python script that effectively traces
code templates and automatically generates JIT code templates. This
approach eliminates the need for manually writing duplicated code.

As shown in the performance analysis below, the tier-1 JIT compiler's
performance closely parallels that of QEMU in benchmarks with a
constrained dynamic instruction count. However, for benchmarks
featuring a substantial dynamic instruction count or lacking specific
hotspots—examples include pi and STRINGSORT—the tier-1 JIT compiler
demonstrates noticeably slower execution compared to QEMU.

Hence, a robust tier-2 JIT compiler is essential to generate optimized
machine code across diverse execution paths, coupled with a runtime
profiler for detecting hotspots.

* Performance
| Metric   | rv32emu-T1C | qemu  |
|----------+-------------+-------|
|aes	   |         0.02|  0.031|
|mandelbrot|	    0.029| 0.0115|
|puzzle	   |       0.0115|  0.009|
|pi        |       0.0413| 0.0177|
|dhrystone |	    0.331|  0.393|
|Nqueens   |	    0.854|  0.749|
|qsort-O2  |	    2.384|   2.16|
|miniz-O2  |	     1.33|   1.01|
|primes-O2 |	     2.93|  1.069|
|sha512-O2 |	    2.057|  0.939|
|stream	   |       12.747|  10.36|
|STRINGSORT|       89.012| 11.496|

As demonstrated in the memory usage analysis below, the tier-1 JIT
compiler utilizes less memory than QEMU across all benchmarks.

* Memory usage
| Metric   | rv32emu-T1C |   qemu  |
|----------+-------------+---------|
|aes	   |      186,228|1,343,012|
|mandelbrot|	  152,203|  841,841|
|puzzle	   |      153,423|  890,225|
|pi        |      152,923|  879,957|
|dhrystone |	  154,466|  856,404|
|Nqueens   |	  154,880|  858,618|
|qsort-O2  |	  155,091|  933,506|
|miniz-O2  |	  165,627|1,076,682|
|primes-O2 |	  150,540|  928,446|
|sha512-O2 |	  153,553|  978,177|
|stream	   |      165,911|  957,845|
|STRINGSORT|      167,871|1,104,702|

Related: sysprog21#238
  • Loading branch information
qwe661234 committed Dec 14, 2023
1 parent 90b42a6 commit 165f036
Show file tree
Hide file tree
Showing 17 changed files with 3,134 additions and 634 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ jobs:
- name: gdbstub test
run: |
make distclean ENABLE_GDBSTUB=1 gdbstub-test
- name: JIT test
run: |
make ENABLE_JIT=1 clean check
coding-style:
runs-on: ubuntu-22.04
steps:
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,4 @@ build/path/
tests/**/*.elf
tests/arch-test-target/config.ini
__pycache__/
src/rv32_jit_template.c
17 changes: 16 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,21 @@ gdbstub-test: $(BIN)
$(Q).ci/gdbstub-test.sh && $(call notice, [OK])
endif

# Tier-1 JIT compiler support. ENABLE_JIT defaults to 0 and can be overridden
# on the command line (e.g. `make ENABLE_JIT=1`).
ENABLE_JIT ?= 0
$(call set-feature, JIT)
ifeq ($(call has, JIT), 1)
OBJS_EXT += jit_x64.o
# Abort the build early when $(processor) is anything other than x86_64.
ifneq ($(processor), x86_64)
$(error JIT mode only supports for x64 target currently.)
endif

# Produce the JIT code template by running the generator script with the
# current CFLAGS and redirecting its standard output into the target file.
# NOTE(review): this rule lists no prerequisites, so the template is presumably
# regenerated only when the file is missing (e.g. after `make clean`) — confirm
# that changes to the generator script do not need to trigger a rebuild.
src/rv32_jit_template.c:
$(Q)tools/gen-jit-template.py $(CFLAGS) > $@

# jit_x64.o also lists the generated template as a prerequisite so it exists
# before compilation; only the first prerequisite ($<, src/jit_x64.c) is handed
# to the compiler.
$(OUT)/jit_x64.o: src/jit_x64.c src/rv32_jit_template.c
$(VECHO) " CC\t$@\n"
$(Q)$(CC) -o $@ $(CFLAGS) -c -MMD -MF $@.d $<
endif
# For tail-call elimination, we need a specific set of build flags applied.
# FIXME: On macOS + Apple Silicon, -fno-stack-protector might have a negative impact.
$(OUT)/emulate.o: CFLAGS += -foptimize-sibling-calls -fomit-frame-pointer -fno-stack-check -fno-stack-protector
Expand Down Expand Up @@ -214,7 +229,7 @@ endif
endif

clean:
$(RM) $(BIN) $(OBJS) $(HIST_BIN) $(HIST_OBJS) $(deps) $(CACHE_OUT)
$(RM) $(BIN) $(OBJS) $(HIST_BIN) $(HIST_OBJS) $(deps) $(CACHE_OUT) src/rv32_jit_template.c
distclean: clean
-$(RM) $(DOOM_DATA) $(QUAKE_DATA)
$(RM) -r $(OUT)/id1
Expand Down
1 change: 1 addition & 0 deletions mk/tools.mk
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ HIST_BIN := $(OUT)/rv_histogram
# FIXME: riscv.o and map.o are dependencies of 'elf.o', not 'rv_histogram'.
HIST_OBJS := \
riscv.o \
utils.o \
map.o \
elf.o \
decode.o \
Expand Down
68 changes: 65 additions & 3 deletions src/cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,13 @@
#include "mpool.h"
#include "utils.h"

/* THRESHOLD is set to identify hot spots. Once the frequency of use for a block
* exceeds the THRESHOLD, the JIT compiler flow is triggered.
/* Currently, THRESHOLD is set to identify hot spots. Once the usage frequency
* of a block reaches the THRESHOLD, the tier-1 JIT compiler process is
* triggered.
* FIXME: Implement effective profiler to detect hot spots, instead of simply
* relying on THRESHOLD.
*/
#define THRESHOLD 1000
#define THRESHOLD 4096

static uint32_t cache_size, cache_size_bits;
static struct mpool *cache_mp;
Expand Down Expand Up @@ -545,3 +548,62 @@ void cache_free(cache_t *cache, void (*callback)(void *))
free(cache->map);
free(cache);
}

#if !RV32_HAS(ARC)
/* Look up how often the entry stored under @key has been used.
 *
 * Returns the frequency recorded in the matching entry, or 0 when the cache
 * has no capacity, the hash bucket for @key is empty, or no entry in that
 * bucket carries @key.
 */
uint32_t cache_freq(struct cache *cache, uint32_t key)
{
    /* A cache without capacity cannot hold any entry. */
    if (!cache->capacity)
        return 0;

    /* Hash once and reuse the bucket index for every access below. */
    const uint32_t slot = cache_hash(key);
    if (hlist_empty(&cache->map->ht_list_head[slot]))
        return 0;

    lfu_entry_t *node = NULL;
#ifdef __HAVE_TYPEOF
    hlist_for_each_entry (node, &cache->map->ht_list_head[slot], ht_list)
#else
    hlist_for_each_entry (node, &cache->map->ht_list_head[slot], ht_list,
                          lfu_entry_t)
#endif
    {
        /* Several keys may share a bucket; only an exact key match counts. */
        if (entry_matches:
            0, node->key == key)
            return node->frequency;
    }

    return 0;
}
#endif

#if RV32_HAS(JIT)
bool cache_hot(struct cache *cache, uint32_t key)
{
if (!cache->capacity ||
hlist_empty(&cache->map->ht_list_head[cache_hash(key)]))
return false;
#if RV32_HAS(ARC)
arc_entry_t *entry = NULL;
#ifdef __HAVE_TYPEOF
hlist_for_each_entry (entry, &cache->map->ht_list_head[cache_hash(key)],
ht_list)
#else
hlist_for_each_entry (entry, &cache->map->ht_list_head[cache_hash(key)],
ht_list, arc_entry_t)
#endif
{
if (entry->key == key && entry->frequency == THRESHOLD)
return true;
}
#else
lfu_entry_t *entry = NULL;
#ifdef __HAVE_TYPEOF
hlist_for_each_entry (entry, &cache->map->ht_list_head[cache_hash(key)],
ht_list)
#else
hlist_for_each_entry (entry, &cache->map->ht_list_head[cache_hash(key)],
ht_list, lfu_entry_t)
#endif
{
if (entry->key == key && entry->frequency == THRESHOLD)
return true;
}
#endif
return false;
}
#endif
15 changes: 15 additions & 0 deletions src/cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#pragma once

#include <stdbool.h>
#include <stdint.h>

struct cache;
Expand Down Expand Up @@ -38,3 +39,17 @@ void *cache_put(struct cache *cache, uint32_t key, void *value);
* @callback: a function for freeing cache entry completely
*/
void cache_free(struct cache *cache, void (*callback)(void *));

#if RV32_HAS(JIT)
/**
 * cache_hot - check whether the cache entry for a key has reached the
 * hot-spot threshold
 * @cache: a pointer to the target cache
 * @key: the key of the specified entry
 *
 * Return: true if an entry with @key is found and its frequency equals the
 * threshold; false otherwise, including when the cache has no capacity or no
 * entry with @key exists.
 */
bool cache_hot(struct cache *cache, uint32_t key);
#endif

#if !RV32_HAS(ARC)
/**
 * cache_freq - look up the recorded frequency of a cache entry
 * @cache: a pointer to the target cache
 * @key: the key of the specified entry
 *
 * Return: the frequency of the entry matching @key, or 0 if the cache has no
 * capacity or no entry with @key is found.
 */
uint32_t cache_freq(struct cache *cache, uint32_t key);
#endif
20 changes: 19 additions & 1 deletion src/decode.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,13 +179,31 @@ enum op_field {
)
/* clang-format on */

/* Macro operation fusion */

/* Macro operation fusion: convert specific RISC-V instruction patterns
 * into faster and equivalent code.
 *
 * This is an X-macro list: the user defines _(name) before expanding
 * FUSE_INSN_LIST and undefines it afterwards. The IR enum expands each entry
 * into an rv_insn_<name> enumerator placed after N_RV_INSNS.
 */
#define FUSE_INSN_LIST \
_(fuse1) \
_(fuse2) \
_(fuse3) \
_(fuse4) \
_(fuse5) \
_(fuse6) \
_(fuse7)

/* clang-format off */
/* IR list */
enum {
#define _(inst, can_branch, insn_len, reg_mask) rv_insn_##inst,
RV_INSN_LIST
#undef _
N_RV_INSNS
N_RV_INSNS,
#define _(inst) rv_insn_##inst,
FUSE_INSN_LIST
#undef _
N_TOTAL_INSNS,
};
/* clang-format on */

Expand Down
Loading

0 comments on commit 165f036

Please sign in to comment.