diff --git a/Makefile b/Makefile index e451348..e3f936d 100644 --- a/Makefile +++ b/Makefile @@ -270,7 +270,7 @@ malloc_cmp_test: clean $(CC) $(CFLAGS) $(OPTIMIZE) $(EXE_CFLAGS) $(OS_FLAGS) -DMALLOC_PERF_TEST $(ISO_ALLOC_PRINTF_SRC) tests/tests.c -o $(BUILD_DIR)/malloc_tests echo "Running IsoAlloc Performance Test" build/tests - echo "Running glibc malloc Performance Test" + echo "Running system malloc Performance Test" build/malloc_tests ## C++ Support - Build a debug version of the unit test diff --git a/PERFORMANCE.md b/PERFORMANCE.md index 7b54a6e..037b345 100644 --- a/PERFORMANCE.md +++ b/PERFORMANCE.md @@ -80,16 +80,31 @@ The same test run on an AWS t2.xlarge Ubuntu 20.04 instance with 4 `Intel(R) Xeo ``` Running IsoAlloc Performance Test -iso_alloc/iso_free 1441616 tests completed in 0.418426 seconds -iso_calloc/iso_free 1441616 tests completed in 0.578068 seconds -iso_realloc/iso_free 1441616 tests completed in 0.681393 seconds +iso_alloc/iso_free 1441616 tests completed in 0.147336 seconds +iso_calloc/iso_free 1441616 tests completed in 0.161482 seconds +iso_realloc/iso_free 1441616 tests completed in 0.244981 seconds Running glibc malloc Performance Test -malloc/free 1441616 tests completed in 0.352161 seconds -calloc/free 1441616 tests completed in 0.562425 seconds -realloc/free 1441616 tests completed in 0.590622 seconds +malloc/free 1441616 tests completed in 0.182437 seconds +calloc/free 1441616 tests completed in 0.246065 seconds +realloc/free 1441616 tests completed in 0.332292 seconds +``` + +Here is the same test as above on Mac OS 11.6 + +``` +Running IsoAlloc Performance Test + +iso_alloc/iso_free 1441616 tests completed in 0.124150 seconds +iso_calloc/iso_free 1441616 tests completed in 0.182955 seconds +iso_realloc/iso_free 1441616 tests completed in 0.275084 seconds + +Running system malloc Performance Test +malloc/free 1441616 tests completed in 0.090845 seconds +calloc/free 1441616 tests completed in 0.200397 seconds +realloc/free 1441616 
tests completed in 0.254574 seconds ``` This same test can be used with the `perf` utility to measure basic stats like page faults and CPU utilization using both heap implementations. The output below is on the same AWS t2.xlarge instance as above. @@ -163,10 +178,10 @@ cache-thrashN mimalloc 00.36 3356 1.44 0.00 0 229 cache-thrashN tcmalloc 01.87 6880 7.42 0.00 0 1138 cache-thrashN jemalloc 00.37 3760 1.46 0.00 0 296 -redis isoalloc 9.335 71048 4.35 0.36 0 19326 ops/sec: 214227.92 -redis mimalloc 4.611 28932 2.13 0.20 4 6657 ops/sec: 433692.97 -redis tcmalloc 5.055 37088 2.37 0.19 3 8444 ops/sec: 395588.59 -redis jemalloc 5.150 30964 2.42 0.19 5 7024 ops/sec: 388279.50 +redis isoalloc 8.669 76240 4.07 0.30 1 21473 ops/sec: 230702.66, relative time: 8.669s +redis mimalloc 4.555 28968 2.13 0.17 4 6655 ops/sec: 439023.69, relative time: 4.555s +redis tcmalloc 4.715 37120 2.21 0.17 3 8446 ops/sec: 424108.56, relative time: 4.715s +redis jemalloc 5.125 30836 2.41 0.17 0 7034 ops/sec: 390174.03, relative time: 5.125s ``` IsoAlloc isn't quite ready for performance sensitive server workloads but it's more than fast enough for client side mobile/desktop applications with risky C/C++ attack surface. diff --git a/include/iso_alloc_internal.h b/include/iso_alloc_internal.h index 239777b..0c944eb 100644 --- a/include/iso_alloc_internal.h +++ b/include/iso_alloc_internal.h @@ -268,8 +268,8 @@ using namespace std; * create. This is a completely arbitrary number but * it does correspond to the size of the _root.zones * array that lives in global memory. Currently the - * iso_alloc_zone structure is roughly 1088 bytes so - * this allocates 8912896 bytes (~8.5 MB) for _root */ + * iso_alloc_zone structure is roughly 1090 bytes so + * this allocates 8929280 bytes (~8.9 MB) for _root */ #define MAX_ZONES 8192 /* Each user allocation zone we make is 4mb in size. 
@@ -296,6 +296,8 @@ using namespace std; #define BIG_ZONE_USER_PAGE_COUNT 2 #define BIG_ZONE_USER_PAGE_COUNT_SHIFT 1 +#define ZONE_LOOKUP_TABLE_SZ ((SMALL_SZ_MAX+1) * sizeof(uint16_t)) + /* We allocate zones at startup for common sizes. * Each of these default zones is ZONE_USER_SIZE bytes * so ZONE_8192 holds less chunks than ZONE_128 for @@ -414,6 +416,7 @@ static uint64_t default_zones[] = {ZONE_512, ZONE_512, ZONE_512, ZONE_1024}; typedef uint64_t bit_slot_t; typedef int64_t bitmap_index_t; +typedef uint16_t zone_lookup_table_t; typedef struct { void *user_pages_start; /* Start of the pages backing this zone */ @@ -430,6 +433,7 @@ typedef struct { bool internally_managed; /* Zones can be managed by iso_alloc or custom */ bool is_full; /* Indicates whether this zone is full to avoid expensive free bit slot searches */ uint16_t index; /* Zone index */ + uint16_t next_sz_index; /* What is the index of the next zone of this size */ #if CPU_PIN uint8_t cpu_core; /* What CPU core this zone is pinned to */ #endif diff --git a/misc/commands.gdb b/misc/commands.gdb index 3d66ddb..d434cba 100644 --- a/misc/commands.gdb +++ b/misc/commands.gdb @@ -4,3 +4,5 @@ i r x/i $pc thread apply all bt thread apply all info locals +p *_root +p _zone_lookup_table diff --git a/src/iso_alloc.c b/src/iso_alloc.c index 405c382..6871cd1 100644 --- a/src/iso_alloc.c +++ b/src/iso_alloc.c @@ -17,6 +17,13 @@ uint32_t g_page_size; uint32_t _default_zone_count; iso_alloc_root *_root; +/* Zones are linked by their next_sz_index member which + * tells the allocator where in the _root->zones array + * it can find the next zone that holds the same size + * chunks. 
The lookup table helps us find the first zone + * that holds a specific size in O(1) time */ +static zone_lookup_table_t *_zone_lookup_table; + #if NO_ZERO_ALLOCATIONS void *_zero_alloc_page; #endif @@ -363,6 +370,10 @@ INTERNAL_HIDDEN void iso_alloc_initialize_global_root(void) { LOG_AND_ABORT("Could not initialize global root"); } + /* We mlock the root or every allocation would + * result in a soft page fault */ + mlock(_root, sizeof(iso_alloc_root)); + _default_zone_count = sizeof(default_zones) >> 3; _root->zones_size = (MAX_ZONES * sizeof(iso_alloc_zone)); @@ -377,18 +388,17 @@ INTERNAL_HIDDEN void iso_alloc_initialize_global_root(void) { _root->zones = (void *) (p + g_page_size); name_mapping(p, _root->zones_size, "isoalloc zone metadata"); + /* If we don't lock the zone lookup table we will incur a + * soft page fault with almost every allocation */ + _zone_lookup_table = mmap_rw_pages(ZONE_LOOKUP_TABLE_SZ, true, NULL); + mlock(_zone_lookup_table, ZONE_LOOKUP_TABLE_SZ); + for(int64_t i = 0; i < _default_zone_count; i++) { if((_iso_new_zone(default_zones[i], true)) == NULL) { LOG_AND_ABORT("Failed to create a new zone"); } } - /* This call to mlock may fail if memory limits - * are set too low. This will not affect us - * at runtime. 
It just means some of the default - * root meta data may get swapped to disk */ - mlock(&_root, sizeof(iso_alloc_root)); - _root->zone_handle_mask = rand_uint64(); _root->big_zone_next_mask = rand_uint64(); _root->big_zone_canary_secret = rand_uint64(); @@ -574,6 +584,8 @@ __attribute__((destructor(LAST_DTOR))) void iso_alloc_dtor(void) { munmap(_root, sizeof(iso_alloc_root)); #endif + munmap(_zone_lookup_table, ZONE_LOOKUP_TABLE_SZ); + UNLOCK_ROOT(); } @@ -600,9 +612,10 @@ INTERNAL_HIDDEN iso_alloc_zone *iso_new_zone(size_t size, bool internal) { return zone; } +/* Requires the root is locked */ INTERNAL_HIDDEN iso_alloc_zone *_iso_new_zone(size_t size, bool internal) { if(_root->zones_used >= MAX_ZONES) { - LOG_AND_ABORT("Cannot allocate additional zones"); + LOG_AND_ABORT("Cannot allocate additional zones. I have already allocated %d", _root->zones_used); } if(size > SMALL_SZ_MAX) { @@ -691,6 +704,36 @@ INTERNAL_HIDDEN iso_alloc_zone *_iso_new_zone(size_t size, bool internal) { POISON_ZONE(new_zone); MASK_ZONE_PTRS(new_zone); + /* The lookup table is never used for custom zones */ + if(internal == true) { + /* If no other zones of this size exist then set the + * index in the zone lookup table to its index */ + if(_zone_lookup_table[size] == 0) { + _zone_lookup_table[size] = _root->zones_used; + } else { + /* Other zones exist that hold this size. We need to + * fixup the most recent one's next_sz_index member. 
+ * We do this by walking the list using next_sz_index */ + for(int32_t i = _zone_lookup_table[size]; i < _root->zones_used;) { + iso_alloc_zone *zt = &_root->zones[i]; + + if(zt->chunk_size != size) { + LOG_AND_ABORT("Inconsistent lookup table for zone[%d] chunk size %d (%d)", zt->index, zt->chunk_size, size); + } + + /* Follow this zone's next_sz_index member */ + if(zt->next_sz_index != 0) { + i = zt->next_sz_index; + } else { + /* If this zones next_sz_index is zero then set + * it to the zone we just created and break */ + zt->next_sz_index = new_zone->index; + break; + } + } + } + } + _root->zones_used++; return new_zone; @@ -831,7 +874,46 @@ INTERNAL_HIDDEN iso_alloc_zone *iso_find_zone_fit(size_t size) { iso_alloc_zone *zone = NULL; int32_t i = 0; -#if !SMALL_MEM_STARTUP + if(IS_ALIGNED(size) != 0) { + size = ALIGN_SZ_UP(size); + } + + /* Fast path via lookup table */ + if(_zone_lookup_table[size] != 0) { + i = _zone_lookup_table[size]; + + for(; i < _root->zones_used;) { + zone = &_root->zones[i]; + + if(zone->chunk_size != size) { + LOG_AND_ABORT("Zone lookup table failed to match sizes for zone[%d](%d) for chunk size (%d)", zone->index, zone->chunk_size, size); + } + + if(zone->internally_managed == false) { + LOG_AND_ABORT("Lookup table should never contain custom zones"); + } + + bool fits = iso_does_zone_fit(zone, size); + + if(fits == true) { + return zone; + } + + if(zone->next_sz_index != 0) { + i = zone->next_sz_index; + } else { + /* We have reached the end of our linked zones. The + * lookup table failed to find us a usable zone. + * Instead of creating a new one we will break out + * of this loop and try iterating through all zones, + * including ones we may have skipped over, to find + * a suitable candidate. */ + break; + } + } + } + +#if SMALL_MEM_STARTUP /* A simple optimization to find which default zone * should fit this allocation. If we fail then a * slower iterative approach is used. 
The longer a diff --git a/src/iso_alloc_profiler.c b/src/iso_alloc_profiler.c index a04ad91..ed62752 100644 --- a/src/iso_alloc_profiler.c +++ b/src/iso_alloc_profiler.c @@ -148,7 +148,7 @@ INTERNAL_HIDDEN uint64_t __iso_alloc_mem_usage() { iso_alloc_zone *zone = &_root->zones[i]; mem_usage += zone->bitmap_size; mem_usage += ZONE_USER_SIZE; - LOG("Zone[%d] holds %d byte chunks, megabytes (%d)", zone->index, zone->chunk_size, (ZONE_USER_SIZE / MEGABYTE_SIZE)); + LOG("Zone[%d] holds %d byte chunks, megabytes (%d) next zone = %d", zone->index, zone->chunk_size, (ZONE_USER_SIZE / MEGABYTE_SIZE), zone->next_sz_index); } return (mem_usage / MEGABYTE_SIZE);