From f6785df663928d6614030e93aaeafa4a3b24ae3f Mon Sep 17 00:00:00 2001 From: "debing.sun" Date: Wed, 21 Feb 2024 00:11:09 +0800 Subject: [PATCH] Defragger improvements around large bins (#12996) Implement #12963 ## Changes 1. large bins don't have external fragmentation or are at least non-defraggable, so we should ignore the effect of large bins when measuring fragmentation, and only measure fragmentation of small bins. this affects both the allocator_frag* metrics and also the active-defrag trigger 2. Adding INFO metrics for `muzzy` memory, which is memory returned to the OS but still shows as RSS until the OS reclaims it. --------- Co-authored-by: Oran Agra --- src/defrag.c | 16 ++++++---- src/object.c | 9 ++++-- src/server.c | 6 +++- src/server.h | 2 ++ src/zmalloc.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++----- src/zmalloc.h | 3 +- 6 files changed, 99 insertions(+), 18 deletions(-) diff --git a/src/defrag.c b/src/defrag.c index 61d9c6a0435f..3813ec3ac0d8 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -780,17 +780,21 @@ void activeDefragKvstore(kvstore *kvs) { * or not, a false detection can cause the defragmenter to waste a lot of CPU * without the possibility of getting any results. */ float getAllocatorFragmentation(size_t *out_frag_bytes) { - size_t resident, active, allocated; - zmalloc_get_allocator_info(&allocated, &active, &resident); - float frag_pct = ((float)active / allocated)*100 - 100; - size_t frag_bytes = active - allocated; + size_t resident, active, allocated, frag_smallbins_bytes; + zmalloc_get_allocator_info(&allocated, &active, &resident, NULL, NULL, &frag_smallbins_bytes); + + /* Calculate the fragmentation ratio as the proportion of wasted memory in small + * bins (which are defraggable) relative to the total allocated memory (including large bins). + * This is because otherwise, if most of the memory usage is large bins, we may show high percentage, + * despite the fact it's not a lot of memory for the user. 
*/ + float frag_pct = (float)frag_smallbins_bytes / allocated * 100; float rss_pct = ((float)resident / allocated)*100 - 100; size_t rss_bytes = resident - allocated; if(out_frag_bytes) - *out_frag_bytes = frag_bytes; + *out_frag_bytes = frag_smallbins_bytes; serverLog(LL_DEBUG, "allocated=%zu, active=%zu, resident=%zu, frag=%.2f%% (%.2f%% rss), frag_bytes=%zu (%zu rss)", - allocated, active, resident, frag_pct, rss_pct, frag_bytes, rss_bytes); + allocated, active, resident, frag_pct, rss_pct, frag_smallbins_bytes, rss_bytes); return frag_pct; } diff --git a/src/object.c b/src/object.c index dffb6fab9f0e..d5bb74f8fe10 100644 --- a/src/object.c +++ b/src/object.c @@ -1184,9 +1184,9 @@ struct redisMemOverhead *getMemoryOverheadData(void) { mh->total_frag_bytes = server.cron_malloc_stats.process_rss - server.cron_malloc_stats.zmalloc_used; mh->allocator_frag = - (float)server.cron_malloc_stats.allocator_active / server.cron_malloc_stats.allocator_allocated; + (float)server.cron_malloc_stats.allocator_frag_smallbins_bytes / server.cron_malloc_stats.allocator_allocated + 1; mh->allocator_frag_bytes = - server.cron_malloc_stats.allocator_active - server.cron_malloc_stats.allocator_allocated; + server.cron_malloc_stats.allocator_frag_smallbins_bytes; mh->allocator_rss = (float)server.cron_malloc_stats.allocator_resident / server.cron_malloc_stats.allocator_active; mh->allocator_rss_bytes = @@ -1556,7 +1556,7 @@ NULL } else if (!strcasecmp(c->argv[1]->ptr,"stats") && c->argc == 2) { struct redisMemOverhead *mh = getMemoryOverheadData(); - addReplyMapLen(c,27+mh->num_dbs); + addReplyMapLen(c,28+mh->num_dbs); addReplyBulkCString(c,"peak.allocated"); addReplyLongLong(c,mh->peak_allocated); @@ -1628,6 +1628,9 @@ NULL addReplyBulkCString(c,"allocator.resident"); addReplyLongLong(c,server.cron_malloc_stats.allocator_resident); + addReplyBulkCString(c,"allocator.muzzy"); + addReplyLongLong(c,server.cron_malloc_stats.allocator_muzzy); + 
addReplyBulkCString(c,"allocator-fragmentation.ratio"); addReplyDouble(c,mh->allocator_frag); diff --git a/src/server.c b/src/server.c index b24d54b7e000..adcc9641c261 100644 --- a/src/server.c +++ b/src/server.c @@ -1228,7 +1228,10 @@ void cronUpdateMemoryStats(void) { * allocations, and allocator reserved pages that can be pursed (all not actual frag) */ zmalloc_get_allocator_info(&server.cron_malloc_stats.allocator_allocated, &server.cron_malloc_stats.allocator_active, - &server.cron_malloc_stats.allocator_resident); + &server.cron_malloc_stats.allocator_resident, + NULL, + &server.cron_malloc_stats.allocator_muzzy, + &server.cron_malloc_stats.allocator_frag_smallbins_bytes); /* in case the allocator isn't providing these stats, fake them so that * fragmentation info still shows some (inaccurate metrics) */ if (!server.cron_malloc_stats.allocator_resident) { @@ -5643,6 +5646,7 @@ sds genRedisInfoString(dict *section_dict, int all_sections, int everything) { "allocator_allocated:%zu\r\n", server.cron_malloc_stats.allocator_allocated, "allocator_active:%zu\r\n", server.cron_malloc_stats.allocator_active, "allocator_resident:%zu\r\n", server.cron_malloc_stats.allocator_resident, + "allocator_muzzy:%zu\r\n", server.cron_malloc_stats.allocator_muzzy, "total_system_memory:%lu\r\n", (unsigned long)total_system_mem, "total_system_memory_human:%s\r\n", total_system_hmem, "used_memory_lua:%lld\r\n", memory_lua, /* deprecated, renamed to used_memory_vm_eval */ diff --git a/src/server.h b/src/server.h index acfbd09343fc..fb0f27179a9b 100644 --- a/src/server.h +++ b/src/server.h @@ -1464,6 +1464,8 @@ struct malloc_stats { size_t allocator_allocated; size_t allocator_active; size_t allocator_resident; + size_t allocator_muzzy; + size_t allocator_frag_smallbins_bytes; }; /*----------------------------------------------------------------------------- diff --git a/src/zmalloc.c b/src/zmalloc.c index 343d730a2f2a..682310a46028 100644 --- a/src/zmalloc.c +++ b/src/zmalloc.c @@ 
-626,9 +626,54 @@ size_t zmalloc_get_rss(void) { #if defined(USE_JEMALLOC) -int zmalloc_get_allocator_info(size_t *allocated, - size_t *active, - size_t *resident) { +#include <assert.h> + +#define STRINGIFY_(x) #x +#define STRINGIFY(x) STRINGIFY_(x) + +/* Compute the total memory wasted in fragmentation inside small arena bins. + * Done by summing the memory in unused regs in all slabs of all small bins. */ +size_t zmalloc_get_frag_smallbins(void) { + unsigned nbins; + size_t sz, frag = 0; + char buf[100]; + + sz = sizeof(unsigned); + assert(!je_mallctl("arenas.nbins", &nbins, &sz, NULL, 0)); + for (unsigned j = 0; j < nbins; j++) { + size_t curregs, curslabs, reg_size; + uint32_t nregs; + + /* The size of the current bin */ + snprintf(buf, sizeof(buf), "arenas.bin.%d.size", j); + sz = sizeof(size_t); + assert(!je_mallctl(buf, &reg_size, &sz, NULL, 0)); + + /* Number of used regions in the bin */ + snprintf(buf, sizeof(buf), "stats.arenas." STRINGIFY(MALLCTL_ARENAS_ALL) ".bins.%d.curregs", j); + sz = sizeof(size_t); + assert(!je_mallctl(buf, &curregs, &sz, NULL, 0)); + + /* Number of regions per slab */ + snprintf(buf, sizeof(buf), "arenas.bin.%d.nregs", j); + sz = sizeof(uint32_t); + assert(!je_mallctl(buf, &nregs, &sz, NULL, 0)); + + /* Number of current slabs in the bin */ + snprintf(buf, sizeof(buf), "stats.arenas." STRINGIFY(MALLCTL_ARENAS_ALL) ".bins.%d.curslabs", j); + sz = sizeof(size_t); + assert(!je_mallctl(buf, &curslabs, &sz, NULL, 0)); + + /* Calculate the fragmentation bytes for the current bin and add it to the total. 
*/ + frag += ((nregs * curslabs) - curregs) * reg_size; + } + + return frag; +} + +int zmalloc_get_allocator_info(size_t *allocated, size_t *active, size_t *resident, + size_t *retained, size_t *muzzy, size_t *frag_smallbins_bytes) +{ uint64_t epoch = 1; size_t sz; *allocated = *resident = *active = 0; @@ -645,6 +690,26 @@ int zmalloc_get_allocator_info(size_t *allocated, /* Unlike zmalloc_used_memory, this matches the stats.resident by taking * into account all allocations done by this process (not only zmalloc). */ je_mallctl("stats.allocated", allocated, &sz, NULL, 0); + + /* Retained memory is memory released by `madvise(..., MADV_DONTNEED)`, which is not part + * of RSS or mapped memory, and doesn't have a strong association with physical memory in the OS. + * It is still part of the VM-Size, and may be used again in later allocations. */ + if (retained) { + *retained = 0; + je_mallctl("stats.retained", retained, &sz, NULL, 0); + } + + /* Unlike retained, Muzzy represents memory released with `madvise(..., MADV_FREE)`. + * These pages will show as RSS for the process, until the OS decides to re-use them. */ + if (muzzy) { + size_t pmuzzy, page; + assert(!je_mallctl("stats.arenas." STRINGIFY(MALLCTL_ARENAS_ALL) ".pmuzzy", &pmuzzy, &sz, NULL, 0)); + assert(!je_mallctl("arenas.page", &page, &sz, NULL, 0)); + *muzzy = pmuzzy * page; + } + + /* Total size of consumed memory in unused regs in small bins (AKA external fragmentation). 
*/ + *frag_smallbins_bytes = zmalloc_get_frag_smallbins(); return 1; } @@ -670,10 +735,12 @@ int jemalloc_purge(void) { #else -int zmalloc_get_allocator_info(size_t *allocated, - size_t *active, - size_t *resident) { - *allocated = *resident = *active = 0; +int zmalloc_get_allocator_info(size_t *allocated, size_t *active, size_t *resident, + size_t *retained, size_t *muzzy, size_t *frag_smallbins_bytes) +{ + *allocated = *resident = *active = *frag_smallbins_bytes = 0; + if (retained) *retained = 0; + if (muzzy) *muzzy = 0; return 1; } diff --git a/src/zmalloc.h b/src/zmalloc.h index 491013a8530a..1b63d6aedc42 100644 --- a/src/zmalloc.h +++ b/src/zmalloc.h @@ -122,7 +122,8 @@ __attribute__((malloc)) char *zstrdup(const char *s); size_t zmalloc_used_memory(void); void zmalloc_set_oom_handler(void (*oom_handler)(size_t)); size_t zmalloc_get_rss(void); -int zmalloc_get_allocator_info(size_t *allocated, size_t *active, size_t *resident); +int zmalloc_get_allocator_info(size_t *allocated, size_t *active, size_t *resident, + size_t *retained, size_t *muzzy, size_t *frag_smallbins_bytes); void set_jemalloc_bg_thread(int enable); int jemalloc_purge(void); size_t zmalloc_get_private_dirty(long pid);