Adding real allocator fragmentation to INFO and MEMORY command + active defrag test

other fixes / improvements:
- LUA script memory isn't taken from zmalloc (it is taken from libc malloc),
  so it can cause a falsely high fragmentation ratio to be displayed
- there was a problem with "fragmentation" info being calculated from
  RSS and used_memory sampled at different times (now sampling them together)

other details:
- adding a few more allocator info fields to INFO and MEMORY commands
- improve defrag test to measure defrag latency of big keys
- increasing the accuracy of the defrag test (by looking at real frag info);
  this way we can use an even lower threshold and still avoid false positives
- keep the old (total) "fragmentation" field unchanged, but add new ones for specific things (see the sketch after this list)
- add these to the MEMORY DOCTOR command
- deduct LUA memory from the RSS for non-jemalloc allocators (ones for which we don't have "allocator active/used" info)
- reduce sampling rate of the rss and allocator info
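
For reference, a minimal sketch (plain C, not part of the patch) of how the new ratios relate to the stats sampled in serverCron(); the struct here simply mirrors the malloc_stats fields introduced in server.h:

    #include <stddef.h>

    /* Mirrors the fields of struct malloc_stats added in server.h. */
    struct sampled_stats {
        size_t zmalloc_used, process_rss;
        size_t allocator_allocated, allocator_active, allocator_resident;
    };

    /* Illustrative derivation of the ratios reported by INFO / MEMORY STATS. */
    static void compute_ratios(const struct sampled_stats *s,
                               float *allocator_frag, float *allocator_rss,
                               float *rss_extra, float *total_frag) {
        /* External fragmentation inside the allocator (active defrag can help here). */
        *allocator_frag = (float)s->allocator_active / s->allocator_allocated;
        /* Pages the allocator keeps resident but unused (MEMORY PURGE can release them). */
        *allocator_rss = (float)s->allocator_resident / s->allocator_active;
        /* RSS not accounted for by the allocator: shared libraries, LUA, etc. */
        *rss_extra = (float)s->process_rss / s->allocator_resident;
        /* The old mem_fragmentation_ratio, kept for backwards compatibility;
         * roughly the product of the three ratios above, since zmalloc_used
         * is close to allocator_allocated. */
        *total_frag = (float)s->process_rss / s->zmalloc_used;
    }

This decomposition is what lets MEMORY DOCTOR distinguish allocator fragmentation (defraggable) from allocator reserved pages (purgeable) and from non-allocator RSS (LUA, modules, shared libraries).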
oranagra committed Mar 12, 2018
1 parent be1b4aa commit 806736c
Showing 8 changed files with 333 additions and 72 deletions.
15 changes: 3 additions & 12 deletions src/defrag.c
@@ -700,18 +700,9 @@ void defragDictBucketCallback(void *privdata, dictEntry **bucketref) {
* or not, a false detection can cause the defragmenter to waste a lot of CPU
* without the possibility of getting any results. */
float getAllocatorFragmentation(size_t *out_frag_bytes) {
size_t epoch = 1, allocated = 0, resident = 0, active = 0, sz = sizeof(size_t);
/* Update the statistics cached by mallctl. */
je_mallctl("epoch", &epoch, &sz, &epoch, sz);
/* Unlike RSS, this does not include RSS from shared libraries and other non
* heap mappings. */
je_mallctl("stats.resident", &resident, &sz, NULL, 0);
/* Unlike resident, this doesn't include the pages jemalloc reserves
* for re-use (purge will clean that). */
je_mallctl("stats.active", &active, &sz, NULL, 0);
/* Unlike zmalloc_used_memory, this matches the stats.resident by taking
* into account all allocations done by this process (not only zmalloc). */
je_mallctl("stats.allocated", &allocated, &sz, NULL, 0);
size_t resident = server.cron_malloc_stats.allocator_resident;
size_t active = server.cron_malloc_stats.allocator_active;
size_t allocated = server.cron_malloc_stats.allocator_allocated;
float frag_pct = ((float)active / allocated)*100 - 100;
size_t frag_bytes = active - allocated;
float rss_pct = ((float)resident / allocated)*100 - 100;
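A hedged usage sketch of the function above, showing how a caller might gate defragmentation on the cached fragmentation info; the threshold names and values are hypothetical placeholders for illustration, not the actual config options the defrag cycle reads:

    /* Illustrative only: gate active defrag on the cached fragmentation info.
     * The thresholds below are hypothetical; the real ones come from config. */
    static int should_run_defrag(void) {
        size_t frag_bytes;
        float frag_pct = getAllocatorFragmentation(&frag_bytes);
        const float hypothetical_min_pct = 10;         /* e.g. 10% external frag */
        const size_t hypothetical_min_bytes = 50<<20;  /* e.g. 50MB wasted */
        return frag_pct >= hypothetical_min_pct && frag_bytes >= hypothetical_min_bytes;
    }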
91 changes: 83 additions & 8 deletions src/object.c
@@ -876,8 +876,23 @@ struct redisMemOverhead *getMemoryOverheadData(void) {
mh->total_allocated = zmalloc_used;
mh->startup_allocated = server.initial_memory_usage;
mh->peak_allocated = server.stat_peak_memory;
mh->fragmentation =
zmalloc_get_fragmentation_ratio(server.resident_set_size);
mh->total_frag =
(float)server.cron_malloc_stats.process_rss / server.cron_malloc_stats.zmalloc_used;
mh->total_frag_bytes =
server.cron_malloc_stats.process_rss - server.cron_malloc_stats.zmalloc_used;
mh->allocator_frag =
(float)server.cron_malloc_stats.allocator_active / server.cron_malloc_stats.allocator_allocated;
mh->allocator_frag_bytes =
server.cron_malloc_stats.allocator_active - server.cron_malloc_stats.allocator_allocated;
mh->allocator_rss =
(float)server.cron_malloc_stats.allocator_resident / server.cron_malloc_stats.allocator_active;
mh->allocator_rss_bytes =
server.cron_malloc_stats.allocator_resident - server.cron_malloc_stats.allocator_active;
mh->rss_extra =
(float)server.cron_malloc_stats.process_rss / server.cron_malloc_stats.allocator_resident;
mh->rss_extra_bytes =
server.cron_malloc_stats.process_rss - server.cron_malloc_stats.allocator_resident;

mem_total += server.initial_memory_usage;

mem = 0;
@@ -980,6 +995,9 @@ sds getMemoryDoctorReport(void) {
int empty = 0; /* Instance is empty or almost empty. */
int big_peak = 0; /* Memory peak is much larger than used mem. */
int high_frag = 0; /* High fragmentation. */
int high_alloc_frag = 0;/* High allocator fragmentation. */
int high_proc_rss = 0; /* High process rss overhead. */
int high_alloc_rss = 0; /* High allocator rss overhead. */
int big_slave_buf = 0; /* Slave buffers are too big. */
int big_client_buf = 0; /* Client buffers are too big. */
int num_reports = 0;
@@ -995,12 +1013,30 @@ sds getMemoryDoctorReport(void) {
num_reports++;
}

/* Fragmentation is higher than 1.4? */
if (mh->fragmentation > 1.4) {
/* Fragmentation is higher than 1.4 and 10MB? */
if (mh->total_frag > 1.4 && mh->total_frag_bytes > 10<<20) {
high_frag = 1;
num_reports++;
}

/* External fragmentation is higher than 1.1 and 10MB? */
if (mh->allocator_frag > 1.1 && mh->allocator_frag_bytes > 10<<20) {
high_alloc_frag = 1;
num_reports++;
}

/* Allocator rss is higher than 1.1 and 10MB? */
if (mh->allocator_rss > 1.1 && mh->allocator_rss_bytes > 10<<20) {
high_alloc_rss = 1;
num_reports++;
}

/* Non-allocator rss is higher than 1.1 and 10MB? */
if (mh->rss_extra > 1.1 && mh->rss_extra_bytes > 10<<20) {
high_proc_rss = 1;
num_reports++;
}

/* Clients using more than 200k each average? */
long numslaves = listLength(server.slaves);
long numclients = listLength(server.clients)-numslaves;
@@ -1034,7 +1070,16 @@
s = sdscat(s," * Peak memory: In the past this instance used more than 150% the memory that is currently using. The allocator is normally not able to release memory after a peak, so you can expect to see a big fragmentation ratio, however this is actually harmless and is only due to the memory peak, and if the Redis instance Resident Set Size (RSS) is currently bigger than expected, the memory will be used as soon as you fill the Redis instance with more data. If the memory peak was only occasional and you want to try to reclaim memory, please try the MEMORY PURGE command, otherwise the only other option is to shutdown and restart the instance.\n\n");
}
if (high_frag) {
s = sdscatprintf(s," * High fragmentation: This instance has a memory fragmentation greater than 1.4 (this means that the Resident Set Size of the Redis process is much larger than the sum of the logical allocations Redis performed). This problem is usually due either to a large peak memory (check if there is a peak memory entry above in the report) or may result from a workload that causes the allocator to fragment memory a lot. If the problem is a large peak memory, then there is no issue. Otherwise, make sure you are using the Jemalloc allocator and not the default libc malloc. Note: The currently used allocator is \"%s\".\n\n", ZMALLOC_LIB);
s = sdscatprintf(s," * High total RSS: This instance has a memory fragmentation and RSS overhead greater than 1.4 (this means that the Resident Set Size of the Redis process is much larger than the sum of the logical allocations Redis performed). This problem is usually due either to a large peak memory (check if there is a peak memory entry above in the report) or may result from a workload that causes the allocator to fragment memory a lot. If the problem is a large peak memory, then there is no issue. Otherwise, make sure you are using the Jemalloc allocator and not the default libc malloc. Note: The currently used allocator is \"%s\".\n\n", ZMALLOC_LIB);
}
if (high_alloc_frag) {
s = sdscatprintf(s," * High allocator fragmentation: This instance has an allocator external fragmentation greater than 1.1. This problem is usually due either to a large peak memory (check if there is a peak memory entry above in the report) or may result from a workload that causes the allocator to fragment memory a lot. You can try enabling 'activedefrag' config option.\n\n");
}
if (high_alloc_rss) {
s = sdscatprintf(s," * High allocator RSS overhead: This instance has an RSS memory overhead is greater than 1.1 (this means that the Resident Set Size of the allocator is much larger than the sum what the allocator actually holds). This problem is usually due to a large peak memory (check if there is a peak memory entry above in the report), you can try the MEMORY PURGE command to reclaim it.\n\n");
}
if (high_proc_rss) {
s = sdscatprintf(s," * High process RSS overhead: This instance has non-allocator RSS memory overhead is greater than 1.1 (this means that the Resident Set Size of the Redis process is much larger than the RSS the allocator holds). This problem may be due to LUA scripts or Modules.\n\n");
}
if (big_slave_buf) {
s = sdscat(s," * Big slave buffers: The slave output buffers in this instance are greater than 10MB for each slave (on average). This likely means that there is some slave instance that is struggling receiving data, either because it is too slow or because of networking issues. As a result, data piles on the master output buffers. Please try to identify what slave is not receiving data correctly and why. You can use the INFO output in order to check the slaves delays and the CLIENT LIST command to check the output buffers of each slave.\n\n");
@@ -1148,7 +1193,7 @@ void memoryCommand(client *c) {
} else if (!strcasecmp(c->argv[1]->ptr,"stats") && c->argc == 2) {
struct redisMemOverhead *mh = getMemoryOverheadData();

addReplyMultiBulkLen(c,(14+mh->num_dbs)*2);
addReplyMultiBulkLen(c,(24+mh->num_dbs)*2);

addReplyBulkCString(c,"peak.allocated");
addReplyLongLong(c,mh->peak_allocated);
@@ -1202,8 +1247,38 @@
addReplyBulkCString(c,"peak.percentage");
addReplyDouble(c,mh->peak_perc);

addReplyBulkCString(c,"fragmentation");
addReplyDouble(c,mh->fragmentation);
addReplyBulkCString(c,"allocator.allocated");
addReplyLongLong(c,server.cron_malloc_stats.allocator_allocated);

addReplyBulkCString(c,"allocator.active");
addReplyLongLong(c,server.cron_malloc_stats.allocator_active);

addReplyBulkCString(c,"allocator.resident");
addReplyLongLong(c,server.cron_malloc_stats.allocator_resident);

addReplyBulkCString(c,"allocator-fragmentation.ratio");
addReplyDouble(c,mh->allocator_frag);

addReplyBulkCString(c,"allocator-fragmentation.bytes");
addReplyLongLong(c,mh->allocator_frag_bytes);

addReplyBulkCString(c,"allocator-rss.ratio");
addReplyDouble(c,mh->allocator_rss);

addReplyBulkCString(c,"allocator-rss.bytes");
addReplyLongLong(c,mh->allocator_rss_bytes);

addReplyBulkCString(c,"rss-overhead.ratio");
addReplyDouble(c,mh->rss_extra);

addReplyBulkCString(c,"rss-overhead.bytes");
addReplyLongLong(c,mh->rss_extra_bytes);

addReplyBulkCString(c,"fragmentation"); /* this is the total RSS overhead, including fragmentation */
addReplyDouble(c,mh->total_frag); /* it is kept here for backwards compatibility */

addReplyBulkCString(c,"fragmentation.bytes");
addReplyLongLong(c,mh->total_frag_bytes);

freeMemoryOverheadData(mh);
} else if (!strcasecmp(c->argv[1]->ptr,"malloc-stats") && c->argc == 2) {
61 changes: 55 additions & 6 deletions src/server.c
@@ -1007,8 +1007,33 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
if (zmalloc_used_memory() > server.stat_peak_memory)
server.stat_peak_memory = zmalloc_used_memory();

/* Sample the RSS here since this is a relatively slow call. */
server.resident_set_size = zmalloc_get_rss();
run_with_period(10) {
/* Sample the RSS and other metrics here since this is a relatively slow call.
* We must sample the zmalloc_used at the same time we take the rss, otherwise
* the frag ratio calculation may be off (a ratio of two samples taken at different times) */
server.cron_malloc_stats.process_rss = zmalloc_get_rss();
server.cron_malloc_stats.zmalloc_used = zmalloc_used_memory();
/* Sampling the allocator info can be slow too.
* The fragmentation ratio it shows is potentially more accurate:
* it excludes other RSS pages such as shared libraries, LUA and other non-zmalloc
* allocations, and allocator reserved pages that can be purged (none of which is actual frag) */
zmalloc_get_allocator_info(&server.cron_malloc_stats.allocator_allocated,
&server.cron_malloc_stats.allocator_active,
&server.cron_malloc_stats.allocator_resident);
/* In case the allocator isn't providing these stats, fake them so that
* fragmentation info still shows some (inaccurate) metrics */
if (!server.cron_malloc_stats.allocator_resident) {
/* LUA memory isn't part of zmalloc_used, but it is part of the process RSS,
* so we must deduct it in order to be able to calculate a correct
* "allocator fragmentation" ratio */
size_t lua_memory = lua_gc(server.lua,LUA_GCCOUNT,0)*1024LL;
server.cron_malloc_stats.allocator_resident = server.cron_malloc_stats.process_rss - lua_memory;
}
if (!server.cron_malloc_stats.allocator_active)
server.cron_malloc_stats.allocator_active = server.cron_malloc_stats.allocator_resident;
if (!server.cron_malloc_stats.allocator_allocated)
server.cron_malloc_stats.allocator_allocated = server.cron_malloc_stats.zmalloc_used;
}

/* We received a SIGTERM, shutting down here in a safe way, as it is
* not ok doing so inside the signal handler. */
@@ -1924,7 +1949,11 @@ void initServer(void) {
server.stat_peak_memory = 0;
server.stat_rdb_cow_bytes = 0;
server.stat_aof_cow_bytes = 0;
server.resident_set_size = 0;
server.cron_malloc_stats.zmalloc_used = 0;
server.cron_malloc_stats.process_rss = 0;
server.cron_malloc_stats.allocator_allocated = 0;
server.cron_malloc_stats.allocator_active = 0;
server.cron_malloc_stats.allocator_resident = 0;
server.lastbgsave_status = C_OK;
server.aof_last_write_status = C_OK;
server.aof_last_write_errno = 0;
@@ -2974,7 +3003,7 @@ sds genRedisInfoString(char *section) {
bytesToHuman(peak_hmem,server.stat_peak_memory);
bytesToHuman(total_system_hmem,total_system_mem);
bytesToHuman(used_memory_lua_hmem,memory_lua);
bytesToHuman(used_memory_rss_hmem,server.resident_set_size);
bytesToHuman(used_memory_rss_hmem,server.cron_malloc_stats.process_rss);
bytesToHuman(maxmemory_hmem,server.maxmemory);

if (sections++) info = sdscat(info,"\r\n");
@@ -2991,20 +3020,30 @@
"used_memory_startup:%zu\r\n"
"used_memory_dataset:%zu\r\n"
"used_memory_dataset_perc:%.2f%%\r\n"
"allocator_allocated:%zu\r\n"
"allocator_active:%zu\r\n"
"allocator_resident:%zu\r\n"
"total_system_memory:%lu\r\n"
"total_system_memory_human:%s\r\n"
"used_memory_lua:%lld\r\n"
"used_memory_lua_human:%s\r\n"
"maxmemory:%lld\r\n"
"maxmemory_human:%s\r\n"
"maxmemory_policy:%s\r\n"
"allocator_frag_ratio:%.2f\r\n"
"allocator_frag_bytes:%zu\r\n"
"allocator_rss_ratio:%.2f\r\n"
"allocator_rss_bytes:%zu\r\n"
"rss_overhead_ratio:%.2f\r\n"
"rss_overhead_bytes:%zu\r\n"
"mem_fragmentation_ratio:%.2f\r\n"
"mem_fragmentation_bytes:%zu\r\n"
"mem_allocator:%s\r\n"
"active_defrag_running:%d\r\n"
"lazyfree_pending_objects:%zu\r\n",
zmalloc_used,
hmem,
server.resident_set_size,
server.cron_malloc_stats.process_rss,
used_memory_rss_hmem,
server.stat_peak_memory,
peak_hmem,
@@ -3013,14 +3052,24 @@
mh->startup_allocated,
mh->dataset,
mh->dataset_perc,
server.cron_malloc_stats.allocator_allocated,
server.cron_malloc_stats.allocator_active,
server.cron_malloc_stats.allocator_resident,
(unsigned long)total_system_mem,
total_system_hmem,
memory_lua,
used_memory_lua_hmem,
server.maxmemory,
maxmemory_hmem,
evict_policy,
mh->fragmentation,
mh->allocator_frag,
mh->allocator_frag_bytes,
mh->allocator_rss,
mh->allocator_rss_bytes,
mh->rss_extra,
mh->rss_extra_bytes,
mh->total_frag, /* this is the total RSS overhead, including fragmentation, */
mh->total_frag_bytes, /* named so for backwards compatibility */
ZMALLOC_LIB,
server.active_defrag_running,
lazyfreeGetPendingObjectsCount()
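To make the non-jemalloc fallback in the serverCron() hunk above concrete, this is roughly what the sampled fields collapse to when zmalloc_get_allocator_info() returns zeros (a sketch of the effect, not code from the patch):

    /* Illustrative only: the effect of the fallback in serverCron() when the
     * allocator provides no stats (any allocator other than jemalloc). */
    static void fallback_effect(malloc_stats *m, size_t lua_memory) {
        /* LUA memory is in the process RSS but not in zmalloc_used, so deduct it. */
        m->allocator_resident  = m->process_rss - lua_memory;
        m->allocator_active    = m->allocator_resident;  /* allocator_rss ratio becomes 1.0 */
        m->allocator_allocated = m->zmalloc_used;
        /* allocator_frag then approximates (rss - lua) / zmalloc_used, and
         * rss_extra approximates rss / (rss - lua): inaccurate, but it avoids
         * blaming LUA memory on allocator fragmentation. */
    }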
19 changes: 17 additions & 2 deletions src/server.h
@@ -840,7 +840,14 @@ struct redisMemOverhead {
size_t bytes_per_key;
float dataset_perc;
float peak_perc;
float fragmentation;
float total_frag;
size_t total_frag_bytes;
float allocator_frag;
size_t allocator_frag_bytes;
float allocator_rss;
size_t allocator_rss_bytes;
float rss_extra;
size_t rss_extra_bytes;
size_t num_dbs;
struct {
size_t dbid;
@@ -869,6 +876,14 @@ typedef struct rdbSaveInfo {

#define RDB_SAVE_INFO_INIT {-1,0,"000000000000000000000000000000",-1}

typedef struct malloc_stats {
size_t zmalloc_used;
size_t process_rss;
size_t allocator_allocated;
size_t allocator_active;
size_t allocator_resident;
} malloc_stats;

/*-----------------------------------------------------------------------------
* Global server state
*----------------------------------------------------------------------------*/
@@ -971,7 +986,7 @@ struct redisServer {
long long slowlog_entry_id; /* SLOWLOG current entry ID */
long long slowlog_log_slower_than; /* SLOWLOG time limit (to get logged) */
unsigned long slowlog_max_len; /* SLOWLOG max number of items logged */
size_t resident_set_size; /* RSS sampled in serverCron(). */
malloc_stats cron_malloc_stats; /* sampled in serverCron(). */
long long stat_net_input_bytes; /* Bytes read from network. */
long long stat_net_output_bytes; /* Bytes written to network. */
size_t stat_rdb_cow_bytes; /* Copy on write bytes during RDB saving. */
29 changes: 26 additions & 3 deletions src/zmalloc.c
@@ -297,10 +297,33 @@ size_t zmalloc_get_rss(void) {
}
#endif

/* Fragmentation = RSS / allocated-bytes */
float zmalloc_get_fragmentation_ratio(size_t rss) {
return (float)rss/zmalloc_used_memory();
#if defined(USE_JEMALLOC)
int zmalloc_get_allocator_info(size_t *allocated,
size_t *active,
size_t *resident) {
size_t epoch = 1, sz = sizeof(size_t);
*allocated = *resident = *active = 0;
/* Update the statistics cached by mallctl. */
je_mallctl("epoch", &epoch, &sz, &epoch, sz);
/* Unlike RSS, this does not include RSS from shared libraries and other non
* heap mappings. */
je_mallctl("stats.resident", resident, &sz, NULL, 0);
/* Unlike resident, this doesn't include the pages jemalloc reserves
* for re-use (purge will clean that). */
je_mallctl("stats.active", active, &sz, NULL, 0);
/* Unlike zmalloc_used_memory, this matches the stats.resident by taking
* into account all allocations done by this process (not only zmalloc). */
je_mallctl("stats.allocated", allocated, &sz, NULL, 0);
return 1;
}
#else
int zmalloc_get_allocator_info(size_t *allocated,
size_t *active,
size_t *resident) {
*allocated = *resident = *active = 0;
return 1;
}
#endif

/* Get the sum of the specified field (converted from kb to bytes) in
* /proc/self/smaps. The field must be specified with trailing ":" as it
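A standalone usage sketch of the new zmalloc_get_allocator_info() helper (not part of the patch); on non-jemalloc builds all three outputs come back as zero, so a caller must handle that case:

    #include <stdio.h>
    #include "zmalloc.h"

    int main(void) {
        size_t allocated = 0, active = 0, resident = 0;
        zmalloc_get_allocator_info(&allocated, &active, &resident);
        if (!allocated) {
            printf("allocator stats not available (non-jemalloc build)\n");
            return 0;
        }
        printf("allocated=%zu active=%zu resident=%zu\n", allocated, active, resident);
        printf("external frag ratio: %.2f, allocator rss ratio: %.2f\n",
               (double)active / allocated, (double)resident / active);
        return 0;
    }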
2 changes: 1 addition & 1 deletion src/zmalloc.h
@@ -79,8 +79,8 @@ void zfree(void *ptr);
char *zstrdup(const char *s);
size_t zmalloc_used_memory(void);
void zmalloc_set_oom_handler(void (*oom_handler)(size_t));
float zmalloc_get_fragmentation_ratio(size_t rss);
size_t zmalloc_get_rss(void);
int zmalloc_get_allocator_info(size_t *allocated, size_t *active, size_t *resident);
size_t zmalloc_get_private_dirty(long pid);
size_t zmalloc_get_smap_bytes_by_field(char *field, long pid);
size_t zmalloc_get_memory_size(void);
2 changes: 2 additions & 0 deletions tests/test_helper.tcl
@@ -452,6 +452,8 @@ for {set j 0} {$j < [llength $argv]} {incr j} {
puts $t
}
exit 0
} elseif {$opt eq {--verbose}} {
set ::verbose 1
} elseif {$opt eq {--client}} {
set ::client 1
set ::test_server_port $arg
