Skip to content

Commit

Permalink
Merge pull request #289 from riemann/fix-linux-zfs-arc-memory-usage
Browse files Browse the repository at this point in the history
Fix `riemann-health` memory reporting when using ZFS on Linux
  • Loading branch information
jamtur01 committed May 30, 2024
2 parents 0f583e1 + fda2a2a commit a47f0fd
Show file tree
Hide file tree
Showing 4 changed files with 202 additions and 10 deletions.
41 changes: 39 additions & 2 deletions lib/riemann/tools/health.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class Health
include Riemann::Tools
include Riemann::Tools::Utils

PROC_PID_INIT_INO = 0xEFFFFFFC
SI_UNITS = '_kMGTPEZYRQ'

opt :cpu_warning, 'CPU warning threshold (fraction of total jiffies)', default: 0.9
Expand Down Expand Up @@ -158,6 +159,11 @@ def report_uptime(uptime)
end
end

# Tell whether this process runs in a PID namespace other than the one
# identified by PROC_PID_INIT_INO (presumably the kernel's initial PID
# namespace, i.e. the host), which is taken as a sign of running in a
# container.
#
# The answer is computed on the first call by resolving the
# /proc/self/ns/pid symlink and is memoized afterwards. The explicit nil
# check (rather than ||=) keeps a cached `false` from being re-evaluated.
#
# Returns true or false. Errors raised by File.readlink are not handled
# here and propagate to the caller.
def linux_running_in_container?
  return @linux_running_in_container unless @linux_running_in_container.nil?

  current_pid_namespace = File.readlink('/proc/self/ns/pid')
  @linux_running_in_container = current_pid_namespace != "pid:[#{PROC_PID_INIT_INO}]"
end

def linux_cpu
new = File.read('/proc/stat')
unless new[/cpu\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/]
Expand Down Expand Up @@ -198,13 +204,44 @@ def linux_memory
info[x[0]] = x[1].to_i
end

free = m['MemFree'].to_i + m['Buffers'].to_i + m['Cached'].to_i
total = m['MemTotal'].to_i
free = m['MemFree'] + m['Buffers'] + m['Cached'] + linux_zfs_arc_evictable_memory
total = m['MemTotal']
fraction = 1 - (free.to_f / total)

report_pct :memory, fraction, "used\n\n#{reverse_numeric_sort_with_header(`ps -eo pmem,pid,comm`)}"
end

# Amount of ZFS ARC memory that can be evicted, in kB.
#
# On Linux the ZFS ARC is accounted as used memory instead of cached
# memory (https://github.com/openzfs/zfs/issues/10251), so the evictable
# part of the ARC is gathered here from the ARC statistics. The available
# fields are listed in:
# https://github.com/openzfs/zfs/blob/master/include/sys/arc_impl.h
#
# Returns 0 when running in a container, or when a file read here does not
# exist (Errno::ENOENT), e.g. when the arcstats file is absent.
def linux_zfs_arc_evictable_memory
  # A container can read the host's ARC statistics, which would lead to
  # invalid memory usage reporting: only account for evictable ARC memory
  # on the host system.
  return 0 if linux_running_in_container?

  # Each line is "<name> <type> <data>"; only the name and data columns are
  # kept. Statistics missing from the file read as 0.
  arcstats = Hash.new(0)
  File.readlines('/proc/spl/kstat/zfs/arcstats').each do |line|
    name, _type, data = line.split(/\s+/)
    arcstats[name] = data.to_i
  end

  evictable_stats = %w[
    anon_evictable_data
    anon_evictable_metadata
    mru_evictable_data
    mru_evictable_metadata
    mfu_evictable_data
    mfu_evictable_metadata
    uncached_evictable_data
    uncached_evictable_metadata
  ]
  evictable_bytes = evictable_stats.sum { |stat| arcstats[stat] }

  evictable_bytes / 1024 # bytes -> kB
rescue Errno::ENOENT
  0
end

def freebsd_cpu
u2, n2, s2, t2, i2 = `sysctl -n kern.cp_time 2>/dev/null`.split.map(&:to_i) # FreeBSD has 5 cpu stats

Expand Down
6 changes: 2 additions & 4 deletions lib/riemann/tools/http_check.rb
Original file line number Diff line number Diff line change
Expand Up @@ -128,10 +128,8 @@ def tick
def test_uri_addresses(uri, addresses)
request = get_request(uri)

responses = []

addresses.each do |address|
responses << test_uri_address(uri, address.to_s, request)
responses = addresses.map do |address|
test_uri_address(uri, address.to_s, request)
end

responses.compact!
Expand Down
159 changes: 159 additions & 0 deletions spec/riemann/tools/health_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,165 @@
end
end

# Specs for #linux_running_in_container?. File.readlink is stubbed for
# '/proc/self/ns/pid' so that it returns the `pid_namespace` value defined by
# each context below.
describe '#linux_running_in_container?' do
before do
allow(File).to receive(:readlink).with('/proc/self/ns/pid').and_return(pid_namespace)
end

context 'when running on the host' do
# 4026531836 is 0xEFFFFFFC, the value of Health::PROC_PID_INIT_INO.
let(:pid_namespace) { 'pid:[4026531836]' }

it 'returns the expected value' do
# `be_linux_running_in_container` is the RSpec predicate matcher for
# #linux_running_in_container?.
expect(subject).not_to be_linux_running_in_container
end
end

context 'when running in a container' do
# Any inode other than PROC_PID_INIT_INO is treated as a container.
let(:pid_namespace) { 'pid:[4026532474]' }

it 'returns the expected value' do
expect(subject).to be_linux_running_in_container
end
end
end

describe '#linux_zfs_arc_evictable_memory' do
before do
allow(subject).to receive(:linux_running_in_container?).and_return(false)
allow(File).to receive(:readlines).with('/proc/spl/kstat/zfs/arcstats').and_return(<<~OUTPUT.split("\n"))
12 1 0x01 123 33456 16771914747167 65909736923948
name type data
hits 4 4194887
misses 4 10500
demand_data_hits 4 107986
demand_data_misses 4 2
demand_metadata_hits 4 4058473
demand_metadata_misses 4 9216
prefetch_data_hits 4 28396
prefetch_data_misses 4 1207
prefetch_metadata_hits 4 32
prefetch_metadata_misses 4 75
mru_hits 4 882890
mru_ghost_hits 4 7737
mfu_hits 4 3311966
mfu_ghost_hits 4 2306
deleted 4 42072676
mutex_miss 4 1771
access_skip 4 0
evict_skip 4 1004
evict_not_enough 4 68
evict_l2_cached 4 0
evict_l2_eligible 4 5516808656384
evict_l2_eligible_mfu 4 216467456
evict_l2_eligible_mru 4 5516592188928
evict_l2_ineligible 4 6029312
evict_l2_skip 4 0
hash_elements 4 124644
hash_elements_max 4 158917
hash_collisions 4 1256052
hash_chains 4 1793
hash_chain_max 4 4
p 4 8383553536
c 4 16767066112
c_min 4 1047941632
c_max 4 16767066112
size 4 16791717984
compressed_size 4 16052915712
uncompressed_size 4 16066757120
overhead_size 4 685315584
hdr_size 4 40836960
data_size 4 16724525056
metadata_size 4 13706240
dbuf_size 4 3017856
dnode_size 4 8890304
bonus_size 4 734400
anon_size 4 205520896
anon_evictable_data 4 0
anon_evictable_metadata 4 0
mru_size 4 16532327936
mru_evictable_data 4 15577382912
mru_evictable_metadata 4 2461696
mru_ghost_size 4 225443840
mru_ghost_evictable_data 4 52822016
mru_ghost_evictable_metadata 4 172621824
mfu_size 4 382464
mfu_evictable_data 4 0
mfu_evictable_metadata 4 0
mfu_ghost_size 4 18432
mfu_ghost_evictable_data 4 0
mfu_ghost_evictable_metadata 4 18432
l2_hits 4 0
l2_misses 4 0
l2_prefetch_asize 4 0
l2_mru_asize 4 0
l2_mfu_asize 4 0
l2_bufc_data_asize 4 0
l2_bufc_metadata_asize 4 0
l2_feeds 4 0
l2_rw_clash 4 0
l2_read_bytes 4 0
l2_write_bytes 4 0
l2_writes_sent 4 0
l2_writes_done 4 0
l2_writes_error 4 0
l2_writes_lock_retry 4 0
l2_evict_lock_retry 4 0
l2_evict_reading 4 0
l2_evict_l1cached 4 0
l2_free_on_write 4 0
l2_abort_lowmem 4 0
l2_cksum_bad 4 0
l2_io_error 4 0
l2_size 4 0
l2_asize 4 0
l2_hdr_size 4 0
l2_log_blk_writes 4 0
l2_log_blk_avg_asize 4 0
l2_log_blk_asize 4 0
l2_log_blk_count 4 0
l2_data_to_meta_ratio 4 0
l2_rebuild_success 4 0
l2_rebuild_unsupported 4 0
l2_rebuild_io_errors 4 0
l2_rebuild_dh_errors 4 0
l2_rebuild_cksum_lb_errors 4 0
l2_rebuild_lowmem 4 0
l2_rebuild_size 4 0
l2_rebuild_asize 4 0
l2_rebuild_bufs 4 0
l2_rebuild_bufs_precached 4 0
l2_rebuild_log_blks 4 0
memory_throttle_count 4 0
memory_direct_count 4 0
memory_indirect_count 4 0
memory_all_bytes 4 33534132224
memory_free_bytes 4 14101360640
memory_available_bytes 3 12877639168
arc_no_grow 4 0
arc_tempreserve 4 132096
arc_loaned_bytes 4 0
arc_prune 4 0
arc_meta_used 4 65964976
arc_meta_limit 4 12575299584
arc_dnode_limit 4 1257529958
arc_meta_max 4 129330864
arc_meta_min 4 16777216
async_upgrade_sync 4 2597
demand_hit_predictive_prefetch 4 907
demand_hit_prescient_prefetch 4 0
arc_need_free 4 0
arc_sys_free 4 1223721472
arc_raw_size 4 0
cached_only_in_progress 4 0
abd_chunk_waste_size 4 7168
OUTPUT
end

# In the stubbed arcstats, the only non-zero counters summed by
# #linux_zfs_arc_evictable_memory are mru_evictable_data (15577382912) and
# mru_evictable_metadata (2461696); the *_ghost_evictable_* counters are not
# part of the sum. 15579844608 bytes / 1024 = 15214692 kB.
it 'return the expected size' do
expect(subject.linux_zfs_arc_evictable_memory).to eq(15_214_692)
end
end

describe('#disks') do
before do
allow(subject).to receive(:df).and_return(<<~OUTPUT)
Expand Down
6 changes: 2 additions & 4 deletions tools/riemann-docker/lib/riemann/tools/docker.rb
Original file line number Diff line number Diff line change
Expand Up @@ -171,10 +171,8 @@ def tick
disk if @disk_enabled

# Get CPU, Memory and Load of each container
threads = []

containers.each do |ctr|
threads << Thread.new(ctr) do |container|
threads = containers.map do |ctr|
Thread.new(ctr) do |container|
id = container.id
name = get_container_name(container)

Expand Down

0 comments on commit a47f0fd

Please sign in to comment.