Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[memory_monitoring] Enhance monitoring the memory usage of containers #10008

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 125 additions & 31 deletions files/image_config/monit/memory_checker
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,22 @@ check program container_memory_<container_name> with path "/usr/bin/memory_check
"""

import argparse
import os
import subprocess
import sys
import syslog
import re

CGROUP_DOCKER_MEMORY_DIR = "/sys/fs/cgroup/memory/docker/"


def get_command_result(command):
"""Executes the command and return the resulting output.
"""Executes command and returns command's stdout.

Args:
command: A string contains the command to be executed.

Returns:
A string which contains the output of command.
command_stdout: A string contains command's stdout.
"""
command_stdout = ""

Expand All @@ -53,9 +55,100 @@ def get_command_result(command):
return command_stdout.strip()


def get_container_id(container_name):
    """Gets full container ID of the specified container.

    Runs 'docker ps --no-trunc' and scans its rows for the one whose name
    column holds exactly `container_name`. A plain substring test against the
    whole row would also match other containers whose name merely contains
    `container_name` (e.g. 'swss' vs. 'swss1'), or rows whose image/command
    text happens to contain it, so only whole names are compared.

    Args:
        container_name: A string indicates the name of specified container.

    Returns:
        container_id: A string indicates the full ID of specified container.
        If no row matches, an error is written to syslog and the script
        exits with status 4 instead of returning.
    """
    container_id = ""

    get_container_info_cmd = "docker ps --no-trunc"
    command_stdout = get_command_result(get_container_info_cmd)

    for line in command_stdout.splitlines():
        fields = line.split()
        if not fields:
            # Blank line: nothing to inspect.
            continue

        # NOTE(review): relies on the default 'docker ps' table layout, where
        # the ID is the first column and NAMES is the last one -- confirm no
        # custom 'psFormat' is configured on the device.
        # Several names on one container are shown comma-separated.
        names = fields[-1].split(",")
        if container_name in names:
            container_id = fields[0]
            break

    if not container_id:
        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to get contianer ID of '{}'! Exiting ..."
                      .format(container_name))
        sys.exit(4)

    return container_id


def get_memory_usage(container_id):
    """Fetches a container's memory usage from its cgroup accounting file
    '/sys/fs/cgroup/memory/docker/<container_id>/memory.usage_in_bytes'.

    Args:
        container_id: A string indicates the full ID of a container.

    Returns:
        memory_usage_in_bytes: A string indicates memory usage (Bytes) of a container.
        The script exits with status 5 if the file does not exist and with
        status 6 if reading it produced no output; both cases are logged to
        syslog first.
    """
    usage_file_path = "".join([CGROUP_DOCKER_MEMORY_DIR, container_id, "/memory.usage_in_bytes"])

    # Bail out early when the cgroup file is not there at all.
    if not os.path.exists(usage_file_path):
        missing_file_msg = "[memory_checker] cgroup memory usage file '{}' of container '{}' does not exist on device! Exiting ..."
        syslog.syslog(syslog.LOG_ERR, missing_file_msg.format(usage_file_path, container_id))
        sys.exit(5)

    usage_text = get_command_result("sudo cat {}".format(usage_file_path))
    if usage_text:
        return usage_text

    # The file exists but the read yielded nothing usable.
    empty_read_msg = "[memory_checker] Failed to get the memory usage of container '{}'! Exiting ..."
    syslog.syslog(syslog.LOG_ERR, empty_read_msg.format(container_id))
    sys.exit(6)


def get_cache_usage(container_id):
    """Fetches a container's cache usage, taken from the 'total_inactive_file'
    entry of the cgroup statistics file
    '/sys/fs/cgroup/memory/docker/<container_id>/memory.stat'.

    Args:
        container_id: A string indicates the full ID of a container.

    Returns:
        cache_usage_in_bytes: A string indicates the cache usage (Bytes) of a container.
        The script exits with status 7 if the file does not exist and with
        status 8 if no 'total_inactive_file' value could be extracted; both
        cases are logged to syslog first.
    """
    stat_file_path = "".join([CGROUP_DOCKER_MEMORY_DIR, container_id, "/memory.stat"])

    # Bail out early when the cgroup file is not there at all.
    if not os.path.exists(stat_file_path):
        missing_file_msg = "[memory_checker] cgroup memory statistics file '{}' of container '{}' does not exist on device! Exiting ..."
        syslog.syslog(syslog.LOG_ERR, missing_file_msg.format(stat_file_path, container_id))
        sys.exit(7)

    stat_text = get_command_result("sudo cat {}".format(stat_file_path))

    # Take the second whitespace-separated token of the first line that
    # mentions the field; fall back to "" when no line mentions it.
    cache_text = next(
        (entry.split()[1].strip()
         for entry in stat_text.splitlines()
         if "total_inactive_file" in entry),
        "",
    )
    if cache_text:
        return cache_text

    not_found_msg = "[memory_checker] Failed to get the cache usage of container '{}'! Exiting ..."
    syslog.syslog(syslog.LOG_ERR, not_found_msg.format(container_id))
    sys.exit(8)


def check_memory_usage(container_name, threshold_value):
"""Checks the memory usage of a container and writes an alerting messages into
the syslog if the memory usage is larger than the threshold value.
"""Checks the memory usage of a container from its cgroup subsystem and writes an alerting
messages into the syslog if the memory usage is larger than the threshold value.

Args:
container_name: A string represtents name of a container
Expand All @@ -64,32 +157,33 @@ def check_memory_usage(container_name, threshold_value):
Returns:
None.
"""
command = "docker stats --no-stream --format \{{\{{.MemUsage\}}\}} {}".format(container_name)
command_stdout = get_command_result(command)
mem_usage = command_stdout.split("/")[0].strip()
match_obj = re.match(r"\d+\.?\d*", mem_usage)
if match_obj:
mem_usage_value = float(mem_usage[match_obj.start():match_obj.end()])
mem_usage_unit = mem_usage[match_obj.end():]

mem_usage_bytes = 0.0
if mem_usage_unit == "B":
mem_usage_bytes = mem_usage_value
elif mem_usage_unit == "KiB":
mem_usage_bytes = mem_usage_value * 1024
elif mem_usage_unit == "MiB":
mem_usage_bytes = mem_usage_value * 1024 ** 2
elif mem_usage_unit == "GiB":
mem_usage_bytes = mem_usage_value * 1024 ** 3

if mem_usage_bytes > threshold_value:
print("[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
.format(container_name, mem_usage_bytes, threshold_value))
sys.exit(3)
else:
syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to retrieve memory value from '{}'"
.format(mem_usage))
sys.exit(4)
container_id = get_container_id(container_name)
syslog.syslog(syslog.LOG_INFO, "[memory_checker] Container ID of '{}' is: '{}'."
.format(container_name, container_id))

memory_usage_in_bytes = get_memory_usage(container_id)
syslog.syslog(syslog.LOG_INFO, "[memory_checker] The memory usage of container '{}' is '{}' Bytes!"
.format(container_name, memory_usage_in_bytes))

cache_usage_in_bytes = get_cache_usage(container_id)
syslog.syslog(syslog.LOG_INFO, "[memory_checker] The cache usage of container '{}' is '{}' Bytes!"
.format(container_name, cache_usage_in_bytes))

try:
memory_usage = int(memory_usage_in_bytes)
cache_usage = int(cache_usage_in_bytes)
except ValueError as err:
syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to convert the memory or cache usage in string to integer! Exiting ...")
sys.exit(9)

total_memory_usage = memory_usage - cache_usage
syslog.syslog(syslog.LOG_INFO, "[memory_checker] Total memory usage of container '{}' is '{}' Bytes!"
.format(container_name, total_memory_usage))

if total_memory_usage > threshold_value:
print("[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
.format(container_name, total_memory_usage, threshold_value))
sys.exit(3)


def main():
Expand Down