-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[memory_monitoring] Enhance monitoring the memory usage of containers #19179
base: master
Are you sure you want to change the base?
Changes from 6 commits
0acb57a
1e060c2
50f9b21
1a3f383
a0aced0
e967f75
4cbee02
b5bc2b1
d5a3ff5
ac25cc1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,6 +20,7 @@ check program container_memory_<container_name> with path "/usr/bin/memory_check | |
""" | ||
|
||
import argparse | ||
import os | ||
import subprocess | ||
import sys | ||
import syslog | ||
|
@@ -33,6 +34,15 @@ from swsscommon import swsscommon | |
EVENTS_PUBLISHER_SOURCE = "sonic-events-host" | ||
EVENTS_PUBLISHER_TAG = "mem-threshold-exceeded" | ||
|
||
CGROUP_DOCKER_MEMORY_DIR = "/sys/fs/cgroup/memory/docker/" | ||
|
||
# Define common error message templates (each '{}' is filled in when the message is logged) | ||
ERROR_CONTAINER_ID_NOT_FOUND = "[memory_checker] Failed to get container ID of '{}'! Exiting ..." | ||
ERROR_CGROUP_MEMORY_USAGE_NOT_FOUND = "[memory_checker] cgroup memory usage file '{}' of container '{}' does not exist on device! Exiting ..." | ||
ERROR_CONTAINER_MEMORY_USAGE_NOT_FOUND = "[memory_checker] Failed to get the memory usage of container '{}'! Exiting ..." | ||
ERROR_CONTAINER_CACHE_USAGE_NOT_FOUND = "[memory_checker] Failed to get the cache usage of container '{}'! Exiting ..." | ||
ERROR_CGROUP_MEMORY_STATS_NOT_FOUND = "[memory_checker] cgroup memory statistics file '{}' of container '{}' does not exist on device! Exiting ..." | ||
|
||
def get_command_result(command): | ||
"""Executes the command and return the resulting output. | ||
|
||
|
@@ -59,6 +69,81 @@ def get_command_result(command): | |
|
||
return command_stdout.strip() | ||
|
||
def get_container_id(container_name):
    """Gets the full (untruncated) ID of the specified running container.

    Args:
        container_name: A string indicates the name of specified container.

    Returns:
        container_id: A string indicates the full ID of specified container.

    Exits:
        Logs ERROR_CONTAINER_ID_NOT_FOUND to syslog and exits with status 4
        if no running container carries exactly this name.
    """
    # Pass the command as an argument list: a single command string only works
    # when the subprocess call goes through a shell.
    # '--format' limits each output row to "<full id> <names>", so the name can
    # be compared exactly instead of searching the whole row (image, command,
    # ports, ...) for a substring.
    get_container_info_cmd = ["docker", "ps", "--no-trunc", "--format", "{{.ID}} {{.Names}}"]
    command_stdout = get_command_result(get_container_info_cmd)

    container_id = ""
    for line in command_stdout.splitlines():
        fields = line.split()
        if len(fields) < 2:
            continue
        # docker may report several comma-separated names for one container.
        if container_name in fields[1].split(","):
            container_id = fields[0]
            break

    if not container_id:
        syslog.syslog(syslog.LOG_ERR, ERROR_CONTAINER_ID_NOT_FOUND.format(container_name))
        # Exit status 4 signals that the container ID could not be resolved.
        sys.exit(4)

    return container_id
|
||
def get_memory_usage(container_id):
    """Reads the container's memory usage from the control group subsystem's file
    '/sys/fs/cgroup/memory/docker/<container_id>/memory.usage_in_bytes'.

    Args:
        container_id: A string indicates the full ID of a container.

    Returns:
        memory_usage_in_bytes: A string indicates memory usage (Bytes) of a container.

    Exits:
        Status 5 if the cgroup usage file does not exist, status 6 if reading
        it produced no output. Both cases are logged to syslog first.
    """
    docker_memory_usage_file_path = os.path.join(
        CGROUP_DOCKER_MEMORY_DIR, container_id, "memory.usage_in_bytes")
    if not os.path.exists(docker_memory_usage_file_path):
        syslog.syslog(syslog.LOG_ERR,
                      ERROR_CGROUP_MEMORY_USAGE_NOT_FOUND.format(docker_memory_usage_file_path, container_id))
        sys.exit(5)

    # Pass the command as an argument list: a single command string only works
    # when the subprocess call goes through a shell.
    get_memory_usage_cmd = ["sudo", "cat", docker_memory_usage_file_path]
    memory_usage_in_bytes = get_command_result(get_memory_usage_cmd)
    if not memory_usage_in_bytes:
        syslog.syslog(syslog.LOG_ERR, ERROR_CONTAINER_MEMORY_USAGE_NOT_FOUND.format(container_id))
        sys.exit(6)

    return memory_usage_in_bytes
|
||
def get_inactive_cache_usage(container_id):
    """Reads the container's cache usage from the field 'total_inactive_file' in control
    group subsystem's file '/sys/fs/cgroup/memory/docker/<container_id>/memory.stat'.

    Args:
        container_id: A string indicates the full ID of a container.

    Returns:
        cache_usage_in_bytes: A string indicates the cache usage (Bytes) of a container.

    Exits:
        Status 7 if the cgroup statistics file does not exist, status 8 if the
        'total_inactive_file' field cannot be found in it. Both cases are
        logged to syslog first.
    """
    docker_memory_stat_file_path = os.path.join(
        CGROUP_DOCKER_MEMORY_DIR, container_id, "memory.stat")
    if not os.path.exists(docker_memory_stat_file_path):
        syslog.syslog(syslog.LOG_ERR,
                      ERROR_CGROUP_MEMORY_STATS_NOT_FOUND.format(docker_memory_stat_file_path, container_id))
        sys.exit(7)

    # Pass the command as an argument list: a single command string only works
    # when the subprocess call goes through a shell.
    get_inactive_cache_usage_cmd = ["sudo", "cat", docker_memory_stat_file_path]
    command_stdout = get_command_result(get_inactive_cache_usage_cmd)

    cache_usage_in_bytes = ""
    for line in command_stdout.splitlines():
        fields = line.split()
        # Each statistics row is "<name> <value>". Match the field name exactly
        # and skip malformed rows instead of indexing past the end of the list.
        if len(fields) == 2 and fields[0] == "total_inactive_file":
            cache_usage_in_bytes = fields[1]
            break

    if not cache_usage_in_bytes:
        syslog.syslog(syslog.LOG_ERR, ERROR_CONTAINER_CACHE_USAGE_NOT_FOUND.format(container_id))
        sys.exit(8)

    return cache_usage_in_bytes
|
||
def publish_events(container_name, mem_usage_bytes, threshold_value): | ||
events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE) | ||
|
@@ -71,8 +156,8 @@ def publish_events(container_name, mem_usage_bytes, threshold_value): | |
|
||
|
||
def check_memory_usage(container_name, threshold_value):
    """Checks the memory usage of a container from its cgroup subsystem and writes an alerting
    message into the syslog if the memory usage is larger than the threshold value.

    Args:
        container_name: A string represents name of a container
        threshold_value: The threshold, in Bytes, that the memory usage is compared against.

    Returns:
        None.

    Exits:
        Status 3 if the memory usage is larger than the threshold, status 9 if
        the values read from the cgroup files are not integers. The helper
        functions called here exit with their own status codes when the
        container ID or the cgroup files cannot be read.
    """
    container_id = get_container_id(container_name)
    syslog.syslog(syslog.LOG_INFO, "[memory_checker] Container ID of '{}' is: '{}'."
                  .format(container_name, container_id))

    memory_usage_in_bytes = get_memory_usage(container_id)
    syslog.syslog(syslog.LOG_INFO, "[memory_checker] The memory usage of container '{}' is '{}' Bytes!"
                  .format(container_name, memory_usage_in_bytes))

    # Only the inactive file cache ('total_inactive_file') counts as cache here;
    # per the review discussion this mirrors how `docker stats` derives usage.
    cache_usage_in_bytes = get_inactive_cache_usage(container_id)
    syslog.syslog(syslog.LOG_INFO, "[memory_checker] The cache usage of container '{}' is '{}' Bytes!"
                  .format(container_name, cache_usage_in_bytes))

    try:
        memory_usage = int(memory_usage_in_bytes)
        cache_usage = int(cache_usage_in_bytes)
    except ValueError:
        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to convert the memory or cache usage in string to integer! Exiting ...")
        sys.exit(9)

    total_memory_usage = memory_usage - cache_usage
    syslog.syslog(syslog.LOG_INFO, "[memory_checker] Total memory usage of container '{}' is '{}' Bytes!"
                  .format(container_name, total_memory_usage))

    if total_memory_usage > threshold_value:
        print("[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
              .format(container_name, total_memory_usage, threshold_value))
        # Keep the syslog alert and the published event that accompanied the
        # printed message before the switch to cgroup-based measurement.
        syslog.syslog(syslog.LOG_INFO, "[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
                      .format(container_name, total_memory_usage, threshold_value))
        # publish event
        publish_events(container_name, "{:.2f}".format(total_memory_usage), str(threshold_value))
        sys.exit(3)
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @FengPan-Frank could you add more details about the changes in the description? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done. |
||
|
||
def is_service_active(service_name): | ||
"""Test if service is running. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I recall that in the `get_command_result()` function, when using subprocess with shell=False, the command should be provided as a list of strings rather than a single string. Could you double-check this? The same applies to the other places.
Quick test
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks! Fixed and added verification test result.