From 0acb57aaf4df648feebfda9c5339f1d3337f72e3 Mon Sep 17 00:00:00 2001
From: Feng Pan
Date: Mon, 3 Jun 2024 08:58:15 +0000
Subject: [PATCH 1/2] [memory_monitoring] Enhance monitoring the memory usage of containers

---
Reviewer notes (text between "---" and the diff is ignored by `git am`):
- The line structure of this patch was rebuilt from a whitespace-collapsed
  copy. Blank context lines and indentation were inferred from the hunk line
  counts, so run `git apply --check` against the base blob before merging.
- The author address after "From:" was already missing in that copy.
- get_command_result() is now called with argv lists, like the "docker stats"
  call this patch removes, instead of single command strings.
  NOTE(review): the helper's body is not part of this diff - confirm it runs
  the command without a shell.
- The container lookup compares the NAMES column exactly instead of
  substring-matching the whole `docker ps` row.
- The syslog alert and the publish_events() call on a threshold breach are
  kept; the original revision of this patch dropped both.
- Small cleanups: "contianer" typo in a log message, unused "as err".
- The diffstat and the last hunk header account for the four restored lines;
  the post-image blob id on the index line was not regenerated.

 files/image_config/monit/memory_checker | 148 ++++++++++++++++++------
 1 file changed, 116 insertions(+), 32 deletions(-)

diff --git a/files/image_config/monit/memory_checker b/files/image_config/monit/memory_checker
index e5bfe4e3864e..f4d614f5a527 100755
--- a/files/image_config/monit/memory_checker
+++ b/files/image_config/monit/memory_checker
@@ -20,6 +20,7 @@ check program container_memory_<container_name> with path "/usr/bin/memory_check
 """
 
 import argparse
+import os
 import subprocess
 import sys
 import syslog
@@ -33,6 +34,8 @@ from swsscommon import swsscommon
 EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
 EVENTS_PUBLISHER_TAG = "mem-threshold-exceeded"
 
+CGROUP_DOCKER_MEMORY_DIR = "/sys/fs/cgroup/memory/docker/"
+
 def get_command_result(command):
     """Executes the command and return the resulting output.
 
@@ -59,6 +62,87 @@ def get_command_result(command):
 
     return command_stdout.strip()
 
+def get_container_id(container_name):
+    """Gets full container ID of the specified container
+    Args:
+        container_name: A string indicates the name of specified container.
+    Returns:
+        container_id: A string indicates the full ID of specified container.
+    """
+    container_id = ""
+
+    get_container_info_cmd = ["docker", "ps", "--no-trunc"]
+    command_stdout = get_command_result(get_container_info_cmd)
+
+    for line in command_stdout.splitlines():
+        if line.split()[-1:] == [container_name]:  # NAMES is the last column
+            container_id = line.split()[0].strip()
+            break
+
+    if not container_id:
+        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to get container ID of '{}'! Exiting ..."
+                      .format(container_name))
+        sys.exit(4)
+
+    return container_id
+
+def get_memory_usage(container_id):
+    """Reads the container's memory usage from the control group subsystem's file
+    '/sys/fs/cgroup/memory/docker/<container_id>/memory.usage_in_bytes'.
+    Args:
+        container_id: A string indicates the full ID of a container.
+    Returns:
+        memory_usage_in_bytes: A string indicates memory usage (Bytes) of a container.
+    """
+    memory_usage_in_bytes = ""
+
+    docker_memory_usage_file_path = CGROUP_DOCKER_MEMORY_DIR + container_id + "/memory.usage_in_bytes"
+    if not os.path.exists(docker_memory_usage_file_path):
+        syslog.syslog(syslog.LOG_ERR,
+                      "[memory_checker] cgroup memory usage file '{}' of container '{}' does not exist on device! Exiting ..."
+                      .format(docker_memory_usage_file_path, container_id))
+        sys.exit(5)
+
+    get_memory_usage_cmd = ["sudo", "cat", docker_memory_usage_file_path]
+    memory_usage_in_bytes = get_command_result(get_memory_usage_cmd)
+    if not memory_usage_in_bytes:
+        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to get the memory usage of container '{}'! Exiting ..."
+                      .format(container_id))
+        sys.exit(6)
+
+    return memory_usage_in_bytes
+
+def get_cache_usage(container_id):
+    """Reads the container's cache usage from the field 'total_inactive_file' in control
+    group subsystem's file '/sys/fs/cgroup/memory/docker/<container_id>/memory.stat'.
+    Args:
+        container_id: A string indicates the full ID of a container.
+    Returns:
+        cache_usage_in_bytes: A string indicates the cache usage (Bytes) of a container.
+    """
+    cache_usage_in_bytes = ""
+
+    docker_memory_stat_file_path = CGROUP_DOCKER_MEMORY_DIR + container_id + "/memory.stat"
+    if not os.path.exists(docker_memory_stat_file_path):
+        syslog.syslog(syslog.LOG_ERR,
+                      "[memory_checker] cgroup memory statistics file '{}' of container '{}' does not exist on device! Exiting ..."
+                      .format(docker_memory_stat_file_path, container_id))
+        sys.exit(7)
+
+    get_cache_usage_cmd = ["sudo", "cat", docker_memory_stat_file_path]
+    command_stdout = get_command_result(get_cache_usage_cmd)
+
+    for line in command_stdout.splitlines():
+        if "total_inactive_file" in line:
+            cache_usage_in_bytes = line.split()[1].strip()
+            break
+
+    if not cache_usage_in_bytes:
+        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to get the cache usage of container '{}'! Exiting ..."
+                      .format(container_id))
+        sys.exit(8)
+
+    return cache_usage_in_bytes
 
 def publish_events(container_name, mem_usage_bytes, threshold_value):
     events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
@@ -71,8 +155,8 @@ def publish_events(container_name, mem_usage_bytes, threshold_value):
 
 
 def check_memory_usage(container_name, threshold_value):
-    """Checks the memory usage of a container and writes an alerting messages into
-    the syslog if the memory usage is larger than the threshold value.
+    """Checks the memory usage of a container from its cgroup subsystem and writes an alerting
+    messages into the syslog if the memory usage is larger than the threshold value.
 
     Args:
         container_name: A string represtents name of a container
@@ -81,37 +165,37 @@ def check_memory_usage(container_name, threshold_value):
     Returns:
         None.
     """
-    command = ["docker", "stats", "--no-stream", "--format", "{{.MemUsage}}", container_name]
-    command_stdout = get_command_result(command)
-    mem_usage = command_stdout.split("/")[0].strip()
-    match_obj = re.match(r"\d+\.?\d*", mem_usage)
-    if match_obj:
-        mem_usage_value = float(mem_usage[match_obj.start():match_obj.end()])
-        mem_usage_unit = mem_usage[match_obj.end():]
-
-        mem_usage_bytes = 0.0
-        if mem_usage_unit == "B":
-            mem_usage_bytes = mem_usage_value
-        elif mem_usage_unit == "KiB":
-            mem_usage_bytes = mem_usage_value * 1024
-        elif mem_usage_unit == "MiB":
-            mem_usage_bytes = mem_usage_value * 1024 ** 2
-        elif mem_usage_unit == "GiB":
-            mem_usage_bytes = mem_usage_value * 1024 ** 3
-
-        if mem_usage_bytes > threshold_value:
-            print("[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
-                  .format(container_name, mem_usage_bytes, threshold_value))
-            syslog.syslog(syslog.LOG_INFO, "[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
-                          .format(container_name, mem_usage_bytes, threshold_value))
-            # publish event
-            publish_events(container_name, "{:.2f}".format(mem_usage_bytes), str(threshold_value))
-            sys.exit(3)
-    else:
-        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to retrieve memory value from '{}'"
-                      .format(mem_usage))
-        sys.exit(4)
+    container_id = get_container_id(container_name)
+    syslog.syslog(syslog.LOG_INFO, "[memory_checker] Container ID of '{}' is: '{}'."
+                  .format(container_name, container_id))
+
+    memory_usage_in_bytes = get_memory_usage(container_id)
+    syslog.syslog(syslog.LOG_INFO, "[memory_checker] The memory usage of container '{}' is '{}' Bytes!"
+                  .format(container_name, memory_usage_in_bytes))
+    cache_usage_in_bytes = get_cache_usage(container_id)
+    syslog.syslog(syslog.LOG_INFO, "[memory_checker] The cache usage of container '{}' is '{}' Bytes!"
+                  .format(container_name, cache_usage_in_bytes))
+
+    try:
+        memory_usage = int(memory_usage_in_bytes)
+        cache_usage = int(cache_usage_in_bytes)
+    except ValueError:
+        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to convert the memory or cache usage in string to integer! Exiting ...")
+        sys.exit(9)
+
+    total_memory_usage = memory_usage - cache_usage
+    syslog.syslog(syslog.LOG_INFO, "[memory_checker] Total memory usage of container '{}' is '{}' Bytes!"
+                  .format(container_name, total_memory_usage))
+
+    if total_memory_usage > threshold_value:
+        print("[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
+              .format(container_name, total_memory_usage, threshold_value))
+        syslog.syslog(syslog.LOG_INFO, "[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
+                      .format(container_name, total_memory_usage, threshold_value))
+        # publish event
+        publish_events(container_name, "{:.2f}".format(total_memory_usage), str(threshold_value))
+        sys.exit(3)
 
 
 def is_service_active(service_name):
     """Test if service is running.

From 1e060c294dee2c0a9d7f3baad8d9aa5d05b99b0b Mon Sep 17 00:00:00 2001
From: Feng Pan
Date: Mon, 3 Jun 2024 08:58:15 +0000
Subject: [PATCH 2/2] [memory_monitoring] Enhance monitoring the memory usage of containers

---
Reviewer notes (text between "---" and the diff is ignored by `git am`):
- NOTE(review): this patch is generated against the same base blob
  (e5bfe4e3864e) as 1/2, so it cannot apply on top of 1/2 - TODO confirm
  which of the two revisions is meant to land.
- The line structure of this patch was rebuilt from a whitespace-collapsed
  copy. Blank context lines and indentation were inferred from the hunk line
  counts, so run `git apply --check` against the base blob before merging.
- The author address after "From:" was already missing in that copy.
- get_command_result() is now called with argv lists, like the "docker stats"
  call this patch removes, instead of single command strings.
  NOTE(review): the helper's body is not part of this diff - confirm it runs
  the command without a shell.
- The container lookup compares the NAMES column exactly instead of
  substring-matching the whole `docker ps` row.
- The syslog alert and the publish_events() call on a threshold breach are
  kept; the original revision of this patch dropped both.
- The diffstat and the last hunk header account for the four restored lines;
  the post-image blob id on the index line was not regenerated.

 files/image_config/monit/memory_checker | 149 ++++++++++++++++++------
 1 file changed, 117 insertions(+), 32 deletions(-)

diff --git a/files/image_config/monit/memory_checker b/files/image_config/monit/memory_checker
index e5bfe4e3864e..23f98ce25af7 100755
--- a/files/image_config/monit/memory_checker
+++ b/files/image_config/monit/memory_checker
@@ -20,6 +20,7 @@ check program container_memory_<container_name> with path "/usr/bin/memory_check
 """
 
 import argparse
+import os
 import subprocess
 import sys
 import syslog
@@ -33,6 +34,15 @@ from swsscommon import swsscommon
 EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
 EVENTS_PUBLISHER_TAG = "mem-threshold-exceeded"
 
+CGROUP_DOCKER_MEMORY_DIR = "/sys/fs/cgroup/memory/docker/"
+
+# Common error message templates (filled in with str.format() before logging)
+ERROR_CONTAINER_ID_NOT_FOUND = "[memory_checker] Failed to get container ID of '{}'! Exiting ..."
+ERROR_CGROUP_MEMORY_USAGE_NOT_FOUND = "[memory_checker] cgroup memory usage file '{}' of container '{}' does not exist on device! Exiting ..."
+ERROR_CONTAINER_MEMORY_USAGE_NOT_FOUND = "[memory_checker] Failed to get the memory usage of container '{}'! Exiting ..."
+ERROR_CONTAINER_CACHE_USAGE_NOT_FOUND = "[memory_checker] Failed to get the cache usage of container '{}'! Exiting ..."
+ERROR_CGROUP_MEMORY_STATS_NOT_FOUND = "[memory_checker] cgroup memory statistics file '{}' of container '{}' does not exist on device! Exiting ..."
+
 def get_command_result(command):
     """Executes the command and return the resulting output.
 
@@ -59,6 +69,81 @@ def get_command_result(command):
 
     return command_stdout.strip()
 
+def get_container_id(container_name):
+    """Gets full container ID of the specified container
+    Args:
+        container_name: A string indicates the name of specified container.
+    Returns:
+        container_id: A string indicates the full ID of specified container.
+    """
+    container_id = ""
+
+    get_container_info_cmd = ["docker", "ps", "--no-trunc"]
+    command_stdout = get_command_result(get_container_info_cmd)
+
+    for line in command_stdout.splitlines():
+        if line.split()[-1:] == [container_name]:  # NAMES is the last column
+            container_id = line.split()[0].strip()
+            break
+
+    if not container_id:
+        syslog.syslog(syslog.LOG_ERR, ERROR_CONTAINER_ID_NOT_FOUND.format(container_name))
+
+        sys.exit(4)
+
+    return container_id
+
+def get_memory_usage(container_id):
+    """Reads the container's memory usage from the control group subsystem's file
+    '/sys/fs/cgroup/memory/docker/<container_id>/memory.usage_in_bytes'.
+    Args:
+        container_id: A string indicates the full ID of a container.
+    Returns:
+        memory_usage_in_bytes: A string indicates memory usage (Bytes) of a container.
+    """
+    memory_usage_in_bytes = ""
+
+    docker_memory_usage_file_path = CGROUP_DOCKER_MEMORY_DIR + container_id + "/memory.usage_in_bytes"
+    if not os.path.exists(docker_memory_usage_file_path):
+        syslog.syslog(syslog.LOG_ERR, ERROR_CGROUP_MEMORY_USAGE_NOT_FOUND.format(docker_memory_usage_file_path, container_id))
+        sys.exit(5)
+
+    get_memory_usage_cmd = ["sudo", "cat", docker_memory_usage_file_path]
+    memory_usage_in_bytes = get_command_result(get_memory_usage_cmd)
+    if not memory_usage_in_bytes:
+        syslog.syslog(syslog.LOG_ERR, ERROR_CONTAINER_MEMORY_USAGE_NOT_FOUND.format(container_id))
+        sys.exit(6)
+
+    return memory_usage_in_bytes
+
+def get_cache_usage(container_id):
+    """Reads the container's cache usage from the field 'total_inactive_file' in control
+    group subsystem's file '/sys/fs/cgroup/memory/docker/<container_id>/memory.stat'.
+    Args:
+        container_id: A string indicates the full ID of a container.
+    Returns:
+        cache_usage_in_bytes: A string indicates the cache usage (Bytes) of a container.
+    """
+    cache_usage_in_bytes = ""
+
+    docker_memory_stat_file_path = CGROUP_DOCKER_MEMORY_DIR + container_id + "/memory.stat"
+    if not os.path.exists(docker_memory_stat_file_path):
+        syslog.syslog(syslog.LOG_ERR, ERROR_CGROUP_MEMORY_STATS_NOT_FOUND.format(docker_memory_stat_file_path, container_id))
+        sys.exit(7)
+
+    get_cache_usage_cmd = ["sudo", "cat", docker_memory_stat_file_path]
+    command_stdout = get_command_result(get_cache_usage_cmd)
+
+    for line in command_stdout.splitlines():
+        if "total_inactive_file" in line:
+            cache_usage_in_bytes = line.split()[1].strip()
+            break
+
+    if not cache_usage_in_bytes:
+        syslog.syslog(syslog.LOG_ERR, ERROR_CONTAINER_CACHE_USAGE_NOT_FOUND.format(container_id))
+        sys.exit(8)
+
+    return cache_usage_in_bytes
 
 def publish_events(container_name, mem_usage_bytes, threshold_value):
     events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
@@ -71,8 +156,8 @@ def publish_events(container_name, mem_usage_bytes, threshold_value):
 
 
 def check_memory_usage(container_name, threshold_value):
-    """Checks the memory usage of a container and writes an alerting messages into
-    the syslog if the memory usage is larger than the threshold value.
+    """Checks the memory usage of a container from its cgroup subsystem and writes an alerting
+    messages into the syslog if the memory usage is larger than the threshold value.
 
     Args:
         container_name: A string represtents name of a container
@@ -81,37 +166,37 @@ def check_memory_usage(container_name, threshold_value):
     Returns:
         None.
     """
-    command = ["docker", "stats", "--no-stream", "--format", "{{.MemUsage}}", container_name]
-    command_stdout = get_command_result(command)
-    mem_usage = command_stdout.split("/")[0].strip()
-    match_obj = re.match(r"\d+\.?\d*", mem_usage)
-    if match_obj:
-        mem_usage_value = float(mem_usage[match_obj.start():match_obj.end()])
-        mem_usage_unit = mem_usage[match_obj.end():]
-
-        mem_usage_bytes = 0.0
-        if mem_usage_unit == "B":
-            mem_usage_bytes = mem_usage_value
-        elif mem_usage_unit == "KiB":
-            mem_usage_bytes = mem_usage_value * 1024
-        elif mem_usage_unit == "MiB":
-            mem_usage_bytes = mem_usage_value * 1024 ** 2
-        elif mem_usage_unit == "GiB":
-            mem_usage_bytes = mem_usage_value * 1024 ** 3
-
-        if mem_usage_bytes > threshold_value:
-            print("[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
-                  .format(container_name, mem_usage_bytes, threshold_value))
-            syslog.syslog(syslog.LOG_INFO, "[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
-                          .format(container_name, mem_usage_bytes, threshold_value))
-            # publish event
-            publish_events(container_name, "{:.2f}".format(mem_usage_bytes), str(threshold_value))
-            sys.exit(3)
-    else:
-        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to retrieve memory value from '{}'"
-                      .format(mem_usage))
-        sys.exit(4)
+    container_id = get_container_id(container_name)
+    syslog.syslog(syslog.LOG_INFO, "[memory_checker] Container ID of '{}' is: '{}'."
+                  .format(container_name, container_id))
+
+    memory_usage_in_bytes = get_memory_usage(container_id)
+    syslog.syslog(syslog.LOG_INFO, "[memory_checker] The memory usage of container '{}' is '{}' Bytes!"
+                  .format(container_name, memory_usage_in_bytes))
+    cache_usage_in_bytes = get_cache_usage(container_id)
+    syslog.syslog(syslog.LOG_INFO, "[memory_checker] The cache usage of container '{}' is '{}' Bytes!"
+                  .format(container_name, cache_usage_in_bytes))
+
+    try:
+        memory_usage = int(memory_usage_in_bytes)
+        cache_usage = int(cache_usage_in_bytes)
+    except ValueError:
+        syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to convert the memory or cache usage in string to integer! Exiting ...")
+        sys.exit(9)
+
+    total_memory_usage = memory_usage - cache_usage
+    syslog.syslog(syslog.LOG_INFO, "[memory_checker] Total memory usage of container '{}' is '{}' Bytes!"
+                  .format(container_name, total_memory_usage))
+
+    if total_memory_usage > threshold_value:
+        print("[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
+              .format(container_name, total_memory_usage, threshold_value))
+        syslog.syslog(syslog.LOG_INFO, "[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
+                      .format(container_name, total_memory_usage, threshold_value))
+        # publish event
+        publish_events(container_name, "{:.2f}".format(total_memory_usage), str(threshold_value))
+        sys.exit(3)
 
 
 def is_service_active(service_name):
     """Test if service is running.