Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Core] Fix cgroup used memory calculation for Ray memory monitor #43071

Merged
merged 4 commits into from
Feb 14, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
69 changes: 31 additions & 38 deletions python/ray/_private/utils.py
Expand Up @@ -602,49 +602,35 @@ def get_num_cpus(


# TODO(clarng): merge code with c++
def get_cgroupv1_used_memory(memory_stat_filename, memory_usage_filename):
def get_cgroup_used_memory(
memory_stat_filename, memory_usage_filename, inactive_file_key, active_file_key
jjyao marked this conversation as resolved.
Show resolved Hide resolved
):
"""
The calculation logic is the same as `GetCGroupV1MemoryUsedBytes`
The calculation logic is the same as `GetCGroupMemoryUsedBytes`
in `memory_monitor.cc` file.
"""
total_cache_bytes = -1
shmem_used_bytes = -1
inactive_file_bytes = -1
active_file_bytes = -1
with open(memory_stat_filename, "r") as f:
lines = f.readlines()
for line in lines:
if "total_cache " in line:
total_cache_bytes = int(line.split()[1])
elif "total_shmem " in line:
shmem_used_bytes = int(line.split()[1])
if f"{inactive_file_key} " in line:
inactive_file_bytes = int(line.split()[1])
elif f"{active_file_key} " in line:
active_file_bytes = int(line.split()[1])

with open(memory_usage_filename, "r") as f:
lines = f.readlines()
cgroup_usage_in_bytes = int(lines[0].strip())

if total_cache_bytes == -1 or cgroup_usage_in_bytes == -1 or shmem_used_bytes == -1:
if (
inactive_file_bytes == -1
or cgroup_usage_in_bytes == -1
or active_file_bytes == -1
):
return None

return cgroup_usage_in_bytes - (total_cache_bytes - shmem_used_bytes)


def get_cgroupv2_used_memory(stat_file, usage_file):
# Uses same calculation as libcontainer, that is:
# memory.current - memory.stat[inactive_file]
# Source: https://github.com/google/cadvisor/blob/24dd1de08a72cfee661f6178454db995900c0fee/container/libcontainer/handler.go#L836 # noqa: E501
inactive_file_bytes = -1
current_usage = -1
with open(usage_file, "r") as f:
current_usage = int(f.read().strip())
with open(stat_file, "r") as f:
lines = f.readlines()
for line in lines:
if "inactive_file" in line:
inactive_file_bytes = int(line.split()[1])
if current_usage >= 0 and inactive_file_bytes >= 0:
working_set = current_usage - inactive_file_bytes
assert working_set >= 0
return working_set
return None
return cgroup_usage_in_bytes - inactive_file_bytes - active_file_bytes


def get_used_memory():
Expand All @@ -657,21 +643,28 @@ def get_used_memory():
# container.
docker_usage = None
# For cgroups v1:
memory_usage_filename = "/sys/fs/cgroup/memory/memory.usage_in_bytes"
memory_stat_filename = "/sys/fs/cgroup/memory/memory.stat"
memory_usage_filename_v1 = "/sys/fs/cgroup/memory/memory.usage_in_bytes"
memory_stat_filename_v1 = "/sys/fs/cgroup/memory/memory.stat"
# For cgroups v2:
memory_usage_filename_v2 = "/sys/fs/cgroup/memory.current"
memory_stat_filename_v2 = "/sys/fs/cgroup/memory.stat"
if os.path.exists(memory_usage_filename):
docker_usage = get_cgroupv1_used_memory(
memory_stat_filename,
memory_usage_filename,
if os.path.exists(memory_usage_filename_v1) and os.path.exists(
memory_stat_filename_v1
):
docker_usage = get_cgroup_used_memory(
memory_stat_filename_v1,
memory_usage_filename_v1,
"total_inactive_file",
"total_active_file",
)
elif os.path.exists(memory_usage_filename_v2) and os.path.exists(
memory_stat_filename_v2
):
docker_usage = get_cgroupv2_used_memory(
memory_stat_filename_v2, memory_usage_filename_v2
docker_usage = get_cgroup_used_memory(
memory_stat_filename_v2,
memory_usage_filename_v2,
"inactive_file",
"active_file",
)

if docker_usage is not None:
Expand Down
172 changes: 30 additions & 142 deletions src/ray/common/memory_monitor.cc
Expand Up @@ -111,145 +111,24 @@ std::tuple<int64_t, int64_t> MemoryMonitor::GetMemoryBytes() {
return std::tuple(system_used_bytes, system_total_bytes);
}

int64_t MemoryMonitor::GetCGroupV1MemoryUsedBytes(const char *stat_path,
const char *usage_path) {
// How does this function calculate in-use memory from the cgroup memory info files?
// It reads 2 cgroup files:
// mem stat file: /sys/fs/cgroup/memory/memory.stat
// mem usage file: /sys/fs/cgroup/memory/memory.usage_in_bytes
// Formula:
// OS_managed_cache_and_buffer = `memory.stat.total_cache - memory.stat.total_shmem`
// used_memory = `memory.usage_in_bytes` - OS_managed_cache_and_buffer
//
// This value is consistent with values `MemTotal` `MemAvailable` `MemFree` in
// `/proc/meminfo`
// and they have relationship of:
// - `memory.usage_in_bytes` == `MemTotal` - `MemFree`
// - `memory.stat.total_cache` == `MemAvailable` - `MemFree`
// - OS_managed_cache_and_buffer = `memory.stat.total_cache` - `shmem`
//
// Explanation: What is the `OS_managed_cache_and_buffer` memory for, and why
// should we treat this memory as "not in use"?
// Linux tries to make full use of the whole physical memory, so in many
// cases we can observe that the system has very little `MemFree`
// while at the same time it has a large `MemAvailable`.
// The difference, `MemAvailable - MemFree`, is borrowed by the OS
// to cache pages / buffers. However, once a user process requests memory
// and there is not enough free memory, the OS evicts cached data from this
// region and hands the memory to the user process.
//
// Explanation: What is `shmem`, and why should we treat it as "in use"?
// `shmem` overview: https://man7.org/linux/man-pages/man7/shm_overview.7.html
// Ray object store use `/dev/shm`, which is a `tmpfs` file system,
// In linux, `tmpfs` file system uses `shmem` memory.
// Note that `tmpfs` file system might swap data to disk (when option noswap=False)
// but `shmem` value always means the `tmpfs` data size in physical memory.
// We can read `shmem` value from /proc/meminfo `Shmem` item or from
// /sys/fs/cgroup/memory/memory.stat `total_shmem` item.
//
// Explanation: Why not use cgroup
// `memory.stat.total_rss` and `memory.stat.inactive_file_bytes` to compute the
// in-use memory?
// In my tests these values did not yield the correct used-memory number, and
// the official cgroup documentation is vague when describing them.
// In my tests, however, the
// cgroup `memory.usage_in_bytes` value perfectly matches (`MemTotal` - `MemFree`),
// so I am sure that the `memory.usage_in_bytes` value is correct, and the cgroup
// `memory.stat.total_cache` value also perfectly matches
// (`MemAvailable` - `MemFree`).
//
// Correctness testing criteria:
// - [Check cgroup memory.stat value consistency with /proc/meminfo]
// Ensure the calculated value is consistent with values computed from
// /proc/meminfo. Note that the cgroup mem file is consistent with /proc/meminfo
// unless there is a bug in cgroup, i.e. if we read MemTotal / MemAvailable / Shmem
// from /proc/meminfo, then the value calculated by this function
// should equal `MemTotal` - (`MemAvailable` - `Shmem`).
//
// - [OS_managed_cache_and_buffer test]
// Prepare an idle OS environment that has a large number of `free` memory.
// Use dd command to write a large file to disk, after dd completes,
// the calculated used_memory value should keep nearly the same with the value
// before dd execution.
// But note that free memory will decrease significantly because OS
// cached the file data as part of "OS_managed_cache_and_buffer" I mentioned above.
//
// - [/dev/shm test]
// If we use dd command to write a large file to /dev/shm,
// and no swapping occurs (you can use `free -h` to check whether swap size
// increases), after dd completes, the calculated used-memory value should be nearly
// previous_in_used_memory_bytes + bytes_of_written_file
//
// - [Host OS SIGKILL signal test]:
// 1. get current "used_memory" by running this `GetCGroupV1MemoryUsedBytes`
// function.
// 2. get "swap_space_size" by running `free` command
// 3. read "used_swap_size" value by reading "total_swap" item from
// /sys/fs/cgroup/memory/memory.stat
// 4. Create a program that gradually requests to allocate memory,
// record that after it gets allocated memory of "oom_size" bytes,
// the process is killed by OS SIGKILL signal.
// The "oom_size" recorded in step-(4) should approximately satisfy the following
// formula: oom_size ~== (total_physical_memory + swap_space_size) - used_memory -
// used_swap_size
std::ifstream memstat_ifs(stat_path, std::ios::in | std::ios::binary);
if (!memstat_ifs.is_open()) {
RAY_LOG_EVERY_MS(WARNING, kLogIntervalMs) << " file not found: " << stat_path;
return kNull;
}
std::ifstream memusage_ifs(usage_path, std::ios::in | std::ios::binary);
if (!memusage_ifs.is_open()) {
RAY_LOG_EVERY_MS(WARNING, kLogIntervalMs) << " file not found: " << usage_path;
return kNull;
}

std::string line;
std::string title;
int64_t value;

int64_t cgroup_usage_in_bytes;
// The content of "/sys/fs/cgroup/memory/memory.usage_in_bytes" file is
// an integer representing the total memory usage bytes of the container.
std::getline(memusage_ifs, line);
std::istringstream iss(line);
iss >> cgroup_usage_in_bytes;

int64_t total_cache_bytes = kNull;
int64_t shmem_used_bytes = kNull;
while (std::getline(memstat_ifs, line)) {
std::istringstream iss(line);
iss >> title >> value;
if (title == "total_cache") {
total_cache_bytes = value;
} else if (title == "total_shmem") {
shmem_used_bytes = value;
}
}
if (total_cache_bytes == kNull || shmem_used_bytes == kNull) {
RAY_LOG_EVERY_MS(WARNING, kLogIntervalMs)
<< "Failed to parse cgroup v1 mem stat. total cache " << total_cache_bytes
<< " total_shmem " << shmem_used_bytes;
return kNull;
}
int64_t used = cgroup_usage_in_bytes - (total_cache_bytes - shmem_used_bytes);
return used;
}

int64_t MemoryMonitor::GetCGroupV2MemoryUsedBytes(const char *stat_path,
const char *usage_path) {
// Uses same calculation as libcontainer, that is: memory.current -
// memory.stat[inactive_file]. Source:
// https://github.com/google/cadvisor/blob/24dd1de08a72cfee661f6178454db995900c0fee/container/libcontainer/handler.go#L836
int64_t MemoryMonitor::GetCGroupMemoryUsedBytes(const char *stat_path,
const char *usage_path,
const char *inactive_file_key,
const char *active_file_key) {
// CGroup reported memory usage includes file page caches
jjyao marked this conversation as resolved.
Show resolved Hide resolved
// and we should exclude those since they are reclaimable
// by the kernel and are considered available memory from
// the OOM killer's perspective.
std::ifstream memstat_ifs(stat_path, std::ios::in | std::ios::binary);
if (!memstat_ifs.is_open()) {
RAY_LOG_EVERY_MS(WARNING, kLogIntervalMs)
<< " cgroups v2 memory.stat file not found: " << stat_path;
<< " memory stat file not found: " << stat_path;
return kNull;
}
std::ifstream memusage_ifs(usage_path, std::ios::in | std::ios::binary);
if (!memusage_ifs.is_open()) {
RAY_LOG_EVERY_MS(WARNING, kLogIntervalMs)
<< " cgroups v2 memory.current file not found: " << usage_path;
<< " memory usage file not found: " << usage_path;
return kNull;
}

Expand All @@ -258,24 +137,28 @@ int64_t MemoryMonitor::GetCGroupV2MemoryUsedBytes(const char *stat_path,
std::string line;

int64_t inactive_file_bytes = kNull;
int64_t active_file_bytes = kNull;
while (std::getline(memstat_ifs, line)) {
std::istringstream iss(line);
iss >> title >> value;
if (title == kCgroupsV2MemoryStatInactiveKey) {
if (title == inactive_file_key) {
inactive_file_bytes = value;
break;
} else if (title == active_file_key) {
active_file_bytes = value;
}
}

int64_t current_usage_bytes = kNull;
memusage_ifs >> current_usage_bytes;
if (current_usage_bytes == kNull || inactive_file_bytes == kNull) {
if (current_usage_bytes == kNull || inactive_file_bytes == kNull ||
active_file_bytes == kNull) {
RAY_LOG_EVERY_MS(WARNING, kLogIntervalMs)
<< "Failed to parse cgroup v2 memory usage. memory.current "
<< current_usage_bytes << " inactive " << inactive_file_bytes;
<< "Failed to parse cgroup memory usage. memory usage " << current_usage_bytes
<< " inactive file " << inactive_file_bytes << " active file "
<< active_file_bytes;
return kNull;
}
return current_usage_bytes - inactive_file_bytes;
return current_usage_bytes - inactive_file_bytes - active_file_bytes;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the key.

jjyao marked this conversation as resolved.
Show resolved Hide resolved
}

std::tuple<int64_t, int64_t> MemoryMonitor::GetCGroupMemoryBytes() {
Expand All @@ -291,11 +174,16 @@ std::tuple<int64_t, int64_t> MemoryMonitor::GetCGroupMemoryBytes() {
int64_t used_bytes = kNull;
if (std::filesystem::exists(kCgroupsV2MemoryUsagePath) &&
std::filesystem::exists(kCgroupsV2MemoryStatPath)) {
used_bytes =
GetCGroupV2MemoryUsedBytes(kCgroupsV2MemoryStatPath, kCgroupsV2MemoryUsagePath);
} else if (std::filesystem::exists(kCgroupsV1MemoryStatPath)) {
used_bytes =
GetCGroupV1MemoryUsedBytes(kCgroupsV1MemoryStatPath, kCgroupsV1MemoryUsagePath);
used_bytes = GetCGroupMemoryUsedBytes(kCgroupsV2MemoryStatPath,
kCgroupsV2MemoryUsagePath,
kCgroupsV2MemoryStatInactiveFileKey,
kCgroupsV2MemoryStatActiveFileKey);
} else if (std::filesystem::exists(kCgroupsV1MemoryStatPath) &&
std::filesystem::exists(kCgroupsV1MemoryUsagePath)) {
used_bytes = GetCGroupMemoryUsedBytes(kCgroupsV1MemoryStatPath,
kCgroupsV1MemoryUsagePath,
kCgroupsV1MemoryStatInactiveFileKey,
kCgroupsV1MemoryStatActiveFileKey);
}

/// This can be zero if the memory limit is not set for cgroup v2.
Expand Down
21 changes: 11 additions & 10 deletions src/ray/common/memory_monitor.h
Expand Up @@ -91,10 +91,13 @@ class MemoryMonitor {
static constexpr char kCgroupsV1MemoryUsagePath[] =
"/sys/fs/cgroup/memory/memory.usage_in_bytes";
static constexpr char kCgroupsV1MemoryStatPath[] = "/sys/fs/cgroup/memory/memory.stat";
static constexpr char kCgroupsV1MemoryStatInactiveFileKey[] = "total_inactive_file";
static constexpr char kCgroupsV1MemoryStatActiveFileKey[] = "total_active_file";
static constexpr char kCgroupsV2MemoryMaxPath[] = "/sys/fs/cgroup/memory.max";
static constexpr char kCgroupsV2MemoryUsagePath[] = "/sys/fs/cgroup/memory.current";
static constexpr char kCgroupsV2MemoryStatPath[] = "/sys/fs/cgroup/memory.stat";
static constexpr char kCgroupsV2MemoryStatInactiveKey[] = "inactive_file";
static constexpr char kCgroupsV2MemoryStatInactiveFileKey[] = "inactive_file";
static constexpr char kCgroupsV2MemoryStatActiveFileKey[] = "active_file";
static constexpr char kProcDirectory[] = "/proc";
static constexpr char kCommandlinePath[] = "cmdline";
/// The logging frequency. Decoupled from how often the monitor runs.
Expand All @@ -113,18 +116,16 @@ class MemoryMonitor {
/// \return the used and total memory in bytes from Cgroup.
std::tuple<int64_t, int64_t> GetCGroupMemoryBytes();

/// \param path file path to the memory stat file.
///
/// \return the used memory for cgroup v1.
static int64_t GetCGroupV1MemoryUsedBytes(const char *stat_path,
const char *usage_path);

/// \param stat_path file path to the memory.stat file.
/// \param usage_path file path to the memory.current file
/// \return the used memory for cgroup v2. May return negative value, which should be
/// \param inactive_file_key inactive_file key name in memory.stat file
/// \param active_file_key active_file key name in memory.stat file
/// \return the used memory for cgroup. May return negative value, which should be
/// discarded.
static int64_t GetCGroupV2MemoryUsedBytes(const char *stat_path,
const char *usage_path);
static int64_t GetCGroupMemoryUsedBytes(const char *stat_path,
const char *usage_path,
const char *inactive_file_key,
const char *active_file_key);

/// \return the used and total memory in bytes for linux OS.
std::tuple<int64_t, int64_t> GetLinuxMemoryBytes();
Expand Down