Skip to content

Commit

Permalink
Disk monitor improvements
Browse files Browse the repository at this point in the history
Related to VESC-1015

* Remove `infinity` timeouts
* Improve free disk space retrieval on win32

Run commands with a timeout

This PR fixes an issue I observed while reproducing VESC-1015 on Windows
10. Within an hour or so of running a 3-node cluster that has health
checks being run against it, one or more nodes' memory use would spike.
I would see that the rabbit_disk_monitor process was stuck executing
os:cmd to retrieve free disk space information. As a result, all
gen_server:call calls to the process would never return, especially
since they used an infinity timeout.

Do something with timeout

Fix unit_disk_monitor_mocks_SUITE
  • Loading branch information
lukebakken committed Dec 16, 2021
1 parent ea39234 commit 545da1a
Showing 1 changed file with 75 additions and 37 deletions.
112 changes: 75 additions & 37 deletions deps/rabbit/src/rabbit_disk_monitor.erl
Expand Up @@ -75,42 +75,42 @@
%% Ask the ?MODULE gen_server for its current disk free limit.
%% NOTE(review): two call lines are shown below - the explicit `infinity`
%% timeout form and the gen_server:call/2 form (OTP default timeout). The
%% commit description says `infinity` timeouts are removed, so the second
%% line presumably replaces the first.
-spec get_disk_free_limit() -> integer().

get_disk_free_limit() ->
gen_server:call(?MODULE, get_disk_free_limit, infinity).
gen_server:call(?MODULE, get_disk_free_limit).

%% Send a new disk free limit to the ?MODULE gen_server.
%% NOTE(review): two call lines are shown below - the explicit `infinity`
%% timeout form and the gen_server:call/2 form (OTP default timeout). The
%% commit description says `infinity` timeouts are removed, so the second
%% line presumably replaces the first.
-spec set_disk_free_limit(disk_free_limit()) -> 'ok'.

set_disk_free_limit(Limit) ->
gen_server:call(?MODULE, {set_disk_free_limit, Limit}, infinity).
gen_server:call(?MODULE, {set_disk_free_limit, Limit}).

%% Ask the ?MODULE gen_server for its minimum check interval.
%% NOTE(review): two call lines are shown below - the explicit `infinity`
%% timeout form and the gen_server:call/2 form (OTP default timeout). The
%% commit description says `infinity` timeouts are removed, so the second
%% line presumably replaces the first.
-spec get_min_check_interval() -> integer().

get_min_check_interval() ->
gen_server:call(?MODULE, get_min_check_interval, infinity).
gen_server:call(?MODULE, get_min_check_interval).

%% Send a new minimum check interval to the ?MODULE gen_server.
%% NOTE(review): two call lines are shown below - the explicit `infinity`
%% timeout form and the gen_server:call/2 form (OTP default timeout). The
%% commit description says `infinity` timeouts are removed, so the second
%% line presumably replaces the first.
-spec set_min_check_interval(integer()) -> 'ok'.

set_min_check_interval(Interval) ->
gen_server:call(?MODULE, {set_min_check_interval, Interval}, infinity).
gen_server:call(?MODULE, {set_min_check_interval, Interval}).

%% Ask the ?MODULE gen_server for its maximum check interval.
%% NOTE(review): two call lines are shown below - the explicit `infinity`
%% timeout form and the gen_server:call/2 form (OTP default timeout). The
%% commit description says `infinity` timeouts are removed, so the second
%% line presumably replaces the first.
-spec get_max_check_interval() -> integer().

get_max_check_interval() ->
gen_server:call(?MODULE, get_max_check_interval, infinity).
gen_server:call(?MODULE, get_max_check_interval).

%% Send a new maximum check interval to the ?MODULE gen_server.
%% NOTE(review): two call lines are shown below - the explicit `infinity`
%% timeout form and the gen_server:call/2 form (OTP default timeout). The
%% commit description says `infinity` timeouts are removed, so the second
%% line presumably replaces the first.
-spec set_max_check_interval(integer()) -> 'ok'.

set_max_check_interval(Interval) ->
gen_server:call(?MODULE, {set_max_check_interval, Interval}, infinity).
gen_server:call(?MODULE, {set_max_check_interval, Interval}).

%% Ask the ?MODULE gen_server for the free disk space it currently holds;
%% per the spec this is an integer or the atom 'unknown'.
%% NOTE(review): two call lines are shown below - the explicit `infinity`
%% timeout form and the gen_server:call/2 form (OTP default timeout). The
%% commit description says `infinity` timeouts are removed, so the second
%% line presumably replaces the first.
-spec get_disk_free() -> (integer() | 'unknown').

get_disk_free() ->
gen_server:call(?MODULE, get_disk_free, infinity).
gen_server:call(?MODULE, get_disk_free).

%% Send the `Enabled` setting to the ?MODULE gen_server.
%% NOTE(review): two call lines are shown below - the explicit `infinity`
%% timeout form and the gen_server:call/2 form (OTP default timeout). The
%% commit description says `infinity` timeouts are removed, so the second
%% line presumably replaces the first.
-spec set_enabled(string()) -> 'ok'.

set_enabled(Enabled) ->
gen_server:call(?MODULE, {set_enabled, Enabled}, infinity).
gen_server:call(?MODULE, {set_enabled, Enabled}).

%%----------------------------------------------------------------------------
%% gen_server callbacks
Expand Down Expand Up @@ -227,33 +227,19 @@ get_disk_free(Dir) ->
%% Determine the free disk space for Dir with an OS-specific command.
%% SunOS/Solaris run `df -k`, every other Unix runs `df -kP`, and win32
%% tries the fsutil probe first and the PowerShell probe second, exiting
%% with `could_not_determine_disk_free` when neither yields {ok, Free}.
%%
%% NOTE(review): each changed line appears twice in this span - the
%% rabbit_misc:os_cmd form followed by the run_cmd form - and the win32
%% clause shows the `dir`-based body followed by the fsutil/PowerShell body.
%% Presumably the first of each pair is the pre-change text; confirm
%% against the actual file.
%%
%% NOTE(review): run_cmd/1 can return {error, timeout}, and the Unix
%% clauses pass its result straight to parse_free_unix/1 - confirm that
%% function (only partly visible here) copes with a non-string argument.
get_disk_free(Dir, {unix, Sun})
when Sun =:= sunos; Sun =:= sunos4; Sun =:= solaris ->
Df = os:find_executable("df"),
parse_free_unix(rabbit_misc:os_cmd(Df ++ " -k " ++ Dir));
parse_free_unix(run_cmd(Df ++ " -k " ++ Dir));
get_disk_free(Dir, {unix, _}) ->
Df = os:find_executable("df"),
parse_free_unix(rabbit_misc:os_cmd(Df ++ " -kP " ++ Dir));
parse_free_unix(run_cmd(Df ++ " -kP " ++ Dir));
get_disk_free(Dir, {win32, _}) ->
%% On Windows, the Win32 API enforces a limit of 260 characters
%% (MAX_PATH). If we call `dir` with a path longer than that, it
%% fails with "File not found". Starting with Windows 10 version
%% 1607, this limit was removed, but the administrator has to
%% configure that.
%%
%% NTFS supports paths up to 32767 characters. Therefore, paths
%% longer than 260 characters exist but they are "inaccessible" to
%% `dir`.
%%
%% A workaround is to tell the Win32 API to not parse a path and
%% just pass it raw to the underlying filesystem. To do this, the
%% path must be prepended with "\\?\". That's what we do here.
%%
%% However, the underlying filesystem may not support forward
%% slashes transparently, as the Win32 API does. Therefore, we
%% convert all forward slashes to backslashes.
%%
%% See the following page to learn more about this:
%% https://ss64.com/nt/syntax-filenames.html
RawDir = "\\\\?\\" ++ string:replace(Dir, "/", "\\", all),
parse_free_win32(rabbit_misc:os_cmd("dir /-C /W \"" ++ RawDir ++ "\"")).
%% Try fsutil first; only when it reports `error` fall back to PowerShell.
case win32_get_disk_free_fsutil(Dir) of
{ok, Free0} -> Free0;
error ->
case win32_get_disk_free_pwsh(Dir) of
{ok, Free1} -> Free1;
%% Anything other than {ok, _} from the fallback is fatal for this call.
_ -> exit(could_not_determine_disk_free)
end
end.

parse_free_unix(Str) ->
case string:tokens(Str, "\n") of
Expand All @@ -264,11 +250,46 @@ parse_free_unix(Str) ->
_ -> exit({unparseable, Str})
end.

%% Extract the free-byte count from the output of the Windows `dir`
%% command: take the last non-empty line, find the right-most run of
%% digits in it and return that run as an integer.
parse_free_win32(CommandResult) ->
    Lines = string:tokens(CommandResult, "\r\n"),
    %% Searching the reversed line makes the first regex hit the LAST
    %% number on the line.
    ReversedSummary = lists:reverse(lists:last(Lines)),
    Opts = [{capture, all_but_first, list}],
    {match, [ReversedDigits]} = re:run(ReversedSummary, "(\\d+)", Opts),
    list_to_integer(lists:reverse(ReversedDigits)).
%% Query free disk space on win32 with `fsutil.exe volume diskfree`.
%%
%% Dir is expected to start with a drive specifier, e.g.
%% "c:/Users/username/AppData/Roaming/RabbitMQ/db/rabbit2@username-z01-mnesia";
%% its first two characters ("c:") are handed to fsutil.
%%
%% Returns {ok, FreeBytes} on success and `error` when the command times
%% out or its output is not in the expected shape, so that the caller can
%% fall back to another probe instead of crashing.
win32_get_disk_free_fsutil(Dir) ->
    Drive = string:slice(Dir, 0, 2),
    FsutilCmd = "fsutil.exe volume diskfree " ++ Drive,
    case run_cmd(FsutilCmd) of
        {error, timeout} ->
            error;
        FsutilResult ->
            parse_fsutil_diskfree(FsutilResult)
    end.

%% Parse the output of `fsutil volume diskfree`, which is expected to be:
%%
%%   Total free bytes        :   812,733,878,272 (756.9 GB)
%%   Total bytes             : 1,013,310,287,872 (943.7 GB)
%%   Total quota free bytes  :   812,733,878,272 (756.9 GB)
%%
%% Only the first line is used. Anything that does not start with "Total"
%% (an "Error..." message, a privilege complaint, localised or empty
%% output) yields `error` rather than an exception.
parse_fsutil_diskfree("Total" ++ _ = Output) ->
    [FirstLine | _] = string:tokens(Output, "\r\n"),
    case re:run(FirstLine, "(\\d+,?)+", [{capture, first, list}]) of
        {match, [FreeStr]} ->
            %% Drop the thousands separators before converting.
            {ok, list_to_integer([C || C <- FreeStr, C =/= $,])};
        nomatch ->
            error
    end;
parse_fsutil_diskfree(_UnexpectedOutput) ->
    error.


%% Query free disk space on win32 through PowerShell's Get-PSDrive.
%%
%% Dir is expected to start with a drive letter, e.g.
%% "c:/Users/username/AppData/Roaming/RabbitMQ/db/rabbit2@username-z01-mnesia";
%% only that first character is passed to Get-PSDrive.
%%
%% Returns {ok, FreeBytes} when the command prints a single integer, and
%% `error` when it times out or prints anything else (empty output, an
%% error message, ...).
win32_get_disk_free_pwsh(Dir) ->
    Drive = string:slice(Dir, 0, 1),
    PoshCmd = "powershell.exe -NoLogo -NoProfile -NonInteractive -Command (Get-PSDrive " ++ Drive ++ ").Free",
    case run_cmd(PoshCmd) of
        {error, timeout} ->
            error;
        PoshResultStr ->
            %% Trim surrounding whitespace (including the trailing line
            %% ending, whatever its form) and require that the remainder
            %% is exactly one integer.
            case string:to_integer(string:trim(PoshResultStr)) of
                {Free, []} when is_integer(Free) ->
                    {ok, Free};
                _ ->
                    error
            end
    end.

interpret_limit({mem_relative, Relative})
when is_number(Relative) ->
Expand Down Expand Up @@ -318,3 +339,20 @@ enable(#state{dir = Dir, interval = Interval, limit = Limit, retries = Retries}
erlang:send_after(Interval, self(), try_enable),
State#state{enabled = false}
end.

%% Run an OS command through rabbit_misc:os_cmd/1 in a helper process,
%% waiting at most 5 seconds for it.
%%
%% Returns the command result, or {error, timeout} when no result could
%% be obtained.
run_cmd(Cmd) ->
    run_cmd(Cmd, 5000).

%% Same as run_cmd/1 with an explicit timeout in milliseconds.
%%
%% The helper is monitored so that a crash inside it is noticed (and
%% logged with its reason) immediately instead of after the full timeout.
%% Such a crash is still reported as {error, timeout}, because that is the
%% only error shape the callers in this module match on.
run_cmd(Cmd, Timeout) ->
    Parent = self(),
    Ref = make_ref(),
    {CmdPid, MRef} =
        spawn_monitor(
          fun() ->
                  Parent ! {Parent, Ref, rabbit_misc:os_cmd(Cmd)}
          end),
    receive
        {Parent, Ref, CmdResult} ->
            %% Drop the monitor and any 'DOWN' message already queued.
            erlang:demonitor(MRef, [flush]),
            CmdResult;
        {'DOWN', MRef, process, CmdPid, Reason} ->
            rabbit_log:error("Command failed: '~s' (~p)", [Cmd, Reason]),
            {error, timeout}
    after Timeout ->
            exit(CmdPid, kill),
            %% Wait for the helper to be gone, then discard a result it
            %% may have sent just before dying so that no stray message
            %% is left in the caller's mailbox.
            receive {'DOWN', MRef, process, CmdPid, _} -> ok end,
            receive {Parent, Ref, _LateResult} -> ok after 0 -> ok end,
            rabbit_log:error("Command timed out: '~s'", [Cmd]),
            {error, timeout}
    end.

0 comments on commit 545da1a

Please sign in to comment.