Skip to content

Commit

Permalink
[pytest] Ignore specific syslog error messages when testing autoresta…
Browse files Browse the repository at this point in the history
…rt (#2430)

When pytest script was ran to test the autorestart feature, there will be error messages in the syslog which will cause
the logAnalyzer to fail at the end of testing. In order to fix this issue, I examine all the error messages in the syslog 
during testing the autorestart feature and add them into whitelist. At the same time, the reason why we should ignore 
them are put in the comment of `ignore_expected_loganalyzer_exception(...)` function. 

After adding these error messages into whitelist, we need do the post check to see whether all the critical processes
are alive after testing the autorestart feature.

Signed-off-by: Yong Zhao <yozhao@microsoft.com>
  • Loading branch information
yozhao101 authored and yxieca committed Oct 31, 2020
1 parent 77f123a commit 885053e
Showing 1 changed file with 113 additions and 9 deletions.
122 changes: 113 additions & 9 deletions tests/autorestart/test_container_autorestart.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,76 @@
CONTAINER_RESTART_THRESHOLD_SECS = 180


@pytest.fixture(autouse=True)
def ignore_expected_loganalyzer_exception(duthost, loganalyzer):
"""
Ignore expected failure/error messages during testing the autorestart feature.
First, since we killed a critical process in a specific container to test the feature of
autorestart, we expect to see error messages which were fired by Monit such as
"ERR monit[563]: 'lldp|lldpd' status failed (1) -- 'lldpd' is not running."
Second, if teammgrd process was killed for testing, orchagent process in
swss container would write the error messages into syslog such as
"ERR swss#orchagent: :- removeLag: Failed to remove ref count 1 LAG PortChannel10008."
When teamd was restarted, there was an error message in the syslog: "ERR teamd#teamsyncd:
:- readData: netlink reports an error=-33 on reading a netlink socket."
Third, during pmon container was restarted due to ledd process was killed for testing,
xcvrd process would write an error message into syslog such as "ERR pmon#xcvrd[29]: :-initialize
GlobalConfig: Sonic database config global file doesn't exist at /var/run/redis/sonic-db/database_global.json."
thermalctld process would write an error message into syslog such as "ERR pmon#thermalctld[33]:
Caught exception while initializing thermal manager."
Fourth, if orchagent process was killed and swss container was restarted, then syncd process
would write error messages such as "ERR syncd#syncd: [none] driverEgressMemoryUpdate:1395
Error getting cosq for port 1.". At the same time, syncd process also wrote two WARNING messages
into syslog such as "WARNING syncd#syncd:- saiDiscover: skipping since it causes crash:
SAI_STP_ATTR_BRIDGE_ID". Since there was a keyword "crash" in these warning message, logAnalyzer
would fail.
Fifth, systemd would fire an error message:"ERR systemd[1]: Failed to start SNMP/TEAMD container." since
SNMP/TEAMD container hits the limitation of restart. route_check.py also wrote an error message into syslog.
"""
monit_ignoreRegex = [
".*ERR monit.*",
]
swss_ignoreRegex = [
".*ERR swss#orchagent.*removeLag.*",
]
pmon_ignoreRegex = [
".*ERR pmon#xcvrd.*initializeGlobalConfig.*",
".*ERR pmon#thermalctld.*Caught exception while initializing thermal manager.*",
]
syncd_ignoreRegex = [
".*ERR syncd#syncd.*driverEgressMemoryUpdate.*",
".*ERR syncd#syncd.*brcm_sai*",
".*WARNING syncd#syncd.*saiDiscover: skipping since it causes crash.*",
]
teamd_ignoreRegex = [
".*ERR teamd#teamsyncd.*readData.*netlink reports an error=-33 on reading a netlink socket.*",
]
systemd_ignoreRegex = [
".*ERR systemd.*Failed to start .* container*",
]
kernel_ignoreRegex = [
".*ERR kernel.*PortChannel.*",
]
other_ignoreRegex = [
".*ERR route_check.*",
]
if loganalyzer:
loganalyzer.ignore_regex.extend(monit_ignoreRegex)
loganalyzer.ignore_regex.extend(swss_ignoreRegex)
loganalyzer.ignore_regex.extend(pmon_ignoreRegex)
loganalyzer.ignore_regex.extend(syncd_ignoreRegex)
loganalyzer.ignore_regex.extend(teamd_ignoreRegex)
loganalyzer.ignore_regex.extend(systemd_ignoreRegex)
loganalyzer.ignore_regex.extend(kernel_ignoreRegex)
loganalyzer.ignore_regex.extend(other_ignoreRegex)


def get_group_program_info(duthost, container_name, group_name):
"""
@summary: Get program names, running status and their pids by analyzing the command
Expand All @@ -34,7 +104,7 @@ def get_group_program_info(duthost, container_name, group_name):
program_status = None
program_pid = -1

program_list = duthost.shell("docker exec {} supervisorctl status".format(container_name))
program_list = duthost.shell("docker exec {} supervisorctl status".format(container_name), module_ignore_errors=True)
for program_info in program_list["stdout_lines"]:
if program_info.find(group_name) != -1:
program_name = program_info.split()[0].split(':')[1].strip()
Expand All @@ -59,7 +129,7 @@ def get_program_info(duthost, container_name, program_name):
program_status = None
program_pid = -1

program_list = duthost.shell("docker exec {} supervisorctl status".format(container_name))
program_list = duthost.shell("docker exec {} supervisorctl status".format(container_name), module_ignore_errors=True)
for program_info in program_list["stdout_lines"]:
if program_info.find(program_name) != -1:
program_status = program_info.split()[1].strip()
Expand Down Expand Up @@ -137,6 +207,20 @@ def is_hiting_start_limit(duthost, container_name):
return False


def clear_failed_flag_and_restart(duthost, container_name):
"""
@summary: If a container hits the restart limitation, then we clear the failed flag and
restart it.
"""
logger.info("{} hits start limit and clear reset-failed flag".format(container_name))
duthost.shell("sudo systemctl reset-failed {}.service".format(container_name))
duthost.shell("sudo systemctl start {}.service".format(container_name))
restarted = wait_until(CONTAINER_RESTART_THRESHOLD_SECS,
CONTAINER_CHECK_INTERVAL_SECS,
check_container_state, duthost, container_name, True)
pytest_assert(restarted, "Failed to restart container '{}' after reset-failed was cleared".format(container_name))


def verify_autorestart_with_critical_process(duthost, container_name, program_name,
program_status, program_pid):
"""
Expand Down Expand Up @@ -165,13 +249,7 @@ def verify_autorestart_with_critical_process(duthost, container_name, program_na
check_container_state, duthost, container_name, True)
if not restarted:
if is_hiting_start_limit(duthost, container_name):
logger.info("{} hits start limit and clear reset-failed flag".format(container_name))
duthost.shell("sudo systemctl reset-failed {}.service".format(container_name))
duthost.shell("sudo systemctl start {}.service".format(container_name))
restarted = wait_until(CONTAINER_RESTART_THRESHOLD_SECS,
CONTAINER_CHECK_INTERVAL_SECS,
check_container_state, duthost, container_name, True)
pytest_assert(restarted, "Failed to restart container '{}' after reset-failed was cleared".format(container_name))
clear_failed_flag_and_restart(duthost, container_name)
else:
pytest.fail("Failed to restart container '{}'".format(container_name))

Expand Down Expand Up @@ -202,6 +280,30 @@ def verify_no_autorestart_with_non_critical_process(duthost, container_name, pro
logger.info("Restart the program '{}' in container '{}'".format(program_name, container_name))
duthost.shell("docker exec {} supervisorctl start {}".format(container_name, program_name))

def check_all_critical_processes_status(duthost):
processes_status = duthost.all_critical_process_status()
for container_name, processes in processes_status.items():
if processes["status"] is False or len(processes["exited_critical_process"]) > 0:
return False

return True


def postcheck_critical_processes_status(duthost, container_autorestart_states):
"""
@summary: Do the post check to see whether all the critical processes are alive after testing
the autorestart feature.
First we restart the containers which hit the restart limitation and then do the post check
"""
for container_name in container_autorestart_states.keys():
if is_hiting_start_limit(duthost, container_name):
clear_failed_flag_and_restart(duthost, container_name)

pytest_assert(wait_until(CONTAINER_RESTART_THRESHOLD_SECS,
CONTAINER_CHECK_INTERVAL_SECS,
check_all_critical_processes_status, duthost),
"Post checking the healthy of critical processes failed.")


def test_containers_autorestart(duthost, tbinfo):
"""
Expand Down Expand Up @@ -275,3 +377,5 @@ def test_containers_autorestart(duthost, tbinfo):
duthost.shell("sudo config feature autorestart {} disabled".format(container_name))

logger.info("End of testing the container '{}'".format(container_name))

postcheck_critical_processes_status(duthost, container_autorestart_states)

0 comments on commit 885053e

Please sign in to comment.