Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/Azure/sonic-buildimage in…
Browse files Browse the repository at this point in the history
…to bgpcfgd_merge
  • Loading branch information
Zhenhong Zhao committed Jan 22, 2021
2 parents 99629bb + 1043678 commit 6a693f4
Show file tree
Hide file tree
Showing 50 changed files with 385 additions and 107 deletions.
2 changes: 1 addition & 1 deletion dockers/docker-database/supervisord.conf.j2
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ nodaemon=true

[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener --container-name database
events=PROCESS_STATE_EXITED
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
autostart=true
autorestart=unexpected

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ buffer_size=50

[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener --container-name dhcp_relay
events=PROCESS_STATE_EXITED
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
autostart=true
autorestart=unexpected

Expand Down
2 changes: 1 addition & 1 deletion dockers/docker-fpm-frr/frr/supervisord/supervisord.conf.j2
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ buffer_size=50

[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener --container-name bgp
events=PROCESS_STATE_EXITED
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
autostart=true
autorestart=unexpected

Expand Down
2 changes: 1 addition & 1 deletion dockers/docker-fpm-gobgp/supervisord.conf
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ nodaemon=true

[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener --container-name bgp
events=PROCESS_STATE_EXITED
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
autostart=true
autorestart=unexpected

Expand Down
2 changes: 1 addition & 1 deletion dockers/docker-fpm-quagga/supervisord.conf
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ nodaemon=true

[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener --container-name bgp
events=PROCESS_STATE_EXITED
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
autostart=true
autorestart=unexpected

Expand Down
2 changes: 1 addition & 1 deletion dockers/docker-lldp/supervisord.conf.j2
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ buffer_size=25

[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener --container-name lldp
events=PROCESS_STATE_EXITED
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
autostart=true
autorestart=unexpected

Expand Down
2 changes: 1 addition & 1 deletion dockers/docker-nat/supervisord.conf
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ buffer_size=25

[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener --container-name nat
events=PROCESS_STATE_EXITED
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
autostart=true
autorestart=unexpected

Expand Down
5 changes: 5 additions & 0 deletions dockers/docker-orchagent/orchagent.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@ then
ORCHAGENT_ARGS+="-i $asic_id "
fi

# for multi asic platforms add the asic name to the record file names
if [[ "$NAMESPACE_ID" ]]; then
ORCHAGENT_ARGS+="-f swss.asic$NAMESPACE_ID.rec -j sairedis.asic$NAMESPACE_ID.rec "
fi

# Add platform specific arguments if necessary
if [ "$platform" == "broadcom" ]; then
ORCHAGENT_ARGS+="-m $MAC_ADDRESS"
Expand Down
2 changes: 1 addition & 1 deletion dockers/docker-orchagent/supervisord.conf
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ buffer_size=100

[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener --container-name swss
events=PROCESS_STATE_EXITED
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
autostart=true
autorestart=unexpected

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ buffer_size=100

[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener --container-name pmon
events=PROCESS_STATE_EXITED
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
autostart=true
autorestart=unexpected

Expand Down Expand Up @@ -68,7 +68,7 @@ dependent_startup_wait_for=rsyslogd:running

{% if not skip_ledd %}
[program:ledd]
command=/usr/local/bin/ledd
command={% if API_VERSION == 3 and 'ledd' not in python2_daemons %}python3 {% else %} python2 {% endif %}/usr/local/bin/ledd
priority=5
autostart=false
autorestart=false
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ buffer_size=25

[eventlistener:supervisor-proc-exit-script]
command=/usr/bin/supervisor-proc-exit-listener --container-name radv
events=PROCESS_STATE_EXITED
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
autostart=true
autorestart=unexpected

Expand Down
2 changes: 1 addition & 1 deletion dockers/docker-sflow/supervisord.conf
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ buffer_size=25

[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener --container-name sflow
events=PROCESS_STATE_EXITED
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
autostart=true
autorestart=unexpected

Expand Down
2 changes: 1 addition & 1 deletion dockers/docker-snmp/supervisord.conf
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ buffer_size=50

[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener --container-name snmp
events=PROCESS_STATE_EXITED
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
autostart=true
autorestart=unexpected

Expand Down
2 changes: 1 addition & 1 deletion dockers/docker-sonic-restapi/supervisord.conf
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ buffer_size=25

[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener --container-name restapi
events=PROCESS_STATE_EXITED
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
autostart=true
autorestart=false

Expand Down
2 changes: 1 addition & 1 deletion dockers/docker-sonic-telemetry/supervisord.conf
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ buffer_size=50

[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener --container-name telemetry
events=PROCESS_STATE_EXITED
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
autostart=true
autorestart=false

Expand Down
2 changes: 1 addition & 1 deletion dockers/docker-teamd/supervisord.conf
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ buffer_size=50

[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener --container-name teamd
events=PROCESS_STATE_EXITED
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
autostart=true
autorestart=unexpected

Expand Down
21 changes: 18 additions & 3 deletions files/image_config/logrotate/logrotate.d/rsyslog
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@
/var/log/telemetry.log
/var/log/frr/bgpd.log
/var/log/frr/zebra.log
/var/log/swss/sairedis.rec
/var/log/swss/swss.rec
/var/log/swss/sairedis*.rec
/var/log/swss/swss*.rec
{
size 1M
rotate 5000
Expand Down Expand Up @@ -85,7 +85,22 @@
endscript
postrotate
if [ $(echo $1 | grep -c "/var/log/swss/") -gt 0 ]; then
pgrep -x orchagent | xargs /bin/kill -HUP 2>/dev/null || true
# for multi asic platforms, there are multiple orchagents
# send the SIGHUP only to the orchagent whose log file needs rotation
# Locate the platform's asic.conf; when present it is sourced below and is
# expected to provide NUM_ASIC.
PLATFORM=`sonic-cfggen -H -v DEVICE_METADATA.localhost.platform`
ASIC_CONF=/usr/share/sonic/device/$PLATFORM/asic.conf
if [ -f "$ASIC_CONF" ]; then
    . "$ASIC_CONF"
fi
# Fall back to 1 when asic.conf is absent or does not set NUM_ASIC, so the
# numeric test is always well-formed and the single-orchagent branch is taken.
if [ "${NUM_ASIC:-1}" -gt 1 ]; then
    log_file=$1
    log_file_name=${log_file#/var/log/swss/}
    logger -p syslog.info -t "logrotate" "Sending SIGHUP to OA log_file_name: $log_file_name"
    # Match the record file name literally (-F): it contains dots, which would
    # otherwise act as regex wildcards and could select the wrong orchagent.
    pgrep -xa orchagent | grep -F -- "$log_file_name" | awk '{ print $1; }' | xargs /bin/kill -HUP 2>/dev/null || true
else
    logger -p syslog.info -t "logrotate" "Sending SIGHUP to OA log_file_name: $1"
    pgrep -x orchagent | xargs /bin/kill -HUP 2>/dev/null || true
fi
else
/bin/kill -HUP $(cat /var/run/rsyslogd.pid)
fi
Expand Down
3 changes: 1 addition & 2 deletions files/image_config/sudoers/sudoers
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,8 @@ Cmnd_Alias READ_ONLY_CMDS = /bin/cat /var/log/syslog*, \
/sbin/brctl show, \
/usr/bin/docker exec snmp cat /etc/snmp/snmpd.conf, \
/usr/bin/docker exec bgp cat /etc/quagga/bgpd.conf, \
/usr/bin/docker exec * ps aux, \
/usr/bin/docker images *, \
/usr/bin/docker ps*, \
/usr/bin/docker ps *, \
/usr/bin/lldpctl, \
/usr/bin/sensors, \
/usr/bin/tail -F /var/log/syslog, \
Expand Down
146 changes: 101 additions & 45 deletions files/scripts/supervisor-proc-exit-listener
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@

import getopt
import os
import select
import signal
import sys
import syslog
import time

import swsssdk

from supervisor import childutils

# Each line of this file should specify either one critical process or one
Expand All @@ -20,10 +23,18 @@ CRITICAL_PROCESSES_FILE = '/etc/supervisor/critical_processes'
# The FEATURE table in config db contains auto-restart field
FEATURE_TABLE_NAME = 'FEATURE'

# Read the critical processes/group names from CRITICAL_PROCESSES_FILE
# Value of parameter 'timeout' in select(...) method
SELECT_TIMEOUT_SECS = 1.0

# Interval (in seconds) at which an alerting message is written to syslog
ALERTING_INTERVAL_SECS = 60


def get_critical_group_and_process_list():
"""
@summary: Read the critical processes/group names from CRITICAL_PROCESSES_FILE.
@return: Two lists which contain critical processes and group names respectively.
"""
critical_group_list = []
critical_process_list = []

Expand All @@ -49,6 +60,47 @@ def get_critical_group_and_process_list():
return critical_group_list, critical_process_list


def generate_alerting_message(process_name):
    """
    @summary: Write an error-level syslog message stating that the given process is not
              running, together with the namespace it is reported under. The namespace is
              the concatenation of the NAMESPACE_PREFIX and NAMESPACE_ID environment
              variables; if either one is unset or empty, "host" is used instead.
    @param process_name: Name of the process to report.
    """
    prefix = os.environ.get("NAMESPACE_PREFIX")
    ns_id = os.environ.get("NAMESPACE_ID")

    # Both parts are required to form a namespace name; otherwise report the host.
    where = prefix + ns_id if (prefix and ns_id) else "host"

    alert = "Process '{}' is not running in namespace '{}'.".format(process_name, where)
    syslog.syslog(syslog.LOG_ERR, alert)


def get_autorestart_state(container_name):
    """
    @summary: Look up the 'auto_restart' field of the given container's entry in the
              FEATURE table of Config DB.
    @param container_name: Key of the container's entry in the FEATURE table.
    @return: The value stored in the 'auto_restart' field.
             The process exits with status 2 if the FEATURE table is empty or cannot be
             read, 3 if the container has no entry, and 4 if the field is missing or empty;
             an error is written to syslog before each exit.
    """
    db = swsssdk.ConfigDBConnector()
    db.connect()

    features = db.get_table(FEATURE_TABLE_NAME)
    if not features:
        syslog.syslog(syslog.LOG_ERR, "Unable to retrieve features table from Config DB. Exiting...")
        sys.exit(2)

    if container_name not in features:
        syslog.syslog(syslog.LOG_ERR, "Unable to retrieve feature '{}'. Exiting...".format(container_name))
        sys.exit(3)

    state = features[container_name].get('auto_restart')
    if state:
        return state

    # Field missing or empty: the auto-restart status cannot be determined.
    syslog.syslog(
        syslog.LOG_ERR, "Unable to determine auto-restart feature status for '{}'. Exiting...".format(container_name))
    sys.exit(4)


def main(argv):
container_name = None
opts, args = getopt.getopt(argv, "c:", ["container-name="])
Expand All @@ -62,51 +114,55 @@ def main(argv):

critical_group_list, critical_process_list = get_critical_group_and_process_list()

process_under_alerting = {}
# Transition from ACKNOWLEDGED to READY
childutils.listener.ready()

while True:
# Transition from ACKNOWLEDGED to READY
childutils.listener.ready()

line = sys.stdin.readline()
headers = childutils.get_headers(line)
payload = sys.stdin.read(int(headers['len']))

# Transition from READY to ACKNOWLEDGED
childutils.listener.ok()

# We only care about PROCESS_STATE_EXITED events
if headers['eventname'] == 'PROCESS_STATE_EXITED':
payload_headers, payload_data = childutils.eventdata(payload + '\n')

expected = int(payload_headers['expected'])
processname = payload_headers['processname']
groupname = payload_headers['groupname']

# Read the status of auto-restart feature from Config_DB.
config_db = swsssdk.ConfigDBConnector()
config_db.connect()
features_table = config_db.get_table(FEATURE_TABLE_NAME)
if not features_table:
syslog.syslog(syslog.LOG_ERR, "Unable to retrieve features table from Config DB. Exiting...")
sys.exit(2)

if container_name not in features_table:
syslog.syslog(syslog.LOG_ERR, "Unable to retrieve feature '{}'. Exiting...".format(container_name))
sys.exit(3)

restart_feature = features_table[container_name].get('auto_restart')
if not restart_feature:
syslog.syslog(
syslog.LOG_ERR, "Unable to determine auto-restart feature status for '{}'. Exiting...".format(container_name))
sys.exit(4)

# If auto-restart feature is not disabled and at the same time
# a critical process exited unexpectedly, terminate supervisor
if (restart_feature != 'disabled' and expected == 0 and
(processname in critical_process_list or groupname in critical_group_list)):
MSG_FORMAT_STR = "Process {} exited unxepectedly. Terminating supervisor..."
msg = MSG_FORMAT_STR.format(payload_headers['processname'])
syslog.syslog(syslog.LOG_INFO, msg)
os.kill(os.getppid(), signal.SIGTERM)
file_descriptor_list = select.select([sys.stdin], [], [], SELECT_TIMEOUT_SECS)[0]
if len(file_descriptor_list) > 0:
line = file_descriptor_list[0].readline()
headers = childutils.get_headers(line)
payload = sys.stdin.read(int(headers['len']))

# Handle the PROCESS_STATE_EXITED event
if headers['eventname'] == 'PROCESS_STATE_EXITED':
payload_headers, payload_data = childutils.eventdata(payload + '\n')

expected = int(payload_headers['expected'])
process_name = payload_headers['processname']
group_name = payload_headers['groupname']

if (process_name in critical_process_list or group_name in critical_group_list) and expected == 0:
is_auto_restart = get_autorestart_state(container_name)
if is_auto_restart != "disabled":
MSG_FORMAT_STR = "Process '{}' exited unexpectedly. Terminating supervisor '{}'"
msg = MSG_FORMAT_STR.format(payload_headers['processname'], container_name)
syslog.syslog(syslog.LOG_INFO, msg)
os.kill(os.getppid(), signal.SIGTERM)
else:
process_under_alerting[process_name] = time.time()

# Handle the PROCESS_STATE_RUNNING event
elif headers['eventname'] == 'PROCESS_STATE_RUNNING':
payload_headers, payload_data = childutils.eventdata(payload + '\n')
process_name = payload_headers['processname']

if process_name in process_under_alerting:
process_under_alerting.pop(process_name)

# Transition from BUSY to ACKNOWLEDGED
childutils.listener.ok()

# Transition from ACKNOWLEDGED to READY
childutils.listener.ready()

# Check whether we need to write alerting messages into syslog
for process in process_under_alerting.keys():
epoch_time = time.time()
if epoch_time - process_under_alerting[process] >= ALERTING_INTERVAL_SECS:
process_under_alerting[process] = epoch_time
generate_alerting_message(process)


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion platform/barefoot/docker-syncd-bfn/supervisord.conf
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ buffer_size=25

[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener --container-name syncd
events=PROCESS_STATE_EXITED
events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
autostart=true
autorestart=unexpected

Expand Down
Loading

0 comments on commit 6a693f4

Please sign in to comment.