From 8ef6d6c6fd54f7a8e026e7b5db3d7b0c724e74bf Mon Sep 17 00:00:00 2001 From: Thayne Harbaugh Date: Thu, 1 Oct 2015 14:52:18 -0600 Subject: [PATCH] Add additional ZMQ tuning parameters necessary for 1,000+ minions per server. Start collecting tuning parameters together in the master config file. --- conf/master | 78 ++++++++++++++++++++++++++++++--------------- salt/config.py | 13 ++++++++ salt/utils/event.py | 10 ++++++ 3 files changed, 76 insertions(+), 25 deletions(-) diff --git a/conf/master b/conf/master index 42d5b8a7a32d..f9b8bf77a16c 100644 --- a/conf/master +++ b/conf/master @@ -27,31 +27,6 @@ # modified files cause conflicts, set verify_env to False. #user: root -# Max open files -# -# Each minion connecting to the master uses AT LEAST one file descriptor, the -# master subscription connection. If enough minions connect you might start -# seeing on the console (and then salt-master crashes): -# Too many open files (tcp_listener.cpp:335) -# Aborted (core dumped) -# -# By default this value will be the one of `ulimit -Hn`, ie, the hard limit for -# max open files. -# -# If you wish to set a different value than the default one, uncomment and -# configure this setting. Remember that this value CANNOT be higher than the -# hard limit. Raising the hard limit depends on your OS and/or distribution, -# a good way to find the limit is to search the internet. For example: -# raise max open files hard limit debian -# -#max_open_files: 100000 - -# The number of worker threads to start. These threads are used to manage -# return calls made from minions to the master. If the master seems to be -# running slowly, increase the number of threads. This setting can not be -# set lower than 3. -#worker_threads: 5 - # The port used by the communication interface. The ret (return) port is the # interface used for the file server, authentication, job returns, etc. 
#ret_port: 4506 @@ -204,6 +179,59 @@ # - /etc/salt/extra_config +##### Large-scale tuning settings ##### +########################################## +# Max open files +# +# Each minion connecting to the master uses AT LEAST one file descriptor, the +# master subscription connection. If enough minions connect you might start +# seeing on the console (and then salt-master crashes): +# Too many open files (tcp_listener.cpp:335) +# Aborted (core dumped) +# +# By default this value will be the one of `ulimit -Hn`, ie, the hard limit for +# max open files. +# +# If you wish to set a different value than the default one, uncomment and +# configure this setting. Remember that this value CANNOT be higher than the +# hard limit. Raising the hard limit depends on your OS and/or distribution, +# a good way to find the limit is to search the internet. For example: +# raise max open files hard limit debian +# +#max_open_files: 100000 + +# The number of worker threads to start. These threads are used to manage +# return calls made from minions to the master. If the master seems to be +# running slowly, increase the number of threads. This setting can not be +# set lower than 3. +#worker_threads: 5 + +# Set the ZeroMQ high water marks +# http://api.zeromq.org/3-2:zmq-setsockopt + +# The publisher interface ZeroMQPubServerChannel +#pub_hwm: 1000 + +# These two ZMQ HWM settings, salt_event_pub_hwm and event_publisher_pub_hwm +# are significant for masters with thousands of minions. When these are +# insufficiently high it will manifest in random responses missing in the CLI +# and even missing from the job cache. Masters that have fast CPUs and many +# cores with appropriate worker_threads will not need these set as high. 
+ +# One deployment with 8,000 minions, 2.4GHz CPUs, 24 cores, and 32GiB memory +# uses these settings: +# +# salt_event_pub_hwm: 128000 +# event_publisher_pub_hwm: 64000 + +# ZMQ high-water-mark for SaltEvent pub socket +#salt_event_pub_hwm: 20000 + +# ZMQ high-water-mark for EventPublisher pub socket +#event_publisher_pub_hwm: 10000 + + + ##### Security settings ##### ########################################## # Enable "open mode", this mode still maintains encryption, but turns off diff --git a/salt/config.py b/salt/config.py index 18a17fd14323..c6807c6ee79f 100644 --- a/salt/config.py +++ b/salt/config.py @@ -420,6 +420,11 @@ # http://api.zeromq.org/3-2:zmq-setsockopt 'pub_hwm': int, + # ZMQ HWM for SaltEvent pub socket + 'salt_event_pub_hwm': int, + # ZMQ HWM for EventPublisher pub socket + 'event_publisher_pub_hwm': int, + # The number of MWorker processes for a master to startup. This number needs to scale up as # the number of connected minions increases. 'worker_threads': int, @@ -957,12 +962,20 @@ 'sudo_user': '', 'http_request_timeout': 1 * 60 * 60.0, # 1 hour 'http_max_body': 100 * 1024 * 1024 * 1024, # 100GB + # ZMQ HWM for SaltEvent pub socket - different for minion vs. master + 'salt_event_pub_hwm': 2000, + # ZMQ HWM for EventPublisher pub socket - different for minion vs. master + 'event_publisher_pub_hwm': 1000, } DEFAULT_MASTER_OPTS = { 'interface': '0.0.0.0', 'publish_port': '4505', 'pub_hwm': 1000, + # ZMQ HWM for SaltEvent pub socket - different for minion vs. master + 'salt_event_pub_hwm': 2000, + # ZMQ HWM for EventPublisher pub socket - different for minion vs. 
master + 'event_publisher_pub_hwm': 1000, 'auth_mode': 1, 'user': 'root', 'worker_threads': 5, diff --git a/salt/utils/event.py b/salt/utils/event.py index 944f41a00d98..f25d4143c7e8 100644 --- a/salt/utils/event.py +++ b/salt/utils/event.py @@ -296,6 +296,11 @@ def connect_pub(self): Establish the publish connection ''' self.sub = self.context.socket(zmq.SUB) + try: + self.sub.setsockopt(zmq.HWM, self.opts.get('salt_event_pub_hwm')) + except AttributeError: + self.sub.setsockopt(zmq.SNDHWM, self.opts.get('salt_event_pub_hwm')) + self.sub.setsockopt(zmq.RCVHWM, self.opts.get('salt_event_pub_hwm')) self.sub.connect(self.puburi) self.poller.register(self.sub, zmq.POLLIN) self.sub.setsockopt_string(zmq.SUBSCRIBE, u'') @@ -867,6 +872,11 @@ def run(self): self.context = zmq.Context(1) # Prepare the master event publisher self.epub_sock = self.context.socket(zmq.PUB) + try: + self.epub_sock.setsockopt(zmq.HWM, self.opts.get('event_publisher_pub_hwm')) + except AttributeError: + self.epub_sock.setsockopt(zmq.SNDHWM, self.opts.get('event_publisher_pub_hwm')) + self.epub_sock.setsockopt(zmq.RCVHWM, self.opts.get('event_publisher_pub_hwm')) # Prepare master event pull socket self.epull_sock = self.context.socket(zmq.PULL) if self.opts.get('ipc_mode', '') == 'tcp':