From 373f155cb72d8d5053e0914af07bfc7e24627043 Mon Sep 17 00:00:00 2001
From: labedz
Date: Wed, 28 Sep 2022 10:42:38 +0000
Subject: [PATCH 1/4] Spread OVN metadata agent heartbeat response in time

To avoid a mass response of OVN metadata agents to a heartbeat update
(the event emitted when the nb_cfg entry of the OVN Southbound
SB_Global table is incremented), this patch postpones the
Chassis/Chassis_Private table update for a random number of seconds in
the range of [0, cfg.CONF.agent_down_time // 2].

Related-Bug: #1991817
Change-Id: I6373a3c213b24ec957e4d2ea7fc42524517d10d5
(cherry picked from commit 628442aed7400251f12809a45605bd717f494c4e)
---
 neutron/agent/ovn/metadata/agent.py | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/neutron/agent/ovn/metadata/agent.py b/neutron/agent/ovn/metadata/agent.py
index 950cd5ab2a0..e6eb18cd27c 100644
--- a/neutron/agent/ovn/metadata/agent.py
+++ b/neutron/agent/ovn/metadata/agent.py
@@ -14,6 +14,7 @@
 
 import collections
 import functools
+from random import randint
 import re
 import threading
 import uuid
@@ -21,6 +22,7 @@
 import netaddr
 from neutron_lib import constants as n_const
 from oslo_concurrency import lockutils
+from oslo_config import cfg
 from oslo_log import log
 from oslo_utils import netutils
 from ovsdbapp.backend.ovs_idl import event as row_event
@@ -35,10 +37,12 @@
 from neutron.common.ovn import constants as ovn_const
 from neutron.common.ovn import utils as ovn_utils
 from neutron.common import utils
+from neutron.conf.agent.database import agents_db
 from neutron.conf.plugins.ml2.drivers.ovn import ovn_conf as config
 
 
 LOG = log.getLogger(__name__)
+agents_db.register_db_agents_opts()
 
 _SYNC_STATE_LOCK = lockutils.ReaderWriterLock()
 CHASSIS_METADATA_LOCK = 'chassis_metadata_lock'
@@ -186,14 +190,27 @@ def __init__(self, metadata_agent):
         events = (self.ROW_UPDATE,)
         super(SbGlobalUpdateEvent, self).__init__(events, table, None)
         self.event_name = self.__class__.__name__
+        self.first_run = True
 
     def run(self, event, row, old):
-        table = ('Chassis_Private' if self.agent.has_chassis_private
-                 else 'Chassis')
-        self.agent.sb_idl.db_set(
-            table, self.agent.chassis, ('external_ids', {
-                ovn_const.OVN_AGENT_METADATA_SB_CFG_KEY:
-                    str(row.nb_cfg)})).execute()
+
+        def _update_chassis(self, row):
+            table = ('Chassis_Private' if self.agent.has_chassis_private
+                     else 'Chassis')
+            self.agent.sb_idl.db_set(
+                table, self.agent.chassis, ('external_ids', {
+                    ovn_const.OVN_AGENT_METADATA_SB_CFG_KEY:
+                        str(row.nb_cfg)})).execute()
+
+        if self.first_run:
+            interval = 0
+            self.first_run = False
+        else:
+            interval = randint(0, cfg.CONF.agent_down_time // 2)
+
+        LOG.debug("Delaying updating chassis table for %s seconds", interval)
+        timer = threading.Timer(interval, _update_chassis, [self, row])
+        timer.start()
 
 
 class MetadataAgent(object):

From 7746912334092b3c71290b7a063bf286545252e1 Mon Sep 17 00:00:00 2001
From: Sean Mooney
Date: Wed, 31 May 2023 13:23:32 +0100
Subject: [PATCH 2/4] Send OVN heartbeat more often

This change modifies the metadata agent heartbeat to use a random
offset with a max delay of 10 seconds.

The original reason for the current logic was to mitigate
https://bugs.launchpad.net/neutron/+bug/1991817 so the logic to spread
the heartbeats is maintained, but we now set an upper bound on the
delay.

Closes-Bug: #2020215
Change-Id: I4d382793255520b9c44ca2aaacebcbda9a432dde
(cherry picked from commit 5e0c102830a18850e35f746160867613e96d1dbc)
---
 neutron/agent/ovn/metadata/agent.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/neutron/agent/ovn/metadata/agent.py b/neutron/agent/ovn/metadata/agent.py
index e6eb18cd27c..af06a7a7937 100644
--- a/neutron/agent/ovn/metadata/agent.py
+++ b/neutron/agent/ovn/metadata/agent.py
@@ -202,14 +202,21 @@ def _update_chassis(self, row):
                     ovn_const.OVN_AGENT_METADATA_SB_CFG_KEY:
                         str(row.nb_cfg)})).execute()
 
+        delay = 0
         if self.first_run:
-            interval = 0
             self.first_run = False
         else:
-            interval = randint(0, cfg.CONF.agent_down_time // 2)
-
-        LOG.debug("Delaying updating chassis table for %s seconds", interval)
-        timer = threading.Timer(interval, _update_chassis, [self, row])
+            # We occasionally see "port binding failed" errors because the
+            # ML2 driver refuses to bind a port to a dead agent. If all
+            # agents heartbeat at the same time, they will all cause a
+            # load spike on the server. To mitigate that we need to spread
+            # out the load by introducing a random delay. Clamp the max
+            # delay between 3 and 10 seconds.
+            max_delay = max(min(cfg.CONF.agent_down_time // 3, 10), 3)
+            delay = randint(0, max_delay)
+
+        LOG.debug("Delaying updating chassis table for %s seconds", delay)
+        timer = threading.Timer(delay, _update_chassis, [self, row])
         timer.start()

From 921cde14c15611d1ad4153881c3eae4f56e09a4f Mon Sep 17 00:00:00 2001
From: Felix Huettner
Date: Wed, 1 Mar 2023 16:14:18 +0100
Subject: [PATCH 3/4] Reduce lock contention on subnets

HINT: This isn't a clean backport, as we keep the subnet in-use field.
We can't backport the db update that would remove the field.

In [1] a lock was introduced with the goal of preventing subnets from
being deleted while ports are being created in them in parallel. This
was achieved by acquiring an exclusive lock on the row of the subnet
in the Subnet table when adding/modifying a port or deleting the
subnet. However, as this was an exclusive lock, it also prevented
concurrent port modifications on the same subnet from happening. This
can cause performance issues on environments with large shared subnets
(e.g. a large external subnet).

To reduce the lock contention for this case we split the lock in two
parts:

* For normal port operations we will acquire a shared lock on the row
  of the subnet. This allows multiple such operations to happen in
  parallel.
* For deleting a subnet we will acquire an exclusive lock on the row
  of the subnet. This lock cannot be acquired while there is any
  shared lock currently on the row.

With this we maintain the same locking level as before, but reduce the
amount of lock contention happening and thereby improve throughput.

The performance improvement can be measured using rally test [2]
(improving from 21 to 18 seconds). Alternatively it can be tested
using 250 parallel curl calls to create a port in the same network.
This improves from 113s to 42s.

[1]: https://review.opendev.org/c/openstack/neutron/+/713045
[2]: https://github.com/openstack/rally-openstack/blob/master/samples/tasks/scenarios/neutron/create-and-delete-ports.json

Closes-Bug: #2009055
Change-Id: I31b1a9c2f986f59fee0da265acebbd88d2f8e4f8
(cherry picked from commit c0af5b3b5ea89d3147adf1054625f29d5b01b309)
---
 neutron/db/db_base_plugin_v2.py  |  2 +-
 neutron/db/ipam_backend_mixin.py |  2 +-
 neutron/db/models_v2.py          | 52 ++++++++++++++++++++++++--------
 3 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/neutron/db/db_base_plugin_v2.py b/neutron/db/db_base_plugin_v2.py
index 8cdf757293d..973bad7cbd9 100644
--- a/neutron/db/db_base_plugin_v2.py
+++ b/neutron/db/db_base_plugin_v2.py
@@ -75,7 +75,7 @@
 
 
 def _ensure_subnet_not_used(context, subnet_id):
-    models_v2.Subnet.lock_register(
+    models_v2.Subnet.write_lock_register(
         context, exc.SubnetInUse(subnet_id=subnet_id), id=subnet_id)
     try:
         registry.publish(
diff --git a/neutron/db/ipam_backend_mixin.py b/neutron/db/ipam_backend_mixin.py
index 088bd650b25..75f4c9f14dc 100644
--- a/neutron/db/ipam_backend_mixin.py
+++ b/neutron/db/ipam_backend_mixin.py
@@ -688,7 +688,7 @@ def _ipam_get_subnets(self, context, network_id, host, service_type=None,
             msg = ('This subnet is being modified by another concurrent '
                    'operation')
             for subnet in subnets:
-                subnet.lock_register(
+                subnet.read_lock_register(
                     context, exc.SubnetInUse(subnet_id=subnet.id, reason=msg),
                     id=subnet.id)
             subnet_dicts = [self._make_subnet_dict(subnet, context=context)
diff --git a/neutron/db/models_v2.py b/neutron/db/models_v2.py
index 9e001c7dbdd..d8c74c27deb 100644
--- a/neutron/db/models_v2.py
+++ b/neutron/db/models_v2.py
@@ -33,25 +33,51 @@
 class HasInUse(object):
     """NeutronBaseV2 mixin, to add the flag "in_use" to a DB model.
 
-    The content of this flag (boolean) parameter is not relevant. The goal of
-    this field is to be used in a write transaction to mark a DB register as
-    "in_use". Writing any value on this DB parameter will lock the container
-    register. At the end of the DB transaction, the DB engine will check if
-    this register was modified or deleted. In such case, the transaction will
-    fail and won't be commited.
-
-    "lock_register" is the method to write the register "in_use" column.
-    Because the lifespan of this DB lock is the DB transaction, there isn't an
-    unlock method. The lock will finish once the transaction ends.
+    The goal of this class is to allow users to lock specific database rows
+    with a shared or exclusive lock (without necessarily introducing a
+    change in the table itself). Having these locks allows the DB engine to
+    prevent concurrent modifications (e.g. the deletion of a resource while
+    we are currently adding a new dependency on the resource).
+
+    "read_lock_register" takes a shared DB lock on the row specified by the
+    filters. The lock is automatically released once the transaction ends.
+    You can have any number of parallel read locks on the same DB row, but
+    you cannot take any write lock in parallel.
+
+    "write_lock_register" takes an exclusive DB lock on the row specified by
+    the filters. The lock is automatically released on transaction commit.
+    You may only have one write lock on each row at a time; it therefore
+    blocks all other read and write locks on this row.
""" + # keep this value to not need to update the database schema + # only at backport in_use = sa.Column(sa.Boolean(), nullable=False, server_default=sql.false(), default=False) @classmethod - def lock_register(cls, context, exception, **filters): + def write_lock_register(cls, context, exception, **filters): + # we use `with_for_update()` to include `FOR UPDATE` in the sql + # statement. + # we need to set `enable_eagerloads(False)` so that we do not try to + # load attached resources (e.g. standardattributes) as this breaks the + # `FOR UPDATE` statement. num_reg = context.session.query( - cls).filter_by(**filters).update({'in_use': True}) - if num_reg != 1: + cls).filter_by(**filters).enable_eagerloads( + False).with_for_update().first() + if num_reg is None: + raise exception + + @classmethod + def read_lock_register(cls, context, exception, **filters): + # we use `with_for_update(read=True)` to include `LOCK IN SHARE MODE` + # in the sql statement. + # we need to set `enable_eagerloads(False)` so that we do not try to + # load attached resources (e.g. standardattributes) as this breaks the + # `LOCK IN SHARE MODE` statement. + num_reg = context.session.query( + cls).filter_by(**filters).enable_eagerloads( + False).with_for_update(read=True).first() + if num_reg is None: raise exception From 358e349ebff41746fef086671e481ca8db529d78 Mon Sep 17 00:00:00 2001 From: yatinkarel Date: Tue, 7 Nov 2023 19:33:50 +0530 Subject: [PATCH 4/4] [Stable Only] Fix parent for nftables job The job was inheriting from master variant which is wrong. Closes-Bug: 2042947 Change-Id: If954d7f8150532036ce2b3e0d5b33ef48004e39c --- zuul.d/base.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zuul.d/base.yaml b/zuul.d/base.yaml index 216044f3388..ec0bc1e1875 100644 --- a/zuul.d/base.yaml +++ b/zuul.d/base.yaml @@ -151,7 +151,7 @@ - job: name: neutron-linuxbridge-tempest-plugin-scenario-nftables - parent: neutron-tempest-plugin-scenario-linuxbridge + parent: neutron-tempest-plugin-scenario-linuxbridge-yoga pre-run: playbooks/install_nftables.yaml vars: devstack_local_conf: @@ -162,7 +162,7 @@ - job: name: neutron-ovs-tempest-plugin-scenario-iptables_hybrid-nftables - parent: neutron-tempest-plugin-scenario-openvswitch-iptables_hybrid + parent: neutron-tempest-plugin-scenario-openvswitch-iptables_hybrid-yoga pre-run: playbooks/install_nftables.yaml vars: devstack_local_conf: