From 373f155cb72d8d5053e0914af07bfc7e24627043 Mon Sep 17 00:00:00 2001
From: labedz
Date: Wed, 28 Sep 2022 10:42:38 +0000
Subject: [PATCH 1/4] Spread OVN metadata agent heartbeat response in time

To avoid a mass response of OVN metadata agents to a heartbeat update
(the event emitted when the nb_cfg entry of the OVN Southbound
SB_Global table is incremented), this patch postpones the
Chassis/Chassis_Private table update for a random number of seconds in
the range of [0, cfg.CONF.agent_down_time // 2].

Related-Bug: #1991817
Change-Id: I6373a3c213b24ec957e4d2ea7fc42524517d10d5
(cherry picked from commit 628442aed7400251f12809a45605bd717f494c4e)
---
 neutron/agent/ovn/metadata/agent.py | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/neutron/agent/ovn/metadata/agent.py b/neutron/agent/ovn/metadata/agent.py
index 950cd5ab2a0..e6eb18cd27c 100644
--- a/neutron/agent/ovn/metadata/agent.py
+++ b/neutron/agent/ovn/metadata/agent.py
@@ -14,6 +14,7 @@
 
 import collections
 import functools
+from random import randint
 import re
 import threading
 import uuid
@@ -21,6 +22,7 @@
 import netaddr
 from neutron_lib import constants as n_const
 from oslo_concurrency import lockutils
+from oslo_config import cfg
 from oslo_log import log
 from oslo_utils import netutils
 from ovsdbapp.backend.ovs_idl import event as row_event
@@ -35,10 +37,12 @@
 from neutron.common.ovn import constants as ovn_const
 from neutron.common.ovn import utils as ovn_utils
 from neutron.common import utils
+from neutron.conf.agent.database import agents_db
 from neutron.conf.plugins.ml2.drivers.ovn import ovn_conf as config
 
 
 LOG = log.getLogger(__name__)
+agents_db.register_db_agents_opts()
 
 _SYNC_STATE_LOCK = lockutils.ReaderWriterLock()
 CHASSIS_METADATA_LOCK = 'chassis_metadata_lock'
@@ -186,14 +190,27 @@ def __init__(self, metadata_agent):
         events = (self.ROW_UPDATE,)
         super(SbGlobalUpdateEvent, self).__init__(events, table, None)
         self.event_name = self.__class__.__name__
+        self.first_run = True
 
     def run(self, event, row, old):
-        table = ('Chassis_Private' if self.agent.has_chassis_private
-                 else 'Chassis')
-        self.agent.sb_idl.db_set(
-            table, self.agent.chassis, ('external_ids', {
-                ovn_const.OVN_AGENT_METADATA_SB_CFG_KEY:
-                    str(row.nb_cfg)})).execute()
+
+        def _update_chassis(self, row):
+            table = ('Chassis_Private' if self.agent.has_chassis_private
+                     else 'Chassis')
+            self.agent.sb_idl.db_set(
+                table, self.agent.chassis, ('external_ids', {
+                    ovn_const.OVN_AGENT_METADATA_SB_CFG_KEY:
+                        str(row.nb_cfg)})).execute()
+
+        if self.first_run:
+            interval = 0
+            self.first_run = False
+        else:
+            interval = randint(0, cfg.CONF.agent_down_time // 2)
+
+        LOG.debug("Delaying updating chassis table for %s seconds", interval)
+        timer = threading.Timer(interval, _update_chassis, [self, row])
+        timer.start()
 
 
 class MetadataAgent(object):

From 7746912334092b3c71290b7a063bf286545252e1 Mon Sep 17 00:00:00 2001
From: Sean Mooney
Date: Wed, 31 May 2023 13:23:32 +0100
Subject: [PATCH 2/4] Send OVN heartbeat more often

This change modifies the metadata agent heartbeat to use a random
offset with a max delay of 10 seconds.

The original reason for the current logic was to mitigate
https://bugs.launchpad.net/neutron/+bug/1991817 so the logic to spread
the heartbeats is maintained, but we now set an upper bound on the
delay.

Closes-Bug: #2020215
Change-Id: I4d382793255520b9c44ca2aaacebcbda9a432dde
(cherry picked from commit 5e0c102830a18850e35f746160867613e96d1dbc)
---
 neutron/agent/ovn/metadata/agent.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/neutron/agent/ovn/metadata/agent.py b/neutron/agent/ovn/metadata/agent.py
index e6eb18cd27c..af06a7a7937 100644
--- a/neutron/agent/ovn/metadata/agent.py
+++ b/neutron/agent/ovn/metadata/agent.py
@@ -202,14 +202,21 @@ def _update_chassis(self, row):
                     ovn_const.OVN_AGENT_METADATA_SB_CFG_KEY:
                         str(row.nb_cfg)})).execute()
 
+        delay = 0
         if self.first_run:
-            interval = 0
             self.first_run = False
         else:
-            interval = randint(0, cfg.CONF.agent_down_time // 2)
-
-        LOG.debug("Delaying updating chassis table for %s seconds", interval)
-        timer = threading.Timer(interval, _update_chassis, [self, row])
+            # We occasionally see "port binding failed" errors because the
+            # ML2 driver refuses to bind a port to a dead agent. If all
+            # agents heartbeat at the same time, they will all cause a
+            # load spike on the server. To mitigate that we need to spread
+            # out the load by introducing a random delay. Clamp the max
+            # delay between 3 and 10 seconds.
+            max_delay = max(min(cfg.CONF.agent_down_time // 3, 10), 3)
+            delay = randint(0, max_delay)
+
+        LOG.debug("Delaying updating chassis table for %s seconds", delay)
+        timer = threading.Timer(delay, _update_chassis, [self, row])
         timer.start()

From 921cde14c15611d1ad4153881c3eae4f56e09a4f Mon Sep 17 00:00:00 2001
From: Felix Huettner
Date: Wed, 1 Mar 2023 16:14:18 +0100
Subject: [PATCH 3/4] Reduce lock contention on subnets

HINT: This isn't a clean backport, as we keep the subnet in-use field.
We can't backport the db update that would remove the field.

In [1] a lock was introduced with the goal of preventing subnets from
being deleted while ports are being created in them in parallel. This
was achieved by acquiring an exclusive lock on the row of the subnet
in the Subnet table when adding/modifying a port or deleting the
subnet. However, as this was an exclusive lock, it also prevented
concurrent port modifications on the same subnet from happening. This
can cause performance issues on environments with large shared subnets
(e.g. a large external subnet).

To reduce the lock contention for this case we split the lock in two
parts:

* For normal port operations we will acquire a shared lock on the row
  of the subnet. This allows multiple such operations to happen in
  parallel.
* For deleting a subnet we will acquire an exclusive lock on the row
  of the subnet. This lock cannot be acquired while there is any
  shared lock currently on the row.

With this we maintain the same locking level as before, but reduce the
amount of lock contention happening and thereby improve throughput.

The performance improvement can be measured using rally test [2]
(improving from 21 to 18 seconds). Alternatively it can be tested
using 250 parallel curl calls to create a port in the same network.
This improves from 113s to 42s.

[1]: https://review.opendev.org/c/openstack/neutron/+/713045
[2]: https://github.com/openstack/rally-openstack/blob/master/samples/tasks/scenarios/neutron/create-and-delete-ports.json

Closes-Bug: #2009055
Change-Id: I31b1a9c2f986f59fee0da265acebbd88d2f8e4f8
(cherry picked from commit c0af5b3b5ea89d3147adf1054625f29d5b01b309)
---
 neutron/db/db_base_plugin_v2.py  |  2 +-
 neutron/db/ipam_backend_mixin.py |  2 +-
 neutron/db/models_v2.py          | 52 ++++++++++++++++++++++++--------
 3 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/neutron/db/db_base_plugin_v2.py b/neutron/db/db_base_plugin_v2.py
index 8cdf757293d..973bad7cbd9 100644
--- a/neutron/db/db_base_plugin_v2.py
+++ b/neutron/db/db_base_plugin_v2.py
@@ -75,7 +75,7 @@
 
 
 def _ensure_subnet_not_used(context, subnet_id):
-    models_v2.Subnet.lock_register(
+    models_v2.Subnet.write_lock_register(
         context, exc.SubnetInUse(subnet_id=subnet_id), id=subnet_id)
     try:
         registry.publish(
diff --git a/neutron/db/ipam_backend_mixin.py b/neutron/db/ipam_backend_mixin.py
index 088bd650b25..75f4c9f14dc 100644
--- a/neutron/db/ipam_backend_mixin.py
+++ b/neutron/db/ipam_backend_mixin.py
@@ -688,7 +688,7 @@ def _ipam_get_subnets(self, context, network_id, host, service_type=None,
             msg = ('This subnet is being modified by another concurrent '
                    'operation')
             for subnet in subnets:
-                subnet.lock_register(
+                subnet.read_lock_register(
                     context, exc.SubnetInUse(subnet_id=subnet.id, reason=msg),
                     id=subnet.id)
             subnet_dicts = [self._make_subnet_dict(subnet, context=context)
diff --git a/neutron/db/models_v2.py b/neutron/db/models_v2.py
index 9e001c7dbdd..d8c74c27deb 100644
--- a/neutron/db/models_v2.py
+++ b/neutron/db/models_v2.py
@@ -33,25 +33,51 @@
 class HasInUse(object):
     """NeutronBaseV2 mixin, to add the flag "in_use" to a DB model.
 
-    The content of this flag (boolean) parameter is not relevant. The goal of
-    this field is to be used in a write transaction to mark a DB register as
-    "in_use". Writing any value on this DB parameter will lock the container
-    register. At the end of the DB transaction, the DB engine will check if
-    this register was modified or deleted. In such case, the transaction will
-    fail and won't be commited.
-
-    "lock_register" is the method to write the register "in_use" column.
-    Because the lifespan of this DB lock is the DB transaction, there isn't an
-    unlock method. The lock will finish once the transaction ends.
+    The goal of this class is to allow users to lock specific database rows
+    with a shared or exclusive lock (without necessarily introducing a
+    change in the table itself). Having these locks allows the DB engine to
+    prevent concurrent modifications (e.g. the deletion of a resource while
+    we are currently adding a new dependency on the resource).
+
+    "read_lock_register" takes a shared DB lock on the row specified by the
+    filters. The lock is automatically released once the transaction ends.
+    You can have any number of parallel read locks on the same DB row, but
+    you cannot take any write lock in parallel.
+
+    "write_lock_register" takes an exclusive DB lock on the row specified by
+    the filters. The lock is automatically released on transaction commit.
+    You may only have one write lock on each row at a time; it therefore
+    blocks all other read and write locks on this row.
""" + # keep this value to not need to update the database schema + # only at backport in_use = sa.Column(sa.Boolean(), nullable=False, server_default=sql.false(), default=False) @classmethod - def lock_register(cls, context, exception, **filters): + def write_lock_register(cls, context, exception, **filters): + # we use `with_for_update()` to include `FOR UPDATE` in the sql + # statement. + # we need to set `enable_eagerloads(False)` so that we do not try to + # load attached resources (e.g. standardattributes) as this breaks the + # `FOR UPDATE` statement. num_reg = context.session.query( - cls).filter_by(**filters).update({'in_use': True}) - if num_reg != 1: + cls).filter_by(**filters).enable_eagerloads( + False).with_for_update().first() + if num_reg is None: + raise exception + + @classmethod + def read_lock_register(cls, context, exception, **filters): + # we use `with_for_update(read=True)` to include `LOCK IN SHARE MODE` + # in the sql statement. + # we need to set `enable_eagerloads(False)` so that we do not try to + # load attached resources (e.g. standardattributes) as this breaks the + # `LOCK IN SHARE MODE` statement. + num_reg = context.session.query( + cls).filter_by(**filters).enable_eagerloads( + False).with_for_update(read=True).first() + if num_reg is None: raise exception From 358e349ebff41746fef086671e481ca8db529d78 Mon Sep 17 00:00:00 2001 From: yatinkarel Date: Tue, 7 Nov 2023 19:33:50 +0530 Subject: [PATCH 4/4] [Stable Only] Fix parent for nftables job The job was inheriting from master variant which is wrong. Closes-Bug: 2042947 Change-Id: If954d7f8150532036ce2b3e0d5b33ef48004e39c --- zuul.d/base.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zuul.d/base.yaml b/zuul.d/base.yaml index 216044f3388..ec0bc1e1875 100644 --- a/zuul.d/base.yaml +++ b/zuul.d/base.yaml @@ -151,7 +151,7 @@ - job: name: neutron-linuxbridge-tempest-plugin-scenario-nftables - parent: neutron-tempest-plugin-scenario-linuxbridge + parent: neutron-tempest-plugin-scenario-linuxbridge-yoga pre-run: playbooks/install_nftables.yaml vars: devstack_local_conf: @@ -162,7 +162,7 @@ - job: name: neutron-ovs-tempest-plugin-scenario-iptables_hybrid-nftables - parent: neutron-tempest-plugin-scenario-openvswitch-iptables_hybrid + parent: neutron-tempest-plugin-scenario-openvswitch-iptables_hybrid-yoga pre-run: playbooks/install_nftables.yaml vars: devstack_local_conf: