From b4189dbd9f2c49ca21324536a5c34c72aff51548 Mon Sep 17 00:00:00 2001
From: Terry Wilson
Date: Wed, 6 Mar 2024 20:13:58 +0000
Subject: [PATCH 1/4] Use oslo_service's SignalHandler for signals

When Neutron is killed with SIGTERM (like via systemctl), when using
ML2/OVN neutron workers do not exit and instead are eventually killed
with SIGKILL when the graceful timeout is reached (often around 1
minute). This is happening due to the signal handlers for SIGTERM.
There are multiple issues:

1) oslo_service, ml2/ovn mech_driver, and ml2/ovo_rpc.py all call
   signal.signal(signal.SIGTERM, ...) overwriting each other's signal
   handlers.
2) SIGTERM is handled in the main thread, and running blocking code
   there causes AssertionErrors in eventlet which also prevents the
   process from exiting.
3) The ml2/ovn cleanup code doesn't cause the process to end, so it
   interrupts the killing of the process.

oslo_service has a singleton SignalHandler class that solves all of
these issues.

Closes-Bug: #2056366
Change-Id: I730a12746bceaa744c658854e38439420efc4629
Signed-off-by: Terry Wilson
(cherry picked from commit a4e49b6b8fcf9acfa4e84c65de19ffd56b9022e7)
(cherry picked from commit 2a09a4b802d2f959c9b2a847bc37a5c6b3b01dfc)
---
 neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py | 5 +++--
 neutron/plugins/ml2/ovo_rpc.py                             | 7 ++++---
 requirements.txt                                           | 2 +-
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py b/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py
index c3a63bab80a..65527020732 100644
--- a/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py
+++ b/neutron/plugins/ml2/drivers/ovn/mech_driver/mech_driver.py
@@ -19,7 +19,6 @@
 import functools
 import multiprocessing
 import operator
-import signal
 import threading
 import types
 import uuid
@@ -43,6 +42,7 @@
 from oslo_config import cfg
 from oslo_db import exception as os_db_exc
 from oslo_log import log
+from oslo_service import service as oslo_service
 from oslo_utils import timeutils
 from ovsdbapp.backend.ovs_idl import idlutils
 
@@ -312,8 +312,9 @@ def _setup_hash_ring(self):
         themselves to the hash ring.
         """
         # Attempt to remove the node from the ring when the worker stops
+        sh = oslo_service.SignalHandler()
         atexit.register(self._remove_node_from_hash_ring)
-        signal.signal(signal.SIGTERM, self._remove_node_from_hash_ring)
+        sh.add_handler("SIGTERM", self._remove_node_from_hash_ring)
         admin_context = n_context.get_admin_context()
         if not self._hash_ring_probe_event.is_set():
diff --git a/neutron/plugins/ml2/ovo_rpc.py b/neutron/plugins/ml2/ovo_rpc.py
index a63998c1698..788860bf180 100644
--- a/neutron/plugins/ml2/ovo_rpc.py
+++ b/neutron/plugins/ml2/ovo_rpc.py
@@ -13,7 +13,6 @@
 import atexit
 import queue
-import signal
 import threading
 import traceback
 import weakref
 
@@ -24,6 +23,7 @@
 from neutron_lib import context as n_ctx
 from neutron_lib.db import api as db_api
 from oslo_log import log as logging
+from oslo_service import service
 
 from neutron.api.rpc.callbacks import events as rpc_events
 from neutron.api.rpc.handlers import resources_rpc
@@ -38,8 +38,9 @@
 
 def _setup_change_handlers_cleanup():
     atexit.register(_ObjectChangeHandler.clean_up)
-    signal.signal(signal.SIGINT, _ObjectChangeHandler.clean_up)
-    signal.signal(signal.SIGTERM, _ObjectChangeHandler.clean_up)
+    sh = service.SignalHandler()
+    sh.add_handler("SIGINT", _ObjectChangeHandler.clean_up)
+    sh.add_handler("SIGTERM", _ObjectChangeHandler.clean_up)
 
 
 class _ObjectChangeHandler(object):
diff --git a/requirements.txt b/requirements.txt
index 58531dea3c3..0fc98413d18 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -42,7 +42,7 @@ oslo.privsep>=2.3.0 # Apache-2.0
 oslo.reports>=1.18.0 # Apache-2.0
 oslo.rootwrap>=5.15.0 # Apache-2.0
 oslo.serialization>=2.25.0 # Apache-2.0
-oslo.service>=2.8.0 # Apache-2.0
+oslo.service>=3.1.2 # Apache-2.0
 oslo.upgradecheck>=1.3.0 # Apache-2.0
 oslo.utils>=4.8.0 # Apache-2.0
 oslo.versionedobjects>=1.35.1 # Apache-2.0

From 2e238990e660609927b9464b355b8cac64a8c60d Mon Sep 17 00:00:00 2001
From: Rodolfo Alonso Hernandez
Date: Tue, 3 Sep 2024 09:30:54 +0000
Subject: [PATCH 2/4] [SR-IOV] The port status=DOWN has precedence in the VF
 link status

If a ML2/SR-IOV port is disabled (status=DOWN), it will have precedence
on the VF link state value over the "auto" value. That will stop any
transmission from the VF.

Closes-Bug: #2078789
Change-Id: I11d973d245dd391623e501aa14b470daa780b4db
(cherry picked from commit 8211c29158d6fc8a1af938c326dfbaa685428a4a)
---
 .../plugins/ml2/drivers/mech_sriov/agent/pci_lib.py    |  8 ++++++--
 .../ml2/drivers/mech_sriov/agent/test_pci_lib.py       | 11 ++++++++++-
 ...state-disable-has-precedence-2adecdf959dc0f9e.yaml  |  7 +++++++
 3 files changed, 23 insertions(+), 3 deletions(-)
 create mode 100644 releasenotes/notes/sriov-vf-state-disable-has-precedence-2adecdf959dc0f9e.yaml

diff --git a/neutron/plugins/ml2/drivers/mech_sriov/agent/pci_lib.py b/neutron/plugins/ml2/drivers/mech_sriov/agent/pci_lib.py
index 32931841c12..55a43bcf9dc 100644
--- a/neutron/plugins/ml2/drivers/mech_sriov/agent/pci_lib.py
+++ b/neutron/plugins/ml2/drivers/mech_sriov/agent/pci_lib.py
@@ -73,10 +73,14 @@ def set_vf_state(self, vf_index, state, auto=False):
         @param auto: set link_state to auto (0)
         """
         ip = self.device(self.dev_name)
-        if auto:
+        # NOTE(ralonsoh): the state=False --> "disable" (2) has precedence over
+        # "auto" (0) and "enable" (1).
+        if state is False:
+            link_state = 2
+        elif auto:
             link_state = 0
         else:
-            link_state = 1 if state else 2
+            link_state = 1
         vf_config = {'vf': vf_index, 'link_state': link_state}
         ip.link.set_vf_feature(vf_config)
 
diff --git a/neutron/tests/unit/plugins/ml2/drivers/mech_sriov/agent/test_pci_lib.py b/neutron/tests/unit/plugins/ml2/drivers/mech_sriov/agent/test_pci_lib.py
index 8276e189a4f..962c0772ee3 100644
--- a/neutron/tests/unit/plugins/ml2/drivers/mech_sriov/agent/test_pci_lib.py
+++ b/neutron/tests/unit/plugins/ml2/drivers/mech_sriov/agent/test_pci_lib.py
@@ -67,20 +67,29 @@ def test_get_vf_state_not_present(self):
         self.assertEqual(pci_lib.LinkState.disable.name, result)
 
     def test_set_vf_state(self):
+        # state=True, auto=False --> link_state=enable
         self.pci_wrapper.set_vf_state(self.VF_INDEX, True)
         vf = {'vf': self.VF_INDEX, 'link_state': 1}
         self.mock_ip_device.link.set_vf_feature.assert_called_once_with(vf)
 
+        # state=False, auto=False --> link_state=disable
         self.mock_ip_device.link.set_vf_feature.reset_mock()
         self.pci_wrapper.set_vf_state(self.VF_INDEX, False)
         vf = {'vf': self.VF_INDEX, 'link_state': 2}
         self.mock_ip_device.link.set_vf_feature.assert_called_once_with(vf)
 
+        # state=True, auto=True --> link_state=auto
         self.mock_ip_device.link.set_vf_feature.reset_mock()
-        self.pci_wrapper.set_vf_state(self.VF_INDEX, False, auto=True)
+        self.pci_wrapper.set_vf_state(self.VF_INDEX, True, auto=True)
         vf = {'vf': self.VF_INDEX, 'link_state': 0}
         self.mock_ip_device.link.set_vf_feature.assert_called_once_with(vf)
 
+        # state=False, auto=True --> link_state=disable
+        self.mock_ip_device.link.set_vf_feature.reset_mock()
+        self.pci_wrapper.set_vf_state(self.VF_INDEX, False, auto=True)
+        vf = {'vf': self.VF_INDEX, 'link_state': 2}
+        self.mock_ip_device.link.set_vf_feature.assert_called_once_with(vf)
+
     def test_set_vf_spoofcheck(self):
         self.pci_wrapper.set_vf_spoofcheck(self.VF_INDEX, True)
         vf = {'vf': self.VF_INDEX, 'spoofchk': 1}
diff --git a/releasenotes/notes/sriov-vf-state-disable-has-precedence-2adecdf959dc0f9e.yaml b/releasenotes/notes/sriov-vf-state-disable-has-precedence-2adecdf959dc0f9e.yaml
new file mode 100644
index 00000000000..6ab968a767e
--- /dev/null
+++ b/releasenotes/notes/sriov-vf-state-disable-has-precedence-2adecdf959dc0f9e.yaml
@@ -0,0 +1,7 @@
+---
+security:
+  - |
+    A ML2/SR-IOV port with status=DOWN will always set the VF link state to
+    "disable", regardless of the ``propagate_uplink_status`` port field value.
+    The port disabling, to stop any transmission, has precedence over the
+    link state "auto" value.

From bb9a4338e9b7138234d199de24d19594ca6d65b5 Mon Sep 17 00:00:00 2001
From: Rodolfo Alonso Hernandez
Date: Tue, 3 Sep 2024 10:31:24 +0000
Subject: [PATCH 3/4] Protect the "standardattr" retrieval from a concurrent
 deletion

The method ``_extend_tags_dict`` can be called from a "list" operation.
If one resource and its "standardattr" register is deleted concurrently,
the "standard_attr" field retrieval will fail.

The "list" operation is protected with a READER transaction context;
however this is failing with the DB PostgreSQL backend.

Closes-Bug: #2078787
Change-Id: I55142ce21cec8bd8e2d6b7b8b20c0147873699da
(cherry picked from commit c7d07b7421034c2722fb0d0cfd2371e052928b97)
---
 neutron/services/tag/tag_plugin.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/neutron/services/tag/tag_plugin.py b/neutron/services/tag/tag_plugin.py
index c44860034d5..ee2b468c8bb 100644
--- a/neutron/services/tag/tag_plugin.py
+++ b/neutron/services/tag/tag_plugin.py
@@ -47,7 +47,17 @@ def __new__(cls, *args, **kwargs):
     def _extend_tags_dict(response_data, db_data):
         if not directory.get_plugin(tagging.TAG_PLUGIN_TYPE):
             return
-        tags = [tag_db.tag for tag_db in db_data.standard_attr.tags]
+        try:
+            tags = [tag_db.tag for tag_db in db_data.standard_attr.tags]
+        except AttributeError:
+            # NOTE(ralonsoh): this method can be called from a "list"
+            # operation. If one resource and its "standardattr" register is
+            # deleted concurrently, the "standard_attr" field retrieval will
+            # fail.
+            # The "list" operation is protected with a READER transaction
+            # context; however this is failing with the DB PostgreSQL backend.
+            # https://bugs.launchpad.net/neutron/+bug/2078787
+            tags = []
         response_data['tags'] = tags
 
     @db_api.CONTEXT_READER

From f7d1df8ec0cfc10290a1b2e6d91656bb8845d73c Mon Sep 17 00:00:00 2001
From: yatinkarel
Date: Thu, 19 Sep 2024 18:32:11 +0530
Subject: [PATCH 4/4] Handle EndpointNotFound in nova notifier

Currently if the nova endpoint does not exist, an exception is raised.
Even after the endpoint gets created, notifications keep on failing
until the session expires. If the endpoint does not exist the session
is not useful, so mark it as invalid; this will ensure that if the
endpoint is created later the notifications do not fail.

Closes-Bug: #2081174
Change-Id: I1f7fd1d1371ca0a3c4edb409cffd2177d44a1f23
(cherry picked from commit 7d1a20ed4d458c6682a52679b71b6bc8dea20d07)
---
 neutron/notifiers/nova.py                 |  3 +++
 neutron/tests/unit/notifiers/test_nova.py | 10 ++++++++++
 2 files changed, 13 insertions(+)

diff --git a/neutron/notifiers/nova.py b/neutron/notifiers/nova.py
index 1269187ec98..9cb6c3880e2 100644
--- a/neutron/notifiers/nova.py
+++ b/neutron/notifiers/nova.py
@@ -281,6 +281,9 @@ def send_events(self, batched_events):
         try:
             response = novaclient.server_external_events.create(
                 batched_events)
+        except ks_exceptions.EndpointNotFound:
+            LOG.exception("Nova endpoint not found, invalidating the session")
+            self.session.invalidate()
         except nova_exceptions.NotFound:
             LOG.debug("Nova returned NotFound for event: %s",
                       batched_events)
diff --git a/neutron/tests/unit/notifiers/test_nova.py b/neutron/tests/unit/notifiers/test_nova.py
index 88c9690c2dd..302081897f1 100644
--- a/neutron/tests/unit/notifiers/test_nova.py
+++ b/neutron/tests/unit/notifiers/test_nova.py
@@ -237,6 +237,16 @@ def test_no_notification_notify_nova_on_port_data_changes_false(self):
             {}, {})
         self.assertFalse(send_events.called)
 
+    @mock.patch('novaclient.client.Client')
+    def test_nova_send_events_noendpoint_invalidate_session(self, mock_client):
+        create = mock_client().server_external_events.create
+        create.side_effect = ks_exc.EndpointNotFound
+        with mock.patch.object(self.nova_notifier.session,
+                               'invalidate', return_value=True) as mock_sess:
+            self.nova_notifier.send_events([])
+        create.assert_called()
+        mock_sess.assert_called()
+
     @mock.patch('novaclient.client.Client')
     def test_nova_send_events_returns_bad_list(self, mock_client):
         create = mock_client().server_external_events.create