From e0bd5872cd1a9fbab9699b8e3503c6df4c65e362 Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Fri, 20 Sep 2024 18:42:39 +0100 Subject: [PATCH 1/5] Remove unused poller No functional change as this wasn't actually in use. --- networking_mlnx/eswitchd/eswitch_daemon.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/networking_mlnx/eswitchd/eswitch_daemon.py b/networking_mlnx/eswitchd/eswitch_daemon.py index c0b527c..ef8de47 100644 --- a/networking_mlnx/eswitchd/eswitch_daemon.py +++ b/networking_mlnx/eswitchd/eswitch_daemon.py @@ -70,8 +70,6 @@ def _init_connections(self): self.conn_os_url = set_conn_url(os_transport, os_addr, os_port) self.socket_os.bind(self.conn_os_url) - self.poller = zmq.Poller() - self.poller.register(self.socket_os, zmq.POLLIN) def _handle_msg(self): data = None From 150e08f3990fd15d86f7c268a88b94547a6fe142 Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Fri, 20 Sep 2024 18:46:33 +0100 Subject: [PATCH 2/5] Set ZMQ_LINGER This will discard any pending messages immediately when the socket is closed. The default is to wait forever for any pending messages which can cause the the daemon to hang on shutdown. See: http://api.zeromq.org/4-2:zmq-setsockopt --- networking_mlnx/eswitchd/eswitch_daemon.py | 1 + 1 file changed, 1 insertion(+) diff --git a/networking_mlnx/eswitchd/eswitch_daemon.py b/networking_mlnx/eswitchd/eswitch_daemon.py index ef8de47..26ee66d 100644 --- a/networking_mlnx/eswitchd/eswitch_daemon.py +++ b/networking_mlnx/eswitchd/eswitch_daemon.py @@ -64,6 +64,7 @@ def _parse_physical_mapping(self): def _init_connections(self): context = zmq.Context() self.socket_os = context.socket(zmq.REP) + self.socket.setsockopt(zmq.LINGER, 0) os_transport = constants.SOCKET_OS_TRANSPORT os_port = constants.SOCKET_OS_PORT os_addr = constants.SOCKET_OS_ADDR From 0752bd683fcb2ce1510d2cbd74ffb1aefdc0ccc4 Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Fri, 20 Sep 2024 18:57:18 +0100 Subject: [PATCH 3/5] Always send a response Previously, if data was None, we'd not send a reply. This can lead to clients hanging when they are expecting a reply. --- networking_mlnx/eswitchd/eswitch_daemon.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/networking_mlnx/eswitchd/eswitch_daemon.py b/networking_mlnx/eswitchd/eswitch_daemon.py index 26ee66d..f68c3ae 100644 --- a/networking_mlnx/eswitchd/eswitch_daemon.py +++ b/networking_mlnx/eswitchd/eswitch_daemon.py @@ -80,15 +80,20 @@ def _handle_msg(self): if msg: data = jsonutils.loads(msg) - msg = None + result = { + "status": "FAIL", + "action": data.get("action", "UNKNOWN"), + "reason": "UNKNOWN" + } if data: try: result = self.dispatcher.handle_msg(data) - msg = jsonutils.dumps(result) except Exception as e: LOG.exception("Exception during message handling - %s", e) - msg = jsonutils.dumps(str(e)) - sender.send_string(msg) + result["reason"] = str(e) + + msg = jsonutils.dumps(result) + sender.send_string(msg) def daemon_loop(self): LOG.info("Daemon Started!") From bccbca237ccfbadfa54897449c7206ac56ecf628 Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Fri, 20 Sep 2024 19:21:12 +0100 Subject: [PATCH 4/5] Fix some file descriptor leaks Symptom was that eswitchd would become unresponsive after some time. The following error was observed: GetVnics failed - [Errno 24] Too many open files: OSError: [Errno 24] Too many open files This was tracked down to forgetting to call close on IPRoute objects as per the documentation. See: - https://docs.pyroute2.org/iproute.html - https://github.com/svinota/pyroute2/issues/631 --- networking_mlnx/eswitchd/utils/pci_utils.py | 4 +- .../internal/netdev_ops/impl_pyroute2.py | 74 +++++++++---------- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/networking_mlnx/eswitchd/utils/pci_utils.py b/networking_mlnx/eswitchd/utils/pci_utils.py index 8214300..a9bb74c 100644 --- a/networking_mlnx/eswitchd/utils/pci_utils.py +++ b/networking_mlnx/eswitchd/utils/pci_utils.py @@ -71,8 +71,8 @@ def get_vfs_info(self, pf): def get_dev_attr(self, attr_path): try: - fd = open(attr_path) - return fd.readline().strip() + with open(attr_path) as fd: + return fd.readline().strip() except IOError: return diff --git a/networking_mlnx/internal/netdev_ops/impl_pyroute2.py b/networking_mlnx/internal/netdev_ops/impl_pyroute2.py index 9255796..31233f8 100644 --- a/networking_mlnx/internal/netdev_ops/impl_pyroute2.py +++ b/networking_mlnx/internal/netdev_ops/impl_pyroute2.py @@ -33,16 +33,16 @@ def set_vf_admin_state(self, pf_ifname, vf_idx, state): :param state: desired admin state as defined in networking_mlnx.internal.netdev_ops.constants """ - try: - ip = pyroute2.IPRoute() - link_idx = ip.link_lookup(ifname=pf_ifname)[0] - ip.link( - 'set', index=link_idx, vf={'vf': int(vf_idx), - 'link_state': state}) - except IndexError: - raise exceptions.NetworkInterfaceNotFound(pf_ifname) - except pyroute2.NetlinkError as e: - raise exceptions.NetlinkRuntimeError(e) + with pyroute2.IPRoute() as ip: + try: + link_idx = ip.link_lookup(ifname=pf_ifname)[0] + ip.link( + 'set', index=link_idx, vf={'vf': int(vf_idx), + 'link_state': state}) + except IndexError: + raise exceptions.NetworkInterfaceNotFound(pf_ifname) + except pyroute2.NetlinkError as e: + raise exceptions.NetlinkRuntimeError(e) def set_link_state(self, ifname, state): """Set net device link state @@ -51,14 +51,14 @@ def set_link_state(self, ifname, state): :param state: desired link state as defined in networking_mlnx.internal.netdev_ops.constants """ - try: - ip = pyroute2.IPRoute() - link_idx = ip.link_lookup(ifname=ifname)[0] - ip.link('set', index=link_idx, state=state) - except IndexError: - raise exceptions.NetworkInterfaceNotFound(ifname) - except pyroute2.NetlinkError as e: - raise exceptions.NetlinkRuntimeError(e) + with pyroute2.IPRoute() as ip: + try: + link_idx = ip.link_lookup(ifname=ifname)[0] + ip.link('set', index=link_idx, state=state) + except IndexError: + raise exceptions.NetworkInterfaceNotFound(ifname) + except pyroute2.NetlinkError as e: + raise exceptions.NetlinkRuntimeError(e) def set_vf_guid(self, pf_ifname, vf_idx, guid): """Set vf administrative port and node GUID @@ -68,17 +68,17 @@ def set_vf_guid(self, pf_ifname, vf_idx, guid): :param guid: 64bit guid str in xx:xx:xx:xx:xx:xx:xx:xx format where x is a hexadecimal digit. """ - try: - ip = pyroute2.IPRoute() - link_idx = ip.link_lookup(ifname=pf_ifname)[0] - ip.link('set', index=link_idx, vf={'vf': int(vf_idx), - 'ib_port_guid': guid}) - ip.link('set', index=link_idx, - vf={'vf': int(vf_idx), 'ib_node_guid': guid}) - except IndexError: - raise exceptions.NetworkInterfaceNotFound(pf_ifname) - except pyroute2.NetlinkError as e: - raise exceptions.NetlinkRuntimeError(e) + with pyroute2.IPRoute() as ip: + try: + link_idx = ip.link_lookup(ifname=pf_ifname)[0] + ip.link('set', index=link_idx, vf={'vf': int(vf_idx), + 'ib_port_guid': guid}) + ip.link('set', index=link_idx, + vf={'vf': int(vf_idx), 'ib_node_guid': guid}) + except IndexError: + raise exceptions.NetworkInterfaceNotFound(pf_ifname) + except pyroute2.NetlinkError as e: + raise exceptions.NetlinkRuntimeError(e) def get_vf_guid(self, pf_ifname, vf_idx): """Get vf administrative GUID @@ -91,14 +91,14 @@ def get_vf_guid(self, pf_ifname, vf_idx): NOTE: while there are two GUIDs assigned per VF (port and node GUID) we assume they are the same and return just one value. """ - try: - ip = pyroute2.IPRoute() - link_idx = ip.link_lookup(ifname=pf_ifname)[0] - attrs = ip.link('get', index=link_idx, ext_mask=1)[0] - except IndexError: - raise exceptions.NetworkInterfaceNotFound(pf_ifname) - except pyroute2.NetlinkError as e: - raise exceptions.NetlinkRuntimeError(e) + with pyroute2.IPRoute() as ip: + try: + link_idx = ip.link_lookup(ifname=pf_ifname)[0] + attrs = ip.link('get', index=link_idx, ext_mask=1)[0] + except IndexError: + raise exceptions.NetworkInterfaceNotFound(pf_ifname) + except pyroute2.NetlinkError as e: + raise exceptions.NetlinkRuntimeError(e) vf_attr = (attrs.get_attr('IFLA_VFINFO_LIST'). get_attrs("IFLA_VF_INFO"))[int(vf_idx)] From 73273a61c44edf625cb12646007b9020248a3aec Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Fri, 20 Sep 2024 19:35:34 +0100 Subject: [PATCH 5/5] Set manadatory ip_version field in dhcp opts The API is defined in: https://github.com/openstack/neutron-lib/blob/90fa9948fd3b36d121ec5fa0b8004404f6d86b45/neutron_lib/api/definitions/extra_dhcp_opt.py#L32 We were missing the ip_version field. This was causing a bad interaction with the OVN driver. See: https://bugs.launchpad.net/neutron/+bug/2081161 --- networking_mlnx/plugins/ml2/drivers/mlnx/mech_mlnx.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/networking_mlnx/plugins/ml2/drivers/mlnx/mech_mlnx.py b/networking_mlnx/plugins/ml2/drivers/mlnx/mech_mlnx.py index e85dd78..95e08f3 100644 --- a/networking_mlnx/plugins/ml2/drivers/mlnx/mech_mlnx.py +++ b/networking_mlnx/plugins/ml2/drivers/mlnx/mech_mlnx.py @@ -86,7 +86,8 @@ def _gen_client_id_with_prefix(self, port, prefix): def _gen_client_id_opt(self, port): client_id = self._gen_client_id(port) return [{"opt_name": edo_ext.DHCP_OPT_CLIENT_ID, - "opt_value": client_id}] + "opt_value": client_id, + "ip_version": 4}] def _gen_none_client_id_opt(self, port): updated_extra_dhcp_opts = []