Skip to content

Commit

Permalink
[202311] Handling exceptions in CMIS SM to prevent xcvrd crash (#485)
Browse files Browse the repository at this point in the history
Signed-off-by: Mihir Patel <patelmi@microsoft.com>
  • Loading branch information
mihirpat1 committed May 2, 2024
1 parent 4c5f727 commit 867b0c7
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 17 deletions.
51 changes: 51 additions & 0 deletions sonic-xcvrd/tests/test_xcvrd.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,57 @@ def test_CmisManagerTask_task_run_with_exception(self):
assert("sonic-xcvrd/xcvrd/xcvrd.py" in str(trace))
assert("wait_for_port_config_done" in str(trace))

@patch('xcvrd.xcvrd_utilities.port_mapping.subscribe_port_update_event', MagicMock(return_value=(None, None)))
@patch('xcvrd.xcvrd_utilities.port_mapping.handle_port_update_event', MagicMock())
@patch('xcvrd.xcvrd.CmisManagerTask.wait_for_port_config_done', MagicMock())
@patch('xcvrd.xcvrd.get_cmis_application_desired', MagicMock(side_effect=KeyError))
@patch('xcvrd.xcvrd.log_exception_traceback')
@patch('xcvrd.xcvrd.XcvrTableHelper.get_status_tbl')
@patch('xcvrd.xcvrd.platform_chassis')
def test_CmisManagerTask_get_xcvr_api_exception(self, mock_platform_chassis, mock_get_status_tbl, mock_log_exception_traceback):
mock_get_status_tbl = Table("STATE_DB", TRANSCEIVER_STATUS_TABLE)
mock_sfp = MagicMock()
mock_sfp.get_presence.return_value = True
mock_platform_chassis.get_sfp = MagicMock(return_value=mock_sfp)
port_mapping = PortMapping()
stop_event = threading.Event()
task = CmisManagerTask(DEFAULT_NAMESPACE, port_mapping, stop_event)
task.task_stopping_event.is_set = MagicMock(side_effect=[False, False, True])
task.get_host_tx_status = MagicMock(return_value='true')
task.get_port_admin_status = MagicMock(return_value='up')
task.get_cfg_port_tbl = MagicMock()
task.xcvr_table_helper.get_status_tbl.return_value = mock_get_status_tbl
port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_SET,
{'speed':'400000', 'lanes':'1,2,3,4,5,6,7,8'})

# Case 1: get_xcvr_api() raises an exception
task.on_port_update_event(port_change_event)
mock_sfp.get_xcvr_api = MagicMock(side_effect=NotImplementedError)
task.task_worker()
assert mock_log_exception_traceback.call_count == 1
assert get_cmis_state_from_state_db('Ethernet0', task.xcvr_table_helper.get_status_tbl(task.port_mapping.get_asic_id_for_logical_port('Ethernet0'))) == CMIS_STATE_FAILED

# Case 2: is_flat_memory() raises AttributeError. In this case, CMIS SM should transition to READY state
mock_xcvr_api = MagicMock()
mock_sfp.get_xcvr_api = MagicMock(return_value=mock_xcvr_api)
mock_xcvr_api.is_flat_memory = MagicMock(side_effect=AttributeError)
task.task_stopping_event.is_set = MagicMock(side_effect=[False, False, True])
task.on_port_update_event(port_change_event)
task.task_worker()
assert get_cmis_state_from_state_db('Ethernet0', task.xcvr_table_helper.get_status_tbl(task.port_mapping.get_asic_id_for_logical_port('Ethernet0'))) == CMIS_STATE_READY

# Case 3: get_cmis_application_desired() raises an exception
mock_xcvr_api.is_flat_memory = MagicMock(return_value=False)
mock_xcvr_api.is_coherent_module = MagicMock(return_value=False)
mock_xcvr_api.get_module_type_abbreviation = MagicMock(return_value='QSFP-DD')
task.task_stopping_event.is_set = MagicMock(side_effect=[False, False, True])
task.on_port_update_event(port_change_event)
task.get_cmis_host_lanes_mask = MagicMock()
task.task_worker()
assert mock_log_exception_traceback.call_count == 2
assert get_cmis_state_from_state_db('Ethernet0', task.xcvr_table_helper.get_status_tbl(task.port_mapping.get_asic_id_for_logical_port('Ethernet0'))) == CMIS_STATE_FAILED
assert task.get_cmis_host_lanes_mask.call_count == 0

@patch('xcvrd.xcvrd_utilities.port_mapping.subscribe_port_config_change', MagicMock(side_effect = NotImplementedError))
def test_DomInfoUpdateTask_task_run_with_exception(self):
port_mapping = PortMapping()
Expand Down
33 changes: 16 additions & 17 deletions sonic-xcvrd/xcvrd/xcvrd.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,12 @@
#
# Helper functions =============================================================
#

def log_exception_traceback():
exc_type, exc_value, exc_traceback = sys.exc_info()
msg = traceback.format_exception(exc_type, exc_value, exc_traceback)
for tb_line in msg:
for tb_line_split in tb_line.splitlines():
helper_logger.log_error(tb_line_split)

def is_cmis_api(api):
return isinstance(api, CmisApi)
Expand Down Expand Up @@ -1346,6 +1351,11 @@ def task_worker(self):
# Skip if these essential routines are not available
self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_READY)
continue
except Exception as e:
self.log_error("{}: Exception in xcvr api: {}".format(lport, e))
log_exception_traceback()
self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_FAILED)
continue

# CMIS expiration and retries
#
Expand Down Expand Up @@ -1574,8 +1584,9 @@ def task_worker(self):
self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_READY)
self.post_port_active_apsel_to_db(api, lport, host_lanes_mask)

except (NotImplementedError, AttributeError) as e:
except Exception as e:
self.log_error("{}: internal errors due to {}".format(lport, e))
log_exception_traceback()
self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_FAILED)

self.log_notice("Stopped")
Expand All @@ -1593,11 +1604,7 @@ def run(self):
self.task_worker()
except Exception as e:
helper_logger.log_error("Exception occured at {} thread due to {}".format(threading.current_thread().getName(), repr(e)))
exc_type, exc_value, exc_traceback = sys.exc_info()
msg = traceback.format_exception(exc_type, exc_value, exc_traceback)
for tb_line in msg:
for tb_line_split in tb_line.splitlines():
helper_logger.log_error(tb_line_split)
log_exception_traceback()
self.exc = e
self.main_thread_stop_event.set()

Expand Down Expand Up @@ -1757,11 +1764,7 @@ def run(self):
self.task_worker()
except Exception as e:
helper_logger.log_error("Exception occured at {} thread due to {}".format(threading.current_thread().getName(), repr(e)))
exc_type, exc_value, exc_traceback = sys.exc_info()
msg = traceback.format_exception(exc_type, exc_value, exc_traceback)
for tb_line in msg:
for tb_line_split in tb_line.splitlines():
helper_logger.log_error(tb_line_split)
log_exception_traceback()
self.exc = e
self.main_thread_stop_event.set()

Expand Down Expand Up @@ -2182,11 +2185,7 @@ def run(self):
self.task_worker(self.task_stopping_event, self.sfp_error_event)
except Exception as e:
helper_logger.log_error("Exception occured at {} thread due to {}".format(threading.current_thread().getName(), repr(e)))
exc_type, exc_value, exc_traceback = sys.exc_info()
msg = traceback.format_exception(exc_type, exc_value, exc_traceback)
for tb_line in msg:
for tb_line_split in tb_line.splitlines():
helper_logger.log_error(tb_line_split)
log_exception_traceback()
self.exc = e
self.main_thread_stop_event.set()

Expand Down

0 comments on commit 867b0c7

Please sign in to comment.