Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handling exceptions in CMIS SM to prevent xcvrd crash #483

Merged
merged 2 commits into from
May 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions sonic-xcvrd/tests/test_xcvrd.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,56 @@ def test_CmisManagerTask_task_run_with_exception(self):
assert("sonic-xcvrd/xcvrd/xcvrd.py" in str(trace))
assert("wait_for_port_config_done" in str(trace))

@patch('xcvrd.xcvrd.PortChangeObserver', MagicMock(handle_port_update_event=MagicMock()))
@patch('xcvrd.xcvrd.CmisManagerTask.wait_for_port_config_done', MagicMock())
@patch('xcvrd.xcvrd.get_cmis_application_desired', MagicMock(side_effect=KeyError))
@patch('xcvrd.xcvrd.log_exception_traceback')
@patch('xcvrd.xcvrd.XcvrTableHelper.get_status_tbl')
@patch('xcvrd.xcvrd.platform_chassis')
def test_CmisManagerTask_get_xcvr_api_exception(self, mock_platform_chassis, mock_get_status_tbl, mock_log_exception_traceback):
mock_get_status_tbl = Table("STATE_DB", TRANSCEIVER_STATUS_TABLE)
mock_sfp = MagicMock()
mock_sfp.get_presence.return_value = True
mock_platform_chassis.get_sfp = MagicMock(return_value=mock_sfp)
port_mapping = PortMapping()
stop_event = threading.Event()
task = CmisManagerTask(DEFAULT_NAMESPACE, port_mapping, stop_event)
task.task_stopping_event.is_set = MagicMock(side_effect=[False, False, True])
task.get_host_tx_status = MagicMock(return_value='true')
task.get_port_admin_status = MagicMock(return_value='up')
task.get_cfg_port_tbl = MagicMock()
task.xcvr_table_helper.get_status_tbl.return_value = mock_get_status_tbl
port_change_event = PortChangeEvent('Ethernet0', 1, 0, PortChangeEvent.PORT_SET,
{'speed':'400000', 'lanes':'1,2,3,4,5,6,7,8'})

# Case 1: get_xcvr_api() raises an exception
task.on_port_update_event(port_change_event)
mock_sfp.get_xcvr_api = MagicMock(side_effect=NotImplementedError)
task.task_worker()
assert mock_log_exception_traceback.call_count == 1
assert get_cmis_state_from_state_db('Ethernet0', task.xcvr_table_helper.get_status_tbl(task.port_mapping.get_asic_id_for_logical_port('Ethernet0'))) == CMIS_STATE_FAILED

# Case 2: is_flat_memory() raises AttributeError. In this case, CMIS SM should transition to READY state
mock_xcvr_api = MagicMock()
mock_sfp.get_xcvr_api = MagicMock(return_value=mock_xcvr_api)
mock_xcvr_api.is_flat_memory = MagicMock(side_effect=AttributeError)
task.task_stopping_event.is_set = MagicMock(side_effect=[False, False, True])
task.on_port_update_event(port_change_event)
task.task_worker()
assert get_cmis_state_from_state_db('Ethernet0', task.xcvr_table_helper.get_status_tbl(task.port_mapping.get_asic_id_for_logical_port('Ethernet0'))) == CMIS_STATE_READY

# Case 3: get_cmis_application_desired() raises an exception
mock_xcvr_api.is_flat_memory = MagicMock(return_value=False)
mock_xcvr_api.is_coherent_module = MagicMock(return_value=False)
mock_xcvr_api.get_module_type_abbreviation = MagicMock(return_value='QSFP-DD')
task.task_stopping_event.is_set = MagicMock(side_effect=[False, False, True])
task.on_port_update_event(port_change_event)
task.get_cmis_host_lanes_mask = MagicMock()
task.task_worker()
assert mock_log_exception_traceback.call_count == 2
assert get_cmis_state_from_state_db('Ethernet0', task.xcvr_table_helper.get_status_tbl(task.port_mapping.get_asic_id_for_logical_port('Ethernet0'))) == CMIS_STATE_FAILED
assert task.get_cmis_host_lanes_mask.call_count == 0

@patch('xcvrd.xcvrd_utilities.port_event_helper.subscribe_port_config_change', MagicMock(side_effect = NotImplementedError))
def test_DomInfoUpdateTask_task_run_with_exception(self):
port_mapping = PortMapping()
Expand Down
33 changes: 16 additions & 17 deletions sonic-xcvrd/xcvrd/xcvrd.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,12 @@
#
# Helper functions =============================================================
#

def log_exception_traceback():
exc_type, exc_value, exc_traceback = sys.exc_info()
msg = traceback.format_exception(exc_type, exc_value, exc_traceback)
for tb_line in msg:
for tb_line_split in tb_line.splitlines():
helper_logger.log_error(tb_line_split)

def is_cmis_api(api):
return isinstance(api, CmisApi)
Expand Down Expand Up @@ -1376,6 +1381,11 @@ def task_worker(self):
# Skip if these essential routines are not available
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It will be good to understand when the Attribute error exception happens. Ideally the below line "except Exception as e:" is a superset and should cover all exceptions.

Also what is the exact error/exception when the eeprom data is corrupted/bad -- sue to bad optics

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@judyjoseph Yes - the below line "except Exception as e:" covers all exceptions apart from AttributeError. However, for AttributeError, the CMIS SM is currently being transitioned to CMIS_STATE_READY and not CMIS_STATE_FAILED.

In case of bad optics, I have seen KeyError and TypeError so far.

self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_READY)
continue
except Exception as e:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mihirpat1 why we made distinction between AttributeError vs others?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@prgeor The AttributeError relevant handling here seems to be a day 1 code. However, I am not sure on the original reason for this so decided to leave the existing behavior as is since with AttributeError, we are moving CMIS SM to CMIS_STATE_READY.
Hence, I am now handling all other exceptions in a different block and moving the CMIS SM to CMIS_STATE_FAILED state.

self.log_error("{}: Exception in xcvr api: {}".format(lport, e))
log_exception_traceback()
self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_FAILED)
continue

# CMIS expiration and retries
#
Expand Down Expand Up @@ -1608,8 +1618,9 @@ def task_worker(self):
self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_READY)
self.post_port_active_apsel_to_db(api, lport, host_lanes_mask)

except (NotImplementedError, AttributeError) as e:
except Exception as e:
self.log_error("{}: internal errors due to {}".format(lport, e))
log_exception_traceback()
self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_FAILED)

self.log_notice("Stopped")
Expand All @@ -1627,11 +1638,7 @@ def run(self):
self.task_worker()
except Exception as e:
helper_logger.log_error("Exception occured at {} thread due to {}".format(threading.current_thread().getName(), repr(e)))
exc_type, exc_value, exc_traceback = sys.exc_info()
msg = traceback.format_exception(exc_type, exc_value, exc_traceback)
for tb_line in msg:
for tb_line_split in tb_line.splitlines():
helper_logger.log_error(tb_line_split)
log_exception_traceback()
self.exc = e
self.main_thread_stop_event.set()

Expand Down Expand Up @@ -1791,11 +1798,7 @@ def run(self):
self.task_worker()
except Exception as e:
helper_logger.log_error("Exception occured at {} thread due to {}".format(threading.current_thread().getName(), repr(e)))
exc_type, exc_value, exc_traceback = sys.exc_info()
msg = traceback.format_exception(exc_type, exc_value, exc_traceback)
for tb_line in msg:
for tb_line_split in tb_line.splitlines():
helper_logger.log_error(tb_line_split)
log_exception_traceback()
self.exc = e
self.main_thread_stop_event.set()

Expand Down Expand Up @@ -2216,11 +2219,7 @@ def run(self):
self.task_worker(self.task_stopping_event, self.sfp_error_event)
except Exception as e:
helper_logger.log_error("Exception occured at {} thread due to {}".format(threading.current_thread().getName(), repr(e)))
exc_type, exc_value, exc_traceback = sys.exc_info()
msg = traceback.format_exception(exc_type, exc_value, exc_traceback)
for tb_line in msg:
for tb_line_split in tb_line.splitlines():
helper_logger.log_error(tb_line_split)
log_exception_traceback()
self.exc = e
self.main_thread_stop_event.set()

Expand Down
Loading