diff --git a/diagnostic_aggregator/src/aggregator.cpp b/diagnostic_aggregator/src/aggregator.cpp index 5ae84003..4c4ba992 100644 --- a/diagnostic_aggregator/src/aggregator.cpp +++ b/diagnostic_aggregator/src/aggregator.cpp @@ -230,7 +230,9 @@ void Aggregator::publishData() diag_toplevel_state.name = "toplevel_state"; diag_toplevel_state.level = DiagnosticStatus::STALE; int max_level = -1; - int min_level = 255; + uint8_t max_level_without_stale = 0; + int non_ok_status_depth = 0; + std::shared_ptr msg_to_report; std::vector> processed; { @@ -239,12 +241,23 @@ void Aggregator::publishData() } for (const auto & msg : processed) { diag_array.status.push_back(*msg); + const auto depth = std::count(msg->name.begin(), msg->name.end(), '/'); if (msg->level > max_level) { max_level = msg->level; + non_ok_status_depth = depth; + msg_to_report = msg; } - if (msg->level < min_level) { - min_level = msg->level; + if (msg->level == max_level && depth > non_ok_status_depth) { + // On non okay diagnostics also copy the deepest message to toplevel state + non_ok_status_depth = depth; + msg_to_report = msg; + } + if ( + msg->level > max_level_without_stale && + msg->level != diagnostic_msgs::msg::DiagnosticStatus::STALE) + { + max_level_without_stale = msg->level; } } @@ -252,26 +265,47 @@ void Aggregator::publishData() other_analyzer_->report(); for (const auto & msg : processed_other) { diag_array.status.push_back(*msg); + const auto depth = std::count(msg->name.begin(), msg->name.end(), '/'); if (msg->level > max_level) { max_level = msg->level; + non_ok_status_depth = depth; + msg_to_report = msg; + } + if (msg->level == max_level && depth > non_ok_status_depth) { + // On non okay diagnostics also copy the deepest message to toplevel state + non_ok_status_depth = depth; + msg_to_report = msg; } - if (msg->level < min_level) { - min_level = msg->level; + if ( + msg->level > max_level_without_stale && + msg->level != diagnostic_msgs::msg::DiagnosticStatus::STALE) + { + max_level_without_stale = msg->level; } } + // When a non-ok item was found, surface the offender via message/hardware_id/values + // but keep name stable as "toplevel_state" to avoid breaking downstream consumers + if (max_level > DiagnosticStatus::OK && msg_to_report) { + diag_toplevel_state.message = msg_to_report->name + ": " + msg_to_report->message; + diag_toplevel_state.hardware_id = msg_to_report->hardware_id; + diag_toplevel_state.values = msg_to_report->values; + } + diag_array.header.stamp = clock_->now(); agg_pub_->publish(diag_array); - diag_toplevel_state.level = max_level; - if (max_level < 0 || - (max_level > DiagnosticStatus::ERROR && min_level <= DiagnosticStatus::ERROR)) - { - // Top level is error if we got no diagnostic level or - // have stale items but not all are stale - diag_toplevel_state.level = DiagnosticStatus::ERROR; + if (max_level_without_stale > DiagnosticStatus::OK) { + diag_toplevel_state.level = max_level_without_stale; + } else if (max_level == diagnostic_msgs::msg::DiagnosticStatus::STALE) { + diag_toplevel_state.level = diagnostic_msgs::msg::DiagnosticStatus::STALE; + } else if (max_level < 0) { + diag_toplevel_state.level = diagnostic_msgs::msg::DiagnosticStatus::STALE; + } else { + diag_toplevel_state.level = DiagnosticStatus::OK; } + last_top_level_state_ = diag_toplevel_state.level; toplevel_state_pub_->publish(diag_toplevel_state); diff --git a/diagnostic_aggregator/src/analyzer_group.cpp b/diagnostic_aggregator/src/analyzer_group.cpp index 0873ac4c..a230c326 100644 --- a/diagnostic_aggregator/src/analyzer_group.cpp +++ b/diagnostic_aggregator/src/analyzer_group.cpp @@ -292,7 +292,7 @@ std::vector> AnalyzerGro return output; } - bool all_stale = true; + uint8_t max_level_without_stale = 0; for (auto j = 0u; j < analyzers_.size(); ++j) { std::string path = analyzers_[j]->getPath(); @@ -317,17 +317,22 @@ std::vector> AnalyzerGro kv.key = nice_name; kv.value = processed[i]->message; - all_stale = all_stale && - (processed[i]->level == diagnostic_msgs::msg::DiagnosticStatus::STALE); + if (processed[i]->level != diagnostic_msgs::msg::DiagnosticStatus::STALE) { + max_level_without_stale = max(max_level_without_stale, processed[i]->level); + } header_status->level = max(header_status->level, processed[i]->level); header_status->values.push_back(kv); } } } - // Report stale as errors unless all stale - if (header_status->level == diagnostic_msgs::msg::DiagnosticStatus::STALE && !all_stale) { - header_status->level = diagnostic_msgs::msg::DiagnosticStatus::ERROR; + // WARN/ERROR always beats STALE; if only STALE present, report STALE + if (max_level_without_stale > diagnostic_msgs::msg::DiagnosticStatus::OK) { + header_status->level = max_level_without_stale; + } else if (header_status->level == diagnostic_msgs::msg::DiagnosticStatus::STALE) { + header_status->level = diagnostic_msgs::msg::DiagnosticStatus::STALE; + } else { + header_status->level = diagnostic_msgs::msg::DiagnosticStatus::OK; } header_status->message = valToMsg(header_status->level); diff --git a/diagnostic_aggregator/test/test_discard_behavior.py b/diagnostic_aggregator/test/test_discard_behavior.py index 18112b0e..7b3251a1 100644 --- a/diagnostic_aggregator/test/test_discard_behavior.py +++ b/diagnostic_aggregator/test/test_discard_behavior.py @@ -31,7 +31,7 @@ # POSSIBILITY OF SUCH DAMAGE. # # DESCRIPTION -# This test ensures that a parent AnalyzerGroup does not roll up an ERROR state when a +# This test ensures that a parent AnalyzerGroup does *not* roll up an ERROR state when a # GenericAnalyzer child block is marked with discard_stale: true and either of these # conditions are met: # @@ -86,6 +86,7 @@ # A status value of 'None' means that the state is never sent (it's missing). TEST_METADATA = [ # CASE 1: both 'foo' and 'bar' are marked discard_stale := true + # --> the aggregator should roll up to OK, because the stale statuses are discarded. TestMetadata( foo_discard=True, foo_status=None, @@ -107,7 +108,16 @@ bar_status=DiagnosticStatus.STALE, agg_expected=DiagnosticStatus.OK, ), + TestMetadata( + foo_discard=True, + foo_status=DiagnosticStatus.STALE, + bar_discard=True, + bar_status=None, + agg_expected=DiagnosticStatus.OK, + ), # CASE 2: both 'foo' and 'bar' are marked discard_stale := false + # --> the aggregator should roll up to STALE, as long as they aggregated are either + # missing or stale, because the stale statuses are *not* discarded. TestMetadata( foo_discard=False, foo_status=None, @@ -129,38 +139,119 @@ bar_status=DiagnosticStatus.STALE, agg_expected=DiagnosticStatus.STALE, ), - # CASE 3: one of 'foo' or 'bar' are marked discard_stale := true + TestMetadata( + foo_discard=False, + foo_status=DiagnosticStatus.STALE, + bar_discard=False, + bar_status=None, + agg_expected=DiagnosticStatus.STALE, + ), + # CASE 3: one of 'foo' or 'bar' are marked discard_stale := true and the corresponding status + # is missing or stale while the other is OK. + # --> the aggregator should roll up to OK, because the stale status is discarded. TestMetadata( foo_discard=True, foo_status=None, bar_discard=False, + bar_status=DiagnosticStatus.OK, + agg_expected=DiagnosticStatus.OK, + ), + TestMetadata( + foo_discard=True, + foo_status=DiagnosticStatus.STALE, + bar_discard=False, + bar_status=DiagnosticStatus.OK, + agg_expected=DiagnosticStatus.OK, + ), + TestMetadata( + foo_discard=True, + foo_status=DiagnosticStatus.OK, + bar_discard=False, + bar_status=DiagnosticStatus.OK, + agg_expected=DiagnosticStatus.OK, + ), + TestMetadata( + foo_discard=False, + foo_status=DiagnosticStatus.OK, + bar_discard=True, bar_status=None, - agg_expected=DiagnosticStatus.ERROR, # <-- This is the case we are testing for. - # if one of the children is *not* marked discard_stale := true and - # there are no statuses, then the parent should roll up to ERROR. + agg_expected=DiagnosticStatus.OK, + ), + TestMetadata( + foo_discard=False, + foo_status=DiagnosticStatus.OK, + bar_discard=True, + bar_status=DiagnosticStatus.STALE, + agg_expected=DiagnosticStatus.OK, ), TestMetadata( foo_discard=True, foo_status=DiagnosticStatus.OK, bar_discard=False, + bar_status=DiagnosticStatus.OK, + agg_expected=DiagnosticStatus.OK, + ), + # CASE 4: one of 'foo' or 'bar' are marked discard_stale := true and the corresponding status + # is missing or stale while the other is missing or stale. + # --> the aggregator should roll up to STALE + TestMetadata( + foo_discard=True, + foo_status=None, + bar_discard=False, + bar_status=None, + agg_expected=DiagnosticStatus.STALE, + ), + TestMetadata( + foo_discard=True, + foo_status=DiagnosticStatus.STALE, + bar_discard=False, bar_status=None, - agg_expected=DiagnosticStatus.ERROR, + agg_expected=DiagnosticStatus.STALE, ), TestMetadata( foo_discard=True, foo_status=None, bar_discard=False, - bar_status=DiagnosticStatus.OK, - agg_expected=DiagnosticStatus.OK, # <-- This is the case we are testing for. - # but if a child is marked discard_stale := true and there are no statuses, - # the parent should roll up to OK. + bar_status=DiagnosticStatus.STALE, + agg_expected=DiagnosticStatus.STALE, ), + TestMetadata( + foo_discard=True, + foo_status=DiagnosticStatus.STALE, + bar_discard=False, + bar_status=DiagnosticStatus.STALE, + agg_expected=DiagnosticStatus.STALE, + ), + # CASE 5: one of 'foo' or 'bar' are marked discard_stale := true and the corresponding status + # is OK while the other is missing or stale. + # --> the aggregator should roll up to STALE, because it has a higher severity than OK. TestMetadata( foo_discard=True, foo_status=DiagnosticStatus.OK, bar_discard=False, + bar_status=None, + agg_expected=DiagnosticStatus.STALE, + ), + TestMetadata( + foo_discard=True, + foo_status=DiagnosticStatus.OK, + bar_discard=False, + bar_status=DiagnosticStatus.STALE, + agg_expected=DiagnosticStatus.STALE, + ), + TestMetadata( + foo_discard=False, + foo_status=None, + bar_discard=True, bar_status=DiagnosticStatus.OK, - agg_expected=DiagnosticStatus.OK, + agg_expected=DiagnosticStatus.STALE, + ), + TestMetadata( + foo_discard=False, + foo_status=DiagnosticStatus.STALE, + bar_discard=True, + bar_status=DiagnosticStatus.OK, + agg_expected=DiagnosticStatus.STALE, ), ]