Skip to content

Commit

Permalink
[QoS] Resolve an issue in the sequence where a referenced object remo…
Browse files Browse the repository at this point in the history
…ved and then the referencing object deleting and then re-adding (#2210)

- What I did
Resolve an issue in the following scenario
Suppose object a in table A references object b in table B. When a user is going to remove items in table A and B, the notifications can be received in the following order:

The notification of removing object b
Then the notification of removing object a
And then the notification of re-adding object a
The notification of re-adding object b.
Object b can not be removed in the 1st step because it is still referenced by object a. In case the system is busy, the notification of removing a remains in m_toSync when the notification of re-adding it is coming, which results in both notifications being handled together and the reference to object b not being cleared.
As a result, notification of removing b will never be handled and remain in m_toSync forever.

Solution:

Introduce a flag pending remove indicating whether an object is about to be removed but pending on some reference.
pending remove is set once a DEL notification is received for an object with non-zero reference.
When resolving references in step 3, a pending remove object will be skipped and the notification will remain in m_toSync.
SET operation will not be carried out in case there is a pending remove flag on the object to be set.
By doing so, when object a is re-added in step 3, it can not retrieve the dependent object b. And step 1 will be handled and drained successfully.

- Why I did it
Fix bug.

- How I verified it
Mock test and manually test (eg. config qos reload)

Signed-off-by: Stephen Sun <stephens@nvidia.com>
  • Loading branch information
stephenxs committed Apr 18, 2022
1 parent eaf7264 commit d8fadc6
Show file tree
Hide file tree
Showing 8 changed files with 614 additions and 75 deletions.
14 changes: 14 additions & 0 deletions orchagent/bufferorch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,11 @@ task_process_status BufferOrch::processBufferPool(KeyOpFieldsValuesTuple &tuple)
{
sai_object = (*(m_buffer_type_maps[map_type_name]))[object_name].m_saiObjectId;
SWSS_LOG_DEBUG("found existing object:%s of type:%s", object_name.c_str(), map_type_name.c_str());
if ((*(m_buffer_type_maps[map_type_name]))[object_name].m_pendingRemove && op == SET_COMMAND)
{
SWSS_LOG_NOTICE("Entry %s %s is pending remove, need retry", map_type_name.c_str(), object_name.c_str());
return task_process_status::task_need_retry;
}
}
SWSS_LOG_DEBUG("processing command:%s", op.c_str());
if (object_name == "ingress_zero_pool")
Expand Down Expand Up @@ -565,6 +570,7 @@ task_process_status BufferOrch::processBufferPool(KeyOpFieldsValuesTuple &tuple)
}

(*(m_buffer_type_maps[map_type_name]))[object_name].m_saiObjectId = sai_object;
(*(m_buffer_type_maps[map_type_name]))[object_name].m_pendingRemove = false;
// Here we take the PFC watchdog approach to update the COUNTERS_DB metadata (e.g., PFC_WD_DETECTION_TIME per queue)
// at initialization (creation and registration phase)
// Specifically, we push the buffer pool name to oid mapping upon the creation of the oid
Expand All @@ -579,6 +585,7 @@ task_process_status BufferOrch::processBufferPool(KeyOpFieldsValuesTuple &tuple)
{
auto hint = objectReferenceInfo(m_buffer_type_maps, map_type_name, object_name);
SWSS_LOG_NOTICE("Can't remove object %s due to being referenced (%s)", object_name.c_str(), hint.c_str());
(*(m_buffer_type_maps[map_type_name]))[object_name].m_pendingRemove = true;

return task_process_status::task_need_retry;
}
Expand Down Expand Up @@ -647,6 +654,11 @@ task_process_status BufferOrch::processBufferProfile(KeyOpFieldsValuesTuple &tup
{
sai_object = (*(m_buffer_type_maps[map_type_name]))[object_name].m_saiObjectId;
SWSS_LOG_DEBUG("found existing object:%s of type:%s", object_name.c_str(), map_type_name.c_str());
if ((*(m_buffer_type_maps[map_type_name]))[object_name].m_pendingRemove && op == SET_COMMAND)
{
SWSS_LOG_NOTICE("Entry %s %s is pending remove, need retry", map_type_name.c_str(), object_name.c_str());
return task_process_status::task_need_retry;
}
}
SWSS_LOG_DEBUG("processing command:%s", op.c_str());
if (op == SET_COMMAND)
Expand Down Expand Up @@ -787,6 +799,7 @@ task_process_status BufferOrch::processBufferProfile(KeyOpFieldsValuesTuple &tup
}
}
(*(m_buffer_type_maps[map_type_name]))[object_name].m_saiObjectId = sai_object;
(*(m_buffer_type_maps[map_type_name]))[object_name].m_pendingRemove = false;
SWSS_LOG_NOTICE("Created buffer profile %s with type %s", object_name.c_str(), map_type_name.c_str());
}

Expand All @@ -799,6 +812,7 @@ task_process_status BufferOrch::processBufferProfile(KeyOpFieldsValuesTuple &tup
{
auto hint = objectReferenceInfo(m_buffer_type_maps, map_type_name, object_name);
SWSS_LOG_NOTICE("Can't remove object %s due to being referenced (%s)", object_name.c_str(), hint.c_str());
(*(m_buffer_type_maps[map_type_name]))[object_name].m_pendingRemove = true;

return task_process_status::task_need_retry;
}
Expand Down
5 changes: 5 additions & 0 deletions orchagent/orch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,11 @@ bool Orch::parseReference(type_map &type_maps, string &ref_in, const string &typ
SWSS_LOG_INFO("map:%s does not contain object with name:%s\n", type_name.c_str(), ref_in.c_str());
return false;
}
if (obj_it->second.m_pendingRemove)
{
SWSS_LOG_NOTICE("map:%s contains a pending removed object %s, skip\n", type_name.c_str(), ref_in.c_str());
return false;
}
object_name = ref_in;
SWSS_LOG_DEBUG("parsed: type_name:%s, object_name:%s", type_name.c_str(), object_name.c_str());
return true;
Expand Down
1 change: 1 addition & 0 deletions orchagent/orch.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ typedef struct
// multiple objects being referenced are separated by ','
std::map<std::string, std::string> m_objsReferencingByMe;
sai_object_id_t m_saiObjectId;
bool m_pendingRemove;
} referenced_object;

typedef std::map<std::string, referenced_object> object_reference_map;
Expand Down
86 changes: 14 additions & 72 deletions orchagent/qosorch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,11 @@ task_process_status QosMapHandler::processWorkItem(Consumer& consumer, KeyOpFiel
if (QosOrch::getTypeMap()[qos_map_type_name]->find(qos_object_name) != QosOrch::getTypeMap()[qos_map_type_name]->end())
{
sai_object = (*(QosOrch::getTypeMap()[qos_map_type_name]))[qos_object_name].m_saiObjectId;
if ((*(QosOrch::getTypeMap()[qos_map_type_name]))[qos_object_name].m_pendingRemove && op == SET_COMMAND)
{
SWSS_LOG_NOTICE("Entry %s %s is pending remove, need retry", qos_map_type_name.c_str(), qos_object_name.c_str());
return task_process_status::task_need_retry;
}
}
if (op == SET_COMMAND)
{
Expand Down Expand Up @@ -138,6 +143,7 @@ task_process_status QosMapHandler::processWorkItem(Consumer& consumer, KeyOpFiel
return task_process_status::task_failed;
}
(*(QosOrch::getTypeMap()[qos_map_type_name]))[qos_object_name].m_saiObjectId = sai_object;
(*(QosOrch::getTypeMap()[qos_map_type_name]))[qos_object_name].m_pendingRemove = false;
SWSS_LOG_NOTICE("Created [%s:%s]", qos_map_type_name.c_str(), qos_object_name.c_str());
}
freeAttribResources(attributes);
Expand All @@ -153,6 +159,7 @@ task_process_status QosMapHandler::processWorkItem(Consumer& consumer, KeyOpFiel
{
auto hint = gQosOrch->objectReferenceInfo(QosOrch::getTypeMap(), qos_map_type_name, qos_object_name);
SWSS_LOG_NOTICE("Can't remove object %s due to being referenced (%s)", qos_object_name.c_str(), hint.c_str());
(*(QosOrch::getTypeMap()[qos_map_type_name]))[qos_object_name].m_pendingRemove = true;
return task_process_status::task_need_retry;
}
if (!removeQosItem(sai_object))
Expand Down Expand Up @@ -1121,6 +1128,11 @@ task_process_status QosOrch::handleSchedulerTable(Consumer& consumer, KeyOpField
SWSS_LOG_ERROR("Error sai_object must exist for key %s", qos_object_name.c_str());
return task_process_status::task_invalid_entry;
}
if ((*(m_qos_maps[qos_map_type_name]))[qos_object_name].m_pendingRemove && op == SET_COMMAND)
{
SWSS_LOG_NOTICE("Entry %s %s is pending remove, need retry", qos_map_type_name.c_str(), qos_object_name.c_str());
return task_process_status::task_need_retry;
}
}
if (op == SET_COMMAND)
{
Expand Down Expand Up @@ -1228,6 +1240,7 @@ task_process_status QosOrch::handleSchedulerTable(Consumer& consumer, KeyOpField
}
SWSS_LOG_NOTICE("Created [%s:%s]", qos_map_type_name.c_str(), qos_object_name.c_str());
(*(m_qos_maps[qos_map_type_name]))[qos_object_name].m_saiObjectId = sai_object;
(*(m_qos_maps[qos_map_type_name]))[qos_object_name].m_pendingRemove = false;
}
}
else if (op == DEL_COMMAND)
Expand All @@ -1241,6 +1254,7 @@ task_process_status QosOrch::handleSchedulerTable(Consumer& consumer, KeyOpField
{
auto hint = gQosOrch->objectReferenceInfo(QosOrch::getTypeMap(), qos_map_type_name, qos_object_name);
SWSS_LOG_NOTICE("Can't remove object %s due to being referenced (%s)", qos_object_name.c_str(), hint.c_str());
(*(m_qos_maps[qos_map_type_name]))[qos_object_name].m_pendingRemove = true;
return task_process_status::task_need_retry;
}
sai_status = sai_scheduler_api->remove_scheduler(sai_object);
Expand Down Expand Up @@ -1613,78 +1627,6 @@ task_process_status QosOrch::handleQueueTable(Consumer& consumer, KeyOpFieldsVal
return task_process_status::task_success;
}

bool QosOrch::applyMapToPort(Port &port, sai_attr_id_t attr_id, sai_object_id_t map_id)
{
SWSS_LOG_ENTER();

sai_attribute_t attr;
attr.id = attr_id;
attr.value.oid = map_id;

sai_status_t status = sai_port_api->set_port_attribute(port.m_port_id, &attr);
if (status != SAI_STATUS_SUCCESS)
{
SWSS_LOG_ERROR("Failed setting sai object:%" PRIx64 " for port:%s, status:%d", map_id, port.m_alias.c_str(), status);
task_process_status handle_status = handleSaiSetStatus(SAI_API_PORT, status);
if (handle_status != task_success)
{
return parseHandleSaiStatusFailure(handle_status);
}
}
return true;
}

task_process_status QosOrch::ResolveMapAndApplyToPort(
Port &port,
sai_port_attr_t port_attr,
string field_name,
KeyOpFieldsValuesTuple &tuple,
string op)
{
SWSS_LOG_ENTER();

sai_object_id_t sai_object = SAI_NULL_OBJECT_ID;
string object_name;
bool result;
ref_resolve_status resolve_result = resolveFieldRefValue(m_qos_maps, field_name,
qos_to_ref_table_map.at(field_name), tuple, sai_object, object_name);
if (ref_resolve_status::success == resolve_result)
{
if (op == SET_COMMAND)
{
result = applyMapToPort(port, port_attr, sai_object);
}
else if (op == DEL_COMMAND)
{
// NOTE: The map is un-bound from the port. But the map itself still exists.
result = applyMapToPort(port, port_attr, SAI_NULL_OBJECT_ID);
}
else
{
SWSS_LOG_ERROR("Unknown operation type %s", op.c_str());
return task_process_status::task_invalid_entry;
}
if (!result)
{
SWSS_LOG_ERROR("Failed setting field:%s to port:%s, line:%d", field_name.c_str(), port.m_alias.c_str(), __LINE__);
return task_process_status::task_failed;
}
SWSS_LOG_DEBUG("Applied field:%s to port:%s, line:%d", field_name.c_str(), port.m_alias.c_str(), __LINE__);
return task_process_status::task_success;
}
else if (resolve_result != ref_resolve_status::field_not_found)
{
if(ref_resolve_status::not_resolved == resolve_result)
{
SWSS_LOG_INFO("Missing or invalid %s reference", field_name.c_str());
return task_process_status::task_need_retry;
}
SWSS_LOG_ERROR("Resolving %s reference failed", field_name.c_str());
return task_process_status::task_failed;
}
return task_process_status::task_success;
}

task_process_status QosOrch::handlePortQosMapTable(Consumer& consumer, KeyOpFieldsValuesTuple &tuple)
{
SWSS_LOG_ENTER();
Expand Down
3 changes: 0 additions & 3 deletions orchagent/qosorch.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,11 +180,8 @@ class QosOrch : public Orch

sai_object_id_t getSchedulerGroup(const Port &port, const sai_object_id_t queue_id);

bool applyMapToPort(Port &port, sai_attr_id_t attr_id, sai_object_id_t sai_dscp_to_tc_map);
bool applySchedulerToQueueSchedulerGroup(Port &port, size_t queue_ind, sai_object_id_t scheduler_profile_id);
bool applyWredProfileToQueue(Port &port, size_t queue_ind, sai_object_id_t sai_wred_profile);
task_process_status ResolveMapAndApplyToPort(Port &port,sai_port_attr_t port_attr,
string field_name, KeyOpFieldsValuesTuple &tuple, string op);

private:
qos_table_handler_map m_qos_handler_map;
Expand Down
1 change: 1 addition & 0 deletions tests/mock_tests/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ tests_SOURCES = aclorch_ut.cpp \
portsorch_ut.cpp \
routeorch_ut.cpp \
qosorch_ut.cpp \
bufferorch_ut.cpp \
saispy_ut.cpp \
consumer_ut.cpp \
ut_saihelper.cpp \
Expand Down
Loading

0 comments on commit d8fadc6

Please sign in to comment.