Skip to content

Commit

Permalink
Properly handle crashes/reboots during FabricTable commit (#20010)
Browse files Browse the repository at this point in the history
* Properly handle crashes/reboots during FabricTable commit

- Since #19819, commits are very small and safer. There is less surface
  to fail during commit. The previous large-scale fail-safe behavior
  stored too much data, for too long and could cause larger reverts
  even if nothing was committed yet. FabricTable data no longer is
  ever persisted without commit.
- The existing code deleted fabrics unwittingly when not required,
  such as when powering off a light during a fail-safe for an
  update when there was nothing committed yet, assuming we still
  committed immediately.

This change:
- Detects failed commits
- Only deletes data on failed commits
- Fixes Thread driver to detect stale data where a backup was done
  (since we cannot prevent internal commits from OpenThread)

Testing done:
- Added unit test to FabricTable to generate the condition
- Did manual testing of all-clusters-app/chip-tool Linux that
  aborted on second commissioning, during commit. Found that
  cleanup occurred as expected on restart
- Integration/cert testing (including Cirque that validates
  fail-safe) still pass

* Restyled by clang-format

* Restyled by gn

* Clear commit token after deferred cleanup

* Applied code review comments

* Restyled by clang-format

* Fix CI

* Fix CI

* Remove revert at restart of openthread after other changes

* Apply review comments

* Try to fix nRF CI

* Another attempt to fix nrfconnect CI

* Fix CI some more

* Fix CI again

Co-authored-by: Restyled.io <commits@restyled.io>
  • Loading branch information
2 people authored and pull[bot] committed Jun 22, 2023
1 parent 22850fc commit 1007016
Show file tree
Hide file tree
Showing 22 changed files with 515 additions and 264 deletions.
2 changes: 2 additions & 0 deletions src/app/BUILD.gn
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ static_library("app") {
"DeviceProxy.h",
"EventManagement.cpp",
"EventPathParams.h",
"FailSafeContext.cpp",
"FailSafeContext.h",
"GlobalAttributes.h",
"InteractionModelEngine.cpp",
"InteractionModelRevision.h",
Expand Down
113 changes: 10 additions & 103 deletions src/platform/FailSafeContext.cpp → src/app/FailSafeContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,15 @@
* Provides the implementation of the FailSafeContext object.
*/

#include <platform/FailSafeContext.h>

#include <lib/support/DefaultStorageKeyAllocator.h>
#include <lib/support/SafeInt.h>
#include <platform/ConfigurationManager.h>
#include <platform/internal/CHIPDeviceLayerInternal.h>

namespace chip {
namespace DeviceLayer {
#include "FailSafeContext.h"

using namespace chip::DeviceLayer;

namespace {
constexpr TLV::Tag kFabricIndexTag = TLV::ContextTag(0);
constexpr TLV::Tag kAddNocCommandTag = TLV::ContextTag(1);
constexpr TLV::Tag kUpdateNocCommandTag = TLV::ContextTag(2);
} // anonymous namespace
namespace chip {
namespace app {

void FailSafeContext::HandleArmFailSafeTimer(System::Layer * layer, void * aAppState)
{
Expand Down Expand Up @@ -89,7 +84,7 @@ void FailSafeContext::ScheduleFailSafeCleanup(FabricIndex fabricIndex, bool addN
PlatformMgr().ScheduleWork(HandleDisarmFailSafe, reinterpret_cast<intptr_t>(this));
}

CHIP_ERROR FailSafeContext::ArmFailSafe(FabricIndex accessingFabricIndex, System::Clock::Timeout expiryLength)
CHIP_ERROR FailSafeContext::ArmFailSafe(FabricIndex accessingFabricIndex, System::Clock::Seconds16 expiryLengthSeconds)
{
CHIP_ERROR err = CHIP_NO_ERROR;
bool cancelTimersIfError = false;
Expand All @@ -100,9 +95,8 @@ CHIP_ERROR FailSafeContext::ArmFailSafe(FabricIndex accessingFabricIndex, System
cancelTimersIfError = true;
}

SuccessOrExit(err = DeviceLayer::SystemLayer().StartTimer(expiryLength, HandleArmFailSafeTimer, this));
SuccessOrExit(err = CommitToStorage());
SuccessOrExit(err = ConfigurationMgr().SetFailSafeArmed(true));
SuccessOrExit(
err = DeviceLayer::SystemLayer().StartTimer(System::Clock::Seconds16(expiryLengthSeconds), HandleArmFailSafeTimer, this));

mFailSafeArmed = true;
mFabricIndex = accessingFabricIndex;
Expand All @@ -124,96 +118,9 @@ void FailSafeContext::DisarmFailSafe()

ResetState();

if (ConfigurationMgr().SetFailSafeArmed(false) != CHIP_NO_ERROR)
{
ChipLogError(FailSafe, "Failed to set FailSafeArmed config to false");
}

if (DeleteFromStorage() != CHIP_NO_ERROR)
{
ChipLogError(FailSafe, "Failed to delete FailSafeContext from configuration");
}

ChipLogProgress(FailSafe, "Fail-safe cleanly disarmed");
}

CHIP_ERROR FailSafeContext::SetAddNocCommandInvoked(FabricIndex nocFabricIndex)
{
mAddNocCommandHasBeenInvoked = true;
mFabricIndex = nocFabricIndex;

ReturnErrorOnFailure(CommitToStorage());

return CHIP_NO_ERROR;
}

CHIP_ERROR FailSafeContext::SetUpdateNocCommandInvoked()
{
mUpdateNocCommandHasBeenInvoked = true;

ReturnErrorOnFailure(CommitToStorage());

return CHIP_NO_ERROR;
}

CHIP_ERROR FailSafeContext::CommitToStorage()
{
DefaultStorageKeyAllocator keyAlloc;
uint8_t buf[FailSafeContextTLVMaxSize()];
TLV::TLVWriter writer;
writer.Init(buf);

TLV::TLVType outerType;
ReturnErrorOnFailure(writer.StartContainer(TLV::AnonymousTag(), TLV::kTLVType_Structure, outerType));
ReturnErrorOnFailure(writer.Put(kFabricIndexTag, mFabricIndex));

// TODO: Stop storing this, and just make the fail-safe context volatile and sweep-up stale data next boot on partial commits
ReturnErrorOnFailure(writer.Put(kAddNocCommandTag, mAddNocCommandHasBeenInvoked));
ReturnErrorOnFailure(writer.Put(kUpdateNocCommandTag, mUpdateNocCommandHasBeenInvoked));
ReturnErrorOnFailure(writer.EndContainer(outerType));

const auto failSafeContextTLVLength = writer.GetLengthWritten();
VerifyOrReturnError(CanCastTo<uint16_t>(failSafeContextTLVLength), CHIP_ERROR_BUFFER_TOO_SMALL);

return PersistedStorage::KeyValueStoreMgr().Put(keyAlloc.FailSafeContextKey(), buf,
static_cast<uint16_t>(failSafeContextTLVLength));
}

CHIP_ERROR FailSafeContext::LoadFromStorage(FabricIndex & fabricIndex, bool & addNocCommandInvoked, bool & updateNocCommandInvoked)
{
DefaultStorageKeyAllocator keyAlloc;
uint8_t buf[FailSafeContextTLVMaxSize()];
ReturnErrorOnFailure(PersistedStorage::KeyValueStoreMgr().Get(keyAlloc.FailSafeContextKey(), buf, sizeof(buf)));

TLV::ContiguousBufferTLVReader reader;
reader.Init(buf, sizeof(buf));
ReturnErrorOnFailure(reader.Next(TLV::kTLVType_Structure, TLV::AnonymousTag()));

TLV::TLVType containerType;
ReturnErrorOnFailure(reader.EnterContainer(containerType));

ReturnErrorOnFailure(reader.Next(kFabricIndexTag));
ReturnErrorOnFailure(reader.Get(fabricIndex));

ReturnErrorOnFailure(reader.Next(kAddNocCommandTag));
ReturnErrorOnFailure(reader.Get(addNocCommandInvoked));

ReturnErrorOnFailure(reader.Next(kUpdateNocCommandTag));
ReturnErrorOnFailure(reader.Get(updateNocCommandInvoked));

ReturnErrorOnFailure(reader.VerifyEndOfContainer());
ReturnErrorOnFailure(reader.ExitContainer(containerType));

return CHIP_NO_ERROR;
}

CHIP_ERROR FailSafeContext::DeleteFromStorage()
{
DefaultStorageKeyAllocator keyAlloc;

return PersistedStorage::KeyValueStoreMgr().Delete(keyAlloc.FailSafeContextKey());
}

void FailSafeContext::ForceFailSafeTimerExpiry()
{
if (!IsFailSafeArmed())
Expand All @@ -228,5 +135,5 @@ void FailSafeContext::ForceFailSafeTimerExpiry()
FailSafeTimerExpired();
}

} // namespace DeviceLayer
} // namespace app
} // namespace chip
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,13 @@

#pragma once

#include <lib/core/CHIPError.h>
#include <lib/core/DataModelTypes.h>
#include <platform/internal/CHIPDeviceLayerInternal.h>
#include <system/SystemClock.h>

namespace chip {
namespace DeviceLayer {
namespace app {

class FailSafeContext
{
Expand All @@ -39,14 +42,18 @@ class FailSafeContext
* when the fail-safe timer is currently armed, the currently-running fail-safe timer will
* first be cancelled, then the fail-safe timer will be re-armed.
*/
CHIP_ERROR ArmFailSafe(FabricIndex accessingFabricIndex, System::Clock::Timeout expiryLength);
CHIP_ERROR ArmFailSafe(FabricIndex accessingFabricIndex, System::Clock::Seconds16 expiryLengthSeconds);

/**
* @brief Cleanly disarm failsafe timer, such as on CommissioningComplete
*/
void DisarmFailSafe();
CHIP_ERROR SetAddNocCommandInvoked(FabricIndex nocFabricIndex);
CHIP_ERROR SetUpdateNocCommandInvoked();
void SetAddNocCommandInvoked(FabricIndex nocFabricIndex)
{
mAddNocCommandHasBeenInvoked = true;
mFabricIndex = nocFabricIndex;
}
void SetUpdateNocCommandInvoked() { mUpdateNocCommandHasBeenInvoked = true; }
void SetAddTrustedRootCertInvoked() { mAddTrustedRootCertHasBeenInvoked = true; }
void SetCsrRequestForUpdateNoc(bool isForUpdateNoc) { mIsCsrRequestForUpdateNoc = isForUpdateNoc; }

Expand Down Expand Up @@ -90,9 +97,6 @@ class FailSafeContext
// If the failsafe is not armed, this is a no-op.
void ForceFailSafeTimerExpiry();

static CHIP_ERROR LoadFromStorage(FabricIndex & fabricIndex, bool & addNocCommandInvoked, bool & updateNocCommandInvoked);
static CHIP_ERROR DeleteFromStorage();

private:
bool mFailSafeArmed = false;
bool mFailSafeBusy = false;
Expand All @@ -103,13 +107,6 @@ class FailSafeContext
bool mIsCsrRequestForUpdateNoc = false;
FabricIndex mFabricIndex = kUndefinedFabricIndex;

// TODO:: Track the state of what was mutated during fail-safe.

static constexpr size_t FailSafeContextTLVMaxSize()
{
return TLV::EstimateStructOverhead(sizeof(FabricIndex), sizeof(bool), sizeof(bool));
}

/**
* @brief
* The callback function to be called when "fail-safe timer" expires.
Expand Down Expand Up @@ -146,5 +143,5 @@ class FailSafeContext
CHIP_ERROR CommitToStorage();
};

} // namespace DeviceLayer
} // namespace app
} // namespace chip
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ bool emberAfAdministratorCommissioningClusterOpenCommissioningWindowCallback(

FabricIndex fabricIndex = commandObj->GetAccessingFabricIndex();
FabricInfo * fabricInfo = Server::GetInstance().GetFabricTable().FindFabricWithIndex(fabricIndex);
auto & failSafeContext = DeviceLayer::DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
auto & failSafeContext = Server::GetInstance().GetFailSafeContext();
auto & commissionMgr = Server::GetInstance().GetCommissioningWindowManager();

VerifyOrExit(fabricInfo != nullptr, status.Emplace(StatusCode::EMBER_ZCL_STATUS_CODE_PAKE_PARAMETER_ERROR));
Expand Down Expand Up @@ -169,7 +169,7 @@ bool emberAfAdministratorCommissioningClusterOpenBasicCommissioningWindowCallbac

FabricIndex fabricIndex = commandObj->GetAccessingFabricIndex();
FabricInfo * fabricInfo = Server::GetInstance().GetFabricTable().FindFabricWithIndex(fabricIndex);
auto & failSafeContext = DeviceLayer::DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
auto & failSafeContext = Server::GetInstance().GetFailSafeContext();
auto & commissionMgr = Server::GetInstance().GetCommissioningWindowManager();

VerifyOrExit(fabricInfo != nullptr, status.Emplace(StatusCode::EMBER_ZCL_STATUS_CODE_PAKE_PARAMETER_ERROR));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ bool emberAfGeneralCommissioningClusterArmFailSafeCallback(app::CommandHandler *
const Commands::ArmFailSafe::DecodableType & commandData)
{
MATTER_TRACE_EVENT_SCOPE("ArmFailSafe", "GeneralCommissioning");
FailSafeContext & failSafeContext = DeviceLayer::DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
auto & failSafeContext = Server::GetInstance().GetFailSafeContext();
Commands::ArmFailSafeResponse::Type response;

ChipLogProgress(FailSafe, "GeneralCommissioning: Received ArmFailSafe (%us)",
Expand Down Expand Up @@ -219,9 +219,9 @@ bool emberAfGeneralCommissioningClusterCommissioningCompleteCallback(
{
MATTER_TRACE_EVENT_SCOPE("CommissioningComplete", "GeneralCommissioning");

DeviceControlServer * server = &DeviceLayer::DeviceControlServer::DeviceControlSvr();
auto & failSafe = server->GetFailSafeContext();
auto & fabricTable = Server::GetInstance().GetFabricTable();
DeviceControlServer * devCtrl = &DeviceLayer::DeviceControlServer::DeviceControlSvr();
auto & failSafe = Server::GetInstance().GetFailSafeContext();
auto & fabricTable = Server::GetInstance().GetFabricTable();

ChipLogProgress(FailSafe, "GeneralCommissioning: Received CommissioningComplete");

Expand Down Expand Up @@ -267,7 +267,7 @@ bool emberAfGeneralCommissioningClusterCommissioningCompleteCallback(
*/
failSafe.DisarmFailSafe();
CheckSuccess(
server->PostCommissioningCompleteEvent(handle->AsSecureSession()->GetPeerNodeId(), handle->GetFabricIndex()),
devCtrl->PostCommissioningCompleteEvent(handle->AsSecureSession()->GetPeerNodeId(), handle->GetFabricIndex()),
Failure);

Breadcrumb::Set(commandPath.mEndpointId, 0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <app/CommandHandlerInterface.h>
#include <app/InteractionModelEngine.h>
#include <app/clusters/general-commissioning-server/general-commissioning-server.h>
#include <app/server/Server.h>
#include <app/util/attribute-storage.h>
#include <lib/support/SafeInt.h>
#include <lib/support/ThreadOperationalDataset.h>
Expand Down Expand Up @@ -300,7 +301,7 @@ void FillDebugTextAndNetworkIndex(Commands::NetworkConfigResponse::Type & respon

bool CheckFailSafeArmed(CommandHandlerInterface::HandlerContext & ctx)
{
DeviceLayer::FailSafeContext & failSafeContext = DeviceLayer::DeviceControlServer::DeviceControlSvr().GetFailSafeContext();
auto & failSafeContext = chip::Server::GetInstance().GetFailSafeContext();

if (failSafeContext.IsFailSafeArmed(ctx.mCommandHandler.GetAccessingFabricIndex()))
{
Expand Down
Loading

0 comments on commit 1007016

Please sign in to comment.