Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@shardeum-foundation/core",
"version": "2.16.0-prerelease.1",
"version": "2.16.0-prerelease.4",
"engines": {
"node": "20.19.3"
},
Expand Down
2 changes: 1 addition & 1 deletion src/config/server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,14 @@ const SERVER_CONFIG: StrictServerConfiguration = {
maxRotatedPerCycle: 1,
flexibleRotationDelta: 1,
flexibleRotationEnabled: false,
enableDangerousProblematicNodeRemoval: false,
enableProblematicNodeRemoval: false,
enableProblematicNodeRemovalOnCycle: 20000,
maxProblematicNodeRemovalsPerCycle: 1,
problematicNodeConsecutiveRefuteThreshold: 6,
problematicNodeRefutePercentageThreshold: 0.1,
problematicNodeHistoryLength: 60,
problematicNodeRemovalCycleFrequency: 5,
problematicNodeRemovalSafetyDelta: 1, // How far below minNodes before we stop removing problematic nodes
// New flags for problematic node cache v2
useProblematicNodeCacheV2: false, // When true, use the new cache-based implementation
enableProblematicNodeCacheBuilding: false, // Enable shadow mode cache building for validation
Expand Down
12 changes: 6 additions & 6 deletions src/debug/debug.ts
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ class Debug {
id: nodeId.substring(0, 8),
fullId: nodeId,
refuteCycles: node.refuteCycles || [],
consecutiveRefutes: ProblemNodeHandler.getConsecutiveRefutes(
consecutiveRefutes: ProblemNodeHandler.getMaxConsecutiveRefutes(
node.refuteCycles || [],
currentCycleRecord.counter
),
Expand Down Expand Up @@ -231,11 +231,11 @@ class Debug {
this.network.registerExternalGet('debug_simulateProblematic', isDebugModeMiddleware, (req, res) => {
try {
// Parse parameters with defaults
const missConsensus = req.query.missConsensus ? parseFloat(req.query.missConsensus as string) : 0
const networkDelay = req.query.networkDelay ? parseInt(req.query.networkDelay as string) : 0
const dropMessages = req.query.dropMessages ? parseFloat(req.query.dropMessages as string) : 0
const slowResponse = req.query.slowResponse ? parseFloat(req.query.slowResponse as string) : 0
const slowDelayMs = req.query.slowDelayMs ? parseInt(req.query.slowDelayMs as string) : 0
const missConsensus = req.query.missConsensus ? parseFloat(req.query.missConsensus as string) : 1
const networkDelay = req.query.networkDelay ? parseInt(req.query.networkDelay as string) : 100
const dropMessages = req.query.dropMessages ? parseFloat(req.query.dropMessages as string) : 1
const slowResponse = req.query.slowResponse ? parseFloat(req.query.slowResponse as string) : 1
const slowDelayMs = req.query.slowDelayMs ? parseInt(req.query.slowDelayMs as string) : 10000
const reset = req.query.reset === 'true'

// Validate parameters
Expand Down
37 changes: 30 additions & 7 deletions src/p2p/ModeSystemFuncs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -637,17 +637,31 @@ export function getExpiredRemovedV3(
// filter out apoptosized nodes from the problematic nodes
const problematicNodes = problematicWithApoptosizedNodes.filter((id) => !apoptosizedNodesList.includes(id))
const canRemoveProblematicNodesThisCycle = prevRecord.counter % config.p2p.problematicNodeRemovalCycleFrequency === 0
const numProblematicRemovals = Math.min(
problematicNodes.length,
canRemoveProblematicNodesThisCycle ? config.p2p.maxProblematicNodeRemovalsPerCycle || 1 : 0
)

// we can remove `remove` nodes, but we *must* remove the number of apoptosized nodes,
// the remainder is the number of expired nodes we can remove this cycle
// if there are more nodes apopped than we can remove, we can't remove any expired nodes
const numPotentiallyExpiredRemovals = Math.max(remove - numApoptosizedRemovals, 0)
const numExpiredRemovals = Math.min(numPotentiallyExpiredRemovals, numExpiredNodes)

// Calculate the safety threshold for problematic node removal
const safetyThreshold = config.p2p.minNodes - config.p2p.problematicNodeRemovalSafetyDelta
const activeAfterOtherRemovals = active - numApoptosizedRemovals - numExpiredRemovals + add

// Check if we're below the safety threshold or would go below it
let numProblematicRemovals = 0
let problematicRemovalPrevented = false
if (activeAfterOtherRemovals > safetyThreshold && canRemoveProblematicNodesThisCycle) {
// Calculate how many problematic nodes we can safely remove without going below threshold
const maxSafeProblematicRemovals = Math.max(0, activeAfterOtherRemovals - safetyThreshold)
const maxConfiguredRemovals = config.p2p.maxProblematicNodeRemovalsPerCycle

numProblematicRemovals = Math.min(problematicNodes.length, maxConfiguredRemovals, maxSafeProblematicRemovals)
} else if (canRemoveProblematicNodesThisCycle && problematicNodes.length > 0) {
// We would have removed problematic nodes but safety threshold prevents it
problematicRemovalPrevented = true
}

const cycle = CycleChain.newest.counter

if (cycle > lastLoggedCycle && remove > 0) {
Expand Down Expand Up @@ -675,14 +689,23 @@ export function getExpiredRemovedV3(
canRemoveProblematicNodesThisCycle: canRemoveProblematicNodesThisCycle,
numPotentiallyExpiredRemovals: numPotentiallyExpiredRemovals,
numExpiredRemovals: numExpiredRemovals,
activeAfterOtherRemovals: activeAfterOtherRemovals,
safetyThreshold: safetyThreshold,
problematicRemovalPrevented: problematicRemovalPrevented,
}
nestedCountersInstance.countEvent('p2p', `results of getExpiredRemovedV3: ${JSON.stringify(counters)}`)

if (problematicRemovalPrevented) {
nestedCountersInstance.countEvent(
'p2p',
`problematic node removal prevented: active after removals (${activeAfterOtherRemovals}) <= safety threshold (${safetyThreshold})`
)
}
}

const dangerousRemoval = remove === 0 && config.p2p.enableDangerousProblematicNodeRemoval === false
const nodesNeverExpire = config.p2p.nodeExpiryAge < 0
// if its recommended that we dont remove any nodes, Or the nodes never expire, we MUST adhere to this.
if (dangerousRemoval || nodesNeverExpire) {
// if nodes never expire, we MUST adhere to this.
if (nodesNeverExpire) {
return {
problematic: 0,
expired: 0,
Expand Down
52 changes: 49 additions & 3 deletions src/p2p/ProblemNodeHandler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@
}

try {
return problematicNodeCache.toCompressedJSON()
return problematicNodeCache.toJSON()
} catch (err) {
error('Failed to export ProblematicNodeCache:', err)
return null
Expand All @@ -95,6 +95,52 @@
const activeNodeIds = new Set(NodeList.activeByIdOrder.map((node) => node.id))
return problematicNodeCache.getProblematicNodes(prevRecord.counter, activeNodeIds)
}
// Return empty array when feature is disabled
return []
}

// Get problematic node info for reporting to monitor - info about self node only
export function getProblematicNodeInfoForSelf(nodeId: string): any | null {
if (!problematicNodeCache || !config.p2p.enableProblematicNodeCacheBuilding) {
return null
}

try {
const currentCycle = CycleChain.newest?.counter || 0

const metrics = problematicNodeCache.calculateNodeMetrics(nodeId, currentCycle)
const refuteHistory = problematicNodeCache.refuteHistory.get(nodeId) || []

// Get the cache's cycle range to determine what cycles we have data for
const cacheInfo = problematicNodeCache.getCycleCoverage()
let startCycle = cacheInfo.cycleRange ? cacheInfo.cycleRange.min : currentCycle

Check failure on line 116 in src/p2p/ProblemNodeHandler.ts

View workflow job for this annotation

GitHub Actions / ci / QA merge checks

'startCycle' is never reassigned. Use 'const' instead
let endCycle = cacheInfo.cycleRange ? cacheInfo.cycleRange.max : currentCycle

Check failure on line 117 in src/p2p/ProblemNodeHandler.ts

View workflow job for this annotation

GitHub Actions / ci / QA merge checks

'endCycle' is never reassigned. Use 'const' instead

// Build cycle history for all cycles in the cache
const cycleRefuteHistory: boolean[] = []
for (let cycle = startCycle; cycle <= endCycle; cycle++) {
cycleRefuteHistory.push(refuteHistory.includes(cycle))
}

// Calculate total refutes (all refutes in the cache)
const totalRefutes = refuteHistory.length

const problematicNodeInfo = {
isProblematic:
metrics.consecutiveRefutes >= config.p2p.problematicNodeConsecutiveRefuteThreshold ||
metrics.refutePercentage >= config.p2p.problematicNodeRefutePercentageThreshold,
totalRefutes: totalRefutes,
maxConsecutiveRefutes: metrics.consecutiveRefutes,
refutePercentage: metrics.refutePercentage,
cycleRefuteHistory: cycleRefuteHistory,
newestCycle: endCycle, // The newest cycle in cache
}

return problematicNodeInfo
} catch (err) {
error('Failed to get problematic node info for self:', err)
return null
}
}

export function getRefutePercentage(refuteCycles: number[], currentCycle: number): number {
Expand All @@ -104,24 +150,24 @@
return 0
}

export function getConsecutiveRefutes(refuteCycles: number[], currentCycle: number): number {
export function getMaxConsecutiveRefutes(refuteCycles: number[], currentCycle: number): number {
if (config.p2p.useProblematicNodeCacheV2 && problematicNodeCache) {
return problematicNodeCache.getConsecutiveRefutes(refuteCycles, currentCycle)
return problematicNodeCache.getMaxConsecutiveRefutes(refuteCycles, currentCycle)
}
return 0
}

function info(...msg: unknown[]) {

Check failure on line 160 in src/p2p/ProblemNodeHandler.ts

View workflow job for this annotation

GitHub Actions / ci / QA merge checks

Missing return type on function
const entry = `ProblemNodeHandler: ${msg.join(' ')}`
p2pLogger.info(entry)
}

function warn(...msg: unknown[]) {

Check failure on line 165 in src/p2p/ProblemNodeHandler.ts

View workflow job for this annotation

GitHub Actions / ci / QA merge checks

Missing return type on function
const entry = `ProblemNodeHandler: ${msg.join(' ')}`
p2pLogger.warn(entry)
}

function error(...msg: unknown[]) {

Check failure on line 170 in src/p2p/ProblemNodeHandler.ts

View workflow job for this annotation

GitHub Actions / ci / QA merge checks

Missing return type on function
const entry = `ProblemNodeHandler: ${msg.join(' ')}`
p2pLogger.error(entry)
}
44 changes: 21 additions & 23 deletions src/p2p/ProblematicNodeCache.ts
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@
}
}

addCycle(cycle: P2P.CycleCreatorTypes.CycleRecord, autoPrune: boolean = false): void {

Check failure on line 80 in src/p2p/ProblematicNodeCache.ts

View workflow job for this annotation

GitHub Actions / ci / QA merge checks

Type boolean trivially inferred from a boolean literal, remove type annotation
// Validate cycle number
if (cycle.counter <= this.lastProcessedCycle) {
throw new Error(`Cannot add cycle ${cycle.counter}, last processed cycle is ${this.lastProcessedCycle}`)
Expand Down Expand Up @@ -196,7 +196,7 @@
const refuteCycles = this.refuteHistory.get(nodeId) || []

// Calculate consecutive refutes
const consecutiveRefutes = this.getConsecutiveRefutes(refuteCycles, currentCycle)
const consecutiveRefutes = this.getMaxConsecutiveRefutes(refuteCycles, currentCycle)

// Calculate refute percentage
const refutePercentage = this.getRefutePercentage(refuteCycles, currentCycle)
Expand All @@ -213,40 +213,38 @@
return metrics
}

getConsecutiveRefutes(refuteCycles: number[], currentCycle: number): number {
getMaxConsecutiveRefutes(refuteCycles: number[], currentCycle: number): number {
if (refuteCycles.length === 0) return 0

// Filter to only include refutes up to current cycle
const relevantRefutes = refuteCycles.filter((cycle) => cycle <= currentCycle)
if (relevantRefutes.length === 0) return 0

// Count consecutive refutes ending at current cycle or one before
let count = 0
// Find the maximum consecutive refutes in the retained cycle history
const sortedRefutes = [...relevantRefutes].sort((a, b) => a - b)

// Start from the end and count backwards
for (let i = sortedRefutes.length - 1; i >= 0; i--) {
const cycle = sortedRefutes[i]
let maxConsecutive = 1 // At least 1 if we have any refutes
let currentConsecutive = 1

if (i === sortedRefutes.length - 1) {
// Last refute must be at current cycle or one before
if (cycle === currentCycle || cycle === currentCycle - 1) {
count = 1
} else {
break
}
// Iterate through sorted refutes to find all consecutive sequences
for (let i = 1; i < sortedRefutes.length; i++) {
const currentCycle = sortedRefutes[i]
const prevCycle = sortedRefutes[i - 1]

if (currentCycle === prevCycle + 1) {
// Consecutive with previous cycle
currentConsecutive++
} else {
// Check if consecutive with next cycle
const nextCycle = sortedRefutes[i + 1]
if (cycle === nextCycle - 1) {
count++
} else {
break
}
// Gap found, update max and reset current count
maxConsecutive = Math.max(maxConsecutive, currentConsecutive)
currentConsecutive = 1
}
}

return count
// Update max with final sequence
maxConsecutive = Math.max(maxConsecutive, currentConsecutive)

return maxConsecutive
}

getRefutePercentage(refuteCycles: number[], currentCycle: number): number {
Expand Down Expand Up @@ -274,7 +272,7 @@
}

// Sort by score and return top N based on maxProblematicNodeRemovalsPerCycle
const maxRemovals = this.config.p2p.maxProblematicNodeRemovalsPerCycle || 1
const maxRemovals = this.config.p2p.maxProblematicNodeRemovalsPerCycle

return problematicNodes
.sort((a, b) => b.score - a.score)
Expand Down
5 changes: 5 additions & 0 deletions src/reporter/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import { finishedSyncingCycle } from '../p2p/Join'
import { currentCycle } from '../p2p/CycleCreator'
import process from 'process'
import { fireAndForget } from '../utils/functions/promises'
import * as ProblemNodeHandler from '../p2p/ProblemNodeHandler'

const http = require('../http')
const allZeroes64 = '0'.repeat(64)
Expand Down Expand Up @@ -350,6 +351,9 @@ class Reporter {
const archiverListHash = Archivers.getArchiverListHash()
const lastInSyncResult = this.stateManager.accountPatcher.lastInSyncResult

// Get problematic node info for this validator
const problematicNodeInfo = ProblemNodeHandler.getProblematicNodeInfoForSelf(Self.id)

try {
await this._sendReport({
repairsStarted,
Expand Down Expand Up @@ -397,6 +401,7 @@ class Reporter {
cycleFinishedSyncing,
stillNeedsInitialPatchPostActive,
memory: process.memoryUsage(),
problematicNodeInfo,
})
if (this.stateManager != null && config.mode === 'debug' && !config.debug.disableTxCoverageReport) {
this.stateManager.transactionQueue.resetTxCoverageMap()
Expand Down
4 changes: 2 additions & 2 deletions src/shardus/shardus-types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -774,13 +774,13 @@ export interface ServerConfiguration {
/** Problematic Node configurations */
/** enable problematic node removal */
enableProblematicNodeRemoval?: boolean
/** when true, we will remove problematic nodes even when calculateToAcceptV2 says we should not remove any nodes. This is useful in development when testing this feature. */
enableDangerousProblematicNodeRemoval?: boolean
/** enable problematic node removal on a specific cycle. This is to allow the network to stabilize before removing problematic nodes.
* enableProblematicNodeRemoval must be true for this to take effect*/
enableProblematicNodeRemovalOnCycle?: number
/** The problematicNodeRemovalCycleFrequency parameter is an Integer specifying the number of cycles between problematic node removals. */
problematicNodeRemovalCycleFrequency?: number
/** The problematicNodeRemovalSafetyDelta parameter is an Integer specifying how far below minNodes the active count can be before we stop removing problematic nodes. For example, if minNodes is 100 and this value is 10, we won't remove problematic nodes if active count < 90. */
problematicNodeRemovalSafetyDelta?: number
/** The maxProblematicNodeRemovalsPerCycle parameter is an Integer specifying the maximum number of problematic nodes that can be removed from the network each cycle. */
maxProblematicNodeRemovalsPerCycle?: number
/** The problematicNodeConsecutiveRefuteThreshold parameter is an Integer specifying the number of consecutive refutes a node must have before it is considered problematic. */
Expand Down
2 changes: 1 addition & 1 deletion test/unit/debug/index.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ jest.mock('../../../src/p2p/CycleChain', () => ({
jest.mock('../../../src/p2p/ProblemNodeHandler', () => ({
getProblematicNodes: jest.fn().mockReturnValue([]),
getRefutePercentage: jest.fn().mockReturnValue(0),
getConsecutiveRefutes: jest.fn().mockReturnValue(0),
getMaxConsecutiveRefutes: jest.fn().mockReturnValue(0),
isNodeProblematic: jest.fn().mockReturnValue(false),
exportProblematicNodeCache: jest.fn().mockReturnValue(null),
}))
Expand Down
2 changes: 1 addition & 1 deletion test/unit/src/debug/debug.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ jest.mock('../../../../src/p2p/Context', () => ({
}))
jest.mock('../../../../src/p2p/ProblemNodeHandler', () => ({
getRefutePercentage: jest.fn(),
getConsecutiveRefutes: jest.fn(),
getMaxConsecutiveRefutes: jest.fn(),
isNodeProblematic: jest.fn(),
}))
jest.mock('../../../../src/utils/nestedCounters')
Expand Down
Loading
Loading