From 9a60325b0feb6c2f44f356222e28b2a2cf39ec7c Mon Sep 17 00:00:00 2001 From: Mickael Bourgois Date: Mon, 19 May 2025 11:47:28 +0200 Subject: [PATCH 1/5] ARSN-495: Retry internal thales session timeout --- lib/network/kmip/Client.ts | 3 ++ lib/network/kmip/error-comparison.jsonc | 6 ++- lib/network/kmip/index.ts | 48 +++++++++++++++++++ .../kmip/transport/TransportTemplate.ts | 14 +++++- 4 files changed, 68 insertions(+), 3 deletions(-) diff --git a/lib/network/kmip/Client.ts b/lib/network/kmip/Client.ts index bebfd0555..31186a50e 100644 --- a/lib/network/kmip/Client.ts +++ b/lib/network/kmip/Client.ts @@ -231,6 +231,9 @@ function _queryOperationsAndObjects(client: any, logger: werelogs.Logger, cb: an { vendorIdentification: client.vendorIdentification, kmip: kmipLog }); } + // flag handshake as done, so if the connection is closed and reconnected + // another handshake won't be done as we already have the server's info + client.kmip.handshakeDone = true; return cb(); }); } diff --git a/lib/network/kmip/error-comparison.jsonc b/lib/network/kmip/error-comparison.jsonc index d9ffd1257..0b61250d5 100644 --- a/lib/network/kmip/error-comparison.jsonc +++ b/lib/network/kmip/error-comparison.jsonc @@ -33,8 +33,12 @@ "resultReason": "General Failure", "resultMessage": [ "[NCERRInsufficientPermissions]: ", + // 24h idle token revoke is undocumented, but thales described us this behavior + // we asked them a tweaked lab to test with a 10m session. // TLS session idle timeout (24h) - needs a new connection or no idle - "[NCERRUnauthorizedAccess]: Token has been revoked" + "[NCERRUnauthorizedAccess]: Token has been revoked", + // write after receiving above token revoked error + "[NCERRUnauthorizedAccess]: Invalid token" ] } } diff --git a/lib/network/kmip/index.ts b/lib/network/kmip/index.ts index d51ffb050..80b4ef3b9 100644 --- a/lib/network/kmip/index.ts +++ b/lib/network/kmip/index.ts @@ -28,6 +28,26 @@ function _PrimitiveType(tagName: string, type: string, value: any) { return { [tagName]: { type, value } }; } +/** + * Thales has a 24h idle tls session error. We can close and reopen new connection. + * This is not part of KMIP spec and it is not documented. + * We asked for a tweaked lab with a 10m timeout to test this behavior. + */ +const thalesInternalTokenError = { + status: 'Operation Failed', + reason: 'General Failure', + messages: [ + '[NCERRUnauthorizedAccess]: Token has been revoked', + '[NCERRUnauthorizedAccess]: Invalid token' + ], +}; + +function isThalesInternalTokenError(status, reason, message) { + return status === thalesInternalTokenError.status + && reason === thalesInternalTokenError.reason + && thalesInternalTokenError.messages.includes(message); +} + export type Options = { codec: any; transport: any; }; export default class KMIP { protocolVersion: { @@ -38,6 +58,7 @@ export default class KMIP { options: Options; codec: any; transport: any; + handshakeDone: boolean; /** * Construct a new KMIP Object @@ -54,6 +75,7 @@ export default class KMIP { this.options = options.kmip; this.codec = new Codec(options.kmip.codec); this.transport = new Transport(options.kmip.transport); + this.handshakeDone = false; } /* Static class methods */ @@ -307,6 +329,18 @@ export default class KMIP { KMIP.Structure('Request Payload', payload), ])])]); const encodedMessage = this._encodeMessage(message); + this._sendEncodedMessage(logger, operation, encodedMessage, uuid, false, cb, resource); + } + + _sendEncodedMessage( + logger: werelogs.Logger, + operation: string, + encodedMessage: Message, + uuid: Buffer, + isRetry: boolean, + cb: (error: Error | null, response?: any) => void, + resource?: string + ) { const startDate = Date.now(); this.transport.send( logger, encodedMessage, @@ -371,6 +405,20 @@ export default class KMIP { got: { resultStatus, resultReason, resultMessage }, expected: undefined, }); + + if (!isRetry && isThalesInternalTokenError(resultStatus, resultReason, resultMessage)) { + logger.info('KMIP::reconnecting', { errorList, kmipLog, handshakeDone: this.handshakeDone }); + // Close TLS channel socket + this.transport.abortPipeline(conversation, () => { + // Once completely closed and all callbacks are drained, + // reopen the TLS channel connection and skip KMIP handshake if it was already done + // as we already have kmip details + this.transport._createConversation(logger, () => {}, this.handshakeDone); + this._sendEncodedMessage(logger, operation, encodedMessage, uuid, true, cb, resource); + }); + return undefined; + } + // Use AccessDenied as default to avoid retryable error // Error message does not match AWS, generic message for KMIP provide every details const kmsErr = (errorMapping[resultStatus]?.[resultReason] ?? errorInstances.AccessDenied) diff --git a/lib/network/kmip/transport/TransportTemplate.ts b/lib/network/kmip/transport/TransportTemplate.ts index 3c1fce244..2d42680cd 100644 --- a/lib/network/kmip/transport/TransportTemplate.ts +++ b/lib/network/kmip/transport/TransportTemplate.ts @@ -43,6 +43,7 @@ export default class TransportTemplate { cb: KmipCallback; }>; pipelineDrainedCallback: any | null; + onEndCallback: any | null; handshakeFunction: any | null; socket: any; @@ -63,6 +64,7 @@ export default class TransportTemplate { this.callbackPipeline = []; this.deferedRequests = []; this.pipelineDrainedCallback = null; + this.onEndCallback = null; this.handshakeFunction = null; this.socket = null; } @@ -114,13 +116,14 @@ export default class TransportTemplate { _createConversation( logger: werelogs.Logger, readyCallback: (error: Error | null) => void, + skipHandshake: boolean = false, ) { try { const socket = this.channel.connect( this.options.tls.port || DEFAULT_KMIP_PORT, this.options.tls, () => { - if (this.handshakeFunction) { + if (!skipHandshake && this.handshakeFunction) { this.handshakeFunction(logger, readyCallback); } else { readyCallback(null); @@ -158,6 +161,11 @@ export default class TransportTemplate { const error = Error('Conversation interrupted'); this.socket = null; this._drainQueuesWithError(error); + const onEnd = this.onEndCallback; + if (onEnd) { + this.onEndCallback = null; + onEnd(); + } }); socket.on('error', err => { this._drainQueuesWithError(err); @@ -230,8 +238,10 @@ export default class TransportTemplate { * Abruptly interrupt the conversation and cancel the outstanding and * defered requests * @param conversation - the conversation to abort + * @param onEndCallback - callback called once the socket is ended and pipelines are drained */ - abortPipeline(conversation: any) { + abortPipeline(conversation: any, onEndCallback: any) { + this.onEndCallback = onEndCallback; conversation.end(); } } From 9e901f21ffb07861c582be7b10d380c2d22f5b88 Mon Sep 17 00:00:00 2001 From: Mickael Bourgois Date: Mon, 19 May 2025 16:14:27 +0200 Subject: [PATCH 2/5] ARSN-495: Replace logger.error with logger.warn To avoid logging all debug and trace on cluster retry if one host is unhealthy --- lib/network/kmip/Client.ts | 31 ++++++++++++++++++++----------- lib/network/kmip/index.ts | 9 ++++++--- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/lib/network/kmip/Client.ts b/lib/network/kmip/Client.ts index 31186a50e..d73a22929 100644 --- a/lib/network/kmip/Client.ts +++ b/lib/network/kmip/Client.ts @@ -339,7 +339,8 @@ export default class Client implements KMSInterface { kmipMsg(operation, keyIdentifier, 'Server did not return the expected identifier') ); - logger.error(`KMIP::${operation}`, + // warn level to avoid dumping debug and trace logs on retryable errors + logger.warn(`KMIP::${operation}`, { error, uniqueIdentifier, keyIdentifier }); return error; } @@ -360,7 +361,8 @@ export default class Client implements KMSInterface { KMIP.TextString('Unique Identifier', keyIdentifier), ], (error, response) => { if (error) { - logger.error('KMIP::_activateBucketKey', + // warn level to avoid dumping debug and trace logs on retryable errors + logger.warn('KMIP::_activateBucketKey', { error, serverInformation: this.serverInformation }); return cb(error); @@ -404,7 +406,8 @@ export default class Client implements KMSInterface { KMIP.Structure('Template-Attribute', attributes), ], (error, response) => { if (error) { - logger.error('KMIP::createBucketKey', + // warn level to avoid dumping debug and trace logs on retryable errors + logger.warn('KMIP::createBucketKey', { error, serverInformation: this.serverInformation }); return cb(error); @@ -419,7 +422,8 @@ export default class Client implements KMSInterface { kmipMsg('Create', bucketName, 'Server created an object of wrong type') ); - logger.error('KMIP::createBucketKey', + // warn level to avoid dumping debug and trace logs on retryable errors + logger.warn('KMIP::createBucketKey', { error, createdObjectType }); return cb(error); } @@ -451,7 +455,8 @@ export default class Client implements KMSInterface { ]), ], (error, response) => { if (error) { - logger.error('KMIP::_revokeBucketKey', + // warn level to avoid dumping debug and trace logs on retryable errors + logger.warn('KMIP::_revokeBucketKey', { error, serverInformation: this.serverInformation }); return cb(error); @@ -472,7 +477,8 @@ export default class Client implements KMSInterface { return this._revokeBucketKey(bucketKeyId, logger, err => { if (err) { const error = arsenalErrorKMIP(err); - logger.error('KMIP::destroyBucketKey: revocation failed', + // warn level to avoid dumping debug and trace logs on retryable errors + logger.warn('KMIP::destroyBucketKey: revocation failed', { error, serverInformation: this.serverInformation }); return cb(error); @@ -482,7 +488,8 @@ export default class Client implements KMSInterface { ], (err, response) => { if (err) { const error = arsenalErrorKMIP(err); - logger.error('KMIP::destroyBucketKey', + // warn level to avoid dumping debug and trace logs on retryable errors + logger.warn('KMIP::destroyBucketKey', { error, serverInformation: this.serverInformation }); return cb(error); @@ -524,7 +531,8 @@ export default class Client implements KMSInterface { KMIP.ByteString('IV/Counter/Nonce', CRYPTOGRAPHIC_DEFAULT_IV), ], (error, response) => { if (error) { - logger.error('KMIP::cipherDataKey', + // warn level to avoid dumping debug and trace logs on retryable errors + logger.warn('KMIP::cipherDataKey', { error, serverInformation: this.serverInformation }); return cb(error); @@ -566,7 +574,8 @@ export default class Client implements KMSInterface { KMIP.ByteString('IV/Counter/Nonce', CRYPTOGRAPHIC_DEFAULT_IV), ], (error, response) => { if (error) { - logger.error('KMIP::decipherDataKey', + // warn level to avoid dumping debug and trace logs on retryable errors + logger.warn('KMIP::decipherDataKey', { error, serverInformation: this.serverInformation }); return cb(error); @@ -582,7 +591,7 @@ export default class Client implements KMSInterface { // the bucket does not have to exist, just passing a common bucket name here this.createBucketKey('kmip-healthcheck-test-bucket', logger, (err, bucketKeyId) => { if (err) { - logger.error('KMIP::healthcheck: failure to create a test bucket key', { + logger.warn('KMIP::healthcheck: failure to create a test bucket key', { error: err, kmip: kmipLog, }); return cb(err); @@ -591,7 +600,7 @@ export default class Client implements KMSInterface { { kmip: kmipLog }); this.destroyBucketKey(bucketKeyId, logger, err => { if (err) { - logger.error('KMIP::healthcheck: failure to remove the test bucket key', { + logger.warn('KMIP::healthcheck: failure to remove the test bucket key', { bucketKeyId, error: err, kmip: kmipLog, diff --git a/lib/network/kmip/index.ts b/lib/network/kmip/index.ts index 80b4ef3b9..d64e16af4 100644 --- a/lib/network/kmip/index.ts +++ b/lib/network/kmip/index.ts @@ -360,7 +360,8 @@ export default class KMIP { // Retryable most likely network related const error = errorInstances.InternalError .customizeDescription(kmipMsg(operation, resource, err.toString())); - logger.error('KMIP::request: Failed to send message', + // warn level to avoid dumping debug and trace logs on retryable errors + logger.warn('KMIP::request: Failed to send message', { error: err, kmip: kmipLog }); return cb(error); } @@ -425,11 +426,13 @@ export default class KMIP { .customizeDescription( kmipMsg(operation, resource, `${resultReason}: ${resultMessage}`) ); - logger.error('KMIP::request error', { errorList, kmip: kmipLog, error: kmsErr }); + // warn level to avoid dumping debug and trace logs on retryable errors + logger.warn('KMIP::request error', { errorList, kmip: kmipLog, error: kmsErr }); return cb(kmsErr); } if (errorList.length) { - logger.error('KMIP::request error', { errorList, kmip: kmipLog }); + // warn level to avoid dumping debug and trace logs on retryable errors + logger.warn('KMIP::request error', { errorList, kmip: kmipLog }); // Retryable as connection is closed and all messages errored return cb(errorInstances.InternalError.customizeDescription( kmipMsg(operation, resource, `Internal ${errorList.map(e => e.msg)}`) From 576f0e9f994fc9b7595482eebb5e1204a34fc937 Mon Sep 17 00:00:00 2001 From: Mickael Bourgois Date: Mon, 19 May 2025 16:17:18 +0200 Subject: [PATCH 3/5] ARSN-495: Implement retry and unhealthy hosts --- lib/network/kmip/ClusterClient.ts | 195 +++++++++++++++++++++++++++++- 1 file changed, 189 insertions(+), 6 deletions(-) diff --git a/lib/network/kmip/ClusterClient.ts b/lib/network/kmip/ClusterClient.ts index 64d81e42b..e4bcbb64a 100644 --- a/lib/network/kmip/ClusterClient.ts +++ b/lib/network/kmip/ClusterClient.ts @@ -5,13 +5,44 @@ import TTLVCodec from './codec/ttlv'; import TlsTransport from './transport/tls'; import KMIPClient, { KmipClientOptions } from './Client'; import { KmsBackend, KMSInterface, KmsType } from '../KMSInterface'; -import type { Logger } from 'werelogs'; +import { Logger } from 'werelogs'; import async from 'async'; +import { ArsenalError, errorInstances } from '../../errors'; +import { kmipMsg } from './errorMapping'; + +interface UnhealthyClient { + client: KMIPClient, + healthcheckTimeout: ReturnType, +}; + +/** Array without last item */ +type ArrayPopped = Type extends [...infer R, unknown] ? R : never; +/** Array's last item */ +type ArrayLast = Type extends [...unknown[], infer R] ? R : never + +interface actions { + createBucketKey: Parameters; + destroyBucketKey: Parameters; + cipherDataKey: Parameters; + decipherDataKey: Parameters; + healthcheck: Parameters; +}; + +interface ClusterClientOptions { + logger: Logger; + retries?: number; +}; + +const UNHEALTHY_DURATION = 30_000; // 30s export default class ClusterClient implements KMSInterface { + /** Healthy clients */ private readonly clients: KMIPClient[]; + private readonly unhealthyClients: UnhealthyClient[] = []; private roundRobinIndex = 0; public readonly backend: KmsBackend; + private readonly logger: Logger; + private readonly retries: number; /** * Construct a high level cluster of KMIP drivers suitable for cloudserver @@ -36,7 +67,7 @@ export default class ClusterClient implements KMSInterface { * defaults to TlsTransport */ constructor( - options: KmipClientOptions, + options: KmipClientOptions & ClusterClientOptions, CodecClass: any, TransportClass: any, ) { @@ -47,6 +78,9 @@ export default class ClusterClient implements KMSInterface { TransportClass || TlsTransport, )); this.backend = this.clients[0].backend; + this.logger = options.logger; + // if retries is not configured, we retry as much host there are in the cluster + this.retries = typeof options.retries === 'number' ? options.retries : this.clients.length - 1; } next() { @@ -56,25 +90,174 @@ export default class ClusterClient implements KMSInterface { return this.clients[this.roundRobinIndex++]; } + checkUnhealthyClient(unhealthyClient: UnhealthyClient) { + const client = unhealthyClient.client; + const healthyIndex = this.clients.indexOf(client); + if (healthyIndex === -1) { + this.clients.push(client); + } else { + // should not happen (S3C-9956) + this.logger.warn('checkUnhealthyClient: already moved in healthy', { unhealthyHost: client.host }); + } + + const unhealthyIndex = this.unhealthyClients.indexOf(unhealthyClient); + if (unhealthyIndex === -1) { + // should not happen (S3C-9956) + this.logger.warn('checkUnhealthyClient: already moved out of unhealthy', { unhealthyHost: client.host }); + } else { + this.unhealthyClients.splice(unhealthyIndex, 1); + } + this.logger.info('kmip host healthy again', { + unhealthyHost: client.host, + unhealthy: this.unhealthyClients.length, + healthy: this.clients.length, + }); + } + + markUnhealthyClient(clientUsed: KMIPClient, logger: Logger, err: Error) { + logger.info('mark kmip host unhealthy', { + err, + msg: err?.toString?.(), + unhealthyHost: clientUsed.host, + unhealthy: this.unhealthyClients.length, + healthy: this.clients.length, + }); + const index = this.clients.indexOf(clientUsed); + if (index === -1) { + // should not happen (S3C-9956) + logger.warn('already marked unhealthy'); + return; + } + const spliced = this.clients.splice(index, 1); + const unhealthy = { + client: spliced[0], + healthcheckTimeout: setTimeout(() => this.checkUnhealthyClient(unhealthy), UNHEALTHY_DURATION), + }; + this.unhealthyClients.push(unhealthy); + // the current index was removed + // decrease so next roundrobin uses current index again with new client + this.roundRobinIndex--; + } + + callback( + clientUsed: KMIPClient, + funcName: T, + args: ArrayPopped, + logger: ArrayLast, + originalCallback: ArrayLast, + ) { + let retries = 0; + const newCB = (err: (ArsenalError | Error | null) & { code?: number }, ...rest: any[]) => { + if (!err) { + // @ts-expect-error ts2556 typescript won't accept the spread on args array + return originalCallback(err, ...rest); + } + if (err.code && err.code >= 400 && err.code <= 499) { + // @ts-expect-error ts2556 typescript won't accept the spread on args array + return originalCallback(err, ...rest); + } + + if (retries === this.retries) { + logger.warn(`kmip max retries reached: ${retries} / ${this.retries}`); + // @ts-expect-error ts2556 typescript won't accept the spread on args array + return originalCallback(err, ...rest); + } + retries++; + + this.markUnhealthyClient(clientUsed, logger, err); + + if (this.clients.length === 0) { + logger.warn('kmip cluster has no healthy hosts'); + // @ts-expect-error ts2556 typescript won't accept the spread on args array + return originalCallback(err, ...rest); + } + + const nextClient = this.next(); + // @ts-expect-error ts2556 typescript won't accept the spread on args array + nextClient[funcName](...args, newCB); + return undefined; + }; + return newCB; + } createBucketKey(...args: Parameters) { + const originalCallback = args.pop() as ArrayLast; + const poppedArgs = args as unknown as ArrayPopped; + const logger = args[args.length - 1] as ArrayLast; + + if (this.clients.length === 0) { + logger.warn('kmip cluster has no healthy hosts'); + return originalCallback(errorInstances.InternalError.customizeDescription( + kmipMsg('Create', args[0], `no healthy host in the cluster`))); + } + const client = this.next(); - client.createBucketKey.apply(client, args); + + client.createBucketKey( + ...poppedArgs, + this.callback(client, 'createBucketKey', poppedArgs, logger, originalCallback), + ); + return undefined; } destroyBucketKey(...args: Parameters) { + const originalCallback = args.pop() as ArrayLast; + const poppedArgs = args as unknown as ArrayPopped; + const logger = args[args.length - 1] as ArrayLast; + + if (this.clients.length === 0) { + logger.warn('kmip cluster has no healthy hosts'); + return originalCallback(errorInstances.InternalError.customizeDescription( + kmipMsg('Destroy', args[0], `no healthy host in the cluster`))); + } + const client = this.next(); - client.destroyBucketKey.apply(client, args); + + client.destroyBucketKey( + ...poppedArgs, + this.callback(client, 'destroyBucketKey', poppedArgs, logger, originalCallback), + ); + return undefined; } cipherDataKey(...args: Parameters) { + const originalCallback = args.pop() as ArrayLast; + const poppedArgs = args as unknown as ArrayPopped; + const logger = args[args.length - 1] as ArrayLast; + + if (this.clients.length === 0) { + logger.warn('kmip cluster has no healthy hosts'); + return originalCallback(errorInstances.InternalError.customizeDescription( + kmipMsg('Encrypt', args[1], `no healthy host in the cluster`))); + } + const client = this.next(); - client.cipherDataKey.apply(client, args); + + client.cipherDataKey( + ...poppedArgs, + this.callback(client, 'cipherDataKey', poppedArgs, logger, originalCallback), + ); + return undefined; } decipherDataKey(...args: Parameters) { + const originalCallback = args.pop() as ArrayLast; + const poppedArgs = args as unknown as ArrayPopped; + const logger = args[args.length - 1] as ArrayLast; + + if (this.clients.length === 0) { + logger.warn('kmip cluster has no healthy hosts'); + return originalCallback(errorInstances.InternalError.customizeDescription( + kmipMsg('Decrypt', args[1], `no healthy host in the cluster`))); + } + const client = this.next(); - client.decipherDataKey.apply(client, args); + + client.decipherDataKey( + ...poppedArgs, + this.callback(client, 'decipherDataKey', poppedArgs, logger, originalCallback), + ); + return undefined; } clusterHealthcheck(logger: Logger, cb: (err: Error | null) => void) { From c0d6cd8261a760755dbe4fefc50e613e4c82af6e Mon Sep 17 00:00:00 2001 From: Mickael Bourgois Date: Mon, 19 May 2025 19:31:16 +0200 Subject: [PATCH 4/5] ARSN-495: Print error msg --- lib/network/kmip/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/network/kmip/index.ts b/lib/network/kmip/index.ts index d64e16af4..a6f4c1be2 100644 --- a/lib/network/kmip/index.ts +++ b/lib/network/kmip/index.ts @@ -362,7 +362,7 @@ export default class KMIP { .customizeDescription(kmipMsg(operation, resource, err.toString())); // warn level to avoid dumping debug and trace logs on retryable errors logger.warn('KMIP::request: Failed to send message', - { error: err, kmip: kmipLog }); + { error: err, msg: err?.toString?.(), kmip: kmipLog }); return cb(error); } const response = this._decodeMessage(logger, rawResponse); From 0ef69d30b34aa3aa33c59c275e46918cda7531eb Mon Sep 17 00:00:00 2001 From: Mickael Bourgois Date: Mon, 19 May 2025 19:29:01 +0200 Subject: [PATCH 5/5] ARSN-495: Bump version --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 1c3109e81..f9c737774 100644 --- a/package.json +++ b/package.json @@ -3,7 +3,7 @@ "engines": { "node": ">=16" }, - "version": "7.70.44", + "version": "7.70.45", "description": "Common utilities for the S3 project components", "main": "build/index.js", "repository": {