ft: ZENKO-147 Use Redis keys instead of hash #286

Merged: 1 commit, May 31, 2018
1 change: 1 addition & 0 deletions conf/config.json
@@ -80,6 +80,7 @@
"replicationStatusTopic": "backbeat-replication-status",
"replicationFailedTopic": "backbeat-replication-failed",
"monitorReplicationFailures": true,
"monitorReplicationFailureExpiryTimeS": 86400,
"queueProcessor": {
"groupId": "backbeat-replication-group",
"retryTimeoutS": 300,
4 changes: 4 additions & 0 deletions extensions/replication/ReplicationConfigValidator.js
@@ -6,6 +6,8 @@ const { hostPortJoi, bootstrapListJoi, adminCredsJoi } =
const transportJoi = joi.alternatives().try('http', 'https')
.default('http');

const CRR_FAILURE_EXPIRY = 24 * 60 * 60; // Expire Redis keys after 24 hours.

const joiSchema = {
source: {
transport: transportJoi,
@@ -52,6 +54,8 @@ const joiSchema = {
replicationStatusTopic: joi.string().required(),
monitorReplicationFailures: joi.boolean().default(true),
replicationFailedTopic: joi.string().required(),
monitorReplicationFailureExpiryTimeS:
joi.number().default(CRR_FAILURE_EXPIRY),
queueProcessor: {
groupId: joi.string().required(),
retryTimeoutS: joi.number().default(300),
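The validator gives monitorReplicationFailureExpiryTimeS a 24-hour default, so deployments that never set the field in conf/config.json still get expiring failure keys. A minimal standalone sketch of how that default is applied (not part of the PR; the exact call style may differ across joi versions):

```js
const joi = require('joi');

const CRR_FAILURE_EXPIRY = 24 * 60 * 60; // 86400 seconds

// Reduced schema containing only the new field.
const schema = joi.object({
    monitorReplicationFailureExpiryTimeS:
        joi.number().default(CRR_FAILURE_EXPIRY),
});

// The field is omitted on purpose: validation fills in the default.
const { value } = schema.validate({});
console.log(value.monitorReplicationFailureExpiryTimeS); // 86400
```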
18 changes: 10 additions & 8 deletions extensions/replication/failedCRR/FailedCRRConsumer.js
@@ -6,7 +6,6 @@ const redisClient = require('../../replication/utils/getRedisClient')();
const FailedCRRProducer = require('./FailedCRRProducer');
const BackbeatConsumer = require('../../../lib/BackbeatConsumer');
const BackbeatTask = require('../../../lib/tasks/BackbeatTask');
const redisKeys = require('../constants').redisKeys;
const config = require('../../../conf/Config');

// BackbeatConsumer constant defaults
@@ -18,6 +17,7 @@ class FailedCRRConsumer {
* Create the retry consumer.
*/
constructor() {
this._repConfig = config.extensions.replication;
this._kafkaConfig = config.kafka;
this._topic = config.extensions.replication.replicationFailedTopic;
this.logger = new Logger('Backbeat:FailedCRRConsumer');
@@ -77,7 +77,7 @@
log.end();
return cb();
}
return this._setRedisHash(data, kafkaEntry, log, cb);
return this._setRedisKey(data, kafkaEntry, log, cb);
}

/**
@@ -90,11 +90,11 @@
* @param {Function} cb - The callback function
* @return {undefined}
*/
_setRedisHash(data, kafkaEntry, log, cb) {
_setRedisKey(data, kafkaEntry, log, cb) {
this._backbeatTask.retry({
actionDesc: 'set redis key',
logFields: {},
actionFunc: done => this._setRedisHashOnce(data, log, done),
actionFunc: done => this._setRedisKeyOnce(data, log, done),
shouldRetryFunc: err => err.retryable,
log,
}, err => {
@@ -110,14 +110,16 @@

/**
* Attempt to set the Redis hash.
* @param {Object} data - The field and value for the Redis hash
* @param {Object} data - The key and value for the Redis key
* @param {Werelogs} log - The werelogs logger
* @param {Function} cb - The callback function
* @return {undefined}
*/
_setRedisHashOnce(data, log, cb) {
const cmds = ['hmset', redisKeys.failedCRR, [data.field, data.value]];
return redisClient.batch([cmds], (err, res) => {
_setRedisKeyOnce(data, log, cb) {
const { key, value } = data;
const expiry = this._repConfig.monitorReplicationFailureExpiryTimeS;
const cmd = ['set', key, value, 'EX', expiry];
return redisClient.batch([cmd], (err, res) => {
if (err) {
return cb({ retryable: true });
}
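Storing each failure under its own key lets Redis expire entries on its own instead of a single hash growing until something prunes it. A rough sketch of what the ['set', key, value, 'EX', expiry] batch entry amounts to, assuming an ioredis-compatible client and an invented key (the real prefix comes from redisKeys.failedCRR):

```js
const Redis = require('ioredis'); // assumption: ioredis-compatible client

const redis = new Redis();
const key = '<failedCRR-prefix>:my-bucket:my-key:<encoded-version-id>:aws-location-1'; // illustrative
const value = JSON.stringify({ /* raw kafka entry */ });
const expiry = 86400; // monitorReplicationFailureExpiryTimeS

// SET key value EX expiry: the failed entry self-destructs after 24 hours.
redis.set(key, value, 'EX', expiry)
    .then(() => redis.ttl(key))
    .then(ttl => console.log('seconds left before the entry expires:', ttl));
```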
@@ -14,6 +14,7 @@ const ReplicationTaskScheduler = require('../utils/ReplicationTaskScheduler');
const UpdateReplicationStatus = require('../tasks/UpdateReplicationStatus');
const QueueEntry = require('../../../lib/models/QueueEntry');
const ObjectQueueEntry = require('../utils/ObjectQueueEntry');
const { redisKeys } = require('../constants');

/**
* @class ReplicationStatusProcessor
@@ -145,7 +146,8 @@
const versionId = queueEntry.getEncodedVersionId();
const { site } = backend;
const message = {
field: `${bucket}:${key}:${versionId}:${site}`,
key: `${redisKeys.failedCRR}:` +
`${bucket}:${key}:${versionId}:${site}`,
value: Buffer.from(kafkaEntry.value).toString(),
};
return this._FailedCRRProducer
157 changes: 90 additions & 67 deletions lib/api/BackbeatAPI.js
@@ -12,6 +12,7 @@ const QueueEntry = require('../../lib/models/QueueEntry');
const Healthcheck = require('./Healthcheck');
const routes = require('./routes');
const { redisKeys } = require('../../extensions/replication/constants');
const getFailedCRRKey = require('../util/getFailedCRRKey');
const monitoringClient = require('../clients/monitoringHandler').client;

// StatsClient constant defaults
@@ -216,57 +217,85 @@ class BackbeatAPI {
/**
* Builds the failed CRR response.
* @param {String} cursor - The Redis HSCAN cursor
* @param {Array} hashes - The collection of Redis hashes for the iteration
* @return {Object} - The response object
* @param {Array} keys - The collection of Redis keys for the iteration
* @param {Function} cb - The callback function
* @return {undefined}
*/
_getFailedCRRResponse(cursor, hashes) {
_getFailedCRRResponse(cursor, keys, cb) {
const response = {
IsTruncated: Number.parseInt(cursor, 10) !== 0,
Versions: [],
};
if (response.IsTruncated) {
response.NextMarker = Number.parseInt(cursor, 10);
}
for (let i = 0; i < hashes.length; i += 2) {
const [bucket, key, versionId, site] = hashes[i].split(':');
const entry = hashes[i + 1];
const value = JSON.parse(JSON.parse(entry).value);
response.Versions.push({
Bucket: bucket,
Key: key,
VersionId: versionId,
StorageClass: site,
Size: value['content-length'],
LastModified: value['last-modified'],
});
}
return response;
const cmds = keys.map(k => ['get', k]);
return this._redisClient.batch(cmds, (err, res) => {
if (err) {
return cb(err);
}
for (let i = 0; i < res.length; i++) {
const [cmdErr, value] = res[i];
if (cmdErr) {
return cb(cmdErr);
}
const queueEntry = QueueEntry.createFromKafkaEntry({ value });
response.Versions.push({
Bucket: queueEntry.getBucket(),
Key: queueEntry.getObjectKey(),
VersionId: queueEntry.getEncodedVersionId(),
StorageClass: queueEntry.getSite(),
Size: queueEntry.getContentLength(),
LastModified: queueEntry.getLastModified(),
});
}
return cb(null, response);
});
}
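For reference, a response assembled by this method has the following general shape; every value below is invented for illustration:

```js
const exampleResponse = {
    IsTruncated: true,   // the SCAN cursor was non-zero, so more keys may remain
    NextMarker: 42,      // cursor to pass back as the marker on the next request
    Versions: [
        {
            Bucket: 'source-bucket',
            Key: 'photos/cat.jpg',
            VersionId: '<encoded-version-id>',
            StorageClass: 'aws-location-1',
            Size: 1024,
            LastModified: '2018-05-29T18:30:00.000Z',
        },
    ],
};
```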

/**
* Find all failed CRR operations that match the bucket, key, and versionID.
* @param {Object} details - The route details
* @param {Function} cb - The callback to call
* Recursively scan all existing keys with a count of 1000. Call callback if
* the response is greater or equal to 1000 keys, or we have scanned all
* keys (i.e. when the cursor is 0).
* @param {String} pattern - The key pattern to match
* @param {Number} marker - The cursor to start scanning from
* @param {Array} allKeys - The collection of all matching keys found
* @param {Function} cb - The callback function
* @return {undefined}
*/
getFailedCRR(details, cb) {
const { bucket, key, versionId } = details;
const pattern = `${bucket}:${key}:${versionId}:*`;
const cmds =
['hscan', redisKeys.failedCRR, 0, 'MATCH', pattern, 'COUNT', 1000];
this._redisClient.batch([cmds], (err, res) => {
_scanAllKeys(pattern, marker, allKeys, cb) {
const cmd = ['scan', marker, 'MATCH', pattern, 'COUNT', 1000];
this._redisClient.batch([cmd], (err, res) => {
if (err) {
return cb(err);
}
const [cmdErr, collection] = res[0];
if (cmdErr) {
return cb(cmdErr);
}
const [cursor, hashes] = collection;
return cb(null, this._getFailedCRRResponse(cursor, hashes));
const [cursor, keys] = collection;
allKeys.push(...keys);
if (allKeys.length >= 1000 || Number.parseInt(cursor, 10) === 0) {
Contributor (on `allKeys.length >= 1000`):
If we get 1000 keys initially, won't this return before getting the next set of keys? Or was this intentional?

@bennettbuchanan (Author), May 29, 2018:
Yes, in effect it allows for a paginated response. I wanted to limit the API response listing to ~1000 keys to model it more closely on the default for version listings in S3. We cannot guarantee the exact number because Redis does not guarantee how many keys it returns during a scan, so if we have 1000 or more, the API includes a NextMarker value for subsequent listings.
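A sketch of how a caller could follow that NextMarker to drain the whole listing. getAllFailedCRR and the response fields come from this file; wrapping them in a pager like this is only an illustration, and `api` is assumed to be a BackbeatAPI instance:

```js
// Illustrative pager (not part of the PR).
function listAllFailures(api, marker, collected, cb) {
    api.getAllFailedCRR({ marker }, (err, response) => {
        if (err) {
            return cb(err);
        }
        collected.push(...response.Versions);
        if (!response.IsTruncated) {
            // Cursor reached 0: the scan is complete.
            return cb(null, collected);
        }
        // Roughly 1000 keys per page: continue from the returned cursor.
        return listAllFailures(api, response.NextMarker, collected, cb);
    });
}

// listAllFailures(backbeatAPI, 0, [], (err, versions) => { /* ... */ });
```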

return cb(null, cursor, allKeys);
}
return this._scanAllKeys(pattern, cursor, allKeys, cb);
Contributor:
I'm not familiar with the Redis API, but if asked for 1000 keys, may it return fewer than that even if there are 1000+ keys to return? In that case it looks correct to call _scanAllKeys again; otherwise it seems we could just return the results in the callback in all cases.

@bennettbuchanan (Author):
There isn't a guarantee on the number of keys returned by Redis during the SCAN operation. (I've added a point about this in the Operational Considerations section of the doc for this feature.) We do have a guarantee that the scan has completed when the returned cursor is 0, so in that case we can return whatever results are there, even if fewer than 1000.
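The SCAN contract described above can be seen in isolation: COUNT is only a hint, any single call may return far fewer keys than requested, and the one hard guarantee is that a returned cursor of 0 means the iteration is finished. A standalone sketch, assuming an ioredis-compatible client rather than the project's redisClient wrapper:

```js
const Redis = require('ioredis'); // assumption: ioredis-compatible client

const redis = new Redis();

// Collect every key matching a pattern, looping until the cursor comes back as 0.
async function scanAll(pattern) {
    const keys = [];
    let cursor = '0';
    do {
        const [nextCursor, batch] = await redis.scan(
            cursor, 'MATCH', pattern, 'COUNT', 1000);
        keys.push(...batch); // batch may hold anywhere from 0 to ~1000 keys
        cursor = nextCursor;
    } while (cursor !== '0');
    return keys;
}
```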

});
}

/**
* Find all failed CRR operations that match the bucket, key, and versionID.
* @param {Object} details - The route details
* @param {Function} cb - The callback to call
* @return {undefined}
*/
getFailedCRR(details, cb) {
const { bucket, key, versionId } = details;
const { failedCRR } = redisKeys;
const pattern = `${failedCRR}:${bucket}:${key}:${versionId}:*`;
return this._scanAllKeys(pattern, 0, [], (err, cursor, keys) =>
this._getFailedCRRResponse(cursor, keys, cb));
}

/**
* Get all CRR operations that have failed.
* @param {Object} details - The route details
@@ -275,26 +304,17 @@
*/
getAllFailedCRR(details, cb) {
const marker = Number.parseInt(details.marker, 10) || 0;
const cmds = ['hscan', redisKeys.failedCRR, marker, 'COUNT', 1000];
this._redisClient.batch([cmds], (err, res) => {
if (err) {
return cb(err);
}
const [cmdErr, collection] = res[0];
if (cmdErr) {
return cb(cmdErr);
}
const [cursor, hashes] = collection;
return cb(null, this._getFailedCRRResponse(cursor, hashes));
});
const pattern = `${redisKeys.failedCRR}:*`;
return this._scanAllKeys(pattern, marker, [], (err, cursor, keys) =>
this._getFailedCRRResponse(cursor, keys, cb));
}

/**
* For the given queue enry's site, send an entry with PENDING status to the
* replication status topic, then send an entry to the replication topic so
* that the queue processor re-attempts replication.
* For the given queue entry's site, send an entry with PENDING status to
* the replication status topic, then send an entry to the replication topic
* so that the queue processor re-attempts replication.
* @param {QueueEntry} queueEntry - The queue entry constructed from the
* failed kafka entry that was stored as a Redis hash value.
* failed kafka entry that was stored as a Redis key value.
* @param {Function} cb - The callback.
* @return {undefined}
*/
@@ -312,29 +332,27 @@
}

/**
* Delete the failed CRR Redis hash field.
* @param {String} field - The field in the hash to delete
* Delete the failed CRR Redis key.
* @param {String} key - The key to delete
* @param {Function} cb - The callback function
* @return {undefined}
*/
_deleteFailedCRRField(field, cb) {
const cmds = ['hdel', redisKeys.failedCRR, field];
return this._redisClient.batch([cmds], (err, res) => {
_deleteFailedCRRField(key, cb) {
const cmd = ['del', key];
return this._redisClient.batch([cmd], (err, res) => {
if (err) {
this._logger.error('error deleting redis hash field', {
this._logger.error('error deleting redis key', {
method: 'BackbeatAPI._deleteFailedCRRField',
key: redisKeys.failedCRR,
field,
key,
error: err,
});
return cb(err);
}
const [cmdErr] = res[0];
if (cmdErr) {
this._logger.error('error deleting redis hash field', {
this._logger.error('error deleting redis key', {
method: 'BackbeatAPI._deleteFailedCRRField',
key: redisKeys.failedCRR,
field,
key,
error: cmdErr,
});
return cb(cmdErr);
@@ -380,11 +398,12 @@
LastModified: queueEntry.getLastModified(),
ReplicationStatus: 'PENDING',
});
const field = `${Bucket}:${Key}:${VersionId}:${StorageClass}`;
return this._deleteFailedCRRField(field, err => {
const key =
getFailedCRRKey(Bucket, Key, VersionId, StorageClass);
return this._deleteFailedCRRField(key, err => {
if (err) {
this._logger.error('could not delete redis hash key ' +
'after pushing to kafka topics', {
this._logger.error('could not delete redis key after ' +
'pushing to kafka topics', {
method: 'BackbeatAPI._processFailedKafkaEntries',
error: err,
});
@@ -408,20 +427,24 @@
if (error) {
return cb(error);
}
const fields = reqBody.map(o => {
const cmds = reqBody.map(o => {
const { Bucket, Key, VersionId, StorageClass } = o;
return `${Bucket}:${Key}:${VersionId}:${StorageClass}`;
const key = getFailedCRRKey(Bucket, Key, VersionId, StorageClass);
return ['get', key];
});
const cmds = ['hmget', redisKeys.failedCRR, ...fields];
return this._redisClient.batch([cmds], (err, res) => {
return this._redisClient.batch(cmds, (err, res) => {
if (err) {
return cb(err);
}
const [cmdErr, results] = res[0];
if (cmdErr) {
return cb(cmdErr);
const entries = [];
for (let i = 0; i < res.length; i++) {
const [cmdErr, entry] = res[i];
if (cmdErr) {
return cb(cmdErr);
}
entries.push(entry);
}
return this._processFailedKafkaEntries(results, cb);
return this._processFailedKafkaEntries(entries, cb);
});
}

16 changes: 16 additions & 0 deletions lib/util/getFailedCRRKey.js
@@ -0,0 +1,16 @@
const { redisKeys } = require('../../extensions/replication/constants');

/**
* Returns the schema used for failed CRR entry Redis keys.
* @param {String} bucket - The name of the bucket
* @param {String} key - The name of the key
* @param {String} versionId - The encoded version ID
* @param {String} storageClass - The storage class of the object
* @return {String} - The Redis key used for the failed CRR entry
*/
function getFailedCRRKey(bucket, key, versionId, storageClass) {
const { failedCRR } = redisKeys;
return `${failedCRR}:${bucket}:${key}:${versionId}:${storageClass}`;
}

module.exports = getFailedCRRKey;
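A quick illustration of the key this helper builds; the inputs are hypothetical and <failedCRR> stands for whatever redisKeys.failedCRR is defined as in the replication constants:

```js
const getFailedCRRKey = require('./lib/util/getFailedCRRKey'); // path from the repo root

const key = getFailedCRRKey(
    'my-bucket', 'photos/cat.jpg', '<encoded-version-id>', 'aws-location-1');
// => '<failedCRR>:my-bucket:photos/cat.jpg:<encoded-version-id>:aws-location-1'
```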
1 change: 1 addition & 0 deletions tests/config.json
@@ -43,6 +43,7 @@
"topic": "backbeat-test-replication",
"replicationStatusTopic": "backbeat-test-replication-status",
"monitorReplicationFailures": true,
"monitorReplicationFailureExpiryTimeS": 86400,
"groupId": "backbeat-test-replication-group",
"destination": {
"bootstrapList": [