forked from cockroachdb/cockroach
-
Notifications
You must be signed in to change notification settings - Fork 0
/
keys.go
477 lines (427 loc) · 20.4 KB
/
keys.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
// Copyright 2015 The Cockroach Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License. See the AUTHORS file
// for names of contributors.
//
// Author: Spencer Kimball (spencer.kimball@gmail.com)
// Author: Tobias Schottdorf (tobias.schottdorf@gmail.com)
package engine
import (
"bytes"
"fmt"
"github.com/cockroachdb/cockroach/proto"
"github.com/cockroachdb/cockroach/util/encoding"
"github.com/cockroachdb/cockroach/util/log"
)
// MakeKey makes a new key which is the concatenation of the
// given inputs, in order.
func MakeKey(keys ...proto.Key) proto.Key {
return proto.MakeKey(keys...)
}
// MakeStoreKey creates a store-local key based on the metadata key
// suffix, and optional detail.
func MakeStoreKey(suffix, detail proto.Key) proto.Key {
return MakeKey(KeyLocalStorePrefix, suffix, detail)
}
// StoreIdentKey returns a store-local key for the store metadata.
func StoreIdentKey() proto.Key {
return MakeStoreKey(KeyLocalStoreIdentSuffix, proto.Key{})
}
// StoreStatKey returns the key for accessing the named stat.
func StoreStatKey(stat proto.Key) proto.Key {
return MakeStoreKey(KeyLocalStoreStatSuffix, stat)
}
// StoreStatusKey returns the key for accessing the store status for the
// specified store ID.
func StoreStatusKey(storeID int32) proto.Key {
return MakeKey(KeyStatusStorePrefix, encoding.EncodeUvarint(nil, uint64(storeID)))
}
// NodeStatusKey returns the key for accessing the node status for the
// specified node ID.
func NodeStatusKey(nodeID int32) proto.Key {
return MakeKey(KeyStatusNodePrefix, encoding.EncodeUvarint(nil, uint64(nodeID)))
}
// MakeRangeIDKey creates a range-local key based on the range's
// Raft ID, metadata key suffix, and optional detail (e.g. the
// encoded command ID for a response cache entry, etc.).
func MakeRangeIDKey(raftID int64, suffix, detail proto.Key) proto.Key {
if len(suffix) != KeyLocalSuffixLength {
panic(fmt.Sprintf("suffix len(%q) != %d", suffix, KeyLocalSuffixLength))
}
return MakeKey(KeyLocalRangeIDPrefix, encoding.EncodeUvarint(nil, uint64(raftID)), suffix, detail)
}
// RaftLogKey returns a system-local key for a Raft log entry.
func RaftLogKey(raftID int64, logIndex uint64) proto.Key {
return MakeRangeIDKey(raftID, KeyLocalRaftLogSuffix, encoding.EncodeUint64(nil, logIndex))
}
// RaftLogPrefix returns the system-local prefix shared by all entries in a Raft log.
func RaftLogPrefix(raftID int64) proto.Key {
return MakeRangeIDKey(raftID, KeyLocalRaftLogSuffix, proto.Key{})
}
// RaftHardStateKey returns a system-local key for a Raft HardState.
func RaftHardStateKey(raftID int64) proto.Key {
return MakeRangeIDKey(raftID, KeyLocalRaftHardStateSuffix, proto.Key{})
}
// DecodeRaftStateKey extracts the Raft ID from a RaftStateKey.
func DecodeRaftStateKey(key proto.Key) int64 {
if !bytes.HasPrefix(key, KeyLocalRangeIDPrefix) {
panic(fmt.Sprintf("key %q does not have %q prefix", key, KeyLocalRangeIDPrefix))
}
// Cut the prefix and the Raft ID.
b := key[len(KeyLocalRangeIDPrefix):]
_, raftID := encoding.DecodeUvarint(b)
return int64(raftID)
}
// RaftTruncatedStateKey returns a system-local key for a RaftTruncatedState.
func RaftTruncatedStateKey(raftID int64) proto.Key {
return MakeRangeIDKey(raftID, KeyLocalRaftTruncatedStateSuffix, proto.Key{})
}
// RaftAppliedIndexKey returns a system-local key for a raft applied index.
func RaftAppliedIndexKey(raftID int64) proto.Key {
return MakeRangeIDKey(raftID, KeyLocalRaftAppliedIndexSuffix, proto.Key{})
}
// RaftLeaderLeaseKey returns a system-local key for a raft leader lease.
func RaftLeaderLeaseKey(raftID int64) proto.Key {
return MakeRangeIDKey(raftID, KeyLocalRaftLeaderLeaseSuffix, proto.Key{})
}
// RaftLastIndexKey returns a system-local key for a raft last index.
func RaftLastIndexKey(raftID int64) proto.Key {
return MakeRangeIDKey(raftID, KeyLocalRaftLastIndexSuffix, proto.Key{})
}
// RangeStatKey returns the key for accessing the named stat
// for the specified Raft ID.
func RangeStatKey(raftID int64, stat proto.Key) proto.Key {
return MakeRangeIDKey(raftID, KeyLocalRangeStatSuffix, stat)
}
// ResponseCacheKey returns a range-local key by Raft ID for a
// response cache entry, with detail specified by encoding the
// supplied client command ID.
func ResponseCacheKey(raftID int64, cmdID *proto.ClientCmdID) proto.Key {
detail := proto.Key{}
if cmdID != nil {
detail = encoding.EncodeUvarint(nil, uint64(cmdID.WallTime)) // wall time helps sort for locality
detail = encoding.EncodeUint64(detail, uint64(cmdID.Random))
}
return MakeRangeIDKey(raftID, KeyLocalResponseCacheSuffix, detail)
}
// MakeRangeKey creates a range-local key based on the range
// start key, metadata key suffix, and optional detail (e.g. the
// transaction ID for a txn record, etc.).
func MakeRangeKey(key, suffix, detail proto.Key) proto.Key {
if len(suffix) != KeyLocalSuffixLength {
panic(fmt.Sprintf("suffix len(%q) != %d", suffix, KeyLocalSuffixLength))
}
return MakeKey(KeyLocalRangeKeyPrefix, encoding.EncodeBytes(nil, key), suffix, detail)
}
// DecodeRangeKey decodes the range key into range start key,
// suffix and optional detail (may be nil).
func DecodeRangeKey(key proto.Key) (startKey, suffix, detail proto.Key) {
if !bytes.HasPrefix(key, KeyLocalRangeKeyPrefix) {
panic(fmt.Sprintf("key %q does not have %q prefix", key, KeyLocalRangeKeyPrefix))
}
// Cut the prefix and the Raft ID.
b := key[len(KeyLocalRangeKeyPrefix):]
b, startKey = encoding.DecodeBytes(b)
if len(b) < KeyLocalSuffixLength {
panic(fmt.Sprintf("key %q does not have suffix of length %d", key, KeyLocalSuffixLength))
}
// Cut the response cache suffix.
suffix = b[:KeyLocalSuffixLength]
detail = b[KeyLocalSuffixLength:]
return
}
// RangeGCMetadataKey returns a range-local key for range garbage
// collection metadata.
func RangeGCMetadataKey(raftID int64) proto.Key {
return MakeRangeIDKey(raftID, KeyLocalRangeGCMetadataSuffix, proto.Key{})
}
// RangeLastVerificationTimestampKey returns a range-local key for
// the range's last verification timestamp.
func RangeLastVerificationTimestampKey(raftID int64) proto.Key {
return MakeRangeIDKey(raftID, KeyLocalRangeLastVerificationTimestampSuffix, proto.Key{})
}
// RangeTreeNodeKey returns a range-local key for the the range's
// node in the range tree.
func RangeTreeNodeKey(key proto.Key) proto.Key {
return MakeRangeKey(key, KeyLocalRangeTreeNodeSuffix, proto.Key{})
}
// RangeDescriptorKey returns a range-local key for the descriptor
// for the range with specified key.
func RangeDescriptorKey(key proto.Key) proto.Key {
return MakeRangeKey(key, KeyLocalRangeDescriptorSuffix, proto.Key{})
}
// TransactionKey returns a transaction key based on the provided
// transaction key and ID. The base key is encoded in order to
// guarantee that all transaction records for a range sort together.
func TransactionKey(key proto.Key, id []byte) proto.Key {
return MakeRangeKey(key, KeyLocalTransactionSuffix, proto.Key(id))
}
// KeyAddress returns the address for the key, used to lookup the
// range containing the key. In the normal case, this is simply the
// key's value. However, for local keys, such as transaction records,
// range-spanning binary tree node pointers, and message queues, the
// address is the trailing suffix of the key, with the local key
// prefix removed. In this way, local keys address to the same range
// as non-local keys, but are stored separately so that they don't
// collide with user-space or global system keys.
//
// However, not all local keys are addressable in the global map. Only
// range local keys incorporating a range key (start key or transaction
// key) are addressable (e.g. range metadata and txn records). Range
// local keys incorporating the Raft ID are not (e.g. response cache
// entries, and range stats).
func KeyAddress(k proto.Key) proto.Key {
if !bytes.HasPrefix(k, KeyLocalPrefix) {
return k
}
if bytes.HasPrefix(k, KeyLocalRangeKeyPrefix) {
k = k[len(KeyLocalRangeKeyPrefix):]
_, k = encoding.DecodeBytes(k)
return k
}
log.Fatalf("local key %q malformed; should contain prefix %q", k, KeyLocalRangeKeyPrefix)
return nil
}
// RangeMetaKey returns a range metadata (meta1, meta2) indexing key
// for the given key. For ordinary keys this returns a level 2
// metadata key - for level 2 keys, it returns a level 1 key. For
// level 1 keys and local keys, KeyMin is returned.
func RangeMetaKey(key proto.Key) proto.Key {
if len(key) == 0 {
return KeyMin
}
addr := KeyAddress(key)
if !bytes.HasPrefix(addr, KeyMetaPrefix) {
return MakeKey(KeyMeta2Prefix, addr)
}
if bytes.HasPrefix(addr, KeyMeta2Prefix) {
return MakeKey(KeyMeta1Prefix, addr[len(KeyMeta2Prefix):])
}
return KeyMin
}
// ValidateRangeMetaKey validates that the given key is a valid Range Metadata
// key. It must have an appropriate metadata range prefix, and the original key
// value must be less than KeyMax. As a special case, KeyMin is considered a
// valid Range Metadata Key.
func ValidateRangeMetaKey(key proto.Key) error {
// KeyMin is a valid key.
if len(key) == 0 {
return nil
}
// Key must be at least as long as KeyMeta1Prefix.
if len(key) < len(KeyMeta1Prefix) {
return NewInvalidRangeMetaKeyError("too short", key)
}
prefix, body := key[:len(KeyMeta1Prefix)], key[len(KeyMeta1Prefix):]
// The prefix must be equal to KeyMeta1Prefix or KeyMeta2Prefix
if !bytes.HasPrefix(key, KeyMetaPrefix) {
return NewInvalidRangeMetaKeyError(fmt.Sprintf("does not have %q prefix", KeyMetaPrefix), key)
}
if lvl := string(prefix[len(KeyMetaPrefix)]); lvl != "1" && lvl != "2" {
return NewInvalidRangeMetaKeyError("meta level is not 1 or 2", key)
}
// Body of the key must sort before KeyMax
if !body.Less(KeyMax) {
return NewInvalidRangeMetaKeyError("body of range lookup is >= KeyMax", key)
}
return nil
}
// Constants for stat key construction.
var (
// StatLiveBytes counts how many bytes are "live", including bytes
// from both keys and values. Live rows include only non-deleted
// keys and only the most recent value.
StatLiveBytes = proto.Key("live-bytes")
// StatKeyBytes counts how many bytes are used to store all keys,
// including bytes from deleted keys. Key bytes are re-counted for
// each versioned value.
StatKeyBytes = proto.Key("key-bytes")
// StatValBytes counts how many bytes are used to store all values,
// including all historical versions and deleted tombstones.
StatValBytes = proto.Key("val-bytes")
// StatIntentBytes counts how many bytes are used to store values
// which are unresolved intents. Includes bytes used for both intent
// keys and values.
StatIntentBytes = proto.Key("intent-bytes")
// StatLiveCount counts how many keys are "live". This includes only
// non-deleted keys.
StatLiveCount = proto.Key("live-count")
// StatKeyCount counts the total number of keys, including both live
// and deleted keys.
StatKeyCount = proto.Key("key-count")
// StatValCount counts the total number of values, including all
// historical versions and deleted tombstones.
StatValCount = proto.Key("val-count")
// StatIntentCount counts the number of unresolved intents.
StatIntentCount = proto.Key("intent-count")
// StatIntentAge counts the total age of unresolved intents.
StatIntentAge = proto.Key("intent-age")
// StatGCBytesAge counts the total age of gc'able bytes.
StatGCBytesAge = proto.Key("gc-age")
// StatLastUpdateNanos counts nanoseconds since the unix epoch for
// the last update to the intent / GC'able bytes ages. This really
// is tracking the wall time as at last update, but is a merged
// stat, with successive counts of elapsed nanos being added at each
// stat computation.
StatLastUpdateNanos = proto.Key("update-nanos")
)
// Constants for system-reserved keys in the KV map.
var (
// KeyMaxLength is the maximum key length in bytes. This value is
// somewhat arbitrary. It is chosen high enough to allow most
// conceivable use cases while also still being comfortably short of
// a limit which would affect the performance of the system, both
// from performance of key comparisons and from memory usage for
// things like the timestamp cache, lookup cache, and command queue.
KeyMaxLength = proto.KeyMaxLength
// KeyMin is a minimum key value which sorts before all other keys.
KeyMin = proto.KeyMin
// KeyMax is a maximum key value which sorts after all other keys.
KeyMax = proto.KeyMax
// MVCCKeyMax is a maximum mvcc-encoded key value which sorts after
// all other keys.
MVCCKeyMax = MVCCEncodeKey(KeyMax)
// KeyLocalPrefix is the prefix for keys which hold data local to a
// RocksDB instance, such as store and range-specific metadata which
// must not pollute the user key space, but must be collocate with
// the store and/or ranges which they refer to. Storing this
// information in the normal system keyspace would place the data on
// an arbitrary set of stores, with no guarantee of collocation.
// Local data includes store metadata, range metadata, response
// cache values, transaction records, range-spanning binary tree
// node pointers, and message queues.
//
// The local key prefix has been deliberately chosen to sort before
// the KeySystemPrefix, because these local keys are not addressable
// via the meta range addressing indexes.
//
// Some local data are not replicated, such as the store's 'ident'
// record. Most local data are replicated, such as response cache
// entries and transaction rows, but are not addressable as normal
// MVCC values as part of transactions. Finally, some local data are
// stored as MVCC values and are addressable as part of distributed
// transactions, such as range metadata, range-spanning binary tree
// node pointers, and message queues.
KeyLocalPrefix = proto.Key("\x00\x00\x00")
// KeyLocalSuffixLength specifies the length in bytes of all local
// key suffixes.
KeyLocalSuffixLength = 4
// There are three types of local key data enumerated below:
// store-local, range-local by ID, and range-local by key.
// KeyLocalStorePrefix is the prefix identifying per-store data.
KeyLocalStorePrefix = MakeKey(KeyLocalPrefix, proto.Key("s"))
// KeyLocalStoreIdentSuffix stores an immutable identifier for this
// store, created when the store is first bootstrapped.
KeyLocalStoreIdentSuffix = proto.Key("iden")
// KeyLocalStoreStatSuffix is the suffix for store statistics.
KeyLocalStoreStatSuffix = proto.Key("sst-")
// KeyLocalRangeIDPrefix is the prefix identifying per-range data
// indexed by Raft ID. The Raft ID is appended to this prefix,
// encoded using EncodeUvarint. The specific sort of per-range
// metadata is identified by one of the suffixes listed below, along
// with potentially additional encoded key info, such as a command
// ID in the case of response cache entry.
//
// NOTE: KeyLocalRangeIDPrefix must be kept in sync with the value
// in storage/engine/db.cc.
KeyLocalRangeIDPrefix = MakeKey(KeyLocalPrefix, proto.Key("i"))
// KeyLocalResponseCacheSuffix is the suffix for keys storing
// command responses used to guarantee idempotency (see ResponseCache).
// NOTE: if this value changes, it must be updated in C++
// (storage/engine/db.cc).
KeyLocalResponseCacheSuffix = proto.Key("res-")
// KeyLocalRaftLeaderLeaseSuffix is the suffix for the raft leader lease.
KeyLocalRaftLeaderLeaseSuffix = proto.Key("rfll")
// KeyLocalRaftHardStateSuffix is the Suffix for the raft HardState.
KeyLocalRaftHardStateSuffix = proto.Key("rfth")
// KeyLocalRaftAppliedIndexSuffix is the suffix for the raft applied index.
KeyLocalRaftAppliedIndexSuffix = proto.Key("rfta")
// KeyLocalRaftLogSuffix is the suffix for the raft log.
KeyLocalRaftLogSuffix = proto.Key("rftl")
// KeyLocalRaftTruncatedStateSuffix is the suffix for the RaftTruncatedState.
KeyLocalRaftTruncatedStateSuffix = proto.Key("rftt")
// KeyLocalRaftLastIndexSuffix is the suffix for raft's last index.
KeyLocalRaftLastIndexSuffix = proto.Key("rfti")
// KeyLocalRangeGCMetadataSuffix is the suffix for a range's GC metadata.
KeyLocalRangeGCMetadataSuffix = proto.Key("rgcm")
// KeyLocalRangeLastVerificationTimestampSuffix is the suffix for a range's
// last verification timestamp (for checking integrity of on-disk data).
KeyLocalRangeLastVerificationTimestampSuffix = proto.Key("rlvt")
// KeyLocalRangeStatSuffix is the suffix for range statistics.
KeyLocalRangeStatSuffix = proto.Key("rst-")
// KeyLocalRangeKeyPrefix is the prefix identifying per-range data
// indexed by range key (either start key, or some key in the
// range). The key is appended to this prefix, encoded using
// EncodeBytes. The specific sort of per-range metadata is
// identified by one of the suffixes listed below, along with
// potentially additional encoded key info, such as the txn ID in
// the case of a transaction record.
//
// NOTE: KeyLocalRangeKeyPrefix must be kept in sync with the value
// in storage/engine/db.cc.
KeyLocalRangeKeyPrefix = MakeKey(KeyLocalPrefix, proto.Key("k"))
// KeyLocalRangeDescriptorSuffix is the suffix for keys storing
// range descriptors. The value is a struct of type RangeDescriptor.
KeyLocalRangeDescriptorSuffix = proto.Key("rdsc")
// KeyLocalRangeTreeNodeSuffix is the suffix for keys storing
// range tree nodes. The value is a struct of type RangeTreeNode.
KeyLocalRangeTreeNodeSuffix = proto.Key("rtn-")
// KeyLocalTransactionSuffix specifies the key suffix for
// transaction records. The additional detail is the transaction id.
// NOTE: if this value changes, it must be updated in C++
// (storage/engine/db.cc).
KeyLocalTransactionSuffix = proto.Key("txn-")
// KeyLocalMax is the end of the local key range.
KeyLocalMax = KeyLocalPrefix.PrefixEnd()
// KeySystemPrefix indicates the beginning of the key range for
// global, system data which are replicated across the cluster.
KeySystemPrefix = proto.Key("\x00")
KeySystemMax = proto.Key("\x01")
// KeyMetaPrefix is the prefix for range metadata keys. Notice that
// an extra null character in the prefix causes all range addressing
// records to sort before any system tables which they might describe.
KeyMetaPrefix = MakeKey(KeySystemPrefix, proto.Key("\x00meta"))
// KeyMeta1Prefix is the first level of key addressing. The value is a
// RangeDescriptor struct.
KeyMeta1Prefix = MakeKey(KeyMetaPrefix, proto.Key("1"))
// KeyMeta2Prefix is the second level of key addressing. The value is a
// RangeDescriptor struct.
KeyMeta2Prefix = MakeKey(KeyMetaPrefix, proto.Key("2"))
// KeyMetaMax is the end of the range of addressing keys.
KeyMetaMax = MakeKey(KeySystemPrefix, proto.Key("\x01"))
// KeyConfigAccountingPrefix specifies the key prefix for accounting
// configurations. The suffix is the affected key prefix.
KeyConfigAccountingPrefix = MakeKey(KeySystemPrefix, proto.Key("acct"))
// KeyConfigPermissionPrefix specifies the key prefix for accounting
// configurations. The suffix is the affected key prefix.
KeyConfigPermissionPrefix = MakeKey(KeySystemPrefix, proto.Key("perm"))
// KeyConfigZonePrefix specifies the key prefix for zone
// configurations. The suffix is the affected key prefix.
KeyConfigZonePrefix = MakeKey(KeySystemPrefix, proto.Key("zone"))
// KeyNodeIDGenerator is the global node ID generator sequence.
KeyNodeIDGenerator = MakeKey(KeySystemPrefix, proto.Key("node-idgen"))
// KeyRaftIDGenerator is the global Raft consensus group ID generator sequence.
KeyRaftIDGenerator = MakeKey(KeySystemPrefix, proto.Key("raft-idgen"))
// KeySchemaPrefix specifies key prefixes for schema definitions.
KeySchemaPrefix = MakeKey(KeySystemPrefix, proto.Key("schema"))
// KeyStoreIDGenerator is the global store ID generator sequence.
KeyStoreIDGenerator = MakeKey(KeySystemPrefix, proto.Key("store-idgen"))
// KeyRangeTreeRoot specifies the root range in the range tree.
KeyRangeTreeRoot = MakeKey(KeySystemPrefix, proto.Key("range-tree-root"))
// KeyStatusPrefix specifies the key prefix to store all status details.
KeyStatusPrefix = MakeKey(KeySystemPrefix, proto.Key("status-"))
// KeyStatusStorePrefix stores all status info for stores.
KeyStatusStorePrefix = MakeKey(KeyStatusPrefix, proto.Key("store-"))
// KeyStatusNodePrefix stores all status info for nodes.
KeyStatusNodePrefix = MakeKey(KeyStatusPrefix, proto.Key("node-"))
)