forked from dotnet/runtime
-
Notifications
You must be signed in to change notification settings - Fork 0
/
FrozenHashTable.cs
289 lines (249 loc) · 14 KB
/
FrozenHashTable.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Buffers;
using System.Collections.Generic;
using System.Diagnostics;
using System.Runtime.CompilerServices;
namespace System.Collections.Frozen
{
    /// <summary>Provides the core hash table for use in frozen collections.</summary>
    /// <remarks>
    /// This hash table doesn't track any of the collection state. It merely keeps track
    /// of hash codes and of mapping these hash codes to spans of entries within the collection.
    /// Entries that share a bucket are stored contiguously in <see cref="HashCodes"/>, so a
    /// lookup is a single fastmod plus a linear scan over one small contiguous region.
    /// </remarks>
    internal readonly struct FrozenHashTable
    {
        // Per-bucket [start, end] ranges into HashCodes; indexed by FastMod(hashCode, _buckets.Length).
        private readonly Bucket[] _buckets;
        // Precomputed multiplier enabling HashHelpers.FastMod to replace a division with multiplies/shifts.
        private readonly ulong _fastModMultiplier;

        /// <summary>Initializes the hashtable with the computed hashcodes and bucket information.</summary>
        /// <param name="hashCodes">The array of hashcodes grouped into contiguous regions by bucket. Each bucket is one and only one region of the array.</param>
        /// <param name="buckets">
        /// The array of buckets, indexed by hashCodes % buckets.Length, where each bucket is
        /// the start/end index into <paramref name="hashCodes"/> for all items in that bucket.
        /// </param>
        /// <param name="fastModMultiplier">The multiplier to use as part of a FastMod method call.</param>
        private FrozenHashTable(int[] hashCodes, Bucket[] buckets, ulong fastModMultiplier)
        {
            Debug.Assert(hashCodes.Length != 0);
            Debug.Assert(buckets.Length != 0);

            HashCodes = hashCodes;
            _buckets = buckets;
            _fastModMultiplier = fastModMultiplier;
        }

        /// <summary>Initializes a frozen hash table.</summary>
        /// <param name="entriesLength">The number of entries to track from the hash table.</param>
        /// <param name="hashAtIndex">A delegate that produces a hash code for a given entry. It's passed the index of the entry and returns that entry's hash code.</param>
        /// <param name="storeDestIndexFromSrcIndex">A delegate that assigns the index to a specific entry. It's passed the destination and source indices.</param>
        /// <param name="optimizeForReading">true to spend additional effort tuning for subsequent read speed on the table; false to prioritize construction time.</param>
        /// <remarks>
        /// This method will iterate through the incoming entries and will invoke the hasher on each once.
        /// It will then determine the optimal number of hash buckets to allocate and will populate the
        /// bucket table. In the process of doing so, it calls out to the <paramref name="storeDestIndexFromSrcIndex"/> to indicate
        /// the resulting index for that entry. <see cref="FindMatchingEntries(int, out int, out int)"/>
        /// then uses this index to reference individual entries by indexing into <see cref="HashCodes"/>.
        /// </remarks>
        /// <returns>A frozen hash table.</returns>
        public static FrozenHashTable Create(int entriesLength, Func<int, int> hashAtIndex, Action<int, int> storeDestIndexFromSrcIndex, bool optimizeForReading = true)
        {
            Debug.Assert(entriesLength != 0);

            // Calculate the hashcodes for every entry. The pooled array may be longer than
            // entriesLength, so all work happens through the exactly-sized span.
            int[] arrayPoolHashCodes = ArrayPool<int>.Shared.Rent(entriesLength);
            Span<int> hashCodes = arrayPoolHashCodes.AsSpan(0, entriesLength);
            for (int i = 0; i < entriesLength; i++)
            {
                hashCodes[i] = hashAtIndex(i);
            }

            // Determine how many buckets to use. This might be fewer than the number of entries
            // if any entries have identical hashcodes (not just different hashcodes that might
            // map to the same bucket).
            int numBuckets = CalcNumBuckets(hashCodes, optimizeForReading);
            ulong fastModMultiplier = HashHelpers.GetFastModMultiplier((uint)numBuckets);

            // Create two spans (carved from a single pooled array):
            // - bucketStarts: initially filled with all -1s, the ith element stores the index
            //   into hashCodes of the head element of that bucket's chain.
            // - nexts: the ith element stores the index of the next item in the chain.
            int[] arrayPoolBuckets = ArrayPool<int>.Shared.Rent(numBuckets + hashCodes.Length);
            Span<int> bucketStarts = arrayPoolBuckets.AsSpan(0, numBuckets);
            Span<int> nexts = arrayPoolBuckets.AsSpan(numBuckets, hashCodes.Length);
            bucketStarts.Fill(-1); // -1 serves as the "empty bucket" / end-of-chain sentinel

            // Populate the bucket entries and starts. For each hash code, compute its bucket,
            // and store at the bucket entry corresponding to the hashcode item the entry for that
            // item, which includes a copy of the hash code and the current bucket start, which
            // is then replaced by this entry as it's pushed into the bucket list (i.e. each
            // item is pushed onto the front of its bucket's singly-linked chain).
            for (int index = 0; index < hashCodes.Length; index++)
            {
                int hashCode = hashCodes[index];
                int bucketNum = (int)HashHelpers.FastMod((uint)hashCode, (uint)bucketStarts.Length, fastModMultiplier);

                ref int bucketStart = ref bucketStarts[bucketNum];
                nexts[index] = bucketStart;
                bucketStart = index;
            }

            // Write out the hashcodes and buckets arrays to be used by the FrozenHashtable instance.
            // We iterate through each bucket start, and from each, each item in that chain, writing
            // out all of the items in each chain next to each other in the hashcodes list (and
            // calling the setter to allow the consumer to reorder its entries appropriately).
            // Along the way we count how many items are in each chain, and use that along with
            // the starting index to write out the bucket information for indexing into hashcodes.
            var hashtableHashcodes = new int[hashCodes.Length];
            var hashtableBuckets = new Bucket[bucketStarts.Length];
            int count = 0;
            for (int bucketNum = 0; bucketNum < hashtableBuckets.Length; bucketNum++)
            {
                int bucketStart = bucketStarts[bucketNum];
                if (bucketStart < 0)
                {
                    // Empty bucket; its Bucket entry stays default. FindMatchingEntries will
                    // report a (0, -1) range for it, yielding an empty scan.
                    continue;
                }

                // Walk the chain, copying each member's hash code into the next contiguous slot
                // and telling the caller where its entry landed.
                int bucketCount = 0;
                int index = bucketStart;
                bucketStart = count; // repurpose as the destination start index for this bucket
                while (index >= 0)
                {
                    hashtableHashcodes[count] = hashCodes[index];
                    storeDestIndexFromSrcIndex(count, index);
                    count++;
                    bucketCount++;

                    index = nexts[index];
                }

                hashtableBuckets[bucketNum] = new Bucket(bucketStart, bucketCount);
            }
            Debug.Assert(count == hashtableHashcodes.Length);

            // Return the scratch arrays now that all data lives in the exact-size result arrays.
            ArrayPool<int>.Shared.Return(arrayPoolBuckets);
            ArrayPool<int>.Shared.Return(arrayPoolHashCodes);

            return new FrozenHashTable(hashtableHashcodes, hashtableBuckets, fastModMultiplier);
        }

        /// <summary>
        /// Given a hash code, return the first index and last index for the associated matching entries.
        /// </summary>
        /// <param name="hashCode">The hash code to probe for.</param>
        /// <param name="startIndex">A variable that receives the index of the first matching entry.</param>
        /// <param name="endIndex">A variable that receives the index of the last matching entry plus 1.</param>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public void FindMatchingEntries(int hashCode, out int startIndex, out int endIndex)
        {
            Bucket[] buckets = _buckets;
            ref Bucket b = ref buckets[HashHelpers.FastMod((uint)hashCode, (uint)buckets.Length, _fastModMultiplier)];
            startIndex = b.StartIndex;
            endIndex = b.EndIndex;
        }

        /// <summary>Gets the number of entries in the table.</summary>
        public int Count => HashCodes.Length;

        /// <summary>The hash codes of all entries, grouped contiguously by bucket.</summary>
        internal int[] HashCodes { get; }

        /// <summary>
        /// Given a span of hash codes, figure out the best number of hash buckets to use.
        /// </summary>
        /// <remarks>
        /// This tries to select a prime number of buckets. Rather than iterating through all possible bucket
        /// sizes, starting at the exact number of hash codes and incrementing the bucket count by 1 per trial,
        /// this is a trade-off between speed of determining a good number of buckets and maximal density.
        /// </remarks>
        private static int CalcNumBuckets(ReadOnlySpan<int> hashCodes, bool optimizeForReading)
        {
            Debug.Assert(hashCodes.Length != 0);

            const double AcceptableCollisionRate = 0.05;  // What is a satisfactory rate of hash collisions?
            const int LargeInputSizeThreshold = 1000;     // What is the limit for an input to be considered "small"?
            const int MaxSmallBucketTableMultiplier = 16; // How large a bucket table should be allowed for small inputs?
            const int MaxLargeBucketTableMultiplier = 3;  // How large a bucket table should be allowed for large inputs?

            if (!optimizeForReading)
            {
                // Fast path: skip the collision-rate search entirely and take the next prime.
                return HashHelpers.GetPrime(hashCodes.Length);
            }

            // Filter out duplicate codes, since no increase in buckets will avoid collisions from duplicate input hash codes.
            var codes =
#if NETCOREAPP2_0_OR_GREATER
                new HashSet<int>(hashCodes.Length);
#else
                new HashSet<int>();
#endif
            foreach (int hashCode in hashCodes)
            {
                codes.Add(hashCode);
            }
            Debug.Assert(codes.Count != 0);

            // In our precomputed primes table, find the index of the smallest prime that's at least as large as our number of
            // hash codes. If there are more codes than in our precomputed primes table, which accommodates millions of values,
            // give up and just use the next prime.
            int minPrimeIndexInclusive = 0;
            while (minPrimeIndexInclusive < HashHelpers.s_primes.Length && codes.Count > HashHelpers.s_primes[minPrimeIndexInclusive])
            {
                minPrimeIndexInclusive++;
            }
            if (minPrimeIndexInclusive >= HashHelpers.s_primes.Length)
            {
                return HashHelpers.GetPrime(codes.Count);
            }

            // Determine the largest number of buckets we're willing to use, based on a multiple of the number of inputs.
            // For smaller inputs, we allow for a larger multiplier.
            int maxNumBuckets =
                codes.Count *
                (codes.Count >= LargeInputSizeThreshold ? MaxLargeBucketTableMultiplier : MaxSmallBucketTableMultiplier);

            // Find the index of the smallest prime that accommodates our max buckets.
            int maxPrimeIndexExclusive = minPrimeIndexInclusive;
            while (maxPrimeIndexExclusive < HashHelpers.s_primes.Length && maxNumBuckets > HashHelpers.s_primes[maxPrimeIndexExclusive])
            {
                maxPrimeIndexExclusive++;
            }
            if (maxPrimeIndexExclusive < HashHelpers.s_primes.Length)
            {
                Debug.Assert(maxPrimeIndexExclusive != 0);
                // Clamp the cap to the largest prime not exceeding it, so the bitmap rented below
                // is sized for the biggest bucket count any trial will use.
                maxNumBuckets = HashHelpers.s_primes[maxPrimeIndexExclusive - 1];
            }

            // Bitmap of buckets seen during a trial, packed 32 flags per int.
            const int BitsPerInt32 = 32;
            int[] seenBuckets = ArrayPool<int>.Shared.Rent((maxNumBuckets / BitsPerInt32) + 1);

            int bestNumBuckets = maxNumBuckets;
            int bestNumCollisions = codes.Count;

            // Iterate through each available prime between the min and max discovered. For each, compute
            // the collision ratio.
            for (int primeIndex = minPrimeIndexInclusive; primeIndex < maxPrimeIndexExclusive; primeIndex++)
            {
                // Get the number of buckets to try, and clear our seen bucket bitmap.
                // (Clearing numBuckets ints is more than the bits strictly require, but never
                // fewer, and avoids computing the exact int count per trial.)
                int numBuckets = HashHelpers.s_primes[primeIndex];
                Array.Clear(seenBuckets, 0, Math.Min(numBuckets, seenBuckets.Length));

                // Determine the bucket for each hash code and mark it as seen. If it was already seen,
                // track it as a collision.
                int numCollisions = 0;
                foreach (int code in codes)
                {
                    // C# masks int shift counts to the low 5 bits, so (1 << (int)bucketNum)
                    // is equivalent to (1 << (bucketNum % 32)) — the bit within the packed int.
                    uint bucketNum = (uint)code % (uint)numBuckets;
                    if ((seenBuckets[bucketNum / BitsPerInt32] & (1 << (int)bucketNum)) != 0)
                    {
                        numCollisions++;
                        if (numCollisions >= bestNumCollisions)
                        {
                            // If we've already hit the previously known best number of collisions,
                            // there's no point in continuing as worst case we'd just use that.
                            break;
                        }
                    }
                    else
                    {
                        seenBuckets[bucketNum / BitsPerInt32] |= 1 << (int)bucketNum;
                    }
                }

                // If this evaluation resulted in fewer collisions, use it as the best instead.
                // And if it's below our collision threshold, we're done.
                if (numCollisions < bestNumCollisions)
                {
                    bestNumBuckets = numBuckets;
                    if (numCollisions / (double)codes.Count <= AcceptableCollisionRate)
                    {
                        break;
                    }

                    bestNumCollisions = numCollisions;
                }
            }

            ArrayPool<int>.Shared.Return(seenBuckets);
            return bestNumBuckets;
        }

        /// <summary>An inclusive [StartIndex, EndIndex] range into <see cref="HashCodes"/> for one bucket.</summary>
        private readonly struct Bucket
        {
            // First index in HashCodes belonging to this bucket.
            public readonly int StartIndex;
            // Last index in HashCodes belonging to this bucket (inclusive).
            public readonly int EndIndex;

            public Bucket(int startIndex, int count)
            {
                Debug.Assert(count > 0);
                StartIndex = startIndex;
                EndIndex = startIndex + count - 1;
            }
        }
    }
}