/
sample.ts
161 lines (147 loc) · 6.05 KB
/
sample.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import type { IterableContainer } from "./_types";
import { purry } from "./purry";
type Sampled<T extends IterableContainer, N extends number> =
// Check if N is generic (e.g. not '5' but 'number')
number extends N
? SampledGeneric<T>
: // We check if the input tuple is shorter than N. We need to check this
// outside the recursive loop because T changes inside that loop
undefined extends T[N]
? T
: SampledLiteral<T, N>;
type SampledGeneric<T extends IterableContainer> =
// Stop the recursion when the array is empty
T[number] extends never
? T
: // As long as the tuple has non-rest elements we continue expanding the type
// by both taking the item, and not taking it.
T extends readonly [infer First, ...infer Rest]
? SampledGeneric<Rest> | [First, ...SampledGeneric<Rest>]
: // Stop the recursion also when we have an array, or the rest element of the
// tuple
Array<T[number]>;
type SampledLiteral<
T extends IterableContainer,
N extends number,
Iteration extends Array<unknown> = [],
> =
// Stop the recursion when the Iteration "array" is full
Iteration["length"] extends N
? []
: // If the tuple has a defined (non-rest) element, cut it and add it to the
// output tuple.
T extends readonly [infer First, ...infer Tail]
? [
First | Tail[number],
...SampledLiteral<Tail, N, [unknown, ...Iteration]>,
]
: T extends readonly [...infer Head, infer Last]
? [...SampledLiteral<Head, N, [unknown, ...Iteration]>, Last]
: // If the input is an array, or a tuple's rest-element we need to split the
// recursion in 2, one type adds an element to the output, and the other
// skips it, just like the sample method itself.
| SampledLiteral<T, N, [unknown, ...Iteration]>
| [T[number], ...SampledLiteral<T, N, [unknown, ...Iteration]>];
/**
* Returns a random subset of size `sampleSize` from `array`.
*
* Maintains and infers most of the typing information that could be passed
* along to the output. This means that when using tuples, the output will be
* a tuple too, and when using literals, those literals would be preserved.
*
* The items in the result are kept in the same order as they are in the input.
* If you need to get a shuffled response you can pipe the shuffle function
* after this one.
*
* @param data - The array.
* @param sampleSize - The number of elements to take.
* @signature
* R.sample(array, sampleSize)
* @example
* R.sample(["hello", "world"], 1); // => ["hello"] // typed string[]
* R.sample(["hello", "world"] as const, 1); // => ["world"] // typed ["hello" | "world"]
* @dataFirst
* @pipeable
* @category Array
*/
export function sample<T extends IterableContainer, N extends number = number>(
data: T,
sampleSize: N,
): Sampled<T, N>;
/**
* Returns a random subset of size `sampleSize` from `array`.
*
* Maintains and infers most of the typing information that could be passed
* along to the output. This means that when using tuples, the output will be
* a tuple too, and when using literals, those literals would be preserved.
*
* The items in the result are kept in the same order as they are in the input.
* If you need to get a shuffled response you can pipe the shuffle function
* after this one.
*
* @param sampleSize - The number of elements to take.
* @signature
* R.sample(sampleSize)(array)
* @example
* R.sample(1)(["hello", "world"]); // => ["hello"] // typed string[]
* R.sample(1)(["hello", "world"] as const); // => ["world"] // typed ["hello" | "world"]
* @dataLast
* @pipeable
* @category Array
*/
export function sample<T extends IterableContainer, N extends number = number>(
sampleSize: N,
): (data: T) => Sampled<T, N>;
export function sample(...args: ReadonlyArray<unknown>): unknown {
return purry(sampleImplementation, args);
}
function sampleImplementation<T>(
data: ReadonlyArray<T>,
sampleSize: number,
): Array<T> {
if (sampleSize < 0) {
throw new RangeError(`sampleSize must cannot be negative: ${sampleSize}`);
}
if (!Number.isInteger(sampleSize)) {
throw new TypeError(`sampleSize must be an integer: ${sampleSize}`);
}
if (sampleSize >= data.length) {
// Trivial
return data.slice();
}
if (sampleSize === 0) {
// Trivial
return [];
}
// We have 2 modes of sampling, depending on the size of the sample requested.
// 1. If sampleSize is _small_, we generate indices that we then use to
// *EXTRACT* individual elements from the array.
// 2. If sampleSize is _large_, we instead generate indices to *EXCLUDE* from
// a full scan of the input array (via filtering).
//
// This allows us 2 optimizations that are the core of how this function
// works:
// 1. It is hard to generate a large number of unique indices, as the more
// indices we generate the more likely we are to generate one that is already
// in the set, which would require more iterations of the generation loop.
// Capping our effective sampleSize at n/2 would put an upper limit to the
// average number of iterations required (as a function of n).
// 2. If sample size is small enough, we never need to actually iterate over
// the full input array at all; instead we simply project the values we need
// via random access into the array. This means that for sampleSize (K) less
// than n/2, we run at O(klogk). For large sampleSize we need to iterate over
// the full input array, but we don't need to sort the indices because we can
// use the Set's 'has' method, so we effectively run at O(n).
const actualSampleSize = Math.min(sampleSize, data.length - sampleSize);
const sampleIndices = new Set<number>();
while (sampleIndices.size < actualSampleSize) {
const randomIndex = Math.floor(Math.random() * data.length);
sampleIndices.add(randomIndex);
}
if (sampleSize === actualSampleSize) {
return Array.from(sampleIndices)
.sort((a, b) => a - b)
.map((index) => data[index]!);
}
return data.filter((_, index) => !sampleIndices.has(index));
}