// GlobalsHelper.ts
import { encodingForModel } from "js-tiktoken";
/**
 * Names of the tokenizer encodings that callers can request.
 *
 * `GlobalsHelper.tokenizer()` and `GlobalsHelper.tokenizerDecoder()` reject
 * any value other than `CL100K_BASE`.
 */
export enum Tokenizers {
  /** The encoding backing the default tokenizer. */
  CL100K_BASE = "cl100k_base",
}
/**
 * @internal Helper class singleton
 *
 * Builds one encode/decode pair in its constructor, backed by the js-tiktoken
 * encoding for "text-embedding-ada-002", and hands the two functions out via
 * `tokenizer()` and `tokenizerDecoder()`.
 */
class GlobalsHelper {
  /** Encode/decode pair wrapping the encoding created in the constructor. */
  defaultTokenizer: {
    encode: (text: string) => Uint32Array;
    decode: (tokens: Uint32Array) => string;
  };

  constructor() {
    // cl100k_base
    const bpe = encodingForModel("text-embedding-ada-002");

    // The encoding's token list is copied into a Uint32Array for callers.
    const encode = (text: string): Uint32Array =>
      new Uint32Array(bpe.encode(text));

    const decode = (tokens: Uint32Array): string => {
      // The encoding's decode() is given a plain number[] copy of the tokens.
      const decoded = bpe.decode(Array.from(tokens));
      // Re-encode to UTF-8 bytes and decode again before returning.
      // NOTE(review): this looks like a no-op for well-formed strings —
      // confirm against js-tiktoken's decode() before removing it.
      const utf8Bytes = new TextEncoder().encode(decoded);
      return new TextDecoder().decode(utf8Bytes);
    };

    this.defaultTokenizer = { encode, decode };
  }

  /**
   * Shared guard for the accessor methods: an omitted (or otherwise falsy)
   * encoding and `Tokenizers.CL100K_BASE` are accepted, anything else throws.
   *
   * @throws Error when `encoding` names an unsupported tokenizer.
   */
  private ensureSupported(encoding?: Tokenizers): void {
    if (!encoding || encoding === Tokenizers.CL100K_BASE) {
      return;
    }
    throw new Error(`Tokenizer encoding ${encoding} not yet supported`);
  }

  /**
   * Returns the function that turns text into token ids.
   *
   * @param encoding - Optional encoding name; only `CL100K_BASE` is accepted.
   * @returns The default tokenizer's `encode`, bound to the tokenizer object.
   * @throws Error when `encoding` names an unsupported tokenizer.
   */
  tokenizer(encoding?: Tokenizers) {
    this.ensureSupported(encoding);
    const { encode } = this.defaultTokenizer;
    return encode.bind(this.defaultTokenizer);
  }

  /**
   * Returns the function that turns token ids back into text.
   *
   * @param encoding - Optional encoding name; only `CL100K_BASE` is accepted.
   * @returns The default tokenizer's `decode`, bound to the tokenizer object.
   * @throws Error when `encoding` names an unsupported tokenizer.
   */
  tokenizerDecoder(encoding?: Tokenizers) {
    this.ensureSupported(encoding);
    const { decode } = this.defaultTokenizer;
    return decode.bind(this.defaultTokenizer);
  }
}
/**
 * Shortens `value` so that it encodes to at most `maxTokens` tokens.
 *
 * The text is encoded with the requested tokenizer. If it already fits, the
 * original string is returned untouched; otherwise the first `maxTokens`
 * tokens are decoded back into a string.
 *
 * @param tokenizer - Encoding used for both encoding and decoding.
 * @param value - Text to truncate.
 * @param maxTokens - Maximum number of tokens to keep. Negative values are
 *   treated as 0.
 * @returns `value` itself when it fits, otherwise the decoded prefix.
 * @throws Error when `globalsHelper` rejects `tokenizer` as unsupported.
 */
export function truncateMaxTokens(
  tokenizer: Tokenizers,
  value: string,
  maxTokens: number,
): string {
  // A negative end index makes slice() count from the END of the array, which
  // would keep all but the last |maxTokens| tokens. Clamp so that a negative
  // budget means "keep nothing".
  const limit = Math.max(0, maxTokens);

  const encode = globalsHelper.tokenizer(tokenizer);
  const tokens = encode(value);

  if (tokens.length > limit) {
    // Over budget: keep only the leading tokens. The decoder is looked up
    // only on this path, since it is not needed when the text already fits.
    const decode = globalsHelper.tokenizerDecoder(tokenizer);
    return decode(tokens.slice(0, limit));
  }

  return value;
}
/**
 * Shared GlobalsHelper instance, constructed when this module is evaluated.
 * truncateMaxTokens obtains its encoder and decoder from this instance.
 */
export const globalsHelper = new GlobalsHelper();