TextSplitter.ts
import { EOL } from "node:os";
import { globalsHelper } from "./GlobalsHelper";
import { DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE } from "./constants";
class TextSplit {
textChunk: string;
numCharOverlap: number | undefined;
constructor(
textChunk: string,
numCharOverlap: number | undefined = undefined,
) {
this.textChunk = textChunk;
this.numCharOverlap = numCharOverlap;
}
}
type SplitRep = { text: string; numTokens: number };
const defaultregex = /[.?!][\])'"`’”]*(?:\s|$)/g;
export const defaultSentenceTokenizer = (text: string): string[] => {
const slist = [];
const iter = text.matchAll(defaultregex);
let lastIdx = 0;
for (const match of iter) {
slist.push(text.slice(lastIdx, match.index! + 1));
lastIdx = match.index! + 1;
}
slist.push(text.slice(lastIdx));
return slist.filter((s) => s.length > 0);
};
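// Illustrative sketch (not part of the module): the terminating punctuation
// stays attached to its sentence, and leading whitespace on subsequent
// sentences is preserved, e.g.
//   defaultSentenceTokenizer("Hello there. How are you?")
//   // => ["Hello there.", " How are you?"]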
// Refs: https://github.com/fxsjy/jieba/issues/575#issuecomment-359637511
const resentencesp =
/([﹒﹔﹖﹗.;。!?]["’”」』]{0,2}|:(?=["‘“「『]{1,2}|$))/;
/**
 * Tokenizes sentences. Suitable for Chinese, Japanese, and Korean text. Use this instead of `defaultSentenceTokenizer` for CJK content.
 * @param sentence
 * @returns string[]
 */
export function cjkSentenceTokenizer(sentence: string): string[] {
const slist = [];
const parts = sentence.split(resentencesp);
for (let i = 0; i < parts.length; i++) {
const part = parts[i];
if (resentencesp.test(part) && slist.length > 0) {
slist[slist.length - 1] += part;
} else if (part) {
slist.push(part);
}
}
return slist.filter((s) => s.length > 0);
}
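// Illustrative sketch (not part of the module): the text is split on CJK
// punctuation and each terminator stays attached to its sentence, e.g.
//   cjkSentenceTokenizer("你好。今天天气如何?")
//   // => ["你好。", "今天天气如何?"]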
export const defaultParagraphSeparator = EOL + EOL + EOL;
// In theory there's also the old Mac-style \r-only line ending, but it's
// pre-OS X and few documents still use it.
/**
* SentenceSplitter is our default text splitter that supports splitting into sentences, paragraphs, or fixed length chunks with overlap.
*
* One of the advantages of SentenceSplitter is that even in the fixed length chunks it will try to keep sentences together.
*/
export class SentenceSplitter {
private chunkSize: number;
private chunkOverlap: number;
private tokenizer: any;
private tokenizerDecoder: any;
private paragraphSeparator: string;
private chunkingTokenizerFn: (text: string) => string[];
private splitLongSentences: boolean;
constructor(options?: {
chunkSize?: number;
chunkOverlap?: number;
tokenizer?: any;
tokenizerDecoder?: any;
paragraphSeparator?: string;
chunkingTokenizerFn?: (text: string) => string[];
splitLongSentences?: boolean;
}) {
const {
chunkSize = DEFAULT_CHUNK_SIZE,
chunkOverlap = DEFAULT_CHUNK_OVERLAP,
tokenizer = null,
tokenizerDecoder = null,
paragraphSeparator = defaultParagraphSeparator,
chunkingTokenizerFn,
splitLongSentences = false,
} = options ?? {};
if (chunkOverlap > chunkSize) {
throw new Error(
`Got a larger chunk overlap (${chunkOverlap}) than chunk size (${chunkSize}), should be smaller.`,
);
}
this.chunkSize = chunkSize;
this.chunkOverlap = chunkOverlap;
// this._callback_manager = callback_manager || new CallbackManager([]);
this.tokenizer = tokenizer ?? globalsHelper.tokenizer();
this.tokenizerDecoder =
tokenizerDecoder ?? globalsHelper.tokenizerDecoder();
this.paragraphSeparator = paragraphSeparator;
this.chunkingTokenizerFn = chunkingTokenizerFn ?? defaultSentenceTokenizer;
this.splitLongSentences = splitLongSentences;
}
private getEffectiveChunkSize(extraInfoStr?: string): number {
// get "effective" chunk size by removing the metadata
let effectiveChunkSize;
if (extraInfoStr != undefined) {
const numExtraTokens = this.tokenizer(`${extraInfoStr}\n\n`).length + 1;
effectiveChunkSize = this.chunkSize - numExtraTokens;
if (effectiveChunkSize <= 0) {
throw new Error(
"Effective chunk size is non positive after considering extra_info",
);
}
} else {
effectiveChunkSize = this.chunkSize;
}
return effectiveChunkSize;
}
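// For example (a sketch, assuming the default tokenizer): if chunkSize is
// 1024 and `${extraInfoStr}\n\n` tokenizes to 20 tokens, the effective
// chunk size is 1024 - (20 + 1) = 1003 tokens.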
getParagraphSplits(text: string, effectiveChunkSize?: number): string[] {
// get paragraph splits
let paragraphSplits: string[] = text.split(this.paragraphSeparator);
let idx = 0;
if (effectiveChunkSize == undefined) {
return paragraphSplits;
}
// merge paragraphs that are too small
while (idx < paragraphSplits.length) {
if (
idx < paragraphSplits.length - 1 &&
paragraphSplits[idx].length < effectiveChunkSize
) {
paragraphSplits[idx] = [
paragraphSplits[idx],
paragraphSplits[idx + 1],
].join(this.paragraphSeparator);
paragraphSplits.splice(idx + 1, 1);
} else {
idx += 1;
}
}
return paragraphSplits;
}
getSentenceSplits(text: string, effectiveChunkSize?: number): string[] {
let paragraphSplits = this.getParagraphSplits(text, effectiveChunkSize);
// Next, we split the text using the chunking tokenizer fn.
let splits = [];
for (const parText of paragraphSplits) {
const sentenceSplits = this.chunkingTokenizerFn(parText);
if (!sentenceSplits) {
continue;
}
for (const sentenceSplit of sentenceSplits) {
splits.push(sentenceSplit.trim());
}
}
return splits;
}
/**
 * Splits sentences into chunks if necessary.
 *
 * This isn't great behavior, because it can split in the middle of a
 * word or, in non-English text, in the middle of a Unicode codepoint,
 * so it is turned off by default. If you need it, set the
 * splitLongSentences option to true.
 * @param sentenceSplits
 * @param effectiveChunkSize
 * @returns
 */
private processSentenceSplits(
sentenceSplits: string[],
effectiveChunkSize: number,
): SplitRep[] {
if (!this.splitLongSentences) {
return sentenceSplits.map((split) => ({
text: split,
numTokens: this.tokenizer(split).length,
}));
}
let newSplits: SplitRep[] = [];
for (const split of sentenceSplits) {
let splitTokens = this.tokenizer(split);
const splitLen = splitTokens.length;
if (splitLen <= effectiveChunkSize) {
newSplits.push({ text: split, numTokens: splitLen });
} else {
for (let i = 0; i < splitLen; i += effectiveChunkSize) {
const curSplit = this.tokenizerDecoder(
splitTokens.slice(i, i + effectiveChunkSize),
);
newSplits.push({ text: curSplit, numTokens: effectiveChunkSize });
}
}
}
return newSplits;
}
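// Illustrative sketch (not part of the module): with splitLongSentences
// enabled and effectiveChunkSize = 4, a sentence tokenizing to 10 tokens is
// force-split into pieces of 4, 4, and 2 tokens. Note that the last,
// partial piece's numTokens is still reported as effectiveChunkSize, which
// slightly overestimates its real length.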
combineTextSplits(
newSentenceSplits: SplitRep[],
effectiveChunkSize: number,
): TextSplit[] {
// go through sentence splits, combine to chunks that are within the chunk size
// docs represents final list of text chunks
let docs: TextSplit[] = [];
// curChunkSentences is the current list of sentence splits that
// will be merged into a chunk
let curChunkSentences: SplitRep[] = [];
let curChunkTokens = 0;
for (let i = 0; i < newSentenceSplits.length; i++) {
// if adding newSentenceSplits[i] to curChunkSentences would exceed
// effectiveChunkSize, then we need to flush curChunkSentences into docs
if (
curChunkTokens + newSentenceSplits[i].numTokens >
effectiveChunkSize
) {
if (curChunkSentences.length > 0) {
// push the current chunk to docs
docs.push(
new TextSplit(
curChunkSentences
.map((sentence) => sentence.text)
.join(" ")
.trim(),
),
);
}
const lastChunkSentences = curChunkSentences;
// reset the current chunk
curChunkTokens = 0;
curChunkSentences = [];
// add the last sentences from the last chunk until we've hit the overlap
// do it in reverse order
for (let j = lastChunkSentences.length - 1; j >= 0; j--) {
if (
curChunkTokens + lastChunkSentences[j].numTokens >
this.chunkOverlap
) {
break;
}
curChunkSentences.unshift(lastChunkSentences[j]);
curChunkTokens += lastChunkSentences[j].numTokens + 1;
}
}
curChunkSentences.push(newSentenceSplits[i]);
curChunkTokens += newSentenceSplits[i].numTokens + 1;
}
docs.push(
new TextSplit(
curChunkSentences
.map((sentence) => sentence.text)
.join(" ")
.trim(),
),
);
return docs;
}
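// Worked example (a sketch): with effectiveChunkSize = 10, chunkOverlap = 5,
// and three sentence splits s0, s1, s2 of 4 tokens each:
//   - s0 and s1 fit in the first chunk (4 + 1 + 4 + 1 = 10 tokens, counting
//     the joining space after each sentence);
//   - adding s2 would exceed 10, so "s0 s1" is emitted and s1 (4 tokens,
//     within the 5-token overlap budget) is carried into the next chunk;
//   - the final chunk is "s1 s2".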
splitTextWithOverlaps(text: string, extraInfoStr?: string): TextSplit[] {
// Split incoming text and return chunks with the configured overlap.
// Prefers complete sentences and phrases, and minimal overlap.
if (text == "") {
return [];
}
let effectiveChunkSize = this.getEffectiveChunkSize(extraInfoStr);
let sentenceSplits = this.getSentenceSplits(text, effectiveChunkSize);
// Check if any sentences exceed the chunk size. If they do, force-split
// them with the tokenizer (only when splitLongSentences is enabled).
let newSentenceSplits = this.processSentenceSplits(
sentenceSplits,
effectiveChunkSize,
);
// combine sentence splits into chunks of text that can then be returned
let combinedTextSplits = this.combineTextSplits(
newSentenceSplits,
effectiveChunkSize,
);
return combinedTextSplits;
}
splitText(text: string, extraInfoStr?: string): string[] {
const textSplits = this.splitTextWithOverlaps(text, extraInfoStr);
const chunks = textSplits.map((textSplit) => textSplit.textChunk);
return chunks;
}
}
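// Usage sketch (illustrative; longDocumentText is a hypothetical input
// string, and the defaults come from this package's constants):
//   const splitter = new SentenceSplitter({ chunkSize: 512, chunkOverlap: 20 });
//   const chunks = splitter.splitText(longDocumentText);
//   // chunks: string[] of roughly 512-token pieces whose boundaries respect
//   // sentence and paragraph structure where possible.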