-
Notifications
You must be signed in to change notification settings - Fork 386
/
Copy pathSentenceWindowNodeParser.ts
89 lines (78 loc) · 2.56 KB
/
SentenceWindowNodeParser.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import type { BaseNode } from "../Node.js";
import { SentenceSplitter } from "../TextSplitter.js";
import type { NodeParser } from "./types.js";
import { getNodesFromDocument } from "./utils.js";
/** Default number of sentences captured on each side of a sentence. */
export const DEFAULT_WINDOW_SIZE = 3;
/** Default metadata key under which the sentence window is stored. */
export const DEFAULT_WINDOW_METADATA_KEY = "window";
/** Default metadata key under which the original sentence text is stored. */
export const DEFAULT_OG_TEXT_METADATA_KEY = "original_text";
/**
 * Node parser that annotates each node built from a document with the text of
 * its neighbouring nodes (its "window").
 *
 * Nodes are produced by `getNodesFromDocument`, which is handed the splitter's
 * `getSentenceSplits` — presumably one node per sentence (confirm against
 * `./utils`). Every node then receives two metadata entries: the joined text
 * of up to `windowSize` nodes on each side plus itself, and its own text. Both
 * keys are added to the node's embed/LLM metadata exclusion lists.
 */
export class SentenceWindowNodeParser implements NodeParser {
  /**
   * The text splitter to use.
   */
  textSplitter: SentenceSplitter;
  /**
   * The number of sentences on each side of a sentence to capture.
   */
  windowSize: number = DEFAULT_WINDOW_SIZE;
  /**
   * The metadata key to store the sentence window under.
   */
  windowMetadataKey: string = DEFAULT_WINDOW_METADATA_KEY;
  /**
   * The metadata key to store the original sentence in.
   */
  originalTextMetadataKey: string = DEFAULT_OG_TEXT_METADATA_KEY;
  /**
   * Whether to include metadata in the nodes.
   */
  includeMetadata: boolean = true;
  /**
   * Whether to include previous and next relationships in the nodes.
   */
  includePrevNextRel: boolean = true;

  /**
   * @param init Optional overrides for any of the public fields. Options that
   * are missing or explicitly `undefined` keep their defaults.
   */
  constructor(init?: Partial<SentenceWindowNodeParser>) {
    // Copy only options that carry a value. A plain `Object.assign(this, init)`
    // would also copy explicit `undefined`s over the field defaults; e.g.
    // `{ windowSize: undefined }` turns the slice bounds into NaN and every
    // window silently comes out empty.
    const provided = Object.fromEntries(
      Object.entries(init ?? {}).filter(([, value]) => value !== undefined),
    );
    Object.assign(this, provided);
    this.textSplitter = init?.textSplitter ?? new SentenceSplitter();
  }

  /**
   * Convenience factory; equivalent to calling the constructor with `init`.
   */
  static fromDefaults(
    init?: Partial<SentenceWindowNodeParser>,
  ): SentenceWindowNodeParser {
    return new SentenceWindowNodeParser(init);
  }

  /**
   * Async entry point; delegates to {@link getNodesFromDocuments}.
   *
   * @param nodes Documents/nodes to parse.
   * @param _options Accepted but not used by this parser.
   * @returns The window-annotated nodes of all inputs, in input order.
   */
  async transform(nodes: BaseNode[], _options?: any): Promise<BaseNode[]> {
    return this.getNodesFromDocuments(nodes);
  }

  /**
   * Builds window-annotated nodes for every document and concatenates them
   * into a single flat array, preserving document order.
   */
  getNodesFromDocuments(documents: BaseNode[]): BaseNode[] {
    return documents.flatMap((document) =>
      this.buildWindowNodesFromDocument(document),
    );
  }

  /**
   * Splits one document into nodes and attaches window metadata to each.
   *
   * @param doc The document to split.
   * @returns The nodes returned by `getNodesFromDocument`, mutated in place
   * with the window and original-text metadata entries.
   */
  protected buildWindowNodesFromDocument(doc: BaseNode): BaseNode[] {
    const nodes = getNodesFromDocument(
      doc,
      this.textSplitter.getSentenceSplits.bind(this.textSplitter),
      this.includeMetadata,
      this.includePrevNextRel,
    );
    const hiddenKeys = [this.windowMetadataKey, this.originalTextMetadataKey];

    for (const [i, node] of nodes.entries()) {
      // Up to `windowSize` neighbours on each side, clamped to the array bounds.
      const windowNodes = nodes.slice(
        Math.max(0, i - this.windowSize),
        Math.min(i + this.windowSize + 1, nodes.length),
      );
      node.metadata[this.windowMetadataKey] = windowNodes
        .map((n) => n.getText())
        .join(" ");
      node.metadata[this.originalTextMetadataKey] = node.getText();

      // Register both keys in the embed and LLM exclusion lists. Skip keys
      // that are already listed so the lists never accumulate duplicates
      // (e.g. when a key was excluded beforehand or both metadata keys are
      // configured to the same string).
      for (const excluded of [
        node.excludedEmbedMetadataKeys,
        node.excludedLlmMetadataKeys,
      ]) {
        for (const key of hiddenKeys) {
          if (!excluded.includes(key)) {
            excluded.push(key);
          }
        }
      }
    }
    return nodes;
  }
}