-
Notifications
You must be signed in to change notification settings - Fork 314
/
MarkdownNodeParser.ts
109 lines (98 loc) · 2.89 KB
/
MarkdownNodeParser.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import type { BaseNode, Metadata } from "../Node.js";
import { MetadataMode, TextNode } from "../Node.js";
import type { NodeParser } from "./types.js";
/**
 * Splits the markdown text of each input node into one `TextNode` per
 * header-delimited section.
 *
 * Every produced node carries the chain of headers it sits under as metadata
 * (`"Header 1"`, `"Header 2"`, ...) and a `PARENT` relationship pointing back
 * at the node it was cut from.
 */
export class MarkdownNodeParser implements NodeParser {
  /** A header line: one or more `#`, one whitespace character, then the title. */
  private static readonly HEADER_PATTERN = /^(#+)\s(.*)/;
  /** Marker that opens or closes a fenced code block. */
  private static readonly CODE_FENCE = "```";

  /** When false, produced nodes get an empty metadata object. */
  includeMetadata: boolean;
  /**
   * Stored from the constructor options. No method of this class reads it;
   * previous/next relationships are not populated here.
   */
  includePrevNextRel: boolean;

  /**
   * @param init Optional flags; each one defaults to `true` when omitted.
   */
  constructor(init?: {
    includeMetadata?: boolean;
    includePrevNextRel?: boolean;
  }) {
    this.includeMetadata = init?.includeMetadata ?? true;
    this.includePrevNextRel = init?.includePrevNextRel ?? true;
  }

  /**
   * Async entry point; delegates to {@link getNodesFromDocuments}.
   *
   * @param nodes Nodes whose content is markdown text.
   * @param _options Accepted but ignored.
   * @returns The section nodes of every input node, in input order.
   */
  async transform(nodes: BaseNode[], _options?: any): Promise<BaseNode[]> {
    return this.getNodesFromDocuments(nodes);
  }

  /**
   * Factory equivalent to calling the constructor with the same options.
   */
  static fromDefaults(init?: {
    includeMetadata?: boolean;
    includePrevNextRel?: boolean;
  }): MarkdownNodeParser {
    return new MarkdownNodeParser(init);
  }

  /**
   * Wraps one section of text in a `TextNode` linked to its source node.
   *
   * @param textSplit Text of the section.
   * @param node Node the section was extracted from; recorded under `PARENT`.
   * @param metadata Header metadata for the section; replaced by `{}` when
   *   `includeMetadata` is false.
   * @returns The newly created node.
   */
  buildNodeFromSplit(
    textSplit: string,
    node: BaseNode<Metadata>,
    metadata: Metadata,
  ): BaseNode<Metadata> {
    return new TextNode({
      text: textSplit,
      relationships: {
        PARENT: [
          {
            // NOTE(review): the spread copies every enumerable own property of
            // the source node into the relationship entry, not just its id.
            ...node,
            nodeId: node.id_,
          },
        ],
      },
      metadata: this.includeMetadata ? metadata : {},
    });
  }

  /**
   * Computes the header metadata that applies after a new header is seen.
   *
   * Headers at shallower levels than the new one are kept, the new header is
   * recorded at its own level, and any deeper levels are dropped because they
   * belonged to the section that just ended.
   *
   * @param headersMetadata Header metadata in effect before the new header.
   * @param newHeader Title text of the new header.
   * @param newHeaderLevel Number of `#` characters of the new header.
   * @returns A fresh metadata object; `headersMetadata` is not mutated.
   */
  updateMetadata(
    headersMetadata: Metadata,
    newHeader: string,
    newHeaderLevel: number,
  ): Metadata {
    const updatedHeaders: Metadata = {};
    for (let level = 1; level < newHeaderLevel; level++) {
      const key = `Header ${level}`;
      if (key in headersMetadata) {
        updatedHeaders[key] = headersMetadata[key];
      }
    }
    updatedHeaders[`Header ${newHeaderLevel}`] = newHeader;
    return updatedHeaders;
  }

  /**
   * Splits a single node's text into header-delimited sections.
   *
   * Lines inside fenced code blocks are never treated as headers. Sections
   * that contain only whitespace are skipped, so an empty document yields no
   * nodes and text that starts with blank lines does not produce an empty
   * leading node.
   *
   * @param node Node whose content (read without metadata) is markdown.
   * @returns One node per non-empty section, in document order.
   */
  getNodesFromNode(node: BaseNode<Metadata>): BaseNode<Metadata>[] {
    const text = node.getContent(MetadataMode.NONE);
    const markdownNodes: BaseNode<Metadata>[] = [];
    let metadata: Metadata = {};
    let insideCodeBlock = false;
    let currentSection = "";

    // Emits the accumulated section unless it is blank after trimming.
    const flushSection = (): void => {
      const sectionText = currentSection.trim();
      if (sectionText !== "") {
        markdownNodes.push(
          this.buildNodeFromSplit(sectionText, node, metadata),
        );
      }
    };

    for (const line of text.split("\n")) {
      // Fences may be indented (e.g. inside list items), so ignore leading
      // whitespace when looking for the marker.
      if (line.trimStart().startsWith(MarkdownNodeParser.CODE_FENCE)) {
        insideCodeBlock = !insideCodeBlock;
      }

      const headerMatch = insideCodeBlock
        ? null
        : MarkdownNodeParser.HEADER_PATTERN.exec(line);

      if (headerMatch) {
        // A header closes the section collected so far and opens a new one.
        flushSection();
        const [, hashes = "", title = ""] = headerMatch;
        metadata = this.updateMetadata(metadata, title, hashes.length);
        // The section text starts with the header title, without the `#`s.
        currentSection = `${title}\n`;
      } else {
        currentSection += `${line}\n`;
      }
    }

    flushSection();
    return markdownNodes;
  }

  /**
   * Applies {@link getNodesFromNode} to every document.
   *
   * @param documents Nodes to split.
   * @returns All section nodes, concatenated in document order.
   */
  getNodesFromDocuments(documents: BaseNode<Metadata>[]): BaseNode<Metadata>[] {
    return documents.flatMap((document) => this.getNodesFromNode(document));
  }
}