-
Notifications
You must be signed in to change notification settings - Fork 375
/
PDFReader.ts
35 lines (32 loc) · 929 Bytes
/
PDFReader.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import { fs } from "@llamaindex/env";
import { Document } from "../Node.js";
import { FileReader } from "./type.js";
/**
 * Reads a PDF and produces one `Document` per extracted page.
 *
 * Every document carries `page_number` (1-based) and `total_pages` metadata.
 */
export class PDFReader extends FileReader {
  /**
   * Read the PDF stored at `file` and convert it into documents.
   *
   * @param file - Path of the PDF file to read.
   * @returns One document per entry of the extracted text, in extraction order.
   */
  async loadData(file: string): Promise<Document[]> {
    const content = await fs.readFile(file);
    // A Buffer may be a view onto a larger (pooled) ArrayBuffer. Wrapping
    // `content.buffer` without offset/length would expose every byte of that
    // backing store, so restrict the view to this buffer's own window.
    const bytes = new Uint8Array(
      content.buffer,
      content.byteOffset,
      content.byteLength,
    );
    return this.loadDataAsContent(bytes);
  }

  /**
   * Convert the raw bytes of a PDF into documents.
   *
   * @param content - Binary content of the PDF.
   * @returns One document per entry of the extracted text, with
   *   `page_number` set to the entry's index plus one and `total_pages` set
   *   to the page count reported by the extractor.
   */
  async loadDataAsContent(content: Uint8Array): Promise<Document[]> {
    const { totalPages, text } = await readPDF(content);
    return text.map((pageText, pageIndex) => {
      const metadata = {
        // Array indices are 0-based; page numbers are exposed 1-based.
        page_number: pageIndex + 1,
        total_pages: totalPages,
      };
      return new Document({ text: pageText, metadata });
    });
  }
}
/**
 * Extract the text of a PDF using the `unpdf` package.
 *
 * The package is loaded with a dynamic `import()` when the function is
 * called, not when this module is loaded.
 *
 * @param data - Binary content of the PDF.
 * @returns The page count and the extracted text. The result shape is
 *   asserted with a type cast and is not validated at runtime.
 */
async function readPDF(data: Uint8Array): Promise<{
  totalPages: number;
  text: string[];
}> {
  const unpdf = await import("unpdf");
  const extracted = await unpdf.extractText(data);
  return extracted as { totalPages: number; text: string[] };
}