Skip to content

Commit

Permalink
feat: http(s) support for file readers
Browse files Browse the repository at this point in the history
  • Loading branch information
himself65 committed Apr 22, 2024
1 parent efb1c56 commit 619d834
Show file tree
Hide file tree
Showing 12 changed files with 79 additions and 29 deletions.
6 changes: 3 additions & 3 deletions packages/core/src/readers/CSVReader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,15 @@ export class PapaCSVReader implements FileReader {

/**
* Loads data from csv files
* @param {string} file - The path to the file to load.
* @param {string | URL} filePath - The path or URL of the CSV file to load.
* @param {GenericFileSystem} [fs=defaultFS] - The file system to use for reading the file.
* @returns {Promise<Document[]>}
*/
async loadData(
file: string,
filePath: string | URL,
fs: GenericFileSystem = defaultFS,
): Promise<Document[]> {
const fileContent = await fs.readFile(file);
const fileContent = await fs.readFile(filePath);
const result = Papa.parse(fileContent, this.papaConfig);
const textList = result.data.map((row: any) => {
// Compatible with header row mode
Expand Down
4 changes: 2 additions & 2 deletions packages/core/src/readers/DocxReader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ import type { FileReader } from "./type.js";
export class DocxReader implements FileReader {
/** DocxParser */
async loadData(
file: string,
file: string | URL,
fs: GenericFileSystem = defaultFS,
): Promise<Document[]> {
const dataBuffer = await fs.readRawFile(file);
const { value } = await mammoth.extractRawText({ buffer: dataBuffer });
return [new Document({ text: value, id_: file })];
return [new Document({ text: value, id_: `${file}` })];
}
}
4 changes: 2 additions & 2 deletions packages/core/src/readers/HTMLReader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@ export class HTMLReader implements FileReader {
* @returns Promise<Document[]> A Promise object, eventually yielding zero or one Document parsed from the HTML content of the specified file.
*/
async loadData(
file: string,
file: string | URL,
fs: GenericFileSystem = defaultFS,
): Promise<Document[]> {
const dataBuffer = await fs.readFile(file);
const htmlOptions = this.getOptions();
const content = await this.parseContent(dataBuffer, htmlOptions);
return [new Document({ text: content, id_: file })];
return [new Document({ text: content, id_: `${file}` })];
}

/**
Expand Down
4 changes: 2 additions & 2 deletions packages/core/src/readers/ImageReader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ export class ImageReader implements FileReader {
* @returns Promise<Document[]> A Promise object, eventually yielding zero or one ImageDocument of the specified file.
*/
async loadData(
file: string,
file: string | URL,
fs: GenericFileSystem = defaultFS,
): Promise<Document[]> {
const dataBuffer = await fs.readRawFile(file);
const blob = new Blob([dataBuffer]);
return [new ImageDocument({ image: blob, id_: file })];
return [new ImageDocument({ image: blob, id_: `${file}` })];
}
}
4 changes: 2 additions & 2 deletions packages/core/src/readers/LlamaParseReader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,10 @@ export class LlamaParseReader implements FileReader {
}

async loadData(
file: string,
file: string | URL,
fs: GenericFileSystem = defaultFS,
): Promise<Document[]> {
if (!file.endsWith(".pdf")) {
if (file instanceof URL || !file.endsWith(".pdf")) {
throw new Error("Currently, only PDF files are supported.");
}

Expand Down
2 changes: 1 addition & 1 deletion packages/core/src/readers/MarkdownReader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ export class MarkdownReader implements FileReader {
}

async loadData(
file: string,
file: string | URL,
fs: GenericFileSystem = defaultFS,
): Promise<Document[]> {
const content = await fs.readFile(file);
Expand Down
2 changes: 1 addition & 1 deletion packages/core/src/readers/PDFReader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import type { BaseReader } from "./type.js";
*/
export class PDFReader implements BaseReader {
async loadData(
file: string,
file: string | URL,
fs: GenericFileSystem = defaultFS,
): Promise<Document[]> {
const content = await fs.readRawFile(file);
Expand Down
4 changes: 2 additions & 2 deletions packages/core/src/readers/TextFileReader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ import type { BaseReader } from "./type.js";

export class TextFileReader implements BaseReader {
async loadData(
file: string,
file: string | URL,
fs: CompleteFileSystem = defaultFS,
): Promise<Document[]> {
const dataBuffer = await fs.readFile(file);
return [new Document({ text: dataBuffer, id_: file })];
return [new Document({ text: dataBuffer, id_: `${file}` })];
}
}
5 changes: 4 additions & 1 deletion packages/core/src/readers/type.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,8 @@ export interface BaseReader {
* A reader takes file paths and imports data into Document objects.
*/
export interface FileReader extends BaseReader {
loadData(filePath: string, fs?: CompleteFileSystem): Promise<Document[]>;
/**
 * Loads the file at `filePath` and imports its data into Document objects.
 * @param filePath - Location of the file, given either as a path string or as a URL object.
 * @param fs - Optional file system implementation used to read the file.
 * @returns A promise that resolves to the documents produced from the file.
 */
loadData(
filePath: string | URL,
fs?: CompleteFileSystem,
): Promise<Document[]>;
}
32 changes: 32 additions & 0 deletions packages/core/tests/readers/csv-reader.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import { PapaCSVReader } from "llamaindex/readers/CSVReader";
import type { AddressInfo } from "net";
import { createServer } from "node:http";
import { expect, test } from "vitest";

// CSV fixture served by the test HTTP server: one header row followed by two
// review rows whose quoted review text itself contains commas.
const csv = `title,reviewid,creationdate,criticname,originalscore,reviewstate,reviewtext
Beavers,1145982,2003-05-23,Ivan M. Lincoln,3.5/4,fresh,"Timed to be just long enough for most youngsters' brief attention spans -- and it's packed with plenty of interesting activity, both on land and under the water."
Blood Mask,1636744,2007-06-02,The Foywonder,1/5,rotten,"It doesn't matter if a movie costs 300 million or only 300 dollars; good is good and bad is bad, and Bloodmask: The Possession of Nicole Lameroux is just plain bad."`;

/**
 * Verifies that PapaCSVReader can load a CSV document through an http: URL.
 *
 * A throwaway local HTTP server serves the `csv` fixture on an OS-assigned
 * port; the reader is then pointed at it with a URL object and the resulting
 * document text and hash are compared against inline snapshots.
 */
test("csv reader with http request", async (context) => {
  const reader = new PapaCSVReader();

  // Record the requested path instead of asserting inside the handler: an
  // assertion that throws in the request callback is raised outside the test's
  // promise chain and leaves the request unanswered.
  let requestedUrl: string | undefined;
  const server = createServer((req, res) => {
    requestedUrl = req.url;
    res.setHeader("Content-Type", "text/csv");
    res.end(csv);
  });

  // listen() completes asynchronously; address() is only guaranteed to be
  // populated once the "listening" event has fired, so wait for it.
  await new Promise<void>((resolve, reject) => {
    server.once("error", reject);
    server.listen(0, () => resolve());
  });
  context.onTestFinished(() => {
    server.close();
  });

  const { port } = server.address() as AddressInfo;
  const url = new URL("http://localhost:" + port + "/test.csv");
  const documents = await reader.loadData(url);

  expect(requestedUrl).toBe("/test.csv");
  expect(documents.length).toBe(1);
  expect(documents[0].text).toMatchInlineSnapshot(`
"title, reviewid, creationdate, criticname, originalscore, reviewstate, reviewtext
Beavers, 1145982, 2003-05-23, Ivan M. Lincoln, 3.5/4, fresh, Timed to be just long enough for most youngsters' brief attention spans -- and it's packed with plenty of interesting activity, both on land and under the water.
Blood Mask, 1636744, 2007-06-02, The Foywonder, 1/5, rotten, It doesn't matter if a movie costs 300 million or only 300 dollars; good is good and bad is bad, and Bloodmask: The Possession of Nicole Lameroux is just plain bad."
`);
  expect(documents[0].hash).toMatchInlineSnapshot(
    `"HlE1SNFxtti8/Rd16ADNKv1uzLd2MIRfHeNCTuAkwDY="`,
  );
});
18 changes: 15 additions & 3 deletions packages/env/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,25 @@ export function createSHA256(): SHA256 {
}

export const defaultFS: CompleteFileSystem = {
writeFile: function (path: string, content: string) {
// Writes `content` to `path` as UTF-8 text by delegating to fs.writeFile.
async writeFile(path: string | URL, content: string) {
  await fs.writeFile(path, content, "utf-8");
},
readRawFile(path: string): Promise<Buffer> {
/**
 * Reads a file and returns its content as a raw buffer.
 *
 * URL objects with the http: or https: protocol are downloaded with fetch;
 * every other path or URL is handed to fs.readFile.
 *
 * @param path - Path string or URL object identifying the file.
 * @returns The file content as a Buffer.
 * @throws Error when an http(s) request completes with a non-2xx status, so
 *   that an error page is never returned as if it were the file content.
 */
readRawFile: async function (path: string | URL): Promise<Buffer> {
  if (
    path instanceof URL &&
    (path.protocol === "http:" || path.protocol === "https:")
  ) {
    const response = await fetch(path);
    if (!response.ok) {
      throw new Error(
        `Failed to fetch ${path.href}: ${response.status} ${response.statusText}`,
      );
    }
    return Buffer.from(await response.arrayBuffer());
  }
  return fs.readFile(path);
},
readFile: function (path: string) {
/**
 * Reads a file and returns its content as a string.
 *
 * URL objects with the http: or https: protocol are downloaded with fetch and
 * the response body is returned as text; every other path or URL is read
 * through fs.readFile with the "utf-8" encoding.
 *
 * @param path - Path string or URL object identifying the file.
 * @returns The file content as a string.
 * @throws Error when an http(s) request completes with a non-2xx status, so
 *   that an error page is never returned as if it were the file content.
 */
readFile: async function (path: string | URL): Promise<string> {
  if (
    path instanceof URL &&
    (path.protocol === "http:" || path.protocol === "https:")
  ) {
    const response = await fetch(path);
    if (!response.ok) {
      throw new Error(
        `Failed to fetch ${path.href}: ${response.status} ${response.statusText}`,
      );
    }
    return response.text();
  }
  return fs.readFile(path, "utf-8");
},
access: fs.access,
Expand Down
23 changes: 13 additions & 10 deletions packages/env/src/type.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,35 +7,35 @@ import _ from "lodash";
* browsers.
*/
export type GenericFileSystem = {
writeFile(path: string, content: string): Promise<void>;
writeFile(path: string | URL, content: string): Promise<void>;
/**
* Reads a file and returns its content as a raw buffer.
*/
readRawFile(path: string): Promise<Buffer>;
readRawFile(path: string | URL): Promise<Buffer>;
/**
* Reads a file and returns its content as an utf-8 string.
*/
readFile(path: string): Promise<string>;
access(path: string): Promise<void>;
readFile(path: string | URL): Promise<string>;
access(path: string | URL): Promise<void>;
mkdir(
path: string,
path: string | URL,
options: {
recursive: boolean;
},
): Promise<string | undefined>;
mkdir(path: string): Promise<void>;
mkdir(path: string | URL): Promise<void>;
};
export type WalkableFileSystem = {
readdir(path: string): Promise<string[]>;
stat(path: string): Promise<any>;
readdir(path: string | URL): Promise<string[]>;
stat(path: string | URL): Promise<any>;
};
export type CompleteFileSystem = GenericFileSystem & WalkableFileSystem;

/**
* A filesystem implementation that stores files in memory.
*/
export class InMemoryFileSystem implements CompleteFileSystem {
private files: Record<string, any> = {};
private files: Record<string, string> = {};

async writeFile(
path: string,
Expand All @@ -59,7 +59,10 @@ export class InMemoryFileSystem implements CompleteFileSystem {
}

async mkdir(path: string): Promise<undefined> {
this.files[path] = _.get(this.files, path, null);
const content = _.get(this.files, path, null);
if (content) {
this.files[path] = content;
}
}

async readdir(path: string): Promise<string[]> {
Expand Down

0 comments on commit 619d834

Please sign in to comment.