Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add BaseFilesystemReader interface + implement in SimpleDirectoryReader #13424

Merged
merged 6 commits into from
May 17, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion llama-index-core/llama_index/core/readers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,16 @@
from llama_index.core.readers.download import download_loader

# readers
from llama_index.core.readers.file.base import SimpleDirectoryReader
from llama_index.core.readers.file.base import (
SimpleDirectoryReader,
BaseFilesystemReader,
)
from llama_index.core.readers.string_iterable import StringIterableReader
from llama_index.core.schema import Document

__all__ = [
"SimpleDirectoryReader",
"BaseFilesystemReader",
"ReaderConfig",
"Document",
"StringIterableReader",
Expand Down
130 changes: 120 additions & 10 deletions llama-index-core/llama_index/core/readers/file/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Simple reader that reads files of different formats from a directory."""

from abc import ABC, abstractmethod
import os
import logging
import mimetypes
Expand All @@ -20,6 +21,32 @@
from tqdm import tqdm


class BaseFilesystemReader(BaseReader, ABC):
@abstractmethod
def list_files(self, **kwargs) -> List[Path]:
    """
    List files in the given filesystem.

    Abstract: concrete readers must provide the implementation.

    Args:
        **kwargs: implementation-specific options.

    Returns:
        List[Path]: paths of the listed files.
    """

async def alist_files(self, **kwargs) -> List[Path]:
    """
    List files in the given filesystem asynchronously.

    This default implementation calls the synchronous ``list_files``
    directly with the same keyword arguments and returns its result.

    Args:
        **kwargs: forwarded unchanged to ``list_files``.

    Returns:
        List[Path]: whatever ``list_files`` returns.
    """
    listed_files = self.list_files(**kwargs)
    return listed_files

@abstractmethod
def get_file_info(self, input_file: Path, **kwargs) -> Dict:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am wondering whether this should return a FileInfo/FileMetadata pydantic class that contains just a minimum set of required metadata fields, to standardize this a bit 🤔

"""Get FS-specific file info that uniquely identifies the file. This call shouldn't imply reading the file."""

async def aget_file_info(self, input_file: Path, **kwargs) -> Dict:
    """
    Get file info that uniquely identifies the file asynchronously.

    This call shouldn't imply reading the file. The default implementation
    calls the synchronous ``get_file_info`` directly and returns its result.

    Args:
        input_file (Path): file to describe.
        **kwargs: forwarded unchanged to ``get_file_info``.

    Returns:
        Dict: whatever ``get_file_info`` returns.
    """
    file_info = self.get_file_info(input_file, **kwargs)
    return file_info

@abstractmethod
def read_file(self, input_file: Path, **kwargs) -> List[Document]:
    """
    Read file from filesystem and return documents.

    Abstract: concrete readers must provide the implementation.

    Args:
        input_file (Path): file to read.
        **kwargs: implementation-specific options.

    Returns:
        List[Document]: documents produced from the file.
    """
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this return the file content instead, to separate file parsing from file loading?


async def aread_file(self, input_file: Path, **kwargs) -> List[Document]:
    """
    Read file from filesystem and return documents asynchronously.

    Declared ``async`` so that it can be awaited like the other ``a*``
    methods of this interface. The default implementation calls the
    synchronous ``read_file`` directly and returns its result.

    Args:
        input_file (Path): file to read.
        **kwargs: forwarded unchanged to ``read_file``.

    Returns:
        List[Document]: whatever ``read_file`` returns.
    """
    return self.read_file(input_file, **kwargs)


def _try_loading_included_file_formats() -> Dict[str, Type[BaseReader]]:
try:
from llama_index.readers.file import (
Expand Down Expand Up @@ -59,16 +86,22 @@ def _try_loading_included_file_formats() -> Dict[str, Type[BaseReader]]:
return default_file_reader_cls


def _format_file_timestamp(timestamp: float) -> Optional[str]:
"""Format file timestamp to a %Y-%m-%d string.
def _format_file_timestamp(
timestamp: float, include_time: bool = False
) -> Optional[str]:
"""
Format file timestamp to a %Y-%m-%d string.

Args:
timestamp (float): timestamp in float
include_time (bool): whether to include time in the formatted string

Returns:
str: formatted timestamp
"""
try:
if include_time:
return datetime.utcfromtimestamp(timestamp).strftime("%Y-%m-%dT%H:%M:%SZ")
return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d")
except Exception:
return None
Expand All @@ -77,7 +110,8 @@ def _format_file_timestamp(timestamp: float) -> Optional[str]:
def default_file_metadata_func(
file_path: str, fs: Optional[fsspec.AbstractFileSystem] = None
) -> Dict:
"""Get some handy metadata from filesystem.
"""
Get some handy metadata from filesystem.

Args:
file_path: str: file path in str
Expand Down Expand Up @@ -135,8 +169,9 @@ def is_default_fs(fs: fsspec.AbstractFileSystem) -> bool:
logger = logging.getLogger(__name__)


class SimpleDirectoryReader(BaseReader):
"""Simple directory reader.
class SimpleDirectoryReader(BaseFilesystemReader):
"""
Simple directory reader.

Load files from file directory.
Automatically select the best file reader given file extensions.
Expand Down Expand Up @@ -315,7 +350,8 @@ def _add_files(self, input_dir: Path) -> List[Path]:
return new_input_files

def _exclude_metadata(self, documents: List[Document]) -> List[Document]:
"""Exclude metadata from documents.
"""
Exclude metadata from documents.

Args:
documents (List[Document]): List of documents.
Expand Down Expand Up @@ -348,6 +384,76 @@ def _exclude_metadata(self, documents: List[Document]) -> List[Document]:

return documents

def list_files(self, **kwargs) -> List[Path]:
    """
    List files in the given filesystem.

    Returns ``self.input_files`` as-is; ``kwargs`` are accepted for
    interface compatibility and are not used.
    """
    known_files = self.input_files
    return known_files

def get_file_info(self, input_file: Path, **kwargs) -> Dict:
    """
    Get filesystem info for a single file.

    Only ``self.fs.info`` is consulted; the file content is not opened here.

    Args:
        input_file (Path): file to describe.
        **kwargs: accepted for interface compatibility; not used.

    Returns:
        Dict: the keys ``file_path``, ``file_size``, ``creation_date`` and
        ``last_modified_date``, with any entry whose value is None left out.
        Dates are formatted with ``_format_file_timestamp(..., include_time=True)``.
    """
    fs_info = self.fs.info(input_file)

    candidates = {
        "file_path": input_file,
        "file_size": fs_info.get("size"),
        "creation_date": _format_file_timestamp(
            fs_info.get("created"), include_time=True
        ),
        "last_modified_date": _format_file_timestamp(
            fs_info.get("mtime"), include_time=True
        ),
    }

    # Drop entries the filesystem could not provide.
    return {key: value for key, value in candidates.items() if value is not None}

def read_file(self, input_file: Path, **kwargs) -> List[Document]:
    """
    Read a single file and return its documents.

    Each loading option may be overridden per call through ``kwargs``;
    otherwise the value configured on this reader is used.

    Args:
        input_file (Path): file to read.
        **kwargs: optional per-call overrides for ``file_metadata``,
            ``file_extractor``, ``filename_as_id``, ``encoding``, ``errors``,
            ``raise_on_error`` and ``fs``. Any remaining keyword arguments
            are forwarded unchanged to ``SimpleDirectoryReader.load_file``.

    Returns:
        List[Document]: documents returned by ``SimpleDirectoryReader.load_file``.
    """
    # pop() rather than get(): an override must be removed from ``kwargs``
    # before the remainder is forwarded, otherwise the same keyword would be
    # passed twice and the call would fail with
    # "TypeError: got multiple values for keyword argument".
    load_options = {
        "file_metadata": kwargs.pop("file_metadata", self.file_metadata),
        "file_extractor": kwargs.pop("file_extractor", self.file_extractor),
        "filename_as_id": kwargs.pop("filename_as_id", self.filename_as_id),
        "encoding": kwargs.pop("encoding", self.encoding),
        "errors": kwargs.pop("errors", self.errors),
        "raise_on_error": kwargs.pop("raise_on_error", self.raise_on_error),
        "fs": kwargs.pop("fs", self.fs),
    }

    return SimpleDirectoryReader.load_file(
        input_file=input_file,
        **load_options,
        **kwargs,
    )

async def aread_file(self, input_file: Path, **kwargs) -> List[Document]:
    """
    Read a single file asynchronously and return its documents.

    Each loading option may be overridden per call through ``kwargs``;
    otherwise the value configured on this reader is used.

    Args:
        input_file (Path): file to read.
        **kwargs: optional per-call overrides for ``file_metadata``,
            ``file_extractor``, ``filename_as_id``, ``encoding``, ``errors``,
            ``raise_on_error`` and ``fs``. Any remaining keyword arguments
            are forwarded unchanged to ``SimpleDirectoryReader.aload_file``.

    Returns:
        List[Document]: documents returned by ``SimpleDirectoryReader.aload_file``.
    """
    # pop() rather than get(): an override must be removed from ``kwargs``
    # before the remainder is forwarded, otherwise the same keyword would be
    # passed twice and the call would fail with
    # "TypeError: got multiple values for keyword argument".
    load_options = {
        "file_metadata": kwargs.pop("file_metadata", self.file_metadata),
        "file_extractor": kwargs.pop("file_extractor", self.file_extractor),
        "filename_as_id": kwargs.pop("filename_as_id", self.filename_as_id),
        "encoding": kwargs.pop("encoding", self.encoding),
        "errors": kwargs.pop("errors", self.errors),
        "raise_on_error": kwargs.pop("raise_on_error", self.raise_on_error),
        "fs": kwargs.pop("fs", self.fs),
    }

    return await SimpleDirectoryReader.aload_file(
        input_file=input_file,
        **load_options,
        **kwargs,
    )

@staticmethod
def load_file(
input_file: Path,
Expand All @@ -359,7 +465,8 @@ def load_file(
raise_on_error: bool = False,
fs: Optional[fsspec.AbstractFileSystem] = None,
) -> List[Document]:
"""Static method for loading file.
"""
Static method for loading file.

NOTE: necessarily as a static method for parallel processing.

Expand Down Expand Up @@ -517,7 +624,8 @@ def load_data(
num_workers: Optional[int] = None,
fs: Optional[fsspec.AbstractFileSystem] = None,
) -> List[Document]:
"""Load data from the input directory.
"""
Load data from the input directory.

Args:
show_progress (bool): Whether to show tqdm progress bars. Defaults to False.
Expand Down Expand Up @@ -582,7 +690,8 @@ async def aload_data(
num_workers: Optional[int] = None,
fs: Optional[fsspec.AbstractFileSystem] = None,
) -> List[Document]:
"""Load data from the input directory.
"""
Load data from the input directory.

Args:
show_progress (bool): Whether to show tqdm progress bars. Defaults to False.
Expand Down Expand Up @@ -613,7 +722,8 @@ async def aload_data(
def iter_data(
self, show_progress: bool = False
) -> Generator[List[Document], Any, Any]:
"""Load data iteratively from the input directory.
"""
Load data iteratively from the input directory.

Args:
show_progress (bool): Whether to show tqdm progress bars. Defaults to False.
Expand Down
2 changes: 1 addition & 1 deletion llama-index-core/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ name = "llama-index-core"
packages = [{include = "llama_index"}]
readme = "README.md"
repository = "https://github.com/run-llama/llama_index"
version = "0.10.36"
version = "0.10.36.post1"

[tool.poetry.dependencies]
SQLAlchemy = {extras = ["asyncio"], version = ">=1.4.49"}
Expand Down
Loading