run-llama · Javtor · May 17, 2024 · May 11, 2024 · May 16, 2024 · May 16, 2024
diff --git a/llama-index-core/llama_index/core/readers/__init__.py b/llama-index-core/llama_index/core/readers/__init__.py
@@ -14,12 +14,16 @@
 from llama_index.core.readers.download import download_loader
 
 # readers
-from llama_index.core.readers.file.base import SimpleDirectoryReader
+from llama_index.core.readers.file.base import (
+    SimpleDirectoryReader,
+    BaseFilesystemReader,
+)
 from llama_index.core.readers.string_iterable import StringIterableReader
 from llama_index.core.schema import Document
 
 __all__ = [
     "SimpleDirectoryReader",
+    "BaseFilesystemReader",
     "ReaderConfig",
     "Document",
     "StringIterableReader",

diff --git a/llama-index-core/llama_index/core/readers/file/base.py b/llama-index-core/llama_index/core/readers/file/base.py
@@ -1,5 +1,6 @@
 """Simple reader that reads files of different formats from a directory."""
 
+from abc import ABC, abstractmethod
 import os
 import logging
 import mimetypes
@@ -20,6 +21,32 @@
 from tqdm import tqdm
 
 
+class BaseFilesystemReader(BaseReader, ABC):
+    @abstractmethod
+    def list_files(self, **kwargs) -> List[Path]:
+        """List files in the given filesystem."""
+
+    async def alist_files(self, **kwargs) -> List[Path]:
+        """List files in the given filesystem asynchronously. Defaults to the sync list_files."""
+        return self.list_files(**kwargs)
+
+    @abstractmethod
+    def get_file_info(self, input_file: Path, **kwargs) -> Dict:
+        """Get FS-specific file info that uniquely identifies the file. This call shouldn't imply reading the file."""
+
+    async def aget_file_info(self, input_file: Path, **kwargs) -> Dict:
+        """Get file info asynchronously, without reading the file. Defaults to the sync get_file_info."""
+        return self.get_file_info(input_file, **kwargs)
+
+    @abstractmethod
+    def read_file(self, input_file: Path, **kwargs) -> List[Document]:
+        """Read file from filesystem and return documents."""
+
+    async def aread_file(self, input_file: Path, **kwargs) -> List[Document]:
+        """Read file asynchronously. A coroutine like its a* siblings; defaults to the sync read_file."""
+        return self.read_file(input_file, **kwargs)
+
+
 def _try_loading_included_file_formats() -> Dict[str, Type[BaseReader]]:
     try:
         from llama_index.readers.file import (
@@ -59,16 +86,22 @@ def _try_loading_included_file_formats() -> Dict[str, Type[BaseReader]]:
     return default_file_reader_cls
 
 
-def _format_file_timestamp(timestamp: float) -> Optional[str]:
-    """Format file timestamp to a %Y-%m-%d string.
+def _format_file_timestamp(
+    timestamp: float, include_time: bool = False
+) -> Optional[str]:
+    """
+    Format file timestamp as a local %Y-%m-%d date, or UTC %Y-%m-%dT%H:%M:%SZ with time.
 
     Args:
         timestamp (float): timestamp in float
+        include_time (bool): if True, format date and time in UTC instead of the local date
 
     Returns:
         str: formatted timestamp
     """
     try:
+        if include_time:
+            return datetime.utcfromtimestamp(timestamp).strftime("%Y-%m-%dT%H:%M:%SZ")
         return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d")
     except Exception:
         return None
@@ -77,7 +110,8 @@ def _format_file_timestamp(timestamp: float) -> Optional[str]:
 def default_file_metadata_func(
     file_path: str, fs: Optional[fsspec.AbstractFileSystem] = None
 ) -> Dict:
-    """Get some handy metadata from filesystem.
+    """
+    Get some handy metadata from filesystem.
 
     Args:
         file_path: str: file path in str
@@ -135,8 +169,9 @@ def is_default_fs(fs: fsspec.AbstractFileSystem) -> bool:
 logger = logging.getLogger(__name__)
 
 
-class SimpleDirectoryReader(BaseReader):
-    """Simple directory reader.
+class SimpleDirectoryReader(BaseFilesystemReader):
+    """
+    Simple directory reader.
 
     Load files from file directory.
     Automatically select the best file reader given file extensions.
@@ -315,7 +350,8 @@ def _add_files(self, input_dir: Path) -> List[Path]:
         return new_input_files
 
     def _exclude_metadata(self, documents: List[Document]) -> List[Document]:
-        """Exclude metadata from documents.
+        """
+        Exclude metadata from documents.
 
         Args:
             documents (List[Document]): List of documents.
@@ -348,6 +384,76 @@ def _exclude_metadata(self, documents: List[Document]) -> List[Document]:
 
         return documents
 
+    def list_files(self, **kwargs) -> List[Path]:
+        """Return the input files held by this reader (self.input_files); kwargs are unused."""
+        return self.input_files
+
+    def get_file_info(self, input_file: Path, **kwargs) -> Dict:
+        """
+        Describe a file from its filesystem entry.
+
+        Calls self.fs.info and returns the path, the size and the creation and
+        modification times formatted with include_time=True. Entries whose
+        value is None are left out.
+        """
+        fs_entry = self.fs.info(input_file)
+        candidates = {
+            "file_path": input_file,
+            "file_size": fs_entry.get("size"),
+            "creation_date": _format_file_timestamp(
+                fs_entry.get("created"), include_time=True
+            ),
+            "last_modified_date": _format_file_timestamp(
+                fs_entry.get("mtime"), include_time=True
+            ),
+        }
+
+        # Ignore None values
+        present = {k: v for k, v in candidates.items() if v is not None}
+        return present
+
+    def read_file(self, input_file: Path, **kwargs) -> List[Document]:
+        """Read one file into documents; kwargs override the reader's settings."""
+        # Pop overrides: repeating a keyword through **kwargs would raise TypeError.
+        opts = {
+            name: kwargs.pop(name, getattr(self, name))
+            for name in (
+                "file_metadata",
+                "file_extractor",
+                "filename_as_id",
+                "encoding",
+                "errors",
+                "raise_on_error",
+                "fs",
+            )
+        }
+        return SimpleDirectoryReader.load_file(
+            input_file=input_file,
+            **opts,
+            **kwargs,
+        )
+
+    async def aread_file(self, input_file: Path, **kwargs) -> List[Document]:
+        """Read one file asynchronously; kwargs override the reader's settings."""
+        # Pop overrides: repeating a keyword through **kwargs would raise TypeError.
+        opts = {
+            name: kwargs.pop(name, getattr(self, name))
+            for name in (
+                "file_metadata",
+                "file_extractor",
+                "filename_as_id",
+                "encoding",
+                "errors",
+                "raise_on_error",
+                "fs",
+            )
+        }
+        return await SimpleDirectoryReader.aload_file(
+            input_file=input_file,
+            **opts,
+            **kwargs,
+        )
+
     @staticmethod
     def load_file(
         input_file: Path,
@@ -359,7 +465,8 @@ def load_file(
         raise_on_error: bool = False,
         fs: Optional[fsspec.AbstractFileSystem] = None,
     ) -> List[Document]:
-        """Static method for loading file.
+        """
+        Static method for loading file.
 
         NOTE: necessarily as a static method for parallel processing.
 
@@ -517,7 +624,8 @@ def load_data(
         num_workers: Optional[int] = None,
         fs: Optional[fsspec.AbstractFileSystem] = None,
     ) -> List[Document]:
-        """Load data from the input directory.
+        """
+        Load data from the input directory.
 
         Args:
             show_progress (bool): Whether to show tqdm progress bars. Defaults to False.
@@ -582,7 +690,8 @@ async def aload_data(
         num_workers: Optional[int] = None,
         fs: Optional[fsspec.AbstractFileSystem] = None,
     ) -> List[Document]:
-        """Load data from the input directory.
+        """
+        Load data from the input directory.
 
         Args:
             show_progress (bool): Whether to show tqdm progress bars. Defaults to False.
@@ -613,7 +722,8 @@ async def aload_data(
     def iter_data(
         self, show_progress: bool = False
     ) -> Generator[List[Document], Any, Any]:
-        """Load data iteratively from the input directory.
+        """
+        Load data iteratively from the input directory.
 
         Args:
             show_progress (bool): Whether to show tqdm progress bars. Defaults to False.

diff --git a/llama-index-core/pyproject.toml b/llama-index-core/pyproject.toml
@@ -43,7 +43,7 @@ name = "llama-index-core"
 packages = [{include = "llama_index"}]
 readme = "README.md"
 repository = "https://github.com/run-llama/llama_index"
-version = "0.10.36"
+version = "0.10.36.post1"
 
 [tool.poetry.dependencies]
 SQLAlchemy = {extras = ["asyncio"], version = ">=1.4.49"}