Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add BaseFilesystemReader interface + implement in SimpleDirectoryReader #13424

Merged
merged 6 commits into from
May 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions llama-index-core/llama_index/core/readers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Data Connectors for LlamaIndex.
"""
Data Connectors for LlamaIndex.

This module contains the data connectors for LlamaIndex. Each connector inherits
from a `BaseReader` class, connects to a data source, and loads Document objects
Expand All @@ -14,12 +15,16 @@
from llama_index.core.readers.download import download_loader

# readers
from llama_index.core.readers.file.base import SimpleDirectoryReader
from llama_index.core.readers.file.base import (
SimpleDirectoryReader,
FileSystemReaderMixin,
)
from llama_index.core.readers.string_iterable import StringIterableReader
from llama_index.core.schema import Document

__all__ = [
"SimpleDirectoryReader",
"FileSystemReaderMixin",
"ReaderConfig",
"Document",
"StringIterableReader",
Expand Down
137 changes: 136 additions & 1 deletion llama-index-core/llama_index/core/readers/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Base reader class."""

from abc import ABC
from abc import ABC, abstractmethod
from typing import (
TYPE_CHECKING,
Any,
Expand Down Expand Up @@ -71,6 +71,141 @@ class Config:
arbitrary_types_allowed = True


class ResourcesReaderMixin(ABC):
    """
    Mixin for readers that provide access to different types of resources.

    Resources refer to specific data entities that can be accessed by the reader.
    Examples of resources include files for a filesystem reader, channel IDs for
    a Slack reader, or pages for a Notion reader.

    Subclasses must implement `list_resources`, `get_resource_info` and
    `load_resource`. Every other method has a default implementation built on
    those three; the async defaults simply call their synchronous counterparts,
    so subclasses with real async I/O should override them.
    """

    @abstractmethod
    def list_resources(self, *args: Any, **kwargs: Any) -> List[str]:
        """
        List of identifiers for the specific type of resources available in the reader.

        Returns:
            List[str]: List of identifiers for the specific type of resources
                available in the reader.
        """

    async def alist_resources(self, *args: Any, **kwargs: Any) -> List[str]:
        """
        List of identifiers for the specific type of resources available in the reader asynchronously.

        The default implementation delegates to the synchronous `list_resources`.

        Returns:
            List[str]: A list of resources based on the reader type, such as
                files for a filesystem reader, channel IDs for a Slack reader,
                or pages for a Notion reader.
        """
        return self.list_resources(*args, **kwargs)

    @abstractmethod
    def get_resource_info(self, resource_id: str, *args: Any, **kwargs: Any) -> Dict:
        """
        Get a dictionary of information about a specific resource.

        Args:
            resource_id (str): The resource identifier.

        Returns:
            Dict: A dictionary of information about the resource.
        """

    async def aget_resource_info(
        self, resource_id: str, *args: Any, **kwargs: Any
    ) -> Dict:
        """
        Get a dictionary of information about a specific resource asynchronously.

        The default implementation delegates to the synchronous
        `get_resource_info`.

        Args:
            resource_id (str): The resource identifier.

        Returns:
            Dict: A dictionary of information about the resource.
        """
        return self.get_resource_info(resource_id, *args, **kwargs)

    def list_resources_with_info(self, *args: Any, **kwargs: Any) -> Dict[str, Dict]:
        """
        Get a dictionary of information about all resources.

        The same `*args` / `**kwargs` are forwarded both to `list_resources`
        and to each `get_resource_info` call.

        Returns:
            Dict[str, Dict]: A mapping from resource identifier to the
                information dictionary of that resource.
        """
        return {
            resource: self.get_resource_info(resource, *args, **kwargs)
            for resource in self.list_resources(*args, **kwargs)
        }

    async def alist_resources_with_info(
        self, *args: Any, **kwargs: Any
    ) -> Dict[str, Dict]:
        """
        Get a dictionary of information about all resources asynchronously.

        The same `*args` / `**kwargs` are forwarded both to `alist_resources`
        and to each `aget_resource_info` call. Resources are awaited one after
        another, in listing order.

        Returns:
            Dict[str, Dict]: A mapping from resource identifier to the
                information dictionary of that resource.
        """
        return {
            resource: await self.aget_resource_info(resource, *args, **kwargs)
            for resource in await self.alist_resources(*args, **kwargs)
        }

    @abstractmethod
    def load_resource(
        self, resource_id: str, *args: Any, **kwargs: Any
    ) -> List[Document]:
        """
        Load data from a specific resource.

        Args:
            resource_id (str): The resource identifier.

        Returns:
            List[Document]: A list of documents loaded from the resource.
        """

    async def aload_resource(
        self, resource_id: str, *args: Any, **kwargs: Any
    ) -> List[Document]:
        """
        Load data from a specific resource asynchronously.

        The default implementation delegates to the synchronous `load_resource`.

        Args:
            resource_id (str): The resource identifier.

        Returns:
            List[Document]: A list of documents loaded from the resource.
        """
        return self.load_resource(resource_id, *args, **kwargs)

    def load_resources(
        self, resource_ids: List[str], *args: Any, **kwargs: Any
    ) -> List[Document]:
        """
        Similar to load_data, but only for specific resources.

        Args:
            resource_ids (List[str]): List of resource identifiers.

        Returns:
            List[Document]: A single flat list of the documents loaded from
                all the resources, in the order of `resource_ids`.
        """
        return [
            doc
            for resource in resource_ids
            for doc in self.load_resource(resource, *args, **kwargs)
        ]

    async def aload_resources(
        self, resource_ids: List[str], *args: Any, **kwargs: Any
    ) -> Dict[str, List[Document]]:
        """
        Load data from specific resources asynchronously.

        Unlike `load_resources`, which flattens everything into one list, this
        returns the documents grouped per resource. Resources are awaited one
        after another, in the order of `resource_ids`.

        Args:
            resource_ids (List[str]): List of resource identifiers.

        Returns:
            Dict[str, List[Document]]: A mapping from resource identifier to
                the documents loaded from that resource.
        """
        return {
            resource: await self.aload_resource(resource, *args, **kwargs)
            for resource in resource_ids
        }


class ReaderConfig(BaseComponent):
"""Represents a reader and it's input arguments."""

Expand Down
Loading
Loading