Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

VectorStore -> BasePydanticVectorStore, also get/delete_nodes, clear() #13439

Merged
merged 27 commits into from
May 14, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
8ae1798
update and migrate vector stores
logan-markewich May 11, 2024
49943e3
fix types
logan-markewich May 11, 2024
a18e6c4
add build files
logan-markewich May 11, 2024
63c0090
fix tests?
logan-markewich May 13, 2024
3c2821b
really fix tests
logan-markewich May 13, 2024
85d1112
bump workflow cache version?
logan-markewich May 13, 2024
218c0c1
fix azure?
logan-markewich May 13, 2024
f33baf4
tweaks
logan-markewich May 13, 2024
8c2763a
rename to maybe fix?
logan-markewich May 13, 2024
f1b1d6a
Final attempt
logan-markewich May 13, 2024
dd00f11
remove flakey tests
logan-markewich May 13, 2024
f41ce19
change cache version
logan-markewich May 13, 2024
3b82cfa
remove langchain from transititve dependencies
logan-markewich May 13, 2024
0d57af3
revert logging change for pants
logan-markewich May 13, 2024
ac47cd5
revert workflow changes
logan-markewich May 13, 2024
0a6d9a8
Fix tests?
logan-markewich May 13, 2024
7606d26
explicit deps
logan-markewich May 13, 2024
46a9304
test
logan-markewich May 13, 2024
c0a0583
interpreter constraints
logan-markewich May 14, 2024
21a52d7
interpreter constraints
logan-markewich May 14, 2024
3b5059e
fix
logan-markewich May 14, 2024
cf2a31e
Merge branch 'main' into logan/vector_store_upgrades
logan-markewich May 14, 2024
35cf946
remove transitive dependencies in core, remove extras
logan-markewich May 14, 2024
3c0a90e
remove transitive dependencies in core, remove extras
logan-markewich May 14, 2024
3bb870c
more fat trimming
logan-markewich May 14, 2024
d3d4e72
make tests run
logan-markewich May 14, 2024
c88b96a
fix bs4 deps
logan-markewich May 14, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 12 additions & 9 deletions docs/docs/examples/low_level/vector_store.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core.vector_stores.types import VectorStore\n",
"from llama_index.core.vector_stores.types import BasePydanticVectorStore\n",
"from llama_index.core.vector_stores import (\n",
" VectorStoreQuery,\n",
" VectorStoreQueryResult,\n",
Expand All @@ -176,7 +176,7 @@
"import os\n",
"\n",
"\n",
"class BaseVectorStore(VectorStore):\n",
"class BaseVectorStore(BasePydanticVectorStore):\n",
" \"\"\"Simple custom Vector Store.\n",
"\n",
" Stores documents in a simple in-memory dict.\n",
Expand Down Expand Up @@ -311,14 +311,14 @@
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core.bridge.pydantic import Field\n",
"\n",
"\n",
"class VectorStore2(BaseVectorStore):\n",
" \"\"\"VectorStore2 (add/get/delete implemented).\"\"\"\n",
"\n",
" stores_text: bool = True\n",
"\n",
" def __init__(self) -> None:\n",
" \"\"\"Init params.\"\"\"\n",
" self.node_dict: Dict[str, BaseNode] = {}\n",
" node_dict: Dict[str, BaseNode] = Field(default_factory=dict)\n",
"\n",
" def get(self, text_id: str) -> List[float]:\n",
" \"\"\"Get embedding.\"\"\"\n",
Expand Down Expand Up @@ -469,6 +469,9 @@
"metadata": {},
"outputs": [],
"source": [
"from typing import cast\n",
"\n",
"\n",
"class VectorStore3A(VectorStore2):\n",
" \"\"\"Implements semantic/dense search.\"\"\"\n",
"\n",
Expand All @@ -485,7 +488,7 @@
"\n",
" similarities, node_ids = get_top_k_embeddings(\n",
" query_embedding,\n",
" embeddings,\n",
" doc_embeddings,\n",
" doc_ids,\n",
" similarity_top_k=query.similarity_top_k,\n",
" )\n",
Expand Down Expand Up @@ -891,9 +894,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "llama_index_v2",
"display_name": "venv",
"language": "python",
"name": "llama_index_v2"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
from llama_index.core.storage.docstore.types import RefDocInfo
from llama_index.core.storage.storage_context import StorageContext
from llama_index.core.utils import get_tqdm_iterable
from llama_index.core.vector_stores.types import VectorStore
from llama_index.core.vector_stores.types import BasePydanticVectorStore

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -115,7 +115,7 @@ def __init__(
)

@property
def vector_store(self) -> VectorStore:
def vector_store(self) -> BasePydanticVectorStore:
return self._vector_store

def as_retriever(
Expand Down
10 changes: 5 additions & 5 deletions llama-index-core/llama_index/core/indices/multi_modal/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
DEFAULT_VECTOR_STORE,
SimpleVectorStore,
)
from llama_index.core.vector_stores.types import VectorStore
from llama_index.core.vector_stores.types import BasePydanticVectorStore

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -63,7 +63,7 @@ def __init__(
# Image-related kwargs
# image_vector_store going to be deprecated. image_store can be passed from storage_context
# keep image_vector_store here for backward compatibility
image_vector_store: Optional[VectorStore] = None,
image_vector_store: Optional[BasePydanticVectorStore] = None,
image_embed_model: EmbedType = "clip:ViT-B/32",
is_image_to_text: bool = False,
# is_image_vector_store_empty is used to indicate whether image_vector_store is empty
Expand Down Expand Up @@ -112,7 +112,7 @@ def __init__(
)

@property
def image_vector_store(self) -> VectorStore:
def image_vector_store(self) -> BasePydanticVectorStore:
return self._image_vector_store

@property
Expand Down Expand Up @@ -164,12 +164,12 @@ def as_query_engine(
@classmethod
def from_vector_store(
cls,
vector_store: VectorStore,
vector_store: BasePydanticVectorStore,
embed_model: Optional[EmbedType] = None,
# deprecated
service_context: Optional[ServiceContext] = None,
# Image-related kwargs
image_vector_store: Optional[VectorStore] = None,
image_vector_store: Optional[BasePydanticVectorStore] = None,
image_embed_model: EmbedType = "clip",
**kwargs: Any,
) -> "VectorStoreIndex":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
)
from llama_index.core.vector_stores.types import (
MetadataFilters,
VectorStore,
BasePydanticVectorStore,
VectorStoreQuery,
VectorStoreQueryMode,
VectorStoreQueryResult,
Expand Down Expand Up @@ -225,7 +225,7 @@ def _get_nodes_with_embeddings(
self,
query_bundle_with_embeddings: QueryBundle,
similarity_top_k: int,
vector_store: VectorStore,
vector_store: BasePydanticVectorStore,
) -> List[NodeWithScore]:
query = self._build_vector_store_query(
query_bundle_with_embeddings, similarity_top_k
Expand Down Expand Up @@ -347,7 +347,7 @@ async def _aget_nodes_with_embeddings(
self,
query_bundle_with_embeddings: QueryBundle,
similarity_top_k: int,
vector_store: VectorStore,
vector_store: BasePydanticVectorStore,
) -> List[NodeWithScore]:
query = self._build_vector_store_query(
query_bundle_with_embeddings, similarity_top_k
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from llama_index.core.storage.docstore.types import RefDocInfo
from llama_index.core.storage.storage_context import StorageContext
from llama_index.core.utils import iter_batch
from llama_index.core.vector_stores.types import VectorStore
from llama_index.core.vector_stores.types import BasePydanticVectorStore

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -87,7 +87,7 @@ def __init__(
@classmethod
def from_vector_store(
cls,
vector_store: VectorStore,
vector_store: BasePydanticVectorStore,
embed_model: Optional[EmbedType] = None,
# deprecated
service_context: Optional[ServiceContext] = None,
Expand All @@ -110,7 +110,7 @@ def from_vector_store(
)

@property
def vector_store(self) -> VectorStore:
def vector_store(self) -> BasePydanticVectorStore:
return self._vector_store

def as_retriever(self, **kwargs: Any) -> BaseRetriever:
Expand Down
25 changes: 12 additions & 13 deletions llama-index-core/llama_index/core/storage/storage_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
)
from llama_index.core.vector_stores.types import (
BasePydanticVectorStore,
VectorStore,
)

DEFAULT_PERSIST_DIR = "./storage"
Expand All @@ -54,26 +53,24 @@ class StorageContext:
indices, and vectors. It contains the following:
- docstore: BaseDocumentStore
- index_store: BaseIndexStore
- vector_store: VectorStore
- vector_store: BasePydanticVectorStore
- graph_store: GraphStore

"""

docstore: BaseDocumentStore
index_store: BaseIndexStore
vector_stores: Dict[str, VectorStore]
vector_stores: Dict[str, BasePydanticVectorStore]
graph_store: GraphStore

@classmethod
def from_defaults(
cls,
docstore: Optional[BaseDocumentStore] = None,
index_store: Optional[BaseIndexStore] = None,
vector_store: Optional[Union[VectorStore, BasePydanticVectorStore]] = None,
image_store: Optional[VectorStore] = None,
vector_stores: Optional[
Dict[str, Union[VectorStore, BasePydanticVectorStore]]
] = None,
vector_store: Optional[BasePydanticVectorStore] = None,
image_store: Optional[BasePydanticVectorStore] = None,
vector_stores: Optional[Dict[str, BasePydanticVectorStore]] = None,
graph_store: Optional[GraphStore] = None,
persist_dir: Optional[str] = None,
fs: Optional[fsspec.AbstractFileSystem] = None,
Expand All @@ -83,9 +80,9 @@ def from_defaults(
Args:
docstore (Optional[BaseDocumentStore]): document store
index_store (Optional[BaseIndexStore]): index store
vector_store (Optional[VectorStore]): vector store
vector_store (Optional[BasePydanticVectorStore]): vector store
graph_store (Optional[GraphStore]): graph store
image_store (Optional[VectorStore]): image store
image_store (Optional[BasePydanticVectorStore]): image store

"""
if persist_dir is None:
Expand Down Expand Up @@ -214,7 +211,7 @@ def from_dict(cls, save_dict: dict) -> "StorageContext":
index_store = SimpleIndexStore.from_dict(save_dict[INDEX_STORE_KEY])
graph_store = SimpleGraphStore.from_dict(save_dict[GRAPH_STORE_KEY])

vector_stores: Dict[str, VectorStore] = {}
vector_stores: Dict[str, BasePydanticVectorStore] = {}
for key, vector_store_dict in save_dict[VECTOR_STORE_KEY].items():
vector_stores[key] = SimpleVectorStore.from_dict(vector_store_dict)

Expand All @@ -226,10 +223,12 @@ def from_dict(cls, save_dict: dict) -> "StorageContext":
)

@property
def vector_store(self) -> VectorStore:
def vector_store(self) -> BasePydanticVectorStore:
"""Backwards compatibility for vector_store property."""
return self.vector_stores[DEFAULT_VECTOR_STORE]

def add_vector_store(self, vector_store: VectorStore, namespace: str) -> None:
def add_vector_store(
self, vector_store: BasePydanticVectorStore, namespace: str
) -> None:
"""Add a vector store to the storage context."""
self.vector_stores[namespace] = vector_store
56 changes: 52 additions & 4 deletions llama-index-core/llama_index/core/vector_stores/simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import fsspec
from dataclasses_json import DataClassJsonMixin
from llama_index.core.bridge.pydantic import PrivateAttr
from llama_index.core.indices.query.embedding_utils import (
get_top_k_embeddings,
get_top_k_embeddings_learner,
Expand All @@ -18,9 +19,9 @@
from llama_index.core.vector_stores.types import (
DEFAULT_PERSIST_DIR,
DEFAULT_PERSIST_FNAME,
BasePydanticVectorStore,
MetadataFilters,
FilterCondition,
VectorStore,
VectorStoreQuery,
VectorStoreQueryMode,
VectorStoreQueryResult,
Expand Down Expand Up @@ -95,7 +96,7 @@ class SimpleVectorStoreData(DataClassJsonMixin):
metadata_dict: Dict[str, Any] = field(default_factory=dict)


class SimpleVectorStore(VectorStore):
class SimpleVectorStore(BasePydanticVectorStore):
"""Simple Vector Store.

In this vector store, embeddings are stored within a simple, in-memory dictionary.
Expand All @@ -107,6 +108,8 @@ class SimpleVectorStore(VectorStore):
"""

stores_text: bool = False
_data: SimpleVectorStoreData = PrivateAttr()
_fs: fsspec.AbstractFileSystem = PrivateAttr()

def __init__(
self,
Expand All @@ -115,6 +118,7 @@ def __init__(
**kwargs: Any,
) -> None:
"""Initialize params."""
super().__init__()
self._data = data or SimpleVectorStoreData()
self._fs = fs or fsspec.filesystem("file")

Expand Down Expand Up @@ -142,11 +146,11 @@ def from_namespaced_persist_dir(
cls,
persist_dir: str = DEFAULT_PERSIST_DIR,
fs: Optional[fsspec.AbstractFileSystem] = None,
) -> Dict[str, VectorStore]:
) -> Dict[str, BasePydanticVectorStore]:
"""Load from namespaced persist dir."""
listing_fn = os.listdir if fs is None else fs.listdir

vector_stores: Dict[str, VectorStore] = {}
vector_stores: Dict[str, BasePydanticVectorStore] = {}

try:
for fname in listing_fn(persist_dir):
Expand Down Expand Up @@ -176,6 +180,11 @@ def from_namespaced_persist_dir(

return vector_stores

@classmethod
def class_name(cls) -> str:
    """Return this class's name as a string.

    Returns:
        The literal ``"SimpleVectorStore"``.
    """
    name = "SimpleVectorStore"
    return name

@property
def client(self) -> None:
"""Get client."""
Expand All @@ -185,6 +194,14 @@ def get(self, text_id: str) -> List[float]:
"""Get embedding."""
return self._data.embedding_dict[text_id]

def get_nodes(
    self,
    node_ids: Optional[List[str]] = None,
    filters: Optional[MetadataFilters] = None,
) -> List[BaseNode]:
    """Reject node retrieval, which this store does not support.

    Args:
        node_ids: Accepted for interface compatibility; not used.
        filters: Accepted for interface compatibility; not used.

    Raises:
        NotImplementedError: Always.
    """
    message = "SimpleVectorStore does not store nodes directly."
    raise NotImplementedError(message)

def add(
self,
nodes: List[BaseNode],
Expand Down Expand Up @@ -224,6 +241,37 @@ def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
if self._data.metadata_dict is not None:
self._data.metadata_dict.pop(text_id, None)

def delete_nodes(
    self,
    node_ids: Optional[List[str]] = None,
    filters: Optional[MetadataFilters] = None,
    **delete_kwargs: Any,
) -> None:
    """Delete stored nodes selected by ID and/or metadata filters.

    Every key in the embedding dict is a candidate. A candidate is removed
    when it is listed in ``node_ids`` (if ``node_ids`` is given) and the
    predicate built from ``filters`` accepts it.

    Args:
        node_ids: If not None, only these node IDs may be deleted. If None,
            no ID restriction is applied.
        filters: Passed to ``_build_metadata_filter_fn`` to build the
            per-node predicate (its handling of ``None`` is defined by that
            helper).
        **delete_kwargs: Accepted for interface compatibility; not used.
    """
    filter_fn = _build_metadata_filter_fn(
        lambda node_id: self._data.metadata_dict[node_id], filters
    )
    # Build the lookup set once so each membership test is O(1).
    node_id_set = set(node_ids) if node_ids is not None else None

    # Snapshot the keys: entries are removed from the dict while looping.
    for node_id in list(self._data.embedding_dict):
        if node_id_set is not None and node_id not in node_id_set:
            continue
        if not filter_fn(node_id):
            continue

        del self._data.embedding_dict[node_id]
        # Tolerate a missing ref-doc entry instead of raising KeyError
        # midway through a bulk delete and leaving the store half-updated.
        self._data.text_id_to_ref_doc_id.pop(node_id, None)
        # Same None guard that delete() applies to metadata_dict.
        if self._data.metadata_dict is not None:
            self._data.metadata_dict.pop(node_id, None)

def clear(self) -> None:
    """Discard all stored data.

    Replaces the internal data container with a newly constructed
    ``SimpleVectorStoreData``.
    """
    fresh_data = SimpleVectorStoreData()
    self._data = fresh_data

def query(
self,
query: VectorStoreQuery,
Expand Down
Loading
Loading