llama-index-integrations/vector_stores/llama-index-vector-stores-opensearch/llama_index/vector_stores/opensearch/base.py

"""Elasticsearch/Opensearch vector store."""

import asyncio
import json
import uuid
from typing import Any, Dict, Iterable, List, Optional, Union, cast

from llama_index.core.bridge.pydantic import PrivateAttr

from llama_index.core.schema import BaseNode, MetadataMode, TextNode
from llama_index.core.vector_stores.types import (
    MetadataFilters,
    BasePydanticVectorStore,
    VectorStoreQuery,
    VectorStoreQueryMode,
    VectorStoreQueryResult,
)
from llama_index.core.vector_stores.utils import (
    metadata_dict_to_node,
    node_to_metadata_dict,
)
from opensearchpy import AsyncOpenSearch
from opensearchpy.exceptions import NotFoundError
from opensearchpy.helpers import async_bulk

IMPORT_OPENSEARCH_PY_ERROR = (
    "Could not import OpenSearch. Please install it with `pip install opensearch-py`."
)
INVALID_HYBRID_QUERY_ERROR = (
    "Please specify the lexical_query and search_pipeline for hybrid search."
)
MATCH_ALL_QUERY = {"match_all": {}}  # type: Dict


def _import_async_opensearch() -> Any:
    """Import OpenSearch if available, otherwise raise error."""
    return AsyncOpenSearch


def _import_async_bulk() -> Any:
    """Import bulk if available, otherwise raise error."""
    return async_bulk


def _import_not_found_error() -> Any:
    """Import not found error if available, otherwise raise error."""
    return NotFoundError


def _get_async_opensearch_client(opensearch_url: str, **kwargs: Any) -> Any:
    """Get AsyncOpenSearch client from the opensearch_url, otherwise raise error."""
    try:
        opensearch = _import_async_opensearch()
        client = opensearch(opensearch_url, **kwargs)

    except ValueError as e:
        raise ValueError(
            f"AsyncOpenSearch client string provided is not in proper format. "
            f"Got error: {e} "
        )
    return client


async def _bulk_ingest_embeddings(
    client: Any,
    index_name: str,
    embeddings: List[List[float]],
    texts: Iterable[str],
    metadatas: Optional[List[dict]] = None,
    ids: Optional[List[str]] = None,
    vector_field: str = "embedding",
    text_field: str = "content",
    mapping: Optional[Dict] = None,
    max_chunk_bytes: Optional[int] = 1 * 1024 * 1024,
    is_aoss: bool = False,
) -> List[str]:
    """Async Bulk Ingest Embeddings into given index."""
    if not mapping:
        mapping = {}

    async_bulk = _import_async_bulk()
    not_found_error = _import_not_found_error()
    requests = []
    return_ids = []
    mapping = mapping

    try:
        await client.indices.get(index=index_name)
    except not_found_error:
        await client.indices.create(index=index_name, body=mapping)

    for i, text in enumerate(texts):
        metadata = metadatas[i] if metadatas else {}
        _id = ids[i] if ids else str(uuid.uuid4())
        request = {
            "_op_type": "index",
            "_index": index_name,
            vector_field: embeddings[i],
            text_field: text,
            "metadata": metadata,
        }
        if is_aoss:
            request["id"] = _id
        else:
            request["_id"] = _id
        requests.append(request)
        return_ids.append(_id)
    await async_bulk(client, requests, max_chunk_bytes=max_chunk_bytes)
    if not is_aoss:
        await client.indices.refresh(index=index_name)
    return return_ids


def _default_approximate_search_query(
    query_vector: List[float],
    k: int = 4,
    vector_field: str = "embedding",
) -> Dict:
    """For Approximate k-NN Search, this is the default query."""
    return {
        "size": k,
        "query": {"knn": {vector_field: {"vector": query_vector, "k": k}}},
    }


def _parse_filters(filters: Optional[MetadataFilters]) -> Any:
    pre_filter = []
    if filters is not None:
        for f in filters.legacy_filters():
            pre_filter.append({f.key: json.loads(str(f.value))})

    return pre_filter


def _knn_search_query(
    embedding_field: str,
    query_embedding: List[float],
    k: int,
    filters: Optional[MetadataFilters] = None,
) -> Dict:
    """
    Do knn search.

    If there are no filters do approx-knn search.
    If there are (pre)-filters, do an exhaustive exact knn search using 'painless
        scripting'.

    Note that approximate knn search does not support pre-filtering.

    Args:
        query_embedding: Vector embedding to query.
        k: Maximum number of results.
        filters: Optional filters to apply before the search.
            Supports filter-context queries documented at
            https://opensearch.org/docs/latest/query-dsl/query-filter-context/

    Returns:
        Up to k docs closest to query_embedding
    """
    if filters is None:
        search_query = _default_approximate_search_query(
            query_embedding, k, vector_field=embedding_field
        )
    else:
        pre_filter = _parse_filters(filters)
        # https://opensearch.org/docs/latest/search-plugins/knn/painless-functions/
        search_query = _default_painless_scripting_query(
            query_embedding,
            k,
            space_type="l2Squared",
            pre_filter={"bool": {"filter": pre_filter}},
            vector_field=embedding_field,
        )

    return search_query


def _hybrid_search_query(
    text_field: str,
    query_str: str,
    embedding_field: str,
    query_embedding: List[float],
    k: int,
    filters: Optional[MetadataFilters] = None,
) -> Dict:
    knn_query = _knn_search_query(embedding_field, query_embedding, k, filters)["query"]
    lexical_query = {"must": {"match": {text_field: {"query": query_str}}}}

    parsed_filters = _parse_filters(filters)
    if len(parsed_filters) > 0:
        lexical_query["filter"] = parsed_filters
    return {
        "size": k,
        "query": {"hybrid": {"queries": [{"bool": lexical_query}, knn_query]}},
    }


def __get_painless_scripting_source(
    space_type: str, vector_field: str = "embedding"
) -> str:
    """For Painless Scripting, it returns the script source based on space type."""
    source_value = f"(1.0 + {space_type}(params.query_value, doc['{vector_field}']))"
    if space_type == "cosineSimilarity":
        return source_value
    else:
        return f"1/{source_value}"


def _default_painless_scripting_query(
    query_vector: List[float],
    k: int = 4,
    space_type: str = "l2Squared",
    pre_filter: Optional[Union[Dict, List]] = None,
    vector_field: str = "embedding",
) -> Dict:
    """For Painless Scripting Search, this is the default query."""
    if not pre_filter:
        pre_filter = MATCH_ALL_QUERY

    source = __get_painless_scripting_source(space_type, vector_field)
    return {
        "size": k,
        "query": {
            "script_score": {
                "query": pre_filter,
                "script": {
                    "source": source,
                    "params": {
                        "field": vector_field,
                        "query_value": query_vector,
                    },
                },
            }
        },
    }


def _is_aoss_enabled(http_auth: Any) -> bool:
    """Check if the service is http_auth is set as `aoss`."""
    if (
        http_auth is not None
        and hasattr(http_auth, "service")
        and http_auth.service == "aoss"
    ):
        return True
    return False


class OpensearchVectorClient:
    """
    Object encapsulating an Opensearch index that has vector search enabled.

    If the index does not yet exist, it is created during init.
    Therefore, the underlying index is assumed to either:
    1) not exist yet or 2) be created due to previous usage of this class.

    Args:
        endpoint (str): URL (http/https) of elasticsearch endpoint
        index (str): Name of the elasticsearch index
        dim (int): Dimension of the vector
        embedding_field (str): Name of the field in the index to store
            embedding array in.
        text_field (str): Name of the field to grab text from
        method (Optional[dict]): Opensearch "method" JSON obj for configuring
            the KNN index.
            This includes engine, metric, and other config params. Defaults to:
            {"name": "hnsw", "space_type": "l2", "engine": "faiss",
            "parameters": {"ef_construction": 256, "m": 48}}
        **kwargs: Optional arguments passed to the OpenSearch client from opensearch-py.

    """

    def __init__(
        self,
        endpoint: str,
        index: str,
        dim: int,
        embedding_field: str = "embedding",
        text_field: str = "content",
        method: Optional[dict] = None,
        max_chunk_bytes: int = 1 * 1024 * 1024,
        search_pipeline: Optional[str] = None,
        **kwargs: Any,
    ):
        """Init params."""
        if method is None:
            method = {
                "name": "hnsw",
                "space_type": "l2",
                "engine": "nmslib",
                "parameters": {"ef_construction": 256, "m": 48},
            }
        if embedding_field is None:
            embedding_field = "embedding"
        self._embedding_field = embedding_field

        self._endpoint = endpoint
        self._dim = dim
        self._index = index
        self._text_field = text_field
        self._max_chunk_bytes = max_chunk_bytes

        self._search_pipeline = search_pipeline
        http_auth = kwargs.get("http_auth")
        self.is_aoss = _is_aoss_enabled(http_auth=http_auth)
        # initialize mapping
        idx_conf = {
            "settings": {"index": {"knn": True, "knn.algo_param.ef_search": 100}},
            "mappings": {
                "properties": {
                    embedding_field: {
                        "type": "knn_vector",
                        "dimension": dim,
                        "method": method,
                    },
                }
            },
        }
        self._os_client = _get_async_opensearch_client(self._endpoint, **kwargs)
        not_found_error = _import_not_found_error()

        event_loop = asyncio.get_event_loop()
        try:
            event_loop.run_until_complete(
                self._os_client.indices.get(index=self._index)
            )
        except not_found_error:
            event_loop.run_until_complete(
                self._os_client.indices.create(index=self._index, body=idx_conf)
            )
            event_loop.run_until_complete(
                self._os_client.indices.refresh(index=self._index)
            )

    async def index_results(self, nodes: List[BaseNode], **kwargs: Any) -> List[str]:
        """Store results in the index."""
        embeddings: List[List[float]] = []
        texts: List[str] = []
        metadatas: List[dict] = []
        ids: List[str] = []
        for node in nodes:
            ids.append(node.node_id)
            embeddings.append(node.get_embedding())
            texts.append(node.get_content(metadata_mode=MetadataMode.NONE))
            metadatas.append(node_to_metadata_dict(node, remove_text=True))

        return await _bulk_ingest_embeddings(
            self._os_client,
            self._index,
            embeddings,
            texts,
            metadatas=metadatas,
            ids=ids,
            vector_field=self._embedding_field,
            text_field=self._text_field,
            mapping=None,
            max_chunk_bytes=self._max_chunk_bytes,
            is_aoss=self.is_aoss,
        )

    async def delete_doc_id(self, doc_id: str) -> None:
        """
        Delete a document.

        Args:
            doc_id (str): document id
        """
        await self._os_client.delete(index=self._index, id=doc_id)

    async def aquery(
        self,
        query_mode: VectorStoreQueryMode,
        query_str: Optional[str],
        query_embedding: List[float],
        k: int,
        filters: Optional[MetadataFilters] = None,
    ) -> VectorStoreQueryResult:
        if query_mode == VectorStoreQueryMode.HYBRID:
            if query_str is None or self._search_pipeline is None:
                raise ValueError(INVALID_HYBRID_QUERY_ERROR)
            search_query = _hybrid_search_query(
                self._text_field,
                query_str,
                self._embedding_field,
                query_embedding,
                k,
                filters=filters,
            )
            params = {"search_pipeline": self._search_pipeline}
        else:
            search_query = _knn_search_query(
                self._embedding_field, query_embedding, k, filters=filters
            )
            params = None

        res = await self._os_client.search(
            index=self._index, body=search_query, params=params
        )
        nodes = []
        ids = []
        scores = []
        for hit in res["hits"]["hits"]:
            source = hit["_source"]
            node_id = hit["_id"]
            text = source[self._text_field]
            metadata = source.get("metadata", None)

            try:
                node = metadata_dict_to_node(metadata)
                node.text = text
            except Exception:
                # TODO: Legacy support for old nodes
                node_info = source.get("node_info")
                relationships = source.get("relationships") or {}
                start_char_idx = None
                end_char_idx = None
                if isinstance(node_info, dict):
                    start_char_idx = node_info.get("start", None)
                    end_char_idx = node_info.get("end", None)

                node = TextNode(
                    text=text,
                    metadata=metadata,
                    id_=node_id,
                    start_char_idx=start_char_idx,
                    end_char_idx=end_char_idx,
                    relationships=relationships,
                    extra_info=source,
                )
            ids.append(node_id)
            nodes.append(node)
            scores.append(hit["_score"])
        return VectorStoreQueryResult(nodes=nodes, ids=ids, similarities=scores)


class OpensearchVectorStore(BasePydanticVectorStore):
    """
    Elasticsearch/Opensearch vector store.

    Args:
        client (OpensearchVectorClient): Vector index client to use
            for data insertion/querying.
    """

    stores_text: bool = True
    _client: OpensearchVectorClient = PrivateAttr(default=None)

    def __init__(
        self,
        client: OpensearchVectorClient,
    ) -> None:
        """Initialize params."""
        super().__init__()
        self._client = client

    @property
    def client(self) -> Any:
        """Get client."""
        return self._client

    def add(
        self,
        nodes: List[BaseNode],
        **add_kwargs: Any,
    ) -> List[str]:
        """
        Add nodes to index.

        Args:
            nodes: List[BaseNode]: list of nodes with embeddings.

        """
        return asyncio.get_event_loop().run_until_complete(
            self.async_add(nodes, **add_kwargs)
        )

    async def async_add(
        self,
        nodes: List[BaseNode],
        **add_kwargs: Any,
    ) -> List[str]:
        """
        Async add nodes to index.

        Args:
            nodes: List[BaseNode]: list of nodes with embeddings.

        """
        await self._client.index_results(nodes)
        return [result.node_id for result in nodes]

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Delete nodes using with ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.

        """
        asyncio.get_event_loop().run_until_complete(
            self.adelete(ref_doc_id, **delete_kwargs)
        )

    async def adelete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Async delete nodes using with ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.

        """
        await self._client.delete_doc_id(ref_doc_id)

    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
        """
        Query index for top k most similar nodes.

        Args:
            query (VectorStoreQuery): Store query object.

        """
        return asyncio.get_event_loop().run_until_complete(self.aquery(query, **kwargs))

    async def aquery(
        self, query: VectorStoreQuery, **kwargs: Any
    ) -> VectorStoreQueryResult:
        """
        Async query index for top k most similar nodes.

        Args:
            query (VectorStoreQuery): Store query object.

        """
        query_embedding = cast(List[float], query.query_embedding)

        return await self._client.aquery(
            query.mode,
            query.query_str,
            query_embedding,
            query.similarity_top_k,
            filters=query.filters,
        )