In [1]:
from os import getenv
from llama_index.core import SimpleDirectoryReader
from llama_index.vector_stores.opensearch import (
    OpensearchVectorStore,
    OpensearchVectorClient,
)
from llama_index.core import VectorStoreIndex, StorageContext

# Connection settings; each can be overridden through an environment variable.
# OPENSEARCH_ENDPOINT: HTTP URL of the OpenSearch cluster that hosts the vector index.
endpoint = getenv(
    "OPENSEARCH_ENDPOINT",
    default="http://localhost:9200",
)
# OPENSEARCH_INDEX: name of the index used to demonstrate the VectorStore impl.
idx = getenv(
    "OPENSEARCH_INDEX",
    default="gpt-index-demo",
)
# The sample data itself is loaded in a later cell.

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from llama_index.core import SummaryIndex
from llama_index.readers.web import SimpleWebPageReader
# Download the article and have the reader convert its HTML to plain text
# (html_to_text=True); the resulting documents are indexed in later cells.
article_url = "https://www.thoughtworks.com/en-in/insights/blog/data-strategy/building-an-amazon-com-for-your-data-products"
web_reader = SimpleWebPageReader(html_to_text=True)
documents = web_reader.load_data([article_url])

In [5]:
import nest_asyncio 
# Patch asyncio so that an already-running event loop can be re-entered.
# NOTE(review): presumably required because the notebook kernel already runs
# its own event loop — confirm whether any later cell actually needs it.
nest_asyncio.apply()

In [6]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

# Define the embedding function
# NOTE(review): the OpenSearch clients in later cells are created with dim=384;
# that value has to equal this model's embedding size — re-check it whenever
# the model name below is changed.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [7]:
# Register the embedding model on the shared llama_index Settings object.
Settings.embed_model = embed_model

In [8]:
from llama_index.llms.ollama import Ollama

In [9]:
# Register an Ollama-served "llama2" model on the shared Settings object.
# request_timeout=30.0 — presumably seconds; confirm against the Ollama integration docs.
Settings.llm = Ollama(model="llama2", request_timeout=30.0)

In [10]:
# Stand-alone LLM handle with the same configuration as Settings.llm; it is
# passed explicitly (llm=llm) when the hybrid query engine is built later.
# NOTE(review): this duplicates the Settings.llm construction — keep the two in sync.
llm = Ollama(model="llama2", request_timeout=30.0)

In [8]:
# Field names for the stored documents; both are the fields
# OpensearchVectorClient uses by default (text and embeddings respectively).
embedding_field = "embedding"
text_field = "content"

# A client encapsulates the logic for one OpenSearch index
# that has vector search enabled.
client = OpensearchVectorClient(
    endpoint,
    idx,
    dim=384,
    embedding_field=embedding_field,
    text_field=text_field,
)

# Wrap the client in a vector store and expose it through a storage context.
vector_store = OpensearchVectorStore(client)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the index from the sample documents using that storage context.
index = VectorStoreIndex.from_documents(
    documents=documents,
    storage_context=storage_context,
)

In [9]:
# run query
# Build a query engine with default options, ask a single question, and
# display only the answer text (last expression of the cell).
query_engine = index.as_query_engine()
res = query_engine.query("What is data product?")
res.response

'Based on the provided context, a data product can be defined as a central marketplace or catalog of internal data products that are trustworthy, self-describing, interoperable, and secure. It is a platform that provides transparency regarding information quality metrics and performance promises, allowing data consumers to confidently consume and reuse data products. The data product is designed to separate data mesh from data silos by building trust among data teams across different domains and encouraging the reuse of data products.'

# Hybrid query

In [12]:
from os import getenv
from llama_index.vector_stores.opensearch import (
    OpensearchVectorStore,
    OpensearchVectorClient,
)
from llama_index.embeddings.ollama import OllamaEmbedding

# http endpoint for your cluster (opensearch required for vector index usage)
endpoint = getenv("OPENSEARCH_ENDPOINT", "http://localhost:9200")
# index to demonstrate the VectorStore impl
# NOTE(review): the same OPENSEARCH_INDEX variable is read by the first cell
# (default "gpt-index-demo"); if it is set, both demos index the same
# documents into one index. Only the defaults keep them apart.
idx = getenv("OPENSEARCH_INDEX", "data_mesh_index")

# OpensearchVectorClient stores text in this field by default
text_field = "content"
# OpensearchVectorClient stores embeddings in this field by default
embedding_field = "embedding"
# OpensearchVectorClient encapsulates logic for a
# single opensearch index with vector search enabled with hybrid search pipeline
client = OpensearchVectorClient(
    endpoint,
    idx,
    dim=384,
    embedding_field=embedding_field,
    text_field=text_field,
    # NOTE(review): presumably a search pipeline with this name must already
    # exist on the cluster — confirm before running.
    search_pipeline="hybrid-search-pipeline",
)
# Rebuild the vector store, storage context and index on top of the
# pipeline-enabled client; this re-indexes the same `documents`.
vector_store = OpensearchVectorStore(client)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents=documents, storage_context=storage_context
)
# NOTE(review): this rebinds `embed_model` after the index has been built and
# is not assigned to Settings.embed_model in the cells shown, so it does not
# affect the embeddings used above. A llama2 embedding is also unlikely to
# match dim=384 — confirm whether this line is still needed.
embed_model = OllamaEmbedding(model_name="llama2")


In [13]:
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters
from llama_index.core.vector_stores.types import VectorStoreQueryMode


# Example metadata filter in the JSON-string form used by the OpenSearch
# vector-store demo. The value is written with single braces so that it is
# valid JSON; the previous doubled braces ("{{...}}") looked like leftover
# str.format escaping and would not parse as JSON.
# NOTE(review): `filters` is defined for reference only — it is NOT passed to
# the retriever below, so retrieval is unfiltered. Pass `filters=filters` to
# `as_retriever` if filtering is intended, and confirm the filter syntax
# against the installed OpenSearch integration version.
filters = MetadataFilters(
    filters=[
        ExactMatchFilter(
            key="term", value='{"metadata.content": "What is data product?"}'
        )
    ]
)

# Retriever that requests hybrid-mode results from the vector store.
retriever = index.as_retriever(
    vector_store_query_mode=VectorStoreQueryMode.HYBRID
)

# Fetch the nodes retrieved for the question (no LLM answer synthesis here).
result = retriever.retrieve("What is data product?")


In [14]:
# Display the retrieved nodes with their scores (full repr; output can be very long).
result

[NodeWithScore(node=TextNode(id_='807744c8-770b-4469-a358-d1e7acf186b5', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='https://www.thoughtworks.com/en-in/insights/blog/data-strategy/building-an-amazon-com-for-your-data-products', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='9cfd2045ac0af63bf17bf552b603d97bc47ab4403ee17003501248b51cfa4201'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='c87cc1a3-ba97-4cc9-ba87-6994195a9579', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='e37e4e1cac3f0274d8483d96d8dea2f4f6cac45324ec37d95f626c9e515add63'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='a3582c63-44f9-4fe8-9b7c-a8d16eb53ef6', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='9c4426522e564b76252c781257f50819877e6dd433175831b245973a0f6960b3')}, text='[Customer 360 Data\nProduct](/content/dam/thoughtworks/images/infographic/Tw_illustra

In [16]:
# Hybrid-mode query engine that uses the explicitly constructed `llm`.
query_engine = index.as_query_engine(
    vector_store_query_mode=VectorStoreQueryMode.HYBRID,
    llm=llm,
)
response = query_engine.query("What is data product?")

In [17]:
# Show only the answer text of the hybrid query response.
response.response

"According to the context information provided, a data product is a product that delivers data as a service, providing a trustworthy and reliable source of data to users. The data product is built using modern data engineering techniques and is discoverable, addressable, trustworthy, self-describing, interoperable, and secure. It has characteristics such as being transparent about information quality metrics and performance promises, and provides a central marketplace or catalog of internal data products to raise awareness and convince skeptical data consumers to use them.\n\nThe data product is also monitored and visualized using Monte Carlo's notification mechanism, which enables the data product teams to track the satisfaction of their data products' consumers over time. The metadata from these monitoring efforts can be extracted via APIs and published in catalogs like Collibra, dataworld, or Atlan, making the data products discoverable and providing a user experience that delivers 