In [1]:
from os import getenv
from llama_index.core import SimpleDirectoryReader
from llama_index.vector_stores.opensearch import (
    OpensearchVectorStore,
    OpensearchVectorClient,
)
from llama_index.core import VectorStoreIndex, StorageContext

# http endpoint for your cluster (opensearch required for vector index usage)
endpoint = getenv("OPENSEARCH_ENDPOINT", "http://localhost:9200")
# index to demonstrate the VectorStore impl
idx = getenv("OPENSEARCH_INDEX", "gpt-index-demo")
# # load some sample data

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from llama_index.core import SimpleDirectoryReader
documents = SimpleDirectoryReader("/Users/samvardhan/Desktop/DataEngineer/opensearch_search_engine/data").load_data()

In [3]:
import nest_asyncio 
nest_asyncio.apply()

In [4]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

# Define the embedding function
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [5]:
Settings.embed_model =embed_model

In [5]:
from llama_index.llms.ollama import Ollama

In [10]:
Settings.llm = Ollama(model="llama2", request_timeout=30.0)

In [12]:
# OpensearchVectorClient stores text in this field by default
text_field = "content"
# OpensearchVectorClient stores embeddings in this field by default
embedding_field = "embedding"
# OpensearchVectorClient encapsulates logic for a
# single opensearch index with vector search enabled
client = OpensearchVectorClient(
    endpoint, idx, dim=384, embedding_field=embedding_field, text_field=text_field
)
# initialize vector store
vector_store = OpensearchVectorStore(client)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# initialize an index using our sample data and the client we just created
index = VectorStoreIndex.from_documents(
    documents=documents, storage_context=storage_context
)

ValueError: 
******
Could not load OpenAI embedding model. If you intended to use OpenAI, please check your OPENAI_API_KEY.
Original error:
No API key found for OpenAI.
Please set either the OPENAI_API_KEY environment variable or openai.api_key prior to initialization.
API keys can be found or created at https://platform.openai.com/account/api-keys

Consider using embed_model='local'.
Visit our documentation for more embedding options: https://docs.llamaindex.ai/en/stable/module_guides/models/embeddings.html#modules
******

In [9]:
# run query
query_engine = index.as_query_engine()
res = query_engine.query("What did the author do growing up?")
res.response

'The author grew up writing short stories and programming on an IBM 1401 computer in junior high school.'

In [11]:
query_engine = index.as_query_engine()
res = query_engine.query("What did the author do growing up?")
res.response

'Based on the context provided, the author grew up writing short stories and programming.'

# hybrid query 

In [17]:
from os import getenv
from llama_index.vector_stores.opensearch import (
    OpensearchVectorStore,
    OpensearchVectorClient,
)
from llama_index.embeddings.ollama import OllamaEmbedding

# http endpoint for your cluster (opensearch required for vector index usage)
endpoint = getenv("OPENSEARCH_ENDPOINT", "http://localhost:9200")
# index to demonstrate the VectorStore impl
idx = getenv("OPENSEARCH_INDEX", "doc_retriever")

# OpensearchVectorClient stores text in this field by default
text_field = "content"
# OpensearchVectorClient stores embeddings in this field by default
embedding_field = "embedding"
# OpensearchVectorClient encapsulates logic for a
# single opensearch index with vector search enabled with hybrid search pipeline
client = OpensearchVectorClient(
    endpoint,
    idx,
    4096,
    embedding_field=embedding_field,
    text_field=text_field,
    search_pipeline="hybrid-search-pipeline",
)

vector_store = OpensearchVectorStore(client)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents=documents, storage_context=storage_context
)
embed_model = OllamaEmbedding(model_name="llama2")


In [18]:
Settings.embed_model =embed_model

In [30]:
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters
from llama_index.core.vector_stores.types import VectorStoreQueryMode


filters = MetadataFilters(
    filters=[
        ExactMatchFilter(
            key="term", value='{{"metadata.content": "What did the author do growing?"}}'
        )
    ]
)

retriever = index.as_retriever(
    vector_store_query_mode=VectorStoreQueryMode.HYBRID
)

result = retriever.retrieve("What did the author do growing?")

In [34]:
result

[NodeWithScore(node=TextNode(id_='37fe7ad2-54e3-404a-9990-380d835f85a0', embedding=None, metadata={'file_path': '/Users/samvardhan/Desktop/DataEngineer/opensearch_search_engine/data/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 75042, 'creation_date': '2024-05-02', 'last_modified_date': '2024-05-02'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='0d66dc4c-fe94-4e1e-b6d8-50010ddcba19', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '/Users/samvardhan/Desktop/DataEngineer/opensearch_search_engine/data/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 75042, 'creation_date': '2024-05-02',

In [35]:
query_engine = index.as_query_engine(vector_store_query_mode=VectorStoreQueryMode.HYBRID)

response = query_engine.query("What did the author do growing?")

In [37]:
response.response

'Based on the context information provided, the author grew "life right" while watching their boys play in the tide pools near the coast in 2015.'