In [1]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os

In [2]:
!pip install -r azure-search-vector-python-llamaindex-sample-requirements.txt --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
!pip install -q llama-index-readers-web


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [5]:
load_dotenv(override=True)  # take environment variables from .env.

# Make sure your .env file has values for the following environment variables
endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]

# The search admin key is optional. Read it with a default so that an unset
# variable behaves like an empty one and falls back to DefaultAzureCredential
# instead of raising KeyError before the fallback can be chosen.
search_admin_key = os.getenv("AZURE_SEARCH_ADMIN_KEY", "")
credential = (
    AzureKeyCredential(search_admin_key)
    if len(search_admin_key) > 0
    else DefaultAzureCredential()
)
# index_name = os.environ["AZURE_SEARCH_INDEX"]

azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
# Llama Index does not support RBAC authentication, an API key is required
azure_openai_key = os.environ["AZURE_OPENAI_KEY"]
if len(azure_openai_key) == 0:
    raise Exception("API key required")

# Model / deployment names and API version for the Azure OpenAI resource
azure_openai_embedding_model = os.environ["AZURE_OPENAI_EMBEDDING_MODEL"]
azure_openai_embedding_deployment = os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT"]
azure_openai_chatgpt_deployment = os.environ["AZURE_OPENAI_CHATGPT_DEPLOYMENT"]
azure_openai_api_version = os.environ["AZURE_OPENAI_API_VERSION"]
# embedding_dimensions = int(os.getenv("AZURE_OPENAI_EMBEDDING_DIMENSIONS", 1536))

In [6]:
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding

# Embedding model client, configured entirely from the values read from the
# environment in the configuration cell.
embeddings = AzureOpenAIEmbedding(
    # Connection to the Azure OpenAI resource
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_key,
    api_version=azure_openai_api_version,
    # Which embedding model / deployment to call
    model_name=azure_openai_embedding_model,
    deployment_name=azure_openai_embedding_deployment,
)

In [7]:
from llama_index.llms.azure_openai import AzureOpenAI

# Chat/completion model client; uses the same endpoint, key and API version
# as the embedding client, but the chat deployment name.
llm = AzureOpenAI(
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_key,
    api_version=azure_openai_api_version,
    deployment_name=azure_openai_chatgpt_deployment,
)

In [8]:
from llama_index.readers.web import SimpleWebPageReader

# Page to ingest
url = "https://www.jainsocietyhouston.org/about_us"

# The reader is given a one-element list of URLs
web_documents = SimpleWebPageReader().load_data([url])

# Text of the first loaded document; it is parsed as HTML in the next step
html_content = web_documents[0].text

In [9]:
from bs4 import BeautifulSoup
from llama_index.core import Document

# Parse the fetched HTML and keep only the text found inside <p> elements.
soup = BeautifulSoup(html_content, 'html.parser')
# find_all is the Beautiful Soup 4 method name; findAll is a deprecated alias.
p_tags = soup.find_all('p')
# One paragraph per line: each paragraph's text followed by a newline.
text_content = "".join(f"{p_tag.text}\n" for p_tag in p_tags)

# Convert back to Document format
documents = [Document(text=text_content)]

In [10]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient

# Name of the Azure AI Search index this notebook works against
index_name = "llamaindex-vector-jsh-about"

# Note: despite the variable name, this is a SearchClient bound to
# index_name, not a SearchIndexClient.
index_client = SearchClient(
    credential=credential,
    endpoint=endpoint,
    index_name=index_name,
)

In [11]:
from llama_index.vector_stores.azureaisearch import (
    AzureAISearchVectorStore,
    IndexManagement,
    MetadataIndexFieldType,
)

# No metadata fields are declared as filterable in this sample
metadata_fields = {}

vector_store = AzureAISearchVectorStore(
    # The client already carries the index name, so index_name is not passed
    search_or_index_client=index_client,
    # index_name=index_name,
    index_management=IndexManagement.VALIDATE_INDEX,
    # Names of the index fields used for each piece of node data
    id_field_key="id",
    doc_id_field_key="doc_id",
    chunk_field_key="chunk",
    embedding_field_key="embedding",
    metadata_string_field_key="metadata",
    filterable_metadata_field_keys=metadata_fields,
    # NOTE(review): presumably must match the embedding model's vector size - confirm
    embedding_dimensionality=1536,
)

In [12]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.core import load_index_from_storage
from llama_index.core.settings import Settings

# Storage context that uses the Azure AI Search vector store defined above
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Register the embedding model and the LLM on the shared Settings object
Settings.embed_model = embeddings
Settings.llm = llm

In [13]:
# Build the index object on top of the storage context with an empty document
# list; the documents are supplied in a later step.
index = VectorStoreIndex.from_documents(documents=[], storage_context=storage_context)

In [14]:
# Push the parsed page into the index. The nested kwargs ask for deleted
# entries to be removed from the docstore as well.
# NOTE(review): `documents` was built without explicit ids - confirm whether
# re-running this cell adds duplicate entries to the index.
refreshed_docs = index.refresh_ref_docs(
    documents,
    update_kwargs={
        "delete_kwargs": {'delete_from_docstore': True},
    },
)

In [15]:
from llama_index.core.vector_stores.types import VectorStoreQueryMode

# Retriever using the DEFAULT vector store query mode
default_retriever = index.as_retriever(
    vector_store_query_mode=VectorStoreQueryMode.DEFAULT
)
response = default_retriever.retrieve("Tell me about Jain Society of Houston")

# Loop through each NodeWithScore in the response
for node_with_score in response:
    node = node_with_score.node  # The TextNode object
    score = node_with_score.score  # The similarity score
    chunk_id = node.id_  # The chunk ID

    # Only the file name is reported; "Unknown" when the node has no such metadata
    file_name = node.metadata.get("file_name", "Unknown")

    # Kept in its own variable so the scraped page text stored in
    # `text_content` earlier in the notebook is not overwritten here.
    node_text = node.text if node.text else "No content available"

    # Print the results in a user-friendly format
    print(f"Score: {score}")
    print(f"File Name: {file_name}")
    print(f"Id: {chunk_id}")
    print("\nExtracted Content:")
    print(node_text)
    print("\n" + "=" * 40 + " End of Result " + "=" * 40 + "\n")

Score: 0.89288765
File Name: Unknown
Id: ebd043d7-8b33-40ac-a815-dd87e1bf2bf1

Extracted Content:
Home
About Us
Events
Pathshala
More
 
Motto of Jain Center: To promote the philosophy and teachings of Jain religion and to establish a platform for worship, discussion and teaching of Jain rituals, ideals, and principles of Jain religion, to celebrate auspicious Jain events and festivals.
 
Jainism is one of the oldest religious traditions of India, and has existed side by side with Hinduism throughout its long history. The basic philosophy of Jainism is non Violence and Anekantvad (Multiplicity of viewpoint). Jain Samaj is active in Houston since early 70s. Jain Society of Houston was formed in 1982.
​Jain Center Houston has over 800 families as members, consisting of all sects of Jains like Deravasi, Sthanakvasi, Shwentamber, Digamber etc. JHS works under the guidance of  JAINA (The Federation of Jain Associations in North America).
 
We have Mahavir Swami (Mul Nayak), Shankeshwar Parsh

In [16]:
# Retriever using the SEMANTIC_HYBRID vector store query mode
hybrid_retriever = index.as_retriever(
    vector_store_query_mode=VectorStoreQueryMode.SEMANTIC_HYBRID
)
hybrid_response = hybrid_retriever.retrieve("Tell me about pathshala")

# Loop through each NodeWithScore in the response
for node_with_score in hybrid_response:
    node = node_with_score.node  # The TextNode object
    score = node_with_score.score  # The score reported for this result
    chunk_id = node.id_  # The chunk ID

    # Only the file name is reported; "Unknown" when the node has no such metadata
    file_name = node.metadata.get("file_name", "Unknown")

    # Kept in its own variable so the scraped page text stored in
    # `text_content` earlier in the notebook is not overwritten here.
    node_text = node.text if node.text else "No content available"

    # Print the results in a user-friendly format
    print(f"Score: {score}")
    print(f"File Name: {file_name}")
    print(f"Id: {chunk_id}")
    print("\nExtracted Content:")
    print(node_text)
    print("\n" + "=" * 40 + " End of Result " + "=" * 40 + "\n")

Score: 2.349226474761963
File Name: Unknown
Id: ebd043d7-8b33-40ac-a815-dd87e1bf2bf1

Extracted Content:
Home
About Us
Events
Pathshala
More
 
Motto of Jain Center: To promote the philosophy and teachings of Jain religion and to establish a platform for worship, discussion and teaching of Jain rituals, ideals, and principles of Jain religion, to celebrate auspicious Jain events and festivals.
 
Jainism is one of the oldest religious traditions of India, and has existed side by side with Hinduism throughout its long history. The basic philosophy of Jainism is non Violence and Anekantvad (Multiplicity of viewpoint). Jain Samaj is active in Houston since early 70s. Jain Society of Houston was formed in 1982.
​Jain Center Houston has over 800 families as members, consisting of all sects of Jains like Deravasi, Sthanakvasi, Shwentamber, Digamber etc. JHS works under the guidance of  JAINA (The Federation of Jain Associations in North America).
 
We have Mahavir Swami (Mul Nayak), Shankeshwa

In [17]:
from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core.tools import QueryEngineTool, ToolMetadata

# Expose the index's query engine as a single named tool
query_engine_tools = [
    QueryEngineTool(
        metadata=ToolMetadata(
            description="Jain Society of Houston (JSH)",
            name="jsh_about",
        ),
        query_engine=index.as_query_engine(),
    ),
]

# Sub-question engine over that tool: the question is decomposed into
# sub-questions, each of which is executed against the tool.
query_engine = SubQuestionQueryEngine.from_defaults(
    use_async=False,
    query_engine_tools=query_engine_tools,
)

response = query_engine.query("When does monthly Bhavna take place?")

Generated 1 sub questions.
[1;3;38;2;237;90;200m[jsh_about] Q: What is the schedule for monthly Bhavna at Jain Society of Houston?
[0m[1;3;38;2;237;90;200m[jsh_about] A: The monthly Bhavna at the Jain Society of Houston is held on the last Sunday of each month.
[0m