**Example 1 - RAG on NASA book text**

https://learn.microsoft.com/en-us/azure/search/tutorial-rag-build-solution-pipeline

https://github.com/Azure-Samples/azure-search-openai-demo/issues/290

In [None]:
! pip install python-dotenv azure-core azure-search-documents==11.5.1 azure-storage-blob azure-identity openai aiohttp --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.7/297.7 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.4/197.4 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m405.6/405.6 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m187.2/187.2 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m375.6/375.6 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os

# Connection settings for the Azure resources used by this notebook.
# Each value is read from the environment variable of the same name so that
# endpoints and secrets do not have to be typed into (and saved with) the
# notebook. When a variable is not set the value falls back to "", which is
# the placeholder the notebook used before; you can still overwrite a value
# here by hand if you prefer.
AZURE_SEARCH_SERVICE: str = os.getenv("AZURE_SEARCH_SERVICE", "")  # passed as the search clients' endpoint
AZURE_SEARCH_KEY: str = os.getenv("AZURE_SEARCH_KEY", "")  # wrapped in AzureKeyCredential
AZURE_OPENAI_ACCOUNT: str = os.getenv("AZURE_OPENAI_ACCOUNT", "")  # Azure OpenAI resource URL / endpoint
AZURE_OPENAI_KEY: str = os.getenv("AZURE_OPENAI_KEY", "")
AZURE_AI_MULTISERVICE_ACCOUNT: str = os.getenv("AZURE_AI_MULTISERVICE_ACCOUNT", "")
AZURE_AI_MULTISERVICE_KEY: str = os.getenv("AZURE_AI_MULTISERVICE_KEY", "")  # key attached to the skillset
AZURE_STORAGE_CONNECTION: str = os.getenv("AZURE_STORAGE_CONNECTION", "")  # blob storage connection string

In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters,
    SearchIndex
)

# Key-based credential reused by every Azure AI Search client in this notebook.
AZURE_SEARCH_CREDENTIAL = AzureKeyCredential(AZURE_SEARCH_KEY)

# Create a search index
index_name = "py-rag-tutorial-idx"

# Names that link the vector field to its profile, and the profile to its
# algorithm and vectorizer. Each name must match wherever it is referenced.
hnsw_algorithm_name = "myHnsw"
hnsw_profile_name = "myHnswProfile"
openai_vectorizer_name = "myOpenAI"

index_client = SearchIndexClient(
    endpoint=AZURE_SEARCH_SERVICE,
    credential=AZURE_SEARCH_CREDENTIAL,
)

# Index schema: one search document per text chunk.
fields = [
    SearchField(name="parent_id", type=SearchFieldDataType.String),
    SearchField(name="title", type=SearchFieldDataType.String),
    SearchField(
        name="locations",
        type=SearchFieldDataType.Collection(SearchFieldDataType.String),
        filterable=True,
    ),
    # chunk_id is the document key and is analyzed with the keyword analyzer.
    SearchField(
        name="chunk_id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
        analyzer_name="keyword",
    ),
    SearchField(
        name="chunk",
        type=SearchFieldDataType.String,
        sortable=False,
        filterable=False,
        facetable=False,
    ),
    # Embedding of the chunk; 1536 dimensions, searched through the HNSW profile.
    SearchField(
        name="text_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        vector_search_dimensions=1536,
        vector_search_profile_name=hnsw_profile_name,
    ),
]

# Configure the vector search configuration
hnsw_algorithm = HnswAlgorithmConfiguration(name=hnsw_algorithm_name)
hnsw_profile = VectorSearchProfile(
    name=hnsw_profile_name,
    algorithm_configuration_name=hnsw_algorithm_name,
    vectorizer_name=openai_vectorizer_name,
)
# Vectorizer the profile uses to turn query text into a vector at query time.
openai_vectorizer = AzureOpenAIVectorizer(
    vectorizer_name=openai_vectorizer_name,
    kind="azureOpenAI",
    parameters=AzureOpenAIVectorizerParameters(
        resource_url=AZURE_OPENAI_ACCOUNT,
        deployment_name="text-embedding-ada-002",
        model_name="text-embedding-ada-002",
        api_key=AZURE_OPENAI_KEY,
    ),
)
vector_search = VectorSearch(
    algorithms=[hnsw_algorithm],
    profiles=[hnsw_profile],
    vectorizers=[openai_vectorizer],
)

# Create the search index
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)
result = index_client.create_or_update_index(index)
print(f"{result.name} created")

py-rag-tutorial-idx created


In [None]:
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection
)

# Create a data source
# Registers the blob container holding the PDFs as a data source connection
# on the search service (created if absent, updated otherwise).
indexer_client = SearchIndexerClient(
    endpoint=AZURE_SEARCH_SERVICE,
    credential=AZURE_SEARCH_CREDENTIAL,
)

container = SearchIndexerDataContainer(name="nasa-ebook-pdfs-all")

data_source_connection = SearchIndexerDataSourceConnection(
    name="py-rag-tutorial-ds",
    type="azureblob",
    container=container,
    connection_string=AZURE_STORAGE_CONNECTION,
)

# Keep the returned object: its name is what the indexer refers to later.
data_source = indexer_client.create_or_update_data_source_connection(
    data_source_connection
)

print(f"Data source '{data_source.name}' created or updated")

Data source 'py-rag-tutorial-ds' created or updated


In [None]:
from azure.search.documents.indexes.models import (
    SplitSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    AzureOpenAIEmbeddingSkill,
    EntityRecognitionSkill,
    SearchIndexerIndexProjection,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    IndexProjectionMode,
    SearchIndexerSkillset,
    CognitiveServicesAccountKey
)

# Create a skillset
skillset_name = "py-rag-tutorial-ss"

# Path of the chunks emitted by the split skill. The per-chunk skills and the
# index projection all run in this context.
pages_context = "/document/pages/*"

# Step 1: split each document's content into overlapping pages (chunks).
split_skill = SplitSkill(
    description="Split skill to chunk documents",
    context="/document",
    text_split_mode="pages",
    maximum_page_length=2000,
    page_overlap_length=500,
    inputs=[InputFieldMappingEntry(name="text", source="/document/content")],
    outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
)

# Step 2: embed every chunk with the Azure OpenAI embedding deployment.
# NOTE(review): no api_key is passed to this skill, whereas the index's
# vectorizer is given AZURE_OPENAI_KEY - confirm the intended authentication.
embedding_skill = AzureOpenAIEmbeddingSkill(
    description="Skill to generate embeddings via Azure OpenAI",
    context=pages_context,
    resource_url=AZURE_OPENAI_ACCOUNT,
    deployment_name="text-embedding-ada-002",
    model_name="text-embedding-ada-002",
    dimensions=1536,
    inputs=[InputFieldMappingEntry(name="text", source=pages_context)],
    outputs=[OutputFieldMappingEntry(name="embedding", target_name="text_vector")],
)

# Step 3: extract Location entities from every chunk.
entity_skill = EntityRecognitionSkill(
    description="Skill to recognize entities in text",
    context=pages_context,
    categories=["Location"],
    default_language_code="en",
    inputs=[InputFieldMappingEntry(name="text", source=pages_context)],
    outputs=[OutputFieldMappingEntry(name="locations", target_name="locations")],
)

# (index field name, enrichment path) pairs projected into the index per chunk.
projection_sources = [
    ("chunk", pages_context),
    ("text_vector", f"{pages_context}/text_vector"),
    ("locations", f"{pages_context}/locations"),
    ("title", "/document/metadata_storage_name"),
]

chunk_selector = SearchIndexerIndexProjectionSelector(
    target_index_name=index_name,
    parent_key_field_name="parent_id",
    source_context=pages_context,
    mappings=[
        InputFieldMappingEntry(name=field_name, source=source_path)
        for field_name, source_path in projection_sources
    ],
)

# Only the chunk documents are indexed; parent documents are skipped.
index_projections = SearchIndexerIndexProjection(
    selectors=[chunk_selector],
    parameters=SearchIndexerIndexProjectionsParameters(
        projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
    ),
)

cognitive_services_account = CognitiveServicesAccountKey(key=AZURE_AI_MULTISERVICE_KEY)

skills = [split_skill, embedding_skill, entity_skill]

skillset = SearchIndexerSkillset(
    name=skillset_name,
    description="Skillset to chunk documents and generating embeddings",
    skills=skills,
    index_projection=index_projections,
    cognitive_services_account=cognitive_services_account,
)

client = SearchIndexerClient(
    endpoint=AZURE_SEARCH_SERVICE,
    credential=AZURE_SEARCH_CREDENTIAL,
)
client.create_or_update_skillset(skillset)
print(f"{skillset.name} created")

py-rag-tutorial-ss created


In [None]:
from azure.search.documents.indexes.models import (
    SearchIndexer,
    FieldMapping
)

# Create an indexer
indexer_name = "py-rag-tutorial-idxr"

# No extra indexer parameters are supplied.
indexer_parameters = None

# Map the metadata_storage_name field to the title field in the index to
# display the PDF title in the search results.
title_mapping = FieldMapping(
    source_field_name="metadata_storage_name",
    target_field_name="title",
)

# The indexer ties together the data source, the skillset and the target index.
indexer = SearchIndexer(
    name=indexer_name,
    description="Indexer to index documents and generate embeddings",
    data_source_name=data_source.name,
    skillset_name=skillset_name,
    target_index_name=index_name,
    field_mappings=[title_mapping],
    parameters=indexer_parameters,
)

# Create and run the indexer
indexer_client = SearchIndexerClient(
    endpoint=AZURE_SEARCH_SERVICE,
    credential=AZURE_SEARCH_CREDENTIAL,
)
indexer_result = indexer_client.create_or_update_indexer(indexer)

print(f' {indexer_name} is created and running. Give the indexer a few minutes before running a query.')

 py-rag-tutorial-idxr is created and running. Give the indexer a few minutes before running a query.


In [None]:
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery

# Vector Search using text-to-vector conversion of the querystring.
# The same query string is sent both as keyword search text and as a
# vectorizable text query against the text_vector field.
query = "where are NASA's headquarters located?"

search_client = SearchClient(
    endpoint=AZURE_SEARCH_SERVICE,
    index_name=index_name,
    credential=AZURE_SEARCH_CREDENTIAL,
)
vector_query = VectorizableTextQuery(
    text=query,
    fields="text_vector",
    k_nearest_neighbors=1,
    exhaustive=True,
)

results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    select=["parent_id", "chunk_id", "title", "chunk", "locations"],
    top=1,
)

# (label, result key) pairs, printed in this order for each hit.
printed_fields = (
    ("Score", "@search.score"),
    ("Title", "title"),
    ("Locations", "locations"),
    ("Content", "chunk"),
)

for result in results:
    for label, field_key in printed_fields:
        print(f"{label}: {result[field_key]}")

Score: 0.03333333507180214
Title: page-178.pdf
Locations: ['Headquarters', 'Washington']
Content: national Aeronautics and Space Administration

earth Science

NASA Headquarters 

300 E Street SW 

Washington, DC 20546

www.nasa.gov

np-2018-05-2546-hQ


In [None]:
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery

# Vector Search using text-to-vector conversion of the querystring.
# The same query string is sent both as keyword search text and as a
# vectorizable text query against the text_vector field.
query = "what is a sweet fruit in UAE?"

search_client = SearchClient(
    endpoint=AZURE_SEARCH_SERVICE,
    index_name=index_name,
    credential=AZURE_SEARCH_CREDENTIAL,
)
vector_query = VectorizableTextQuery(
    text=query,
    fields="text_vector",
    k_nearest_neighbors=1,
    exhaustive=True,
)

results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    select=["parent_id", "chunk_id", "title", "chunk", "locations"],
    top=1,
)

# (label, result key) pairs, printed in this order for each hit.
printed_fields = (
    ("Score", "@search.score"),
    ("Title", "title"),
    ("Locations", "locations"),
    ("Content", "chunk"),
)

for result in results:
    for label, field_key in printed_fields:
        print(f"{label}: {result[field_key]}")

Score: 0.030365297570824623
Title: page-117.pdf
Locations: ['United Arab Emirates', 'Rub’ al Khali', 'desert', 'plantations', 'towns', 'Liwa Oasis', 'Abu Dhabi', 'Arabian Peninsula', 'farms', 'greenhouses', 'aquifers', 'sand seas']
Content: L
a

n
d

E
A

R
T

H

110

Liwa Oasis
United Arab Emirates

In the sandy tan terrain of the United Arab Emirates, on the northern edge of the Rub’ al Khali, an oasis brings green to the desert. 

The T-shaped, 100-kilometer stretch of date plantations and small towns compose the Liwa Oasis, home to about 20,000 people in 

the emirate of Abu Dhabi. It is one of the largest oases on the Arabian Peninsula.

Bedouins tapped underground water supplies here at least five centuries ago, and date farms have proliferated. Drip irrigation 

and greenhouses now help conserve the precious water supply. Since rainfall is scarce in the region, much of the water comes 

from aquifers full of “fossil” water that accumulated more than 20,000 years ago and is now b

In [None]:
# Import libraries
from azure.search.documents import SearchClient
# VectorizableTextQuery is used below; importing it here keeps this cell
# runnable on its own instead of depending on an earlier query cell.
from azure.search.documents.models import VectorizableTextQuery
from azure.core.credentials import AzureKeyCredential
from openai import AzureOpenAI

# Set up clients and specify the chat model
openai_client = AzureOpenAI(
    api_version="2024-06-01",
    azure_endpoint=AZURE_OPENAI_ACCOUNT,
    api_key=AZURE_OPENAI_KEY,
)

# Name of the chat model deployment passed as `model` to the completion call.
deployment_name = "gpt-4o"

search_client = SearchClient(
    endpoint=AZURE_SEARCH_SERVICE,
    index_name=index_name,
    credential=AZURE_SEARCH_CREDENTIAL,
)

# Provide instructions to the model
GROUNDED_PROMPT="""
You are a helpful AI assistant.
Answer the query using only the sources provided below.
Do not generate answers that don't use the sources below.
Query: {query}
Sources:\n{sources}
"""

# Provide the query. Notice it's sent to both the search engine and the LLM.
query = "What is a sweet fruit in UAE?"

# Vector half of the search request: the query text is vectorized and
# matched against the text_vector field.
vector_query = VectorizableTextQuery(
    text=query,
    k_nearest_neighbors=1,
    fields="text_vector",
    exhaustive=True,
)

# Set up the search results and the chat thread.
# Retrieve the selected fields from the search index related to the question.
search_results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    top=1,
    # List form, matching the other query cells in this notebook.
    select=["title", "chunk", "locations"],
)

# One "title:chunk:locations" line per retrieved document.
sources_formatted = "\n".join(
    [f'{document["title"]}:{document["chunk"]}:{document["locations"]}' for document in search_results]
)

# Single user message containing the instructions, the query and the sources.
response = openai_client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": GROUNDED_PROMPT.format(query=query, sources=sources_formatted),
        }
    ],
    model=deployment_name,
)

print(response.choices[0].message.content)

A sweet fruit in the UAE is the date, which is cultivated extensively in the Liwa Oasis located in the emirate of Abu Dhabi.


In [None]:
# Display the formatted source text that was inserted into the grounded prompt.
sources_formatted

"page-117.pdf:L\na\n\nn\nd\n\nE\nA\n\nR\nT\n\nH\n\n110\n\nLiwa Oasis\nUnited Arab Emirates\n\nIn the sandy tan terrain of the United Arab Emirates, on the northern edge of the Rub’ al Khali, an oasis brings green to the desert. \n\nThe T-shaped, 100-kilometer stretch of date plantations and small towns compose the Liwa Oasis, home to about 20,000 people in \n\nthe emirate of Abu Dhabi. It is one of the largest oases on the Arabian Peninsula.\n\nBedouins tapped underground water supplies here at least five centuries ago, and date farms have proliferated. Drip irrigation \n\nand greenhouses now help conserve the precious water supply. Since rainfall is scarce in the region, much of the water comes \n\nfrom aquifers full of “fossil” water that accumulated more than 20,000 years ago and is now buried deep under the sand seas and \n\nlimestone formations.:['United Arab Emirates', 'Rub’ al Khali', 'desert', 'plantations', 'towns', 'Liwa Oasis', 'Abu Dhabi', 'Arabian Peninsula', 'farms', 'gre