# Azure AI Search integrated vectorization sample


In [1]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os


In [3]:

load_dotenv(override=True) # take environment variables from .env.

# Variables not used here do not need to be updated in your .env file
endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
credential = AzureKeyCredential(os.environ["AZURE_SEARCH_QUERY_KEY"]) 

index_name = os.environ["AZURE_SEARCH_INDEX"]

In [4]:
import re 
def get_page_number(chunk_id : str): 
    page_re = r'_pages_(\d+)$'

    match = re.search(page_re, chunk_id)
    if match:
        page_number = match.group(1)
        return page_number

## Perform a vector similarity search

This example shows a pure vector search using the vectorizable text query, all you need to do is pass in text and your vectorizer will handle the query vectorization.

If you indexed the health plan PDF file, send queries that ask plan-related questions.

In [4]:
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery

# Pure Vector Search
query = "eye test"
# "Which is more comprehensive, Northwind Health Plus vs Northwind Standard?"  

topk = 5
  
search_client = SearchClient(endpoint, index_name, credential=credential)


vector_fields = ["vector"]

vector_queries = [VectorizableTextQuery(
        text = query,
        k_nearest_neighbors=topk, 
        fields=vector,
        exhaustive=True, 
    ) for vector in vector_fields]


# vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=topk, fields="vector", exhaustive=True)
# Use the below query to pass in the raw vector query instead of the query vectorization
# vector_query = RawVectorQuery(vector=generate_embeddings(query), k_nearest_neighbors=3, fields="vector")
  
# results = search_client.search(  
#     search_text=None,  
#     vector_queries= vector_queries,
#     select=["parent_id", "chunk_id", "chunk","title"],
#     top=topk
# )  



results = search_client.search(
        search_text = query, 
        vector_queries=vector_queries, 
        select=["parent_id", "chunk_id", "chunk","title"], 
        top=topk, 
        include_total_count=True, 
        query_type="semantic",
        semantic_configuration_name="my-semantic-config", 
        query_answer="extractive", 
        query_answer_count=5, 
        query_caption="extractive",
        query_caption_highlight_enabled=True,
    )


for result in results:  
    print(f"title: {result['title']}")
    print(f"parent_id: {result['parent_id']}")  
    print(f"chunk_id: {result['chunk_id']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"Page: {get_page_number(result['chunk_id'])}")
    print(f"search captions: {result['@search.captions'][0].text}")
    print(f"search captions highlights: {result['@search.captions'][0].highlights}")
    
    # print(f"Content: {result['chunk']}")   


title: Northwind_Health_Plus_Benefits_Details.pdf
parent_id: aHR0cHM6Ly9nb2R6aWxsYXN0b3JhZ2UuYmxvYi5jb3JlLndpbmRvd3MubmV0L2ludGVncmF0ZWR2ZWN0b3ItZG9jcy9Ob3J0aHdpbmRfSGVhbHRoX1BsdXNfQmVuZWZpdHNfRGV0YWlscy5wZGY1
chunk_id: 06b081b451f3_aHR0cHM6Ly9nb2R6aWxsYXN0b3JhZ2UuYmxvYi5jb3JlLndpbmRvd3MubmV0L2ludGVncmF0ZWR2ZWN0b3ItZG9jcy9Ob3J0aHdpbmRfSGVhbHRoX1BsdXNfQmVuZWZpdHNfRGV0YWlscy5wZGY1_pages_104
Score: 0.013698630034923553
Page: 104
search captions: This includes:    • The patient’s diagnosis   • The proposed treatment   • The anticipated duration of the treatment   • Any other relevant information that may be requested by Northwind Health     It is also important to understand that prior authorization is not a guarantee of payment..  Maternity services   • Services and supplies related to dia...
search captions highlights: This includes:    • The patient’s diagnosis   • The proposed<em> treatment</em>   • The anticipated duration of the<em> treatment</em>   • Any other relevant information t

In [5]:
print(result.keys())

dict_keys(['chunk', 'title', 'parent_id', 'chunk_id', '@search.score', '@search.reranker_score', '@search.highlights', '@search.captions'])


In [6]:
search_caption = result["@search.captions"]

In [7]:
for sc in search_caption: 
    print(sc)

{'additional_properties': None, 'text': 'The plan also covers preventive care services, such as vaccinations and screenings Laboratory Tests: Northwind Health Plus covers laboratory tests prescribed by a healthcare   provider This includes blood tests, urine tests, and other tests to diagnose and treat   illnesses and injuries Imaging Services: Northwind Health Plus covers imaging service...', 'highlights': 'The plan also covers preventive care services, such as vaccinations and screenings Laboratory<em> Tests:</em> Northwind Health Plus covers laboratory<em> tests</em> prescribed by a healthcare   provider This includes blood<em> tests,</em> urine<em> tests,</em> and other<em> tests</em> to diagnose and treat   illnesses and injuries Imaging Services: Northwind Health Plus covers imaging service...'}


In [8]:
print(sc.text)
print(sc.highlights)
print(sc.additional_properties)
sc.as_dict().keys()

The plan also covers preventive care services, such as vaccinations and screenings Laboratory Tests: Northwind Health Plus covers laboratory tests prescribed by a healthcare   provider This includes blood tests, urine tests, and other tests to diagnose and treat   illnesses and injuries Imaging Services: Northwind Health Plus covers imaging service...
The plan also covers preventive care services, such as vaccinations and screenings Laboratory<em> Tests:</em> Northwind Health Plus covers laboratory<em> tests</em> prescribed by a healthcare   provider This includes blood<em> tests,</em> urine<em> tests,</em> and other<em> tests</em> to diagnose and treat   illnesses and injuries Imaging Services: Northwind Health Plus covers imaging service...
None


dict_keys(['text', 'highlights'])

## Perform a hybrid search

In [9]:
# Hybrid Search
query = "eye test" #"Which is more comprehensive, Northwind Health Plus vs Northwind Standard?"  
  
search_client = SearchClient(endpoint, index_name, credential=credential)
vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=1, fields="vector", exhaustive=True)
  
results = search_client.search(  
    search_text=query,  
    vector_queries= [], #[vector_query],
    select=["parent_id", "chunk_id", "chunk", "title"],

    # top=1
)  
  

for result in results:  
    # print(f"{len(result['parent_id'])} : {len(result['chunk_id'])} ")
    if result["parent_id"] == None:
        print(f"chunk_id: {result['chunk_id']}")  
        print(f"Score: {result['@search.score']}")  
        
    print(f"title: {result['title']}")
    print(f"parent_id: {result['parent_id']}")  
    print(f"chunk_id: {result['chunk_id']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"Page: {get_page_number(result['chunk_id'])}")

print(result.keys())

title: Enterprise AI Planning.pdf
parent_id: aHR0cHM6Ly9nb2R6aWxsYXN0b3JhZ2UuYmxvYi5jb3JlLndpbmRvd3MubmV0L2ludGVncmF0ZWR2ZWN0b3ItZG9jcy9FbnRlcnByaXNlJTIwQUklMjBQbGFubmluZy5wZGY1
chunk_id: 8774bc85b15b_aHR0cHM6Ly9nb2R6aWxsYXN0b3JhZ2UuYmxvYi5jb3JlLndpbmRvd3MubmV0L2ludGVncmF0ZWR2ZWN0b3ItZG9jcy9FbnRlcnByaXNlJTIwQUklMjBQbGFubmluZy5wZGY1_pages_232
Score: 4.622714
Page: 232
title: Enterprise AI Planning.pdf
parent_id: aHR0cHM6Ly9nb2R6aWxsYXN0b3JhZ2UuYmxvYi5jb3JlLndpbmRvd3MubmV0L2ludGVncmF0ZWR2ZWN0b3ItZG9jcy9FbnRlcnByaXNlJTIwQUklMjBQbGFubmluZy5wZGY1
chunk_id: 8774bc85b15b_aHR0cHM6Ly9nb2R6aWxsYXN0b3JhZ2UuYmxvYi5jb3JlLndpbmRvd3MubmV0L2ludGVncmF0ZWR2ZWN0b3ItZG9jcy9FbnRlcnByaXNlJTIwQUklMjBQbGFubmluZy5wZGY1_pages_128
Score: 4.443231
Page: 128
title: Enterprise AI Planning.pdf
parent_id: aHR0cHM6Ly9nb2R6aWxsYXN0b3JhZ2UuYmxvYi5jb3JlLndpbmRvd3MubmV0L2ludGVncmF0ZWR2ZWN0b3ItZG9jcy9FbnRlcnByaXNlJTIwQUklMjBQbGFubmluZy5wZGY1
chunk_id: 8774bc85b15b_aHR0cHM6Ly9nb2R6aWxsYXN0b3JhZ2UuYmxvYi5jb3JlLndpbmRvd3MubmV

In [10]:
import re

page_re = r'_pages_(\d+)$'

results = search_client.search(  
    search_text=query,  
    vector_queries= [], #[vector_query],
    select=["parent_id", "chunk_id", "chunk","title"],
    # filter="parent_id ne null", # retrieves chunks only
    filter="parent_id eq null", # retrieves parent documents only

    # top=1
)  

for result in results:  
    print(f"chunk_id: {result['chunk_id']}")  
    print(f"Score: {result['@search.score']}") 
    # print(f"Content: {result['chunk']}") 
    print(f"Title: {result['title']}")
    match = re.search(page_re, result['chunk_id'])
    if match:
        page_number = match.group(1)
        print(f"Page: {page_number}")
     
print(result.keys())

dict_keys(['chunk', 'title', 'parent_id', 'chunk_id', '@search.score', '@search.reranker_score', '@search.highlights', '@search.captions'])


In [11]:
# for result in results: 
#     print(result['parent_id'])

parent_ids = [result['parent_id'] for result in results]

In [12]:
len(parent_ids)

0

In [13]:
print(result.keys())
print(result)

dict_keys(['chunk', 'title', 'parent_id', 'chunk_id', '@search.score', '@search.reranker_score', '@search.highlights', '@search.captions'])
{'chunk': 'explanation of the services \n\nand treatments you are receiving, as well as the medical necessity for them.  \n\n• Make sure that all documentation is complete, including all required forms, test results, \n\nand physician’s notes.  \n\n• Send the documentation to Northwind Health as soon as possible.  \n\n• Follow up with Northwind Health to ensure that your claim has been received and is being \n\nprocessed.  \n\nBy following these steps and providing any necessary evidence of medical necessity, you \n\ncan help to ensure that your claim is processed quickly and accurately. \n\nThe Group And You \n\nOTHER INFORMATION ABOUT THIS PLAN \n\nThe Group and You \n\nThe Northwind Standard plan is a group plan, meaning that it is offered to a group of people \n\nand not to individuals. The group includes all eligible employees of Contoso and t

## Perform a hybrid search + semantic reranking

In [15]:
from azure.search.documents.models import (
    QueryType,
    QueryCaptionType,
    QueryAnswerType
)
# Semantic Hybrid Search
query = "Which is more comprehensive, Northwind Health Plus vs Northwind Standard?"

search_client = SearchClient(endpoint, index_name, credential)
vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=1, fields="vector", exhaustive=True)

results = search_client.search(  
    search_text=query,
    vector_queries=[vector_query],
    select=["parent_id", "chunk_id", "chunk", 'title'],
    query_type=QueryType.SEMANTIC,
    semantic_configuration_name='my-semantic-config',
    query_caption=QueryCaptionType.EXTRACTIVE,
    query_answer=QueryAnswerType.EXTRACTIVE,
    top=1
)

semantic_answers = results.get_answers()
if semantic_answers:
    for answer in semantic_answers:
        if answer.highlights:
            print(f"Semantic Answer: {answer.highlights}")
        else:
            print(f"Semantic Answer: {answer.text}")
        print(f"Semantic Answer Score: {answer.score}\n")

for result in results:
    print(f"parent_id: {result['parent_id']}")  
    print(f"chunk_id: {result['chunk_id']}")  
    print(f"filename: {result['title']}")
    print(f"Reranker Score: {result['@search.reranker_score']}")
    print(f"Content: {result['chunk']}")  

    captions = result["@search.captions"]
    if captions:
        caption = captions[0]
        if caption.highlights:
            print(f"Caption: {caption.highlights}\n")
        else:
            print(f"Caption: {caption.text}\n")


Semantic Answer: <em>Northwind Standard</em> only offers coverage for doctor visits and lab   tests<em> Northwind Health</em> Plus is a comprehensive plan that offers more coverage than<em> Northwind</em> Standard Northwind Health Plus offers coverage for emergency services, mental health and substance abuse   coverage, and out-of-network services, while Northwind Standard does not Northwind...
Semantic Answer Score: 0.9033203125

parent_id: aHR0cHM6Ly9nb2R6aWxsYXN0b3JhZ2UuYmxvYi5jb3JlLndpbmRvd3MubmV0L2ludGVncmF0ZWR2ZWN0b3ItZG9jcy9CZW5lZml0X09wdGlvbnMucGRm0
chunk_id: 9bda6a7aeb27_aHR0cHM6Ly9nb2R6aWxsYXN0b3JhZ2UuYmxvYi5jb3JlLndpbmRvd3MubmV0L2ludGVncmF0ZWR2ZWN0b3ItZG9jcy9CZW5lZml0X09wdGlvbnMucGRm0_pages_1
filename: Benefit_Options.pdf
Reranker Score: 3.938725471496582
Content: care services, as well as prescription drug coverage. With 

Northwind Standard, you can choose from a variety of in-network providers, including primary care 

physicians, specialists, hospitals, and pharmacies. T