# AI Search - Index (Pull) documents for RAG

### Docs

- https://learn.microsoft.com/en-us/azure/search/search-what-is-azure-search

Basic approaches: push and pull
- https://learn.microsoft.com/en-us/azure/search/search-what-is-data-import
- Note: If AI enrichment (https://learn.microsoft.com/en-us/azure/search/cognitive-search-concept-intro) is a solution requirement, you must use the pull model (indexers) to load an index. Skillsets are attached to an indexer and don't run independently.

### Inspirational sources
- https://github.com/Azure/azure-search-vector-samples/blob/main/demo-python/code/integrated-vectorization/azure-search-integrated-vectorization-sample.ipynb

### Dependencies
- https://learn.microsoft.com/en-us/azure/search/search-api-versions

In [28]:
#! pip install -r requirements.txt

### Global flags (e.g. for debug and development)

### Load .env file (Copy .env-sample to .env and update accordingly)

In [1]:
import os
from dotenv import load_dotenv

# Take environment variables from .env (override=True: values from the file
# replace variables that are already set in the process environment).
load_dotenv(override=True)

from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential

# Azure AI Search settings
endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
# The admin key is optional: when it is unset or empty, fall back to
# DefaultAzureCredential instead of failing with a KeyError.
search_admin_key = os.environ.get("AZURE_SEARCH_ADMIN_KEY", "")
credential = AzureKeyCredential(search_admin_key) if search_admin_key else DefaultAzureCredential()
index_name = os.environ["AZURE_SEARCH_INDEX"]

# Blob Storage settings (source documents for the indexer)
blob_connection_string = os.environ["BLOB_CONNECTION_STRING"]
blob_container_name = os.environ["BLOB_CONTAINER_NAME"]

# Azure OpenAI settings (embedding generation)
azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
azure_openai_key = os.environ["AZURE_OPENAI_KEY"]
azure_openai_embedding_deployment = os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT"]

### Connect to Blob Storage and sync documents
Synchronize documents in the blob storage with local document data. This will delete any documents that are not present locally.

In [2]:
from azure.storage.blob import BlobServiceClient  
import os

# Connect to Blob Storage and make sure the target container exists
blob_service_client = BlobServiceClient.from_connection_string(blob_connection_string)
container_client = blob_service_client.get_container_client(blob_container_name)
if not container_client.exists():
    container_client.create_container()

# Snapshot of the blob names present before the upload pass; anything in
# this snapshot that has no local counterpart is deleted afterwards.
remote_blob_names = list(container_client.list_blob_names())

print(f"Setup sample data in {blob_container_name}...")

documents_directory = os.path.join("data")
local_names = set()  # set: O(1) membership test when computing the stale blobs
for entry in os.listdir(documents_directory):
    local_path = os.path.join(documents_directory, entry)
    # Only regular files can be uploaded; a sub-directory would make open() fail.
    if not os.path.isfile(local_path):
        continue

    name = os.path.basename(entry)
    local_names.add(name)

    # Upload only blobs that do not exist yet; existing blobs are left untouched.
    if not container_client.get_blob_client(name).exists():
        print(f'Uploading: {name}')
        # Open the file only when it is actually uploaded.
        with open(local_path, "rb") as data:
            container_client.upload_blob(name=name, data=data)

# Blobs that were in the container but have no local file are removed.
deprecated_blobs = [blob for blob in remote_blob_names if blob not in local_names]
for _blob_name in deprecated_blobs:
    print(f"Deleting (not found locally): {_blob_name}")
    container_client.delete_blob(_blob_name)

Setup sample data in openai...


### Create a blob data source connection on Azure AI Search

In [3]:
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import SearchIndexerDataContainer, SearchIndexerDataSourceConnection

# Register the blob container as a data source connection on the search service.
indexer_client = SearchIndexerClient(endpoint=endpoint, credential=credential)

# The connection points at the container that holds the synced documents.
container = SearchIndexerDataContainer(name=blob_container_name)
data_source_connection = SearchIndexerDataSourceConnection(
    name=f"{index_name}-blob",
    type="azureblob",
    connection_string=blob_connection_string,
    container=container,
)

# Create the connection, or update it when one with this name already exists.
data_source = indexer_client.create_or_update_data_source_connection(
    data_source_connection
)

print(f"Data source '{data_source.name}' created or updated")

Data source 'testindex150424v4-blob' created or updated


### Create a search index
Vector and nonvector content is stored in a search index. Note the key attribute.

In [32]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    HnswParameters,
    VectorSearchAlgorithmMetric,
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIParameters,
    SemanticConfiguration,
    SemanticSearch,
    SemanticPrioritizedFields,
    SemanticField,
    SearchIndex
)

# Client used to manage index definitions on the search service
index_client = SearchIndexClient(endpoint=endpoint, credential=credential)

# Attributes shared by most string fields: not sortable, filterable or facetable.
plain = dict(sortable=False, filterable=False, facetable=False)

fields = [
    # Used as the parent key field by the index projection of the skillset.
    SearchField(name="id", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True),
    SearchField(name="title", type=SearchFieldDataType.String, sortable=False, filterable=True, facetable=False, searchable=False),
    # SearchField(name="type", type=SearchFieldDataType.Int32, sortable=False, filterable=True, facetable=False, searchable=False),
    SearchField(name="url", type=SearchFieldDataType.String, searchable=False, **plain),
    SearchField(name="filepath", type=SearchFieldDataType.String, searchable=False, **plain),
    # Document key of the index (key=True), analyzed with the keyword analyzer.
    SearchField(name="chunk_id", type=SearchFieldDataType.String, key=True, analyzer_name="keyword", **plain),
    SearchField(name="content", type=SearchFieldDataType.String, searchable=True, **plain),
    # 1536-dimensional vector field bound to the HNSW profile defined below.
    SearchField(
        name="contentVector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        vector_search_dimensions=1536,
        vector_search_profile_name="myHnswProfile",
    ),
    SearchField(name="last_updated", type=SearchFieldDataType.String, searchable=False, **plain),
]

# Note:
# HNSW has several configuration parameters that can be tuned to achieve the
# throughput, latency, and recall objectives for your search application.
# https://learn.microsoft.com/en-us/azure/search/vector-search-ranking#when-to-use-hnsw
hnsw_algorithm = HnswAlgorithmConfiguration(
    name="myHnsw",
    parameters=HnswParameters(
        m=4,
        ef_construction=400,
        ef_search=500,
        metric=VectorSearchAlgorithmMetric.COSINE,
    ),
)

# Note: ExhaustiveKnn is not actually used in the definition of the index fields.
# Exhaustive KNN performs a brute-force search that scans the entire vector space.
# It's intended for scenarios where high recall is of utmost importance, and users
# are willing to accept the trade-offs in search performance. Because it's
# computationally intensive, use exhaustive KNN for small to medium datasets, or
# when precision requirements outweigh query performance considerations.
# https://learn.microsoft.com/en-us/azure/search/vector-search-ranking#when-to-use-exhaustive-knn
exhaustive_knn_algorithm = ExhaustiveKnnAlgorithmConfiguration(
    name="myExhaustiveKnn",
    parameters=ExhaustiveKnnParameters(
        metric=VectorSearchAlgorithmMetric.COSINE,
    ),
)

# Vectorizer that uses the Azure OpenAI embedding deployment from the environment.
openai_vectorizer = AzureOpenAIVectorizer(
    name="myOpenAI",
    kind="azureOpenAI",
    azure_open_ai_parameters=AzureOpenAIParameters(
        resource_uri=azure_openai_endpoint,
        deployment_id=azure_openai_embedding_deployment,
        api_key=azure_openai_key,
    ),
)

# Vector search configuration: each profile pairs one algorithm with the vectorizer.
vector_search = VectorSearch(
    algorithms=[hnsw_algorithm, exhaustive_knn_algorithm],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
            vectorizer="myOpenAI",
        ),
        VectorSearchProfile(
            name="myExhaustiveKnnProfile",
            algorithm_configuration_name="myExhaustiveKnn",
            vectorizer="myOpenAI",
        ),
    ],
    vectorizers=[openai_vectorizer],
)

# Semantic configuration that prioritizes the "content" field.
# https://learn.microsoft.com/en-us/python/api/azure-search-documents/azure.search.documents.indexes.models.semanticconfiguration?view=azure-python-preview
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        content_fields=[SemanticField(field_name="content")]
    ),
)
semantic_search = SemanticSearch(configurations=[semantic_config])

# Delete the current index (for testing purposes only)
index_client.delete_index(index_name)

# Create the search index from the pieces assembled above
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)
result = index_client.create_or_update_index(index)
print(f"{result.name} created")

testindex150424v3 created


### Create a custom skill

In [4]:
from azure.search.documents.indexes.models import WebApiSkill, InputFieldMappingEntry, OutputFieldMappingEntry

# Custom Web API skill definition.
# Reference: https://learn.microsoft.com/en-us/python/api/azure-search-documents/azure.search.documents.indexes.models.webapiskill?view=azure-python

# The skill input "title" is read from /document/metadata_storage_name.
custom_skill_inputs = [
    InputFieldMappingEntry(name="title", source="/document/metadata_storage_name"),
]
# The skill output "type" is stored under the target name "type".
custom_skill_outputs = [
    OutputFieldMappingEntry(name="type", target_name="type"),
]

custom_skill = WebApiSkill(
    name="mindscape-skill",
    description="A custom skill",
    context="/document",
    uri="https://sbdnic-func-prod-weu-aisearch-skill.azurewebsites.net/api/prepare",
    http_method="POST",
    timeout="PT30S",
    batch_size=1,
    degree_of_parallelism=1,
    inputs=custom_skill_inputs,
    outputs=custom_skill_outputs,
)

### Create a skillset
Skills drive integrated vectorization. Text Split provides data chunking. AzureOpenAIEmbedding handles calls to Azure OpenAI, using the connection information you provide in the environment variables. An indexer projection specifies secondary indexes used for chunked data.

In [5]:
from azure.search.documents.indexes.models import (
    SplitSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    AzureOpenAIEmbeddingSkill,
    SearchIndexerIndexProjections,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    IndexProjectionMode,
    SearchIndexerSkillset
)

# Name the skillset after the index it feeds
skillset_name = f"{index_name}-skillset"

# Chunking: split /document/content into pages (maximum length 2000, overlap 500);
# the resulting list is published as "pages" under /document.
split_skill = SplitSkill(
    description="Split skill to chunk documents",
    text_split_mode="pages",
    context="/document",
    maximum_page_length=2000,
    page_overlap_length=500,
    inputs=[InputFieldMappingEntry(name="text", source="/document/content")],
    outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
)

# Embedding: runs in the context of each page and stores the embedding as
# "contentVector" (alternative target name used previously: "vector").
embedding_skill = AzureOpenAIEmbeddingSkill(
    description="Skill to generate embeddings via Azure OpenAI",
    context="/document/pages/*",
    resource_uri=azure_openai_endpoint,
    deployment_id=azure_openai_embedding_deployment,
    api_key=azure_openai_key,
    inputs=[InputFieldMappingEntry(name="text", source="/document/pages/*")],
    outputs=[OutputFieldMappingEntry(name="embedding", target_name="contentVector")],
)

# Index field -> enrichment tree source for every projected chunk.
# For standard meta data see:
# https://learn.microsoft.com/en-us/azure/search/search-howto-indexing-azure-blob-storage
# https://learn.microsoft.com/en-us/azure/search/search-blob-metadata-properties
# Not mapped at the moment: "type" from /document/type (output of the custom skill).
projection_sources = {
    "content": "/document/pages/*",
    "contentVector": "/document/pages/*/contentVector",
    "title": "/document/metadata_storage_name",
    "filepath": "/document/metadata_storage_path",
    "url": "/document/metadata_storage_path",
    "last_updated": "/document/metadata_storage_last_modified",
}

# Project one index document per page; parent documents are skipped.
index_projections = SearchIndexerIndexProjections(
    selectors=[
        SearchIndexerIndexProjectionSelector(
            target_index_name=index_name,
            # Alternative parent key field used previously: "parent_id"
            parent_key_field_name="id",
            source_context="/document/pages/*",
            mappings=[
                InputFieldMappingEntry(name=field_name, source=source_path)
                for field_name, source_path in projection_sources.items()
            ],
        ),
    ],
    parameters=SearchIndexerIndexProjectionsParameters(
        projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
    ),
)

# The custom skill is currently left out; to include it use
# skills=[custom_skill, split_skill, embedding_skill].
skillset = SearchIndexerSkillset(
    name=skillset_name,
    description="Skillset to chunk documents and generating embeddings",
    skills=[split_skill, embedding_skill],
    index_projections=index_projections,
)

# Create the skillset, or update it when one with this name already exists
client = SearchIndexerClient(endpoint=endpoint, credential=credential)
client.create_or_update_skillset(skillset)
print(f"{skillset.name} created")

testindex150424v4-skillset created


### Create an indexer

In [6]:
from azure.search.documents.indexes.models import SearchIndexer #, FieldMapping

# Name the indexer after the index it feeds
indexer_name = f"{index_name}-indexer"

indexer_client = SearchIndexerClient(endpoint=endpoint, credential=credential)

# The indexer ties together data source, skillset and target index.
# No field_mappings are set here: the metadata_storage_name -> title mapping
# (to display the PDF title in the search results) is part of the skillset's
# index projection mappings instead of
# field_mappings=[FieldMapping(source_field_name="metadata_storage_name", target_field_name="title")].
indexer_settings = {
    "name": indexer_name,
    "description": "Indexer to index documents and generate embeddings",
    "skillset_name": skillset_name,
    "target_index_name": index_name,
    "data_source_name": data_source.name,
}
indexer = SearchIndexer(**indexer_settings)

# Delete the indexer first (for testing purposes only), then create it again
indexer_client.delete_indexer(indexer_name)
indexer_result = indexer_client.create_or_update_indexer(indexer)
print(f' {indexer_name} created')

# Run the indexer (if not started right with index creation)
#indexer_client.run_indexer(indexer_name)

 testindex150424v4-indexer created


### Helper functions

In [7]:
import time 

def wait_for_indexer(seconds=15, timeout=None):
    """Poll the indexer until its last run is no longer in progress.

    Uses the module-level ``indexer_client`` and ``indexer_name``. The status
    is fetched once per poll, so the loop condition and the final print are
    based on the same response. While there is no ``last_result`` yet, or its
    status is ``'inProgress'``, a progress line is printed and the function
    sleeps before polling again. The final status is printed on exit.

    Args:
        seconds: Pause between two polls, in seconds.
        timeout: Optional upper bound, in seconds, for the total wait.
            ``None`` (the default) waits indefinitely.

    Raises:
        TimeoutError: If ``timeout`` is set and elapses before the run ends.
    """
    deadline = None if timeout is None else time.monotonic() + timeout

    while True:
        last_result = indexer_client.get_indexer_status(indexer_name).last_result
        if last_result is not None and last_result.status != 'inProgress':
            break

        # Give up instead of looping forever when a time limit was requested.
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Indexer '{indexer_name}' did not finish within {timeout} seconds"
            )

        print('Indexer running...')
        time.sleep(seconds)

    print(last_result.status)

### Wait for indexer to finish

In [8]:
# Wait for the indexer run to finish, polling every 5 seconds
wait_for_indexer(seconds=5)

Indexer running...
success
