# AI Search - Create Index for RAG

### Docs

- https://learn.microsoft.com/en-us/azure/search/search-what-is-azure-search

### Dependencies
- https://learn.microsoft.com/en-us/azure/search/search-api-versions

In [28]:
#! pip install -r requirements.txt

### Global flags (e.g. for debug and development)

### Load .env file (Copy .env-sample to .env and update accordingly)

In [1]:
import os

from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential
from dotenv import load_dotenv

# Take environment variables from .env. override=True is passed so that values
# from the file replace variables already set in the process environment.
load_dotenv(override=True)

# Azure AI Search endpoint (required).
endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]

# Authenticate with the admin key when one is configured; otherwise fall back
# to DefaultAzureCredential. The key is read with .get() so that a missing
# variable selects the fallback instead of raising KeyError, exactly like an
# empty value does.
search_admin_key = os.environ.get("AZURE_SEARCH_ADMIN_KEY", "")
credential = (
    AzureKeyCredential(search_admin_key)
    if search_admin_key
    else DefaultAzureCredential()
)

# Name of the search index to (re)create (required).
index_name = os.environ["AZURE_SEARCH_INDEX"]

# Blob storage settings (required). They are only read here; the cells visible
# in this section do not use them.
blob_connection_string = os.environ["BLOB_CONNECTION_STRING"]
blob_container_name = os.environ["BLOB_CONTAINER_NAME"]

# Azure OpenAI settings consumed by the index's vectorizer configuration.
azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
azure_openai_key = os.environ["AZURE_OPENAI_KEY"]
azure_openai_embedding_deployment = os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT"]

### Create a search index
Both vector and non-vector content are stored in a search index. Note the `key=True` attribute on the `chunk_id` field, which marks it as the document key.

In [2]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    HnswParameters,
    VectorSearchAlgorithmMetric,
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIParameters,
    SemanticConfiguration,
    SemanticSearch,
    SemanticPrioritizedFields,
    SemanticField,
    SearchIndex
)

# Client used for index management calls against the search service.
index_client = SearchIndexClient(endpoint=endpoint, credential=credential)

# Attribute flags shared by the string fields that are not sortable,
# filterable, facetable or full-text searchable.
_plain_string_flags = {
    "sortable": False,
    "filterable": False,
    "facetable": False,
    "searchable": False,
}

# Index schema. "chunk_id" is the document key, "content" is the only field
# explicitly marked searchable, and "contentVector" is the 1536-dimensional
# vector field bound to the "myHnswProfile" vector search profile.
fields = [
    SearchField(
        name="id",
        type=SearchFieldDataType.String,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    SearchField(
        name="title",
        type=SearchFieldDataType.String,
        sortable=False,
        filterable=True,
        facetable=False,
        searchable=False,
    ),
    # Disabled field, kept for reference:
    # SearchField(name="type", type=SearchFieldDataType.Int32, sortable=False, filterable=True, facetable=False, searchable=False),
    SearchField(name="url", type=SearchFieldDataType.String, **_plain_string_flags),
    SearchField(name="filepath", type=SearchFieldDataType.String, **_plain_string_flags),
    SearchField(
        name="chunk_id",
        type=SearchFieldDataType.String,
        sortable=False,
        filterable=False,
        facetable=False,
        key=True,
        analyzer_name="keyword",
    ),
    SearchField(
        name="content",
        type=SearchFieldDataType.String,
        sortable=False,
        filterable=False,
        facetable=False,
        searchable=True,
    ),
    SearchField(
        name="contentVector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        vector_search_dimensions=1536,
        vector_search_profile_name="myHnswProfile",
    ),
    SearchField(name="last_updated", type=SearchFieldDataType.String, **_plain_string_flags),
]
  
# Vector search configuration: two algorithms, one profile per algorithm, and
# a single Azure OpenAI vectorizer shared by both profiles.

# HNSW has several configuration parameters that can be tuned to achieve the
# throughput, latency, and recall objectives for your search application.
# https://learn.microsoft.com/en-us/azure/search/vector-search-ranking#when-to-use-hnsw
hnsw_algorithm = HnswAlgorithmConfiguration(
    name="myHnsw",
    parameters=HnswParameters(
        m=4,
        ef_construction=400,
        ef_search=500,
        metric=VectorSearchAlgorithmMetric.COSINE,
    ),
)

# Note: the exhaustive KNN profile is declared here but is not referenced by
# the index's vector field, which uses "myHnswProfile".
# Exhaustive KNN performs a brute-force search that scans the entire vector
# space. It's intended for scenarios where high recall is of utmost importance
# and users are willing to accept the trade-offs in search performance.
# Because it's computationally intensive, use exhaustive KNN for small to
# medium datasets, or when precision requirements outweigh query performance
# considerations.
# https://learn.microsoft.com/en-us/azure/search/vector-search-ranking#when-to-use-exhaustive-knn
exhaustive_knn_algorithm = ExhaustiveKnnAlgorithmConfiguration(
    name="myExhaustiveKnn",
    parameters=ExhaustiveKnnParameters(
        metric=VectorSearchAlgorithmMetric.COSINE,
    ),
)

# Each profile pairs one algorithm configuration with the "myOpenAI" vectorizer.
hnsw_profile = VectorSearchProfile(
    name="myHnswProfile",
    algorithm_configuration_name="myHnsw",
    vectorizer="myOpenAI",
)
exhaustive_knn_profile = VectorSearchProfile(
    name="myExhaustiveKnnProfile",
    algorithm_configuration_name="myExhaustiveKnn",
    vectorizer="myOpenAI",
)

# Vectorizer pointing at the Azure OpenAI embedding deployment from the environment.
openai_vectorizer = AzureOpenAIVectorizer(
    name="myOpenAI",
    kind="azureOpenAI",
    azure_open_ai_parameters=AzureOpenAIParameters(
        resource_uri=azure_openai_endpoint,
        deployment_id=azure_openai_embedding_deployment,
        api_key=azure_openai_key,
    ),
)

vector_search = VectorSearch(
    algorithms=[hnsw_algorithm, exhaustive_knn_algorithm],
    profiles=[hnsw_profile, exhaustive_knn_profile],
    vectorizers=[openai_vectorizer],
)
  
# Semantic configuration reference:
# https://learn.microsoft.com/en-us/python/api/azure-search-documents/azure.search.documents.indexes.models.semanticconfiguration?view=azure-python-preview

# Only the "content" field is listed as a prioritized content field.
prioritized_fields = SemanticPrioritizedFields(
    content_fields=[SemanticField(field_name="content")],
)
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=prioritized_fields,
)

# Wrap the single configuration in the semantic search settings for the index.
semantic_search = SemanticSearch(configurations=[semantic_config])

# Drop the existing index first so every run starts from a clean schema.
# For testing purposes only: this also discards the index's current contents.
# NOTE(review): confirm how delete_index behaves when the index does not exist yet.
index_client.delete_index(index_name)

# Assemble the index definition from the schema, vector and semantic settings,
# then create it (or update it if it already exists) on the service.
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)
result = index_client.create_or_update_index(index)
print(f"{result.name} created")

testindex150424v4 created
