This notebook is inspired by Microsoft's notebook on GitHub: [Vector search in Python (Azure AI Search)](https://github.com/Azure/azure-search-vector-samples/blob/main/demo-python/readme.md).

In [None]:
# Install the packages listed in requirements.txt before running this notebook,
# e.g. `%pip install -r requirements.txt`

### Load .env file

In [None]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os

# Read settings from the .env file into the process environment.
# NOTE(review): override=True presumably lets .env values win over variables
# that are already set in the environment — confirm against python-dotenv docs.
load_dotenv(override=True)
env = os.environ

# --- Azure AI Search ---
search_service_name = env["AZURE_SEARCH_SERVICE"]
search_endpoint = "https://" + search_service_name + ".search.windows.net"
credential = AzureKeyCredential(env["AZURE_SEARCH_KEY"])
index_name = env["AZURE_SEARCH_INDEX"]

# --- Azure Blob Storage ---
blob_connection_string = env["BLOB_CONNECTION_STRING"]
blob_container_name = env["BLOB_CONTAINER_NAME"]

# --- Azure OpenAI ---
azure_openai_endpoint = env["AZURE_OPENAI_ENDPOINT"]
azure_openai_key = env["AZURE_OPENAI_KEY"]
azure_openai_embedding_name = env["AZURE_OPENAI_EMBEDDING_NAME"]

## Connect to Blob Storage  
Retrieve documents from Blob Storage. You can use the sample documents in the [documents](../data/documents) folder.  

In [None]:
from azure.storage.blob import BlobServiceClient  
# Build the account-level client from the connection string, then derive a
# client for the configured container from it.
blob_service_client = BlobServiceClient.from_connection_string(conn_str=blob_connection_string)
container_client = blob_service_client.get_container_client(container=blob_container_name)

### (Optional) Running a few tests to verify blob connection

In [None]:
# Connection smoke test: enumerate the containers in the storage account and
# print each container's name together with its metadata.
all_containers = blob_service_client.list_containers(include_metadata=True)
for container_props in all_containers:
    print(container_props.name, container_props.metadata)

## Connect your Blob storage to a data source in Azure AI Search
Makes the data in Blob Storage available to an indexer as a data source.

In [None]:
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (SearchIndexerDataContainer, SearchIndexerDataSourceConnection)
from azure.search.documents.indexes._generated.models import NativeBlobSoftDeleteDeletionDetectionPolicy

# Register the blob container as a data source connection in Azure AI Search.
# The service URL is stored in `search_endpoint` by the configuration cell; the
# previous code referenced an undefined name `endpoint`, which raised NameError
# on a fresh kernel.
indexer_client = SearchIndexerClient(endpoint=search_endpoint, credential=credential)
data_source_connection = SearchIndexerDataSourceConnection(
    name=f"{index_name}-blob",
    type="azureblob",
    connection_string=blob_connection_string,
    container=SearchIndexerDataContainer(name=blob_container_name),
    # NOTE(review): this policy presumably relies on native soft delete being
    # enabled on the storage account (see the TODO at the end of the notebook).
    data_deletion_detection_policy=NativeBlobSoftDeleteDeletionDetectionPolicy()
)
# create_or_update makes this cell safe to re-run.
data_source = indexer_client.create_or_update_data_source_connection(data_source_connection)

print(f"Data source '{data_source.name}' created or updated")

## Create a search index
The index stores the data and makes it searchable with Azure AI Search.

In [None]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchField,
    SimpleField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    HnswParameters,
    VectorSearchAlgorithmMetric,
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIParameters,
    SemanticConfiguration,
    SemanticSearch,
    SemanticPrioritizedFields,
    SemanticField,
    SearchIndex
)

# Client used to create and update the search index.
# Uses `search_endpoint` from the configuration cell; the name `endpoint` used
# previously is never defined in this notebook and raised NameError.
index_client = SearchIndexClient(endpoint=search_endpoint, credential=credential)
def _sortable_filterable_search_field(field_name):
    """Return a String SearchField that is sortable and filterable but not facetable."""
    return SearchField(
        name=field_name,
        type=SearchFieldDataType.String,
        sortable=True,
        filterable=True,
        facetable=False,
    )


def _sortable_filterable_simple_field(field_name):
    """Return a String SimpleField that is sortable and filterable, not facetable, not searchable."""
    return SimpleField(
        name=field_name,
        type=SearchFieldDataType.String,
        sortable=True,
        filterable=True,
        facetable=False,
        searchable=False,
    )


# Blob storage metadata kept on every chunk
_storage_field_names = ("storage_path", "storage_content_type", "storage_last_modified")

# Custom metadata fields: add new names to one of these two tuples
_custom_search_field_names = ("Orgname", "Genorgindex", "Name")
_custom_simple_field_names = (
    "Statusname",
    "Validfromdate",
    "Docmoduletype",
    "Docid",
    "Approver_middlename",
    "Accesslevelid",
    "Approver_lastname",
    "Doctype",
    "Doctypeid",
    "Approver_firstname",
    "Docstatusid",
    "Validity_custmizeddomainname",
    "Validity_customizeddomain_id",
    "Validity_customizeddomain_genindex",
)

fields = [
    # Standard fields
    _sortable_filterable_search_field("parent_id"),
    SearchField(name="title", type=SearchFieldDataType.String),
    # chunk_id is the document key of the index
    SearchField(
        name="chunk_id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=False,
        filterable=False,
        facetable=False,
        analyzer_name="keyword",
    ),
    SearchField(
        name="chunk",
        type=SearchFieldDataType.String,
        sortable=False,
        filterable=False,
        facetable=False,
    ),
    *[_sortable_filterable_simple_field(field_name) for field_name in _storage_field_names],
    # Vector field for embeddings
    SearchField(
        name="vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        vector_search_dimensions=1536,
        vector_search_profile_name="myHnswProfile",
    ),
    # Custom metadata fields
    *[_sortable_filterable_search_field(field_name) for field_name in _custom_search_field_names],
    *[_sortable_filterable_simple_field(field_name) for field_name in _custom_simple_field_names],
]

# Vector search configuration (standard values from the Microsoft example),
# assembled from named parts: two algorithms, one profile per algorithm, and
# a single Azure OpenAI vectorizer shared by both profiles.
cosine_metric = VectorSearchAlgorithmMetric.COSINE
openai_vectorizer_name = "myOpenAI"

hnsw_algorithm = HnswAlgorithmConfiguration(
    name="myHnsw",
    parameters=HnswParameters(
        m=4,
        ef_construction=400,
        ef_search=500,
        metric=cosine_metric,
    ),
)
exhaustive_knn_algorithm = ExhaustiveKnnAlgorithmConfiguration(
    name="myExhaustiveKnn",
    parameters=ExhaustiveKnnParameters(metric=cosine_metric),
)

hnsw_profile = VectorSearchProfile(
    name="myHnswProfile",
    algorithm_configuration_name="myHnsw",
    vectorizer=openai_vectorizer_name,
)
exhaustive_knn_profile = VectorSearchProfile(
    name="myExhaustiveKnnProfile",
    algorithm_configuration_name="myExhaustiveKnn",
    vectorizer=openai_vectorizer_name,
)

openai_vectorizer = AzureOpenAIVectorizer(
    name=openai_vectorizer_name,
    kind="azureOpenAI",
    azure_open_ai_parameters=AzureOpenAIParameters(
        resource_uri=azure_openai_endpoint,
        deployment_id=azure_openai_embedding_name,
        api_key=azure_openai_key,
    ),
)

vector_search = VectorSearch(
    algorithms=[hnsw_algorithm, exhaustive_knn_algorithm],
    profiles=[hnsw_profile, exhaustive_knn_profile],
    vectorizers=[openai_vectorizer],
)
  
# Semantic configuration: the chunk text is the only prioritized content field.
chunk_content_field = SemanticField(field_name="chunk")
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(content_fields=[chunk_content_field]),
)

# Wrap the configuration in the semantic search settings of the index
semantic_search = SemanticSearch(configurations=[semantic_config])

# Assemble the index definition and create it (or update it if it already exists)
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)
result = index_client.create_or_update_index(index)
print(f"{result.name} created")


## Create a skillset
A skillset is used to process and enrich documents before they are indexed.

In [None]:
from azure.search.documents.indexes.models import (
    SplitSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    AzureOpenAIEmbeddingSkill,
    SearchIndexerIndexProjections,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    IndexProjectionMode,
    SearchIndexerSkillset
)

# Name of the skillset, derived from the index name
skillset_name = f"{index_name}-skillset"

# Enrichment-tree path of the chunks ("pages") produced by the split skill
pages_path = "/document/pages/*"

# Split skill: chunks the document content into pages
split_inputs = [InputFieldMappingEntry(name="text", source="/document/content")]
split_outputs = [OutputFieldMappingEntry(name="textItems", target_name="pages")]
split_skill = SplitSkill(
    description="Split skill to chunk documents",
    text_split_mode="pages",
    context="/document",
    maximum_page_length=2000,  # Standard config from Microsoft example.
    page_overlap_length=500,   # Standard config from Microsoft example.
    inputs=split_inputs,
    outputs=split_outputs,
)

# Embedding skill: runs once per page and writes its embedding to "vector"
embedding_inputs = [InputFieldMappingEntry(name="text", source=pages_path)]
embedding_outputs = [OutputFieldMappingEntry(name="embedding", target_name="vector")]
embedding_skill = AzureOpenAIEmbeddingSkill(
    description="Skill to generate embeddings via Azure OpenAI",
    context=pages_path,
    resource_uri=azure_openai_endpoint,
    deployment_id=azure_openai_embedding_name,
    api_key=azure_openai_key,
    inputs=embedding_inputs,
    outputs=embedding_outputs,
)

# Index field name -> blob metadata property it is filled from
storage_metadata_sources = {
    "title": "metadata_storage_name",
    "storage_path": "metadata_storage_path",
    "storage_content_type": "metadata_storage_content_type",
    "storage_last_modified": "metadata_storage_last_modified",
}

# Custom metadata fields: each one is read from /document/<name> and written
# to the index field with the same name. Add new custom fields here.
custom_metadata_names = (
    "Orgname",
    "Statusname",
    "Validfromdate",
    "Genorgindex",
    "Name",
    "Docmoduletype",
    "Docid",
    "Approver_middlename",
    "Accesslevelid",
    "Approver_lastname",
    "Doctype",
    "Doctypeid",
    "Approver_firstname",
    "Docstatusid",
    "Validity_custmizeddomainname",
    "Validity_customizeddomain_id",
    "Validity_customizeddomain_genindex",
)

# Map fields to the index: chunk text and vector first, then storage metadata,
# then the custom metadata fields.
projection_mappings = [
    InputFieldMappingEntry(name="chunk", source="/document/pages/*"),
    InputFieldMappingEntry(name="vector", source="/document/pages/*/vector"),
]
projection_mappings += [
    InputFieldMappingEntry(name=index_field, source=f"/document/{blob_property}")
    for index_field, blob_property in storage_metadata_sources.items()
]
projection_mappings += [
    InputFieldMappingEntry(name=metadata_name, source=f"/document/{metadata_name}")
    for metadata_name in custom_metadata_names
]

# One selector: every page becomes its own document in the target index,
# linked to its source document through parent_id.
chunk_selector = SearchIndexerIndexProjectionSelector(
    target_index_name=index_name,
    parent_key_field_name="parent_id",
    source_context="/document/pages/*",
    mappings=projection_mappings,
)

index_projections = SearchIndexerIndexProjections(
    selectors=[chunk_selector],
    parameters=SearchIndexerIndexProjectionsParameters(
        projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
    ),
)
  
# Combine the split and embedding skills with the index projections.
skillset = SearchIndexerSkillset(
    name=skillset_name,
    description="Skillset to chunk documents and generating embeddings",
    skills=[split_skill, embedding_skill],
    index_projections=index_projections,
)

# Uses `search_endpoint` from the configuration cell; the name `endpoint` used
# previously is never defined in this notebook and raised NameError.
client = SearchIndexerClient(endpoint=search_endpoint, credential=credential)
# create_or_update makes this cell safe to re-run.
client.create_or_update_skillset(skillset)
print(f"{skillset.name} created")


In [None]:
# Display the name of the data source connection; the indexer below refers to it by this name.
data_source.name

## Create an indexer

In [None]:
from datetime import datetime, timezone
from azure.search.documents.indexes.models import (
    SearchIndexer,
    FieldMapping,
    IndexingSchedule
)

# Create an indexer
indexer_name = f"{index_name}-indexer"

indexer = SearchIndexer(
    name=indexer_name,
    description="Indexer to index documents and generate embeddings",
    skillset_name=skillset_name,
    target_index_name=index_name,
    data_source_name=data_source.name,
    # Map the metadata_storage_name field to the title field in the index to display the PDF title in the search results
    field_mappings=[FieldMapping(source_field_name="metadata_storage_name", target_field_name="title")],
    # Schedule the indexer to run every hour, starting now.
    # datetime.utcnow() is deprecated since Python 3.12 and returns a naive
    # datetime; a timezone-aware UTC datetime is passed instead.
    schedule=IndexingSchedule(interval="PT1H", start_time=datetime.now(timezone.utc))
)
# Uses `search_endpoint` from the configuration cell; the name `endpoint` used
# previously is never defined in this notebook and raised NameError.
indexer_client = SearchIndexerClient(endpoint=search_endpoint, credential=credential)
indexer_result = indexer_client.create_or_update_indexer(indexer)

# Run the indexer once right away instead of waiting for the schedule
indexer_client.run_indexer(indexer_name)
print(f' {indexer_name} created')

In [None]:
# TODOs
# - Indexer scheduling
#
# Terraform
# - Enable soft delete for blobs and containers

In [None]:
# Display the search service endpoint built in the configuration cell.
# (`endpoint` was never defined in this notebook; the variable is `search_endpoint`.)
search_endpoint