In [None]:
# Install the required Python packages listed in the requirements file.
# The `--quiet` flag suppresses the output for a cleaner installation process.
! pip install -r azure-search-vector-python-sample-requirements.txt --quiet

In [3]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os

# Load environment variables from a .env file, overriding existing environment variables if necessary
load_dotenv(override=True)

# The following variables are loaded from your .env file and used in this notebook:
# - AZURE_SEARCH_SERVICE_ENDPOINT: The endpoint of your Azure Cognitive Search service
# - AZURE_SEARCH_ADMIN_KEY: The admin key for your Azure Cognitive Search service
# - AZURE_SEARCH_INDEX: The name of the search index to use (default: "vectest")
# - AZURE_OPENAI_ENDPOINT: The endpoint of your Azure OpenAI service
# - AZURE_OPENAI_KEY: The API key for your Azure OpenAI service
# - AZURE_OPENAI_EMBEDDING_DEPLOYMENT: The deployment name for the embedding model (default: "text-embedding-3-large")
# - AZURE_OPENAI_EMBEDDING_DIMENSIONS: The dimensions of the embedding model (default: 1024)
# - AZURE_OPENAI_API_VERSION: The API version for Azure OpenAI (default: "2024-10-21")

# Azure Cognitive Search service endpoint
endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")

# Credential for Azure Cognitive Search: Use AzureKeyCredential if an admin key is provided, otherwise use DefaultAzureCredential
credential = AzureKeyCredential(os.getenv("AZURE_SEARCH_ADMIN_KEY", "")) if len(os.getenv("AZURE_SEARCH_ADMIN_KEY", "")) > 0 else DefaultAzureCredential()

# Name of the Azure Cognitive Search index
index_name = os.getenv("AZURE_SEARCH_INDEX", "vectest")

# Azure OpenAI service endpoint and API key
azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_openai_key = os.getenv("AZURE_OPENAI_KEY", "") if len(os.getenv("AZURE_OPENAI_KEY", "")) > 0 else None

# Configuration for the embedding model
azure_openai_embedding_deployment = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "text-embedding-3-large")
azure_openai_embedding_dimensions = int(os.getenv("AZURE_OPENAI_EMBEDDING_DIMENSIONS", 1024))
embedding_model_name = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "text-embedding-3-large")
azure_openai_api_version = os.getenv("AZURE_OPENAI_API_VERSION", "2024-10-21")

In [4]:
from openai import AzureOpenAI
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
import json

# Initialize Azure OpenAI credentials
openai_credential = DefaultAzureCredential()

# Token provider for Azure AD authentication (used if API key is not provided)
token_provider = get_bearer_token_provider(openai_credential, "https://cognitiveservices.azure.com/.default")

# Initialize the Azure OpenAI client
client = AzureOpenAI(
    azure_deployment=azure_openai_embedding_deployment,  # Deployment name for the embedding model
    api_version=azure_openai_api_version,               # API version for Azure OpenAI
    azure_endpoint=azure_openai_endpoint,               # Azure OpenAI service endpoint
    api_key=azure_openai_key,                           # API key for Azure OpenAI (if available)
    azure_ad_token_provider=token_provider if not azure_openai_key else None  # Use token provider if API key is not provided
)

# Generate Document Embeddings using OpenAI 3 large
# Read the input data from the text-sample.json file
path = os.path.join('text-sample.json')
with open(path, 'r', encoding='utf-8') as file:
    input_data = json.load(file)

# Extract titles and content from the input data
titles = [item['title'] for item in input_data]
content = [item['content'] for item in input_data]

# Generate embeddings for the titles
title_response = client.embeddings.create(
    input=titles, 
    model=embedding_model_name, 
    dimensions=azure_openai_embedding_dimensions
)
title_embeddings = [item.embedding for item in title_response.data]

# Generate embeddings for the content
content_response = client.embeddings.create(
    input=content, 
    model=embedding_model_name, 
    dimensions=azure_openai_embedding_dimensions
)
content_embeddings = [item.embedding for item in content_response.data]

# Add the generated embeddings to the input data
for i, item in enumerate(input_data):
    title = item['title']
    content = item['content']
    item['titleVector'] = title_embeddings[i]  # Add title embeddings
    item['contentVector'] = content_embeddings[i]  # Add content embeddings

# Output the updated data with embeddings to the docVectors.json file
output_path = os.path.join('output', 'docVectors.json')
output_directory = os.path.dirname(output_path)

# Create the output directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Write the updated data to the output file
with open(output_path, "w") as f:
    json.dump(input_data, f)

In [None]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    SearchIndex,
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters
)

# Initialize the SearchIndexClient to manage the search index
index_client = SearchIndexClient(
    endpoint=endpoint, credential=credential)

# Define the fields for the search index
fields = [
    # Define a simple field for the document ID, which is the key field
    SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
    
    # Define searchable fields for title, content, and category
    SearchableField(name="title", type=SearchFieldDataType.String),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchableField(name="category", type=SearchFieldDataType.String, filterable=True),
    
    # Define vector fields for title and content embeddings
    SearchField(name="titleVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=azure_openai_embedding_dimensions, vector_search_profile_name="myHnswProfile"),
    SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=azure_openai_embedding_dimensions, vector_search_profile_name="myHnswProfile"),
]

# Configure the vector search settings
vector_search = VectorSearch(
    # Define the HNSW (Hierarchical Navigable Small World) algorithm configuration
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw"  # Name of the HNSW algorithm configuration
        )
    ],
    # Define the vector search profile
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",  # Name of the vector search profile
            algorithm_configuration_name="myHnsw",  # Link to the HNSW algorithm configuration
            vectorizer_name="myVectorizer"  # Link to the vectorizer
        )
    ],
    # Define the Azure OpenAI vectorizer
    vectorizers=[
        AzureOpenAIVectorizer(
            vectorizer_name="myVectorizer",  # Name of the vectorizer
            parameters=AzureOpenAIVectorizerParameters(
                resource_url=azure_openai_endpoint,  # Azure OpenAI endpoint
                deployment_name=azure_openai_embedding_deployment,  # Deployment name for the embedding model
                model_name=embedding_model_name,  # Name of the embedding model
                api_key=azure_openai_key  # API key for Azure OpenAI
            )
        )
    ]
)

# Configure the semantic search settings
semantic_config = SemanticConfiguration(
    name="my-semantic-config",  # Name of the semantic configuration
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="title"),  # Prioritize the title field
        keywords_fields=[SemanticField(field_name="category")],  # Prioritize the category field as keywords
        content_fields=[SemanticField(field_name="content")]  # Prioritize the content field
    )
)

# Create the semantic search settings with the configuration
semantic_search = SemanticSearch(configurations=[semantic_config])

# Create the search index with the defined fields, vector search, and semantic search settings
index = SearchIndex(
    name=index_name,  # Name of the search index
    fields=fields,  # Fields defined for the index
    vector_search=vector_search,  # Vector search configuration
    semantic_search=semantic_search  # Semantic search configuration
)

# Create or update the search index in Azure Cognitive Search
result = index_client.create_or_update_index(index)
print(f'{result.name} created')  # Output the name of the created index

In [None]:
from azure.search.documents import SearchClient
import json

# Define the path to the output file containing the documents to be uploaded
output_path = os.path.join('output', 'docVectors.json')

# Ensure the output directory exists
output_directory = os.path.dirname(output_path)
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Load the documents from the output file
with open(output_path, 'r') as file:  
    documents = json.load(file)  

# Initialize the SearchClient to interact with the Azure Cognitive Search index
search_client = SearchClient(
    endpoint=endpoint,  # Azure Cognitive Search service endpoint
    index_name=index_name,  # Name of the search index
    credential=credential  # Credential for authentication
)

# Upload the documents to the Azure Cognitive Search index
result = search_client.upload_documents(documents)

# Print the number of documents uploaded
print(f"Uploaded {len(documents)} documents") 

In [None]:
from azure.search.documents import SearchIndexingBufferedSender

# Load the documents to be uploaded to the Azure Cognitive Search index
# The documents are read from the output file generated earlier
with open(output_path, 'r') as file:  
    documents = json.load(file)  

# Use SearchIndexingBufferedSender to upload the documents in batches
# This approach optimizes the indexing process by handling batching, retries, and error handling automatically
with SearchIndexingBufferedSender(  
    endpoint=endpoint,  # Azure Cognitive Search service endpoint
    index_name=index_name,  # Name of the search index
    credential=credential,  # Credential for authentication
) as batch_client:  
    # Add upload actions for all documents
    # The documents are uploaded in batches to improve performance and reliability
    batch_client.upload_documents(documents=documents)  

# Print the total number of documents uploaded to the index
print(f"Uploaded {len(documents)} documents in total")  

In [1]:
# Helper code to print results

from azure.search.documents import SearchItemPaged

def print_results(results: SearchItemPaged[dict]):
    """
    Prints the results of a search query, including semantic answers, document details, and captions.

    Args:
        results (SearchItemPaged[dict]): The search results returned by the Azure Cognitive Search client.

    The function performs the following:
    - If semantic answers are available, it prints the highlighted text or the answer text along with the score.
    - For each result, it prints the title, search score, reranker score (if available), content, and category.
    - If captions are available, it prints the highlighted caption text or the plain caption text.
    """
    # Retrieve and print semantic answers, if available
    semantic_answers = results.get_answers()
    if semantic_answers:
        for answer in semantic_answers:
            if answer.highlights:
                print(f"Semantic Answer: {answer.highlights}")
            else:
                print(f"Semantic Answer: {answer.text}")
            print(f"Semantic Answer Score: {answer.score}\n")

    # Iterate through the search results and print details
    for result in results:
        print(f"Title: {result['title']}")  
        print(f"Score: {result['@search.score']}")
        if result.get('@search.reranker_score'):
            print(f"Reranker Score: {result['@search.reranker_score']}")
        print(f"Content: {result['content']}")  
        print(f"Category: {result['category']}\n")

        # Print captions, if available
        captions = result["@search.captions"]
        if captions:
            caption = captions[0]
            if caption.highlights:
                print(f"Caption: {caption.highlights}\n")
            else:
                print(f"Caption: {caption.text}\n")

In [5]:
from azure.search.documents.models import VectorizedQuery

# Generate a vectorized query for pure vector search
# The query string represents the search term for which embeddings will be generated
# Uncomment the desired query string to use
#query = "tools for software development"  
query = "Azure Virtual WAN"

# Generate the embedding for the query using the Azure OpenAI client
# The embedding is created based on the specified model and dimensions
embedding = client.embeddings.create(
    input=query, 
    model=embedding_model_name, 
    dimensions=azure_openai_embedding_dimensions
).data[0].embedding

# Create a VectorizedQuery object for vector search
# - `vector`: The embedding generated for the query
# - `k_nearest_neighbors`: The number of nearest neighbors to retrieve (optimal value is 50)
# - `fields`: The vector field in the index to search against
# To learn more about vector ranking, visit: https://learn.microsoft.com/azure/search/vector-search-ranking
vector_query = VectorizedQuery(
    vector=embedding, 
    k_nearest_neighbors=50, 
    fields="contentVector"
)

# Perform the vector search using the SearchClient
# - `search_text`: Set to None for pure vector search
# - `vector_queries`: A list of vectorized queries to execute
# - `select`: Specifies the fields to include in the search results
# - `top`: Limits the number of results returned
results = search_client.search(
    search_text=None,  
    vector_queries=[vector_query],
    select=["title", "content", "category"],
    top=3
)

# Print the search results using the helper function
print_results(results)

NameError: name 'search_client' is not defined