# Azure Cognitive Search

# Basic Example

In this basic example, we take a Paul Graham essay, split it into chunks, embed it using an OpenAI embedding model, load it into an Azure Cognitive Search index, and then query it.

In [4]:
import logging
import sys

# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# logger = logging.getLogger(__name__)

In [5]:
#!{sys.executable} -m pip install llama-index
#!{sys.executable} -m pip install azure-search-documents==11.4.0b8
#!{sys.executable} -m pip install azure-identity

In [6]:
# import
from llama_index import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    ServiceContext,
    Document,
)
from llama_index.storage.storage_context import StorageContext
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings import LangchainEmbedding
from IPython.display import Markdown, display
from llama_index.vector_stores import CognitiveSearchVectorStore

In [7]:
# set up OpenAI
# The key is collected with getpass (so it is not echoed into the notebook),
# exported as the OPENAI_API_KEY environment variable, and then also assigned
# to openai.api_key.
import os
import getpass

# Export the key before the openai import below.
# NOTE(review): presumably the import is placed after this line on purpose
# (so the library can see the variable) -- confirm before reordering.
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")
import openai

# Set the key on the module explicitly as well, reading it back from the
# environment variable written above.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [8]:
# Azure Cognitive Search connection settings, collected interactively.
from azure.core.credentials import AzureKeyCredential

# Both values are read through getpass, so neither the service name nor the
# key is echoed into the notebook output.
search_service_name = getpass.getpass("Azure Cognitive Search Service Name")
key = getpass.getpass("Azure Cognitive Search Key")

# Endpoint URL derived from the service name.
service_endpoint = f"https://{search_service_name}.search.windows.net"

# Credential object wrapping the key, handed to the search clients later on.
cognitive_search_credential = AzureKeyCredential(key)

# Name of the index used by the manual index helpers and the SearchClient.
index_name = "quickstart"

In [9]:
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SearchIndex,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SearchField,
    SemanticSettings,
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration,
)

from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient

from azure.core.credentials import AzureKeyCredential

from typing import Any

In [10]:
def drop_and_create_index(index_name: str, service_endpoint: str, credential: Any):
    """Recreate the search index ``index_name`` from scratch.

    If the service already lists an index with this name it is deleted first;
    the index is then (re)built by ``create_search_index``.

    Args:
        index_name: Name of the index to drop and recreate.
        service_endpoint: Endpoint URL passed to ``SearchIndexClient``.
        credential: Credential object passed to ``SearchIndexClient``.
    """
    client = SearchIndexClient(endpoint=service_endpoint, credential=credential)
    existing_names = client.list_index_names()

    if index_name in existing_names:
        print(f"Index {index_name} exists, dropping index")
        client.delete_index(index_name)

    # Creation is delegated; it opens its own client from the same arguments.
    create_search_index(index_name, service_endpoint, credential)


def create_search_index(index_name: str, service_endpoint: str, credential: Any):
    """Create the search index ``index_name`` unless the service already has it.

    The index definition contains:

    * ``id`` - string key field,
    * ``content`` - searchable text analysed with ``en.microsoft``,
    * ``embedding`` - searchable vector field with 1536 dimensions,
    * ``li_jsonMetadata`` - string field,
    * ``li_doc_id`` - filterable string field,

    plus one semantic configuration and one HNSW vector-search configuration,
    both named ``default``.

    Args:
        index_name: Name of the index to create.
        service_endpoint: Endpoint URL passed to ``SearchIndexClient``.
        credential: Credential object passed to ``SearchIndexClient``.
    """
    client = SearchIndexClient(endpoint=service_endpoint, credential=credential)

    # Guard clause: nothing to build when the index is already present.
    if index_name in client.list_index_names():
        print(f"Search index {index_name} already exists")
        return

    index_fields = [
        SimpleField(name="id", type="Edm.String", key=True),
        SearchableField(
            name="content", type="Edm.String", analyzer_name="en.microsoft"
        ),
        SearchField(
            name="embedding",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            hidden=False,
            searchable=True,
            filterable=False,
            sortable=False,
            facetable=False,
            vector_search_dimensions=1536,
            # Refers to the vector-search configuration named "default" below.
            vector_search_configuration="default",
        ),
        SimpleField(name="li_jsonMetadata", type="Edm.String"),
        SimpleField(name="li_doc_id", type="Edm.String", filterable=True),
    ]

    # Semantic configuration: "content" is the only prioritised content field.
    semantic = SemanticSettings(
        configurations=[
            SemanticConfiguration(
                name="default",
                prioritized_fields=PrioritizedFields(
                    title_field=None,
                    prioritized_content_fields=[SemanticField(field_name="content")],
                ),
            )
        ]
    )

    # Vector search: a single HNSW configuration using cosine similarity.
    hnsw_parameters = {
        "m": 4,
        "efConstruction": 400,
        "efSearch": 1000,
        "metric": "cosine",
    }
    vector = VectorSearch(
        algorithm_configurations=[
            HnswVectorSearchAlgorithmConfiguration(
                name="default",
                kind="hnsw",
                parameters=hnsw_parameters,
            )
        ]
    )

    definition = SearchIndex(
        name=index_name,
        fields=index_fields,
        semantic_settings=semantic,
        vector_search=vector,
    )
    print(f"Creating {index_name} search index")
    client.create_index(definition)

In [11]:
# drop_and_create_index(index_name=index_name, service_endpoint=service_endpoint, credential=cognitive_search_credential)
#create_search_index(
#    index_name=index_name,
#    service_endpoint=service_endpoint,
#    credential=cognitive_search_credential,
#)

In [12]:
import logging
import sys

# basicConfig() already attaches a StreamHandler writing to sys.stdout to the
# root logger. Adding a second StreamHandler on top of it made every record
# print more than once (and once more for each re-run of this cell), so only
# the basicConfig() call is kept.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [13]:
# set up Azure Cognitive Search
#
# SECURITY: this cell previously embedded a literal service name and admin key.
# Secrets must never be committed to a notebook; the key that was committed
# here should be treated as leaked and rotated in the Azure portal.
#
# The values are now taken from the AZURE_SEARCH_SERVICE_NAME and
# AZURE_SEARCH_ADMIN_KEY environment variables when they are set, and prompted
# for with getpass (no echo) otherwise.
import getpass
import os

from azure.core.credentials import AzureKeyCredential

search_service_name = os.environ.get(
    "AZURE_SEARCH_SERVICE_NAME"
) or getpass.getpass("Azure Cognitive Search Service Name")

key = os.environ.get("AZURE_SEARCH_ADMIN_KEY") or getpass.getpass(
    "Azure Cognitive Search Key"
)

# Credential object wrapping the key, handed to the search clients below.
cognitive_search_credential = AzureKeyCredential(key)

# Endpoint URL derived from the service name.
service_endpoint = f"https://{search_service_name}.search.windows.net"

index_name = "quickstart"

In [14]:
from azure.search.documents.indexes import SearchIndexClient 
from llama_index.vector_stores import CognitiveSearchVectorStore
from llama_index.vector_stores.cogsearch import IndexManagement, MetadataIndexFieldType

# Metadata keys that should be filterable in the index. With a plain list the
# names are used as given; the commented alternative shows the dict form with
# an explicit index field name and type per metadata key.
metadata_fields = ["author", "theme", "director"]
# metadata_fields = {
#     "Subject": ("SchoolSubject", MetadataIndexFieldType.STRING),
#     "Class": ("RoomNumber", MetadataIndexFieldType.INT32),
#     "Building": ("Block", MetadataIndexFieldType.DOUBLE),
# }

# Index-level client for the search service.
index_client = SearchIndexClient(
    endpoint=service_endpoint, credential=cognitive_search_credential
)

# Vector store backed by the "quickstart01" index, requested with
# CREATE_IF_NOT_EXISTS index management. No documents are loaded in this cell.
vector_store = CognitiveSearchVectorStore(
    search_or_index_client=index_client,
    index_name="quickstart01",
    filterable_metadata_field_keys=metadata_fields,
    index_management=IndexManagement.CREATE_IF_NOT_EXISTS,
    # Mapping from the store's logical fields to index field names.
    id_field_key="id",
    chunk_field_key="content",
    embedding_field_key="embedding",
    metadata_string_field_key="li_jsonMetadata",
    doc_id_field_key="li_doc_id",
)



INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'https://llmdevcog001.search.windows.net/indexes?api-version=REDACTED'
Request method: 'GET'
Request headers:
    'api-key': 'REDACTED'
    'Accept': 'application/json;odata.metadata=minimal'
    'x-ms-client-request-id': 'de0a519f-48cc-11ee-96f5-bc091bdb04fb'
    'User-Agent': 'azsdk-python-search-documents/11.4.0b8 Python/3.11.4 (Windows-10-10.0.22621-SP0)'
No body was attached to the request
Request URL: 'https://llmdevcog001.search.windows.net/indexes?api-version=REDACTED'
Request method: 'GET'
Request headers:
    'api-key': 'REDACTED'
    'Accept': 'application/json;odata.metadata=minimal'
    'x-ms-client-request-id': 'de0a519f-48cc-11ee-96f5-bc091bdb04fb'
    'User-Agent': 'azsdk-python-search-documents/11.4.0b8 Python/3.11.4 (Windows-10-10.0.22621-SP0)'
No body was attached to the request
Request URL: 'https://llmdevcog001.search.windows.net/indexes?api-version=REDACTED'
Request method: 'GET'
Request headers:
 

In [15]:
# define embedding function
from llama_index.embeddings import OpenAIEmbedding

# Default-constructed OpenAI embedding model, passed to the ServiceContext below.
embed_model = OpenAIEmbedding()

# load documents
documents = SimpleDirectoryReader(
    "../../../examples/paul_graham_essay/data"
).load_data()

# set up Azure Cognitive Search vector store and load in data
# NOTE(review): search_client is bound to index_name ("quickstart") but is not
# passed to anything in this cell -- the vector store below receives
# index_client and the literal "quickstart01" instead. Confirm which client
# and which index were intended.
search_client = SearchClient(
    endpoint=service_endpoint,
    index_name=index_name,
    credential=cognitive_search_credential,
)

# Same field mapping as the vector store created in the previous cell, but
# with VALIDATE_INDEX index management instead of CREATE_IF_NOT_EXISTS.
# NOTE(review): the recorded output of this cell ends with
# "ValueError: Search client not initialized". The cause is not visible from
# this notebook; check how the store expects its client to be supplied.
vector_store = CognitiveSearchVectorStore(
    search_or_index_client=index_client,
    index_name = "quickstart01",
    filterable_metadata_field_keys = metadata_fields,
    index_management = IndexManagement.VALIDATE_INDEX,
    id_field_key="id",
    chunk_field_key="content",
    embedding_field_key="embedding",
    metadata_string_field_key="li_jsonMetadata",
    doc_id_field_key="li_doc_id",
)

# Build the index from the loaded documents, using the vector store for
# storage and the OpenAI embedding model for embeddings.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
service_context = ServiceContext.from_defaults(embed_model=embed_model)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context, service_context=service_context
)

INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'https://llmdevcog001.search.windows.net/indexes?api-version=REDACTED'
Request method: 'GET'
Request headers:
    'api-key': 'REDACTED'
    'Accept': 'application/json;odata.metadata=minimal'
    'x-ms-client-request-id': 'fbc3e51a-48cc-11ee-9a1d-bc091bdb04fb'
    'User-Agent': 'azsdk-python-search-documents/11.4.0b8 Python/3.11.4 (Windows-10-10.0.22621-SP0)'
No body was attached to the request
Request URL: 'https://llmdevcog001.search.windows.net/indexes?api-version=REDACTED'
Request method: 'GET'
Request headers:
    'api-key': 'REDACTED'
    'Accept': 'application/json;odata.metadata=minimal'
    'x-ms-client-request-id': 'fbc3e51a-48cc-11ee-9a1d-bc091bdb04fb'
    'User-Agent': 'azsdk-python-search-documents/11.4.0b8 Python/3.11.4 (Windows-10-10.0.22621-SP0)'
No body was attached to the request
Request URL: 'https://llmdevcog001.search.windows.net/indexes?api-version=REDACTED'
Request method: 'GET'
Request headers:
 

ValueError: Search client not initialized

In [None]:
storage_context = StorageContext.from_defaults(vector_store=vector_store)
service_context = ServiceContext.from_defaults(embed_model=embed_model)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context, service_context=service_context
)

In [None]:
# Query Data
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")
display(Markdown(f"<b>{response}</b>"))

In [None]:
response = query_engine.query(
    "What did the author learn?",
)
display(Markdown(f"<b>{response}</b>"))

In [None]:
response = query_engine.query("What was a hard moment for the author?")
display(Markdown(f"<b>{response}</b>"))

In [None]:
response = query_engine.query("Who is the author?")
display(Markdown(f"<b>{response}</b>"))

In [None]:
import time

# Stream a response token by token and report the observed throughput.
query_engine = index.as_query_engine(streaming=True)
response = query_engine.query("What happened at interleaf?")

# perf_counter() is the clock intended for measuring short durations; the
# wall clock returned by time.time() can be coarse (notably on Windows) and
# may be adjusted while the loop is running.
start_time = time.perf_counter()

token_count = 0
for token in response.response_gen:
    print(token, end="")
    token_count += 1

time_elapsed = time.perf_counter() - start_time

# Guard the division: an empty or instantaneous stream must not raise
# ZeroDivisionError.
tokens_per_second = token_count / time_elapsed if time_elapsed > 0 else 0.0

print(f"\n\nStreamed output at {tokens_per_second} tokens/s")

# Adding a document

In [None]:
response = query_engine.query("What colour is the sky?")
display(Markdown(f"<b>{response}</b>"))

In [None]:
index.insert_nodes([Document(text="The sky is indigo today")])

In [None]:
response = query_engine.query("What colour is the sky?")
display(Markdown(f"<b>{response}</b>"))

# Filtering

In [None]:
from llama_index.schema import TextNode

# Three sample nodes whose metadata keys ("author", "theme", "director") are
# the same keys listed in metadata_fields when the vector store was created.
# Not every node carries every key: "Inception" has no "theme", and
# "The Shawshank Redemption" has no "director".
nodes = [
    TextNode(
        text="The Shawshank Redemption",
        metadata={
            "author": "Stephen King",
            "theme": "Friendship",
        },
    ),
    TextNode(
        text="The Godfather",
        metadata={
            "director": "Francis Ford Coppola",
            "theme": "Mafia",
        },
    ),
    TextNode(
        text="Inception",
        metadata={
            "director": "Christopher Nolan",
        },
    ),
]

In [None]:
index.insert_nodes(nodes)

In [None]:
from llama_index.vector_stores.types import ExactMatchFilter, MetadataFilters


# Restrict retrieval to nodes whose "theme" metadata is exactly "Mafia".
filters = MetadataFilters(filters=[ExactMatchFilter(key="theme", value="Mafia")])

# The query text mentions Inception, but that node was inserted without a
# "theme" value, so presumably it cannot match the filter above -- the cell
# demonstrates that the metadata filter, not the query text, decides which
# nodes are eligible.
retriever = index.as_retriever(filters=filters)
retriever.retrieve("What is inception about?")

# Appendix

In [None]:
from typing import Any, List, cast, Dict, Callable, Optional, Union

In [None]:
import enum 
class MetadataIndexFieldType(str, enum.Enum):
    """Supported types for metadata fields in an Azure Cognitive Search index.

    The members correspond with the value types supported in a flat metadata
    dictionary. Each member's value is the ``Edm.*`` type name string; because
    the class also derives from ``str``, a member compares equal to that
    string.
    """

    STRING = "Edm.String"
    BOOLEAN = "Edm.Boolean"
    INT32 = "Edm.Int32"
    INT64 = "Edm.Int64"
    DOUBLE = "Edm.Double"


In [None]:
t = MetadataIndexFieldType.BOOLEAN

In [None]:
t.value

In [None]:
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField
)

In [None]:

def get_metadata_to_index_field_dict(
    filterable_metadata_field_keys: Optional[
        Union[
            List[str],
            Dict[str, str],
            Dict[str, tuple[str, MetadataIndexFieldType]],
        ]
    ] = None,
) -> Dict[str, tuple[str, MetadataIndexFieldType]]:
    """Normalise the supported ways of specifying filterable metadata fields.

    Three input forms are accepted:

    * a list of metadata field names - the index field gets the same name and
      the ``STRING`` type;
    * a dict mapping metadata field name to index field name - the index
      field gets the ``STRING`` type;
    * a dict mapping metadata field name to an
      ``(index field name, field type)`` tuple - both are used as supplied.

    Args:
        filterable_metadata_field_keys: Field specification in one of the
            forms above, or ``None``.

    Returns:
        A dict mapping each metadata field name to an
        ``(index field name, field type)`` tuple. ``None`` and any other
        unsupported input type produce an empty dict.
    """
    if isinstance(filterable_metadata_field_keys, list):
        # Index field name equals the metadata field name; default to STRING.
        return {
            metadata_field: (metadata_field, MetadataIndexFieldType.STRING)
            for metadata_field in filterable_metadata_field_keys
        }

    if isinstance(filterable_metadata_field_keys, dict):
        index_field_spec: Dict[str, tuple[str, MetadataIndexFieldType]] = {}
        for metadata_field, spec in filterable_metadata_field_keys.items():
            if isinstance(spec, tuple):
                # Index field name may differ; the field type is as supplied.
                index_field_spec[metadata_field] = (spec[0], spec[1])
            else:
                # Index field name may differ; default to STRING.
                index_field_spec[metadata_field] = (
                    spec,
                    MetadataIndexFieldType.STRING,
                )
        return index_field_spec

    # None or an unsupported type: no filterable metadata fields.
    return {}


def create_metadata_index_fields(
    filterable_metadata_field_keys: Optional[
        Union[
            List[str],
            Dict[str, str],
            Dict[str, tuple[str, MetadataIndexFieldType]],
        ]
    ] = None,
) -> List[SearchField]:
    """Build one filterable index field per metadata field specification.

    The specification is first normalised with
    ``get_metadata_to_index_field_dict``; each resulting
    ``(index field name, field type)`` pair becomes a filterable
    ``SimpleField``.

    Args:
        filterable_metadata_field_keys: Field specification as a list of
            names, a name-to-name dict, or a name-to-``(name, type)`` dict.

    Returns:
        The list of created fields, in specification order.

    Raises:
        ValueError: If a supplied field type is not a valid
            ``MetadataIndexFieldType``.
    """
    index_field_spec = get_metadata_to_index_field_dict(filterable_metadata_field_keys)

    print("Mappings")

    # Use the enum *value* ("Edm.String", "Edm.Int32", ...) as the field type.
    # The member *name* ("STRING", "INT32", ...) is only a Python identifier
    # and is not an EDM type name.
    return [
        SimpleField(
            name=field_name,
            type=MetadataIndexFieldType(field_type).value,
            filterable=True,
        )
        for field_name, field_type in index_field_spec.values()
    ]
    



In [None]:
def print_search_field(s: SearchField):
    """Print one index field as a ``name,type,filterable`` line."""
    line = "{},{},{}".format(s.name, s.type, s.filterable)
    print(line)


In [None]:
index = test_param({"content": ("chunk", MetadataIndexFieldType.INT32), "id": ("id", MetadataIndexFieldType.DOUBLE)})
print("Search Fields")
for i in index:
    print_search_field(i)

In [None]:
test_param(["content", "doc_id"])

In [None]:
index = test_param({"content": "chunk", "id": "id"})
for i in index:
    print_search_field(i)

In [None]:
index = test_param({"content": ("chunk", MetadataIndexFieldType.INT32), "id": ("id", MetadataIndexFieldType.DOUBLE)})
for i in index:
    print_search_field(i)