# Azure Cognitive Search Vector Search Code Sample with Azure OpenAI
This notebook demonstrates how to use Azure Cognitive Search vector search with Azure OpenAI embeddings through the Azure Python SDK.
## Prerequisites
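To run the code, install the following packages. Please use the latest pre-release version of the search SDK: `pip install azure-search-documents --pre`. The notebook also imports `openai`, `python-dotenv`, `tenacity` and `langchain`, so these must be installed as well.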

In [2]:
! pip install azure-search-documents --pre



## Import required libraries and environment variables

In [3]:
# Import required libraries
import os
import json
import openai
from dotenv import load_dotenv
from tenacity import retry, wait_random_exponential, stop_after_attempt
# from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import Vector
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SearchIndex,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SearchField,
    SemanticSettings,
    VectorSearch,
    VectorSearchAlgorithmConfiguration,
)

ModuleNotFoundError: No module named 'azure.search'

In [None]:
# Configure environment variables
# AzureKeyCredential is imported in this cell because the corresponding import
# in the first cell is commented out; without it the credential line at the
# bottom of this cell fails with a NameError.
from azure.core.credentials import AzureKeyCredential

# Read a local .env file (if present) into the process environment
load_dotenv()

# Azure Cognitive Search settings
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")
key = os.getenv("AZURE_SEARCH_ADMIN_KEY")

# Azure OpenAI settings
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_DEPLOYMENT_ENDPOINT = os.getenv("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_DEPLOYMENT_NAME = os.getenv("OPENAI_DEPLOYMENT_NAME")
OPENAI_MODEL_NAME = os.getenv("OPENAI_MODEL_NAME")
OPENAI_DEPLOYMENT_VERSION = os.getenv("OPENAI_DEPLOYMENT_VERSION")

OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.getenv(
    "OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")
OPENAI_ADA_EMBEDDING_MODEL_NAME = os.getenv("OPENAI_ADA_EMBEDDING_MODEL_NAME")

# Configure OpenAI API
# These are module-level settings on the openai package, so every later
# openai.* call in the notebook uses the Azure endpoint, version and key.
openai.api_type = "azure"
openai.api_version = OPENAI_DEPLOYMENT_VERSION
openai.api_base = OPENAI_DEPLOYMENT_ENDPOINT
openai.api_key = OPENAI_API_KEY

# Credential built from the search admin key; passed to SearchIndexClient and
# SearchClient in the cells below.
credential = AzureKeyCredential(key)

## Create embeddings
Read your data, generate OpenAI embeddings, and export them in a format that can be inserted into your Azure Cognitive Search index:

In [None]:
# Smoke-test the LangChain embedding wrapper on a short sample string.
# NOTE(review): chunk_size=1 presumably limits each request to a single input,
# as some Azure OpenAI embedding deployments required - confirm.
sample_text = "transform to vec"
embeddingmodel = OpenAIEmbeddings(
    chunk_size=1,
    model=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
)
vec = embeddingmodel.embed_query(sample_text)
# Bare expression so the notebook displays the resulting vector
vec

In [None]:
# Generate Document Embeddings using OpenAI Ada 002

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(page, engine="text-embedding-ada-002"):
    """Return the embedding vector for a piece of text.

    Used for the title and content fields of each document and for query
    text. A failing call is retried with randomized exponential backoff
    (waits bounded between 1 and 20 seconds) and abandoned after 6 attempts.

    Args:
        page: The text to embed.
        engine: Engine name passed to ``openai.Embedding.create``. With
            ``openai.api_type = "azure"`` this is the name of the embedding
            deployment, so pass your own deployment name if it is not
            ``"text-embedding-ada-002"``.

    Returns:
        The ``embedding`` entry of the first item in the response's ``data``
        list.
    """
    response = openai.Embedding.create(input=page, engine=engine)
    return response['data'][0]['embedding']

## Prepare data for loading into Azure Cognitive Search

In [None]:
doc_title = "Semantic Kernel"
# load pdf and split into pages
fileName = "../data/semantic-kernel.pdf"
loader = PyPDFLoader(fileName)
pages = loader.load_and_split()
print("Number of pages: ", len(pages))

# Every document carries the same title, so embed it once up front instead of
# sending an identical embedding request for each page.
title_vector = generate_embeddings(doc_title)

doc_with_vector_list = []
doc_id = 0
# Build one search document per page: the shared title fields plus the page's
# own text and its embedding. The keys match the index field names.
for page in pages:
    page_with_vector = {
        'id': str(doc_id),  # the index key field is a string
        'title': doc_title,
        'titleVector': title_vector,
        'content': page.page_content,
        'contentVector': generate_embeddings(page.page_content),
    }
    doc_with_vector_list.append(page_with_vector)
    doc_id += 1

# Write the documents (text and embeddings) to sk_Vectors.json so the upload
# cell can read them back
with open("./sk_Vectors.json", "w") as f:
    json.dump(doc_with_vector_list, f)

## Create search index
Create your search index schema and vector search configuration:

In [None]:
# Create a search index
# Note: the Cognitive Search resource must already exist, and its endpoint and
# admin key must be available, before this cell runs
index_client = SearchIndexClient(
    credential=credential, endpoint=service_endpoint)

# Settings shared by both vector fields: a collection of single-precision
# floats with 1536 dimensions, tied to the algorithm configuration named below
vector_config_name = "sk-vector-config"
vector_field_options = {
    "type": SearchFieldDataType.Collection(SearchFieldDataType.Single),
    "searchable": True,
    "vector_search_dimensions": 1536,
    "vector_search_configuration": vector_config_name,
}

fields = [
    # doc id - mandatory key field
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    # title text and its embedding
    SearchableField(
        name="title",
        type=SearchFieldDataType.String,
        filterable=True,
    ),
    SearchField(name="titleVector", **vector_field_options),
    # content text and its embedding
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchField(name="contentVector", **vector_field_options),
]

# Hierarchical Navigable Small World (HNSW) is a graph algorithm for
# approximate nearest neighbor search in high-dimensional spaces.
hnsw_parameters = {
    # graph connectivity: limit on the edges kept per node
    "m": 4,
    # build-time setting: higher values usually improve graph quality and
    # recall but slow down index construction
    "efConstruction": 400,
    # query-time setting: higher values usually improve recall but make
    # searches slower
    "efSearch": 500,
    # similarity metric used to compare vectors
    "metric": "cosine",
}
hnsw_config = VectorSearchAlgorithmConfiguration(
    name=vector_config_name,
    kind="hnsw",
    hnsw_parameters=hnsw_parameters,
)
vector_search = VectorSearch(algorithm_configurations=[hnsw_config])

# Semantic configuration: "title" is the title field and "content" is the
# prioritized content field
title_semantic_field = SemanticField(field_name="title")
content_semantic_field = SemanticField(field_name="content")
semantic_config = SemanticConfiguration(
    name="sk-semantic-config",
    prioritized_fields=PrioritizedFields(
        title_field=title_semantic_field,
        prioritized_content_fields=[content_semantic_field],
    ),
)

# Wrap the configuration in the semantic settings object the index expects
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Assemble the index definition and create it (or update it if it exists)
index = SearchIndex(
    name="sk-cogsrch-vector-index-2",
    fields=fields,
    vector_search=vector_search,
    semantic_settings=semantic_settings,
)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

## Insert text and embeddings into vector store
Add texts and metadata from the JSON data to the vector store:

In [None]:
# Read the documents exported to sk_Vectors.json and push them into the index
with open('./sk_Vectors.json', 'r') as vectors_file:
    documents = json.load(vectors_file)

search_client = SearchClient(
    endpoint=service_endpoint,
    index_name="sk-cogsrch-vector-index-2",
    credential=credential,
)
result = search_client.upload_documents(documents=documents)
# Reports how many documents were sent; the returned result is not inspected
print(f"Uploaded {len(documents)} documents")

## Perform a vector similarity search

In [None]:
# Pure vector search: no keyword text, only the embedded query is matched
# against the contentVector field
query = "semantic kernel?"

search_client = SearchClient(
    endpoint=service_endpoint,
    index_name="sk-cogsrch-vector-index-2",
    credential=credential,
)

# Embed the query with the same function used for the documents
query_vector = generate_embeddings(query)
results = search_client.search(
    search_text=None,
    vector=query_vector,
    top_k=3,
    vector_fields="contentVector",
    select=["title", "content"],
)

# Print title, score and content of every hit, one per line
for result in results:
    print(
        f"Title: {result['title']}",
        f"Score: {result['@search.score']}",
        f"Content: {result['content']}",
        sep="\n",
    )

In [None]:

# Pure vector search with a second query, again against contentVector
query = "semantic kernel planner and kernel"

search_client = SearchClient(
    endpoint=service_endpoint,
    index_name="sk-cogsrch-vector-index-2",
    credential=credential,
)

# Embed the query with the same function used for the documents
query_vector = generate_embeddings(query)
results = search_client.search(
    search_text=None,
    vector=query_vector,
    top_k=3,
    vector_fields="contentVector",
    select=["title", "content"],
)

# Print title, score and content of every hit, one per line
for result in results:
    print(
        f"Title: {result['title']}",
        f"Score: {result['@search.score']}",
        f"Content: {result['content']}",
        sep="\n",
    )

In [None]:
# Pure vector search, multi-lingual: the query is written in Spanish and is
# embedded and matched against contentVector like the queries above
query = "Planificador semántico del kernel y kernel"

search_client = SearchClient(
    endpoint=service_endpoint,
    index_name="sk-cogsrch-vector-index-2",
    credential=credential,
)

# Embed the query with the same function used for the documents
query_vector = generate_embeddings(query)
results = search_client.search(
    search_text=None,
    vector=query_vector,
    top_k=3,
    vector_fields="contentVector",
    select=["title", "content"],
)

# Print title, score and content of every hit, one per line
for result in results:
    print(
        f"Title: {result['title']}",
        f"Score: {result['@search.score']}",
        f"Content: {result['content']}",
        sep="\n",
    )

## Perform Cross-Field Vector Search with a filter

In [None]:
# Pure vector search with a filter, across two vector fields
query = "programming languages supported by semantic kernel"

search_client = SearchClient(
    endpoint=service_endpoint,
    index_name="sk-cogsrch-vector-index-2",
    credential=credential,
)

# Embed the query with the same function used for the documents
query_vector = generate_embeddings(query)
results = search_client.search(
    search_text=None,
    vector=query_vector,
    top_k=3,
    # searching on two vector fields: title and content embeddings
    vector_fields="titleVector, contentVector",
    # only documents whose title equals 'Semantic Kernel' are considered
    filter="title eq 'Semantic Kernel'",
    select=["title", "content"],
)

# Print title, score and content of every hit, one per line
for result in results:
    print(
        f"Title: {result['title']}",
        f"Score: {result['@search.score']}",
        f"Content: {result['content']}",
        sep="\n",
    )

## Perform a Hybrid Search

In [None]:
# Hybrid search: the same query is sent both as keyword text (search_text)
# and as an embedding (vector)
query = "semantic kernel planner and kernel"

search_client = SearchClient(
    endpoint=service_endpoint,
    index_name="sk-cogsrch-vector-index-2",
    credential=credential,
)

# Embed the query with the same function used for the documents
query_vector = generate_embeddings(query)
results = search_client.search(
    search_text=query,
    vector=query_vector,
    top_k=3,
    vector_fields="contentVector",
    filter="title eq 'Semantic Kernel'",
    select=["title", "content"],
    top=3,
)

# Show what kind of object search() returned
print(type(results))

# Print title, score and content of every hit; the content line ends with an
# extra newline so hits are separated by a blank line
for result in results:
    print(
        f"Title: {result['title']}",
        f"Score: {result['@search.score']}",
        f"Content: {result['content']}\n",
        sep="\n",
    )

## Perform a Semantic Hybrid Search

In [None]:
# Semantic Hybrid Search: keyword text plus query embedding, with the semantic
# query type requesting extractive captions and answers
query = "semantic kernel planner and kernel"

search_client = SearchClient(
    service_endpoint, index_name="sk-cogsrch-vector-index-2", credential=credential)

results = search_client.search(
    search_text=query,
    vector=generate_embeddings(query), top_k=3,
    vector_fields="contentVector",
    select=["title", "content"],
    query_type="semantic", query_language="en-us", semantic_configuration_name='sk-semantic-config', query_caption="extractive", query_answer="extractive",
    top=3
)

# get_answers() returns None when the service produced no semantic answer;
# fall back to an empty list so the loop below is skipped instead of raising
# "TypeError: 'NoneType' object is not iterable".
semantic_answers = results.get_answers() or []
for answer in semantic_answers:
    # Prefer the highlighted form of the answer when one is provided
    if answer.highlights:
        print(f"Semantic Answer: {answer.highlights}")
    else:
        print(f"Semantic Answer: {answer.text}")
    print(f"Semantic Answer Score: {answer.score}\n")

for result in results:
    print(f"Title: {result['title']}")
    print(f"Content: {result['content']}")

    # Only the first caption of each hit is shown, highlighted when available
    captions = result["@search.captions"]
    if captions:
        caption = captions[0]
        if caption.highlights:
            print(f"Caption: {caption.highlights}\n")
        else:
            print(f"Caption: {caption.text}\n")