# RAG Approach 2: Azure Cognitive Search Vector Search + Azure OpenAI 
This code demonstrates how to use Azure Cognitive Search by using the Push API to insert vectors into your search index.
Adapted from: https://github.com/Azure/cognitive-search-vector-pr/blob/main/demo-python/code/azure-search-vector-python-sample.ipynb
## Prerequisites
To run the code, install the following packages. Please use the latest pre-release version `pip install azure-search-documents --pre`. This sample currently uses version `11.4.0b11`.

## Import required libraries and environment variables

In [None]:
# Import required libraries  
import os  
import json  
import openai  
from dotenv import load_dotenv  
from tenacity import retry, wait_random_exponential, stop_after_attempt  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient, SearchIndexingBufferedSender  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import (
    QueryAnswerType,
    QueryCaptionType,
    QueryLanguage,
    QueryType,
    RawVectorQuery,
    VectorizableTextQuery,
    VectorFilterMode,    
)
from azure.search.documents.indexes.models import (  
    ExhaustiveKnnVectorSearchAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    HnswVectorSearchAlgorithmConfiguration,
    HnswParameters,  
    VectorSearch,
    VectorSearchAlgorithmConfiguration,
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
    VectorSearchVectorizer,
    VectorSearchVectorizerKind,
    AzureOpenAIParameters,
    AzureOpenAIVectorizer,
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration,
    ExhaustiveKnnVectorSearchAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings, 
    VectorSearch,  
    HnswVectorSearchAlgorithmConfiguration,
    HnswParameters,  
    VectorSearch,
    VectorSearchAlgorithmKind,
    # VectorSearchAlgorithmMetric,
    VectorSearchProfile,
    AzureOpenAIParameters,
    AzureOpenAIVectorizer,
)  

# Additional Imports
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import Callable, Optional, Union
import time
from tqdm import tqdm
from dotenv import load_dotenv, dotenv_values



In [None]:

# Load ./my.env into the process environment BEFORE any os.getenv call, so that
# settings defined only in that file are visible to the lookups below.
load_dotenv('./my.env')
config = dotenv_values("./my.env")

# Azure Cognitive Search settings
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")
key = os.getenv("AZURE_SEARCH_ADMIN_KEY")
if key is None:
    # Fail with an explicit message instead of an opaque credential error.
    raise ValueError(
        "AZURE_SEARCH_ADMIN_KEY is not set in the environment or in ./my.env"
    )
credential = AzureKeyCredential(key)

# Azure OpenAI settings. The endpoint and API version come from the
# AZURE_OPENAI_* entries of ./my.env.
# NOTE(review): the API key is read from OPENAI_API_KEY while the other values
# use AZURE_OPENAI_* names -- confirm this matches the layout of my.env.
openai.api_type = "azure"
openai.api_key = os.getenv('OPENAI_API_KEY')
openai.api_base = config["AZURE_OPENAI_ENDPOINT"]
openai.api_version = config["AZURE_OPENAI_API_VERSION"]

In [None]:
def generate_ada_embedding(text: Union[str, list[str]]) -> list[list[float]]:
    """Generate embeddings for one or more texts with an Azure OpenAI deployment.

    The deployment is taken from the ``AZURE_OPENAI_EMBEDDING_MODEL_NAME``
    environment variable and falls back to ``text-embedding-ada-002`` when the
    variable is not set.

    Args:
        text (Union[str, list[str]]): A single string or a list of strings to
            embed. A single string is treated as a one-item list.

    Returns:
        list[list[float]]: The embeddings collected from every response, in the
            order the responses returned them (presumably one per input text,
            in input order). Empty when ``text`` is an empty list.
    """
    if isinstance(text, str):
        text = [text]
    embedding_deployment = os.getenv(
        "AZURE_OPENAI_EMBEDDING_MODEL_NAME", "text-embedding-ada-002"
    )
    # Azure OpenAI embedding API has max batch size of 16
    step_size = 16
    embeddings = []
    for start in range(0, len(text), step_size):
        res = openai.Embedding.create(
            input=text[start : start + step_size],
            deployment_id=embedding_deployment,
        )
        embeddings += [d.embedding for d in res.data]
        # Fixed pause after every request (presumably to stay under the
        # service rate limit).
        time.sleep(5)
    return embeddings

In [None]:

def split_text(text, chunk_size: int, chunk_overlap: int, length_function: Callable[[str], int] = len):
    """Split ``text`` into chunk documents with a RecursiveCharacterTextSplitter.

    Args:
        text: The text to split.
        chunk_size (int): Forwarded to the splitter as ``chunk_size``.
        chunk_overlap (int): Forwarded to the splitter as ``chunk_overlap``.
        length_function (Callable[[str], int]): Forwarded to the splitter as
            ``length_function``. Defaults to ``len``.

    Returns:
        The result of the splitter's ``create_documents`` call for ``text``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": length_function,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.create_documents([text])

## Create embeddings
Read your data, generate OpenAI embeddings, and export them in a format that can be inserted into your Azure Cognitive Search index:

In [None]:
# Paths of the two sample documents to index
doc_html = 'test_doc.html'
doc_pdf = 'test_doc.pdf'

# One entry per source document; each entry gets a 'title' and a 'content' key
input_data = {'pdf': {}, 'html':{}}

# HTML: the text of the first loaded document becomes the content
loader = UnstructuredHTMLLoader(doc_html)
data = loader.load()
html_section = input_data['html']
html_section['title'] = 'html'
html_section['content'] = data[0].page_content

# PDF: the loader returns a list (presumably one entry per page); the text of
# every entry is concatenated into a single string
loader = PyPDFLoader(doc_pdf)
data = loader.load()
pages = len(data)
pdf_content = ''.join(page.page_content for page in data)

pdf_section = input_data['pdf']
pdf_section['title'] = 'pdf'
pdf_section['content'] = pdf_content

In [None]:
# Chunking parameters handed to split_text for both documents
chunk_size, chunk_overlap = 1000, 0
# Collects one record (dict) per chunk across all documents
content_list = []

In [None]:

# html
content = input_data['html']['content']
title = input_data['html']['title']
split_text_html = split_text(content, chunk_size, chunk_overlap)

# Embed all chunks with a single helper call: generate_ada_embedding batches a
# list internally, so this avoids one request (and one pause) per chunk.
html_chunks = [doc.page_content for doc in split_text_html]
html_vectors = generate_ada_embedding(html_chunks)
if len(html_vectors) != len(html_chunks):
    # zip() would silently drop chunks if the counts ever diverged.
    raise RuntimeError(
        f"Expected {len(html_chunks)} embeddings, got {len(html_vectors)}"
    )

# HTML chunk ids start at 0
for id_, (chunk, content_embeddings) in enumerate(zip(html_chunks, html_vectors)):
    data_json = {
        'id': str(id_),
        'title': title,
        'content': chunk,
        'contentVector': content_embeddings,
    }
    content_list.append(data_json)  # adding chunk to content_list

In [None]:
# pdf
content = input_data['pdf']['content']
title = input_data['pdf']['title']
split_text_pdf = split_text(content, chunk_size, chunk_overlap)

# Embed all chunks with a single helper call: generate_ada_embedding batches a
# list internally, so this avoids one request (and one pause) per chunk.
pdf_chunks = [doc.page_content for doc in split_text_pdf]
pdf_vectors = generate_ada_embedding(pdf_chunks)
if len(pdf_vectors) != len(pdf_chunks):
    # zip() would silently drop chunks if the counts ever diverged.
    raise RuntimeError(
        f"Expected {len(pdf_chunks)} embeddings, got {len(pdf_vectors)}"
    )

# PDF ids continue after the HTML chunk ids so that document keys stay unique
first_pdf_id = len(split_text_html)
for id_, (chunk, content_embeddings) in enumerate(zip(pdf_chunks, pdf_vectors), start=first_pdf_id):
    data_json = {
        'id': str(id_),
        'title': title,
        'content': chunk,
        'contentVector': content_embeddings,
    }
    content_list.append(data_json)  # adding chunk to content_list


In [None]:
# Peek at the first generated record (a dict with id, title, content and contentVector)
content_list[0]

In [None]:
# Persist every chunk record (text plus embedding) to docVectors.json
with open("./docVectors.json", "w") as vectors_file:
    serialized_records = json.dumps(content_list)
    vectors_file.write(serialized_records)

In [None]:
# Azure OpenAI deployment passed to the index vectorizer as deployment_id
model = "text-embedding-ada-002"
# NOTE(review): this replaces any index name read earlier from
# AZURE_SEARCH_INDEX_NAME -- confirm the hard-coded name is intended.
index_name = "mysearch3"

## Create your search index
Create your search index schema and vector search configuration:

In [None]:
# Client used to create or update the index definition
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)

# Index schema: a string key, two searchable text fields and one vector field
id_field = SimpleField(
    name="id",
    type=SearchFieldDataType.String,
    key=True,
    sortable=True,
    filterable=True,
    facetable=True,
)
title_field = SearchableField(name="title", type=SearchFieldDataType.String, filterable=True)
content_field = SearchableField(name="content", type=SearchFieldDataType.String)
content_vector_field = SearchField(
    name="contentVector",
    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
    searchable=True,
    vector_search_dimensions=1536,
    vector_search_profile="myHnswProfile",
)
fields = [id_field, title_field, content_field, content_vector_field]

# Vector search: two algorithms, one profile per algorithm, one shared vectorizer
hnsw_algorithm = HnswVectorSearchAlgorithmConfiguration(
    name="myHnsw",
    kind=VectorSearchAlgorithmKind.HNSW,
    parameters=HnswParameters(m=4, ef_construction=400, ef_search=500, metric="cosine"),
)
exhaustive_knn_algorithm = ExhaustiveKnnVectorSearchAlgorithmConfiguration(
    name="myExhaustiveKnn",
    kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
    parameters=ExhaustiveKnnParameters(metric="cosine"),
)
hnsw_profile = VectorSearchProfile(
    name="myHnswProfile", algorithm="myHnsw", vectorizer="myOpenAI"
)
exhaustive_knn_profile = VectorSearchProfile(
    name="myExhaustiveKnnProfile", algorithm="myExhaustiveKnn", vectorizer="myOpenAI"
)
openai_vectorizer = AzureOpenAIVectorizer(
    name="myOpenAI",
    kind="azureOpenAI",
    azure_open_ai_parameters=AzureOpenAIParameters(
        resource_uri=os.getenv("AZURE_OPENAI_ENDPOINT"),
        deployment_id=model,
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    ),
)
vector_search = VectorSearch(
    algorithms=[hnsw_algorithm, exhaustive_knn_algorithm],
    profiles=[hnsw_profile, exhaustive_knn_profile],
    vectorizers=[openai_vectorizer],
)

# Semantic configuration: "title" is the title field, "content" the content field
semantic_fields = PrioritizedFields(
    title_field=SemanticField(field_name="title"),
    prioritized_content_fields=[SemanticField(field_name="content")],
)
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=semantic_fields,
)
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Create the index, or update it if it already exists, and report its name
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_settings=semantic_settings,
)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')


## Insert text and embeddings into vector store
Add texts and metadata from the JSON data to the vector store:

In [None]:
# Upload some documents to the index
with open('./docVectors.json', 'r') as file:  
    chunks = json.load(file)

In [None]:

search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
# Upload first, then report: the message reflects what the service accepted
# rather than being printed before the request is even sent.
result = search_client.upload_documents(documents=chunks)
succeeded_count = sum(1 for item in result if item.succeeded)
print(f"Uploaded {succeeded_count} of {len(chunks)} chunks")


If you are indexing a very large number of chunks, you can use `SearchIndexingBufferedSender`, an optimized way to index the documents automatically, because it handles the batching for you:

In [None]:

# Let SearchIndexingBufferedSender batch the upload actions for all documents
sender_options = {
    "endpoint": service_endpoint,
    "index_name": index_name,
    "credential": credential,
}
with SearchIndexingBufferedSender(**sender_options) as batch_client:
    batch_client.upload_documents(documents=chunks)
print(f"Uploaded {len(chunks)} chunks in total")


## Perform a vector similarity search

In [None]:
def print_results(results, string_out=150):
    """Print the title, score and a content preview of every search result.

    Args:
        results: Iterable of mappings with the keys ``title``,
            ``@search.score`` and ``content``.
        string_out: Number of leading characters of ``content`` to print.
    """
    for hit in results:
        heading = hit['title']
        print(f"Title: {heading}")
        relevance = hit['@search.score']
        print(f"Score: {relevance}")
        preview = hit['content'][:string_out]
        print(f"Content: {preview}\n\n")

This example shows a pure vector search using a vectorizable text query. All you need to do is pass in text, and your vectorizer will handle the query vectorization.

In [None]:
# Pure vector search: pass the query text and let the vectorizer embed it
query = "Am I entitled to Night Pay?"

search_client = SearchClient(
    endpoint=service_endpoint,
    index_name=index_name,
    credential=credential,
)
# Ask for the 3 nearest neighbours on the contentVector field
vector_query = VectorizableTextQuery(text=query, k=3, fields="contentVector")

# search_text=None: no keyword search, vector query only
search_options = {
    "search_text": None,
    "vector_queries": [vector_query],
    "select": ["title", "content"],
}
results = search_client.search(**search_options)

print_results(results)


This example shows a pure vector search using a raw vector query. In this case, you are responsible for generating the query vector yourself.

In [None]:
# Pure vector search with a query vector computed on the client side
query = "Am I entitled to Night Pay?"

search_client = SearchClient(
    endpoint=service_endpoint,
    index_name=index_name,
    credential=credential,
)
# Embed the query locally and send the raw vector instead of the query text
query_embedding = generate_ada_embedding(query)[0]
vector_query = RawVectorQuery(vector=query_embedding, k=3, fields="contentVector")

# search_text=None: no keyword search, vector query only
search_options = {
    "search_text": None,
    "vector_queries": [vector_query],
    "select": ["title", "content"],
}
results = search_client.search(**search_options)

print_results(results)


This example shows a pure vector search to demonstrate OpenAI's text-embedding-ada-002 multilingual capabilities.

In [None]:
# Multilingual pure vector search: "Am I entitled to Night Pay?" asked in French
query = "Suis-je éligible à une prime de nuit ?"

search_client = SearchClient(
    endpoint=service_endpoint,
    index_name=index_name,
    credential=credential,
)
# Ask for the 3 nearest neighbours on the contentVector field
vector_query = VectorizableTextQuery(text=query, k=3, fields="contentVector")

# search_text=None: no keyword search, vector query only
search_options = {
    "search_text": None,
    "vector_queries": [vector_query],
    "select": ["title", "content"],
}
results = search_client.search(**search_options)

print_results(results)



## Perform an Exhaustive KNN exact nearest neighbor search

This example shows how you can exhaustively search your vector index regardless of which algorithm the index uses, HNSW or exhaustive KNN. You can use this to calculate the ground-truth values.

In [None]:
# Exhaustive KNN search: same query as before, with exhaustive=True set
query = "Am I entitled to Night Pay?"

search_client = SearchClient(
    endpoint=service_endpoint,
    index_name=index_name,
    credential=credential,
)
vector_query = VectorizableTextQuery(
    text=query,
    k=3,
    fields="contentVector",
    exhaustive=True,
)

# search_text=None: no keyword search, vector query only
search_options = {
    "search_text": None,
    "vector_queries": [vector_query],
    "select": ["title", "content"],
}
results = search_client.search(**search_options)

print_results(results)


## Perform a Multi-Vector Search

This example shows a cross-field vector search that allows you to query multiple vector fields at the same time by passing in multiple query vectors. Note, in this case, you can pass in query vectors from two different embedding models to the corresponding vector fields in your index.

In [None]:
# Multi-vector search: several vector queries are sent in one request
query = "Am I entitled to Night Pay?"

search_client = SearchClient(
    endpoint=service_endpoint,
    index_name=index_name,
    credential=credential,
)
vector_query_2 = VectorizableTextQuery(text=query, k=3, fields="contentVector")

# NOTE(review): the same query object is passed twice, so both entries target
# contentVector -- confirm whether a second vector field was intended here.
combined_vector_queries = [vector_query_2, vector_query_2]
results = search_client.search(
    search_text=None,
    vector_queries=combined_vector_queries,
    select=["title", "content"],
)

print_results(results)


## Perform a Pure Vector Search with a filter
This example shows how to apply filters to your index. Note that you can choose between pre-filtering (the default) and post-filtering.

In [None]:
# Pure vector search restricted by a filter on the title field
query = "Am I entitled to Night Pay?"

search_client = SearchClient(
    endpoint=service_endpoint,
    index_name=index_name,
    credential=credential,
)
vector_query = VectorizableTextQuery(text=query, k=3, fields="contentVector")

# Only documents whose title equals 'pdf' are eligible; PRE_FILTER is passed
# explicitly as the vector filter mode
search_options = {
    "search_text": None,
    "vector_queries": [vector_query],
    "vector_filter_mode": VectorFilterMode.PRE_FILTER,
    "filter": "title eq 'pdf'",
    "select": ["title", "content"],
}
results = search_client.search(**search_options)

print_results(results)


## Perform a Hybrid Search

In [None]:
# Hybrid search: the same text is sent as a keyword query and as a vector query
query = "Am I entitled to Night Pay?"

# Reuse the shared credential object, like every other query cell, instead of
# constructing a second AzureKeyCredential from the raw key.
search_client = SearchClient(service_endpoint, index_name, credential=credential)
vector_query = VectorizableTextQuery(text=query, k=3, fields="contentVector")

results = search_client.search(
    search_text=query,  # keyword part of the hybrid query
    vector_queries=[vector_query],  # vector part of the hybrid query
    select=["title", "content"],
    top=3,
)

print_results(results)
