# Azure Cognitive Search Vector Search Code Sample with Azure OpenAI
This code demonstrates how to use Azure Cognitive Search with Azure OpenAI and the Azure SDK for Python.
## Prerequisites
To run the code, install the following packages. Please note that `pip install azure-search-documents==11.4.0a20230509004` currently pulls the package from the Dev Feed. For instructions on how to connect to the Dev Feed, please visit [Azure-Python-SDK Azure Search Documents Dev Feed](https://dev.azure.com/azure-sdk/public/_artifacts/feed/azure-sdk-for-python/connect/pip).

In [None]:
! pip install azure-search-documents --pre
! pip install openai
! pip install python-dotenv
! pip install tenacity

In [None]:
! pip install openai[datalib]

## Import required libraries and environment variables

In [3]:
# Import required libraries  
import os  
import json  
import openai  
from dotenv import load_dotenv  
from tenacity import retry, wait_random_exponential, stop_after_attempt  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import Vector  
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    VectorSearchAlgorithmConfiguration,  
)  
  
# Pull settings from a local .env file (if present) into the process
# environment before any of them are read below.
load_dotenv()

# --- Azure OpenAI -----------------------------------------------------------
# Deployment used for completions; falls back to the sample deployment name
# when the variable is unset or empty.
AZURE_OPENAI_CHATGPT_DEPLOYMENT = (
    os.environ.get("AZURE_OPENAI_CHATGPT_DEPLOYMENT") or "gpt35-turbo-test"
)
openai.api_type = "azure"
openai.api_key = os.environ.get("OPENAI_API_KEY")
openai.api_base = os.environ.get("OPENAI_ENDPOINT")
openai.api_version = os.environ.get("OPENAI_API_VERSION")

# --- Azure Form Recognizer --------------------------------------------------
# The service name (not a full URL) is interpolated into the endpoint later.
formrecognizer_key = os.environ.get("AZURE_FORMRECOGNIZER_KEY")
formrecognizer_creds = AzureKeyCredential(formrecognizer_key)
formrecognizerservice = os.environ.get("AZURE_FORMRECOGNIZER_SERVICE")

# --- Azure Cognitive Search -------------------------------------------------
# Both the bare service name and the full endpoint are read; different cells
# use one or the other.
searchservice = os.environ.get("AZURE_SEARCH_SERVICE")
service_endpoint = os.environ.get("AZURE_SEARCH_SERVICE_ENDPOINT")
index_name = os.environ.get("AZURE_SEARCH_INDEX_NAME")
search_key = os.environ.get("AZURE_SEARCH_ADMIN_KEY")
search_credential = AzureKeyCredential(search_key)

NameError: name 'key' is not defined

In [None]:
# ChatGPT uses a particular set of tokens to indicate turns in conversations
# (<|im_start|> / <|im_end|>). The templates below are plain strings that the
# query cell concatenates into a single completion prompt.

# System message. `{sources}` is filled in with str.format() using the
# retrieved search results, one "sourcefile: sourcepage: content" entry per line.
# NOTE(review): the instruction text contains the typos "brakets" and
# "souefile"; they are sent to the model verbatim.
prompt_prefix = """<|im_start|>system
Assistant helps the company employees with their questions on company policies, roles. 
Answer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question. 
Each source file has a name followed by colon and by source page  followed by second colon and the actual information, always include the source name for each fact you use in the response. Use square brakets to reference the source file and source page, separate souefile and sourcepage by colon, e.g. [role_library.pdf:role_library-6.pdf]. Don't combine sources, list each source separately, e.g. [role_library.pdf:role_library-1.pdf][role_library.pdf:role_library-6.pdf].

Sources:
{sources}

<|im_end|>"""

# Opens a user turn; the user's text is appended directly after it.
turn_prefix = """
<|im_start|>user
"""

# Closes the user turn and opens the assistant turn the model completes.
turn_suffix = """
<|im_end|>
<|im_start|>assistant
"""

# Running transcript in prompt-token form; starts with an open user turn and
# is extended by the query cell after every completion.
prompt_history = turn_prefix

# Human-readable transcript ("user: ..." / "assistant: ...") used for printing.
history = []

# Template for turning a conversation summary plus a new question into a
# search query. It is not referenced elsewhere in the visible part of this
# notebook.
summary_prompt_template = """Below is a summary of the conversation so far, and a new question asked by the user that needs to be answered by searching in a knowledge base. Generate a search query based on the conversation and the new question. Source names are not good search terms to include in the search query.

Summary:
{summary}

Question:
{question}

Search query:
"""

## Create your search index
Create your search index schema and vector search configuration:

In [None]:
# Client used to create/update index definitions on the search service.
index_client = SearchIndexClient(
    endpoint=service_endpoint,
    credential=search_credential,
)

# Index schema: a string key, the searchable page text and category, two
# filterable/facetable source locators, and the 1536-dimension content vector.
# NOTE: the optional "title" / "titleVector" fields (and the matching semantic
# title_field) are intentionally left out of this sample.
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
        searchable=True,
        retrievable=True,
    ),
    SearchableField(
        name="category",
        type=SearchFieldDataType.String,
        filterable=True,
        searchable=True,
        retrievable=True,
    ),
    SimpleField(name="sourcepage", type="Edm.String", filterable=True, facetable=True),
    SimpleField(name="sourcefile", type="Edm.String", filterable=True, facetable=True),
    SearchField(
        name="contentVector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_configuration="my-vector-config",
    ),
]

# Vector search configuration referenced by name from the contentVector field.
vector_search = VectorSearch(
    algorithm_configurations=[
        VectorSearchAlgorithmConfiguration(
            name="my-vector-config",
            kind="hnsw",
            hnsw_parameters={
                "m": 4,
                "efConstruction": 400,
                "efSearch": 1000,
                "metric": "cosine",
            },
        ),
    ],
)

# Semantic configuration: "category" supplies keywords, "content" the body text.
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=PrioritizedFields(
        prioritized_keywords_fields=[SemanticField(field_name="category")],
        prioritized_content_fields=[SemanticField(field_name="content")],
    ),
)
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Assemble the index definition and create it (or update it if it exists).
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_settings=semantic_settings,
)
result = index_client.create_or_update_index(index)
print(f" {result.name} created")


## Create PDF parse functions 

In [None]:
from azure.ai.formrecognizer import DocumentAnalysisClient
import html

def table_to_html(table):
    """Render an extracted table as a compact HTML ``<table>`` string.

    Cells are grouped by ``row_index`` for rows ``0 .. table.row_count - 1``
    and ordered by ``column_index`` within each row. Cells whose kind is
    ``columnHeader`` or ``rowHeader`` become ``<th>``; all others ``<td>``.
    Column/row spans greater than one are emitted as ``colSpan`` /
    ``rowSpan`` attributes, and the cell text is HTML-escaped.

    Args:
        table: Object exposing ``cells`` and ``row_count``; each cell exposes
            ``row_index``, ``column_index``, ``kind``, ``column_span``,
            ``row_span`` and ``content``.

    Returns:
        The HTML markup for the table, without any whitespace between tags.
    """
    def render_cell(cell):
        # Header cells get <th>, everything else <td>.
        tag = "th" if cell.kind in ("columnHeader", "rowHeader") else "td"
        attrs = ""
        if cell.column_span > 1:
            attrs += f" colSpan={cell.column_span}"
        if cell.row_span > 1:
            attrs += f" rowSpan={cell.row_span}"
        return f"<{tag}{attrs}>{html.escape(cell.content)}</{tag}>"

    parts = ["<table>"]
    for row_index in range(table.row_count):
        cells_in_row = sorted(
            (c for c in table.cells if c.row_index == row_index),
            key=lambda c: c.column_index,
        )
        parts.append("<tr>")
        parts.extend(render_cell(c) for c in cells_in_row)
        parts.append("</tr>")
    parts.append("</table>")
    return "".join(parts)

def get_document_text(filename):
    """Extract per-page text from a document with Azure Form Recognizer.

    The file is analysed with the "prebuilt-layout" model. For every page the
    characters covered by a table are replaced by that table's HTML rendering
    (emitted once, at the table's first character); all other characters are
    copied from the analysis result unchanged. A single space is appended to
    each page's text.

    Args:
        filename: Path of the file to analyse; opened in binary mode.

    Returns:
        A list of ``(page_num, offset, page_text)`` tuples, where ``page_num``
        is the zero-based page index and ``offset`` is the cumulative length
        of the text of all preceding pages.
    """
    print(f"Extracting text from '{filename}' using Azure Form Recognizer")

    client = DocumentAnalysisClient(
        endpoint=f"https://{formrecognizerservice}.cognitiveservices.azure.com/",
        credential=formrecognizer_creds,
        headers={"x-ms-useragent": "azure-search-chat-demo/1.0.0"},
    )
    with open(filename, "rb") as document_file:
        poller = client.begin_analyze_document("prebuilt-layout", document=document_file)
    analysis = poller.result()

    page_map = []
    running_offset = 0
    for page_index, page in enumerate(analysis.pages):
        # Tables are assigned to the page of their first bounding region.
        page_tables = [
            candidate
            for candidate in analysis.tables
            if candidate.bounding_regions[0].page_number == page_index + 1
        ]

        # For every character of the page, record which table (by index into
        # page_tables) covers it, or -1 when it is ordinary text.
        page_start = page.spans[0].offset
        page_length = page.spans[0].length
        owner = [-1] * page_length
        for table_index, page_table in enumerate(page_tables):
            for span in page_table.spans:
                first = span.offset - page_start
                for position in range(first, first + span.length):
                    if 0 <= position < page_length:
                        owner[position] = table_index

        # Build the page text: plain characters are copied, and each table is
        # rendered as HTML the first time one of its characters is reached.
        pieces = []
        emitted_tables = set()
        for position, table_index in enumerate(owner):
            if table_index == -1:
                pieces.append(analysis.content[page_start + position])
            elif table_index not in emitted_tables:
                pieces.append(table_to_html(page_tables[table_index]))
                emitted_tables.add(table_index)
        pieces.append(" ")

        page_text = "".join(pieces)
        page_map.append((page_index, running_offset, page_text))
        running_offset += len(page_text)

    return page_map

# Ad-hoc check of the extractor: run it over every file in the local "data"
# folder. `page_map` ends up holding the result for the last file only.
# `glob` is imported here because no earlier cell imports it.
import glob

# os.path.join yields ".\data\*" on Windows (same pattern as before) and
# "./data/*" elsewhere, and avoids the invalid "\d" / "\*" escape sequences.
for filename in glob.glob(os.path.join(".", "data", "*")):
    page_map = get_document_text(filename)


## Create functions to upload documents to Cognitive Search

In [None]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import *
from azure.search.documents import SearchClient
import openai 
import re

def blob_name_from_file_page(filename, page=0):
    """Return the per-page source name for *filename*.

    For PDF files (extension compared case-insensitively) the result is
    ``"<stem>-<page>.pdf"``; for any other file it is just the base name.

    Args:
        filename: File name or path; only its base name is used.
        page: Page number inserted into the name of PDF files.

    Returns:
        The derived name as a string.
    """
    base_name = os.path.basename(filename)
    stem, extension = os.path.splitext(base_name)
    if extension.lower() != ".pdf":
        return base_name
    return f"{stem}-{page}.pdf"
    
def create_sections(filename, page_map):
    """Lazily yield one search document (a dict) per entry of *page_map*.

    Args:
        filename: Source file name; stored as ``sourcefile`` and used to
            derive the document ``id`` and ``sourcepage``.
        page_map: Iterable of indexable entries where ``entry[0]`` is a
            zero-based page index and ``entry[2]`` is the page text.

    Yields:
        Dicts with the keys ``id``, ``content``, ``category``, ``sourcepage``
        and ``sourcefile``. The ``id`` is ``"<filename>-<page number>"`` with
        every character outside ``[0-9a-zA-Z_-]`` replaced by ``_``.
    """
    for entry in page_map:
        page_number = entry[0] + 1  # stored page numbers are one-based
        raw_id = f"{filename}-{page_number}"
        section = {
            "id": re.sub("[^0-9a-zA-Z_-]", "_", raw_id),
            "content": entry[2],
            "category": "test01",
            "sourcepage": blob_name_from_file_page(filename, page_number),
            "sourcefile": filename,
        }
        yield section

def index_sections(filename, sections):
    """Embed each section's content and upload the sections to the search index.

    Every section dict is mutated in place: its ``contentVector`` key is set
    to the embedding of its ``content``. Sections are uploaded in batches of
    1000, followed by a final partial batch, and a one-line summary is printed
    per batch.

    Args:
        filename: Name used only in the progress message.
        sections: Iterable of dicts that each contain a ``content`` key.
    """
    print(f"Indexing sections from '{filename}' into search index '{index_name}'")
    search_client = SearchClient(
        endpoint=f"https://{searchservice}.search.windows.net/",
        index_name=index_name,
        credential=search_credential,
    )

    def generate_embeddings(text):
        # One embedding request per section.
        response = openai.Embedding.create(input=text, engine="ada-embed-test")
        return response['data'][0]['embedding']

    def upload(documents):
        # Send one batch and report how many documents were accepted.
        results = search_client.upload_documents(documents=documents)
        succeeded = sum(1 for outcome in results if outcome.succeeded)
        print(f"\tIndexed {len(results)} sections, {succeeded} succeeded")

    pending = []
    for section in sections:
        section['contentVector'] = generate_embeddings(section['content'])
        pending.append(section)
        if len(pending) == 1000:
            upload(pending)
            pending = []

    # Flush whatever is left after the last full batch.
    if pending:
        upload(pending)

## Upload data


In [None]:
import glob

# Extract, section and index every file found in the local "data" folder.
# os.path.join yields ".\data\*" on Windows (same pattern as before) and
# "./data/*" elsewhere, and avoids the invalid "\d" / "\*" escape sequences.
for filename in glob.glob(os.path.join(".", "data", "*")):
    print(f"Processing '{filename}'")
    page_map = get_document_text(filename)
    # Only the base name is stored in the index, not the local path.
    sections = create_sections(os.path.basename(filename), page_map)
    index_sections(os.path.basename(filename), sections)

## Perform a vector similarity search

In [1]:
# Pure Vector Search
# Alternative question to try:
#user_input = "What is the difference in responsibilities between a Vice President of Human Resources and Manager of Human Resources"
user_input = "What are responsibilities of a Vice President of Human Resources and Manager of Human Resources"

# Exclude category, to simulate scenarios where there's a set of docs you can't see
exclude_category = None

# The user's question is embedded and searched as-is.
query = user_input

# Use the credential built from AZURE_SEARCH_ADMIN_KEY in the configuration
# cell; no variable named `key` exists, so AzureKeyCredential(key) raised
# NameError.
search_client = SearchClient(service_endpoint, index_name, search_credential)

def generate_embeddings(text):
    """Return the embedding of *text* from the "ada-embed-test" deployment.

    Args:
        text: Input passed to ``openai.Embedding.create``.

    Returns:
        The ``embedding`` value of the first item in the response's ``data``.
    """
    embedding_response = openai.Embedding.create(engine="ada-embed-test", input=text)
    first_item = embedding_response['data'][0]
    return first_item['embedding']

# Retrieve the six nearest pages for the question using the content vector.
r = search_client.search(
    search_text="",
    vector=generate_embeddings(query),
    top_k=6,
    vector_fields="contentVector",
    select=["content", "sourcefile", "sourcepage"]
)

# One "sourcefile: sourcepage: content" line per hit; line breaks inside the
# content are removed so every source stays on a single line.
results = [
    doc['sourcefile'] + ": " + doc['sourcepage'] + ": "
    + doc['content'].replace("\n", "").replace("\r", "")
    for doc in r
]

content = "\n".join(results)

# System message with sources + conversation so far + the new user turn.
prompt = prompt_prefix.format(sources=content) + prompt_history + user_input + turn_suffix

completion = openai.Completion.create(
    engine=AZURE_OPENAI_CHATGPT_DEPLOYMENT,
    prompt=prompt,
    temperature=0.7,
    max_tokens=1024,
    stop=["<|im_end|>", "<|im_start|>"])
answer = completion.choices[0].text

# Record the exchange, then open the next user turn for a follow-up question.
prompt_history += user_input + turn_suffix + answer + "\n<|im_end|>" + turn_prefix
history.append("user: " + user_input)
history.append("assistant: " + answer)

print("\n-------------------\n".join(history))
# The "Source" section shows the retrieved sources, not a second copy of the prompt.
print("\n-------------------\nSource:\n" + content)
print("\n-------------------\nPrompt:\n" + prompt)





NameError: name 'SearchClient' is not defined