# Azure Cognitive Search Vector Search Code Sample with Azure OpenAI
This notebook demonstrates how to use LangChain with Azure Cognitive Search and Azure OpenAI to index PDF documents and answer questions about them with source citations.


In [None]:
! pip install azure-search-documents --pre --upgrade
! pip install openai --upgrade
! pip install python-dotenv
! pip install tenacity --upgrade
! pip install openai[datalib] --upgrade
! pip install langchain --upgrade
! pip install tiktoken --upgrade
! pip install azure-identity --upgrade
! pip install azure-core --pre --upgrade

## Import required libraries and environment variables

In [20]:
# Import required libraries  
import os  
import openai  
from dotenv import load_dotenv  
from tenacity import retry, wait_random_exponential, stop_after_attempt  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  

# Configure environment variables  
# Load settings from a .env file; override=True lets values from .env replace
# variables that are already set in the process environment.
load_dotenv(override=True)  
# OpenAI init
# Only the API type and version are set here; openai.api_key / openai.api_base
# are assigned later, in the cells that create the embeddings and the chat model.
openai.api_type = "azure"  
openai.api_version = os.getenv("OPENAI_API_VERSION")  
# Form Recognizer init
# NOTE(review): os.getenv returns None for an unset variable, and that value is
# passed straight to AzureKeyCredential — confirm the key is defined in .env.
formrecognizer_key = os.getenv("AZURE_FORMRECOGNIZER_KEY")
formrecognizer_creds = AzureKeyCredential(formrecognizer_key)
# Service name only; get_document_text builds the full endpoint URL from it.
formrecognizerservice = os.getenv("AZURE_FORMRECOGNIZER_SERVICE")
# Azure search init
# The endpoint URL is derived from the search service name.
searchservice = os.getenv("AZURE_SEARCH_SERVICE")
azure_search_endpoint = f"https://{searchservice}.search.windows.net/"
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")  
# The admin key is used both as a raw string (passed to AzureSearch) and wrapped
# in an AzureKeyCredential.
azure_search_key = os.getenv("AZURE_SEARCH_ADMIN_KEY")  
search_credential = AzureKeyCredential(azure_search_key)

## Create PDF parsing functions

In [23]:
from azure.ai.formrecognizer import DocumentAnalysisClient
import html

def table_to_html(table):
    """Render a Form Recognizer table as a compact HTML ``<table>`` string.

    Cells are grouped by ``row_index`` and emitted in ``column_index`` order.
    Header cells (``kind`` of ``"columnHeader"`` or ``"rowHeader"``) become
    ``<th>``, all others ``<td>``. Cell content is HTML-escaped. Cells whose
    ``row_index`` is outside ``range(table.row_count)`` are ignored, and a row
    without cells is still emitted as ``<tr></tr>``.

    Args:
        table: Object exposing ``row_count`` and ``cells``; each cell exposes
            ``row_index``, ``column_index``, ``kind``, ``content``,
            ``row_span`` and ``column_span``.

    Returns:
        The HTML markup for the table as a single string.
    """
    # Group the cells in a single pass instead of rescanning all cells per row.
    cells_by_row = {}
    for cell in table.cells:
        cells_by_row.setdefault(cell.row_index, []).append(cell)

    parts = ["<table>"]
    for row_index in range(table.row_count):
        parts.append("<tr>")
        row_cells = sorted(cells_by_row.get(row_index, []), key=lambda cell: cell.column_index)
        for cell in row_cells:
            tag = "th" if cell.kind in ("columnHeader", "rowHeader") else "td"
            cell_spans = ""
            # A span may be missing (None); treat that as 1 rather than failing on `None > 1`.
            if (cell.column_span or 1) > 1:
                cell_spans += f" colSpan={cell.column_span}"
            if (cell.row_span or 1) > 1:
                cell_spans += f" rowSpan={cell.row_span}"
            parts.append(f"<{tag}{cell_spans}>{html.escape(cell.content)}</{tag}>")
        parts.append("</tr>")
    parts.append("</table>")
    return "".join(parts)

def get_document_text(filename):
    """Extract per-page text from a document with Azure Form Recognizer.

    The file is analyzed with the "prebuilt-layout" model. For every page, the
    characters covered by a table's spans are replaced by that table rendered
    as HTML (via ``table_to_html``), inserted once at the first character the
    table covers; all other characters are copied from the result content.

    Args:
        filename: Path of the file to analyze; it is opened in binary mode.

    Returns:
        A list of ``(page_num, offset, page_text)`` tuples, one per page, where
        ``page_num`` is the 0-based position of the page, ``offset`` is the
        running character offset of the page within the concatenated page
        texts, and ``page_text`` ends with a single trailing space.
    """
    offset = 0
    page_map = []
    print(f"Extracting text from '{filename}' using Azure Form Recognizer")

    form_recognizer_client = DocumentAnalysisClient(endpoint=f"https://{formrecognizerservice}.cognitiveservices.azure.com/", credential=formrecognizer_creds, headers={"x-ms-useragent": "azure-search-chat-demo/1.0.0"})
    with open(filename, "rb") as f:
        poller = form_recognizer_client.begin_analyze_document("prebuilt-layout", document = f)
    form_recognizer_results = poller.result()

    # A document may have no tables at all; normalise to an empty list.
    all_tables = form_recognizer_results.tables or []
    content = form_recognizer_results.content

    for page_num, page in enumerate(form_recognizer_results.pages):
        # Tables without any bounding region cannot be assigned to a page; skip
        # them instead of failing on `bounding_regions[0]`.
        tables_on_page = [
            table for table in all_tables
            if table.bounding_regions and table.bounding_regions[0].page_number == page_num + 1
        ]

        # mark all positions of the table spans in the page
        page_offset = page.spans[0].offset
        page_length = page.spans[0].length
        table_chars = [-1]*page_length
        for table_id, table in enumerate(tables_on_page):
            for span in table.spans:
                # replace all table spans with "table_id" in table_chars array
                for i in range(span.length):
                    idx = span.offset - page_offset + i
                    if idx >=0 and idx < page_length:
                        table_chars[idx] = table_id

        # build page text by replacing characters in table spans with table html;
        # pieces are collected in a list and joined once to avoid repeated
        # string concatenation.
        page_parts = []
        added_tables = set()
        for idx, table_id in enumerate(table_chars):
            if table_id == -1:
                page_parts.append(content[page_offset + idx])
            elif table_id not in added_tables:
                page_parts.append(table_to_html(tables_on_page[table_id]))
                added_tables.add(table_id)

        page_parts.append(" ")
        page_text = "".join(page_parts)
        page_map.append((page_num, offset, page_text))
        offset += len(page_text)

    return page_map


# Sanity check: show the Form Recognizer endpoint URL built from the configured service name.
# NOTE(review): the printed hostname is stored in the notebook output — clear it before sharing if it is sensitive.
print(f"https://{formrecognizerservice}.cognitiveservices.azure.com/")

https://prsaipoc-0000-ist-0604-formrcgn1.cognitiveservices.azure.com/


## Create functions to prepare sections for upload to Cognitive Search

In [24]:
import re

def blob_name_from_file_page(filename, page = 0):
    """Return the per-page name for ``filename``.

    For a PDF (extension compared case-insensitively) the result is
    ``<stem>-<page>.pdf``; for any other file it is the plain base name.
    Directory components are always dropped.
    """
    base_name = os.path.basename(filename)
    stem, extension = os.path.splitext(base_name)
    if extension.lower() != ".pdf":
        return base_name
    return f"{stem}-{page}.pdf"
    
def create_sections(filename, page_map):
    """Yield one section dict per entry of ``page_map``.

    Each entry is indexed as ``entry[0]`` (0-based page number) and
    ``entry[2]`` (page text). The section id is ``"<filename>-<page>"`` with a
    1-based page number and every character outside ``[0-9a-zA-Z_-]``
    replaced by ``_``; the same value is used for ``source``.
    """
    for entry in page_map:
        page_number = entry[0] + 1
        section_id = re.sub("[^0-9a-zA-Z_-]", "_", f"{filename}-{page_number}")
        section = dict(
            id=section_id,
            content=entry[2],
            source=section_id,
            category="test01",
            sourcepage=blob_name_from_file_page(filename, page_number),
            sourcefile=filename,
        )
        yield section


## Parse documents


In [25]:
import glob
# Collect the sections of every file into one list. Previously `sections` was
# reassigned on each iteration, so only the last file was kept, and it was a
# generator that could be consumed only once.
sections = []
# Build the pattern with os.path.join: the literal ".\data\*" relied on invalid
# escape sequences ("\d", "\*") and only worked with Windows path separators.
for filename in glob.glob(os.path.join(".", "data", "*")):
    print(f"Processing '{filename}'")
    page_map = get_document_text(filename)
    sections.extend(create_sections(os.path.basename(filename), page_map))


Processing '.\data\role_library.pdf'
Extracting text from '.\data\role_library.pdf' using Azure Form Recognizer


In [11]:
# Diagnostic: resolve the Form Recognizer hostname with the system DNS resolver.
# NOTE(review): the hostname is hard-coded and the output records resolver/IP details — consider removing this cell before sharing.
! nslookup prsaipoc-0000-ist-0604-formrcgn1.cognitiveservices.azure.com

Server:  dns2.gov.on.ca
Address:  142.107.194.46

Name:    prsaipoc-0000-ist-0604-formrcgn1.privatelink.cognitiveservices.azure.com
Address:  10.204.12.72
Aliases:  prsaipoc-0000-ist-0604-formrcgn1.cognitiveservices.azure.com



Non-authoritative answer:



In [28]:
import os
from langchain.chat_models import AzureChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import AzureSearch
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate

# Read the Azure OpenAI key and base URL from the environment.
openai.api_key = os.getenv("CHATGPT_OPENAI_API_KEY")  
openai.api_base = os.getenv("CHATGPT_OPENAI_API_BASE")  
# NOTE(review): this writes the API base URL into the saved notebook output.
print("openai.api_base", openai.api_base)

# Embedding client; both the model and the deployment are named "text-embedding-ada-002".
# NOTE(review): chunk_size=1 presumably limits each embedding request to one text — confirm against the deployment's limits.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", deployment="text-embedding-ada-002", chunk_size=1, openai_api_type="azure", openai_api_base= openai.api_base, openai_api_key= openai.api_key) 
print ("search endpoint", azure_search_endpoint)
# The vector store is given the bound embed_query method as its embedding function.
embedding_function=embeddings.embed_query

# Connect to Azure Cognitive Search
# Positional arguments are the search endpoint, the admin key and the index name.
acs = AzureSearch(azure_search_endpoint,
                 azure_search_key,
                 index_name,
                 embedding_function=embedding_function)



openai.api_base https://ist-0604-prsaipoc-0000-openai-ce.openai.azure.com/
search endpoint https://prsaipoc0000ist0604srch01.search.windows.net/


In [29]:
from langchain.schema.document import Document

def create_documents(sections):
    """Build a list of LangChain ``Document`` objects from section dicts.

    Each section must provide the keys ``id``, ``sourcepage``, ``sourcefile``
    and ``content``. The section id is stored under both ``id`` and ``source``
    in the metadata, and ``category`` is always ``"test01"``.
    """
    return [
        Document(
            metadata={
                "id": section['id'],
                "category": "test01",
                "sourcepage": section['sourcepage'],
                "sourcefile": section['sourcefile'],
                "source": section['id'],
            },
            page_content=section['content'],
        )
        for section in sections
    ]

# Convert the parsed sections into LangChain Document objects.
documents = create_documents(sections)

# Add the documents to the Azure Cognitive Search vector store. As the last
# expression of the cell, the returned value is displayed as the cell output.
acs.add_documents(documents=documents)

['OGM4ZTdiNzktMTJkZC00NzAyLWJmODUtOWI2NGUzMDk4ZTE1',
 'OWZlMGZlMGYtMDMyNy00ZGRjLWEzNGItMDEwYzkzMTVlMTMw',
 'YzlhZWMyOTctZGQ4NS00MGVmLTg4NTctMTBmNTI1MDdiODM4',
 'MzA4YTIzNWUtNGE3Mi00ZTQ3LTk0MzYtNzAwOTI4ODg2YmJh',
 'YzRkODFkNjUtMjE4MC00NjZlLWE1ZWItZjg4YTY3ODYzMjEw',
 'ZDJiNzk2Y2EtMzI3Ni00MmQ1LWI2OWYtM2VhYTk5ZTA5YThm',
 'NGRmZTY4Y2EtZGZkMS00NGE5LTljYWItYTliYTIxN2NlNTg4',
 'NTEzYzAxZjMtNmRjZS00MDE3LTg5YTMtOGI0NTU1MmY1NjJi',
 'MjEzMWNiNjktZDZkMC00MDNmLWI1NTgtYjE0NzNiYjhmMDZl',
 'YjVhZjM0YjMtZDc3MC00NWJhLThiNjMtNzc1ZTUzNmFlYTUz',
 'NjViYTZjMmQtNmU2MS00N2M2LThkZDQtZmMwMDM0NDZhMTc1',
 'NWVjMmUxMDEtMTY0YS00ZDA5LThhMjEtOTUzZjk0ZGZhMDJk',
 'ZTI5OGFmNDEtOTAwZS00ZmIzLThjNTItOWJlNjUyZDNhYTZi',
 'ZThjNWNhMDYtMzY0OC00ZDMxLTlmYzYtMWQ2ZWJhZjgyMTc4',
 'MGVjM2NhNTEtZDA4ZC00OGYyLWFkY2UtNDM1Y2NhYTU4NDJi',
 'Y2U3MjNlYzAtYzQyMS00YTg4LTk1ZDAtMmRjNGI3M2JmYzU5',
 'NTVhY2JmNzItMDcyNS00MzdjLThmMjItNGRkZDFiYzEyNDcy',
 'YzdmY2ExNjMtYmY5Ny00NjdiLTk3ZmEtMTlkYjUyNDM5NDgz',
 'ZDU3ODNmODMtYmMxZC00N2I2LWI3NDItOTA1ZmQ2MmE4

In [30]:
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.chains import LLMChain


# Prompt for the question-generator chain: it receives {chat_history} and
# {question} and ends with "Standalone question:", i.e. it asks the model to
# rewrite a follow-up question as a standalone one.
# NOTE(review): the first four lines are answering instructions that refer to
# "the sources below", but this template has no placeholder for sources —
# confirm they are intended here rather than in the answering prompt.
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template("""
Assistant helps the company employees with their questions on company policies, roles. 
Answer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. 
Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question. 
Always include the source metadata for each fact you use in the response. Use square brakets to reference the source, e.g. [role_library_pdf-10]. 
Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:""")
# NOTE(review): the key and base URL are re-read from the same environment
# variables that the embeddings/vector-store cell already reads.
openai.api_key = os.getenv("CHATGPT_OPENAI_API_KEY")  
openai.api_base = os.getenv("CHATGPT_OPENAI_API_BASE")  
print("openai.api_base", openai.api_base)
# Chat model bound to the "gpt-4" deployment, with temperature 0.
llm = AzureChatOpenAI(deployment_name="gpt-4", openai_api_base= openai.api_base, openai_api_key=openai.api_key, temperature=0)
# Chain that applies CONDENSE_QUESTION_PROMPT to the chat history and question.
question_generator = LLMChain(llm=llm, prompt=CONDENSE_QUESTION_PROMPT)
# Question-answering-with-sources chain, using the "map_reduce" chain type.
doc_chain = load_qa_with_sources_chain(llm, chain_type="map_reduce")
# Combine question rewriting, retrieval from the Azure Search vector store and
# answering; return_source_documents=True so the sources can be listed below.
qa = ConversationalRetrievalChain(question_generator=question_generator,
                                           retriever=acs.as_retriever(),
                                           return_source_documents=True,
                                           combine_docs_chain=doc_chain,                                           
                                           verbose=False)

# Start with an empty conversation history.
chat_history = []
query = """What are responsibilities of a Vice President of Human Resources and responsibilitie Manager of Human Resources? 
Provide responses for each in bulleted format. Specify the responsibilities and source for each fact you use in the answer separately."""

# Run the chain once with the query and the (empty) history.
result = qa({"question": query, "chat_history": chat_history})

print("Question:", query)
print("Answer:", result["answer"])
# List the "sourcepage" metadata of every returned source document.
for document in result.get("source_documents", []):
    print("Source Document:", document.metadata["sourcepage"])


openai.api_base https://ist-0604-prsaipoc-0000-openai-ce.openai.azure.com/
Question: What are responsibilities of a Vice President of Human Resources and responsibilitie Manager of Human Resources? 
Provide responses for each in bulleted format. Specify the responsibilities and source for each fact you use in the answer separately.
Answer: Responsibilities of a Vice President of Human Resources:
• Develop, implement and monitor comprehensive HR strategies and initiatives
• Foster a positive and productive work environment
• Collaborate with other departments to ensure alignment of HR initiatives with the company’s overall strategy
• Oversee the recruitment and onboarding process
• Develop, implement and monitor training and development initiatives
• Manage employee relations, including conflict resolution, disciplinary action and performance management
• Develop and implement compensation and benefit plans
• Track and analyze HR metrics
• Ensure compliance with all applicable laws and 