# Azure Cognitive Search Vector Search Code Sample with Azure OpenAI
This code demonstrates how to use LangChain with Azure Cognitive Search and Azure OpenAI.


In [None]:
! pip install azure-search-documents --pre --upgrade
! pip install openai --upgrade
! pip install python-dotenv
! pip install tenacity --upgrade
! pip install openai[datalib] --upgrade
! pip install langchain --upgrade
! pip install tiktoken --upgrade
! pip install azure-identity --upgrade
! pip install azure-core --pre --upgrade

## Import required libraries and environment variables

In [None]:
# Import required libraries  
import os  
import openai  
from dotenv import load_dotenv  
from tenacity import retry, wait_random_exponential, stop_after_attempt  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  

# Configure environment variables  
# override=True: values from the .env file take precedence over variables
# already set in the process environment (python-dotenv behavior).
load_dotenv(override=True)  
# OpenAI init
# Point the openai SDK at Azure OpenAI; the API version is read from the environment.
openai.api_type = "azure"  
openai.api_version = os.getenv("OPENAI_API_VERSION")  
# Form Recognizer init
# NOTE: os.getenv returns None for an unset variable, so a missing .env entry
# only surfaces when the value is first used, not as a clear configuration error.
formrecognizer_key = os.getenv("AZURE_FORMRECOGNIZER_KEY")
formrecognizer_creds = AzureKeyCredential(formrecognizer_key)
# Service name only; the full endpoint URL is built where the client is created.
formrecognizerservice = os.getenv("AZURE_FORMRECOGNIZER_SERVICE")
# Azure search init
searchservice = os.getenv("AZURE_SEARCH_SERVICE")
# If AZURE_SEARCH_SERVICE is unset this becomes "https://None.search.windows.net/".
azure_search_endpoint = f"https://{searchservice}.search.windows.net/"
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")  
azure_search_key = os.getenv("AZURE_SEARCH_ADMIN_KEY")  
# Credential object for the Azure SDK clients; not referenced by the other
# code visible in this notebook, which passes azure_search_key directly.
search_credential = AzureKeyCredential(azure_search_key)

## Create PDF parsing functions

In [None]:
from azure.ai.formrecognizer import DocumentAnalysisClient
import html

def table_to_html(table):
    """Render a table object as an HTML ``<table>`` string.

    ``table`` must expose ``row_count`` and ``cells``; every cell must expose
    ``row_index``, ``column_index``, ``kind``, ``column_span``, ``row_span``
    and ``content``.

    One ``<tr>`` is emitted per row index in ``range(table.row_count)``, with
    that row's cells ordered by ``column_index``. Cells whose kind is
    "columnHeader" or "rowHeader" become ``<th>``, all others ``<td>``.
    Spans greater than 1 are written as ``colSpan`` / ``rowSpan`` attributes,
    and the cell content is HTML-escaped.
    """
    pieces = ["<table>"]
    for row_number in range(table.row_count):
        # Cells that belong to this row, left to right.
        ordered_cells = sorted(
            (c for c in table.cells if c.row_index == row_number),
            key=lambda c: c.column_index,
        )
        pieces.append("<tr>")
        for cell in ordered_cells:
            tag = "th" if cell.kind in ("columnHeader", "rowHeader") else "td"
            span_attrs = []
            if cell.column_span > 1:
                span_attrs.append(f" colSpan={cell.column_span}")
            if cell.row_span > 1:
                span_attrs.append(f" rowSpan={cell.row_span}")
            attributes = "".join(span_attrs)
            pieces.append(f"<{tag}{attributes}>{html.escape(cell.content)}</{tag}>")
        pieces.append("</tr>")
    pieces.append("</table>")
    return "".join(pieces)

def get_document_text(filename):
    """Extract per-page text from a file using Azure Form Recognizer.

    The file is analyzed with the "prebuilt-layout" model. For every page,
    characters covered by a table span are dropped and the table is inserted
    once, rendered by ``table_to_html``, at the position of its first
    character. A single space is appended to each page's text.

    Returns a list of ``(page_num, offset, page_text)`` tuples, where
    ``page_num`` is the zero-based page index and ``offset`` is the sum of
    the lengths of all preceding page texts.
    """
    print(f"Extracting text from '{filename}' using Azure Form Recognizer")

    client = DocumentAnalysisClient(
        endpoint=f"https://{formrecognizerservice}.cognitiveservices.azure.com/",
        credential=formrecognizer_creds,
        headers={"x-ms-useragent": "azure-search-chat-demo/1.0.0"},
    )
    with open(filename, "rb") as document_file:
        poller = client.begin_analyze_document("prebuilt-layout", document=document_file)
    analysis = poller.result()

    page_map = []
    running_offset = 0
    for page_num, page in enumerate(analysis.pages):
        # Tables are assigned to the page of their first bounding region
        # (page numbers in the result are 1-based).
        page_tables = [
            table
            for table in analysis.tables
            if table.bounding_regions[0].page_number == page_num + 1
        ]

        # Only the first span of the page is considered.
        first_span = page.spans[0]
        page_offset = first_span.offset
        page_length = first_span.length

        # table_owner[i] is the index (into page_tables) of the table covering
        # character i of the page, or -1 when the character is plain text.
        table_owner = [-1] * page_length
        for table_id, table in enumerate(page_tables):
            for span in table.spans:
                span_start = span.offset - page_offset
                for idx in range(span_start, span_start + span.length):
                    # Ignore the parts of a span that fall outside this page.
                    if 0 <= idx < page_length:
                        table_owner[idx] = table_id

        # Build the page text, replacing characters in table spans with table HTML.
        chunks = []
        emitted_tables = set()
        for idx, table_id in enumerate(table_owner):
            if table_id == -1:
                chunks.append(analysis.content[page_offset + idx])
            elif table_id not in emitted_tables:
                chunks.append(table_to_html(page_tables[table_id]))
                emitted_tables.add(table_id)
        chunks.append(" ")
        page_text = "".join(chunks)

        page_map.append((page_num, running_offset, page_text))
        running_offset += len(page_text)

    return page_map


# Print the Form Recognizer endpoint URL derived from AZURE_FORMRECOGNIZER_SERVICE
# (same URL format that get_document_text passes to DocumentAnalysisClient);
# presumably a quick check that the environment was loaded correctly.
print(f"https://{formrecognizerservice}.cognitiveservices.azure.com/")

## Create functions to prepare content for upload to Azure Cognitive Search

In [None]:
import re

def blob_name_from_file_page(filename, page = 0):
    """Return the per-page name used to reference a source file.

    For a file whose extension is ".pdf" (case-insensitive) the result is
    "<base name without extension>-<page>.pdf". Any other file yields just
    its base name, and ``page`` is ignored.
    """
    base_name = os.path.basename(filename)
    extension = os.path.splitext(filename)[1]
    if extension.lower() != ".pdf":
        return base_name
    stem = os.path.splitext(base_name)[0]
    return f"{stem}-{page}.pdf"
    
def create_sections(filename, page_map):
    """Yield one section dict per entry of ``page_map``.

    Each entry is indexed positionally: element 0 is the zero-based page
    index and element 2 is the page text. The section id is
    "<filename>-<1-based page number>" with every character outside
    ``[0-9a-zA-Z_-]`` replaced by "_"; the same value is stored under
    both "id" and "source".
    """
    for entry in page_map:
        page_number = entry[0] + 1
        section_id = re.sub("[^0-9a-zA-Z_-]", "_", f"{filename}-{page_number}")
        section = {
            "id": section_id,
            "content": entry[2],
            "source": section_id,
            "category": "test01",
            "sourcepage": blob_name_from_file_page(filename, page_number),
            "sourcefile": filename,
        }
        yield section


## Parse documents


In [None]:
import glob

# Parse every file under ./data and collect the sections of ALL of them.
# - The pattern is built with os.path.join instead of the literal ".\data\*":
#   "\d" and "\*" are invalid escape sequences (a SyntaxWarning on recent
#   Python versions) and the backslash separators only match on Windows.
#   On Windows the joined pattern is still ".\data\*".
# - `sections` is a list that is extended per file. Previously it was
#   reassigned on every iteration, so only the last file's sections survived
#   the loop, and it was left undefined when no file matched.
sections = []
for filename in glob.glob(os.path.join(".", "data", "*")):
    print(f"Processing '{filename}'")
    page_map = get_document_text(filename)
    sections.extend(create_sections(os.path.basename(filename), page_map))


In [None]:
import os
from langchain.chat_models import AzureChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import AzureSearch
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate

# Credentials and endpoint of the Azure OpenAI resource, read from the environment
# (os.getenv returns None when a variable is unset).
openai.api_key = os.getenv("CHATGPT_OPENAI_API_KEY")  
openai.api_base = os.getenv("CHATGPT_OPENAI_API_BASE")  
print("openai.api_base", openai.api_base)

# Embedding client for the "text-embedding-ada-002" deployment on Azure OpenAI.
# NOTE(review): chunk_size=1 presumably limits each embedding request to a single
# input, as some Azure OpenAI API versions required — confirm for the version in use.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", deployment="text-embedding-ada-002", chunk_size=1, openai_api_type="azure", openai_api_base= openai.api_base, openai_api_key= openai.api_key) 
print ("search endpoint", azure_search_endpoint)
# The vector store is given the single-query embedding callable.
embedding_function=embeddings.embed_query

# Connect to Azure Cognitive Search
# Positional arguments: service endpoint, admin key, index name.
acs = AzureSearch(azure_search_endpoint,
                 azure_search_key,
                 index_name,
                 embedding_function=embedding_function)



In [None]:
from langchain.schema.document import Document

def create_documents(sections):
    """Convert section dicts into a list of ``Document`` objects.

    Each section must provide the keys "id", "sourcepage", "sourcefile"
    and "content". The section id is stored under both the "id" and
    "source" metadata keys, and the category is always "test01".
    """
    return [
        Document(
            metadata={
                "id": section["id"],
                "category": "test01",
                "sourcepage": section["sourcepage"],
                "sourcefile": section["sourcefile"],
                "source": section["id"],
            },
            page_content=section["content"],
        )
        for section in sections
    ]

# Turn the parsed sections into Document objects.
documents = create_documents(sections)

# Add the documents to the Azure Cognitive Search index through the vector store.
acs.add_documents(documents=documents)

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.chat_models import AzureChatOpenAI
from langchain.vectorstores.azuresearch import AzureSearch
from langchain.prompts import PromptTemplate

# Same Azure OpenAI credentials/endpoint as in the vector store cell; re-read here
# so this cell does not depend on those assignments having been executed.
openai.api_key = os.getenv("CHATGPT_OPENAI_API_KEY")  
openai.api_base = os.getenv("CHATGPT_OPENAI_API_BASE")  
print("openai.api_base", openai.api_base)
# Chat model backed by the "gpt-4" deployment, with temperature 0.
model = AzureChatOpenAI(deployment_name="gpt-4", openai_api_base= openai.api_base, openai_api_key=openai.api_key, temperature=0)

# Expose the Azure Cognitive Search vector store as a retriever (default settings).
retriever=acs.as_retriever()

# Prompt with two placeholders: {context} (retrieved documents) and {question}.
template = """
Assistant helps the company employees with their questions on company policies, roles. 
Always include the source metadata for each fact you use in the response. Use square brakets to reference the source, e.g. [role_library_pdf-10]. 
Properly format the output for human readability with new lines.
Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Pipeline: the input string is sent to the retriever (-> "context") and passed
# through unchanged (-> "question"); the filled prompt goes to the chat model
# and the reply is reduced to a plain string.
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

#chain.invoke("where did harrison work?")
# Run a sample question against the indexed documents and print the answer.
result = chain.invoke("Responsibilities of Director of Operations?")
print (result)