# RAG (AI Search and AOAI) Demo

The logic and part of the code come from https://github.com/Azure-Samples/azure-search-openai-demo, simplified to demonstrate just the RAG part.
1. Show how to transform a question into a search query using an AOAI prompt
2. Show how RAG works (AOAI + AI Search)
3. Show how citations (the sources of the answer) work


In [1]:
import os
import openai
from dotenv import load_dotenv  
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizedQuery
from azure.search.documents.models import (
    QueryType,
    QueryCaptionType,
    QueryAnswerType,
)
import re  
from xml.sax.saxutils import escape  
from IPython.display import HTML

In [2]:
# Load configuration from the .env file located in the current working directory.
envfilename = '.env'
current_path = os.getcwd()
env_path = os.path.join(current_path, envfilename)
# Kept as the last expression so the cell displays load_dotenv's return value.
load_dotenv(env_path)

True

In [3]:
# Azure AI Search settings (each is None when the variable is not set).
aisearch_endpoint = os.environ.get("aisearch_endpoint")
aisearch_key = os.environ.get("aisearch_key")
index_name = os.environ.get("aisearch_vectorindex")

# Azure OpenAI settings (each is None when the variable is not set).
azure_openai_endpoint = os.environ.get("azure_openai_endpoint")
azure_openai_key = os.environ.get("azure_openai_key")
azure_openai_api_version = os.environ.get("azure_openai_api_version")
azure_openai_gpt_model = os.environ.get("azure_openai_gpt_model")
azure_openai_embedding_deployment = os.environ.get("azure_openai_embedding_deployment")

In [4]:
# Sanity-check the loaded configuration.
# The search key is a credential: printing it would store it in the notebook's
# saved output, so only report whether a value was loaded.
print(aisearch_endpoint)
print("<aisearch_key is set>" if aisearch_key else "<aisearch_key is MISSING>")
print(index_name)
print(azure_openai_endpoint)
print(azure_openai_gpt_model)
print(azure_openai_api_version)
print(azure_openai_embedding_deployment)

https://gptkb-tybo4fifefmeo.search.windows.net
<redacted>
gptkbindex
https://cog-tybo4fifefmeo.openai.azure.com/
chat
2023-05-15
embedding


In [8]:
# Initialize the search client used to query the AI Search index.
# The key comes from `aisearch_key`, loaded together with the endpoint and index
# name, instead of reading the environment a second time; all three connection
# settings therefore originate from the same configuration cell.
credential = AzureKeyCredential(aisearch_key)
search_client = SearchClient(endpoint=aisearch_endpoint, index_name=index_name, credential=credential)

In [9]:
# Create the Azure OpenAI client shared by the embedding and chat calls below.
openaiclient = openai.AzureOpenAI(
    api_version=azure_openai_api_version,
    api_key=azure_openai_key,
    azure_endpoint=azure_openai_endpoint,
)

In [10]:
def generate_embeddings(text):
    """Return the embedding vector for ``text`` from the AOAI embedding deployment.

    ``text`` is sent as a one-element batch to ``openaiclient.embeddings.create``
    using the deployment named by ``azure_openai_embedding_deployment``; the
    ``embedding`` attribute of the first returned item is the result.
    """
    embedding_response = openaiclient.embeddings.create(
        input=[text],
        model=azure_openai_embedding_deployment,
    )
    first_item = embedding_response.data[0]
    return first_item.embedding

In [11]:
def nonewlines(s: str) -> str:
    """Return ``s`` with every line feed and carriage return replaced by one space."""
    # One translation pass maps both "\n" and "\r" to " " (a "\r\n" pair becomes two spaces).
    return s.translate(str.maketrans("\n\r", "  "))


## Define the user question. 

In [18]:
# The end-user question; it is sent to AOAI to derive a search query and later
# combined with the retrieved sources to form the final prompt.
user_query = "tell me more about contoso?"

### Use AOAI to generate a search query based on the question

In [21]:
# Ask the chat model to turn the user's question into a search query.
# The system prompt tells the model to answer with this sentinel when it cannot
# produce a query.
NO_QUERY_SENTINEL = "0"
response = openaiclient.chat.completions.create(
    model=azure_openai_gpt_model,
    messages=[
        {"role": "system", "content": "Generate a search query based on the question or input. If you cannot generate a search query, return just the number 0"},
        {"role": "user", "content": user_query},
    ],
)
# `content` may be None, so normalise to a stripped string before inspecting it.
generated_query = (response.choices[0].message.content or "").strip()
# Fall back to the raw question when no usable query was produced; otherwise the
# index would be searched for an empty string or the literal "0".
if generated_query and generated_query != NO_QUERY_SENTINEL:
    user_query_search = generated_query
else:
    user_query_search = user_query
print(f"generated search query: {user_query_search}")


generated search query: contoso


### Use hybrid search (full text + vector) without a semantic profile

In [32]:
# Hybrid search: the generated query is sent both as full-text `search_text` and
# as an embedding for the vector query against the "embedding" field.
TOP_K = 5  # number of nearest neighbours requested and number of hits returned
vector_query = VectorizedQuery(
    vector=generate_embeddings(user_query_search),
    k_nearest_neighbors=TOP_K,
    fields="embedding",
)

# Request only the fields that are read below instead of every retrievable field.
results = search_client.search(
    search_text=user_query_search,
    vector_queries=[vector_query],
    select=["content", "sourcepage", "sourcefile"],
    include_total_count=True,
    top=TOP_K,
)

# Save each hit as "sourcepage:content" (newlines flattened) for the prompt,
# and print a one-line summary per hit.
print("===search result===")
print(f"total: {results.get_count()}")
result_lists = []
for result in results:
    clean_content = nonewlines(result['content'])
    result_lists.append(f"{result['sourcepage']}:{clean_content}")
    print(f"{result['@search.score']}, {result['sourcepage']},{result['sourcefile']}")

===search result===
total: 149
0.032786883413791656, Contoso_Electronics_Company_Overview.md,Contoso_Electronics_Company_Overview.md
0.02982766181230545, Benefit_Options.pdf#page=1,Benefit_Options.pdf
0.02982456237077713, employee_handbook.pdf#page=1,employee_handbook.pdf
0.028612013906240463, Contoso_Electronics_Company_Overview.md,Contoso_Electronics_Company_Overview.md
0.0238095261156559, employee_handbook.pdf#page=6,employee_handbook.pdf


### Format the search results as `sourcepage:content`

In [31]:

# Preview the "sourcepage:content" entries that will be sent to AOAI.
# The banner is built from the same constant as the slice so the two cannot
# disagree about how many characters are shown.
PREVIEW_CHARS = 100
print(f"===Source content passed to AOAI, the following result only display first {PREVIEW_CHARS} characters===")
for source_entry in result_lists:
    print(source_entry[:PREVIEW_CHARS])



===Source content passed to AOAI, the following result only display first 300 charcters===
Contoso_Electronics_Company_Overview.md:# Contoso Electronics *Disclaimer: This content is generated
Benefit_Options.pdf#page=1:Contoso Electronics Plan and Benefit Packages Contoso ElectronicsThis doc
employee_handbook.pdf#page=1:Contoso Electronics Employee Handbook Contoso ElectronicsThis document 
Contoso_Electronics_Company_Overview.md: Here are some ways we celebrate achievements: - Monthly "In
employee_handbook.pdf#page=6: Prevention and Response Contoso Electronics is committed to preventing


### Define the system message that instructs AOAI to cite the source name in the response

In [33]:
# System prompt for the answer-generation call. It restricts the model to the
# supplied sources and asks it to cite each source name in square brackets,
# which is the pattern parse_answer_to_html splits on.
system_message = """ 
Answer ONLY with the facts listed in the list of sources below. 
If there isn't enough information below, say you don't know. 
Do not generate answers that don't use the sources below. 
If asking a clarifying question to the user would help, ask the question.
For tabular information return it as an html table. Do not return markdown format. If the question is not in English, answer in the language used in the question.
Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately, for example [info1.txt][info2.pdf].
"""


### Construct the user message: the original query plus the search results

In [34]:
# One "sourcepage:content" entry per line, appended to the question under a
# "Sources:" heading.
content = "\n".join(result_lists)
user_query_with_source = f"{user_query}\n\nSources:\n{content}"

print(user_query_with_source)

tell me more about contoso?

Sources:
Contoso_Electronics_Company_Overview.md:# Contoso Electronics *Disclaimer: This content is generated by AI and may not accurately represent factual information about any real entity. Use this information with caution and verify details from reliable sources.* ## History Contoso Electronics, a pioneering force in the tech industry, was founded in 1985 by visionary entrepreneurs with a passion for innovation. Over the years, the company has played a pivotal role in shaping the landscape of consumer electronics. | Year | Milestone | |------|-----------| | 1985 | Company founded with a focus on cutting-edge technology | | 1990 | Launched the first-ever handheld personal computer | | 2000 | Introduced groundbreaking advancements in AI and robotics | | 2015 | Expansion into sustainable and eco-friendly product lines | ## Company Overview At Contoso Electronics, we take pride in fostering a dynamic and inclusive workplace. Our dedicated team of experts co

### Pass the query to AOAI

In [35]:
# Second AOAI call: the system message carries the citation rules, and the user
# message carries the question together with the retrieved sources.
grounded_messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": user_query_with_source},
]
response = openaiclient.chat.completions.create(
    messages=grounded_messages,
    model=azure_openai_gpt_model,
    temperature=0.3,
)
print(response.choices[0].message.content)

Contoso Electronics is a tech company that was founded in 1985 [Contoso_Electronics_Company_Overview.md]. They have achieved several milestones over the years, including launching the first-ever handheld personal computer in 1990 and introducing advancements in AI and robotics in 2000 [Contoso_Electronics_Company_Overview.md]. In 2015, they expanded into sustainable and eco-friendly product lines [Contoso_Electronics_Company_Overview.md].

Contoso Electronics takes pride in fostering a dynamic and inclusive workplace, where a dedicated team of experts collaborates to create innovative solutions [Contoso_Electronics_Company_Overview.md].

They offer two comprehensive health insurance plans through Northwind Health, which provide coverage for medical, vision, dental services, prescription drugs, mental health, substance abuse, and preventive care [Benefit_Options.pdf#page=1].

Contoso Electronics specializes in providing advanced electronic components for both commercial and military air

## Frontend - how to transform the AOAI response to HTML

In [36]:
# Prefix placed in front of every citation path; empty here, so paths start at "/content/".
BACKEND_URI = ""


def get_citation_file_path(citation):
    """Return the content path for ``citation``: ``<BACKEND_URI>/content/<citation>``."""
    return "{}/content/{}".format(BACKEND_URI, citation)
  
def on_citation_clicked(citation_file_path):
    """Placeholder click callback: prints the path of the clicked citation.

    A real application would implement here whatever should happen when a
    citation is clicked.
    """
    print("Citation clicked: {}".format(citation_file_path))
  
def parse_answer_to_html(answer, on_citation_clicked):
    """Convert a model answer containing ``[source]`` citations into HTML.

    Every bracketed span is replaced by a superscript link. Citations are
    numbered from 1 in order of first appearance, and a repeated citation
    reuses its number.

    Args:
        answer: The answer text. ``None`` is treated as an empty answer.
        on_citation_clicked: Accepted for interface compatibility and not
            called here; the generated links invoke a JavaScript function
            named ``onCitationClicked``.

    Returns:
        A dict with ``'answerHtml'`` (the HTML string) and ``'citations'``
        (the distinct citation names in order of first appearance).
    """
    def attr_text(value):
        # saxutils.escape only handles &, < and >. The double quote must be
        # encoded as well because every attribute below is double-quoted.
        return escape(value, {'"': "&quot;"})

    def js_single_quoted(value):
        # Backslash-escape whatever would terminate or break a '...' JavaScript literal.
        return (
            value.replace("\\", "\\\\")
            .replace("'", "\\'")
            .replace("\n", "\\n")
            .replace("\r", "\\r")
        )

    citation_numbers = {}  # citation -> 1-based number, in first-seen order
    parsed_answer = (answer or "").strip()

    # The capturing group keeps the "[...]" tokens, so they land on the odd indexes.
    parts = re.split(r'(\[[^\]]+\])', parsed_answer)

    fragments = []
    for index, part in enumerate(parts):
        if index % 2 == 0:
            # Regular answer text is passed through unchanged; the system
            # prompt asks the model for HTML tables, so markup is expected here.
            fragments.append(part)
            continue

        citation = part[1:-1]  # drop the surrounding brackets
        citation_index = citation_numbers.setdefault(citation, len(citation_numbers) + 1)
        path = get_citation_file_path(citation)

        # The path is escaped for the JS literal first, then for the HTML
        # attribute, because the browser decodes the attribute before running it.
        fragments.append(
            f'<a class="supContainer" title="{attr_text(citation)}" '
            f'onclick="onCitationClicked(\'{attr_text(js_single_quoted(path))}\')">'
            f'<sup>{citation_index}</sup></a>'
        )

    return {
        'answerHtml': ''.join(fragments),
        'citations': list(citation_numbers),
    }
  

def render_citations_html(citations):
    """Render the citation names as a numbered HTML list.

    Args:
        citations: Iterable of citation names; numbering starts at 1 and
            follows iteration order.

    Returns:
        An HTML string: a ``div.App-citations`` holding a heading and one
        ``<li>`` link per citation. Each link calls a JavaScript function
        named ``handleCitationClick`` with the citation's content path.
    """
    def attr_text(value):
        # saxutils.escape only handles &, < and >. The double quote must be
        # encoded as well because the onclick attribute is double-quoted.
        return escape(value, {'"': "&quot;"})

    def js_single_quoted(value):
        # Backslash-escape whatever would terminate or break a '...' JavaScript literal.
        return (
            value.replace("\\", "\\\\")
            .replace("'", "\\'")
            .replace("\n", "\\n")
            .replace("\r", "\\r")
        )

    citation_list_items = []
    for citation_index, citation in enumerate(citations, start=1):
        citation_file_path = get_citation_file_path(citation)
        # JS-escape first, then attribute-escape: the browser decodes the
        # attribute before executing its contents.
        handler_arg = attr_text(js_single_quoted(citation_file_path))
        # The visible label is element text, so &, < and > are escaped.
        citation_list_items.append(
            f'<li><a href="#" onclick="handleCitationClick(\'{handler_arg}\')">'
            f'{citation_index}. {escape(citation)}</a></li>'
        )

    citation_list_html = '\n'.join(citation_list_items)
    return f'''
    <div class="App-citations">
        <h3>Citations</h3>
        <ul>
            {citation_list_html}
        </ul>
    </div>
    '''

# Convert the model's answer to HTML and build the matching citation list.
answer_text = response.choices[0].message.content
html_parsed_answer = parse_answer_to_html(answer_text, on_citation_clicked)
citations_html = render_citations_html(html_parsed_answer['citations'])

# Show the answer first, then its citations.
for rendered_html in (html_parsed_answer['answerHtml'], citations_html):
    display(HTML(rendered_html))