Azure AI Search with LangChain

How to use Azure AI Search with Azure OpenAI and LangChain

In [None]:
! pip install -r requirements.txt --quiet


Load .env file

In [7]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
import os

load_dotenv(override=True) # take environment variables from .env.

# Variables not used here do not need to be updated in your .env file
endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
# The search admin key is optional: an unset or empty variable both become None
# (instead of raising KeyError) so `credential` below can fall back to
# DefaultAzureCredential.
key_credential = os.environ.get("AZURE_SEARCH_ADMIN_KEY") or None
index_name = "megazone"
azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
# Same optional handling for the Azure OpenAI key: unset or empty -> None.
azure_openai_key = os.environ.get("AZURE_OPENAI_KEY") or None
azure_openai_embedding_deployment = os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT"]
azure_openai_api_version = os.environ["AZURE_OPENAI_API_VERSION"]

# Use the admin key when one was provided, otherwise a DefaultAzureCredential instance.
credential = key_credential or DefaultAzureCredential()

Create LangChain Azure OpenAI Embeddings

In [8]:
from langchain_openai import AzureOpenAIEmbeddings
from azure.identity import DefaultAzureCredential, get_bearer_token_provider

# Token provider for the "https://cognitiveservices.azure.com/.default" scope,
# built on DefaultAzureCredential.
openai_credential = DefaultAzureCredential()
token_provider = get_bearer_token_provider(
    openai_credential,
    "https://cognitiveservices.azure.com/.default",
)

# Use API key if provided, otherwise use RBAC authentication:
# the token provider is only handed over when no key is configured.
if azure_openai_key:
    _ad_token_provider = None
else:
    _ad_token_provider = token_provider

embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=azure_openai_endpoint,
    azure_deployment=azure_openai_embedding_deployment,
    openai_api_version=azure_openai_api_version,
    api_key=azure_openai_key,
    azure_ad_token_provider=_ad_token_provider,
)

Create LangChain Vector Store

In [9]:
# Import AzureSearch from langchain_community; the `langchain.vectorstores`
# import path is a deprecated alias for the same integration.
from langchain_community.vectorstores.azuresearch import AzureSearch

# Vector store over the Azure AI Search index named by `index_name`.
# `azure_search_key` may be None when no admin key was configured.
vector_store = AzureSearch(
    azure_search_endpoint=endpoint,
    azure_search_key=key_credential,
    index_name=index_name,
    # Queries are embedded with the Azure OpenAI embeddings' `embed_query` method.
    embedding_function=embeddings.embed_query,
    # NOTE(review): presumably a semantic configuration called "default" must
    # exist on (or be created for) the index — confirm against the service.
    semantic_configuration_name="default"
)

In [14]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

# Sample PDFs live in ../data/documents relative to this notebook.
directory = os.path.join("..", "data", "documents")
files = [
    "PDF샘플(한글)_세미나발표.pdf",
    "마산항인보이스샘플.pdf",
]
# One splitter shared by every file: chunks of up to 1000 characters, no overlap.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

total_chunks = 0
for file in files:
    # Load the PDF, split it into chunks, and push the chunks to the vector store.
    pdf_path = os.path.join(directory, file)
    loader = PyPDFLoader(pdf_path)
    file_chunks = loader.load_and_split(splitter)
    results = vector_store.add_documents(documents=file_chunks)
    # Count what the store reports back for this file.
    total_chunks = total_chunks + len(results)
    print(f"Indexed {file}")

print(f"Indexed {total_chunks} chunks")

Ignoring wrong pointing object 2 0 (offset 0)
Ignoring wrong pointing object 56 0 (offset 0)
Ignoring wrong pointing object 60 0 (offset 0)
Ignoring wrong pointing object 65 0 (offset 0)
Ignoring wrong pointing object 70 0 (offset 0)
Ignoring wrong pointing object 100 0 (offset 0)


Indexed PDF샘플(한글)_세미나발표.pdf
Indexed 마산항인보이스샘플.pdf
Indexed 11 chunks


Perform a vector similarity search

In [15]:
# Vector similarity search: request 50 matches, then keep the first three for display.
docs = vector_store.similarity_search(query="지능화된 연계기술?", k=50, search_type="similarity")[:3]

# For each retained chunk print a separator, its source file, and its text.
for doc in docs:
    print(80 * "-")
    print(f"Source: {doc.metadata['source']}")
    print(f"Chunk Content: {doc.page_content}")

--------------------------------------------------------------------------------
Source: ../data/documents/PDF샘플(한글)_세미나발표.pdf
Chunk Content: Copyright 2024 ⓒMegazoneCloud . ALL RIGHT RESERVED.01. Demo Session 1  – AOAI 기반RDB연동Chatbot (회의실예약시스템 )
TO-BE) AOAI 의지능화된연계기술 (Function calling) 을통해 Legacy 시스템과연계하고 ,
대화를통한추천방식으로전환하여단계별로진행되던업무과정을 혁신적으로 간소화할수있습니다 .
회의실 예약업무 문제 해결 과정 RDB 연동 LLM Chatbot 구성현황
예약신청
예약취소
예약변경회의실전체현황
조회회의실조건검색
대안고민
(회의실없는경우)반복적고민
(인원, 빈회의실 ,회의환경 )
기존
문제
개선시간소요
(매번동일한
조건으로입력)단계적인수행없이 One-Stop 업무처리가능한방식으로개선됨
변화) 검색방식 →대화, 추천방식
효과) 2분→30초(75% 단축)
IT(DB)와연동된 Chatbot 동작구조
“내일 회의실 좀예약해줘”
지난주에하신“TF 주간회의“
시면 6번Room으로오전 9시
반예약해드릴까요 ?
“오늘 회의 취소 됬어”
“회의 어디서 하지?”
오후 3시예약된“OO이슈회의”
취소해드릴까요 ?
10분후참석할“OO회의”
장소는“Room5번”입니다 .
네
기업내
정보시스템
정형데이터
(RDBMS, SAP 등)
One-Stop처리
(추천방식)
VOC 
처리인사제도
Q&A회의실
예약생성형 AI 서비스플랫폼
AOAI
LLM
AI Studio
(AI개발환경 )
Copilot
Studio임직원
※ 지능화된 AOAI 연동방식
     - Function Calling
--------------------------------------------------------------------------------
Source: ..

Perform a hybrid search

In [16]:
# Hybrid search: same question and result count as the vector search,
# but with search_type="hybrid"; only the first three results are shown.
docs = vector_store.similarity_search("지능화된 연계기술?", k=50, search_type="hybrid")[:3]

# For each retained chunk print a separator, its source file, and its text.
for doc in docs:
    print(80 * "-")
    print(f"Source: {doc.metadata['source']}")
    print(f"Chunk Content: {doc.page_content}")

--------------------------------------------------------------------------------
Source: ../data/documents/PDF샘플(한글)_세미나발표.pdf
Chunk Content: Copyright 2024 ⓒMegazoneCloud . ALL RIGHT RESERVED.01. Demo Session 1  – AOAI 기반RDB연동Chatbot (회의실예약시스템 )
TO-BE) AOAI 의지능화된연계기술 (Function calling) 을통해 Legacy 시스템과연계하고 ,
대화를통한추천방식으로전환하여단계별로진행되던업무과정을 혁신적으로 간소화할수있습니다 .
회의실 예약업무 문제 해결 과정 RDB 연동 LLM Chatbot 구성현황
예약신청
예약취소
예약변경회의실전체현황
조회회의실조건검색
대안고민
(회의실없는경우)반복적고민
(인원, 빈회의실 ,회의환경 )
기존
문제
개선시간소요
(매번동일한
조건으로입력)단계적인수행없이 One-Stop 업무처리가능한방식으로개선됨
변화) 검색방식 →대화, 추천방식
효과) 2분→30초(75% 단축)
IT(DB)와연동된 Chatbot 동작구조
“내일 회의실 좀예약해줘”
지난주에하신“TF 주간회의“
시면 6번Room으로오전 9시
반예약해드릴까요 ?
“오늘 회의 취소 됬어”
“회의 어디서 하지?”
오후 3시예약된“OO이슈회의”
취소해드릴까요 ?
10분후참석할“OO회의”
장소는“Room5번”입니다 .
네
기업내
정보시스템
정형데이터
(RDBMS, SAP 등)
One-Stop처리
(추천방식)
VOC 
처리인사제도
Q&A회의실
예약생성형 AI 서비스플랫폼
AOAI
LLM
AI Studio
(AI개발환경 )
Copilot
Studio임직원
※ 지능화된 AOAI 연동방식
     - Function Calling
--------------------------------------------------------------------------------
Source: ..

Perform a hybrid search with semantic reranking (Powered by Bing)

In [None]:
# Perform a hybrid search with semantic reranking.
# The query text matches the one used by the vector and hybrid search cells so
# the three result sets can be compared directly.
docs_and_scores = vector_store.semantic_hybrid_search_with_score(
    query="지능화된 연계기술?",
    k=50,
)
docs_and_scores = docs_and_scores[:3]

# Print the results
for doc, score in docs_and_scores:
    print("-" * 80)
    # .get() keeps a result without semantic metadata from raising KeyError;
    # a missing or empty value simply skips the answer section.
    answers = doc.metadata.get('answers')
    if answers:
        # Prefer the highlighted answer text, falling back to the plain text.
        if answers.get('highlights'):
            print(f"Semantic Answer: {answers['highlights']}")
        else:
            print(f"Semantic Answer: {answers['text']}")
        # NOTE(review): `score` is the value paired with `doc` in the result tuple,
        # the same value printed as "Score" below — confirm this label is intended.
        print(f"Semantic Answer Score: {score}")
    print("Content:", doc.page_content)
    captions = doc.metadata.get('captions')
    print(f"Score: {score}")
    if captions:
        # Prefer the highlighted caption, falling back to the plain text.
        if captions.get('highlights'):
            print(f"Caption: {captions['highlights']}")
        else:
            print(f"Caption: {captions['text']}")
    else:
        print("Caption not available")