Azure AI Search with LlamaIndex

This notebook demonstrates how to use Azure AI Search with Azure OpenAI and the LlamaIndex data framework.

Install packages

In [2]:
! pip install -r requirements.txt --quiet

You should consider upgrading via the '/Users/sithukaungset/Azure-AI-Search-prompthon/venv/bin/python3 -m pip install --upgrade pip' command.[0m[33m
[0m

Load .env file

In [8]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os

load_dotenv(override=True) # take environment variables from .env.

# Make sure your .env file has values for the following environment variables
endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]

# The admin key is optional: when it is unset OR empty, fall back to
# DefaultAzureCredential. os.getenv is used so that a missing variable does not
# raise KeyError before the fallback can be selected.
search_admin_key = os.getenv("AZURE_SEARCH_ADMIN_KEY", "")
if len(search_admin_key) > 0:
    credential = AzureKeyCredential(search_admin_key)
else:
    credential = DefaultAzureCredential()

index_name = os.environ["AZURE_SEARCH_INDEX"]
azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
# Llama Index does not support RBAC authentication, an API key is required
azure_openai_key = os.environ["AZURE_OPENAI_KEY"]
if len(azure_openai_key) == 0:
    raise Exception("API key required")
azure_openai_embedding_model = os.environ["AZURE_OPENAI_EMBEDDING_MODEL_NAME"]
azure_openai_embedding_deployment = os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT"]
azure_openai_chatgpt_deployment = os.environ["AZURE_OPENAI_CHATGPT_DEPLOYMENT"]
azure_openai_api_version = os.environ["AZURE_OPENAI_API_VERSION"]
# Environment variables are always strings; the embedding dimension count is
# numeric, so convert it once here (raises ValueError if it is not an integer).
embedding_dimensions = int(os.environ["AZURE_OPENAI_EMBEDDING_DIMENSIONS"])

Configure the embedding model and LLM instances

In [9]:
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.llms.azure_openai import AzureOpenAI

# Connection settings shared by the embedding client and the chat LLM client:
# both talk to the same Azure OpenAI resource with the same key and API version.
azure_openai_connection = {
    "api_version": azure_openai_api_version,
    "azure_endpoint": azure_openai_endpoint,
    "api_key": azure_openai_key,
}

# Embedding client, bound to the embedding model and its deployment.
embeddings = AzureOpenAIEmbedding(
    model_name=azure_openai_embedding_model,
    deployment_name=azure_openai_embedding_deployment,
    **azure_openai_connection,
)

# Chat LLM client, bound to the ChatGPT deployment.
llm = AzureOpenAI(
    deployment_name=azure_openai_chatgpt_deployment,
    **azure_openai_connection,
)

In [15]:
! pip install --upgrade jupyter ipywidgets docx2txt

Collecting jupyter
  Using cached jupyter-1.0.0-py2.py3-none-any.whl (2.7 kB)
Collecting ipywidgets
  Using cached ipywidgets-8.1.2-py3-none-any.whl (139 kB)
Collecting jupyter-console
  Using cached jupyter_console-6.6.3-py3-none-any.whl (24 kB)
Collecting qtconsole
  Downloading qtconsole-5.5.1-py3-none-any.whl (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.4/123.4 KB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting nbconvert
  Downloading nbconvert-7.16.3-py3-none-any.whl (257 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m257.4/257.4 KB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting notebook
  Downloading notebook-7.1.2-py3-none-any.whl (5.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting jupyterlab-widgets~=3.0.10
  Using cached jupyterlab_widgets-3.0.10-py3-none-any.whl (215 kB)
Collecting widgetsnbextension~=4.

In [11]:
! pip install torch transformers python-pptx Pillow

Collecting torch
  Downloading torch-2.2.2-cp310-none-macosx_11_0_arm64.whl (59.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.7/59.7 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting transformers
  Downloading transformers-4.39.3-py3-none-any.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting python-pptx
  Using cached python_pptx-0.6.23-py3-none-any.whl (471 kB)
Collecting jinja2
  Using cached Jinja2-3.1.3-py3-none-any.whl (133 kB)
Collecting sympy
  Using cached sympy-1.12-py3-none-any.whl (5.7 MB)
Collecting filelock
  Downloading filelock-3.13.4-py3-none-any.whl (11 kB)
Collecting tokenizers<0.19,>=0.14
  Downloading tokenizers-0.15.2-cp310-cp310-macosx_11_0_arm64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[

Upload, vectorize, and index documents

This step reads the documents (PDF, Word, PowerPoint, and other supported formats) from a local folder, calls the embedding model to vectorize them, and then calls a search client to index the content in Azure AI Search.



In [12]:

from llama_index.vector_stores.azureaisearch import AzureAISearchVectorStore, IndexManagement, MetadataIndexFieldType
from llama_index.core import StorageContext, VectorStoreIndex, SimpleDirectoryReader
from llama_index.core import Settings
from azure.search.documents.indexes import SearchIndexClient

# Metadata fields to expose as filterable fields on the search index.
# Presumably each entry maps a document metadata key to the index field name,
# optionally with an explicit field type as a (name, type) tuple — confirm
# against the AzureAISearchVectorStore documentation.
metadata_fields = {
    "author": "author",
    "theme": ("topic", MetadataIndexFieldType.STRING),
    "director": "director",
}

# Vector store backed by Azure AI Search; the index is created on first use
# if it does not exist yet (IndexManagement.CREATE_IF_NOT_EXISTS).
vector_store = AzureAISearchVectorStore(
    search_or_index_client=SearchIndexClient(endpoint=endpoint, credential=credential),
    filterable_metadata_field_keys=metadata_fields,
    index_name=index_name,
    index_management=IndexManagement.CREATE_IF_NOT_EXISTS,
    id_field_key="id",
    chunk_field_key="content",
    embedding_field_key="content_vector",
    metadata_string_field_key="metadata",
    doc_id_field_key="doc_id",
    # The value originates from an environment variable (a string); the
    # dimensionality must be an integer. int() is a no-op if it already is one.
    embedding_dimensionality=int(embedding_dimensions),
    language_analyzer="en.lucene",
    vector_algorithm_type="exhaustiveKnn"
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Register the LLM and embedding model as the LlamaIndex-wide defaults.
Settings.llm = llm
Settings.embed_model = embeddings

# Resolve ../data/documents relative to the current working directory.
directory = os.path.abspath(os.path.join("..", "data", "documents"))
documents = SimpleDirectoryReader(directory).load_data()

# Build the index from the loaded documents using the Azure AI Search storage context.
index = VectorStoreIndex.from_documents(
    documents=documents,
    storage_context=storage_context)

  from .autonotebook import tqdm as notebook_tqdm


Failed to load file /Users/sithukaungset/Azure-AI-Search-prompthon/data/documents/PPT샘플(한글)_세미나발표.pptx with error: cannot find loader for this WMF file. Skipping...


ImportError: docx2txt is required to read Microsoft Word files: `pip install docx2txt`

Perform a vector similarity search and generate an answer


In [None]:
# Ask a natural-language question against the index and print the answer.
question = "What is included in my Northwind Health Plus plan that is not in standard?"
query_engine = index.as_query_engine(llm)
response = query_engine.query(question)
print(response)