In [None]:
import json
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient 
from openai import AzureOpenAI
import pandas as pd  
import numpy as np

In [2]:
# Configuration cell: loads service settings from config.json, builds the
# Azure AI Search and Azure OpenAI clients, and echoes the non-secret settings.
openai_temperature = 0.1

# Load the configuration details for the AI Search Service and Azure OpenAI instance.
# Credentials should be secured using a more secure method such as Azure Key Vault.
# A context manager closes the file handle deterministically, and the explicit
# encoding avoids depending on the platform default when decoding the JSON.
with open("config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)

# Azure AI Search config
search_service_name = config["search_service_name"]
search_service_url = f"https://{search_service_name}.search.windows.net/"
search_admin_key = config["search_admin_key"]
index_name = config["search_index_name"]
search_api_version = config["search_api_version"]

# Azure OpenAI (embeddings deployment)
openai_embedding_api_base = config["openai_embedding_api_base"]
openai_embedding_api_key = config["openai_embedding_api_key"]
openai_embedding_api_version = config["openai_embedding_api_version"]
openai_embeddings_model = config["openai_embedding_model"]

# Azure OpenAI (GPT deployment)
openai_gpt_api_base = config["openai_gpt_api_base"]
openai_gpt_api_key = config["openai_gpt_api_key"]
openai_gpt_api_version = config["openai_gpt_api_version"]
openai_gpt_model = config["openai_gpt_model"]

# Application (service principal) identifiers read from the same config file.
client_id = config["client_id"]
client_secret = config["client_secret"]
tenant_id = config["tenant_id"]

# Both search clients authenticate with the admin key from config.json.
index_client = SearchIndexClient(
    endpoint=search_service_url,
    credential=AzureKeyCredential(search_admin_key),
)
search_client = SearchClient(
    endpoint=search_service_url,
    index_name=index_name,
    credential=AzureKeyCredential(search_admin_key),
)

# The API keys are passed explicitly from config.json; the
# AZURE_OPENAI_API_KEY environment variable is not consulted here.
embeddings_client = AzureOpenAI(
    api_version=openai_embedding_api_version,
    azure_endpoint=openai_embedding_api_base,
    api_key=openai_embedding_api_key,
)

gpt_client = AzureOpenAI(
    api_version=openai_gpt_api_version,
    azure_endpoint=openai_gpt_api_base,
    api_key=openai_gpt_api_key,
)

# Echo the non-secret settings so the reader can confirm which services are targeted.
# Keys and the client secret are intentionally not printed.
print('Search Service Name:', search_service_name)
print('Search Service URL:', search_service_url)
print('Index Name:', index_name)
print('Azure OpenAI Embeddings Base URL:', openai_embedding_api_base)
print('Azure OpenAI Embeddings Model:', openai_embeddings_model)
print('Azure OpenAI GPT Base URL:', openai_gpt_api_base)
print('Azure OpenAI GPT Model:', openai_gpt_model)
print('Tenant ID:', tenant_id)
print('Client ID:', client_id)


Search Service Name: randysbasiccogsearch
Search Service URL: https://randysbasiccogsearch.search.windows.net/
Index Name: onsemi-images-cwmd
Azure OpenAI Embeddings Base URL: https://randysopenaieast.openai.azure.com/
Azure OpenAI Embeddings Model: text-embedding-ada-002
Azure OpenAI GPT Base URL: https://randysopenaieast.openai.azure.com/
Azure OpenAI GPT Model: gpt-4o
Tenant ID: 16b3c013-d300-468d-ac64-7eda0820b6d3
Client ID: dea42ec1-02bd-48e7-a79e-4fc39071b824


In [17]:
# Inventory cell: reads every record in the index and lists, per source file,
# the chunk ids that were indexed.
search_client = SearchClient(
    endpoint=search_service_url,
    index_name=index_name,
    credential=AzureKeyCredential(search_admin_key),
)

# Print the number of records in the index
print(f"Found {search_client.get_document_count()} records in the index")

# Read the search results into a pandas DataFrame ("*" matches every document).
response = search_client.search(search_text="*")
df = pd.DataFrame(response)

# Collect the unique values of the file_name field. An empty index produces a
# DataFrame with no columns, so guard the lookup instead of raising KeyError.
if 'file_name' in df.columns:
    fileNames = df['file_name'].unique()
else:
    fileNames = np.array([], dtype=object)

# Print the number of unique file names
print(str(len(fileNames)) + " unique file names")

# Walk the file names in sorted order and show the chunk ids found for each.
for filename in np.sort(fileNames):
    print(filename)

    # Sorted unique chunk ids indexed for this file.
    chunks = np.sort(df.loc[df['file_name'] == filename, 'chunk_id'].unique())
    print(chunks)

    # Flag gaps in the chunk sequence. This assumes chunk ids are integers
    # numbered from 0 (as in the recorded output) -- TODO confirm against the
    # indexer. It cannot detect chunks missing from the end of a file.
    if chunks.size and np.issubdtype(chunks.dtype, np.integer):
        missing = np.setdiff1d(np.arange(chunks.max() + 1), chunks)
        if missing.size:
            print(f"WARNING: missing chunk ids for {filename}: {missing}")

Found 107 records in the index
5 unique file names
https://onsemistorage.blob.core.windows.net/doc2md/a5191hrt-d.pdf
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34]
https://onsemistorage.blob.core.windows.net/doc2md/dtc114y-d.pdf
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
https://onsemistorage.blob.core.windows.net/doc2md/fna25060-d.pdf
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
https://onsemistorage.blob.core.windows.net/doc2md/ntbg020n090sc1-d.pdf
[ 0  1  2  3  4  5  6  7  8  9 10]
https://onsemistorage.blob.core.windows.net/doc2md/nvtys005n04c-d.pdf
[ 0  1  2  3  4  5  6  7  8  9 10 11 12]
