In [2]:
# Import required libraries
import os
import openai
from dotenv import load_dotenv
from tenacity import retry, wait_random_exponential, stop_after_attempt
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient, SearchIndexingBufferedSender
from azure.search.documents.indexes import SearchIndexClient
from azure.storage.blob import BlobServiceClient
from pdfminer.high_level import extract_text
from azure.search.documents.models import VectorizableTextQuery
from azure.search.documents.indexes.models import (
    ExhaustiveKnnVectorSearchAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SearchIndex,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SearchField,
    SemanticSettings,
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration,
    HnswParameters,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
    AzureOpenAIParameters,
    AzureOpenAIVectorizer,
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration,
    ExhaustiveKnnVectorSearchAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SearchIndex,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SearchField,
    SemanticSettings, 
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration,
    HnswParameters,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
    AzureOpenAIParameters,
    AzureOpenAIVectorizer,
) 

In [3]:
## Additional Imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import Callable
import io
import base64

In [None]:
# Load settings from a local .env file into the process environment.
load_dotenv()

# Azure AI Search connection settings.
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")
key = os.getenv("AZURE_SEARCH_ADMIN_KEY")

# Azure OpenAI settings for the module-level client behind openai.embeddings.create().
openai.api_type = "azure"
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")
# openai>=1.0 (the API style this notebook calls) reads `azure_endpoint`;
# `api_base` is the pre-1.0 attribute name and is kept only for code that may still read it.
openai.azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_base = openai.azure_endpoint
openai.api_version = os.getenv("AZURE_OPENAI_API_VERSION")

# Azure Blob Storage settings for the container holding the source documents.
storage_url = f"https://{os.getenv('AZURE_STORAGE_ACCOUNT_NAME')}.blob.core.windows.net/"
storage_key = os.getenv("AZURE_STORAGE_ACCOUNT_KEY")
storage_container_name = os.getenv("AZURE_STORAGE_BLOB_CONTAINER")

# Name passed as `deployment_id` to the index vectorizer.
model: str = "text-embedding-ada-002"

credential = AzureKeyCredential(key)
print(service_endpoint)
print(index_name)

In [None]:
# Create a search index
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)

# Text fields: the string document key plus three full-text searchable strings.
fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    SearchableField(name="title", type=SearchFieldDataType.String),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchableField(name="category", type=SearchFieldDataType.String, filterable=True),
]
# Two 1536-dimension vector fields, both bound to the HNSW profile defined below.
fields.extend(
    SearchField(
        name=vector_field_name,
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_profile="myHnswProfile",
    )
    for vector_field_name in ("titleVector", "contentVector")
)

# Vector search configuration: two algorithms, one profile per algorithm,
# and a single Azure OpenAI vectorizer shared by both profiles.
vector_search = VectorSearch(
    algorithms=[
        HnswVectorSearchAlgorithmConfiguration(
            name="myHnsw",
            kind=VectorSearchAlgorithmKind.HNSW,
            parameters=HnswParameters(m=4, ef_construction=400, ef_search=500, metric="cosine"),
        ),
        ExhaustiveKnnVectorSearchAlgorithmConfiguration(
            name="myExhaustiveKnn",
            kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
            parameters=ExhaustiveKnnParameters(metric="cosine"),
        ),
    ],
    profiles=[
        VectorSearchProfile(name=profile_name, algorithm=algorithm_name, vectorizer="myOpenAI")
        for profile_name, algorithm_name in (
            ("myHnswProfile", "myHnsw"),
            ("myExhaustiveKnnProfile", "myExhaustiveKnn"),
        )
    ],
    vectorizers=[
        AzureOpenAIVectorizer(
            name="myOpenAI",
            kind="azureOpenAI",
            azure_open_ai_parameters=AzureOpenAIParameters(
                resource_uri=os.getenv("AZURE_OPENAI_ENDPOINT"),
                deployment_id=model,
                api_key=os.getenv("AZURE_OPENAI_API_KEY"),
            ),
        ),
    ],
)

# Semantic configuration: title, keyword (category) and content fields.
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=PrioritizedFields(
        title_field=SemanticField(field_name="title"),
        prioritized_keywords_fields=[SemanticField(field_name="category")],
        prioritized_content_fields=[SemanticField(field_name="content")],
    ),
)
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Assemble the index definition and create it (or update it if it already exists).
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_settings=semantic_settings,
)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

In [6]:
### function for breaking up document into different chunks

def split_text(text, chunk_size: int, chunk_overlap: int = 0, length_function: Callable[[str], int] = len):
    """Split ``text`` into chunks with a ``RecursiveCharacterTextSplitter``.

    Args:
        text: The text to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
        length_function: Forwarded to the splitter as ``length_function``
            (defaults to ``len``).

    Returns:
        Whatever ``create_documents([text])`` returns; the caller reads each
        item's ``page_content``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=length_function,
    )
    return splitter.create_documents([text])

In [None]:
# Embed and Upload Blobs to Azure Cognitive Search
import tempfile
import uuid

# Connect to the storage account with the account key, then open the
# container that holds the raw source files.
blob_client = BlobServiceClient(account_url=storage_url, credential=storage_key)
raw_files_container = blob_client.get_container_client(storage_container_name)

# Names of the blobs in that container; iterated by the embedding loop.
filenames = raw_files_container.list_blob_names()

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(text, model="text-embedding-ada-002"):
    """Return the embedding vector for ``text``.

    Used for the title and content fields, and also usable for query embeddings.
    The call is wrapped in a tenacity retry: random exponential waits
    (min=1, max=20) and at most 6 attempts.

    Args:
        text: The string to embed.
        model: Value passed as ``model`` to the embeddings API. Defaults to the
            name that used to be hard-coded, so existing callers are unaffected.
            NOTE(review): on Azure OpenAI this is presumably the deployment
            name - confirm it matches your deployment.

    Returns:
        ``response.data[0].embedding`` - the embedding of the single input.
    """
    response = openai.embeddings.create(input=text, model=model)
    return response.data[0].embedding


# Chunking parameters forwarded to split_text().
chunk_size = 1000
chunk_overlap = 0

# Documents accumulated here are sent to the search index in a later cell.
vector_documents = []
for filename in filenames:
    # Loading file and extracting contents
    rawFileBinary = raw_files_container.download_blob(filename).readall()
    blob_stream = io.BytesIO(rawFileBinary)
    contents = extract_text(blob_stream)

    chunks = split_text(contents, chunk_size, chunk_overlap)

    print(f"Found {filename}")

    # The title is identical for every chunk of a file, so embed it once per
    # file instead of once per chunk (and not at all when there are no chunks).
    title_vector = generate_embeddings(filename) if chunks else None

    for i, chunk in enumerate(chunks):
        chunk = chunk.page_content
        # The key must be a string and unique per chunk: encoding only the
        # filename produced bytes and gave every chunk of a file the same id,
        # so merge-or-upload would keep a single chunk per file.
        chunk_id = base64.urlsafe_b64encode(f"{filename}-{i}".encode("utf-8")).decode("ascii")
        vector_document = {
            "id": chunk_id,
            "title": filename,
            "content": chunk,
            "category": "hr docs",
            "titleVector": title_vector,
            "contentVector": generate_embeddings(chunk),
        }
        vector_documents.append(vector_document)

    # Use len(chunks) rather than the loop index, which is undefined (or stale
    # from the previous file) when a file yields no chunks. Nothing has been
    # sent to the index at this point, hence "Embedded".
    print(f"Embedded {len(chunks)} chunks for {filename}")

In [None]:
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)

# A single indexing request may carry at most 1000 documents, so send the
# documents in slices. An empty list simply results in no requests.
MAX_DOCS_PER_BATCH = 1000
result = []
for start in range(0, len(vector_documents), MAX_DOCS_PER_BATCH):
    batch = vector_documents[start:start + MAX_DOCS_PER_BATCH]
    result.extend(search_client.merge_or_upload_documents(batch))

# Report only after the upload has run, based on the per-document results.
failed = [item for item in result if not item.succeeded]
print(f"Uploaded {len(result) - len(failed)} of {len(vector_documents)} chunks")
for item in failed:
    print(f"Failed to index {item.key}: {item.error_message}")

In [None]:
# Pure Vector Search
query = "record your DTO usage"

search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)

# Send the query as text against the contentVector field, asking for k=3 neighbours.
vector_query = VectorizableTextQuery(text=query, k=3, fields="contentVector")
# Alternative: embed the query locally and pass the raw vector instead of the query text
# vector_query = RawVectorQuery(vector=generate_embeddings(query), k=3, fields="contentVector")

# No keyword text: results come from the vector query alone.
results = search_client.search(
    search_text=None,
    vector_queries=[vector_query],
    select=["title", "content", "category"],
)

# Print each raw hit, then a short summary with the first 150 characters of content.
for result in results:
    print(result)
    print(f"Title: {result['title']}")
    print(f"Score: {result['@search.score']}")
    print(f"Content: {result['content'][:150]}")
    print(f"Category: {result['category']}\n")