## Create Index

In [1]:
# !pip install azure-search-documents==11.6.0b6
# !pip install azure-search-documents==11.5.2

In [2]:
from azure.core.credentials import AzureKeyCredential
from dotenv import dotenv_values, load_dotenv
import datetime

config = dotenv_values(".env")

container_name = config.get("storage_container")
storage_base_url = config.get("storage_base_url")
connection_string = config.get("storage_connection_string")

document_intelligence_endpoint = config.get("document_intelligence_endpoint")
document_intelligence_key = config.get("document_intelligence_key")

azure_openai_api_key = config.get("AZURE_OPENAI_API_KEY")
azure_openai_endpoint = config.get("AZURE_OPENAI_API_BASE")
azure_openai_api_version = config.get("AZURE_OPENAI_API_VERSION")
azure_openai_embedding_model = config.get("AZURE_OPENAI_EMBEDDING_MODEL")
chat_model = config.get("AZURE_OPENAI_MODEL")

search_credential = AzureKeyCredential(config.get("AZURE_AI_SEARCH_KEY"))
search_endpoint = config.get("AZURE_AI_SEARCH_ENDPOINT")

index_type = "advanced" # "simple" or advanced"
index_name = f"transformer-description-{index_type}"
index_name

'transformer-description-advanced'

In [3]:
from langchain_openai import AzureOpenAIEmbeddings

embeddings = AzureOpenAIEmbeddings(
    azure_deployment=azure_openai_embedding_model,
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_api_key,
    openai_api_version=azure_openai_api_version
)

embedding_function = embeddings.embed_query

In [4]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters,
    ComplexField,
    SimpleField,
    SearchField,
    SearchFieldDataType,
    SearchableField,
    ScoringProfile,
    FreshnessScoringFunction,
    FreshnessScoringParameters,
    TextWeights,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch
)

index_client = SearchIndexClient(
    endpoint=search_endpoint,
    credential=search_credential
)

fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True, filterable=True, sortable=True),
    SearchableField(name="title", type=SearchFieldDataType.String),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SimpleField(name="last_update", type=SearchFieldDataType.DateTimeOffset, filterable=True),
    SearchField(
        name="text_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=len(embedding_function("Text")), # 3072
        vector_search_profile_name="myHnswProfile"
    ),
    ComplexField(
        name="metadata",
        fields=[
            SearchableField(name="section_heading", type=SearchFieldDataType.String),
            SearchField(
                name="images",
                type=SearchFieldDataType.Collection(SearchFieldDataType.String)
            ),
            SimpleField(name="type", type=SearchFieldDataType.String),
            SimpleField(name="chunk_size", type=SearchFieldDataType.Int32),
            SimpleField(name="page_start", type=SearchFieldDataType.Int32),
            SimpleField(name="page_end", type=SearchFieldDataType.Int32),
            SimpleField(name="url", type=SearchFieldDataType.String),
            SearchField(
                name="keywords",
                type=SearchFieldDataType.Collection(SearchFieldDataType.String),
                searchable=True,
                filterable=True
            )
        ]
    )
]

# Adding a custom scoring profile with a freshness function
sc_name = "scoring_profile_1"
scoring_profile = ScoringProfile(
    name=sc_name,
    text_weights=TextWeights(
        weights={
            "title": 5
        }
    ),
    function_aggregation="sum",
    functions=[
        FreshnessScoringFunction(
            field_name="last_update",
            boost=100,
            parameters=FreshnessScoringParameters(boosting_duration="P2D"),
            interpolation="linear",
        )
    ],
)

# Adding vector search settings
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw"
        )
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
            vectorizer_name="myVectorizer"
        )
    ],
    vectorizers=[
        AzureOpenAIVectorizer(
            vectorizer_name="myVectorizer",
            parameters=AzureOpenAIVectorizerParameters(
                resource_url=azure_openai_endpoint,
                deployment_name=azure_openai_embedding_model,
                model_name=azure_openai_embedding_model,
                api_key=azure_openai_api_key
            )
        )
    ]
)


# Create the semantic settings with the configuration
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="title"),
        keywords_fields=[
            SemanticField(field_name="metadata/keywords")
        ],
        content_fields=[SemanticField(field_name="content")]
    )
)

semantic_search = SemanticSearch(configurations=[semantic_config])

#### Creating the index

In [5]:
from azure.search.documents.indexes.models import SearchIndex

# Create the search index
index = SearchIndex(
    name=index_name,
    fields=fields,
    scoring_profiles=[scoring_profile],
    vector_search=vector_search,
    semantic_search=semantic_search
)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

 transformer-description-advanced created


## Prep Data

In [6]:
import os
from tqdm import tqdm

from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult
from azure.storage.blob import BlobServiceClient

from dotenv import load_dotenv

load_dotenv()

True

### **Preprocessing Pipeline**
- **Read data from storage account**
- **Use Document Intelligence to crack PDF**
    - **Extract text**
    - **Extract images and write to file**

In [7]:
def initialize_blob_service_client(connection_string, container_name):
    # Initialize the BlobServiceClient using the connection string
    blob_service_client = BlobServiceClient.from_connection_string(conn_str=connection_string)
    container_client = blob_service_client.get_container_client(container_name)
    return container_client


def initialize_document_intelligence_client(endpoint, key):
    # Initialize the Document Intelligence client
    document_intelligence_client = DocumentIntelligenceClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(key)
    )
    return document_intelligence_client

In [8]:
def download_blob_content(blob_client, pbar):
    try:
        download_stream = blob_client.download_blob()
        blob_content = download_stream.readall()
        analysis_status = "Downloaded, now analyzing"
        return blob_content, analysis_status
    except Exception as e:
        analysis_status = f"Download Failed: {e}"
        pbar.set_postfix({"Status": analysis_status})
        pbar.update(1)  # Update progress bar even if download fails
        return {"error_message": str(e)}, analysis_status

def analyze_document(document_intelligence_client, blob_content, blob_name):
    from azure.ai.documentintelligence.models import AnalyzeOutputOption, AnalyzeResult

    figures_list = []
    try:
        poller = document_intelligence_client.begin_analyze_document(
            model_id="prebuilt-layout",
            analyze_request=blob_content,
            content_type="application/octet-stream",  # Adjust based on your document type
            output=[AnalyzeOutputOption.FIGURES]
        )
        result: AnalyzeResult = poller.result()
        operation_id = poller.details["operation_id"]

        if result.figures:
            for figure in result.figures:
                if figure.id:
                    response = document_intelligence_client.get_analyze_result_figure(
                        model_id=result.model_id, result_id=operation_id, figure_id=figure.id
                    )
                    with open(f"data/figures/1706.03762v7/{figure.id}.png", "wb") as writer:
                        writer.writelines(response)
        return result, blob_name
    except Exception as e:
        return {"error_message": str(e)}, None

**Run Pipeline**

In [9]:
def run_process_data_pipeline():
    documents = []
    # documents = get_documents_dict()
    failed_files = []
    try:
        # Initialize the BlobServiceClient & Document Intelligence client
        container_client = initialize_blob_service_client(connection_string, container_name)
        document_intelligence_client = initialize_document_intelligence_client(document_intelligence_endpoint, document_intelligence_key)

        # List all blobs in the container
        blob_list = list(container_client.list_blobs())  # Convert generator to list to get total count
        total_blobs = len(blob_list)  # Total number of blobs

        if total_blobs == 0:
            print("No blobs found in the container.")
        else:
            # Initialize the tqdm progress bar
            with tqdm(total=total_blobs, desc="Processing Blobs", unit="blob") as pbar:
                # Initialize a variable to hold analysis status
                analysis_status = ""

                for blob in blob_list:
                    blob_name = blob.name

                    # Update the progress bar's description to show the current blob
                    pbar.set_description(f"Processing {blob_name}")

                    # Download the blob's content
                    blob_content, analysis_status = download_blob_content(blob_client = container_client.get_blob_client(blob_name), pbar=pbar)
                    pbar.set_postfix({"Status": analysis_status})

                    # if blob_content is None:
                    if isinstance(blob_content, dict) and "error_message" in blob_content:
                        print(f"Skipping {blob_name} due to download failure.")
                        failed_files.append({
                            blob_name: {
                                "stage": "Blob Download",
                                "error_message": blob_content["error_message"]
                            }
                        })
                        continue  # Skip to the next blob

                    # Analyze the document using Document Intelligence
                    data, filename = analyze_document(document_intelligence_client, blob_content, blob_name)

                    if "error_message" not in data:
                        documents.append({"filename": filename, "data": data, "url": f"{storage_base_url}/{container_name}/{blob_name}"})
                    else:
                        failed_files.append({
                            blob_name: {
                                "stage": "Document Analysis",
                                "error_message": data["error_message"]
                            }
                        })
                    # Update the progress bar after processing each blob
                    pbar.update(1)
                pbar.set_postfix({"Status": "Finished"})
        return documents, failed_files
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# before running the pipeline, make sure that the documents are not protected.
documents, failed_documents = run_process_data_pipeline()

Processing 1706.03762v7.pdf: 100%|██████████| 1/1 [00:10<00:00, 10.29s/blob, Status=Finished]                 


In [10]:
print(f"Total number of failed documents: {len(failed_documents)}")
for failed_document in failed_documents:
    print(f"failed_document: {failed_document}")

print()
print(f"Total number of documents: {len(documents)}")
for doc in documents:
    print(f"    - Failed document: {doc.get('filename')}")

Total number of failed documents: 0

Total number of documents: 1
    - Failed document: 1706.03762v7.pdf


### **Create Image Descriptions**
- **Base64 encode images**
- **Send encoded images to gpt-4o**

In [11]:
import base64
from mimetypes import guess_type

# Function to encode a local image into data URL 
def local_image_to_data_url(image_path):
    # Guess the MIME type of the image based on the file extension
    mime_type, _ = guess_type(image_path)
    if mime_type is None:
        mime_type = 'application/octet-stream'  # Default MIME type if none is found

    # Read and encode the image file
    with open(image_path, "rb") as image_file:
        base64_encoded_data = base64.b64encode(image_file.read()).decode('utf-8')

    # Construct the data URL
    return f"data:{mime_type};base64,{base64_encoded_data}"

In [12]:
from openai import AzureOpenAI

client = AzureOpenAI(
    api_key=azure_openai_api_key,  
    api_version=azure_openai_api_version,
    azure_endpoint=azure_openai_endpoint
)

def get_image_descriptions(image_url):
    # function to create image descriptions
    response = client.chat.completions.create(
        model=chat_model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that provides text descriptions of images."},
            {"role": "user", "content": [  
                { 
                    "type": "text", 
                    "text": "Describe this image:" 
                },
                { 
                    "type": "image_url",
                    "image_url": {
                        "url": f"{image_url}"
                    }
                }
            ]} 
        ],
        max_tokens=800
    )
    return response.choices[0].message.content

In [16]:
import os
import glob

images_path = "data/figures/1706.03762v7"
image_files = glob.glob(os.path.join(images_path, "*.png"))
base64_images = []

for image_path in image_files:
    base64_image = local_image_to_data_url(image_path)
    base64_images.append({
        "path": f"{image_path}",
        "description": get_image_descriptions(base64_image)
    })

### **Create Chunks + Add Metadata**
- **chunk id**
- **last update**
- **number of tokens**

In [17]:
import datetime
import pytz

def get_sweden_time():
    # Define the timezone for Sweden
    sweden_tz = pytz.timezone('Europe/Stockholm')
    utc_now = datetime.datetime.now(datetime.timezone.utc)
    sweden_time = utc_now.astimezone(sweden_tz)
    return sweden_time.isoformat()

In [18]:
import tiktoken

def count_token(chunk, model="gpt-4o"):
    enc = tiktoken.encoding_for_model(model)
    tokens = enc.encode(chunk)
    return str(len(tokens))

**Structure output from Document Intelligence**

In [19]:
def structure_content(doc_info):
    # Initialize variables
    new_docs = []
    current_title = None
    current_section_start_page = None

    doc = doc_info["data"]

    # Loop through paragraphs to structure the main content
    # for paragraph in documents["pdf"][0]["content"]["ec330b.pdf"].paragraphs:
    for paragraph in doc.paragraphs:
        # Extract page number from boundingRegions
        if "boundingRegions" in paragraph and paragraph["boundingRegions"]:
            paragraph_page_number = paragraph["boundingRegions"][0]["pageNumber"]
        else:
            paragraph_page_number = None

        if paragraph.role == "title":
            # Update the current title but do not add an entry to new_docs
            current_title = paragraph.content
        elif paragraph.role == "sectionHeading":
            # Start a new entry for the section heading with the current title
            current_section_start_page = paragraph_page_number
            new_docs.append({
                "title": current_title,
                "sectionHeading": paragraph.content,
                "content": "",
                "page_start": current_section_start_page,
                "page_end": paragraph_page_number,
                "images": []
            })
        else:
            # Add content to the last entry, updating page_end as needed
            if new_docs:
                new_docs[-1]["content"] += " " + paragraph.content
                new_docs[-1]["page_end"] = paragraph_page_number

    # Helper function to find the nearest section heading
    def find_nearest_section(sections, page_number):
        nearest_section = None
        for section in sections:
            if section["page_start"] <= page_number <= section["page_end"]:
                nearest_section = section
        return nearest_section

    # Add images to the appropriate sections
    for figure in doc.figures:
        if "boundingRegions" in figure and figure["boundingRegions"]:
            figure_page = figure["boundingRegions"][0]["pageNumber"]
            # Find the nearest section
            nearest_section = find_nearest_section(new_docs, figure_page)
            if nearest_section:
                filename = doc_info.get('filename')
                basename, _ = os.path.splitext(filename)
                nearest_section["images"].append(
                    f"{basename}/{figure['id']}"
                )

    return new_docs
    

# new_docs = structure_content(documents[0])
# new_docs[7]

# for doc in new_docs[:2]:
#     print(f"Title: {doc['title']}\nSection Heading: {doc['sectionHeading']}\nContent: {doc['content']}\nPage Start: {doc['page_start']}\nPage End: {doc['page_end']}\nImages: {doc['images']}\n")



In [20]:
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import AzureOpenAIEmbeddings as AzureOpenAIEmbeddingsLC
import json

def create_splits(content, chunk_size=2048, chunk_overlap=256):
    try:
        text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

        # This list will hold our final dictionaries with updated content.
        splits = []

        # Iterate through original documents and create chunks
        for doc in content:
            text = doc.get("content", "")
            
            # Update the dictionary to use `page_content` instead of `content`
            doc["page_content"] = text
            doc.pop("content", None)  # Remove the old `content` key if it exists

            # print(doc.keys())

            chunks = text_splitter.create_documents([text])
            if len(chunks) < 1:
                splits.append(doc)
                # splits.append(json.dumps(doc))
            else:
                for chunk in chunks:
                    new_doc = doc.copy()
                    new_doc["page_content"] = chunk.page_content  # Assign the splitted text
                    splits.append(new_doc)
                    # splits.append(json.dumps(new_doc))
        return splits
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [32]:
import uuid
from pathlib import Path

# Function to aggregate content for each document type
def format_documents(documents, base64_images):
    aggregated_documents = []

    for doc in documents:
        url = doc["url"]  # URL for the document
        filename = doc["filename"]  # Filename of the document

        content = structure_content(doc)
        splits = create_splits(content, chunk_size=1024, chunk_overlap=128)

        for i, split in enumerate(splits):
            name = f"{filename}_chunk_{i}"
            namespace = uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")

            # Append the splits with metadata
            aggregated_documents.append({
                "id": str(uuid.uuid5(namespace, name)),
                "page_content": split.get("page_content", "") if isinstance(split, dict) else split.page_content,  # Aggregated chunk content
                "last_update": get_sweden_time(),  # Current date & time
                "metadata": {
                    "type": "document_chunk",
                    "title": filename,
                    "sectionHeading": split.get("sectionHeading", ""),  # Section heading in the chunk
                    "images": split.get("images", []),  # Images in the chunk
                    "chunk_size": count_token(split.get("page_content", "") if isinstance(split, dict) else split.page_content),
                    "page_start": split.get("page_start", 0),
                    "page_end": split.get("page_end", 0),
                    "url": url
                }
            })


            # Append the image descriptions with metadata
            for image in base64_images:
                figure_id = os.path.splitext(image["path"])[0].split("\\")[-1]
                folder_name = Path(image["path"]).parent.name
                image_path = f"{folder_name}/{figure_id}"
                
                # Check if the image path is in the current split
                if image_path in split.get("images"):
                    aggregated_documents.append({
                        "id": str(uuid.uuid5(namespace, name)),
                        "page_content": image["description"],
                        "last_update": get_sweden_time(),
                        "metadata": {
                            "type": "image_description",
                            "title": filename,
                            "sectionHeading": split.get("sectionHeading", ""),
                            "images": [image_path],
                            "chunk_size": count_token(image["description"]),
                            "page_start": figure_id.split(".")[0],
                            "page_end": figure_id.split(".")[0],
                            "url": url
                        }
                    })

    return aggregated_documents

# Call the function with your documents dictionary
formatted_documents = format_documents(documents, base64_images)
formatted_documents[0]


{'id': '62112fb9-3e61-52f8-a291-3425273c20ec',
 'page_content': 'The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English- to-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training cost

#### Extract additional metadata using AOAI

In [53]:
from typing import Literal, Optional, List
from pydantic import BaseModel, Field

class Properties(BaseModel):
    keywords: Optional[List[str]] = Field(
        ...,
        description="Important keywords and entities from the chunk for better searchability. If no keywords are found, return an empty list []."
    )

In [54]:
import time
from tqdm import tqdm
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import AzureChatOpenAI
from langchain_community.document_transformers.openai_functions import create_metadata_tagger
from langchain.schema import Document


if index_type == "advanced":
    model = "gpt-4o-mini-global"

    llm = AzureChatOpenAI(
        azure_deployment=model,
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
        api_key=azure_openai_api_key,
        azure_endpoint=azure_openai_endpoint,
        api_version=azure_openai_api_version
    )

    system = """You are an expert at extracting metadata from document chunks."""
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system),
            ("human", "{question}")
        ]
    )

    document_transformer = create_metadata_tagger(metadata_schema=Properties, llm=llm, prompt=prompt)

    # Start timing
    start_time = time.time()

    # Process documents with progress bar
    split_list_tmp = []
    formatted_documents_lc = [
        Document(page_content=doc["page_content"], metadata=doc["metadata"])
        for doc in formatted_documents
    ]

    for doc in tqdm(formatted_documents_lc, desc="Processing documents"):
        enhanced_doc = document_transformer.transform_documents([doc])
        split_list_tmp.extend(enhanced_doc)

    # End timing
    end_time = time.time()
    elapsed_time = end_time - start_time

    split_list = split_list_tmp
    print(f"Processing completed in {elapsed_time:.2f} seconds.")
else:
    split_list=formatted_documents


Processing documents: 100%|██████████| 32/32 [00:27<00:00,  1.15it/s]

Processing completed in 27.81 seconds.





## Index Data

In [56]:
from langchain_openai import AzureOpenAIEmbeddings

embeddings = AzureOpenAIEmbeddings(
    azure_deployment=azure_openai_embedding_model,
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_api_key,
    api_version=azure_openai_api_version
)

embedding_function = embeddings.embed_query

In [57]:
import time
from tqdm import tqdm
import uuid

def convert_document(doc, embedding_function):
    # Define standard values
    standard_values = {
        'id': f'{uuid.uuid4()}',
        'title': 'unknown',
        'page_content': '',
        'last_update': get_sweden_time(),
        'text_vector': [],
        'metadata': {
            "section_heading": '',
            "images": [],
            "type": "",
            "chunk_size": 0,
            "page_start": 0,
            "page_end": 0,
            "url": '',
            "keywords": []
        }
    }
    # Extract values from the Document object, using standard values if missing
    id = doc.metadata.get('id', standard_values['id'])  # Use standard
    title = doc.metadata.get('title', standard_values['title'])
    content = doc.page_content if doc.page_content is not None else standard_values['page_content']
    last_update = doc.metadata.get('last_update', standard_values['last_update'])
    text_vector = embedding_function(content) if content else standard_values['text_vector']
    metadata_defaults = standard_values['metadata']
    metadata = {
        "section_heading": doc.metadata.get('sectionHeading', metadata_defaults['section_heading']),
        "images": doc.metadata.get('images', metadata_defaults['images']),
        "type": doc.metadata.get('type', metadata_defaults['type']),
        "chunk_size": doc.metadata.get('chunk_size', metadata_defaults['chunk_size']),
        "page_start": doc.metadata.get('page_start', metadata_defaults['page_start']),
        "page_end": doc.metadata.get('page_end', metadata_defaults['page_end']),
        "url": doc.metadata.get('url', metadata_defaults['url']),
        "keywords": doc.metadata.get('keywords', metadata_defaults['keywords'])
    }

    # Construct the document for uploading
    document = {
        "id": id,
        "title": title,
        "content": content,
        "last_update": last_update,
        "text_vector": text_vector,
        "metadata": metadata
    }
    return document

data_final = []

# Start timing
start_time = time.time()

# Process documents with progress bar
for i, doc in enumerate(tqdm(split_list, desc="Converting documents")):
    data_final.append(convert_document(doc, embedding_function))

# End timing
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Conversion completed in {elapsed_time:.2f} seconds.")



Converting documents: 100%|██████████| 32/32 [00:02<00:00, 12.28it/s]

Conversion completed in 2.61 seconds.





In [58]:
from azure.search.documents import SearchClient

def push_to_index(data, search_endpoint, search_credential, index_name=index_name):
    search_client = SearchClient(endpoint=search_endpoint, index_name=index_name, credential=search_credential)
    search_client.upload_documents(data)

In [59]:
import time
from tqdm import tqdm

# Group the header based chunks into batches
batch_size = 50
total_chunks = len(data_final)
num_batches = (total_chunks + batch_size - 1) // batch_size

overall_start_time = time.time()

# Initialize the progress bar
progress_bar = tqdm(total=total_chunks, desc="Processing Batches")

# Process each batch
for i, batch_num in enumerate(range(num_batches)):
    batch_start_time = time.time()
    start = batch_num * batch_size
    batch = data_final[start:start + batch_size]

    # Push the documents to the index
    push_to_index(
        data=batch,
        search_endpoint=search_endpoint,
        search_credential=search_credential,
        index_name=index_name
    )
    progress_bar.update(len(batch))

    batch_end_time = time.time()
    elapsed_batch_time = batch_end_time - batch_start_time
    print(f"Batch {batch_num + 1} took {elapsed_batch_time:.2f} seconds.\n")

# Close the progress bar
progress_bar.close()

overall_end_time = time.time()
elapsed_overall_time = overall_end_time - overall_start_time
print(f"All batches pushed in {elapsed_overall_time:.2f} seconds.")

Processing Batches: 100%|██████████| 32/32 [00:00<00:00, 37.29it/s]

Batch 1 took 0.86 seconds.

All batches pushed in 0.86 seconds.



