# Use Case Onboarding

Sample notebook demonstrating how to utilize the deployed functions to:
- Create a new index
- List all files within a target Azure Storage container
- Trigger ingestion and indexing of that collection of files into the target Azure AI Search index
- Retrieve a list of all chunks which have been added into the target index

Note: Before running this notebook, you will need to have deployed the Azure Durable Functions project to an Azure Function App environment as well as all associated resources (Azure AI Search, Azure Storage, Azure OpenAI, Azure Document Intelligence).

### Environment Variable Configuration

Create a `.env` file in your working directory with the following key-value pairs. We will load these into code using the `python-dotenv` library. 

| Variable       | Description                                                                                 |  
|----------------|---------------------------------------------------------------------------------------------|  
| FUNCTION_URI   | The primary URI endpoint for accessing your deployed Azure Function App.                    |  
| FUNCTION_KEY   | The default host key used for authenticating and securing access to your Azure Function App. | 

In [1]:
import os
from dotenv import load_dotenv
import requests
import json
from IPython.display import clear_output
import time
import pandas as pd

# Load key-value pairs from the local `.env` file; `override=True` lets values from the
# file replace variables that are already set in the process environment.
load_dotenv(override=True)

function_uri = os.getenv("FUNCTION_URI")
function_key = os.getenv("FUNCTION_KEY")

# Fail fast with a clear message instead of later sending requests to a URL such as
# "None/api/...".
missing_settings = [
    name
    for name, value in (("FUNCTION_URI", function_uri), ("FUNCTION_KEY", function_key))
    if not value
]
if missing_settings:
    raise EnvironmentError(
        f"Missing required environment variable(s): {', '.join(missing_settings)}. "
        "Add them to your .env file and re-run this cell."
    )

### Define Execution Variables

Below, create a stem name for a new Azure AI Search Index and fields that will be used in index creation. Further, define Azure Storage containers which contain source documents and which will contain extracted chunks, respectively. 

In [2]:
# Index Creation Settings
# `index_stem_name` is sent to `create_new_index` in Step 1; the text returned by that
# function is what the rest of the notebook uses as the index name.
index_stem_name = 'test-index'
# Field name -> field type mapping sent as the `fields` entry of the index-creation payload.
fields = {
    "content": "string", "pagenumber": "int", "sourcefile": "string", 
    "sourcefilepath": "string","sourcepage": "string", "category": "string"
}
embedding_dimensions = 3072 # Update according to the embedding model used (3072 for text-embedding-3-large; 1536 for text-embedding-ada-002)

# Data Source Settings
# Container whose files are listed in Step 2 and sent to the ingestion orchestrator as `source_container`.
source_container = 'a-test-source'
# Container sent to the ingestion and sync-index orchestrators as `extract_container`.
extract_container = 'a-test-extract'

# Ingestion Settings
# These values are forwarded unchanged in the request body sent to `pdf_orchestrator`;
# their exact effect is defined by the deployed function, not by this notebook.
automatically_delete = False
analyze_images = True
overlapping_chunks = False
chunk_size = 800
chunk_overlap = 200
cosmos_logging = False

### Step 1 - Create New Index

Trigger the `create_new_index` function and store the created index name

In [None]:
# Call the deployed `create_new_index` function with the index settings defined above.
create_index_uri = f"{function_uri}/api/create_new_index?code={function_key}"
create_index_payload = {
    "index_stem_name": index_stem_name,
    "fields": fields,
    "dimensions": embedding_dimensions
}

response = requests.post(create_index_uri, json=create_index_payload)
# Stop on an HTTP error: otherwise the error body would be stored as the index name
# and silently passed to every later step.
response.raise_for_status()
# The response body is used verbatim as the index name in the following steps.
index_name = response.text
print(index_name)

### Step 2 - List Files in Source Container

In [None]:
# Ask the deployed `list_files_in_container` function for the files in the source container.
list_files_uri = f"{function_uri}/api/list_files_in_container?code={function_key}"

response = requests.post(list_files_uri, json={"container": source_container})
# Surface HTTP errors here instead of failing later on a non-JSON error body.
response.raise_for_status()
# Step 3 iterates over this value and submits each entry as a blob name.
files = response.json()
files

### Step 3 - Trigger Ingestion for all Files and Await Completion

In [None]:
def start_processing(blob_name, index_name):
    """Start the `pdf_orchestrator` ingestion orchestration for a single blob.

    Args:
        blob_name: Blob path sent as `prefix_path` in the request body.
        index_name: Index name sent as `index_name` in the request body.

    Returns:
        The `statusQueryGetUri` value from the orchestrator's JSON response, which
        can be passed to `get_status` to poll the run.

    Raises:
        requests.HTTPError: If the function app responds with a 4xx/5xx status.
    """
    # Request body: the notebook-level ingestion settings plus the per-call blob and index.
    body = {
        'source_container': source_container,
        'extract_container': extract_container,
        'prefix_path': blob_name,
        'index_name': index_name,
        'automatically_delete': automatically_delete,
        'analyze_images': analyze_images,
        'overlapping_chunks': overlapping_chunks,
        'chunk_size': chunk_size,
        'chunk_overlap': chunk_overlap,
        'cosmos_logging': cosmos_logging
    }

    # Reuse the endpoint and key loaded in the first cell (as the other steps do) and
    # avoid shadowing the notebook-level `function_uri` with a local of the same name.
    orchestrator_uri = f'{function_uri}/api/orchestrators/pdf_orchestrator?code={function_key}'
    response = requests.post(orchestrator_uri, json=body)
    # Raise on an HTTP error rather than failing on a missing 'statusQueryGetUri' key.
    response.raise_for_status()
    return response.json()['statusQueryGetUri']

def get_status(status_uri):
    """Query an orchestration status URI.

    Args:
        status_uri: URL to GET; its JSON response must contain `runtimeStatus`.

    Returns:
        A `(status, error)` tuple. `error` is the response's `output` value when the
        status is 'Failed', and an empty string for every other status.
    """
    payload = requests.get(status_uri).json()
    runtime_status = payload['runtimeStatus']
    failure_detail = payload['output'] if runtime_status == 'Failed' else ''
    return runtime_status, failure_detail


# Submit all files for ingestion
# Each blob gets one tracking record holding its status URI and the status/error
# observed immediately after submission.
tracking_dict = {}

for blob_name in files:
    status_uri = start_processing(blob_name, index_name)
    status, error = get_status(status_uri)
    tracking_dict[blob_name] = {
        'status_uri': status_uri,
        'submitted': True,
        'completed': False,
        'error': error,
        'status': status,
    }

print(f'Submitted {len(tracking_dict)} blobs for processing')

# Orchestration states that will never change again but did not succeed. Anything other
# than 'Completed' is reported as failed in the summary that follows this loop.
unsuccessful_terminal_statuses = ('Failed', 'Terminated', 'Canceled')

# Poll every submitted blob every 10 seconds until each one reaches a terminal state.
while True:
    all_complete = True
    total_complete = 0
    for k, v in tracking_dict.items():
        # Poll this blob's own status URI and record the latest result.
        status, error = get_status(v['status_uri'])
        v['status'] = status
        v['error'] = error
        if status == 'Completed':
            v['completed'] = True
            total_complete += 1
        elif status in unsuccessful_terminal_statuses:
            # Logic to proceed forward WITHOUT retrying failed blobs.
            # The status and error recorded above already belong to this blob, so they
            # are kept as-is; re-querying any other status URI here would overwrite them
            # with a different blob's result.
            v['completed'] = True
            total_complete += 1

            # Logic to retry failed blobs (use INSTEAD of the two lines above)
            # v['status_uri'] = start_processing(k, index_name)
            # v['submitted'] = True
            # v['completed'] = False
            # all_complete = False
        else:
            all_complete = False
    clear_output(wait=True)
    print(f'Completed: {total_complete}/{len(tracking_dict)}')
    display(pd.DataFrame(tracking_dict).T)
    if all_complete:
        break
    time.sleep(10)

clear_output(wait=True)
print('All Blobs Processed')

# Split the tracking records into those that finished as 'Completed' and everything else.
succeeded_dict, failed_dict = {}, {}
for blob_key, record in tracking_dict.items():
    bucket = succeeded_dict if record['status'] == 'Completed' else failed_dict
    bucket[blob_key] = record
succeeded, failed = len(succeeded_dict), len(failed_dict)

print(f'Succeeded: {succeeded}')
display(pd.DataFrame(succeeded_dict).T)
print()
print(f'Failed: {failed}')
display(pd.DataFrame(failed_dict).T)


### Step 3.5 (Optional) - Attempt to Resubmit Failed Files

Uncomment the logic below and execute the cell to resubmit any files that failed in Step 3.

In [6]:
# for k,v in failed_dict.items():
#     status_uri = start_processing(k, index_name)
#     v['status_uri'] = status_uri
#     v['submitted'] = True
#     v['completed'] = False

# while True:
#     all_complete = True
#     total_complete = 0
#     for k,v in failed_dict.items():
#         status, error = get_status(v['status_uri'])
#         v['status'] = status
#         v['error'] = error
#         if v['status'] == 'Completed':
#             v['completed'] = True
#             total_complete += 1
#         elif v['status'] == 'Failed': # Resubmit
#             status_uri = start_processing(k, index_name)
#             v['status_uri'] = status_uri
#             v['submitted'] = True
#             v['completed'] = False
#             status, error = get_status(status_uri)
#             v['error'] = error
#             v['status'] = status
#             # NOTE: the resubmitted blob is counted as complete below, so this loop
#             # does not wait for the retry to finish.
#             v['completed'] = True
#             total_complete +=1
#         else:
#             all_complete = False
#     clear_output(wait=True)
#     print(f'Completed: {total_complete}/{len(failed_dict)}')
#     display(pd.DataFrame(failed_dict).T)
#     if all_complete:
#         break
#     time.sleep(10)


### Step 4 - Run 'Sync Index' to Retrieve a List of all Indexed Content

In [None]:
# Start the `sync_index_orchestrator` orchestration for the index created in Step 1.
sync_index_uri = f"{function_uri}/api/orchestrators/sync_index_orchestrator?code={function_key}"
response = requests.post(sync_index_uri, json={"index_name": index_name, "extract_container": extract_container})
# Raise on an HTTP error rather than failing on a missing 'statusQueryGetUri' key.
response.raise_for_status()
status_uri = response.json()['statusQueryGetUri']

# Poll every 10 seconds until the orchestration reaches a terminal state. Stopping on
# the unsuccessful states as well prevents an endless loop when the sync fails.
while True:
    status, error = get_status(status_uri)
    clear_output(wait=True)
    if status in ('Completed', 'Failed', 'Terminated', 'Canceled'):
        break
    print(f'Status: {status}')
    print(f'Error: {error}')
    time.sleep(10)

if status != 'Completed':
    raise RuntimeError(f'Sync index orchestration ended with status {status}: {error}')

# The completed orchestration's output carries the indexed content under 'index_content'.
output = requests.get(status_uri)
index_content = output.json()['output']['index_content']
print(f'Indexed Content: {index_name}')
pd.DataFrame(index_content)
# Optional: save as CSV
# pd.DataFrame(index_content).to_csv(f'{index_name}.csv', index=False)

# Prints full list of indexed content
# print(json.dumps(output.json()['output']['index_content'], indent=2))