In [3]:
# Azure AI Search
import json
import os
import time

import requests
from azure.storage.blob import BlobServiceClient, ContainerClient


In [4]:
# Data to send in the job request (optional)
#
# Entries whose value still contains "[redacted]" are placeholders. Instead of
# pasting real keys/endpoints into the notebook, export an environment variable
# named after the upper-cased key (e.g. SEARCH_ADMIN_KEY, OPENAI_GPT_API_BASE)
# holding the complete value; it is substituted when this cell runs. When no
# such variable is set the placeholder is kept unchanged.
PLACEHOLDER = "[redacted]"


def fill_placeholders(config, environ=None):
    """Return a copy of ``config`` with placeholder values taken from the environment.

    Args:
        config: Mapping of setting name to value.
        environ: Mapping to read overrides from; defaults to ``os.environ``.

    Returns:
        A new dict. A string value containing ``PLACEHOLDER`` is replaced by
        ``environ[KEY_UPPERCASED]`` when that variable exists and is non-empty;
        every other value is copied as-is.
    """
    if environ is None:
        environ = os.environ
    resolved = {}
    for key, value in config.items():
        if isinstance(value, str) and PLACEHOLDER in value:
            # `or value` ignores variables that are set but empty.
            value = environ.get(key.upper()) or value
        resolved[key] = value
    return resolved


data = fill_placeholders({
    "prompt": """Extract everything you see in this image to markdown. 
                Convert all charts such as line, pie and bar charts to markdown tables and include a note that the numbers are approximate.
                """,
    "is_html": True,
    "openai_gpt_api_base" : "https://[redacted].openai.azure.com/",
    "openai_gpt_api_key" : "[redacted]",
    "openai_gpt_api_version" :  "2024-02-15-preview",
    "openai_gpt_model" : "gpt-4o",
    "blob_storage_service_name" : "[redacted]",
    "blob_storage_service_api_key" : "[redacted]",
    "blob_storage_container" : "doc2md",
    "openai_embedding_api_base" : "https://[redacted].openai.azure.com/",
    "openai_embedding_api_key" : "[redacted]",
    "openai_embedding_api_version" :  "2024-02-15-preview",
    "openai_embedding_model" : "text-embedding-ada-002",
    "search_service_name": "[redacted]",
    "search_admin_key" : "[redacted]",
    "search_index_name": "[redacted]",
    "search_api_version" : "2024-05-01-preview"
})

# Document (or web page, when "is_html" is True) that the job should convert.
# data['url_file_to_process'] = 'https://www.princexml.com/samples/invoice/invoicesample.pdf'
data['url_file_to_process'] = "https://learn.microsoft.com/en-us/azure/search/search-what-is-azure-search"

# Base URL of the conversion service; set DOC2MD_BASE_URL to avoid editing the
# notebook. The default is the original placeholder address.
base_url = os.environ.get("DOC2MD_BASE_URL") or "https://<redacted>.westus2.azurecontainerapps.io"

job_submit_url = f"{base_url}/start-job"
job_status_url = f"{base_url}/job-status"


In [None]:
# Create the index
# NOTE: You may need to manually update the dimensions in the schema_template.json to match the model you are using
#
# This cell (re)creates the Azure AI Search index named in `data`:
#   1. loads schema_template.json and fills in the index name and the
#      Azure OpenAI vectorizer settings from `data`,
#   2. skips everything if any search setting is missing or still a placeholder,
#   3. otherwise deletes the existing index (if any) and creates it again.

search_service_name = data['search_service_name']
search_admin_key = data['search_admin_key']
search_index_name = data['search_index_name']
search_api_version = data['search_api_version']

search_service_url = "https://{}.search.windows.net/".format(search_service_name)
search_headers = {
    'Content-Type': 'application/json',
    'api-key': search_admin_key
}

# Seconds to wait for the search service before giving up; without a timeout
# `requests` can block forever on an unresponsive endpoint.
search_request_timeout = 30

with open('schema_template.json', 'r', encoding='utf-8') as f_in:
    index_schema = json.load(f_in)

index_schema['name'] = search_index_name
vectorizer_params = index_schema['vectorSearch']['vectorizers'][0]['azureOpenAIParameters']
vectorizer_params['resourceUri'] = data['openai_embedding_api_base']
vectorizer_params['deploymentId'] = data['openai_embedding_model']
vectorizer_params['apiKey'] = data['openai_embedding_api_key']

# Check if all required AI Search variables are properly configured
search_vars = [search_service_name, search_admin_key, search_index_name, search_api_version]
ai_search_configured = all(
    isinstance(var, str) and var.strip() != "" and var != "[redacted]"
    for var in search_vars
)

if not ai_search_configured:
    print("Skipping index creation - AI Search variables not properly configured (some values are '[redacted]', empty, or None)")
else:
    # `search_service_url` ends with "/", and the paths below start with "/";
    # strip it here so the request URLs do not contain "//indexes".
    search_endpoint = search_service_url.rstrip('/')

    # Delete the existing index so it can be re-created from the schema
    delete_url = f"{search_endpoint}/indexes/{search_index_name}?api-version={search_api_version}"
    response = requests.delete(delete_url, headers=search_headers, timeout=search_request_timeout)
    if response.status_code == 204:
        print(f"Index {search_index_name} deleted successfully.")
    elif response.status_code == 404:
        # Nothing to delete is the normal case on a first run, not an error.
        print(f"Index {search_index_name} does not exist yet; nothing to delete.")
    else:
        print(f"Error deleting index {search_index_name}: {response.status_code} - {response.text}")

    # The endpoint URL for creating the index
    create_index_url = f"{search_endpoint}/indexes?api-version={search_api_version}"
    response = requests.post(
        create_index_url,
        headers=search_headers,
        json=index_schema,
        timeout=search_request_timeout,
    )

    # Check the response
    if response.status_code == 201:
        print(f"Index {search_index_name} created successfully.")
    else:
        print(f"Error creating index {search_index_name} :")
        try:
            print(response.json())
        except ValueError:
            # Error bodies are not guaranteed to be JSON (e.g. gateway errors).
            print(f"{response.status_code} - {response.text}")

Index test deleted successfully.
Index test created successfully.


In [6]:
# Submit job to convert the document to Markdown files
#
# Posts `data` to the start-job endpoint, then polls the job-status endpoint
# until the reported status is anything other than 'in-progress' or a status
# request fails. On success `job_id` is left defined for the download cell.

job_poll_interval = 2      # seconds to wait between status checks
job_request_timeout = 60   # seconds before a single HTTP call is abandoned

response = requests.post(job_submit_url, json=data, timeout=job_request_timeout)

# Check if the request was successful
if response.status_code == 200:
    job_info = response.json()
    job_id = job_info['job_id']
    print(f"Job started successfully! Job ID: {job_id}")
    data_status = {
        "job_id": job_id,
        "blob_storage_service_name" : data['blob_storage_service_name'],
        "blob_storage_service_api_key" : data['blob_storage_service_api_key'],
        "blob_storage_container" : data['blob_storage_container']
    }

    # Send requests to check job status
    while True:
        time.sleep(job_poll_interval)
        # A timeout keeps one stalled connection from hanging the loop forever.
        response = requests.post(job_status_url, json=data_status, timeout=job_request_timeout)

        # Check if the request was successful
        if response.status_code != 200:
            print(f"Failed to check job status: {response.status_code} - {response.text}")
            break

        job_status = response.json()
        # .get(): a payload without 'status' ends the loop with the raw
        # payload printed below instead of raising KeyError.
        current_status = job_status.get('status')
        print(f"Job Status for Job ID {job_id}: {current_status}")
        if 'message' in job_status:
            print(f"{job_status['message']}")
        if current_status != 'in-progress':
            print (job_status)
            break
else:
    print(f"Failed to start job: {response.status_code} - {response.text}")


KeyboardInterrupt: 

In [62]:
# Download the files
#
# Copies every blob under 'processed/<job_id>' from the configured container
# into the current working directory, reusing the blob name as the relative
# local path. Requires `job_id` from the job submission cell.
account_url = "https://" + data["blob_storage_service_name"] + ".blob.core.windows.net"
blob_service_client = BlobServiceClient(account_url=account_url, credential=data["blob_storage_service_api_key"])
container_client = blob_service_client.get_container_client(data["blob_storage_container"])
blobs = container_client.list_blobs(name_starts_with='processed/' + job_id)

# Download each blob
for blob in blobs:
    blob_client = container_client.get_blob_client(blob)
    local_path = blob.name
    local_dir = os.path.dirname(local_path)
    if local_dir:
        # exist_ok avoids the check-then-create race; the guard skips
        # os.makedirs('') for a blob name without a directory part.
        os.makedirs(local_dir, exist_ok=True)
    print(f"Downloading {blob.name} to {local_path}")

    with open(local_path, "wb") as file:
        blob_data = blob_client.download_blob()
        file.write(blob_data.readall())

print("Download complete")


Downloading processed/d9558a40-69ae-4c3f-907b-399615f210a4/images/1.png to processed/d9558a40-69ae-4c3f-907b-399615f210a4/images/1.png
Downloading processed/d9558a40-69ae-4c3f-907b-399615f210a4/images/2.png to processed/d9558a40-69ae-4c3f-907b-399615f210a4/images/2.png
Downloading processed/d9558a40-69ae-4c3f-907b-399615f210a4/images/3.png to processed/d9558a40-69ae-4c3f-907b-399615f210a4/images/3.png
Downloading processed/d9558a40-69ae-4c3f-907b-399615f210a4/images/4.png to processed/d9558a40-69ae-4c3f-907b-399615f210a4/images/4.png
Downloading processed/d9558a40-69ae-4c3f-907b-399615f210a4/json/d9558a40-69ae-4c3f-907b-399615f210a4.json to processed/d9558a40-69ae-4c3f-907b-399615f210a4/json/d9558a40-69ae-4c3f-907b-399615f210a4.json
Downloading processed/d9558a40-69ae-4c3f-907b-399615f210a4/markdown/1.txt to processed/d9558a40-69ae-4c3f-907b-399615f210a4/markdown/1.txt
Downloading processed/d9558a40-69ae-4c3f-907b-399615f210a4/markdown/2.txt to processed/d9558a40-69ae-4c3f-907b-399615f

In [1]:
# import base64
# def encode_base64(input_string):  
#     byte_string = input_string.encode('utf-8')  
#     encoded_bytes = base64.b64encode(byte_string)  
#     encoded_string = encoded_bytes.decode('utf-8')  
#     return encoded_string  
  
# def decode_base64(encoded_string):  
#     encoded_bytes = encoded_string.encode('utf-8')  
#     decoded_bytes = base64.b64decode(encoded_bytes)  
#     decoded_string = decoded_bytes.decode('utf-8')  
#     return decoded_string  