In [24]:
# @title 1. Install Necessary Libraries
# Using the older azure-ai-formrecognizer SDK for ADI as per original user function
!pip install --upgrade --quiet azure-ai-formrecognizer azure-identity
# Newer ADI SDK (optional, but good practice for future reference)
# !pip install --upgrade --quiet azure-ai-documentintelligence
!pip install --upgrade --quiet langchain langchain-community langchain-openai langchain-core pydantic
!pip install --upgrade --quiet requests # General HTTP requests
!pip install --upgrade --quiet openai azure-ai-formrecognizer azure-identity requests
!pip install --upgrade --quiet openai azure-ai-formrecognizer azure-identity requests

In [25]:
# @title 2. Import Libraries
import os
import json
from google.colab import files # Import for file uploads
import io                      # Import for creating byte streams
import base64                  # Import for base64 encoding
import mimetypes               # Import for detecting image type
from google.colab import userdata
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient # For ADI function
from langchain_openai import AzureChatOpenAI
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
from langchain_core.prompts import PromptTemplate
import warnings

In [26]:
# @title 3. Configure Credentials using Colab Userdata

# --- Retrieve Credentials ---
try:
    # Azure Document Intelligence Credentials
    adi_endpoint = userdata.get('ENDPOINT')         # Your ADI Endpoint
    adi_key = userdata.get('APIKEY')                # Your ADI Key

    # Azure OpenAI Credentials
    openai_endpoint = userdata.get('AZURE_ENDPOINT')      # Your Azure OpenAI Endpoint
    openai_key = userdata.get('AZURE_OPENAI_KEY')   # Your Azure OpenAI Key
    openai_deployment = userdata.get('DEPLOYMENT')        # Your GPT-4o Deployment Name in Azure AI Studio

    # --- Define Static Config ---
    openai_api_version = userdata.get('API_VERSION')      # API version supporting GPT-4o vision

    # --- Validation ---
    if not all([adi_endpoint, adi_key, openai_endpoint, openai_key, openai_deployment]):
        missing = [
            "ENDPOINT (ADI)" if not adi_endpoint else None,
            "APIKEY (ADI)" if not adi_key else None,
            "AZURE_ENDPOINT (OpenAI)" if not openai_endpoint else None,
            "AZURE_OPENAI_KEY (OpenAI)" if not openai_key else None,
            "DEPLOYMENT (OpenAI)" if not openai_deployment else None,
        ]
        missing_str = ", ".join(filter(None, missing))
        raise ValueError(f"One or more secrets are missing from Colab Userdata: {missing_str}")

    print("Credentials retrieved successfully from Colab Userdata.")

except Exception as e:
    print(f"Error retrieving credentials: {e}")
    # Stop execution if credentials are not loaded properly
    raise

Credentials retrieved successfully from Colab Userdata.


In [27]:
secret_name = 'DEPLOYMENT'

# 2. Try to get the secret value
secret_value = userdata.get(secret_name)

# 3. Check if the value was retrieved (is not None)
if secret_value is not None:
    print(f"✅ Success: Secret '{secret_name}' is accessible.")

✅ Success: Secret 'DEPLOYMENT' is accessible.


In [28]:
# --- Retrieve Credentials ---
try:
    # Azure Document Intelligence Credentials (for text extraction)
    adi_endpoint = userdata.get('ENDPOINT')         # Your ADI Endpoint
    adi_key = userdata.get('APIKEY')                # Your ADI Key

    # Azure OpenAI Credentials (for key-value extraction)
    openai_endpoint = userdata.get('AZURE_ENDPOINT')      # Your Azure OpenAI Endpoint
    openai_key = userdata.get('AZURE_OPENAI_KEY')   # Your Azure OpenAI Key
    openai_deployment_name = userdata.get('DEPLOYMENT')   # Your GPT-4o Deployment Name

    # --- Define Static Config ---
    openai_api_version = "2024-05-01-preview"      # API version supporting GPT-4o vision

    # --- Validation ---
    if not all([adi_endpoint, adi_key, openai_endpoint, openai_key, openai_deployment_name]):
        missing = [
            "ENDPOINT (ADI)" if not adi_endpoint else None,
            "APIKEY (ADI)" if not adi_key else None,
            "AZURE_ENDPOINT (OpenAI)" if not openai_endpoint else None,
            "AZURE_OPENAI_KEY (OpenAI)" if not openai_key else None,
            "DEPLOYMENT (OpenAI)" if not openai_deployment_name else None,
        ]
        missing_str = ", ".join(filter(None, missing))
        raise ValueError(f"One or more secrets are missing from Colab Userdata: {missing_str}")

    print("Credentials retrieved successfully from Colab Userdata.")

    # --- Initialize Azure OpenAI Client ---
    # Use the direct openai library client configured for Azure
    openai_client = openai.AzureOpenAI(
        api_key=openai_key,
        azure_endpoint=openai_endpoint,
        api_version=openai_api_version
        # deployment_name is specified during the call, not usually at client init
    )
    print("Azure OpenAI client initialized.")

except Exception as e:
    print(f"Error during credential retrieval or client initialization: {e}")
    openai_client = None # Ensure client is None if setup fails
    # Stop execution if credentials are not loaded properly
    raise

Credentials retrieved successfully from Colab Userdata.
Azure OpenAI client initialized.


In [13]:
try:
    # Azure Document Intelligence Credentials (for layout/text extraction)
    adi_endpoint = userdata.get('ENDPOINT')         # Your ADI Endpoint
    adi_key = userdata.get('APIKEY')                # Your ADI Key

    # Azure OpenAI Credentials (for key-value extraction)
    openai_endpoint = userdata.get('AZURE_ENDPOINT')      # Your Azure OpenAI Endpoint
    openai_key = userdata.get('AZURE_OPENAI_KEY')   # Your Azure OpenAI Key
    openai_deployment_name = userdata.get('DEPLOYMENT')   # Your GPT-4o Deployment Name

    # --- Define Static Config ---
    openai_api_version = "2024-05-01-preview"      # API version supporting GPT-4o vision

    # --- Validation ---
    if not all([adi_endpoint, adi_key, openai_endpoint, openai_key, openai_deployment_name]):
        missing = [
            "ENDPOINT (ADI)" if not adi_endpoint else None,
            "APIKEY (ADI)" if not adi_key else None,
            "AZURE_ENDPOINT (OpenAI)" if not openai_endpoint else None,
            "AZURE_OPENAI_KEY (OpenAI)" if not openai_key else None,
            "DEPLOYMENT (OpenAI)" if not openai_deployment_name else None,
        ]
        missing_str = ", ".join(filter(None, missing))
        raise ValueError(f"One or more secrets are missing from Colab Userdata: {missing_str}")

    print("Credentials retrieved successfully from Colab Userdata.")

    # --- Initialize Azure OpenAI Client ---
    # Use the direct openai library client configured for Azure
    openai_client = openai.AzureOpenAI(
        api_key=openai_key,
        azure_endpoint=openai_endpoint,
        api_version=openai_api_version
    )
    print("Azure OpenAI client initialized.")

except Exception as e:
    print(f"Error during credential retrieval or client initialization: {e}")
    openai_client = None # Ensure client is None if setup fails
    raise

Credentials retrieved successfully from Colab Userdata.
Azure OpenAI client initialized.


In [29]:
# @title 4. Upload Image File
print("Please upload the image file you want to analyze.")
uploaded_files = files.upload()

# --- Process Upload ---
uploaded_filename = None
uploaded_content = None # Bytes of the file
image_bytes = None      # Alias for clarity
image_mime_type = None

if not uploaded_files:
    print("\nNo file uploaded. Please run this cell again and upload a file.")
    raise SystemExit("No file uploaded.")
elif len(uploaded_files) > 1:
    print("\nWarning: Multiple files uploaded. Using the first file.")
    uploaded_filename = next(iter(uploaded_files))
    uploaded_content = uploaded_files[uploaded_filename]
else:
    uploaded_filename = next(iter(uploaded_files))
    uploaded_content = uploaded_files[uploaded_filename]

if uploaded_content:
    image_bytes = uploaded_content
    print(f"\nSuccessfully uploaded: '{uploaded_filename}' ({len(image_bytes)} bytes)")
    image_mime_type, _ = mimetypes.guess_type(uploaded_filename)
    if image_mime_type is None:
        image_mime_type = "application/octet-stream"
        print(f"Warning: Could not determine MIME type, using default: {image_mime_type}")
    elif not image_mime_type.startswith('image/'):
        print(f"\nWarning: Uploaded file '{uploaded_filename}' might not be an image (detected type: {image_mime_type}). Processing anyway.")
    else:
         print(f"Detected MIME type: {image_mime_type}")

Please upload the image file you want to analyze.


Saving image 2.png to image 2.png

Successfully uploaded: 'image 2.png' (310711 bytes)
Detected MIME type: image/png


In [30]:
# @title 5. Extract Text using Azure Document Intelligence (Layout Model)

def extract_text_with_adi_layout(image_stream, endpoint, key):
    """
    Extracts concatenated text content using ADI 'prebuilt-layout' model
    from an in-memory image stream by iterating through lines.
    """
    print("\n--- Starting Azure Document Intelligence Layout Text Extraction ---")
    if not endpoint or not key:
        print("Error: ADI Endpoint or Key is missing.")
        return "Error: ADI Credentials missing."
    if not image_stream:
         print("Error: Image stream is missing.")
         return "Error: Image stream missing."

    try:
        document_analysis_client = DocumentAnalysisClient(
            endpoint=endpoint,
            credential=AzureKeyCredential(key)
        )
        print("Analyzing document stream using 'prebuilt-layout' model...")
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-layout",  # *** Use layout model ***
            document=image_stream
        )
        result = poller.result()
        print("ADI layout analysis complete.")

        # --- Reconstruct text from layout results ---
        extracted_lines = []
        if result.pages:
            for page in result.pages:
                if page.lines: # Check if lines exist for the page
                    for line in page.lines:
                        extracted_lines.append(line.content) # Add content of each line

        # Join lines with newline characters to form the text block
        full_text = "\n".join(extracted_lines)

        if full_text:
            print(f"Extracted {len(full_text)} characters of text from layout.")
            return full_text
        else:
            print("No text lines found by ADI layout model.")
            return "" # Return empty string if no content

    except Exception as e:
        print(f"An error occurred during ADI layout text extraction: {e}")
        return f"Error during ADI layout text extraction: {str(e)}"

# --- Execute ADI Layout Text Extraction ---
extracted_text = ""
if image_bytes:
    try:
        image_byte_stream_for_adi = io.BytesIO(image_bytes)
        # *** Call the updated function ***
        extracted_text = extract_text_with_adi_layout(image_byte_stream_for_adi, adi_endpoint, adi_key)
        if extracted_text.startswith("Error:"):
            print(f"ADI layout step failed: {extracted_text}")
            print("Proceeding to GPT-4o without extracted text context.")
            extracted_text = ""
    finally:
        if 'image_byte_stream_for_adi' in locals() and not image_byte_stream_for_adi.closed:
            image_byte_stream_for_adi.close()
else:
    print("Skipping ADI layout text extraction, no image content available.")
    extracted_text = "Analysis skipped due to missing image."


# Display first 500 chars of extracted text for verification
print("\n--- Extracted Text Context (Layout Model - First 500 Chars) ---")
print(extracted_text[:4070] + ("..." if len(extracted_text) > 500 else ""))
print("---------------------------------------------------------------")


--- Starting Azure Document Intelligence Layout Text Extraction ---
Analyzing document stream using 'prebuilt-layout' model...
ADI layout analysis complete.
Extracted 1484 characters of text from layout.

--- Extracted Text Context (Layout Model - First 500 Chars) ---
CURRICULUM VITAE
Dear Sir/ Madam,
I intend to apply a job in your company for Able Bodied Seaman position and I am ready to be positioned on
board with route of voyage determined by the company. My data as follows:
· PERSONAL DATA
Full Name
Boni Rustami
Rank
Cook
Place, date of birth
Samarinda, 10 Juni 1990
Address
JI Cempaka No 15, Samarinda
Marital Status
Married
Nationality
Indonesia
Religion
Moslem
Telephone
082931023940
Email
boni.rustami@cakemail.com
· DOCUMENT TRAVELS
Document
Number
Issued At
Issued Date
Expire Date
Seaman Book
F18273909
Samarinda
18 Sep 2019
18 Sep 2022
Passport
AD4934810
Jakarta
15 Agt 2017
15 Agt 2022
· CERTIFICATE OF COMPETENCY
Type of Certificate
Certificate Number
Place/ Date of Issued
Rati

In [31]:
# @title 6. Prepare and Send Request to Azure OpenAI GPT-4o

def encode_image_to_base64(image_bytes):
    """Encodes image bytes to a base64 string."""
    return base64.b64encode(image_bytes).decode('utf-8')

# Initialize dictionary to store final results
extracted_kv_pairs = None
messages = None
gpt_request_ready = False

if openai_client and image_bytes and image_mime_type:
    print("\n--- Preparing request for GPT-4o ---")

    base64_image = encode_image_to_base64(image_bytes)
    image_data_uri = f"data:{image_mime_type};base64,{base64_image}"
    print(f"Image encoded as data URI (Type: {image_mime_type}).")

    system_prompt = "You are an AI assistant specialized in extracting structured key-value pairs from images using the provided image and associated text context."

    user_prompt_template = """
    Analyze the provided image and the accompanying text extracted from its layout (which includes numbered points like 1-13).
    Your goal is to extract all distinct key-value pairs present in the form image, using both the visual information and the provided text context.
    Format the output as a single JSON object.

    Detailed Instructions:
    1.  **Structure around Numbered Points:** Use the numbered points (e.g., 1, 2, 3... 13) visible in the form and present in the text context as a primary guide for structuring the information. Group related sub-information under the main point's label.
    2.  **Key Naming:** Create JSON keys based on the field labels found in the form. Clean the keys by removing leading numbers and punctuation (e.g., '1. Name of Sponsor' should become 'Name of Sponsor', '7A. (Proposed) Indication for Use' should become 'Proposed Indication for Use (7A)' or similar descriptive key).
    3.  **Value Extraction:** Extract the corresponding data entered or selected for each field.
    4.  **Checkboxes/Radio Buttons:** For groups of options associated with a question (e.g., points 6B, 7C, 7D, 8, 11, 12, 13):
        *   Identify the text label of the option that is clearly marked as **selected or checked or ticketed**.
        *   Use this selected label as the value for the corresponding key (e.g., "IND Type (6B)": "Research", "Phase of Clinical Investigation": "Phase 2").
        *   If **no option** within a specific checkbox/radio button group appears to be selected, represent the value as `null` or an empty string `""`.
        *   Do **not** include the labels of unselected options.
    5.  **Sub-points & Nesting:**
        *   For multi-part fields like addresses (point 3), create a nested JSON object (e.g., "Sponsor Address": {{"Street Address": ..., "City": ...}}).  <-- Escaped inner brackets
        *   For other lettered sub-points (like 6A/6B, 7A/7B/7C/7D), use descriptive keys incorporating the sub-point identifier as shown in instruction 2. Group conceptually related sub-points if it makes sense (e.g., questions 7A, 7B, 7C, 7D all relate to the indication).
    6.  **Blank Fields:** If a field designed for text input is visibly empty or contains only generic placeholder text (like 'If previously assigned', 'Include country code...'), represent its value as `null` or an empty string `""`.
    7.  **Accuracy:** Be precise. Extract only information clearly present in the form. If unsure about a field or its value, it's better to omit the key-value pair.
    8.  **Output Format:** Return **only** the final JSON object, starting with `{{` and ending with `}}`. Do not include any explanations or introductory text. <-- Escaped brackets

    Extracted Text Context (from Layout model - use this to help interpret the image):
    ---
    {text_content}
    ---
    """

    current_extracted_text = extracted_text if not extracted_text.startswith("Error:") else "Text extraction failed or was skipped."
    user_prompt = user_prompt_template.format(text_content=current_extracted_text)

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
                {"type": "image_url", "image_url": {"url": image_data_uri}}
            ]
        }
    ]
    print("GPT-4o request payload prepared.")
    gpt_request_ready = True
else:
    print("\nSkipping GPT-4o preparation: Missing OpenAI client, image data, or MIME type.")
    messages = None
    gpt_request_ready = False



--- Preparing request for GPT-4o ---
Image encoded as data URI (Type: image/png).
GPT-4o request payload prepared.


In [32]:
if openai_client and gpt_request_ready and messages:
    print("\n--- Sending request to Azure OpenAI GPT-4o ---")
    try:
        response = openai_client.chat.completions.create(
            model=openai_deployment_name,
            messages=messages,
            max_tokens=2500,
            temperature=0.1,
            response_format={"type": "json_object"} # Enforce JSON output
        )

        if response.choices and response.choices[0].message and response.choices[0].message.content:
            json_string = response.choices[0].message.content
            print("\n--- Raw GPT-4o Response (JSON String) ---")
            print(json_string)
            print("------------------------------------------")

            try:
                extracted_kv_pairs = json.loads(json_string)
                print("\nSuccessfully parsed JSON output from GPT-4o.")
            except json.JSONDecodeError as json_err:
                print(f"\nError: GPT-4o response format ('json_object') failed or returned non-JSON despite the setting. Error: {json_err}")
                print("Raw output was:", json_string)
                extracted_kv_pairs = {"error": "Failed to parse GPT-4o JSON output", "raw_output": json_string}
        else:
            print("\nError: No valid content received in GPT-4o response.")
            print("Full Response Object:", response)
            extracted_kv_pairs = {"error": "No content received from GPT-4o.", "response_object": str(response)}

    except openai.APIConnectionError as e:
        print(f"Connection Error during Azure OpenAI call: {e}")
        extracted_kv_pairs = {"error": f"Azure OpenAI API Connection Error: {e}"}
    except openai.RateLimitError as e:
        print(f"Rate Limit Error during Azure OpenAI call: {e}")
        extracted_kv_pairs = {"error": f"Azure OpenAI API Rate Limit Error: {e}"}
    except openai.BadRequestError as e:
         print(f"Bad Request Error during Azure OpenAI call: {e}")
         extracted_kv_pairs = {"error": f"Azure OpenAI API Bad Request Error: {e}"}
    except openai.APIError as e:
        print(f"Generic API Error during Azure OpenAI call: {e}")
        extracted_kv_pairs = {"error": f"Azure OpenAI API Error: {e}"}
    except Exception as e:
        print(f"An unexpected error occurred during Azure OpenAI call: {e}")
        extracted_kv_pairs = {"error": f"Unexpected error during OpenAI call: {e}"}

else:
    print("\nSkipping Azure OpenAI call. Prerequisites not met (check client, image, messages).")
    if not extracted_kv_pairs:
      extracted_kv_pairs = {"error": "Prerequisites not met for OpenAI call."}


--- Sending request to Azure OpenAI GPT-4o ---

--- Raw GPT-4o Response (JSON String) ---
{
  "Personal Data": {
    "Full Name": "Boni Rustami",
    "Rank": "Cook",
    "Place and Date of Birth": "Samarinda, 10 Juni 1990",
    "Address": "JI Cempaka No 15, Samarinda",
    "Marital Status": "Married",
    "Nationality": "Indonesia",
    "Religion": "Moslem",
    "Telephone": "082931023940",
    "Email": "boni.rustami@cakemail.com"
  },
  "Document Travels": {
    "Seaman Book": {
      "Number": "F18273909",
      "Issued At": "Samarinda",
      "Issued Date": "18 Sep 2019",
      "Expire Date": "18 Sep 2022"
    },
    "Passport": {
      "Number": "AD4934810",
      "Issued At": "Jakarta",
      "Issued Date": "15 Agt 2017",
      "Expire Date": "15 Agt 2022"
    }
  },
  "Certificate of Competency": {
    "Ratings as Able Seafarer Deck": {
      "Certificate Number": "61229384960948340983",
      "Place and Date of Issued": "Samarinda, 15 Feb 2010"
    }
  },
  "Certificate of Prof

In [33]:
print("\n--- Final Extracted Key-Value Pairs (JSON) ---")
if isinstance(extracted_kv_pairs, dict):
    print(json.dumps(extracted_kv_pairs, indent=2))
elif extracted_kv_pairs:
     print(extracted_kv_pairs)
else:
    print("No key-value pairs were successfully extracted or an error occurred.")
print("---------------------------------------------")


--- Final Extracted Key-Value Pairs (JSON) ---
{
  "Personal Data": {
    "Full Name": "Boni Rustami",
    "Rank": "Cook",
    "Place and Date of Birth": "Samarinda, 10 Juni 1990",
    "Address": "JI Cempaka No 15, Samarinda",
    "Marital Status": "Married",
    "Nationality": "Indonesia",
    "Religion": "Moslem",
    "Telephone": "082931023940",
    "Email": "boni.rustami@cakemail.com"
  },
  "Document Travels": {
    "Seaman Book": {
      "Number": "F18273909",
      "Issued At": "Samarinda",
      "Issued Date": "18 Sep 2019",
      "Expire Date": "18 Sep 2022"
    },
    "Passport": {
      "Number": "AD4934810",
      "Issued At": "Jakarta",
      "Issued Date": "15 Agt 2017",
      "Expire Date": "15 Agt 2022"
    }
  },
  "Certificate of Competency": {
    "Ratings as Able Seafarer Deck": {
      "Certificate Number": "61229384960948340983",
      "Place and Date of Issued": "Samarinda, 15 Feb 2010"
    }
  },
  "Certificate of Proficiency": {
    "Basic Safety Training (BST