### Set up your environment variables

In [1]:
from dotenv import load_dotenv

# Load environment variables from the .env.local file (values in the file override
# variables that are already set).
# The check is an explicit `if` rather than `assert load_dotenv(...)`: assert statements
# are removed when Python runs with -O, which would silently skip the load itself.
if not load_dotenv("../.env.local", override=True):
    raise RuntimeError(
        "No environment variables could be loaded from '../.env.local'. "
        "Create the file and define OPENAI_API_KEY and RETAB_API_KEY in it."
    )
# Set OPENAI_API_KEY and RETAB_API_KEY in the .env.local file,
# or set them directly in the code:
# import os
# os.environ["OPENAI_API_KEY"] = "YOUR API KEY"
# os.environ["RETAB_API_KEY"] = "YOUR API KEY" # go to https://retab.dev to create your API Key

# Get started

In [2]:
from retab import Retab
# Helper used in later cells to strip the auxiliary (reasoning) fields from a JSON result.
# NOTE(review): this is imported from a private module (`_utils`) — confirm whether a
# public import path exists.
from retab._utils.json_schema import filter_auxiliary_fields_json
# Create the Retab client with its default arguments.
# Presumably it reads RETAB_API_KEY from the environment — TODO confirm.
reclient = Retab()
# Fetch the list of models; the result is not used again in the visible notebook.
models_list = reclient.models.list()

  from .autonotebook import tqdm as notebook_tqdm


# Example 1 - Analyze information from a document

#### Option A - Use our preprocessing endpoint with the OpenAI client

In [3]:
import json
from retab import Retab, Schema
from openai import OpenAI

# Read the extraction schema. The encoding is pinned to UTF-8 so the file is decoded
# the same way on every platform instead of with the locale's default encoding.
with open("freight/schema.json", "r", encoding="utf-8") as f:
    json_schema = json.load(f)


# Turn the document into messages through the Retab client.
reclient = Retab()
doc_msg = reclient.documents.create_messages(
    document = "freight/booking_confirmation.jpg",
)

# Wrap the raw JSON schema; later cells read the provider-specific messages,
# the inference schema and the pydantic model from this object.
schema_obj = Schema(
    json_schema = json_schema
)



In [4]:

# The document can now be analyzed with the model of your choice.
client = OpenAI()

# Schema messages first, then the messages built from the document.
chat_messages = schema_obj.openai_messages + doc_msg.openai_messages

# Structured-output settings: request a JSON answer following the inference schema.
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": schema_obj.id,
        "schema": schema_obj.inference_json_schema,
        "strict": True,
    },
}

completion = client.chat.completions.create(
    model="gpt-4.1-nano",
    messages=chat_messages,
    response_format=response_format,
)
print(completion.choices[0].message.content)

{"reasoning___root":"The document appears to be a detailed freight transport confirmation, including client and shipment details, with clear segmentation into transporter info, goods, and addresses. The document type is a 'Confirmation d'affrètement' (Freight Booking Confirmation) from ACME CORPORATION, with explicit freight, shipping, and invoice details. It includes transportation instructions, company info, and specific shipment details with references. The overall structure defines client info, transporter's engagement, and multiple shipment entries, each with sender, recipient, goods, and constraints.","booking_id":null,"payment":{"total_price":1500,"currency":"EUR"},"client":{"company_name":"ACME Corporation","reasoning___VAT_number":"","VAT_number":null,"city":"Manchester","postal_code":"M1 4WP","country":"GB","code":null,"email":"client@acme.com"},"reasoning___shipments":"The document lists a single shipment (though formatted in a way resembling multiple list entries). The ship

In [5]:
# Optional: validate the answer against the original schema, which also
# drops the reasoning fields from the result.
raw_content = completion.choices[0].message.content
assert raw_content is not None
extraction = schema_obj.pydantic_model.model_validate(
    filter_auxiliary_fields_json(raw_content)
)
extraction.model_dump()

{'booking_id': None,
 'payment': {'total_price': 1500.0, 'currency': 'EUR'},
 'client': {'company_name': 'ACME Corporation',
  'VAT_number': None,
  'city': 'Manchester',
  'postal_code': 'M1 4WP',
  'country': 'GB',
  'code': None,
  'email': 'client@acme.com'},
 'shipments': [{'shipment_id': None,
   'sender': {'company_name': 'TRANSPORTEUR EXPRESS',
    'address': {'city': 'Manchester',
     'postal_code': 'M1 4WP',
     'country': 'GB',
     'line1': 'Site de : Zone Industrielle Est, Bâtiment C',
     'line2': ''},
    'phone_number': '+44 23 45 67 89',
    'email_address': '',
    'pickup_datetime': {'date': '2023-02-05',
     'start_time': '08:00:00',
     'end_time': '12:00:00'},
    'observations': 'La note indique que le transport doit respecter la capacité, la température contrôlée si nécessaire, et les contraintes de déchargement.'},
   'recipient': {'company_name': '',
    'address': {'city': 'Munich',
     'postal_code': '80331',
     'country': 'DE',
     'line1': 'Beta I

In [6]:
## Optional: log the extraction so that it is kept in the Retab database.
# Limitation: the likelihoods are not available for extractions logged this way.
# For now, the messages are sent in the Retab-compatible format
# (`schema_obj.messages` / `doc_msg.messages`).
reclient.documents.extractions.log(
    document = "freight/booking_confirmation.jpg",
    messages = schema_obj.messages + doc_msg.messages,
    completion = completion,
    json_schema = json_schema,
    model = "gpt-4.1-nano",
    temperature = 0,
)

{'extraction_id': 'extr_z8loUSIi9cfMcD5lRdlZr',
 'status': 'success',
 'error_message': None}

In [7]:
# You can log with the messages of a single provider format instead
# (formats from different providers cannot be mixed in one call).
# out_log_openai = reclient.documents.extractions.log(
#     document = "freight/booking_confirmation.jpg",
#     openai_messages = schema_obj.openai_messages + doc_msg.openai_messages,
#     completion = completion,
#     json_schema = json_schema,
#     model = "gpt-4.1-nano",
#     temperature = 0,
# )
# print(out_log_openai)

# NOTE(review): the Anthropic example below reuses the OpenAI `completion` and the
# "gpt-4.1-nano" model name — presumably both should come from an Anthropic call; confirm.
# out_log_anthropic = reclient.documents.extractions.log(
#     document = "freight/booking_confirmation.jpg",
#     anthropic_messages = schema_obj.anthropic_messages + doc_msg.anthropic_messages,
#     anthropic_system_prompt = schema_obj.anthropic_system_prompt,
#     completion = completion,
#     json_schema = json_schema,
#     model = "gpt-4.1-nano",
#     temperature = 0,
# )
# print(out_log_anthropic)

#### Option B - Using Retab `extract` endpoint

In [8]:
import json
from retab.client import Retab

reclient = Retab()

# One call: the schema and the document are both passed as file paths.
completion = reclient.documents.extractions.parse(
    document = "freight/booking_confirmation.jpg",
    json_schema = "freight/schema.json",
    model="gpt-4.1-nano",
    temperature=0,
)

# Use an empty JSON object when the completion carries no content,
# then strip the auxiliary fields for display.
raw_content = completion.choices[0].message.content or "{}"
filter_auxiliary_fields_json(raw_content)

# Streaming variant:
# from IPython.display import clear_output, display

# with reclient.documents.extractions.stream(
#     json_schema = "freight/schema.json",
#     document = "freight/booking_confirmation.jpg",
#     model="gpt-4.1-nano",
#     temperature=0,
# ) as stream:
#     for chunk in stream:
#         clear_output(wait=True)
#         display(chunk.model_dump())
        


{'booking_id': None,
 'payment': {'total_price': 1500, 'currency': 'EUR'},
 'client': {'company_name': 'ACME Corporation',
  'VAT_number': None,
  'city': None,
  'postal_code': None,
  'country': None,
  'code': None,
  'email': None},
 'shipments': [{'shipment_id': 'SHIP-001',
   'sender': {'company_name': 'Transport Express',
    'address': {'city': None,
     'postal_code': None,
     'country': None,
     'line1': 'Zone Industrielle Est, Bâtiment C',
     'line2': None},
    'phone_number': '+01 23 45 67 89',
    'email_address': None,
    'pickup_datetime': {'date': None, 'start_time': None, 'end_time': None},
    'observations': None},
   'recipient': {'company_name': 'Beta Industries',
    'address': {'city': 'Munich',
     'postal_code': '80331',
     'country': None,
     'line1': '79 Prime Street',
     'line2': None},
    'phone_number': None,
    'email_address': None,
    'delivery_datetime': {'date': '2023-03-05',
     'start_time': None,
     'end_time': None},
    'obs

## Appendix A - Use text as modality

In [10]:
from retab.client import Retab

reclient = Retab()

# Same extraction as before, with the modality explicitly set to 'text'.
completion = reclient.documents.extractions.parse(
    document="freight/booking_confirmation.jpg",
    json_schema = "freight/schema.json",
    modality='text',
    model="gpt-4.1-nano",
    temperature=0,
)

# Use an empty JSON object when the completion carries no content.
raw_content = completion.choices[0].message.content or "{}"
filter_auxiliary_fields_json(raw_content)

{'booking_id': 'SHIP-001',
 'payment': {'total_price': 1500, 'currency': 'EUR'},
 'client': {'company_name': 'ACME CORPORATION',
  'VAT_number': 'FR12 345 678 901',
  'city': 'London',
  'postal_code': 'WC2N 5DU',
  'country': 'GB',
  'code': None,
  'email': 'client@acme.com'}}

## Appendix B - Add image settings

In [11]:
from retab.client import Retab

reclient = Retab()

# Extraction with explicit image settings (resolution and canvas) and a single consensus run.
completion = reclient.documents.extractions.parse(
    document = "freight/booking_confirmation.jpg",
    json_schema = "freight/schema.json",
    model="gpt-4.1-nano",
    temperature=0,
    browser_canvas="A4",
    image_resolution_dpi=96,
    n_consensus=1,
)

# Use an empty JSON object when the completion carries no content.
raw_content = completion.choices[0].message.content or "{}"
filter_auxiliary_fields_json(raw_content)

{'booking_id': None,
 'payment': {'total_price': 1500, 'currency': None},
 'client': {'company_name': 'Acme Corporation',
  'VAT_number': None,
  'city': None,
  'postal_code': None,
  'country': None,
  'code': None,
  'email': None},
 'shipments': [{'shipment_id': 'BC-67890',
   'sender': {'company_name': 'TRANSPORTE EXPRESS',
    'address': {'city': 'Zone Industrielle Est, Bâtiment C',
     'postal_code': None,
     'country': None,
     'line1': 'Poids: 500 Tallets',
     'line2': 'Etc.'},
    'phone_number': None,
    'email_address': None,
    'pickup_datetime': {'date': '2023-05-02',
     'start_time': None,
     'end_time': None},
    'observations': None},
   'recipient': {'company_name': None,
    'address': {'city': 'Munich',
     'postal_code': '80331',
     'country': 'DE',
     'line1': 'Karlsplatz 10',
     'line2': ''},
    'phone_number': None,
    'email_address': None,
    'delivery_datetime': {'date': '2023-05-03',
     'start_time': None,
     'end_time': None},
  

## Appendix C - Use o3-mini model with reasoning effort

In [12]:
from retab.client import Retab

reclient = Retab()

# Extraction with the o3-mini model; `reasoning_effort` is passed alongside
# the modality and image settings.
completion = reclient.documents.extractions.parse(
    document = "freight/booking_confirmation.jpg",
    json_schema = "freight/schema.json",
    model="o3-mini",
    reasoning_effort="low",
    temperature=0,
    modality="text",
    browser_canvas="A4",
    image_resolution_dpi=96,
)

# Use an empty JSON object when the completion carries no content.
raw_content = completion.choices[0].message.content or "{}"
filter_auxiliary_fields_json(raw_content)

{'booking_id': None,
 'payment': {'total_price': 1500, 'currency': 'EUR'},
 'client': {'company_name': 'Acme Corporation',
  'VAT_number': 'GB123456789',
  'city': 'London',
  'postal_code': 'WC2N 5DU',
  'country': 'GB',
  'code': None,
  'email': 'client@acme.com'},
 'shipments': [{'shipment_id': 'SHIP-001',
   'sender': {'company_name': 'Acme Corporation',
    'address': {'city': 'London, Greater London',
     'postal_code': 'WC2N 5DU',
     'country': 'GB',
     'line1': '123 Elm Street',
     'line2': 'Suite 500'},
    'phone_number': '+44 20 7946 0958',
    'email_address': 'client@acme.com',
    'pickup_datetime': {'date': '2023-02-05',
     'start_time': '08:00:00',
     'end_time': '12:00:00'},
    'observations': None},
   'recipient': {'company_name': 'Beta Industries',
    'address': {'city': None,
     'postal_code': '80331',
     'country': 'DE',
     'line1': '789 Pine Street',
     'line2': 'Munich, Bavaria'},
    'phone_number': '+49 89 123456',
    'email_address': No

## Appendix D - Consensus extraction

You can also benefit from consensus by running multiple extractions (with a non-zero temperature) and aggregating them into a single result with more reliable likelihoods.

Be careful: the cost of consensus grows linearly with the number of extractions you run ($Total = N \times Cost(model)$).

In [13]:
from retab.client import Retab

reclient = Retab()

# Consensus extraction: five runs with a non-zero temperature.
completion = reclient.documents.extractions.parse(
    document = "freight/booking_confirmation.jpg",
    json_schema = "freight/schema.json",
    model="gpt-4.1-nano",
    n_consensus=5,
    temperature=0.5,
)

# Use an empty JSON object when the completion carries no content.
raw_content = completion.choices[0].message.content or "{}"
filter_auxiliary_fields_json(raw_content)

{'booking_id': None,
 'payment': {'total_price': 1500, 'currency': 'EUR'},
 'client': {'company_name': 'Acme Corporation',
  'VAT_number': 'FR12345678901',
  'city': 'Manchester',
  'postal_code': 'M1 4WP',
  'country': 'GB',
  'code': None,
  'email': 'client@acme.com'},
 'shipments': [{'shipment_id': 'BC-67890',
   'sender': {'company_name': 'Acme Corporation',
    'address': {'city': 'Manchester',
     'postal_code': '80331',
     'country': 'GB',
     'line1': 'Uncertain',
     'line2': 'Floor 3'},
    'phone_number': '+49 89 123456',
    'email_address': 'client@acme.com',
    'pickup_datetime': {'date': '2023-05-02',
     'start_time': '08:00:00',
     'end_time': '12:00:00'},
    'observations': 'Goods are fragile, some pallets contain liquids, handle with care, special temperature conditions likely required.'},
   'recipient': {'company_name': 'Beta Industries',
    'address': {'city': 'Munich',
     'postal_code': '80331',
     'country': 'DE',
     'line1': '123 Elm Street',


## Appendix E - Using OpenAI Responses API

In [14]:
from openai import OpenAI

# The same document can be analyzed through the OpenAI Responses API.
client = OpenAI()

# Schema input first, then the input built from the document.
responses_input = schema_obj.openai_responses_input + doc_msg.openai_responses_input

# Structured-output settings: request a JSON answer following the inference schema.
text_format = {
    "format": {
        "type": "json_schema",
        "name": schema_obj.id,
        "schema": schema_obj.inference_json_schema,
        "strict": True,
    }
}

responses_output = client.responses.create(
    model="gpt-4.1-nano",
    input=responses_input,
    text=text_format,
    temperature=0,
    store=False,
)

# Raw text is available via `responses_output.output_text`; here the auxiliary
# fields are stripped (empty JSON object when there is no output text).
filter_auxiliary_fields_json(responses_output.output_text or "{}")



{'booking_id': None,
 'payment': {'total_price': 1500, 'currency': 'EUR'},
 'client': {'company_name': 'ACME Corporation',
  'VAT_number': None,
  'city': None,
  'postal_code': None,
  'country': None,
  'code': None,
  'email': None},
 'shipments': [{'shipment_id': None,
   'sender': {'company_name': 'ACME Corporation',
    'address': {'city': 'Manchester',
     'postal_code': 'M1 4WP',
     'country': 'GB',
     'line1': '456 Oak Avenue',
     'line2': 'Floor 3'},
    'phone_number': '+44 20 7946 0958',
    'email_address': 'client@acme.com',
    'pickup_datetime': {'date': None, 'start_time': None, 'end_time': None},
    'observations': None},
   'recipient': {'company_name': 'Beta Industries',
    'address': {'city': 'Munich',
     'postal_code': '80331',
     'country': 'DE',
     'line1': '789 Business Street',
     'line2': None},
    'phone_number': '+49 89 12345',
    'email_address': 'contact@betaind.com',
    'delivery_datetime': {'date': '2023-05-03',
     'start_time': '1

In [15]:
# Optional: log the Responses API extraction with Retab to keep it on our side.
# The Responses input and output objects are passed as-is.
reclient.documents.extractions.log(
    document = "freight/booking_confirmation.jpg",
    openai_responses_input = schema_obj.openai_responses_input + doc_msg.openai_responses_input,
    openai_responses_output = responses_output,
    json_schema = json_schema,
    model = "gpt-4.1-nano",
    temperature = 0,
)

{'extraction_id': 'extr__AzcorEJRCl2uu2X0qH0O',
 'status': 'success',
 'error_message': None}

In [16]:
## The Responses API also works with streaming.
from jiter import from_json
from IPython.display import clear_output, display

client = OpenAI()
output_text_cum = ""

# Schema input first, then the input built from the document.
stream_input = schema_obj.openai_responses_input + doc_msg.openai_responses_input

# Structured-output settings: request a JSON answer following the inference schema.
text_format = {
    "format": {
        "type": "json_schema",
        "name": schema_obj.id,
        "schema": schema_obj.inference_json_schema,
        "strict": True,
    }
}

with client.responses.stream(
    model="gpt-4.1-nano",
    input=stream_input,
    text=text_format,
) as stream:
    for event in stream:
        if event.type == "response.output_text.delta":
            # Accumulate the text received so far and redraw the output.
            output_text_cum += event.delta
            clear_output(wait=True)
            # The accumulated text is an incomplete JSON document, so it is
            # parsed in partial mode.
            partial_obj = from_json(
                output_text_cum.encode("utf-8"),
                partial_mode="trailing-strings",
            )
            display(partial_obj)
        elif event.type == "response.completed":
            clear_output(wait=True)
            # The final output text is expected to be a complete JSON object.
            final_obj = json.loads(event.response.output_text)
            display(final_obj)


{'reasoning___root': "The document is clearly a shipping confirmation or freight booking document from Acme Corporation, titled 'Confirmation d'affrètement'. The information on the company's contact details, site, and address confirms the sender's identity. The document contains sections about the transporter, the shipper's engagement, references, and package details. The detailed shipping instructions, site address, and freight details align with a transport booking confirmation. The presence of a title, company logo, and structured data about shipment, transport constraints, and goods confirms this is a formal transport booking document with high confidence.",
 'booking_id': None,
 'payment': {'total_price': None, 'currency': None},
 'client': {'company_name': 'ACME CORPORATION',
  'reasoning___VAT_number': "VAT number is explicitly mentioned as 'VAT Number: GB123456789' on the top right, next to the VAT number label. It matches the typical format of UK VAT numbers, which can be alph