In [1]:
# %pip install retab

In [2]:
# Draft an Initial Schema
from pydantic import BaseModel

class Invoice(BaseModel):
    # First-draft extraction schema: every field, including `status`, is an
    # unconstrained string. The JSON schema generated from this model
    # (Invoice.model_json_schema()) is what gets passed to the extraction call.

    # Invoice header fields.
    date: str
    invoice_number: str
    total: str
    status: str  # free-form here; nothing tells the model which values are acceptable

    # Customer details.
    customer: str
    customer_address: str
    customer_email: str
    customer_phone: str
    customer_website: str

# Execute with Consensus

from dotenv import load_dotenv
from retab import Retab
import json

# Load environment variables from a local .env file; create one containing
# RETAB_API_KEY=sk_retab_*** before running this cell.
load_dotenv()

client = Retab()

# JSON schema generated from the draft Pydantic model; this is the structure
# the extraction is asked to fill in.
invoice_schema = Invoice.model_json_schema()

response = client.documents.extract(
    documents=["../assets/code/invoice.jpeg"],
    model="gpt-4o-mini",  # or any model your plan supports
    json_schema=invoice_schema,
    # A temperature must be set when using consensus (presumably non-zero so
    # that the individual runs can differ -- confirm in the Retab docs).
    temperature=0.5,
    modality="text",
    n_consensus=5,
)

# Pretty-print the likelihood reported for each schema field.
likelihoods_json = json.dumps(response.likelihoods, indent=2)
print(likelihoods_json)

{
  "date": 1.0,
  "invoice_number": 1.0,
  "total": 1.0,
  "status": 0.6,
  "customer": 1.0,
  "customer_address": 1.0,
  "customer_email": 1.0,
  "customer_phone": 1.0,
  "customer_website": 1.0
}


In [3]:
# Update the Schema

from pydantic import BaseModel, Field
from enum import Enum

class StatusEnum(str, Enum):
    # Closed set of values an invoice status may take. Members are also plain
    # strings (str mixin), and each member's value is identical to its name.
    Blank = "Blank"
    Paid = "Paid"
    Unpaid = "Unpaid"

class Invoice_v2(BaseModel):
    # Second iteration of the invoice schema. Only `status` is declared
    # differently; every other field is still a plain string.
    date: str
    invoice_number: str
    total: str

    # `status` is restricted to the StatusEnum members and defaults to Blank.
    # The description spells out when Blank applies; it is added to make the
    # extraction of this field more precise.
    status: StatusEnum = Field(
        default=StatusEnum.Blank,
        description="Invoice status; Blank when no status appears on the document."
    )

    # Customer details.
    customer: str
    customer_address: str
    customer_email: str
    customer_phone: str
    customer_website: str

# Evaluate the precision of the new schema
# JSON schema generated from the updated model, in which `status` is an enum
# with a default and a description.
invoice_v2_schema = Invoice_v2.model_json_schema()

# Same document, model and consensus settings as the first run; only the
# schema differs, so the likelihoods can be compared field by field.
response = client.documents.extract(
    documents=["../assets/code/invoice.jpeg"],
    model="gpt-4o-mini",  # or any model your plan supports
    json_schema=invoice_v2_schema,
    # A temperature must be set when using consensus (presumably non-zero so
    # that the individual runs can differ -- confirm in the Retab docs).
    temperature=0.5,
    modality="text",
    n_consensus=5,
)

# Pretty-print the likelihood reported for each schema field.
likelihoods_json = json.dumps(response.likelihoods, indent=2)
print(likelihoods_json)

{
  "date": 1.0,
  "invoice_number": 1.0,
  "total": 1.0,
  "status": 1.0,
  "customer": 1.0,
  "customer_address": 1.0,
  "customer_email": 1.0,
  "customer_phone": 1.0,
  "customer_website": 1.0
}


By constraining `status` to an enum with an explicit `Blank` default and adding a description, we improved the likelihood of the `status` field from 0.6 to 1.0!