Transcribe a catalogue using LLM capabilities: from PDF, to images, to text

### Initialize

In [None]:
import sys, os
sys.path.append(os.path.abspath('../src'))
import base64
import pymupdf
from io import BytesIO
from PIL import Image
from openai import OpenAI
from mistralai import Mistral
import ollama
import lib
import yaml

# Parameters from the config file.
# The encoding is stated explicitly so that reading the YAML file does not
# depend on the platform's default encoding.
with open("./00-config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)
catalog = config['catalog']['folder_name']
catalog_language = config['catalog']['language']
mode = config['model']['mode']
llm_provider = config['model']['llm_provider']
model = config['model']['vision_model']

# Overwrite variables in case of pipeline mode
if os.getenv('OBJECTIVE_MODE') == 'pipeline':
    catalog = os.getenv('OBJECTIVE_CATALOG')
    # Without this check a missing variable would silently produce the
    # folder path "../catalogs/None".
    if catalog is None:
        raise ValueError(
            "OBJECTIVE_MODE is 'pipeline' but OBJECTIVE_CATALOG is not set"
        )

# Global variables
folder_path = f"../catalogs/{catalog}"
# Progress tracker from the local `lib` module; begin()/iter()/end() are
# called around the page loops below.
eta = lib.Eta()
# Only the hosted providers need an API client; no client is created for
# any other provider value (e.g. "ollama", which is called through the
# `ollama` module directly).
if llm_provider == "openai":
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY_OBJECTIVE"))
elif llm_provider == "mistralai":
    client = Mistral(api_key=os.getenv("MISTRALAI_API_KEY_OBJECTIVE"))
input_path = f"{folder_path}/catalog.pdf"
output_path = f"{folder_path}/transcription.txt"
# Slice bounds applied to the document pages: 0-based first page, and the
# exclusive end page (None means "up to the last page").
page_begin = 0
page_end = None

In [None]:
# Prompt that will be sent to the LLM together with each page image.
# It is an f-string: {catalog_language} is interpolated so that the model is
# told which language to keep. The final .strip() removes the leading and
# trailing whitespace introduced by the triple-quoted layout.
prompt = f"""
Transcribe the following document, which is a scanned page of an auction book.
Your answer can only contain the transcription: no comments, notes or explanation.
If there is no text to transcribe, just answer "[No text]"
If there is an image on the scan, integrate it in the transcription with a small description like "[Image of a man walking on the road]"
If there are headers or footnotes, integrate them like: "[footnote: Lorem ipsum]", "[header: Lorem ipsum]"
It is important that you ignore all handwritten notes. 
If there is a page number (top or bottom), integrate it like: "[page number: 999]"
Keep the original transcription language ({catalog_language}).
""".strip()

### Transcribe

In [None]:
if mode == "direct":

    # Direct mode: each page is rendered to a JPEG, sent to the model, and its
    # transcription is appended to the output file immediately, so pages that
    # were already transcribed stay on disk if a later request fails.
    doc = pymupdf.open(input_path)
    try:
        # Resolve page_begin/page_end (None means "up to the last page") into
        # the real 0-based page numbers of the document. Using the real numbers
        # keeps the progress total and the page labels correct when only a
        # sub-range of the document is processed.
        page_numbers = range(*slice(page_begin, page_end).indices(doc.page_count))
        eta.begin(len(page_numbers), 'Transcribing catalogue')
        for page_number in page_numbers:
            page = doc[page_number]

            # Transform the pdf page into a base64-encoded JPEG
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            buffered = BytesIO()
            img.save(buffered, format="JPEG")
            # img.save(f"{folder_path}/page-{page_number}.jpg", format="JPEG") # Save the image on disk, for debugging
            b64_image = base64.b64encode(buffered.getvalue()).decode()

            # OpenAI and Mistral are sent the same message: the prompt plus
            # the page image embedded as a data URL.
            vision_messages = [
                { "role": "user", "content": [
                    { "type": "text", "text": prompt },
                    { "type": "image_url", "image_url": { "url": f"data:image/jpeg;base64,{b64_image}", "detail": "high" }}
                ]}
            ]

            if llm_provider == "openai":
                completion = client.chat.completions.create(model=model, messages=vision_messages)
                answer = completion.choices[0].message.content
            elif llm_provider == "mistralai":
                chat_response = client.chat.complete(model=model, messages=vision_messages)
                answer = chat_response.choices[0].message.content
            elif llm_provider == "ollama":
                # Ollama takes the base64 image in a separate "images" list
                messages = [{ "role": "user", "content": prompt, "images": [b64_image] }]
                ollama_answer = ollama.chat(model=model, messages=messages)
                answer = ollama_answer['message']['content']
            else:
                # Without this branch `answer` would be undefined on the first
                # page, or silently reused from the previous page.
                raise ValueError(f"Unsupported llm_provider: {llm_provider!r}")

            # For debugging:
            # print(f'Page {page_number}, model answer:')
            # print(answer)

            # Page header (1-based, real page number) followed by the answer
            transcription = f"\n\n>>>>> [Page {page_number + 1}] >>>>>\n\n{answer}"

            # Append the page to the output file. Em dashes are replaced by
            # plain hyphens; UTF-8 is explicit so that non-ASCII text does not
            # depend on the platform's default encoding.
            print('### Save transcription')
            with open(output_path, 'a', encoding='utf-8') as out_file:
                out_file.write(transcription.replace('—', '-'))

            eta.iter()
        eta.end()
    finally:
        # Release the PDF even if a request or a write fails
        doc.close()

In [None]:
# BATCH: Prepare tasks
# Builds `batch_tasks`, one request description per page, in the shape used
# for the selected provider. Nothing is sent to the provider in this cell.

if mode == "batch":
    # Fail fast, before the PDF is opened and any page is rendered.
    if llm_provider == "ollama":
        raise Exception('Batch not implemented with Ollama')
    if llm_provider not in ("mistralai", "openai"):
        # Otherwise an unknown provider would silently produce 0 tasks
        raise ValueError(f"Unsupported llm_provider: {llm_provider!r}")

    batch_tasks = []

    doc = pymupdf.open(input_path)
    try:
        # Resolve page_begin/page_end (None means "up to the last page") into
        # real 0-based page numbers, so that the progress total matches the
        # number of pages actually processed.
        page_numbers = range(*slice(page_begin, page_end).indices(doc.page_count))
        eta.begin(len(page_numbers), 'Building batch tasks')
        for i, page_number in enumerate(page_numbers):
            page = doc[page_number]

            # Transform the pdf page into a base64-encoded JPEG
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            buffered = BytesIO()
            img.save(buffered, format="JPEG")
            # img.save(f"{folder_path}/page-{i}.jpg", format="JPEG") # Save the image on disk
            b64_image = base64.b64encode(buffered.getvalue()).decode()

            # The id keeps the zero-padded position of the page within the
            # processed range.
            # NOTE(review): the format is left unchanged in case `lib` relies on it.
            custom_id = f"{catalog}-transcription-p{str(i).zfill(4)}"

            # Both providers are sent the same message: the prompt plus the
            # page image embedded as a data URL.
            messages = [
                { "role": "user", "content": [
                    { "type": "text", "text": prompt },
                    { "type": "image_url", "image_url": { "url": f"data:image/jpeg;base64,{b64_image}", "detail": "high" }}
                ]}
            ]

            if llm_provider == "mistralai":
                # The model is not part of the task body here; it is passed
                # to the batch helper when the batch is executed.
                batch_tasks.append({
                    "custom_id": custom_id,
                    "body": { "messages": messages }
                })
            else:
                # OpenAI: each task names the HTTP method, endpoint and model
                batch_tasks.append({
                    "custom_id": custom_id,
                    "method": "POST",
                    "url": "/v1/chat/completions",
                    "body": {"model": model, "messages": messages }
                })

            eta.iter()
        eta.end()
    finally:
        # Release the PDF even if rendering a page fails
        doc.close()

    print(f'{len(batch_tasks)} tasks created')

In [None]:
# BATCH: Create the batch (and wait for results)
# Hands the prepared tasks to the provider-specific helper in `lib` and keeps
# what it returns in `answers`.
# NOTE(review): the Mistral helper is called with `tasks=` while the OpenAI
# helper is called with `task=` - confirm both match the signatures in `lib`.

if mode == "batch":
    if llm_provider == "mistralai":
        answers = lib.mistralai_batch_execution(
            tasks=batch_tasks,
            client=client,
            model=model,
            file_name=f"batch-1-transcription-{catalog}",
            task_name=f"{catalog}_transcription",
        )
    elif llm_provider == "openai":
        answers = lib.openai_batch_execution(
            task=batch_tasks,
            client=client,
            endpoint="/v1/chat/completions",
            task_name=f"{catalog}_transcription",
        )

In [None]:
# BATCH: Build the transcription
# Concatenates the batch answers into a single string, each one preceded by a
# page header.

if mode == "batch":

    # NOTE(review): assumes `answers` holds one string per task, in page
    # order - confirm against the batch helpers in `lib`.
    # Labels are 1-based and offset by page_begin, so they refer to the real
    # page of the PDF when only a sub-range was processed.
    # A closing code fence that starts a line ("\n```") is removed from each
    # answer.
    first_label = (page_begin or 0) + 1
    transcription = "".join(
        f"\n\n>>>>> [PAGE {page_index}] >>>>>\n\n" + answer.replace('\n```', '')
        for page_index, answer in enumerate(answers, start=first_label)
    )

In [None]:
# BATCH: Save the transcription
# Writes the whole batch transcription to the output file, replacing any
# previous content (mode 'w').

if mode == "batch":

    print('### Save transcription')
    # Clean-up applied before writing: em dashes become plain hyphens,
    # backslashes are doubled, and remaining code-fence markers are removed.
    cleaned = transcription.replace('—', '-').replace('\\', '\\\\').replace('```', '')
    # The context manager closes the file even if the write fails; UTF-8 is
    # explicit so that non-ASCII text does not depend on the platform's
    # default encoding.
    with open(output_path, 'w', encoding='utf-8') as out_file:
        out_file.write(cleaned)