Extract the list of objects from the transcription, so that each lot ends up on its own line.

### Initialize

In [None]:
import sys, os
sys.path.append(os.path.abspath('../src'))
import re
import lib
from openai import OpenAI
from mistralai import Mistral
import ollama
import yaml

# Parameters read from the YAML configuration file
with open("./00-config.yaml", "r") as f:
    config = yaml.safe_load(f)

# Settings from the "catalog" section
catalog_section = config['catalog']
catalog = catalog_section['folder_name']
page_begin = catalog_section['object_list_begin_page']
page_end = catalog_section['object_list_end_page']

# Settings from the "model" section
model_section = config['model']
mode = model_section['mode']
llm_provider = model_section['llm_provider']
model = model_section['language_model']
pages_parsed_at_once = model_section['pages_parsed_at_once']

# A falsy end page (missing, empty, 0) means "no upper bound"
if not page_end:
    page_end = float('inf')

# Overwrite variables in case of pipeline mode: the values then come from
# OBJECTIVE_* environment variables instead of the config file.
if os.getenv('OBJECTIVE_MODE') == 'pipeline':
    # NOTE(review): if OBJECTIVE_CATALOG is unset, catalog becomes None here
    # - confirm the pipeline always provides it.
    catalog = os.getenv('OBJECTIVE_CATALOG')
    page_begin_given = os.getenv('OBJECTIVE_PAGE_BEGIN')
    page_end_given = os.getenv('OBJECTIVE_PAGE_END')

    # os.getenv returns None for an unset variable and '' for an empty one;
    # in both cases keep the value already loaded from the config file
    # (comparing only against '' would call int(None) and crash).
    if page_begin_given:
        page_begin = int(page_begin_given)
    if page_end_given:
        page_end = int(page_end_given)

# Global variables: catalog folder, plus the files read and written in it
folder_path = f"../catalogs/{catalog}"
input_path = f'{folder_path}/transcription.txt'
output_path = f'{folder_path}/list.txt'

# Progress helper from the project library
eta = lib.Eta()

# API client for the hosted providers; no client is created for any other
# provider value
if llm_provider == "mistralai":
    client = Mistral(api_key=os.getenv("MISTRALAI_API_KEY_OBJECTIVE"))
elif llm_provider == "openai":
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY_OBJECTIVE"))

In [None]:
# Prompt template sent to the LLM.
# The "//extract//" placeholder is replaced by the pages to analyse.
prompt = (
    "From the following extract of an old auction catalog, list me all objects (original number + all description in original language + any other information about the object like details, dimensions, story, ...) that are being sold.\n"
    "I would like your answer to be a list: do not make aditional comment, note or explaination.\n"
    'The format of a single description should be for example: "18 - Lorem Ipsum..." with everything in a single line.\n'
    "Ignore footnotes.\n"
    "\n"
    "Here is the extract: \n"
    '"//extract//"'
)

### Load transcription

In [None]:
# Load the whole transcription into memory.
# The context manager closes the file even if reading fails.
with open(input_path, 'r') as file:
    transcription = file.read()

### Split and clean pages

In [None]:
# One entry per page separator found in the transcription, in order
pages = []

# Every page starts with a separator such as ">>>>> [PAGE 12] >>>>>"
# followed by an empty line. Text before the first separator is ignored.
pattern = r">>>>> \[PAGE \d+\] >>>>>\n\n"
separators = list(re.finditer(pattern, transcription))
for index, match in enumerate(separators):

    # The page content runs from the end of its own separator up to the start
    # of the next one, or up to the end of the transcription for the last page
    if index + 1 < len(separators):
        page_end_idx = separators[index + 1].start()
    else:
        page_end_idx = len(transcription)
    page_content = transcription[match.end():page_end_idx].strip()

    # Blank out the '[No text]' marker (presumably what the transcription step
    # writes for pages without text - verify against that step)
    page_content = page_content.replace('[No text]', '')

    # Record every page, even an empty one, so that the position in `pages`
    # keeps following the order of the page separators
    pages.append(page_content)

### List objects (with LLM)

In [None]:
# DIRECT: query the LLM chunk by chunk, then clean and save the object list
if mode == 'direct':

    eta.begin(len(pages), "Retrieving object list")
    for extrac_begin_page in range(page_begin, min(page_end, len(pages)), pages_parsed_at_once):
        extract_end_page = extrac_begin_page + pages_parsed_at_once

        # Build the prompt
        # Here we start one page before,
        # in order to have the object description that has been cut by the new page.
        # The start is clamped to 0: a negative index would wrap around to the
        # end of the list and produce an empty (or wrong) extract.
        extract_first_index = max(extrac_begin_page - 1, 0)
        extract = '\n\n'.join(pages[extract_first_index:extract_end_page])
        prompt_ = prompt.replace('//extract//', extract)

        messages = [{'role': 'user', 'content': prompt_}]

        # Ask the LLM
        if llm_provider == 'openai':
            response = client.chat.completions.create(model=model, messages=messages)
            answer = response.choices[0].message.content
        elif llm_provider == 'ollama':
            response = ollama.chat(model=model, messages=messages)
            answer = response['message']['content']
        elif llm_provider == 'mistralai':
            response = client.chat.complete(model=model, messages=messages)
            answer = response.choices[0].message.content
        else:
            # Fail with a clear message instead of a NameError on `answer`
            raise ValueError(f'Unknown LLM provider: {llm_provider}')

        # Append the raw answer right away (em dashes normalised to hyphens),
        # so what has been retrieved so far is already on disk
        with open(output_path, 'a') as file:
            file.write('\n' + answer.replace('—', '-'))

        eta.iter(extrac_begin_page)
    eta.end()

    # Now that everything is done, load all objects
    with open(output_path, 'r') as file:
        raw_objects = file.read().split('\n')

    # Deduplicate objects while keeping their order of appearance
    # (dict keys preserve insertion order), and drop empty lines
    objects = [line for line in dict.fromkeys(raw_objects) if line != '']

    # Write objects list on disk
    print('### Save object list')
    with open(output_path, 'w') as file:
        file.write('\n'.join(objects))

In [None]:
# BATCH: Prepare tasks
# Build one task per chunk of pages; nothing is sent to the provider here.

if mode == 'batch':

    batch_tasks = []

    eta.begin(len(pages), "Retrieving object list")
    for extrac_begin_page in range(page_begin, min(page_end, len(pages)), pages_parsed_at_once):
        extract_end_page = extrac_begin_page + pages_parsed_at_once

        # Build the prompt
        # Here we start one page before,
        # in order to have the object description that has been cut by the new page.
        # The start is clamped to 0: a negative index would wrap around to the
        # end of the list and produce an empty (or wrong) extract.
        extract_first_index = max(extrac_begin_page - 1, 0)
        extract = '\n\n'.join(pages[extract_first_index:extract_end_page])
        prompt_ = prompt.replace('//extract//', extract)
        messages = [{'role': 'user', 'content': prompt_}]

        # Identifier of the task: catalog name + zero-padded first page of the chunk
        custom_id = f"{catalog}-object-list-{str(extrac_begin_page).zfill(4)}"

        if llm_provider == 'openai':
            # Create an OpenAI task
            batch_tasks.append({
                "custom_id": custom_id,
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {"model": model, "messages": messages},
            })
        elif llm_provider == 'mistralai':
            # Create a MistralAI task (no model in the body)
            batch_tasks.append({
                "custom_id": custom_id,
                "body": {"messages": messages},
            })
        elif llm_provider == "ollama":
            # Create an Ollama task
            raise Exception('Batch not implemented with Ollama')
        else:
            # Fail early instead of silently creating no task at all
            raise ValueError(f'Unknown LLM provider: {llm_provider}')

        eta.iter(extrac_begin_page)
    eta.end()

    print(f'{len(batch_tasks)} tasks created')

In [None]:
# BATCH: Create the batch (and wait for results)
# Hands the prepared tasks to the provider-specific helper of the project
# library and keeps what it returns in `answers`.

if mode == "batch":
    batch_task_name = f"{catalog}_list"

    if llm_provider == "mistralai":
        answers = lib.mistralai_batch_execution(
            tasks=batch_tasks,
            client=client,
            model=model,
            file_name=catalog,
            task_name=batch_task_name,
        )
    elif llm_provider == "openai":
        # NOTE(review): the keyword names differ from the MistralAI call
        # (`task` vs `tasks`, `mistral_file_name` vs `file_name`) - confirm
        # them against the signature of lib.openai_batch_execution.
        answers = lib.openai_batch_execution(
            task=batch_tasks,
            endpoint="/v1/chat/completions",
            client=client,
            mistral_file_name=catalog,
            task_name=batch_task_name,
        )

In [None]:
# BATCH: Parse object list and save them

if mode == 'batch':

    # Extract the full object list: one entry per line of every answer,
    # with em dashes normalised to hyphens
    objects = []
    for answer in answers:
        objects += answer.replace('—', '-').split('\n')

    # Remove one surrounding quote (single or double) at each end of a line
    for i, line in enumerate(objects):
        if line.startswith(('"', "'")):
            line = line[1:]
        if line.endswith(('"', "'")):
            line = line[:-1]
        objects[i] = line

    # Deduplicate objects, keeping the first occurrence and the original order;
    # empty lines are dropped
    clean_object = []
    have_objects = set()
    for line in objects:
        if line == '' or line in have_objects:
            continue
        clean_object.append(line)
        have_objects.add(line)

    # Save the object list (the context manager closes the file even on error)
    print('### Save object list')
    with open(output_path, 'w') as file:
        file.write('\n'.join(clean_object))