Extract information from lot descriptions.

### Initialize

In [None]:
import sys, os
sys.path.append(os.path.abspath('../src'))
import json
import pandas as pd
import lib
import ollama
from mistralai import Mistral
from openai import OpenAI
import yaml

# Parameters read from the YAML configuration file
with open("./00-config.yaml", "r") as f:
    config = yaml.safe_load(f)

# Catalog section: which catalog to process and in which language
catalog_section = config['catalog']
catalog = catalog_section['folder_name']
catalog_language = catalog_section['language']

# Model section: execution mode, LLM provider/model and prompt grouping size
model_section = config['model']
mode = model_section['mode']
llm_provider = model_section['llm_provider']
model = model_section['language_model']
object_number_by_prompt = model_section['object_number_by_prompt']

# In pipeline mode the catalog name comes from the environment instead of the config file
if os.getenv('OBJECTIVE_MODE') == 'pipeline':
    catalog = os.getenv('OBJECTIVE_CATALOG')

# Global variables: paths derived from the catalog folder
folder_path = f"../catalogs/{catalog}"
input_path = f'{folder_path}/list.txt'
output_path = f'{folder_path}/objects.csv'

# Progress helper from the project library
eta = lib.Eta()

# API client; only created for the hosted providers (no client is built for any other provider)
if llm_provider == "mistralai":
    client = Mistral(api_key=os.getenv("MISTRALAI_API_KEY_OBJECTIVE"))
elif llm_provider == "openai":
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY_OBJECTIVE"))

In [None]:
# Prompt template sent to the LLM for each group of lot descriptions.
# The f-string fields ({object_number_by_prompt}, {catalog_language}) are filled in when
# this cell runs. The literal "//descriptions//" marker is NOT an f-string field: it stays
# in the template and is substituted with the actual descriptions before each request.
# The attribute names listed below are the keys the LLM is asked to return.
prompt = f"""
From the following {object_number_by_prompt} descriptions of objects lots, extract me the following information. 
- description: the original description (with index)
- index: the number given to the lot; eg "1", "123", "45"...
- object_type: what the lot is; eg "table", "plate", "statue"...
- number: how many object are in this lot; eg "1", "2", "3"...
- material_technique (if mentioned): the main material of the lot with its technique if mentioned; eg "painted enamel", "lacque", "embroidered silk", "carved wood"...
- origin (if mentioned): country or region of origin of the lot; eg "Germany", "France", "Europe"...
Your answer should be a JSON object (an array of length {object_number_by_prompt}), do not add comments, notes or explanations.
Each object property is a list and should be as small as possible, a few words at maximum.
Extracted information should be in {catalog_language}.

Here are the {object_number_by_prompt} descriptions:
"//descriptions//"
""".strip()

In [None]:
def parse_value(value: str | int ) -> str:
    """Function to parse the return value of a description attribute."""
    if isinstance(value, list): return ', '.join(list(map(lambda v: str(v), value))).lower()
    if isinstance(value, str): return value.lower()
    if isinstance(value, int): return str(value)
    return ''

### Load list

In [None]:
# Read the list file and split it on newlines; each element is later used as one
# lot description. The context manager closes the file even if reading fails.
with open(input_path, 'r') as file:
    objects = file.read().split('\n')

### Extract objects

In [None]:
if mode == "direct":
    # Parsed records are collected in their own list: reusing `objects` here would
    # discard the loaded descriptions before the loop below ever runs.
    records = []

    eta.begin(len(objects), "Extracting information from object description")
    for i in range(0, len(objects), object_number_by_prompt):
        # Group the next descriptions into a single prompt
        selection = objects[i:i+object_number_by_prompt]
        descriptions = '\n'.join(selection)

        # Prompt creation
        prompt_ = prompt.replace('//descriptions//', descriptions)
        messages = [{'role': 'user', 'content': prompt_}]

        # Ask the LLM
        if llm_provider == "ollama":
            response = ollama.chat(model=model, messages=messages)
            answer = response['message']['content']
        elif llm_provider == "mistralai":
            response = client.chat.complete(model=model, messages=messages)
            answer = response.choices[0].message.content
        elif llm_provider == "openai":
            response = client.chat.completions.create(model=model, messages=messages)
            answer = response.choices[0].message.content
        else:
            # Fail explicitly instead of hitting an undefined `answer` further down
            raise ValueError(f"Unsupported llm_provider: {llm_provider!r}")

        # Parse the answer
        try:
            # Extract the JSON from the answer (fenced as json, fenced without language, or raw)
            if '```json' in answer:
                begin_index = answer.index('```json') + 7
                end_index = answer.index('```', begin_index)
                extracted_list = json.loads(answer[begin_index:end_index].strip())
            elif '```' in answer:
                begin_index = answer.index('```') + 3
                end_index = answer.index('```', begin_index)
                extracted_list = json.loads(answer[begin_index:end_index].strip())
            else:
                extracted_list = json.loads(answer)

            # The prompt asks for an array of objects; tolerate a single bare object too
            if isinstance(extracted_list, dict):
                extracted_list = [extracted_list]

            for extracted in extracted_list:
                # The prompt asks for `material_technique`; `material` is kept as a fallback.
                # NOTE(review): the output column is still named "material" here, whereas the
                # batch branch keeps the LLM's own key names — confirm what downstream expects.
                material = extracted.get('material_technique', extracted.get('material'))
                records.append({
                    "description": parse_value(extracted.get('description')),
                    "index": parse_value(extracted.get('index')),
                    "object_type": parse_value(extracted.get('object_type')),
                    "material": parse_value(material),
                    "origin": parse_value(extracted.get('origin')),
                    "number": parse_value(extracted.get('number')),
                })

        except Exception:
            # Show the raw answer to help diagnose the failure, then propagate it
            print('')
            print(answer)
            raise

        eta.iter(i)
    eta.end()

    print('### Save table')
    objects = pd.DataFrame(data=records)
    objects.to_csv(output_path, index=None)

In [None]:
# BATCH: Prepare tasks (one task per group of descriptions)

if mode == "batch":

    batch_tasks = []

    eta.begin(len(objects), "Extracting information from object description")
    for i in range(0, len(objects), object_number_by_prompt):
        # Build the prompt for this group of descriptions
        descriptions = '\n'.join(objects[i:i+object_number_by_prompt])
        messages = [{'role': 'user', 'content': prompt.replace('//descriptions//', descriptions)}]

        # Task identifier built from the catalog name and the 1-based offset of the group
        custom_id = f"{catalog}-table-{str(i + 1).zfill(4)}"

        if llm_provider == 'openai':
            # OpenAI task: carries method, endpoint and model in each task
            task = {
                "custom_id": custom_id,
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {"model": model, "messages": messages},
            }
            batch_tasks.append(task)
        elif llm_provider == 'mistralai':
            # MistralAI task: only the id and the messages
            task = {
                "custom_id": custom_id,
                "body": {"messages": messages},
            }
            batch_tasks.append(task)
        elif llm_provider == "ollama":
            raise Exception('Batch not implemented with Ollama')

        eta.iter(i)
    eta.end()

    print(f'{len(batch_tasks)} tasks created')

In [None]:
# BATCH: Create the batch (and wait for results)
# Hands the prepared tasks to the provider-specific helper of the project library
# and stores what it returns in `answers`.

if mode == "batch":
    batch_task_name = f"{catalog}_objects"

    if llm_provider == "mistralai":
        answers = lib.mistralai_batch_execution(
            tasks=batch_tasks,
            client=client,
            model=model,
            file_name=f"batch-1-transcription-{catalog}",
            task_name=batch_task_name,
        )
    elif llm_provider == "openai":
        # NOTE(review): the keyword is `task=` here but `tasks=` for MistralAI —
        # confirm against the signature of lib.openai_batch_execution.
        answers = lib.openai_batch_execution(
            task=batch_tasks,
            client=client,
            endpoint="/v1/chat/completions",
            task_name=batch_task_name,
        )

In [None]:
# BATCH: Parse answers and build the table of extracted objects

if mode == 'batch':

    # Create the table
    objects = []
    objects_clue = [] # temp
    for i, answer in enumerate(answers):
        # Reset for each answer so the error handler never hits an undefined name
        # (which would mask the real error) nor prints an object from a previous answer.
        answer_obj = None
        try:
            # Extract the JSON from the answer (fenced as json, fenced without language, or raw)
            if "```json" in answer:
                begin_index = answer.index('```json') + 7
                end_index = answer.index('```', begin_index)
                answer_obj_list = json.loads(answer[begin_index:end_index].strip())
            elif "```" in answer:
                begin_index = answer.index('```') + 3
                end_index = answer.index('```', begin_index)
                answer_obj_list = json.loads(answer[begin_index:end_index].strip())
            else:
                answer_obj_list = json.loads(answer)

            # The prompt asks for an array of objects; tolerate a single bare object too
            if isinstance(answer_obj_list, dict):
                answer_obj_list = [answer_obj_list]

            objects_clue += answer_obj_list # temp

            # Replace all arrays by a single value and add it to the table
            for answer_obj in answer_obj_list:
                for key, value in answer_obj.items():
                    if isinstance(value, list):
                        # Stringify every non-string item (ints, floats, nulls...) so join cannot fail
                        joined = ', '.join(v if isinstance(v, str) else str(v) for v in value)
                        # Descriptions keep their original casing; other attributes are lower-cased
                        answer_obj[key] = joined if key == 'description' else joined.lower()

                # Keep only objects with a non-empty description; report the others
                if answer_obj.get('description'):
                    objects.append(answer_obj)
                else:
                    print('--- index', i)
                    print(answer_obj)

        except Exception:
            # Show the raw answer and the object being processed (if any), then propagate
            print(answer)
            print(answer_obj)
            raise

In [None]:
# BATCH: Save the table

if mode == "batch":
    # Announce the save only when this cell actually writes something
    print('### Save result')
    objects = pd.DataFrame(data=objects)
    # With no parsed record the frame has no columns: only cast when 'index' exists
    if 'index' in objects.columns:
        # Presumably so that lot numbers are written as text rather than numbers — TODO confirm
        objects['index'] = objects['index'].astype(pd.StringDtype())
    objects.to_csv(output_path, index=None)

---

In [None]:
# # In case of a crash AFTER THE OUTPUT FILES HAVE BEEN FETCHED, this code finalizes the notebook from the output files
# import json
# import pandas as pd

# catalog = "1861-04-08_Prince-Soltykoff"
# folder_path = f"../catalogs/{catalog}"
# output_path = f"../batch_files/1861-04-08_Prince-Soltykoff_objects_output_2025-05-21_09-51.jsonl"

# # Read the output files
# results = []
# file = open(output_path, 'r')
# for line in file.readlines():
#     results.append(json.loads(line.strip()))
# file.close()

# # Sort results by given ids
# results = sorted(results, key=lambda x: x['custom_id'])

# # Get answers out of object
# answers = []
# for result in results:
#     answer = result['response']['body']['choices'][0]['message']['content']
#     answer = answer.replace('\\', '\\\\') # Because of an error
#     answers.append(answer)


# # Create the table
# objects = []
# objects_clue = [] # temp
# for i, answer in enumerate(answers):
#     try:
#         # Extract the JSON from the answer
#         if "```json" in answer:
#             begin_index = answer.index('```json') + 7
#             end_index = answer.index('```', begin_index)
#             answer_obj_list = json.loads(answer[begin_index:end_index].strip())
#         elif "```" in answer:
#             begin_index = answer.index('```') + 3
#             end_index = answer.index('```', begin_index)
#             answer_obj_list = json.loads(answer[begin_index:end_index].strip())
#         else:
#             answer_obj_list = json.loads(answer)

#         objects_clue += answer_obj_list # temp


#         # Replace all arrays by a single value and add it to the table
#         for answer_obj in answer_obj_list:
#             for key, value in answer_obj.items():
#                 if isinstance(value, list):
#                     value_str = list(map(lambda v: str(v) if isinstance(v, int) else v, value))
#                     answer_obj[key] = ', '.join(value_str)
#                     if key != 'description': answer_obj[key] = answer_obj[key].lower()
#             if answer_obj['description'] and answer_obj['description'] != '':
#                 objects.append(answer_obj)
#             else: 
#                 print('--- index', i)
#                 print(answer_obj)

#     except Exception as err:
#         print(answer)
#         print(answer_obj)
#         raise err
    
# # Save the table
# output_path = f'{folder_path}/objects.csv'
# objects = pd.DataFrame(data=objects)
# objects['index'] = objects['index'].astype(pd.StringDtype())
# objects.to_csv(output_path, index=None)