Extract author information from lot descriptions.

### Initialize

In [None]:
import sys, os, time
sys.path.append(os.path.abspath('../src'))
import pandas as pd
import spacy
import warnings
import lib
import yaml
import ollama
warnings.filterwarnings("ignore")

# Parameters from config file.
# The file is read explicitly as UTF-8 so that parsing does not depend on the
# platform's default locale encoding.
with open("./00-config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)
catalog = config['catalog']['folder_name']      # catalog folder name under ../catalogs
spacy_model = config['catalog']['spacy_model']  # model name handed to spacy.load
local_model = config['model']['local_model']    # model name handed to ollama.chat
cooldown = config['model']['local_cooldown']    # pause handed to time.sleep after each LLM call
details = config['model']['author_details']     # truthy: print every candidate and reload the blacklist

# Overwrite variables in case of pipeline mode
# NOTE(review): if OBJECTIVE_CATALOG is unset, catalog becomes None and the
# paths below point at "../catalogs/None" - confirm the pipeline always sets it.
if os.getenv('OBJECTIVE_MODE') == 'pipeline':
    catalog = os.getenv('OBJECTIVE_CATALOG')

# Global Variables
nlp = spacy.load(spacy_model)
# spaCy built-in components: merge each named entity / noun chunk into a single token
nlp.add_pipe("merge_entities")
nlp.add_pipe("merge_noun_chunks")
eta = lib.Eta()
folder_path = f"../catalogs/{catalog}"
# Input and output are the same file: the objects CSV is updated in place
input_path = f'{folder_path}/objects.csv'
output_path = f'{folder_path}/objects.csv'
param_path = "./01-authors-blacklist.yaml"


### Load objects

In [None]:
# Load the catalog objects from the CSV file
objects = pd.read_csv(input_path)
objects['index'] = objects['index'].astype(pd.StringDtype())

# Make sure the 'author' column exists
if 'author' not in objects.columns:
    objects['author'] = pd.NA

# When the column is read back from a CSV in which it is entirely empty, pandas
# infers float64; writing author strings into it afterwards relies on silent
# upcasting, which is deprecated. Force a dtype that can hold strings.
objects['author'] = objects['author'].astype(object)

### Load authors blacklist

In [None]:
# Load the blacklist: words that must never be proposed as an author.
# Entries are stripped and lower-cased so they can be compared to lower-cased tokens.
# yaml.safe_load returns None for an empty file, hence the `or []` fallback.
with open(param_path, "r", encoding="utf-8") as f:
    blacklist = [entry.strip().lower() for entry in (yaml.safe_load(f) or [])]

### Find authors

In [None]:
# Words that, when they directly precede a capitalised noun / proper noun,
# make that token a candidate author (built once instead of once per token)
author_markers = ("de", "par", "d'")

eta.begin(len(objects), "Finding authors")
for i, row in objects.iterrows():
    # Start from the authors already recorded for this object (lower-cased), if any
    authors = row['author'].lower() if pd.notna(row['author']) else ""

    # An empty CSV cell is loaded as NaN (a float), which spaCy cannot parse:
    # treat a missing description as empty text, yielding no candidates
    description = row['description'] if pd.notna(row['description']) else ""

    doc = nlp(description)
    word_before = ""
    for token in doc:
        if token.pos_ in ['PROPN', 'NOUN'] and word_before in author_markers and token.text[0].isupper():

            # Check if the token is blacklisted
            # NOTE(review): this `continue` also skips the `word_before` update at
            # the bottom of the loop, so the token following a blacklisted one is
            # still tested against the same marker word - confirm this is intended.
            if token.text.lower() in blacklist:
                continue

            # If the option is set in the config file, display details for the authors part
            if details:
                # Reload blacklist, so that it can be extended along the run
                # (an empty file parses to None, hence the `or []` fallback)
                with open(param_path, "r", encoding="utf-8") as f:
                    blacklist = [entry.strip().lower() for entry in (yaml.safe_load(f) or [])]

                # Print the candidate, to help increase the blacklist
                eta.print("Supposed author: " + token.text)

            # Ask LLM if it is the author
            prompt = f"From the following object description, can we say that {token.text} is the author?\nHere is the description: \"{description}\"\nAnswer with a single word: \"yes\" or \"no\", with no additionnal explaination."
            messages = [{ "role": "user", "content": prompt }]
            response = ollama.chat(model=local_model, messages=messages)
            time.sleep(cooldown) # To let computer cool down
            answer: str = response['message']['content']

            # If LLM says yes, save result
            if "yes" in answer.lower():
                authors = lib.add_element(authors, token.text)

        word_before = token.text

    # Store the (possibly extended) authors string back on the object
    objects.at[i, 'author'] = lib.clean_elements_str(authors)

    eta.iter()
eta.end()

### Save objects

In [None]:
# Write the updated objects to the output CSV, leaving the DataFrame's row index out of the file
objects.to_csv(path_or_buf=output_path, index=False)