Run some NLP rules on each property to check that it makes sense given the lot description.

### Initialize

In [None]:
import sys, os, time
sys.path.append(os.path.abspath('../src'))
import pandas as pd
import lib
import yaml
import ollama

# Parameters from the config file.
# The encoding is explicit because open() otherwise falls back to the platform
# locale, which can garble non-ASCII values in the YAML file.
with open("./00-config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)
catalog = config['catalog']['folder_name']       # catalog folder under ../catalogs
local_model = config['model']['local_model']     # model name handed to ollama.chat
cooldown = config['model']['local_cooldown']     # pause handed to time.sleep after each LLM call

# Overwrite variables in case of pipeline mode
# NOTE(review): if OBJECTIVE_CATALOG is unset, catalog becomes None and every
# path below points to "../catalogs/None" - confirm the pipeline always sets it.
if os.getenv('OBJECTIVE_MODE') == 'pipeline':
    catalog = os.getenv('OBJECTIVE_CATALOG')

# Global variables
folder_path = f"../catalogs/{catalog}"
eta = lib.Eta()
input_path = f"{folder_path}/objects.csv"
# Same file as input_path: the flagged table is written back over the file it was read from.
output_path = f"{folder_path}/objects.csv"

### Load objects

In [None]:
# Read the catalog table and cast only the 'index' column to pandas' string dtype,
# leaving every other column with the dtype inferred by read_csv.
objects = pd.read_csv(input_path).astype({'index': pd.StringDtype()})

### Find and validate strange records

In [None]:
# Initialize the flag for each line and move it to the first column.
# An existing 'verify' column is left out of columns_before: the table is saved
# back to the file it was read from, so on a re-run the column is already there
# and selecting it twice would create duplicate 'verify' columns.
columns_before = [column for column in objects.columns if column != 'verify']
objects['verify'] = None
objects = objects[['verify'] + columns_before]

# Columns holding comma-separated values to look for in the description, paired
# with the wording inserted into the LLM prompt ("... the main object <phrase> <value>?").
_CHECKS = (
    ('object_type', 'is a'),
    ('material_technique', 'is made of'),
    ('origin', 'comes from'),
)


def _split_values(cell):
    """Return the lower-cased, stripped, non-blank values of a ', '-separated cell."""
    return [value.strip() for value in str(cell).lower().split(', ') if value.strip()]


def _singular(term):
    """Drop one trailing "s" or "x" so a plural value also matches its singular form."""
    return term[:-1] if term.endswith(('s', 'x')) else term


def _has_missing_value(values, description_lower):
    """Return True when at least one value is not a substring of the lower-cased description."""
    return any(_singular(value) not in description_lower for value in values)


def _llm_confirms(phrase, values, description):
    """Ask the local model about each value in turn and return True at the first "yes".

    Stopping at the first "yes" gives the same outcome as querying every value
    (one "yes" is enough to clear the flag) while sparing the remaining calls
    and their cooldown.
    NOTE(review): values that WERE found in the description are also submitted,
    so a "yes" on one of those clears the flag raised by another value - confirm
    this any-value rule is intended.
    """
    for value in values:
        prompt = f"From the following object description, can we say that the main object {phrase} {value}?\nHere is the description: \"{description}\"\nAnswer with a single word: \"yes\" or \"no\", with no additionnal explaination."
        messages = [{"role": "user", "content": prompt}]
        response = ollama.chat(model=local_model, messages=messages)
        time.sleep(cooldown)  # To let computer cool down
        if "yes" in response['message']['content'].lower():
            return True
    return False


eta.begin(len(objects), "Find and flag strange records")
for i, row in objects.iterrows():

    verify = []

    # A missing description is treated as empty text instead of crashing on the
    # substring tests below; every check then fails and the row gets flagged.
    raw_description = row['description']
    description = '' if pd.isna(raw_description) else str(raw_description)
    description_lower = description.lower()

    # Verify presence of index in the description.
    # Only a trailing ".0" (float rendering of an integer) is removed; a blanket
    # replace would also mangle values such as "1.05".
    index = '' if pd.isna(row['index']) else str(row['index'])
    if index.endswith('.0'):
        index = index[:-2]
    if index.strip() == '' or index not in description:
        verify.append('index')

    # Verify presence of each object type / material / origin in the description,
    # falling back to the local LLM when the plain substring test fails.
    for column, phrase in _CHECKS:
        cell = row[column]
        if pd.isna(cell):
            continue

        values = _split_values(cell)
        flagged = _has_missing_value(values, description_lower)

        # A present but blank object_type is flagged as well (the original
        # compared the split list to '' and therefore never triggered).
        if column == 'object_type' and not values:
            flagged = True

        # The LLM is only consulted when there is something to ask about and a
        # description to judge it against.
        if flagged and values and description and _llm_confirms(phrase, values, description):
            flagged = False

        if flagged:
            verify.append(column)

    # Set the verification flag
    objects.at[i, 'verify'] = ', '.join(verify)

    eta.iter()
eta.end()

In [None]:
# Rows whose 'verify' flag is not the empty string still carry at least one failed check.
to_verify = objects[objects['verify'] != '']
# Guard the ratio so an empty table reports 0 instead of raising ZeroDivisionError.
verify_ratio = len(to_verify) / len(objects) if len(objects) else 0.0
print('Verify count:', len(to_verify), f"({lib.percent(verify_ratio)})")

### Save objects with verifications

In [None]:
# Write the flagged table without the DataFrame row index.
# output_path is built from the same string as input_path, so this overwrites the file that was loaded.
objects.to_csv(path_or_buf=output_path, index=False)