Transform the extracted text into its lemmatized form.

First step of harmonization.

### Initialize

In [None]:
import sys, os
sys.path.append(os.path.abspath('../src'))
import pandas as pd
import spacy
import warnings
import lib
import yaml

# Parameters from config file
# UTF-8 is requested explicitly so that reading the config does not depend on
# the platform's default text encoding.
with open("./00-config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)
catalog = config['catalog']['folder_name']
spacy_model = config['catalog']['spacy_model']

# Overwrite variables in case of pipeline mode
if os.getenv('OBJECTIVE_MODE') == 'pipeline':
    catalog = os.getenv('OBJECTIVE_CATALOG')
    # Fail fast: an unset variable would otherwise be formatted into the path
    # "../catalogs/None" and only surface as a confusing error further down.
    if not catalog:
        raise RuntimeError(
            "OBJECTIVE_MODE is 'pipeline' but OBJECTIVE_CATALOG is not set"
        )

# Global Variables
# NOTE(review): lib.Eta is presumably a progress/ETA reporter; it is driven
# through begin()/iter()/end() in the processing cells - confirm in lib.
eta = lib.Eta()
folder_path = f"../catalogs/{catalog}"
input_path = f'{folder_path}/objects.csv'
# Same file as input_path: the processed table overwrites the original.
output_path = f'{folder_path}/objects.csv'
# Silences every warning category for the rest of the run.
warnings.filterwarnings("ignore")
nlp = spacy.load(spacy_model)

### Load objects

In [None]:
# Load the catalog's objects table. The cells below rewrite its
# 'object_type', 'material_technique' and 'origin' columns in place.
objects = pd.read_csv(input_path)

### Get object type lemmas

In [None]:
# Lemmatize the ', '-separated terms of the 'object_type' column in place.
# Within each term, tokens tagged NOUN are replaced by their lemma; every
# other token keeps its original text.
eta.begin(len(objects), "Get lemma of object types")
for i, row in objects.iterrows():
    # Rows without a value are left untouched but are still counted below, so
    # the number of eta.iter() calls matches the total given to eta.begin().
    # NOTE(review): assumes lib.Eta.iter() advances progress by one step.
    if not pd.isna(row['object_type']):
        new_object_types = []
        for object_type in row['object_type'].split(', '):
            doc = nlp(object_type)
            # Join tokens with single spaces; strip() removes any leading or
            # trailing whitespace left over from the tokens themselves.
            lemmas = ' '.join(
                token.lemma_ if token.pos_ == 'NOUN' else token.text
                for token in doc
            )
            new_object_types.append(lemmas.strip())

        objects.at[i, 'object_type'] = ', '.join(new_object_types)
    eta.iter()
eta.end()

### Get materials & techniques lemmas

In [None]:
# Lemmatize the ', '-separated terms of the 'material_technique' column in
# place. Within each term, tokens tagged NOUN are replaced by their lemma;
# every other token keeps its original text.
eta.begin(len(objects), "Get lemma of materials and techniques")
for i, row in objects.iterrows():
    # Rows without a value are left untouched but are still counted below, so
    # the number of eta.iter() calls matches the total given to eta.begin().
    # NOTE(review): assumes lib.Eta.iter() advances progress by one step.
    if not pd.isna(row['material_technique']):
        new_mats_techs = []
        # The loop variable has its own name so it does not shadow the list
        # of terms being iterated.
        for mat_tech in row['material_technique'].split(', '):
            doc = nlp(mat_tech)
            # Join tokens with single spaces; strip() removes any leading or
            # trailing whitespace left over from the tokens themselves.
            lemmas = ' '.join(
                token.lemma_ if token.pos_ == 'NOUN' else token.text
                for token in doc
            )
            new_mats_techs.append(lemmas.strip())

        objects.at[i, 'material_technique'] = ', '.join(new_mats_techs)
    eta.iter()
eta.end()

### Get origin lemmas

In [None]:
# Lemmatize the ', '-separated terms of the 'origin' column in place.
# Within each term, tokens tagged NOUN are replaced by their lemma; every
# other token keeps its original text.
eta.begin(len(objects), "Get lemma of origins")
for i, row in objects.iterrows():
    # Rows without a value are left untouched but are still counted below, so
    # the number of eta.iter() calls matches the total given to eta.begin().
    # NOTE(review): assumes lib.Eta.iter() advances progress by one step.
    if not pd.isna(row['origin']):
        new_origin = []
        for origin in row['origin'].split(', '):
            doc = nlp(origin)
            # Join tokens with single spaces; strip() removes any leading or
            # trailing whitespace left over from the tokens themselves.
            lemmas = ' '.join(
                token.lemma_ if token.pos_ == 'NOUN' else token.text
                for token in doc
            )
            new_origin.append(lemmas.strip())

        objects.at[i, 'origin'] = ', '.join(new_origin)
    eta.iter()
eta.end()

### Save objects

In [None]:
# Write the updated table without the DataFrame index. output_path points to
# the same file as input_path, so this overwrites the original objects.csv.
objects.to_csv(output_path, index=False)