Extract period of production information from lot descriptions.

### Initialization

In [1]:
import sys, os, datetime
# Make ../src importable before the project-local import below
# (presumably where `lib` lives; verify against the repository layout).
sys.path.append(os.path.abspath('../src'))
import pandas as pd, spacy
import lib
import yaml
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Parameters from config file
# The encoding is explicit so the YAML is decoded identically on every
# platform instead of relying on the locale default.
with open("./00-config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)
catalog = config['catalog']['folder_name']
spacy_model = config['catalog']['spacy_model']

# Overwrite variables in case of pipeline mode
if os.getenv('OBJECTIVE_MODE') == 'pipeline':
    catalog = os.getenv('OBJECTIVE_CATALOG')
    # Fail early with a clear message rather than silently building the
    # path "../catalogs/None" when the variable is missing or empty.
    if not catalog:
        raise RuntimeError(
            "OBJECTIVE_MODE is 'pipeline' but OBJECTIVE_CATALOG is not set"
        )

# Global variables
nlp = spacy.load(spacy_model)
eta = lib.Eta()
folder_path = f"../catalogs/{catalog}"
# Input and output are the same file: the table is enriched in place.
input_path = f'{folder_path}/objects.csv'
output_path = f'{folder_path}/objects.csv'

### Load objects

In [2]:
# Read the catalog table and (re)initialise the 'period' column to missing
# values, so any previously stored period is discarded before extraction.
objects = pd.read_csv(input_path).assign(period=pd.NA)

### Manually add periods

In [3]:
# Tag each lot with an origin and a period, based on keywords (names of kings
# and of styles) found in its lower-cased description.
for i, _ in objects.iterrows():

    # Prepare variables (missing cells are treated as empty strings)
    descr = objects.at[i, 'description'].lower() if pd.notna(objects.at[i, 'description']) else ""
    origin = objects.at[i, 'origin'].lower() if pd.notna(objects.at[i, 'origin']) else ""
    period = objects.at[i, 'period'].lower() if pd.notna(objects.at[i, 'period']) else ""

    ##### KINGS #####
    # Matching is by substring, so a shorter regnal numeral is also found
    # inside a longer one ("henri ii" inside "henri iii"); the "not in"
    # guards exclude those longer names.

    if "henri ii" in descr and "henri iii" not in descr:
        origin = lib.add_element(origin, 'France')
        period = lib.add_element(period, 'Henri II')
    if "henri iv" in descr:
        origin = lib.add_element(origin, 'France')
        period = lib.add_element(period, 'Henri IV')
    if "louis xiii" in descr:
        origin = lib.add_element(origin, 'France')
        period = lib.add_element(period, 'Louis XIII')
    if "louis xiv" in descr:
        origin = lib.add_element(origin, 'France')
        period = lib.add_element(period, 'Louis XIV')
    if "louis xv" in descr and "louis xvi" not in descr:
        origin = lib.add_element(origin, 'France')
        period = lib.add_element(period, 'Louis XV')
    if "louis xvi" in descr and "louis xvii" not in descr:
        origin = lib.add_element(origin, 'France')
        period = lib.add_element(period, 'Louis XVI')
    if "louis xviii" in descr:
        origin = lib.add_element(origin, 'France')
        period = lib.add_element(period, 'Louis XVIII')
    if "charles x" in descr:
        origin = lib.add_element(origin, 'France')
        period = lib.add_element(period, 'Charles X')

    # Descriptions are French text, so accept the accented spelling as well.
    has_napoleon = "napoleon" in descr or "napoléon" in descr
    has_napoleon_iii = "napoleon iii" in descr or "napoléon iii" in descr
    if has_napoleon and not has_napoleon_iii:
        origin = lib.add_element(origin, 'France')
        # The period (not the origin) receives the era name.
        period = lib.add_element(period, 'Empire')
    if has_napoleon_iii:
        origin = lib.add_element(origin, 'France')
        period = lib.add_element(period, 'Second Empire')


    ##### STYLES #####
    # Some styles only imply an origin and leave the period untouched.

    if 'rocaille' in descr:
        origin = lib.add_element(origin, 'France')
    if 'régence' in descr:
        origin = lib.add_element(origin, 'France')
    if 'empire' in descr:
        origin = lib.add_element(origin, 'France')
        period = lib.add_element(period, 'Empire')
    if 'renaissance' in descr:
        origin = lib.add_element(origin, 'Europe')
        period = lib.add_element(period, 'Renaissance')
    if 'rococo' in descr:
        origin = lib.add_element(origin, 'Italy')
    # With or without hyphen, with or without the circumflex accent.
    if any(form in descr for form in ('moyen age', 'moyen-age', 'moyen âge', 'moyen-âge')):
        origin = lib.add_element(origin, 'Europe')
        period = lib.add_element(period, 'Moyen-Age')

    # Set the new values
    objects.at[i, 'origin'] = lib.clean_elements_str(origin)
    objects.at[i, 'period'] = lib.clean_elements_str(period)


### Work on centuries

In [4]:
# Characters stripped from a token before it is looked up as a century.
# Note that plain 'e' is removed everywhere in the token, which is why the
# spelled-out keys below are written without any 'e' ("treizième" -> "trizièm").
periods_blacklist_chars = ['°', 'ᵉ', 'e', '⁰', '^']

# Maps a stripped token (ordinal suffix variants, arabic numerals, spelled-out
# French ordinals) to the lower-case roman numeral of the century.
correspondance = {
    'xvir': 'xvi',
    'xviir': 'xvii',
    'xvièm': 'xvi',
    'xviièm': 'xvii',
    '13': 'xiii',
    '14': 'xiv',
    '15': 'xv',
    '16': 'xvi',
    '17': 'xvii',
    '18': 'xviii',
    '19': 'xix',
    '20': 'xx',
    'troisièm': 'iii',
    'sixièm': 'vi',
    'dixièm': 'x',
    'onzièm': 'xi',
    'douzièm': 'xii',
    'trizièm': 'xiii',
    'quatorzièm': 'xiv',
    'quinzièm': 'xv',
    # "seizième" (16th) with every 'e' stripped
    'sizièm': 'xvi',
    'dix-sptièm': 'xvii',
    'dix-huitièm': 'xviii',
    'dix-nuvièm': 'xix',
    'vingtièm': 'xx',
    # NOTE(review): "dernier" (last) mapped to the 18th century — looks
    # catalog-specific; confirm this is intended.
    'drnir': 'xviii',
}
all_periods = set()

def get_period(period: str) -> str:
    """Normalise a raw century token.

    The token is lower-cased, every entry of `periods_blacklist_chars` is
    removed from it, and the result is translated through `correspondance`
    when it appears there; otherwise the stripped token is returned as is.
    """
    cleaned = period.lower()
    for unwanted in periods_blacklist_chars:
        cleaned = cleaned.replace(unwanted, '')
    return correspondance.get(cleaned, cleaned)

In [None]:
# For every lot mentioning "siècle", collect the adjectives attached to that
# word by the dependency parse and append them, normalised, to the period.
eta.begin(len(objects), 'Getting periods')
for idx, record in objects.iterrows():
    # Missing description -> empty text
    text = record['description']
    if pd.isna(text):
        text = ''

    # Start from the period already stored for this lot (lower-cased)
    current = objects.at[idx, 'period']
    current = current.lower() if pd.notna(current) else ""

    if 'siècle' in text:
        for token in nlp(text):
            # Keep only adjectives whose syntactic head is "siècle"
            if token.head.text != 'siècle' or token.pos_ != 'ADJ':
                continue
            century = get_period(token.text)
            if century != '':
                current = lib.add_element(current, century)

    objects.at[idx, 'period'] = lib.clean_elements_str(current)
    eta.iter()
eta.end()

### Save table

In [None]:
# Write the enriched table to disk without the DataFrame index column.
objects.to_csv(path_or_buf=output_path, index=False)