Apply various correction rules to the objects' information.

### Initialization

In [None]:
import sys, os, datetime
sys.path.append(os.path.abspath('../src'))
import pandas as pd
import lib
import yaml

# Global variables
eta = lib.Eta()  # progress reporter driven by eta.begin / eta.iter / eta.end in the loops below
# NOTE: input and output point at the same file, so the CSV is corrected in place.
input_path = "../data/objects-all.csv"
output_path = "../data/objects-all.csv"
# Plain string literals: these paths have no placeholders, so no f-string is needed.
corrections_path = "./02-corrections.yaml"
authors_blacklist_path = "./01-authors-blacklist.yaml"
corrections = {}  # placeholder, replaced once the corrections file has been loaded

### Load objects

In [None]:
# Read the objects table; the following cells modify this DataFrame in place.
objects = pd.read_csv(input_path)

### Load corrections

In [3]:
# Read the correction tables. The encoding is pinned because open() otherwise
# falls back to the platform's locale encoding, which would garble accented
# keys on some systems (assumes the file is saved as UTF-8 -- TODO confirm).
with open(corrections_path, "r", encoding="utf-8") as f:
    corrections = yaml.safe_load(f)

### Remove prepending articles

In [None]:
# Words that are dropped wherever they occur as a whole, space-delimited word
prepending_articles = ["un", "une"]

eta.begin(len(objects), "Remove prepending articles")
for i, row in objects.iterrows():

    # object_type, material_technique and origin all receive the same treatment
    for column in ('object_type', 'material_technique', 'origin'):
        value = row[column]
        if pd.isna(value):
            continue

        # Elements are separated by ", "; the words of an element by a single space
        stripped = [
            ' '.join(w for w in element.split(' ') if w not in prepending_articles)
            for element in value.split(', ')
        ]
        objects.at[i, column] = ', '.join(stripped)

    eta.iter()
eta.end()

[ETA] Remove prepending articles - 3544 iterations in 00h00m00s (avg of 00h00m00s/iter)               


### Apply various rules (NLP)

In [None]:
eta.begin(len(objects), "Applying various NLP rules")
for i in objects.index:

    for column in ('description', 'object_type', 'material_technique'):
        text = objects.at[i, column]
        if pd.isna(text):
            continue

        # Remove the space that follows an apostrophe; a curly apostrophe
        # followed by a space is turned into a straight apostrophe
        text = text.replace("' ", "'").replace("’ ", "'")

        # Remove the irrelevant spaces around dashes (descriptions are left alone)
        if column != 'description':
            text = text.replace(" - ", "-")

        objects.at[i, column] = text

    eta.iter()
eta.end()

[ETA] Applying various NLP rules - 3544 iterations in 00h00m00s (avg of 00h00m00s/iter)               


### Apply corrections (from correction file)

In [None]:
def clean_str(text: str) -> str:
    """Tidy a single element after corrections have been applied.

    Surrounding whitespace is stripped, one trailing dash is removed, and a
    curly apostrophe followed by a space is replaced by a straight apostrophe
    followed by a space.

    Args:
        text: The element to clean.

    Returns:
        The cleaned element (possibly the empty string).
    """
    text = text.strip()
    # Slice off the dangling dash itself and keep everything before it.
    if text.endswith('-'):
        text = text[:-1]
    text = text.replace('’ ', "' ")
    # Strip again: removing the dash may have exposed a trailing space.
    return text.strip()


eta.begin(len(objects), 'Apply corrections from correction file')
for i, row in objects.iterrows():

    # Every column is looked up first in the shared "all" section, then in the
    # section of the corrections file that carries the column's own name
    for column in ('object_type', 'material_technique', 'origin', 'period', 'author'):
        if pd.isna(row[column]):
            continue

        corrected = []
        for element in row[column].split(', '):
            element = element.lower()

            if element in corrections['all']:
                replacement = corrections['all'][element]
            elif element in corrections[column]:
                replacement = corrections[column][element]
            else:
                # No correction is known: keep the (lower-cased) element as is
                replacement = element

            corrected.append(clean_str(replacement))

        # Apply changes
        objects.at[i, column] = ', '.join(corrected)

    eta.iter()
eta.end()

[ETA] Apply corrections from correction file - 3544 iterations in 00h00m00s (avg of 00h00m00s/iter)               


### Apply other manual corrections

In [None]:
for i, _ in objects.iterrows():

    # Prepare variables: lower-cased working copies of the row, with "" standing
    # in for missing values. Only the last five are written back at the end.
    descr = objects.at[i, 'description'].lower() if pd.notna(objects.at[i, 'description']) else ""
    obj_type = objects.at[i, 'object_type'].lower() if pd.notna(objects.at[i, 'object_type']) else ""
    mat_tech = objects.at[i, 'material_technique'].lower() if pd.notna(objects.at[i, 'material_technique']) else ""
    origin = objects.at[i, 'origin'].lower() if pd.notna(objects.at[i, 'origin']) else ""
    period = objects.at[i, 'period'].lower() if pd.notna(objects.at[i, 'period']) else ""
    author = objects.at[i, 'author'].lower() if pd.notna(objects.at[i, 'author']) else ""


    ##### PORCELAINES #####

    # Whenever there is "porcelaine de nymphenburg" in the description,
    # "Nymphenburg" is both the author and the origin
    if "porcelaine de nymphenburg" in descr:
        author = lib.add_element(author, "Nymphenburg")
        origin = lib.add_element(origin, "Nymphenburg")


    ##### BRONZES #####

    # All "bronze florentin" are actually "bronze" and origin is "Florence"
    if "bronze florentin" in mat_tech:
        mat_tech = lib.remove_element(mat_tech, 'bronze florentin')
        mat_tech = lib.add_element(mat_tech, 'bronze')
        origin = lib.add_element(origin, 'Florence')


    ##### KINGS #####

    # A reign used as a period implies a French origin; some reigns are also
    # replaced or completed by a century or a style
    if lib.has_element(period, 'henri ii'):
        period = lib.remove_element(period, 'henri ii')
        origin = lib.add_element(origin, 'France')
        period = lib.add_element(period, 'xvie siècle')

    if lib.has_element(period, 'henri iv'):
        period = lib.remove_element(period, 'henri iv')
        origin = lib.add_element(origin, 'France')
        period = lib.add_element(period, 'xvie siècle')

    if lib.has_element(period, 'louis xiii'):
        origin = lib.add_element(origin, 'France')

    if lib.has_element(period, 'louis xiv'):
        origin = lib.add_element(origin, 'France')

    if lib.has_element(period, 'louis xv'):
        origin = lib.add_element(origin, 'France')

    if lib.has_element(period, 'louis xvi'):
        origin = lib.add_element(origin, 'France')

    if lib.has_element(period, 'napoleon'):
        origin = lib.add_element(origin, 'France')
        period = lib.add_element(period, 'Empire')

    if lib.has_element(period, 'françois 1er'):
        origin = lib.add_element(origin, 'France')
        period = lib.add_element(period, 'xvie siècle')


    ##### STYLES #####

    # Style names found in the description (substring match) imply an origin
    # and, for some of them, a period
    if 'rocaille' in descr:
        origin = lib.add_element(origin, 'France')
        period = lib.add_element(period, 'Louis XV')
    if lib.has_element(period, 'rocaille'):
        period = lib.remove_element(period, 'rocaille')
        period = lib.add_element(period, 'Louis XV')
    if 'régence' in descr:
        origin = lib.add_element(origin, 'France')
        period = lib.add_element(period, 'Régence')
    if 'empire' in descr:
        origin = lib.add_element(origin, 'France')
        period = lib.add_element(period, 'Empire')
    if 'restauration' in descr:
        origin = lib.add_element(origin, 'France')
    if 'renaissance' in descr:
        origin = lib.add_element(origin, 'Europe')
        # NOTE(review): upper-case 'XVI' here, while the "siècle" pass further
        # down looks for lower-case 'xvi' -- confirm lib.has_element ignores case
        period = lib.add_element(period, 'XVI')
    if 'rococo' in descr:
        origin = lib.add_element(origin, 'Italie')


    ##### MISC #####

    # All "pierre de lard" are from "chine"
    if "pierre de lard" in mat_tech or "pierre de lard" in descr:
        origin = lib.add_element(origin, 'Chine')

    # Replace "tasse" by "coupe" when it is a mistake
    if "tasse" in obj_type and "tasse" not in descr and "coupe" in descr:
        obj_type = lib.remove_element(obj_type, "tasse")
        obj_type = lib.add_element(obj_type, "coupe")

    # All "jade" are from "chine"
    if "jade" in mat_tech:
        origin = lib.add_element(origin, 'Chine')

    # A bare "émail" among the object types is actually a "plaque en émail"
    if "émail" in obj_type.split(', '):
        obj_type = lib.remove_element(obj_type, "émail")
        obj_type = lib.add_element(obj_type, "plaque en émail")

    # Whenever there is "tonkin" in the description, it comes from Vietnam
    if "tonkin" in descr:
        origin = lib.add_element(origin, "Vietnam")

    # Remove "Tonkin" from origins
    if "tonkin" in origin:
        origin = lib.remove_element(origin, 'tonkin')

    # Whenever there is "bocaro" or "boccaro" in the description, it comes from italie
    if "bocaro" in descr or "boccaro" in descr:
        origin = lib.add_element(origin, "italie")

    # Remove "bocaro" from origins
    if "bocaro" in origin:
        origin = lib.remove_element(origin, 'bocaro')
    # Remove "boccaro" from origins
    if "boccaro" in origin:
        origin = lib.remove_element(origin, 'boccaro')

    # If "sèvres" in origin, it is also an author
    if "sèvres" in origin:
        author = lib.add_element(author, 'sèvres')

    # If "gobelins" in origin, it is also an author
    if "gobelins" in origin:
        author = lib.add_element(author, 'gobelins')

    # Remove elements from object types (errors).
    # Every entry needs its own trailing comma: adjacent string literals are
    # silently concatenated into a single entry that matches nothing.
    remove_object_types = [
        "groupe", "tête", "objet", "idem", "chien", "chat", "cheval", "éléphant", "autre", "poule",
        "baroques", "la vierge et l'enfant jésus", "dame", "aigle", "chien de fó", "femme", "mort de cléopâtre",
        "saint jean", "sphinx", "groupe d'enfant", "chien de fô", "tête d'enfant", "mandarin", "jeune femme",
        "mars et vénus", "pièce diverses", "tête d'homme", "tête de femme",
    ]
    for elt in remove_object_types:
        if lib.has_element(obj_type, elt):
            obj_type = lib.remove_element(obj_type, elt)


    # Origins
    # All of following origins are not origins, but actually a period
    replace_origins_by_periods = [
        "régence", "rocaille", "bas-empire", "gothique", "empire",
        "renaissance", "antique", "époque régence", "roman"
    ]
    for elt in replace_origins_by_periods:
        if lib.has_element(origin, elt):
            origin = lib.remove_element(origin, elt)
            period = lib.add_element(period, elt)


    # Origins
    # All of following origins are not origins, but actually authors
    replace_origins_by_author = [
        "thomire", "palissy", "boule", "donatello", "de lafosse",
        "wedgwood", "atelier d' erhard", "savonnerie"
    ]
    for elt in replace_origins_by_author:
        if lib.has_element(origin, elt):
            origin = lib.remove_element(origin, elt)
            author = lib.add_element(author, elt)

    # Origin blacklist
    origin_blacklist = [
        "palais de versailles", "grand-trianon", "mauresque", "clèves", "caroline", "la bastille", "christ en bas-rhin",
        "ce des inde", "tour", "nord", "trianon", "tartare", "malmaison", "maremme", "moine", "mauresque", "juliers", "étrangères",
        "janet", "bibliothèque nationale", "blesenne", "lafloques", "setangue", "dorot", "lévêcque", "bade", "ginori", "itrie",
        "cochinchine", "parme",
    ]
    for elt in origin_blacklist:
        if lib.has_element(origin, elt):
            origin = lib.remove_element(origin, elt)


    ##### Origins tweaks #####

    # NOTE(review): 'wedgwood' and 'savonnerie' are also listed in
    # replace_origins_by_author above, which already strips them from origin,
    # so those two branches presumably never fire -- confirm against lib.
    # 'savonnerie' mapping to 'angleterre' also looks like a copy-paste; verify.
    if lib.has_element(origin, 'wedgwood'):
        origin = lib.remove_element(origin, 'wedgwood')
        origin = lib.add_element(origin, 'angleterre')
        author = lib.add_element(author, 'wedgwood')
    if lib.has_element(origin, 'minton'):
        origin = lib.remove_element(origin, 'minton')
        origin = lib.add_element(origin, 'angleterre')
        author = lib.add_element(author, 'minton')
    if lib.has_element(origin, 'savonnerie'):
        origin = lib.remove_element(origin, 'savonnerie')
        origin = lib.add_element(origin, 'angleterre')
        author = lib.add_element(author, 'savonnerie')


    ##### Periods tweaks #####

    # For those who are on vi century, we observe that it is actually xvi, correct it if it is the case
    if lib.has_element(period, 'vi') and 'seizième' in descr:
        period = lib.remove_element(period, 'vi')
        period = lib.add_element(period, 'xvi')

    # "siècle" word is missing
    add_siecle = ['vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix']
    for to_add in add_siecle:
        if lib.has_element(period, to_add):
            period = lib.remove_element(period, to_add)
            period = lib.add_element(period, to_add + 'e siècle')

    # Those are not periods
    period_blacklist = ['zièm', 'moindr', 'iii', 'xxii']
    for bl in period_blacklist:
        if lib.has_element(period, bl):
            period = lib.remove_element(period, bl)


    ##### Author tweaks #####

    # Special case for Jean de Bologne.
    # Each phrase is tested against the description explicitly: a bare string
    # operand in an `or` chain is always truthy and would match every row.
    jean_de_bologne_phrases = (
        "d'après jean de bologne", "par jean de bologne",
        "de jean de bologne", "à jean de bologne",
    )
    if any(phrase in descr for phrase in jean_de_bologne_phrases):
        if lib.has_element(author, 'jean'):
            author = lib.remove_element(author, 'jean')
        author = lib.add_element(author, "jean de bologne")

    # Special case for Jean III Péneaud
    if "jean iii péneaud" in descr:
        if lib.has_element(author, 'jean iii'):
            author = lib.remove_element(author, 'jean iii')
        author = lib.add_element(author, "jean iii péneaud")

    # special case to remove from author and put as origin
    not_authors_but_origins = ["dresde", "genève", "naples", "carthage", "canton", "neuilly", "augsbourg", "ratisbonne", "leyde", "nuremberg", "saint-germain", "capo di monte"]
    for not_author in not_authors_but_origins:
        if lib.has_element(author, not_author):
            author = lib.remove_element(author, not_author)
            origin = lib.add_element(origin, not_author)



    # Set the new values (the description is only read, never written back)
    objects.at[i, 'object_type'] = lib.clean_elements_str(obj_type)
    objects.at[i, 'material_technique'] = lib.clean_elements_str(mat_tech)
    objects.at[i, 'origin'] = lib.clean_elements_str(origin)
    objects.at[i, 'period'] = lib.clean_elements_str(period)
    objects.at[i, 'author'] = lib.clean_elements_str(author)

### Remove all blacklisted authors

In [None]:
# Load the blacklist, trimmed and lower-cased. The encoding is pinned because
# open() otherwise uses the platform's locale encoding, which can garble
# accented names (assumes the file is saved as UTF-8 -- TODO confirm).
with open(authors_blacklist_path, "r", encoding="utf-8") as f:
    blacklist = [entry.strip().lower() for entry in yaml.safe_load(f)]


for i, row in objects.iterrows():

    # Do nothing if there is no author
    if pd.isna(row['author']):
        continue

    # Remove all blacklisted authors
    authors = row['author']
    for bl_auth in blacklist:
        authors = lib.remove_element(authors, bl_auth)

    # Reset in the table
    objects.at[i, 'author'] = lib.clean_elements_str(authors)



In [None]:
# Write the corrected table without the index column. output_path is set to the
# same file as input_path in the globals cell, so the source CSV is overwritten.
objects.to_csv(output_path, index=False)