Merge all individual object files into one CSV table.

### Initialize

In [None]:
import sys, os
sys.path.append(os.path.abspath('../src'))
import pandas as pd
import lib


# Global variables
# Column layout of the merged table: used as the header of a freshly created
# output file and as the final column order before saving.
columns = [
    'catalog',
    'index',
    'description',
    'number',
    'object_type',
    'material_technique',
    'origin',
    'author',
    'period',
    'verify',
]

# Directory scanned for one sub-folder per catalog, and the merged CSV that is
# both read at start-up and rewritten at the end.
folder_path = "../catalogs/"
output_path = "../data/objects-all.csv"

# Helper from the project `lib` module, driven via begin()/iter()/end() around
# the merge loop (presumably a progress / ETA reporter -- confirm in lib).
eta = lib.Eta()

### Initialize all objects file

In [None]:
# First run only: write a header-only CSV (no rows) so that the load step
# always finds a file with the expected columns.
if not os.path.exists(output_path):
    pd.DataFrame(columns=columns).to_csv(output_path, index=False)

### Load all objects

In [None]:
# Read the merged table produced by previous runs (or the header-only file).
all_objects = pd.read_csv(output_path)

# Pass every stored index through lib.try_parse_int, the same helper applied
# to the catalog files during the merge (presumably it turns numeric-looking
# values into ints and leaves the rest untouched -- confirm in lib).
parsed_index = all_objects['index'].apply(lib.try_parse_int)
all_objects['index'] = parsed_index

### Merge all catalog objects

In [None]:
# Find all objects from extracted catalogs: every sub-folder of folder_path is
# treated as one catalog, named after the folder.
catalogs_folders = [f for f in os.listdir(folder_path) if os.path.isdir(os.path.join(folder_path, f))]

# Frames are collected here and merged once after the loop, instead of
# re-concatenating and re-deduplicating the whole table for every catalog.
catalog_frames = []

eta.begin(len(catalogs_folders), 'Merging all catalogs')
for folder in catalogs_folders:
    path = os.path.join(folder_path, folder, 'objects.csv')
    # Catalog folders without an objects.csv are skipped.
    if os.path.exists(path):
        # Load the catalog objects
        objects = pd.read_csv(path)
        objects['index'] = objects['index'].astype(pd.StringDtype()).apply(lib.try_parse_int)

        # Add catalog information (overwrites any 'catalog' column in the file)
        objects['catalog'] = folder

        catalog_frames.append(objects)

    eta.iter()
eta.end()

if catalog_frames:
    # Most recently read catalog first, previously saved rows last: this is the
    # row order the per-iteration concat used to produce.
    all_objects = pd.concat([*reversed(catalog_frames), all_objects])

    # Deduplicate on (catalog, index). With keep='first', rows freshly read
    # from a catalog win over rows already present in the saved table.
    all_objects = all_objects.drop_duplicates(subset=['catalog', 'index'], keep='first')

### Format and sort all objects

In [None]:
# Keep only the expected columns, in their canonical order, then sort by
# catalog and by the numeric value of the index. The numeric key lives in a
# temporary 'index_int' column (non-numeric indexes become NaN through
# errors='coerce') that is dropped again once the rows are ordered.
all_objects = (
    all_objects[columns]
    .assign(index_int=lambda frame: pd.to_numeric(frame['index'], errors='coerce'))
    .sort_values(by=['catalog', 'index_int'])
    .drop(columns=['index_int'])
)

### Save all objects

In [None]:
# Overwrite the merged CSV (without the DataFrame row labels) and report how
# many objects it now holds.
all_objects.to_csv(output_path, index=False)
object_count = len(all_objects)
print('Total number of objects:', object_count)