In [58]:
import os
from dotenv import load_dotenv
import pandas as pd
import xml.etree.ElementTree as ET
from fuzzywuzzy import fuzz

In [65]:
# Load the variables defined in the local .env file into the process
# environment so that the os.getenv() lookups in the next cell can see them.
load_dotenv()

True

In [66]:
# Get the file paths from the environment variables
csv_file = os.getenv('CSV_FILE_PATH')
alto_file = os.getenv('ALTO_FILE_PATH')

# Fail fast when a path is missing. An empty value (e.g. a bare
# `CSV_FILE_PATH=` line in the .env file) comes back from os.getenv as '' and
# must be rejected too; an `is None` test would let it through and the error
# would only surface later as a confusing failure inside pd.read_csv.
if not csv_file:
    raise ValueError("The environment variable 'CSV_FILE_PATH' is not set in the .env file.")
if not alto_file:
    raise ValueError("The environment variable 'ALTO_FILE_PATH' is not set in the .env file.")

# Read the semicolon-delimited CSV and keep only the three columns used by
# the matching step, dropping every row where any of them is missing.
df = pd.read_csv(csv_file, delimiter=';')
df = df[['Page', 'Line', 'Text']].dropna()

In [61]:
# Preview the first rows of the transcription table loaded above
df.head()

Unnamed: 0,Page,Line,Text
4,1,1,Diari de les excavacions en el poblat de TEST
5,1,2,Sant Andreu d’Ullastret.
6,1,3,Antecedents i Bibliografia – a part d’una nota...
7,1,4,"ment publicada referent a aquest poblat, en el..."
8,1,5,"lum de VII-VIII de la Revista Empúries, corre..."


In [72]:
# Function to update ALTO XML with manual transcriptions using fuzzy matching
def update_alto_with_csv(alto_file, df, output_dir='output', similarity_threshold=90):
    """Replace HTR text in an ALTO XML file with manual transcriptions.

    Every ``String`` element found under ``TextBlock``/``TextLine`` is compared
    with every ``Text`` value of ``df`` using ``fuzz.ratio``. Pairs scoring at
    least ``similarity_threshold`` are then assigned one-to-one, best score
    first: each CSV row replaces the ``CONTENT`` of at most one ``String`` and
    each ``String`` receives at most one replacement. Scores are always
    computed against the original HTR content, never against text that was
    already replaced, so similar neighbouring lines cannot overwrite each other.

    The ``Page`` and ``Line`` columns are not used for matching; they only
    identify the row in the warning printed for rows that found no match.

    Parameters
    ----------
    alto_file : str
        Path of the ALTO XML file to read.
    df : pandas.DataFrame
        Table with the columns ``Page``, ``Line`` and ``Text``.
    output_dir : str, default 'output'
        Directory (created if needed) that receives the modified file, which
        keeps the base name of ``alto_file``.
    similarity_threshold : int, default 90
        Minimum ``fuzz.ratio`` score for a replacement.

    Returns
    -------
    str
        Path of the written output file.
    """
    # Parse the ALTO XML file
    tree = ET.parse(alto_file)
    root = tree.getroot()

    # Take the namespace from the root tag ('{uri}alto') instead of assuming
    # ALTO v4, so that v2/v3 or un-namespaced files are matched as well.
    root_tag = root.tag
    ns_prefix = root_tag[:root_tag.index('}') + 1] if root_tag.startswith('{') else ''

    # Collect the candidate String elements once, in document order.
    # dict.fromkeys removes duplicates that nested blocks would produce.
    strings = list(dict.fromkeys(
        string
        for text_block in root.iter(f'{ns_prefix}TextBlock')
        for text_line in text_block.iter(f'{ns_prefix}TextLine')
        for string in text_line.iter(f'{ns_prefix}String')
    ))
    htr_contents = [string.get('CONTENT', '') for string in strings]

    # str() guards against a Text column that pandas parsed as numbers, which
    # could neither be scored nor serialised as an XML attribute.
    rows = [
        (page_num, line_num, str(text))
        for page_num, line_num, text in zip(df['Page'], df['Line'], df['Text'])
    ]

    # Score every (row, String) pair and keep those reaching the threshold
    candidates = []
    for row_pos, (_, _, manual_text) in enumerate(rows):
        for string_pos, content in enumerate(htr_contents):
            similarity_score = fuzz.ratio(content, manual_text)
            if similarity_score >= similarity_threshold:
                candidates.append((similarity_score, row_pos, string_pos))

    # Best matches first; the sort is stable, so ties keep CSV/document order
    candidates.sort(key=lambda candidate: candidate[0], reverse=True)

    # Assign one-to-one: skip a pair when its row or its String is taken
    matched_rows = set()
    used_strings = set()
    for _, row_pos, string_pos in candidates:
        if row_pos in matched_rows or string_pos in used_strings:
            continue
        strings[string_pos].set('CONTENT', rows[row_pos][2])
        matched_rows.add(row_pos)
        used_strings.add(string_pos)

    # Report the rows that could not be placed anywhere
    for row_pos, (page_num, line_num, content) in enumerate(rows):
        if row_pos not in matched_rows:
            print(f"Warning: Line {line_num} on Page {page_num} was not replaced. Content: '{content}'")

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Write the modified ALTO XML next to nothing else: same base name, new directory
    output_file = os.path.join(output_dir, os.path.basename(alto_file))
    tree.write(output_file, encoding='utf-8', xml_declaration=True)
    return output_file

In [73]:
# Run the replacement with the defaults: output directory 'output' and a
# similarity threshold of 90.
update_alto_with_csv(alto_file, df)

