In [None]:
import hashlib
from bs4 import BeautifulSoup
import json
import re

# Function to generate a unique ID
def generate_id(name, date):
    """Return a stable identifier for an entry.

    The identifier is the hexadecimal MD5 digest of ``"<name>_<date>"``
    (UTF-8 encoded), so the same name/date pair always yields the same ID.
    """
    hasher = hashlib.md5()
    hasher.update(f"{name}_{date}".encode())
    return hasher.hexdigest()

# Function to split and prepend the prefix to each number
def split_and_prepend(input_string):
    """Expand a comma-separated citation into one full citation per number.

    The text before the first comma is the first citation. Everything in it
    up to its last space is the shared prefix, which is prepended to each of
    the remaining comma-separated numbers, e.g.
    ``"IG V 1. 682, 683"`` -> ``["IG V 1. 682", "IG V 1. 683"]``.

    Args:
        input_string: Citation text, optionally containing commas.

    Returns:
        A list of citation strings in their original order. When the input
        contains no comma, the list holds the stripped input unchanged.
    """
    head, comma, tail = input_string.partition(",")
    if not comma:
        # Nothing to split: treat the whole string as a single citation
        # instead of raising ValueError.
        return [input_string.strip()]

    # Shared prefix = first citation minus its trailing number.
    prefix = head.rsplit(" ", 1)[0]

    numbers = [num.strip() for num in tail.split(",")]

    # Keep the first citation as written; previously it was dropped and only
    # the numbers after the first comma were returned.
    return [head.strip()] + [f"{prefix} {num}" for num in numbers]

# Extract every entry from the HTML file and save the result as JSON.
#
# Each <thead> holding exactly two <th> cells (name, date) starts an entry.
# The next <blockquote> after it supplies the entry's details: its first <p>
# lists the sources (separated by ";"), every later <p> lists facts
# (separated by ","). A trailing <sup>letters</sup> marks reference letters.

# Specify the path to your HTML file
file_path = "mvp.html"  # Replace with the actual path to your HTML file

# Text followed by a closing <sup>...</sup> run of lowercase letters:
# group 1 is the text, group 2 the reference letters.
SUP_PATTERN = re.compile(r"^(.*?)<sup>([a-z]+)</sup>$")


def _split_sup(text):
    """Split ``text`` into (stripped text, list of reference letters).

    Returns None when ``text`` (after stripping) does not end in a
    ``<sup>letters</sup>`` marker.
    """
    match = SUP_PATTERN.match(text.strip())
    if match is None:
        return None
    return match.group(1).strip(), list(match.group(2))


# Read the file and release the handle before parsing.
with open(file_path, "r", encoding="utf-8") as file:
    html_content = file.read()

soup = BeautifulSoup(html_content, "html.parser")

# Structured data, one dictionary per entry
entries = []

# Iterate through each <thead> (entry header)
for entry in soup.find_all("thead"):
    name_date = entry.find_all("th")
    if len(name_date) != 2:
        # Not an entry header (it must hold exactly a name and a date).
        continue

    name = name_date[0].get_text(strip=True)
    date = name_date[1].get_text(strip=True)

    entry_data = {
        "ID": generate_id(name, date),
        "Name": name,
        "Date": date,
        "Sources": [],
        "Facts": []
    }

    # Find the corresponding <blockquote> for this entry
    blockquote = entry.find_next("blockquote")
    # A blockquote without any <p> yields no sources and no facts instead of
    # crashing on a missing first paragraph.
    paragraphs = blockquote.find_all("p") if blockquote is not None else []

    if paragraphs:
        # First <p>: sources and their reference letters
        sources_text = paragraphs[0].decode_contents().replace("\n", "")
        sources = []
        for source in sources_text.split(";"):
            if "," in source:
                # Several numbers share one prefix; expand them first.
                for part in split_and_prepend(source):
                    parsed = _split_sup(part)
                    if parsed is None:
                        # NOTE(review): comma-separated parts without a
                        # <sup> marker are dropped here, while a single
                        # unmarked source (below) is kept - confirm intent.
                        continue
                    resource, references = parsed
                    sources.extend(
                        {"Resource": resource, "Reference": ref}
                        for ref in references
                    )
            else:
                parsed = _split_sup(source)
                if parsed is None:
                    # Source without reference letters: keep the text only.
                    sources.append({"Resource": source.strip()})
                else:
                    resource, references = parsed
                    sources.extend(
                        {"Resource": resource, "Reference": ref}
                        for ref in references
                    )

        entry_data["Sources"] = sources

        # Remaining <p> elements: facts and the letters of their sources
        for fact_line in paragraphs[1:]:
            fact_text = fact_line.decode_contents().replace("\n", "")
            for part in fact_text.split(","):
                parsed = _split_sup(part)
                if parsed is None:
                    # Fact without <sup> tags: no source references
                    fact, references = part.strip(), []
                else:
                    fact, references = parsed
                entry_data["Facts"].append({
                    "Fact": fact,
                    "Sources": references
                })

    # Append the entry to the entries list
    entries.append(entry_data)

# Output the structured data as JSON
output_file = "output_with_ids7.json"  # Specify your output file name
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(entries, f, ensure_ascii=False, indent=4)

print(f"Data extraction complete. Saved to '{output_file}'.")

Grunauer, p. 53 (emission mark 6 <img alt="sigla 06" src="media/image1.jpeg" style="width:0.27083in
height:0.11806in"/>), Gruppe XIII(48-40 BC), Serie 5, Tafel 8, Gruppe XIV (48-40 BC), Serie 11, Tafel 8,Gruppe XV (48-40 BC), Serie 9-10, Tafel 9, Gruppe XVI (43-31 BC), Serie1, Tafel 11 suggests Amphimachos or Amphis (but the abbreviation seemsto include a rho)
Rizakis #247
 IG V 1. 682
Rizakis #247
 IG V 1. 682
SEG XI 565
IG V 1. 118 (partial A)
IG V 1. 94 (partial A, 3-4 letters missing)
Rizakis #379
 IG V 1. 786 (partial A)
Rizakis #248 (“Severan”)
 IG V 1. 303
Rizakis #248 (“Severan”)
 IG V 1. 303
Rizakis #248 (“Severan”)
 IG V 1. 303
IG V 1. 380
IG VII 417<sup>a</sup>
 [XII. 3 (supplement) 1299<sup>b</sup>,1625<sup>c</sup>]
IG VII 417<sup>a</sup>
 [XII. 3 (supplement) 1299<sup>b</sup>,1625<sup>c</sup>]
IG VII 417<sup>a</sup>
 [XII. 3 (supplement) 1299<sup>b</sup>,1625<sup>c</sup>]
IG V 1. 212
[IG VII 417]<sup>a</sup>
 XII. 3 (supplement) 1299<sup>b</sup>,1625<sup>c</sup>
Grunauer, 

In [3]:
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import DC, FOAF, RDF, RDFS
from urllib.parse import quote
import hashlib
import json

# Function to generate a unique ID based on the name and date
def generate_id(entry):
    """Return a stable identifier for an entry dictionary.

    The identifier is the hexadecimal MD5 digest of the entry's ``"Name"``
    and ``"Date"`` values joined by an underscore.
    """
    name = entry['Name']
    date = entry['Date']
    hasher = hashlib.md5()
    hasher.update(f"{name}_{date}".encode())
    return hasher.hexdigest()

# Convert the structured JSON entries into an RDF graph and save it as Turtle.

# Load the structured data; the handle is only needed for the JSON read.
with open("output_with_ids.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Create an RDF graph
g = Graph()

# Define namespaces
EX = Namespace("http://example.org/ontology/")  # Custom namespace for your data
CRM = Namespace("http://www.cidoc-crm.org/cidoc-crm/")

g.bind("ex", EX)
g.bind("dc", DC)
g.bind("foaf", FOAF)
g.bind("crm", CRM)

# Add entries to the graph
for entry in data:
    # Unique ID derived from name and date; stored as a property below
    unique_id = generate_id(entry)

    # The person URI is built from the percent-encoded name.
    # NOTE(review): entries sharing a name but not a date end up on the same
    # node; switch to the unique ID here if they must stay distinct.
    person_uri = URIRef(f"http://example.org/person/{quote(entry['Name'])}")
    g.add((person_uri, RDF.type, CRM.E21_Person))
    g.add((person_uri, FOAF.name, Literal(entry["Name"])))
    g.add((person_uri, DC.date, Literal(entry["Date"])))
    # Add a human-readable label
    g.add((person_uri, RDFS.label, Literal(entry["Name"])))
    # Add the unique ID as a property
    g.add((person_uri, EX.hasID, Literal(unique_id)))

    # Add sources
    for source in entry["Sources"]:
        # Encode source URI
        source_uri = URIRef(f"http://example.org/source/{quote(source['Resource'])}")
        g.add((person_uri, EX.hasSource, source_uri))
        # A source may carry no "Reference" key (presumably sources that had
        # no <sup> marker in the input); emit the triple only when present
        # instead of raising KeyError.
        reference = source.get("Reference")
        if reference is not None:
            g.add((source_uri, EX.hasReference, Literal(reference)))

    # Add facts
    for fact in entry["Facts"]:
        fact_node = URIRef(f"http://example.org/fact/{quote(fact['Fact'])}")
        g.add((person_uri, EX.hasFact, fact_node))
        g.add((fact_node, DC.description, Literal(fact["Fact"])))
        for ref in fact["Sources"]:
            g.add((fact_node, EX.hasSourceReference, Literal(ref)))

# Save the RDF graph to a file
output_rdf_file = "output.ttl"
g.serialize(destination=output_rdf_file, format="turtle")

print(f"RDF data saved to '{output_rdf_file}'.")

RDF data saved to 'output.ttl'.
