In [5]:
import re


def parse_ttl(file: str) -> dict:
    """Collect classes, literals, entities and properties from a Turtle file.

    The file content is split into statements at every "." that is followed
    by whitespace or that ends the file. Each statement is passed to
    ``parse_line`` and the per-statement results are concatenated.

    Args:
        file: Path of the Turtle (``.ttl``) file to read.

    Returns:
        A dict with the keys ``"classes"``, ``"literals"``, ``"entities"`` and
        ``"properties"``. Each value is a list of tokens in document order;
        duplicates are kept.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "\s+" already covers tabs and carriage returns, so one alternative is
    # enough. "\Z" also terminates the last statement when the file ends
    # right after the final "." without trailing whitespace.
    # NOTE: this is a textual split, so a ". " inside a quoted literal is
    # treated as a statement boundary as well.
    statement_end = r"\.(?:\s+|\Z)"

    result = {
        "classes": list(),
        "literals": list(),
        "entities": list(),
        "properties": list(),
    }

    # Read with an explicit encoding instead of the locale default, and use a
    # separate name for the handle so the `file` argument is not shadowed.
    with open(file, "r", encoding="utf-8") as handle:
        content = handle.read()

    for statement in re.split(statement_end, content):
        parsed = parse_line(statement)
        # parse_line only returns the keys it found something for.
        for key, values in result.items():
            values.extend(parsed.get(key, []))

    return result


def parse_line(line: str) -> dict:
    """Extract classes, entities, literals and properties from one statement.

    The statement is tokenised on whitespace. The first token is the subject;
    the two tokens after it form the first predicate/object pair, and the two
    tokens after every ";" token form a further pair for the same subject.

    Args:
        line: One Turtle statement without its terminating ".".

    Returns:
        A dict that may contain the keys ``"classes"``, ``"entities"``,
        ``"literals"`` and ``"properties"``; a key is present only when its
        list is non-empty. Empty statements, ``@prefix`` declarations and
        statements with no complete triple yield ``{}``.
    """
    tokens = line.split()

    # skip empty statements and prefix declarations
    if not tokens or any(token.lower() == "@prefix" for token in tokens):
        return {}

    subject = tokens[0]

    # Index 0 starts the first triple; every ";" starts another one that
    # reuses the subject.
    starts = [0] + [i for i, token in enumerate(tokens) if token == ";"]

    triples = list()
    for start in starts:
        # Skip incomplete groups (too few tokens, trailing or doubled ";")
        # instead of raising IndexError or recording ";" as a term.
        if start + 2 >= len(tokens):
            continue
        predicate, obj = tokens[start + 1], tokens[start + 2]
        if predicate == ";" or obj == ";":
            continue
        triples.append((subject, predicate, obj))

    classes = list()
    entities = list()
    literals = list()
    properties = list()

    for subj, predicate, obj in triples:
        entities.append(subj)

        # "a" is Turtle shorthand for rdf:type, so the object names a class.
        if predicate == "a":
            classes.append(obj)
            continue

        # The property of a triple is its predicate, not its object.
        properties.append(predicate)

        if obj.startswith('"') and obj.endswith('"'):
            literals.append(obj)

    found = {
        "classes": classes,
        "entities": entities,
        "literals": literals,
        "properties": properties,
    }

    # Only report the categories that actually occurred.
    return {key: values for key, values in found.items() if values}

In [7]:
import json

# Turtle file to analyse; the path is relative to the working directory.
ttl_file = "datasets/16034/ore-rem.ttl"

# Collect the classes, entities, literals and properties found in the file.
d = parse_ttl(ttl_file)

# Pretty-print the result as JSON with sorted keys and a 4-space indent.
print(json.dumps(d, sort_keys=True, indent=4))

{
    "classes": [
        "ore:ResourceMap",
        "ldp:Container",
        "ldp:Container",
        "ldp:Container",
        "ldp:Container",
        "ldp:Container",
        "ldp:Container",
        "ldp:Container",
        "ldp:Container",
        "ldp:Container",
        "ldp:Container",
        "ldp:Container",
        "ore:Aggregation",
        "ldp:Container",
        "ldp:Container",
        "ldp:Container",
        "ldp:Container",
        "ldp:Container",
        "ldp:Container",
        "ldp:Container",
        "ldp:Container"
    ],
    "entities": [
        "<>",
        "<>",
        "<bag://9F64ACFA-C054-407A-A97D-59DD081A3334/data/obj/9F64ACFA-C054-407A-A97D-59DD081A3334/data/ftp.cmdl.noaa.gov/data/trace_gases/h2/flask/surface/h2_flask_scs_n15_event.txt.ttl>",
        "<bag://9F64ACFA-C054-407A-A97D-59DD081A3334/data/obj/9F64ACFA-C054-407A-A97D-59DD081A3334/data/ftp.cmdl.noaa.gov/data/trace_gases/h2/flask/surface/h2_flask_brw_all_month.txt.ttl>",
        "<bag://9F64