In [27]:
import re

ttl_file = "datasets/16034/ore-rem.ttl"

# Regular expression patterns
pattern = r"\.\s+|\.\t+|\.\r+"

matches = None

# Open the TTL file
with open(ttl_file, "r") as file:
    # Read the file content
    content = file.read()

    # Find matches using the pattern
    matches = re.split(pattern, content)

In [28]:
matches

['@prefix ore:   <http://www.openarchives.org/ore/terms/> ',
 '@prefix ldp:   <http://www.w3.org/ns/ldp#> ',
 '@prefix iana:  <http://www.iana.org/assignments/relation/> ',
 '<>      a              ore:ResourceMap ;\n        ore:describes  <#Aggregation> ',
 '<bag://9F64ACFA-C054-407A-A97D-59DD081A3334/data/obj/9F64ACFA-C054-407A-A97D-59DD081A3334/data/ftp.cmdl.noaa.gov/data/trace_gases/h2/flask/surface/h2_flask_scs_n15_event.txt.ttl>\n        iana:describes  <bag://9F64ACFA-C054-407A-A97D-59DD081A3334/data/bin/9F64ACFA-C054-407A-A97D-59DD081A3334/data/ftp.cmdl.noaa.gov/data/trace_gases/h2/flask/surface/h2_flask_scs_n15_event.txt> ',
 '<bag://9F64ACFA-C054-407A-A97D-59DD081A3334/data/obj/9F64ACFA-C054-407A-A97D-59DD081A3334/data/ftp.cmdl.noaa.gov/data/trace_gases/h2/flask/surface/h2_flask_brw_all_month.txt.ttl>\n        iana:describes  <bag://9F64ACFA-C054-407A-A97D-59DD081A3334/data/bin/9F64ACFA-C054-407A-A97D-59DD081A3334/data/ftp.cmdl.noaa.gov/data/trace_gases/h2/flask/surface/h2_fl

In [29]:
# Process the matches

def split_in_triples(lst: list):

    groups = list()

    i = 0
    while i < len(lst):
        terminal = i+3 if i+3 < len(lst) else len(lst) - i
        groups.append(lst[i:terminal])
        i += 3

    return groups

def process_triples(triples: list) -> dict:

    classes = list()
    literals = list()
    properties = list()
    entities = list()

    for match in matches:
        tokens = match.split()

        if len(tokens) < 1 or "@prefix" in tokens[0].lower():
            continue

        triples = [x for x in split_in_triples(tokens) if x != []]

        for i in range(0, len(triples) - 1):
            s = triples[i][0]
            p = triples[i][1]
            o = triples[i][2]

            # If you read a comma as subject of the next triple, group together all the triples in a subject until you get a point
            if triples[i + 1][0] == ",":
                y = i
                finished_list = False

                object_list = list()
                object_list.append(o)

                while y < len(triples) and not finished_list:
                    for val in triples[y]:
                        if val == ".":
                            i = y
                            finished_list = True
                        if val == ",":
                            continue
                        object_list.append(val)
                    y += 1

                # increment i to skip processed triples
                i = y

                # assign the correct class
                if p == "a":
                    entities.append(s)
                    classes += object_list
                else:
                    entities.append(s)
                    entities += object_list
                    properties.append(o)

                continue

            # If you read 'a' as predicate then subject is an entity and object is a class
            if p == "a":
                entities.append(s)
                classes.append(o)
                continue

            # literals should be wrapped around
            if o.startswith('"') and o.endswith('"'):
                entities.append(s)
                properties.append(p)
                literals.append(o)
                continue

            # if none of the cases above
            entities.append(s)
            entities.append(o)
            properties.append(o)

    return {
        "classes": classes,
        "literals": literals, 
        "properties": properties,
        "entities": entities
    }

In [30]:
process_triples(matches)

{'classes': ['ldp:Container',
  'ldp:Container',
  'ldp:Container',
  'ldp:Container',
  'ore:Aggregation',
  'ldp:Container',
  'ldp:Container',
  'ldp:Container',
  'ldp:Container'],
 'literals': [],
 'properties': ['<bag://9F64ACFA-C054-407A-A97D-59DD081A3334/data/bin/9F64ACFA-C054-407A-A97D-59DD081A3334/data/ftp.ncdc.noaa.gov/pub/data/paleo/icecore/antarctica/epica_domec/edc_dust.txt>',
  ',',
  '<bag://9F64ACFA-C054-407A-A97D-59DD081A3334/data/bin/9F64ACFA-C054-407A-A97D-59DD081A3334/data/ftp.ncdc.noaa.gov/pub/data/paleo/icecore/antarctica/epica_domec/edc99_ecm.txt>',
  ',',
  '<bag://9F64ACFA-C054-407A-A97D-59DD081A3334/data/bin/9F64ACFA-C054-407A-A97D-59DD081A3334/data/ftp.ncdc.noaa.gov/pub/data/paleo/icecore/antarctica/epica_domec/edc2009d13co2.txt>',
  ',',
  '<bag://9F64ACFA-C054-407A-A97D-59DD081A3334/data/bin/9F64ACFA-C054-407A-A97D-59DD081A3334/data/ftp.ncdc.noaa.gov/pub/data/paleo/icecore/antarctica/epica_domec/edc-n2o-2010-800k.xls>',
  ',',
  '<bag://9F64ACFA-C054-407A-