In [1]:
import rdflib

# Location of the Turtle file to load
brick_file = "Brick.ttl"  # Update with your path

# Create an empty RDF graph and read the Turtle file into it
g = rdflib.Graph()
g.parse(source=brick_file, format="turtle")

# Collects the abbreviated term names; a set keeps each one only once
full_terms = set()

# Common namespaces

# The three namespaces whose terms are collected by the extraction loop
brick_ns = rdflib.Namespace("https://brickschema.org/schema/Brick#")
qudt_ns = rdflib.Namespace("http://qudt.org/schema/qudt/")
sh_ns = rdflib.Namespace("http://www.w3.org/ns/shacl#")

# brickschema.org namespaces
brick = rdflib.Namespace("https://brickschema.org/schema/Brick#")
bsh = rdflib.Namespace("https://brickschema.org/schema/BrickShape#")
ref = rdflib.Namespace("https://brickschema.org/schema/Brick/ref#")
tag = rdflib.Namespace("https://brickschema.org/schema/BrickTag#")

# qudt.org namespaces
qudt = rdflib.Namespace("http://qudt.org/schema/qudt/")
qudtqk = rdflib.Namespace("http://qudt.org/vocab/quantitykind/")
unit = rdflib.Namespace("http://qudt.org/vocab/unit/")

# data.ashrae.org namespaces
bacnet = rdflib.Namespace("http://data.ashrae.org/bacnet/2020#")
s223 = rdflib.Namespace("http://data.ashrae.org/standard223#")

# www.w3.org namespaces
# NOTE(review): `sosam` and `vcardm` look like variants of the usual
# `sosa` / `vcard` prefixes — confirm the spelling is intentional.
owl = rdflib.Namespace("http://www.w3.org/2002/07/owl#")
rdf = rdflib.Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
rdfs = rdflib.Namespace("http://www.w3.org/2000/01/rdf-schema#")
sh = rdflib.Namespace("http://www.w3.org/ns/shacl#")
skos = rdflib.Namespace("http://www.w3.org/2004/02/skos/core#")
sosam = rdflib.Namespace("http://www.w3.org/ns/sosa/")
vcardm = rdflib.Namespace("http://www.w3.org/2006/vcard/ns#")
xsd = rdflib.Namespace("http://www.w3.org/2001/XMLSchema#")

# Remaining namespaces
dcterms = rdflib.Namespace("http://purl.org/dc/terms/")
rec = rdflib.Namespace("https://w3id.org/rec#")
sdo = rdflib.Namespace("http://schema.org/")

# Extract full terms (classes and properties)
#
# Every node of every triple (subject, predicate, object) is treated the same
# way: if it is an IRI inside one of the three namespaces of interest, its
# namespace IRI is swapped for the short prefix and the result is recorded.

# (namespace IRI as plain string, abbreviation) pairs, computed once up front
term_prefixes = (
    (str(brick_ns), "brick:"),
    (str(qudt_ns), "qudt:"),
    (str(sh_ns), "sh:"),
)

for s, p, o in g:
    for node in (s, p, o):
        # Only IRIs can name a class or property. Literals and blank nodes are
        # skipped so that a literal whose text merely begins with a namespace
        # IRI (e.g. an xsd:anyURI value) is not mistaken for a term.
        if not isinstance(node, rdflib.URIRef):
            continue

        iri = str(node)
        for ns_iri, abbrev in term_prefixes:
            if iri.startswith(ns_iri):
                # Strip only the leading namespace; str.replace would also
                # rewrite any later occurrence of the IRI inside the name.
                full_terms.add(abbrev + iri[len(ns_iri):])
                break

# Save full terms to a text file, one term per line in sorted order.
# The encoding is pinned to UTF-8 so the file is identical on every platform
# (the default for open() depends on the locale) and so that terms containing
# non-ASCII characters cannot raise UnicodeEncodeError.
with open("brick_full_terms.txt", "w", encoding="utf-8") as f:
    for term in sorted(full_terms):
        f.write(f"{term}\n")

print(
    f"Extracted {len(full_terms)} unique full terms and saved to 'brick_full_terms.txt'."
)

Extracted 1723 unique full terms and saved to 'brick_full_terms.txt'.


In [2]:
import rdflib
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Location of the Turtle file to load
brick_file = "Brick.ttl"  # Update with your path to Brick.ttl

# Start from an empty RDF graph and read the Turtle file into it
g = rdflib.Graph()
g.parse(source=brick_file, format="turtle")

# Prefix -> rdflib.Namespace lookup table.
# Written as (prefix, IRI) pairs so the Namespace wrapping happens in one
# place; the pair order is the dict's iteration order.
namespaces = {
    prefix: rdflib.Namespace(iri)
    for prefix, iri in (
        ("bacnet", "http://data.ashrae.org/bacnet/2020#"),
        ("brick", "https://brickschema.org/schema/Brick#"),
        ("bsh", "https://brickschema.org/schema/BrickShape#"),
        ("dcterms", "http://purl.org/dc/terms/"),
        ("owl", "http://www.w3.org/2002/07/owl#"),
        ("qudt", "http://qudt.org/schema/qudt/"),
        ("qudtqk", "http://qudt.org/vocab/quantitykind/"),
        ("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"),
        ("rdfs", "http://www.w3.org/2000/01/rdf-schema#"),
        ("rec", "https://w3id.org/rec#"),
        ("ref", "https://brickschema.org/schema/Brick/ref#"),
        ("s223", "http://data.ashrae.org/standard223#"),
        ("sdo", "http://schema.org/"),
        ("sh", "http://www.w3.org/ns/shacl#"),
        ("skos", "http://www.w3.org/2004/02/skos/core#"),
        ("sosa", "http://www.w3.org/ns/sosa/"),
        ("tag", "https://brickschema.org/schema/BrickTag#"),
        ("unit", "http://qudt.org/vocab/unit/"),
        ("vcard", "http://www.w3.org/2006/vcard/ns#"),
        ("xsd", "http://www.w3.org/2001/XMLSchema#"),
    )
}

# Set to store unique full terms (classes and relations)
full_terms = set()

# Extract classes and relations from the specified namespaces.
#
# Subject, predicate and object are all handled identically: an IRI that
# falls inside one of the `namespaces` entries is recorded as "prefix:local".

# Convert each namespace to a plain string once instead of once per node.
prefix_by_iri = [(str(ns), f"{prefix}:") for prefix, ns in namespaces.items()]

for s, p, o in g:
    for node in (s, p, o):
        # Only IRIs can name a class or relation. Literals and blank nodes are
        # skipped so that a literal whose text merely begins with a namespace
        # IRI (e.g. an xsd:anyURI value) is not mistaken for a term.
        if not isinstance(node, rdflib.URIRef):
            continue

        iri = str(node)
        for ns_iri, abbrev in prefix_by_iri:
            if iri.startswith(ns_iri):
                # Strip only the leading namespace; str.replace would also
                # rewrite any later occurrence of the IRI inside the name.
                full_terms.add(abbrev + iri[len(ns_iri):])
                break  # first matching namespace wins

# Save full terms to a text file, one term per line in sorted order.
# The encoding is pinned to UTF-8 so the file is identical on every platform
# (the default for open() depends on the locale) and so that terms containing
# non-ASCII characters cannot raise UnicodeEncodeError.
with open("all_relations_and_classes.txt", "w", encoding="utf-8") as f:
    for term in sorted(full_terms):
        f.write(f"{term}\n")

print(
    f"Extracted {len(full_terms)} unique terms (classes and relations) and saved to 'all_relations_and_classes.txt'."
)

# # Optional: Add tokens to T5 tokenizer
# tokenizer = T5Tokenizer.from_pretrained("t5-small")
# num_added = tokenizer.add_tokens(list(full_terms))
# print(f"Added {num_added} new tokens to the tokenizer.")

# # Save updated tokenizer
# tokenizer.save_pretrained("t5_with_all_tokens")

# # Load model and resize embeddings
# model = T5ForConditionalGeneration.from_pretrained("t5-small")
# model.resize_token_embeddings(len(tokenizer))
# model.save_pretrained("t5_with_all_tokens")

# print("Tokenizer and model updated with all terms from specified namespaces.")

  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Extracted 3332 unique terms (classes and relations) and saved to 'all_relations_and_classes.txt'.
