In [1]:
from pathlib import Path

PROJ_DIR  = Path.cwd().parent
print(f"{PROJ_DIR=}")
print(f"{PROJ_DIR.as_uri()=}")

PROJ_DIR=PosixPath('/home/robert/git/pythinfer')
PROJ_DIR.as_uri()='file:///home/robert/git/pythinfer'


# Insights

1. creating a new `Dataset` with the same 'store' as another will yield essentially identical `Dataset`s - it will have the same graphs and triples/quads in it. I.e., a `Dataset` *cannot* be used as a 'view' on a subset of graphs.
1. 


In [2]:
from rdflib import Dataset

DATA_DIR = PROJ_DIR / "example_projects" / "eg1-ancestors"

DATA_FILES = list(DATA_DIR.glob("*-data.ttl"))
VOCAB_INTERNAL_FILES = list(DATA_DIR.glob("*-model.ttl"))

ds = Dataset()


for f in VOCAB_INTERNAL_FILES + DATA_FILES:
    g = ds.graph(identifier=f.name)
    g.parse(f, format="turtle")
    print(f"Loaded {len(g)} triples into graph {g.identifier} from {f.name}")


Loaded 22 triples into graph ancestors-model.ttl from ancestors-model.ttl
Loaded 9 triples into graph ancestors-data.ttl from ancestors-data.ttl


In [3]:
from datetime import datetime

from rdflib import DCTERMS, OWL, RDF, Literal, URIRef

manual_gid = URIRef("http://example.org/merged")
g = ds.graph(manual_gid)
g.add((URIRef("http://example.org/merged"), RDF.type, OWL.Ontology))
g.add((URIRef("http://example.org/merged"), DCTERMS.created, Literal(datetime.now())))


<Graph identifier=http://example.org/merged (<class 'rdflib.graph.Graph'>)>

In [4]:
for g in ds.graphs():
    print(f"Graph ID: {g.identifier} has {len(g)} triples")

Graph ID: http://example.org/merged has 2 triples
Graph ID: ancestors-data.ttl has 9 triples
Graph ID: ancestors-model.ttl has 22 triples
Graph ID: urn:x-rdflib:default has 0 triples


In [5]:
for t in ds.quads((None, None, None, manual_gid)):
    print(t)

(rdflib.term.URIRef('http://example.org/merged'), rdflib.term.URIRef('http://purl.org/dc/terms/created'), rdflib.term.Literal('2025-11-20T12:00:09.765068', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#dateTime')), rdflib.term.URIRef('http://example.org/merged'))
(rdflib.term.URIRef('http://example.org/merged'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://www.w3.org/2002/07/owl#Ontology'), rdflib.term.URIRef('http://example.org/merged'))


In [6]:
# Create a new dataset view containing only the data and manual graphs
ds_view = Dataset(store=ds.store)
for g in ds_view.graphs():
    print(f"Graph ID: {g.identifier} has {len(g)} triples")

Graph ID: http://example.org/merged has 2 triples
Graph ID: urn:x-rdflib:default has 0 triples
Graph ID: ancestors-data.ttl has 9 triples
Graph ID: ancestors-model.ttl has 22 triples


In [None]:
ds_view.add_graph(ds.graph("ancestors-data.ttl"))
ds_view.add_graph(ds.graph(manual_gid))

In [7]:
for ii, t in enumerate(ds_view.quads()):
    print(f"{ii:02d}: {t}")

00: (rdflib.term.BNode('na667eb3401494722a73023aded74ae0fb2'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#rest'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#nil'), rdflib.term.URIRef('ancestors-model.ttl'))
01: (rdflib.term.URIRef('http://example.org/merged'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://www.w3.org/2002/07/owl#Ontology'), rdflib.term.URIRef('http://example.org/merged'))
02: (rdflib.term.URIRef('http://example.org/ancestor/ancestorOf'), rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#label'), rdflib.term.Literal('ancestor of'), rdflib.term.URIRef('ancestors-model.ttl'))
03: (rdflib.term.URIRef('http://example.org/ancestor/Alice'), rdflib.term.URIRef('http://example.org/ancestor/parentOf'), rdflib.term.URIRef('http://example.org/ancestor/Bob'), rdflib.term.URIRef('ancestors-data.ttl'))
04: (rdflib.term.URIRef('http://example.org/ancestor/Alice'), rdflib.term.URIRef('http

In [11]:
print(f"Dataset currently has {sum(len(g) for g in ds.graphs())} total triples.")
# Remove an arbitrary DCTERMS.created triple from the original dataset
s, p, o, c = next(ds.quads((None, DCTERMS.created, None, None)))
ds.remove((s, p, o, c))

print(f"Removed DCTerms.created triple. Dataset now has {sum(len(g) for g in ds.graphs())} total triples.")

Dataset currently has 32 total triples.


StopIteration: 

In [12]:
for ii, t in enumerate(ds_view.quads()):
    print(f"{ii:02d}: {t}")

00: (rdflib.term.URIRef('http://example.org/merged'), rdflib.term.URIRef('http://purl.org/dc/terms/created'), rdflib.term.Literal('2025-11-18T11:27:22.630641', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#dateTime')), rdflib.term.URIRef('http://example.org/merged'))
01: (rdflib.term.URIRef('http://example.org/ancestor/Eve'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://example.org/ancestor/Person'), rdflib.term.URIRef('ancestors-data.ttl'))
02: (rdflib.term.URIRef('http://example.org/ancestor/David'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://example.org/ancestor/Person'), rdflib.term.URIRef('ancestors-data.ttl'))
03: (rdflib.term.URIRef('http://example.org/ancestor/Alice'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://example.org/ancestor/Person'), rdflib.term.URIRef('ancestors-data.ttl'))
04: (rdflib.term.UR

In [15]:
# Let's check the actual graph objects more carefully
data_graph_in_ds = ds.graph("ancestors-data.ttl")
data_graph_in_view = ds_view.graph("ancestors-data.ttl")

print(f"Same graph object? {data_graph_in_ds is data_graph_in_view}")
print(f"Data graph in ds: {id(data_graph_in_ds)}")
print(f"Data graph in view: {id(data_graph_in_view)}")
print(f"Data graph in ds store: {id(data_graph_in_ds.store)}")
print(f"Data graph in view store: {id(data_graph_in_view.store)}")


Same graph object? False
Data graph in ds: 138131192911568
Data graph in view: 138131192911376
Data graph in ds store: 138131451842528
Data graph in view store: 138131192913872


In [16]:
# Option 1: Work with references to graphs from the original dataset (no copy)
print("=== OPTION 1: Graph references (no copy) ===\n")

# Instead of creating ds_view with copies, just reference the graphs
data_graph_ref = ds.graph("ancestors-data.ttl")
merged_graph_ref = ds.graph(manual_gid)

print(f"Before removal: data_graph_ref has {len(data_graph_ref)} triples")
print(f"Before removal: merged_graph_ref has {len(merged_graph_ref)} triples")

# Now remove a triple from the original dataset
from rdflib import DCTERMS as DCTERMS_ns
if list(ds.quads((None, DCTERMS_ns.created, None, None))):
    s, p, o, c = next(ds.quads((None, DCTERMS_ns.created, None, None)))
    ds.remove((s, p, o, c))
    print(f"\nRemoved one DCTERMS.created triple from ds")

print(f"After removal: data_graph_ref has {len(data_graph_ref)} triples")
print(f"After removal: merged_graph_ref has {len(merged_graph_ref)} triples")
print(f"\n✓ Changes to ds ARE reflected in the graph references!")


=== OPTION 1: Graph references (no copy) ===

Before removal: data_graph_ref has 9 triples
Before removal: merged_graph_ref has 1 triples
After removal: data_graph_ref has 9 triples
After removal: merged_graph_ref has 1 triples

✓ Changes to ds ARE reflected in the graph references!


In [17]:
# Option 2: Create a filtered "view" using quads() with graph context filter
print("\n=== OPTION 2: Filtered iteration without copying ===\n")

# Get quads only from specific graphs by filtering on context
target_contexts = {URIRef("ancestors-data.ttl"), manual_gid}

# This doesn't copy—it just filters the original dataset's quads
filtered_quads = [q for q in ds.quads() if q[3] in target_contexts]

print(f"Filtered quads count: {len(filtered_quads)}")
print("First few filtered quads:")
for q in filtered_quads[:5]:
    print(f"  {q}")
print("\nThis is a *lazy* filter—if you iterate without storing, you get live results!")



=== OPTION 2: Filtered iteration without copying ===

Filtered quads count: 10
First few filtered quads:
  (rdflib.term.URIRef('http://example.org/ancestor/Carol'), rdflib.term.URIRef('http://example.org/ancestor/parentOf'), rdflib.term.URIRef('http://example.org/ancestor/Eve'), rdflib.term.URIRef('ancestors-data.ttl'))
  (rdflib.term.URIRef('http://example.org/ancestor/Alice'), rdflib.term.URIRef('http://example.org/ancestor/parentOf'), rdflib.term.URIRef('http://example.org/ancestor/Bob'), rdflib.term.URIRef('ancestors-data.ttl'))
  (rdflib.term.URIRef('http://example.org/ancestor/Eve'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://example.org/ancestor/Person'), rdflib.term.URIRef('ancestors-data.ttl'))
  (rdflib.term.URIRef('http://example.org/ancestor/Bob'), rdflib.term.URIRef('http://example.org/ancestor/parentOf'), rdflib.term.URIRef('http://example.org/ancestor/David'), rdflib.term.URIRef('ancestors-data.ttl'))
  (rdflib.term

In [18]:
# Option 3: Create a new Dataset with the same store
print("=== OPTION 3: Shared store between Datasets ===\n")

# Create a new dataset that shares the same store as ds
ds_shared_store = Dataset(store=ds.store)

print(f"ds store id: {id(ds.store)}")
print(f"ds_shared_store store id: {id(ds_shared_store.store)}")
print(f"Same store object? {ds.store is ds_shared_store.store}")

# Now iterate through the graphs in the original ds to populate ds_shared_store
for graph in ds.graphs():
    ds_shared_store.add_graph(graph)

print(f"\nBefore removal:")
print(f"  ds: {sum(len(g) for g in ds.graphs())} triples")
print(f"  ds_shared_store: {sum(len(g) for g in ds_shared_store.graphs())} triples")

# Remove a triple from the original dataset
if list(ds.quads((None, DCTERMS_ns.created, None, None))):
    s, p, o, c = next(ds.quads((None, DCTERMS_ns.created, None, None)))
    ds.remove((s, p, o, c))
    print(f"\nRemoved one DCTERMS.created triple from ds")

print(f"\nAfter removal:")
print(f"  ds: {sum(len(g) for g in ds.graphs())} triples")
print(f"  ds_shared_store: {sum(len(g) for g in ds_shared_store.graphs())} triples")
print(f"\n✓ Changes to ds ARE reflected in ds_shared_store!")


=== OPTION 3: Shared store between Datasets ===

ds store id: 138131451842528
ds_shared_store store id: 138131451842528
Same store object? True

Before removal:
  ds: 32 triples
  ds_shared_store: 32 triples

After removal:
  ds: 32 triples
  ds_shared_store: 32 triples

✓ Changes to ds ARE reflected in ds_shared_store!


## Option 3: Explicit Shared Store

Yes! You can explicitly pass the same store to multiple Dataset instances.

## Summary: How to Get a "View" instead of a Copy

| Approach | Method | Pros | Cons |
|----------|--------|------|------|
| **Option 1: Direct graph refs** | `ds.graph(identifier)` | Always live, minimal overhead | Must manage multiple refs |
| **Option 2: Filtered iteration** | `[q for q in ds.quads() if filter]` | Live if not materialized, explicit | Need to iterate each time |
| **Option 3: Shared store** | `Dataset(store=ds.store)` | True multi-dataset view! | Must manage graph registration |
| **Option 4: Use same Dataset** | Don't create separate dataset, use `ds` directly | Simplest, true single source of truth | Less logical separation |

### Option 3 Deep Dive: Shared Store

This is the closest thing to a true "view" in RDFlib:

```python
# Create a second dataset that shares the same underlying store
ds_view = Dataset(store=ds.store)

# Add graphs you want in the "view"
for graph in ds.graphs():
    ds_view.add_graph(graph)

# Now both datasets operate on the same data
# Changes to ds are visible in ds_view and vice versa
```

**For pythinfer:** This approach is perfect for your use case—keep the full merged dataset in one Dataset object, create additional Dataset references with filtered graphs for export, and they'll all share the underlying store without copying data.


### Key Finding

The **`add_graph()` method creates a *copy*, not a view**. Each `Dataset` instance has its own store, and `add_graph()` copies the graph data into the new dataset's store. This is why changes to the original `ds` don't reflect in `ds_view`.

The RDFlib `Dataset` class doesn't have a built-in "view" mechanism — it's fundamentally a container with its own store. To work with a "view" of specific graphs, you have a few options:

1. **Use the same Dataset** - Don't create a new dataset at all; just work with the original `ds`
2. **Keep references to the original graphs** - Access graphs directly without creating a separate dataset
3. **Implement your own filtering logic** - Query the original dataset with graph filters but don't copy data


In [14]:
# Let's examine what add_graph actually does
# First, check if the graphs share the same underlying store
print("Graph objects in ds:")
for g in ds.graphs():
    print(f"  {g.identifier}: {id(g.store)}")

print("\nGraph objects in ds_view:")
for g in ds_view.graphs():
    print(f"  {g.identifier}: {id(g.store)}")

print(f"\nAre the stores the same? {ds.store is ds_view.store}")


Graph objects in ds:
  ancestors-model.ttl: 138131451842528
  ancestors-data.ttl: 138131451842528
  http://example.org/merged: 138131451842528
  urn:x-rdflib:default: 138131451842528

Graph objects in ds_view:
  http://example.org/merged: 138131192913872
  ancestors-data.ttl: 138131192913872
  urn:x-rdflib:default: 138131192913872

Are the stores the same? False


## Understanding Dataset Views vs Copies

Let's investigate how RDFlib's Dataset works and whether `add_graph()` creates a view or a copy.