Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
387f63a
beginning of RDF.Isomorphic comparison algorithm
marcelotto Jun 11, 2017
3346e1e
Merge branch 'master' into isomorphic
marcelotto Nov 11, 2017
8873863
RDF.Quad can be created from triple and RDF.Triple can be created from quad
marcelotto Nov 15, 2017
44934f4
Add missing test file for RDF.Statement and fix existing doctest
marcelotto Nov 15, 2017
131a1e8
Add RDF.Statement.map/2
marcelotto Nov 15, 2017
ee08962
Add RDF.Statement functions to get the coerced components of a statement
marcelotto Nov 17, 2017
411a73f
Merge branch 'master' into isomorphic
marcelotto Jan 19, 2018
994bc9e
Merge branch 'master' into isomorphic
marcelotto Feb 28, 2018
9143abb
Add W3C RDF Dataset Normalization test cases
marcelotto Mar 1, 2018
df51cbb
Remove old RDF graph isomorphism attempt
marcelotto Mar 1, 2018
3b6a399
First implementation attempt of the W3C RDF normalization algorithm
marcelotto Mar 1, 2018
99d182d
Second, GenServer-based implementation of the W3C RDF normalization algorithm
marcelotto Mar 1, 2018
a300bc7
Merge branch 'master' into isomorphic
marcelotto Nov 23, 2020
eae1c61
Make normalization compilable again
marcelotto Nov 23, 2020
64d3b87
Apply mix formatter
marcelotto Nov 23, 2020
db4e695
Various fixes and clean up of the normalization algorithm
marcelotto Nov 25, 2020
421c274
To be continued fixing of the RDF Dataset normalization
marcelotto Dec 20, 2020
38ccf68
Merge branch 'master' into isomorphic
marcelotto Nov 14, 2022
3caab46
Update to latest RDF Data canonicalization test suite
marcelotto Nov 23, 2022
c9372aa
Add RDF.Statement.bnodes/1
marcelotto Nov 25, 2022
f95cc37
Fix bugs in URDNA2015 implementation
marcelotto Nov 27, 2022
81864b8
Reimplementation of the URDNA2015 algorithm
marcelotto Nov 28, 2022
b0503d9
Fix bugs in URDNA2015 implementation
marcelotto Nov 30, 2022
caed8cb
Clean up URDNA2015 implementation
marcelotto Dec 2, 2022
7de4cd5
Ignore "_:" prefix in RDF.BlankNode.new/1
marcelotto Dec 2, 2022
c19b2c3
Remove legacy code
marcelotto Dec 2, 2022
4bdd924
Add additional step to URDNA2015 to circumvent global state
marcelotto Dec 2, 2022
e13c6dd
Add module doc on RDF.Canonicalization
marcelotto Dec 2, 2022
c746ce5
Update CHANGELOG
marcelotto Dec 2, 2022
86f604d
Update docs
marcelotto Dec 4, 2022
d57a953
Update CHANGELOG
marcelotto Dec 4, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,18 @@ This project adheres to [Semantic Versioning](http://semver.org/) and

## Unreleased

### Added

- implementation of the [Standard RDF Dataset Canonicalization Algorithm](https://w3c-ccg.github.io/rdf-dataset-canonicalization/spec/)
- `RDF.Statement.bnodes/1`, `RDF.Triple.bnodes/1`, `RDF.Quad.bnodes/1` to get a list
of all blank nodes within a statement
- `RDF.Statement.include_value?/2`, `RDF.Triple.include_value?/2`, `RDF.Quad.include_value?/2`
to check whether a given value is a component of a statement

### Changed

- `RDF.BlankNode.new/1` ignores the prefix `"_:"` in a given blank node name

### Fixed

- the `term_to_iri/1` macro didn't work properly in all types of pattern matches
Expand Down Expand Up @@ -875,8 +887,13 @@ Elixir versions < 1.6 are no longer supported

### Added

- implementation of the standard RDF dataset normalization algorithm
- `isomorphic?` ...
- `Collectable` implementations for all `RDF.Data` structures so they can be
used as destinations of `Enum.into` and `for` comprehensions
- `RDF.Quad` can be created from triple and `RDF.Triple` can be created from quad
- `RDF.Statement.map/2` which creates a statement with mapped nodes from another statement
- `RDF.Statement` functions to get the coerced components of a statement

### Fixed

Expand Down
333 changes: 333 additions & 0 deletions lib/rdf/canonicalization/canonicalization.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,333 @@
defmodule RDF.Canonicalization do
  @moduledoc """
  An implementation of the standard RDF Dataset Canonicalization Algorithm.

  See <https://w3c-ccg.github.io/rdf-dataset-canonicalization/spec/>.
  """

  use RDF
  alias RDF.Canonicalization.{IdentifierIssuer, State}
  alias RDF.{BlankNode, Dataset, Quad, Statement, Utils}

  # Canonicalizes the given RDF data: every blank node is relabeled with a
  # deterministic identifier issued by the URDNA2015 algorithm, so that
  # isomorphic inputs yield identical results.
  def normalize(input) do
    urdna2015(input)
  end

  # The URDNA2015 pipeline. The numbered comments below ("# 3)", "# 5.2)", …)
  # refer to the corresponding algorithm steps of the spec linked in the
  # moduledoc.
  defp urdna2015(input) do
    input
    |> State.new()
    |> create_canonical_identifiers_for_single_node_hashes()
    |> create_canonical_identifiers_for_multiple_node_hashes()
    |> apply_canonicalization(input)
  end

  # 3) Initially, every blank node of the input is considered non-normalized.
  defp create_canonical_identifiers_for_single_node_hashes(state) do
    non_normalized_identifiers = Map.keys(state.bnode_to_statements)
    do_create_canonical_identifiers_for_single_node_hashes(state, non_normalized_identifiers)
  end

  # 4) `simple` records whether the previous pass issued at least one canonical
  # identifier; the recursion below repeats passes as long as it did.
  defp do_create_canonical_identifiers_for_single_node_hashes(
         state,
         non_normalized_identifiers,
         simple \\ true
       )

  # 5) One pass of the "simple" loop.
  defp do_create_canonical_identifiers_for_single_node_hashes(
         state,
         non_normalized_identifiers,
         true
       ) do
    # 5.2)
    state = State.clear_hash_to_bnodes(state)

    # 5.3) Calculate hashes for first degree nodes
    state =
      Enum.reduce(non_normalized_identifiers, state, fn identifier, state ->
        State.add_bnode_hash(state, identifier, hash_first_degree_quads(state, identifier))
      end)

    # 5.4) Create canonical replacements for hashes mapping to a single node
    {non_normalized_identifiers, state, simple} =
      state.hash_to_bnodes
      |> Enum.sort()
      |> Enum.reduce({non_normalized_identifiers, state, false}, fn
        {hash, identifier_list}, {non_normalized_identifiers, state, simple} ->
          case MapSet.to_list(identifier_list) do
            [identifier] ->
              # The hash is unique to this node, so it can receive its
              # canonical identifier immediately.
              state = State.issue_canonical_identifier(state, identifier)

              {
                List.delete(non_normalized_identifiers, identifier),
                State.delete_bnode_hash(state, hash),
                true
              }

            [] ->
              raise "unexpected empty identifier list"

            _ ->
              # Several nodes share this hash; they are resolved later by
              # create_canonical_identifiers_for_multiple_node_hashes/1.
              {non_normalized_identifiers, state, simple}
          end
      end)

    do_create_canonical_identifiers_for_single_node_hashes(
      state,
      non_normalized_identifiers,
      simple
    )
  end

  # Loop exit: the last pass issued no new canonical identifiers.
  defp do_create_canonical_identifiers_for_single_node_hashes(state, _, false), do: state

  # 6) Resolve the remaining blank nodes — those whose first-degree hash is
  # shared with other blank nodes — via n-degree hashing.
  defp create_canonical_identifiers_for_multiple_node_hashes(state) do
    state.hash_to_bnodes
    |> Enum.sort()
    |> Enum.reduce(state, fn {_hash, identifier_list}, state ->
      # 6.1-2) Create a hash_path_list for all bnodes using a temporary identifier used to create canonical replacements
      identifier_list
      |> Enum.reduce([], fn identifier, hash_path_list ->
        if IdentifierIssuer.issued?(state.canonical_issuer, identifier) do
          # Already handled during the single-node-hash phase.
          hash_path_list
        else
          {_issued_identifier, temporary_issuer} =
            "_:b"
            |> IdentifierIssuer.new()
            |> IdentifierIssuer.issue_identifier(identifier)

          [
            hash_n_degree_quads(state, identifier, temporary_issuer)
            | hash_path_list
          ]
        end
      end)
      |> Enum.sort()
      # 6.3) Create canonical replacements for nodes
      |> Enum.reduce(state, fn {_hash, issuer}, state ->
        issuer
        |> IdentifierIssuer.issued_identifiers()
        |> Enum.reduce(state, &State.issue_canonical_identifier(&2, &1))
      end)
    end)
  end

  # 7) Rewrite the original input, replacing each blank node with the
  # canonical identifier issued for it, collecting everything into a fresh
  # RDF.Dataset.
  defp apply_canonicalization(state, data) do
    Enum.reduce(data, Dataset.new(), fn statement, canonicalized_data ->
      Dataset.add(
        canonicalized_data,
        if Statement.has_bnode?(statement) do
          Statement.map(statement, fn
            {_, %BlankNode{} = bnode} ->
              state.canonical_issuer
              |> IdentifierIssuer.identifier(bnode)
              |> BlankNode.new()

            {_, node} ->
              node
          end)
        else
          statement
        end
      )
    end)
  end

  # First-degree hash of a blank node: the SHA-256 over the sorted, joined
  # N-Quads serializations of all statements mentioning the node, with the
  # reference node replaced by `_:a` and every other blank node by `_:z`.
  # see https://www.w3.org/community/reports/credentials/CG-FINAL-rdf-dataset-canonicalization-20221009/#hash-first-degree-quads
  defp hash_first_degree_quads(state, ref_bnode_id) do
    state.bnode_to_statements
    |> Map.get(ref_bnode_id, [])
    |> Enum.map(fn statement ->
      statement
      |> Quad.new()
      |> Statement.map(fn
        {_, ^ref_bnode_id} -> ~B<a>
        {_, %BlankNode{}} -> ~B<z>
        {_, node} -> node
      end)
      |> RDF.dataset()
      |> NQuads.write_string!()
    end)
    |> Enum.sort()
    |> Enum.join()
    |> hash()

    # |> IO.inspect(label: "1deg: node: #{inspect(ref_bnode_id)}, hash_first_degree_quads")
  end

  # Hashes a blank node `related` adjacent to the node being processed.
  # For the identifier it prefers an already issued canonical identifier,
  # then one issued by the given `issuer`, and falls back to the node's
  # first-degree hash. The `position` tag (and, except for the graph
  # position, the statement's predicate) is mixed into the hash input.
  # see https://www.w3.org/community/reports/credentials/CG-FINAL-rdf-dataset-canonicalization-20221009/#hash-related-blank-node
  defp hash_related_bnode(state, related, statement, issuer, position) do
    identifier =
      IdentifierIssuer.identifier(state.canonical_issuer, related) ||
        IdentifierIssuer.identifier(issuer, related) ||
        hash_first_degree_quads(state, related)

    input = to_string(position)

    input =
      if position != :g do
        "#{input}<#{Statement.predicate(statement)}>"
      else
        input
      end <> identifier

    hash(input)
    # |> IO.inspect(label: "hrel: input: #{inspect(input)}, hash_related_bnode")
  end

  # N-degree hash of a blank node: explores the hashes of all related blank
  # nodes over every permutation of each related-hash group, choosing the
  # lexicographically smallest resulting path, and returns the hash together
  # with the (possibly updated) identifier issuer.
  # see https://www.w3.org/community/reports/credentials/CG-FINAL-rdf-dataset-canonicalization-20221009/#hash-n-degree-quads
  def hash_n_degree_quads(state, identifier, issuer) do
    # IO.inspect(identifier, label: "ndeg: identifier")

    # 1-3)
    hash_to_related_bnodes =
      Enum.reduce(state.bnode_to_statements[identifier], %{}, fn statement, map ->
        Map.merge(
          map,
          hash_related_statement(state, identifier, statement, issuer),
          fn _, terms, new -> terms ++ new end
        )
      end)

    # |> IO.inspect(label: "ndeg: hash_to_related_bnodes")

    {data_to_hash, issuer} =
      hash_to_related_bnodes
      |> Enum.sort()
      |> Enum.reduce({"", issuer}, fn
        {related_hash, bnode_list}, {data_to_hash, issuer} ->
          # 5.1)
          data_to_hash = data_to_hash <> related_hash
          chosen_path = ""
          chosen_issuer = nil

          # 5.2-4)
          {chosen_path, chosen_issuer} =
            bnode_list
            |> Utils.permutations()
            |> Enum.reduce({chosen_path, chosen_issuer}, fn
              permutation, {chosen_path, chosen_issuer} ->
                # IO.inspect(permutation, label: "ndeg: perm")

                # Work on a copy, so a rejected permutation leaves the issuer
                # untouched.
                issuer_copy = IdentifierIssuer.copy(issuer)
                chosen_path_length = String.length(chosen_path)

                # 5.4.4) Build the path for this permutation, short-circuiting
                # as soon as it is provably worse than the chosen path.
                {path, recursion_list, issuer_copy} =
                  Enum.reduce_while(permutation, {"", [], issuer_copy}, fn
                    related, {path, recursion_list, issuer_copy} ->
                      {path, recursion_list, issuer_copy} =
                        if issued_identifier =
                             IdentifierIssuer.identifier(state.canonical_issuer, related) do
                          {path <> issued_identifier, recursion_list, issuer_copy}
                        else
                          if issued_identifier = IdentifierIssuer.identifier(issuer_copy, related) do
                            {path <> issued_identifier, recursion_list, issuer_copy}
                          else
                            # Not issued yet: issue a temporary identifier and
                            # remember the node for the recursion in 5.4.5.
                            {issued_identifier, issuer_copy} =
                              IdentifierIssuer.issue_identifier(issuer_copy, related)

                            {
                              path <> issued_identifier,
                              [related | recursion_list],
                              issuer_copy
                            }
                          end
                        end

                      if chosen_path_length != 0 and
                           String.length(path) >= chosen_path_length and
                           path > chosen_path do
                        {:halt, {path, recursion_list, issuer_copy}}
                      else
                        {:cont, {path, recursion_list, issuer_copy}}
                      end
                  end)

                # IO.puts("ndeg: related_hash: #{related_hash}, path: #{path}, recursion: #{inspect(recursion_list)}")

                # 5.4.5) Recurse into the nodes that only got temporary
                # identifiers above (in issuance order).
                {issuer_copy, path} =
                  recursion_list
                  |> Enum.reverse()
                  |> Enum.reduce_while({issuer_copy, path}, fn related, {issuer_copy, path} ->
                    # Note: The following steps seem to be the only steps in the whole algorithm
                    # which really rely on global state.

                    # 5.4.5.1)
                    {result_hash, result_issuer} =
                      hash_n_degree_quads(state, related, issuer_copy)

                    # This step was added to circumvent the need for global state.
                    # It's unclear whether it is actually required, since all test
                    # of the test suite pass without it.
                    # see https://github.com/w3c-ccg/rdf-dataset-canonicalization/issues/31
                    result_issuer =
                      if result_issuer.id == issuer_copy.id do
                        {_, issuer} = IdentifierIssuer.issue_identifier(result_issuer, related)
                        issuer
                      else
                        result_issuer
                      end

                    # 5.4.5.2)
                    {issued_identifier, _issuer_copy} =
                      IdentifierIssuer.issue_identifier(issuer_copy, related)

                    path = path <> issued_identifier <> "<#{result_hash}>"

                    # Abandon this permutation as soon as it cannot beat the
                    # currently chosen path.
                    if chosen_path_length != 0 and
                         String.length(path) >= chosen_path_length and
                         path > chosen_path do
                      {:halt, {result_issuer, path}}
                    else
                      {:cont, {result_issuer, path}}
                    end
                  end)

                # Keep the lexicographically smallest path (and its issuer).
                if chosen_path_length == 0 or path < chosen_path do
                  {path, issuer_copy}
                else
                  {chosen_path, chosen_issuer}
                end
            end)

          # 5.5)
          {data_to_hash <> chosen_path, chosen_issuer}
      end)

    # IO.puts("ndeg: datatohash: #{data_to_hash}, hash: #{hash(data_to_hash)}")

    {hash(data_to_hash), issuer}
  end

  # 4.8.2.3.1) Group adjacent bnodes by hash
  # Maps each blank node adjacent to `identifier` in `statement` (as subject,
  # object or graph name; `identifier` itself is skipped) to its related-bnode
  # hash, grouping distinct nodes under the same hash.
  defp hash_related_statement(state, identifier, statement, issuer) do
    [
      s: Statement.subject(statement),
      o: Statement.object(statement),
      g: Statement.graph_name(statement)
    ]
    |> Enum.reduce(%{}, fn
      {_, ^identifier}, map ->
        map

      {pos, %BlankNode{} = term}, map ->
        hash = hash_related_bnode(state, term, statement, issuer, pos)

        Map.update(map, hash, [term], fn terms ->
          if term in terms, do: terms, else: [term | terms]
        end)

      _, map ->
        map
    end)
  end

  # The hash function used throughout URDNA2015: SHA-256, lower-case hex.
  defp hash(data) do
    :crypto.hash(:sha256, data) |> Base.encode16(case: :lower)
  end
end
Loading