[issue-558] add optional feature to generate a relationship graph

Signed-off-by: Meret Behrens <meret.behrens@tngtech.com>
spdx · Apr 6, 2023 · 91bb744 · 91bb744
1 parent 0e1df0a
commit 91bb744
Show file tree

Hide file tree

Showing 8 changed files with 295 additions and 15 deletions.
diff --git a/README.md b/README.md
@@ -38,9 +38,11 @@ This library implements SPDX parsers, convertors, validators and handlers in Pyt
 
 # Features
 
-* API to create and manipulate SPDX v2.2 and v2.3 documents.
+* API to create and manipulate SPDX v2.2 and v2.3 documents
 * Parse, convert, create and validate SPDX files
 * supported formats: Tag/Value, RDF, JSON, YAML, XML
+* visualize the structure of an SPDX document by creating an `AGraph`. Note: This is an optional feature and requires 
+additional installation of optional dependencies
 
 # Planned features
 
@@ -78,6 +80,18 @@ instead of `bin`.
 
 * For help use `pyspdxtools --help`
 
+3. **GRAPH GENERATION** (optional feature)
+
+* This feature generates a graph representing all elements in the SPDX document and their connections based on the provided
+  relationships. The graph can be rendered to a picture. Below is an example for the file `tests/spdx/data/formats/SPDXJSONExample-v2.3.spdx.json`:
+![SPDXJSONExample-v2.3.spdx.png](assets/SPDXJSONExample-v2.3.spdx.png)
+* Make sure you install the optional dependencies `networkx` and `pygraphviz`. To do so run `pip install ".[graph_generation]"`.
+* Use `pyspdxtools -i <input_file> --graph -o <output_file>` where `<output_file>` is an output file name with a valid format for `pygraphviz` (check 
+  the documentation [here](https://pygraphviz.github.io/documentation/stable/reference/agraph.html#pygraphviz.AGraph.draw)). 
+* If you are using a source distribution, try running
+  `pyspdxtools -i tests/spdx/data/formats/SPDXJSONExample-v2.3.spdx.json --graph -o SPDXJSONExample-v2.3.spdx.png` to generate 
+  a png with an overview of the structure of the example file.  
+
 ## Library usage
 1. **DATA MODEL**
   * The `src.spdx.model` package constitutes the internal SPDX v2.3 data model (v2.2 is simply a subset of this).

diff --git a/assets/SPDXJSONExample-v2.3.spdx.png b/assets/SPDXJSONExample-v2.3.spdx.png
diff --git a/pyproject.toml b/pyproject.toml
@@ -30,6 +30,7 @@ dynamic = ["version"]
 [project.optional-dependencies]
 test = ["pytest"]
 code_style = ["isort", "black", "flake8"]
+graph_generation = ["pygraphviz", "networkx"]
 
 [project.scripts]
 pyspdxtools = "spdx.clitools.pyspdxtools:main"

diff --git a/src/spdx/clitools/pyspdxtools.py b/src/spdx/clitools/pyspdxtools.py
@@ -18,6 +18,7 @@
 
 import click
 
+from spdx.graph_generation import export_graph_from_document
 from spdx.model.document import Document
 from spdx.parser.error import SPDXParsingError
 from spdx.parser.parse_anything import parse_file
@@ -32,7 +33,8 @@
 @click.option(
     "--outfile",
     "-o",
-    help="The file to write the converted document to (write a dash for output to stdout or omit for no conversion).",
+    help="The file to write the converted document to (write a dash for output to stdout or omit for no conversion). "
+    "If you add the option --graph to the command the generated graph will be written to this file.",
 )
 @click.option(
     "--version",
@@ -41,7 +43,15 @@
     default=None,
 )
 @click.option("--novalidation", is_flag=True, help="Don't validate the provided document.")
-def main(infile: str, outfile: str, version: str, novalidation: bool):
+@click.option(
+    "--graph",
+    is_flag=True,
+    default=False,
+    help="Generate a relationship graph from the input file. "
+    "The generated graph is saved to the file specified with --outfile. "
+    "Note: You need to install the optional dependencies 'networkx' and 'pygraphviz' for this feature.",
+)
+def main(infile: str, outfile: str, version: str, novalidation: bool, graph: bool):
     """
     CLI-tool for validating SPDX documents and converting between RDF, TAG-VALUE, JSON, YAML and XML formats.
     Formats are determined by the file endings.
@@ -50,9 +60,6 @@ def main(infile: str, outfile: str, version: str, novalidation: bool):
     try:
         document: Document = parse_file(infile)
 
-        if outfile == "-":
-            tagvalue_writer.write_document(document, sys.stdout)
-
         if not novalidation:
             if not version:
                 version = document.creation_info.spdx_version
@@ -72,7 +79,20 @@ def main(infile: str, outfile: str, version: str, novalidation: bool):
             else:
                 logging.info("The document is valid.")
 
-        if outfile and outfile != "-":
+        if outfile == "-":
+            tagvalue_writer.write_document(document, sys.stdout)
+
+        elif graph:
+            try:
+                export_graph_from_document(document, outfile)
+            except ImportError:
+                logging.error(
+                    "To be able to draw a relationship graph of the parsed document "
+                    "you need to install 'networkx' and 'pygraphviz'. Run 'pip install \".[graph_generation]\"'."
+                )
+                sys.exit(1)
+
+        elif outfile:
             write_file(document, outfile, validate=False)
 
     except NotImplementedError as err:

diff --git a/src/spdx/document_utils.py b/src/spdx/document_utils.py
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: 2022 spdx contributors
 #
 # SPDX-License-Identifier: Apache-2.0
-from typing import List, Union
+from typing import Dict, List, Union
 
 from spdx.model.document import Document
 from spdx.model.file import File
@@ -17,9 +17,15 @@ def get_contained_spdx_element_ids(document: Document) -> List[str]:
 
 
 def get_element_from_spdx_id(document: Document, spdx_id: str) -> Union[Package, File, Snippet, None]:
-    elements = [file_ for file_ in document.files]
-    elements.extend([package_ for package_ in document.packages])
-    elements.extend([snippet_ for snippet_ in document.snippets])
-    for element in elements:
-        if element.spdx_id == spdx_id:
-            return element
+    contained_spdx_elements: Dict[str, Union[Package, File, Snippet]] = get_contained_spdx_elements(document)
+    if spdx_id not in contained_spdx_elements:
+        return None
+    return contained_spdx_elements[spdx_id]
+
+
+def get_contained_spdx_elements(document: Document) -> Dict[str, Union[Package, File, Snippet]]:
+    contained_spdx_elements = {package.spdx_id: package for package in document.packages}
+    contained_spdx_elements.update({file.spdx_id: file for file in document.files})
+    contained_spdx_elements.update({snippet.spdx_id: snippet for snippet in document.snippets})
+
+    return contained_spdx_elements
diff --git a/src/spdx/graph_generation.py b/src/spdx/graph_generation.py
@@ -0,0 +1,76 @@
+# SPDX-FileCopyrightText: 2023 spdx contributors
+#
+# SPDX-License-Identifier: Apache-2.0
+from typing import Dict, List, Union
+
+from spdx.model.file import File
+from spdx.model.package import Package
+from spdx.model.snippet import Snippet
+
+try:
+    from networkx import DiGraph
+except ImportError:
+    DiGraph = None
+from spdx.document_utils import get_contained_spdx_elements
+from spdx.model.document import Document
+from spdx.model.relationship import Relationship
+
+
+def export_graph_from_document(document: Document, file_name: str) -> None:
+    from networkx.drawing import nx_agraph
+
+    graph = generate_relationship_graph_from_spdx(document)
+    _color_nodes(graph)
+    attributes_graph = nx_agraph.to_agraph(graph)  # convert to a pygraphviz graph
+    attributes_graph.draw(file_name, prog="dot")
+
+
+def generate_relationship_graph_from_spdx(document: Document) -> DiGraph:
+    from networkx import DiGraph
+
+    graph = DiGraph()
+    graph.add_node(document.creation_info.spdx_id, element=document.creation_info)
+
+    contained_elements: Dict[str, Union[Package, File, Snippet]] = get_contained_spdx_elements(document)
+    contained_element_nodes = [(spdx_id, {"element": element}) for spdx_id, element in contained_elements.items()]
+    graph.add_nodes_from(contained_element_nodes)
+
+    relationships_by_spdx_id: Dict[str, List[Relationship]] = dict()
+    for relationship in document.relationships:
+        relationships_by_spdx_id.setdefault(relationship.spdx_element_id, []).append(relationship)
+
+    for spdx_id, relationships in relationships_by_spdx_id.items():
+        if spdx_id not in graph.nodes():
+            # this will add any external spdx_id to the graph where we have no further information about the element,
+            # to indicate that this node represents an element we add the attribute "element"
+            graph.add_node(spdx_id, element=None)
+        for relationship in relationships:
+            relationship_node_key = relationship.spdx_element_id + "_" + relationship.relationship_type.name
+            graph.add_node(relationship_node_key, comment=relationship.comment)
+            graph.add_edge(relationship.spdx_element_id, relationship_node_key)
+            # if the related spdx element is SpdxNone or SpdxNoAssertion we need a type conversion
+            related_spdx_element_id = str(relationship.related_spdx_element_id)
+
+            if related_spdx_element_id not in graph.nodes():
+                # this will add any external spdx_id to the graph where we have no further information about
+                # the element, to indicate that this node represents an element we add the attribute "element"
+                graph.add_node(
+                    related_spdx_element_id,
+                    element=None,
+                )
+            graph.add_edge(relationship_node_key, related_spdx_element_id)
+
+    return graph
+
+
+def _color_nodes(graph: DiGraph) -> None:
+    for node in graph.nodes():
+        if "_" in node:
+            # nodes representing a RelationshipType are concatenated with the spdx_element_id,
+            # to only see the RelationshipType when rendering the graph to a picture we add
+            # a label to these nodes
+            graph.add_node(node, color="lightgreen", label=node.split("_", 1)[-1])
+        elif node == "SPDXRef-DOCUMENT":
+            graph.add_node(node, color="indianred2")
+        else:
+            graph.add_node(node, color="lightskyblue")
diff --git a/tests/spdx/test_document_utils.py b/tests/spdx/test_document_utils.py
@@ -5,7 +5,7 @@
 
 import pytest
 
-from spdx.document_utils import get_contained_spdx_element_ids, get_element_from_spdx_id
+from spdx.document_utils import get_contained_spdx_element_ids, get_contained_spdx_elements, get_element_from_spdx_id
 from tests.spdx.fixtures import document_fixture, file_fixture, package_fixture, snippet_fixture
 
 
@@ -26,3 +26,11 @@ def test_get_element_from_spdx_id(variables):
     assert get_element_from_spdx_id(document, file.spdx_id) == file
     assert get_element_from_spdx_id(document, snippet.spdx_id) == snippet
     assert get_element_from_spdx_id(document, "unknown_id") is None
+
+
+def test_get_contained_spdx_elements(variables):
+    document, package, file, snippet = variables
+    contained_elements = get_contained_spdx_elements(document)
+    assert contained_elements[package.spdx_id] == package
+    assert contained_elements[file.spdx_id] == file
+    assert contained_elements[snippet.spdx_id] == snippet
diff --git a/tests/spdx/test_graph_generation.py b/tests/spdx/test_graph_generation.py
@@ -0,0 +1,155 @@
+# SPDX-FileCopyrightText: 2023 spdx contributors
+#
+# SPDX-License-Identifier: Apache-2.0
+from pathlib import Path
+from typing import List
+from unittest import TestCase
+
+import pytest
+
+from spdx.graph_generation import generate_relationship_graph_from_spdx
+from spdx.model.document import Document
+from spdx.model.relationship import Relationship, RelationshipType
+from spdx.parser.parse_anything import parse_file
+from tests.spdx.fixtures import document_fixture, file_fixture, package_fixture
+
+try:
+    import networkx  # noqa: F401
+    import pygraphviz  # noqa: F401
+except ImportError:
+    pytest.skip("Skip this module as the tests need optional dependencies to run.", allow_module_level=True)
+
+
+@pytest.mark.parametrize(
+    "file_name, nodes_count, edges_count, relationship_node_keys",
+    [
+        (
+            "SPDXJSONExample-v2.3.spdx.json",
+            22,
+            22,
+            ["SPDXRef-Package_DYNAMIC_LINK", "SPDXRef-JenaLib_CONTAINS"],
+        ),
+        (
+            "SPDXJSONExample-v2.2.spdx.json",
+            20,
+            19,
+            ["SPDXRef-Package_DYNAMIC_LINK", "SPDXRef-JenaLib_CONTAINS"],
+        ),
+        (
+            "SPDXRdfExample-v2.3.spdx.rdf.xml",
+            22,
+            22,
+            ["SPDXRef-Package_DYNAMIC_LINK", "SPDXRef-JenaLib_CONTAINS"],
+        ),
+        (
+            "SPDXRdfExample-v2.2.spdx.rdf.xml",
+            20,
+            17,
+            ["SPDXRef-Package_DYNAMIC_LINK", "SPDXRef-JenaLib_CONTAINS"],
+        ),
+        (
+            "SPDXTagExample-v2.3.spdx",
+            22,
+            22,
+            ["SPDXRef-Package_DYNAMIC_LINK", "SPDXRef-JenaLib_CONTAINS"],
+        ),
+    ],
+)
+def test_generate_graph_from_spdx(
+    file_name: str,
+    nodes_count: int,
+    edges_count: int,
+    relationship_node_keys: List[str],
+) -> None:
+    document = parse_file(str(Path(__file__).resolve().parent.parent / "spdx" / "data" / "formats" / file_name))
+    graph = generate_relationship_graph_from_spdx(document)
+
+    assert document.creation_info.spdx_id in graph.nodes()
+    assert graph.number_of_nodes() == nodes_count
+    assert graph.number_of_edges() == edges_count
+    assert "SPDXRef-DOCUMENT_DESCRIBES" in graph.nodes()
+    for relationship_node_key in relationship_node_keys:
+        assert relationship_node_key in graph.nodes()
+
+
+def test_complete_connected_graph() -> None:
+    document = _create_minimal_document()
+
+    graph = generate_relationship_graph_from_spdx(document)
+
+    TestCase().assertCountEqual(
+        graph.nodes(),
+        [
+            "SPDXRef-DOCUMENT",
+            "SPDXRef-Package-A",
+            "SPDXRef-Package-B",
+            "SPDXRef-File",
+            "SPDXRef-DOCUMENT_DESCRIBES",
+            "SPDXRef-Package-A_CONTAINS",
+            "SPDXRef-Package-B_CONTAINS",
+        ],
+    )
+    TestCase().assertCountEqual(
+        graph.edges(),
+        [
+            ("SPDXRef-DOCUMENT", "SPDXRef-DOCUMENT_DESCRIBES"),
+            ("SPDXRef-DOCUMENT_DESCRIBES", "SPDXRef-Package-A"),
+            ("SPDXRef-DOCUMENT_DESCRIBES", "SPDXRef-Package-B"),
+            ("SPDXRef-Package-A", "SPDXRef-Package-A_CONTAINS"),
+            ("SPDXRef-Package-A_CONTAINS", "SPDXRef-File"),
+            ("SPDXRef-Package-B", "SPDXRef-Package-B_CONTAINS"),
+            ("SPDXRef-Package-B_CONTAINS", "SPDXRef-File"),
+        ],
+    )
+
+
+def test_complete_unconnected_graph() -> None:
+    document = _create_minimal_document()
+    document.packages += [package_fixture(spdx_id="SPDXRef-Package-C", name="Package without connection to document")]
+
+    graph = generate_relationship_graph_from_spdx(document)
+
+    TestCase().assertCountEqual(
+        graph.nodes(),
+        [
+            "SPDXRef-DOCUMENT",
+            "SPDXRef-Package-A",
+            "SPDXRef-Package-B",
+            "SPDXRef-File",
+            "SPDXRef-DOCUMENT_DESCRIBES",
+            "SPDXRef-Package-A_CONTAINS",
+            "SPDXRef-Package-B_CONTAINS",
+            "SPDXRef-Package-C",
+        ],
+    )
+    TestCase().assertCountEqual(
+        graph.edges(),
+        [
+            ("SPDXRef-DOCUMENT", "SPDXRef-DOCUMENT_DESCRIBES"),
+            ("SPDXRef-DOCUMENT_DESCRIBES", "SPDXRef-Package-A"),
+            ("SPDXRef-DOCUMENT_DESCRIBES", "SPDXRef-Package-B"),
+            ("SPDXRef-Package-A", "SPDXRef-Package-A_CONTAINS"),
+            ("SPDXRef-Package-A_CONTAINS", "SPDXRef-File"),
+            ("SPDXRef-Package-B", "SPDXRef-Package-B_CONTAINS"),
+            ("SPDXRef-Package-B_CONTAINS", "SPDXRef-File"),
+        ],
+    )
+
+
+def _create_minimal_document() -> Document:
+    packages = [
+        package_fixture(spdx_id="SPDXRef-Package-A", name="Package-A"),
+        package_fixture(spdx_id="SPDXRef-Package-B", name="Package-B"),
+    ]
+    files = [
+        file_fixture(spdx_id="SPDXRef-File", name="File"),
+    ]
+    relationships = [
+        Relationship("SPDXRef-DOCUMENT", RelationshipType.DESCRIBES, "SPDXRef-Package-A"),
+        Relationship("SPDXRef-DOCUMENT", RelationshipType.DESCRIBES, "SPDXRef-Package-B"),
+        Relationship("SPDXRef-Package-A", RelationshipType.CONTAINS, "SPDXRef-File"),
+        Relationship("SPDXRef-Package-B", RelationshipType.CONTAINS, "SPDXRef-File"),
+    ]
+    document = document_fixture(packages=packages, files=files, relationships=relationships, snippets=[])
+
+    return document