From 539901b86ed8ce1ad1e05812ee26407490c0e1f0 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Mon, 9 Jun 2025 13:48:23 +0100 Subject: [PATCH 01/11] Add uv.lock to .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 8898f647..95d38542 100644 --- a/.gitignore +++ b/.gitignore @@ -79,3 +79,6 @@ rust/debug rust/target rust/flamegraph.svg target + +# UV +uv.lock From f88be04b853a6c24301fcab32162d67357196e25 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Tue, 10 Jun 2025 10:29:57 +0100 Subject: [PATCH 02/11] Define BasicFileSystem protocol --- src/grimp/application/ports/filesystem.py | 26 ++++++++++++++++++++ src/grimp/application/ports/importscanner.py | 4 +-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/grimp/application/ports/filesystem.py b/src/grimp/application/ports/filesystem.py index b11125be..d2966255 100644 --- a/src/grimp/application/ports/filesystem.py +++ b/src/grimp/application/ports/filesystem.py @@ -1,5 +1,6 @@ import abc from typing import Iterator, List, Tuple +from typing import Protocol class AbstractFileSystem(abc.ABC): @@ -80,3 +81,28 @@ def write(self, file_name: str, contents: str) -> None: Write the contents to a file. """ raise NotImplementedError + + +class BasicFileSystem(Protocol): + """ + A more limited file system, used by the ImportScanner. + + Having two different file system APIs is an interim approach, allowing us to + implement BasicFileSystem in Rust, for use with the ImportScanner, without needing + to implement the full range of methods used by other parts of Grimp. + + Eventually we'll move the full implementation to Rust, and have a single + FileSystem interface. + """ + + def join(self, *components: str) -> str: + ... + + def split(self, file_name: str) -> Tuple[str, str]: + ... + + def read(self, file_name: str) -> str: + ... + + def exists(self, file_name: str) -> bool: + ... 
diff --git a/src/grimp/application/ports/importscanner.py b/src/grimp/application/ports/importscanner.py index 34da5be9..26df8aa3 100644 --- a/src/grimp/application/ports/importscanner.py +++ b/src/grimp/application/ports/importscanner.py @@ -1,7 +1,7 @@ import abc from typing import Set -from grimp.application.ports.filesystem import AbstractFileSystem +from grimp.application.ports.filesystem import BasicFileSystem from grimp.application.ports.modulefinder import FoundPackage from grimp.domain.valueobjects import DirectImport, Module @@ -13,7 +13,7 @@ class AbstractImportScanner(abc.ABC): def __init__( self, - file_system: AbstractFileSystem, + file_system: BasicFileSystem, found_packages: Set[FoundPackage], include_external_packages: bool = False, ) -> None: From b79f5ccc63e72171c695bbb537842f73700a48f0 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Tue, 10 Jun 2025 10:58:52 +0100 Subject: [PATCH 03/11] Implement FakeBasicFileSystem --- rust/Cargo.lock | 22 +++ rust/Cargo.toml | 3 + rust/src/filesystem.rs | 175 ++++++++++++++++++++++ rust/src/lib.rs | 3 + tests/unit/adaptors/test_filesystem.py | 196 ++++++++++++++++++------- 5 files changed, 348 insertions(+), 51 deletions(-) create mode 100644 rust/src/filesystem.rs diff --git a/rust/Cargo.lock b/rust/Cargo.lock index f4ae21c1..9a2b2796 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -21,11 +21,14 @@ dependencies = [ "ruff_python_parser", "ruff_source_file", "rustc-hash 2.1.1", + "serde", "serde_json", + "serde_yaml", "slotmap", "string-interner", "tap", "thiserror", + "unindent", ] [[package]] @@ -666,6 +669,19 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_yaml" +version = "0.9.34+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" +dependencies = [ + "indexmap 2.9.0", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + [[package]] name = "siphasher" version = "1.0.1" @@ -810,6 
+826,12 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" +[[package]] +name = "unsafe-libyaml" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" + [[package]] name = "version_check" version = "0.9.5" diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 8e5acab2..845daeb7 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -25,6 +25,9 @@ const_format = "0.2.34" ruff_python_parser = { git = "https://github.com/astral-sh/ruff.git", tag = "v0.4.10" } ruff_python_ast = { git = "https://github.com/astral-sh/ruff.git", tag = "v0.4.10" } ruff_source_file = { git = "https://github.com/astral-sh/ruff.git", tag = "v0.4.10" } +serde = { version = "1.0", features = ["derive"] } +serde_yaml = "0.9" +unindent = "0.2.4" [dependencies.pyo3] version = "0.24.1" diff --git a/rust/src/filesystem.rs b/rust/src/filesystem.rs new file mode 100644 index 00000000..9203f4f8 --- /dev/null +++ b/rust/src/filesystem.rs @@ -0,0 +1,175 @@ +use pyo3::exceptions::PyFileNotFoundError; +use pyo3::prelude::*; +use std::collections::HashMap; +type FileSystemContents = HashMap; +use unindent::unindent; + +#[pyclass] +pub struct FakeBasicFileSystem { + contents: FileSystemContents, +} + +#[pymethods] +impl FakeBasicFileSystem { + #[pyo3(signature = (contents=None, content_map=None))] + #[new] + fn new(contents: Option<&str>, content_map: Option>) -> PyResult { + let mut parsed_contents = match contents { + Some(contents) => parse_indented_fs_string(contents), + None => HashMap::new(), + }; + if let Some(content_map) = content_map { + let unindented_map: HashMap = content_map + .into_iter() + .map(|(key, val)| (key, unindent(&val).trim().to_string())) + .collect(); + parsed_contents.extend(unindented_map); + }; + Ok(FakeBasicFileSystem { + contents: parsed_contents, + }) + } + 
+ #[getter] + fn sep(&self) -> String { + "/".to_string() + } + + #[pyo3(signature = (*components))] + fn join(&self, components: Vec) -> String { + let sep = self.sep(); // Get the separator from the getter method + components + .into_iter() + .map(|c| c.trim_end_matches(&sep).to_string()) + .collect::>() + .join(&sep) + } + + fn split(&self, file_name: &str) -> (String, String) { + let components: Vec<&str> = file_name.split('/').collect(); + + if components.is_empty() { + return ("".to_string(), "".to_string()); + } + + let tail = components.last().unwrap_or(&""); // Last component, or empty if components is empty (shouldn't happen from split) + + let head_components = &components[..components.len() - 1]; // All components except the last + + let head = if head_components.is_empty() { + // Case for single component paths like "filename.txt" or empty string "" + "".to_string() + } else if file_name.starts_with('/') + && head_components.len() == 1 + && head_components[0].is_empty() + { + // Special handling for paths starting with '/', e.g., "/" or "/filename.txt" + // If components were ["", ""], head_components is [""] -> should be "/" + // If components were ["", "file.txt"], head_components is [""] -> should be "/" + "/".to_string() + } else { + // Default joining for multiple components + head_components.join("/") + }; + + (head, tail.to_string()) + } + + /// Checks if a file or directory exists within the file system. + fn exists(&self, file_name: &str) -> bool { + self.contents.contains_key(file_name) + } + + fn read(&self, file_name: &str) -> PyResult { + match self.contents.get(file_name) { + Some(file_name) => Ok(file_name.clone()), + None => Err(PyFileNotFoundError::new_err("")), + } + } +} + +/// Parses an indented string representing a file system structure +/// into a HashMap where keys are full file paths. 
+/// +pub fn parse_indented_fs_string(input: &str) -> HashMap { + let mut file_paths_map: HashMap = HashMap::new(); + let mut path_stack: Vec = Vec::new(); // Stores current directory path components + let mut first_line = true; // Flag to handle the very first path component + + // Normalize newlines and split into lines + let buffer = input.replace("\r\n", "\n"); + let lines: Vec<&str> = buffer.split('\n').collect(); + + for line_raw in lines.clone() { + let line = line_raw.trim_end(); // Remove trailing whitespace + if line.is_empty() { + continue; // Skip empty lines + } + + let current_indent = line.chars().take_while(|&c| c.is_whitespace()).count(); + let trimmed_line = line.trim_start(); + + // Assuming 4 spaces per indentation level for calculating depth + // Adjust this if your indentation standard is different (e.g., 2 spaces, tabs) + let current_depth = current_indent / 4; + + if first_line { + // The first non-empty line sets the base path. + // It might be absolute (/a/b/) or relative (a/b/). + let root_component = trimmed_line.trim_end_matches('/').to_string(); + path_stack.push(root_component); + first_line = false; + } else { + // Adjust the path_stack based on indentation level + // Pop elements from the stack until we reach the correct parent directory depth + while path_stack.len() > current_depth { + path_stack.pop(); + } + + // If the current line is a file, append it to the path for inserting into map, + // then pop it off so that subsequent siblings are correctly handled. + // If it's a directory, append it and it stays on the stack for its children. + let component_to_add = trimmed_line.trim_end_matches('/').to_string(); + if !component_to_add.is_empty() { + // Avoid pushing empty strings due to lines like just "/" + path_stack.push(component_to_add); + } + } + + // Construct the full path + // Join components on the stack. If the first component started with '/', + // ensure the final path also starts with '/'. 
+ let full_path = if !path_stack.is_empty() { + let mut joined = path_stack.join("/"); + // If the original root started with a slash, ensure the final path does too. + // But be careful not to double-slash if a component is e.g. "/root" + if lines[0].trim().starts_with('/') && !joined.starts_with('/') { + joined = format!("/{}", joined); + } + joined + } else { + "".to_string() + }; + + // If it's a file (doesn't end with '/'), add it to the map + // A file is not a directory, so its name should be removed from the stack after processing + // so that sibling items are at the correct level. + if !trimmed_line.ends_with('/') { + file_paths_map.insert(full_path, String::new()); // Value can be empty or actual content + if !path_stack.is_empty() { + path_stack.pop(); // Pop the file name off the stack + } + } + } + + // Edge case: If the very first line was a file and it ended up on the stack, it needs to be processed. + // This handles single-file inputs like "myfile.txt" + if !path_stack.is_empty() + && !path_stack.last().unwrap().ends_with('/') + && !file_paths_map.contains_key(&path_stack.join("/")) + { + file_paths_map.insert(path_stack.join("/"), String::new()); + } + + file_paths_map +} diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 18327cd2..d18c9df6 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -3,6 +3,7 @@ pub mod exceptions; pub mod graph; pub mod import_parsing; pub mod module_expressions; +mod filesystem; use crate::errors::{GrimpError, GrimpResult}; use crate::exceptions::{InvalidModuleExpression, ModuleNotPresent, NoSuchContainer, ParseError}; @@ -18,11 +19,13 @@ use pyo3::types::{IntoPyDict, PyDict, PyFrozenSet, PyList, PySet, PyString, PyTu use rayon::prelude::*; use rustc_hash::FxHashSet; use std::collections::HashSet; +use crate::filesystem::FakeBasicFileSystem; #[pymodule] fn _rustgrimp(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(parse_imported_objects_from_code))?; m.add_class::()?; + 
m.add_class::()?; m.add("ModuleNotPresent", py.get_type::())?; m.add("NoSuchContainer", py.get_type::())?; m.add( diff --git a/tests/unit/adaptors/test_filesystem.py b/tests/unit/adaptors/test_filesystem.py index a59a173c..899d10f6 100644 --- a/tests/unit/adaptors/test_filesystem.py +++ b/tests/unit/adaptors/test_filesystem.py @@ -1,13 +1,148 @@ from copy import copy - +from typing import Type import pytest # type: ignore - +from grimp.application.ports.filesystem import BasicFileSystem from tests.adaptors.filesystem import FakeFileSystem +from grimp import _rustgrimp as rust # type: ignore[attr-defined] + + +class _Base: + """ + Tests for methods that AbstractFileSystem and BasicFileSystem share. + """ + + file_system_cls: Type[BasicFileSystem] + + @pytest.mark.parametrize("path", ("/path/to", "/path/to/")) + def test_join(self, path): + file_system = self.file_system_cls() + assert "/path/to/mypackage/file.py" == file_system.join(path, "mypackage", "file.py") + + def test_split(self): + file_system = self.file_system_cls() + assert ("/path/to/mypackage", "file.py") == file_system.split("/path/to/mypackage/file.py") + + @pytest.mark.parametrize( + "file_name, expected", + [ + ("/path/to/mypackage", False), + ("/path/to/mypackage/", False), + ("/path/to/mypackage/readme.txt", True), + ("/path/to/mypackage/foo/one.txt", True), + ("/path/to/mypackage/foo/two/green.txt", True), + ("/path/to/nonexistent.txt", False), + ("/path/to/mypackage/purple.txt", False), + ], + ) + def test_exists_content_only(self, file_name, expected): + file_system = self.file_system_cls( + contents=""" + /path/to/mypackage/ + readme.txt + foo/ + one.txt + two/ + green.txt + """ + ) + + assert file_system.exists(file_name) == expected + + @pytest.mark.parametrize( + "file_name, expected", + [ + ("/path/to/", False), + ("/path/to/file.txt", True), + ("/path/to/a/deeper/file.txt", True), + ("/path/to/nonexistent.txt", False), + ], + ) + def test_exists_content_map_only(self, file_name, 
expected): + file_system = self.file_system_cls( + content_map={ + "/path/to/file.txt": "hello", + "/path/to/a/deeper/file.txt": "hello", + } + ) + + assert file_system.exists(file_name) == expected + + @pytest.mark.parametrize( + "file_name, expected", + [ + ("/path/to/file.txt", True), + ("/path/to/mypackage/foo/two/green.txt", True), + ("/path/to/nonexistent.txt", False), + ], + ) + def test_exists_content_map_and_content(self, file_name, expected): + file_system = self.file_system_cls( + contents=""" + /path/to/mypackage/ + readme.txt + foo/ + one.txt + two/ + green.txt + """, + content_map={ + "/path/to/file.txt": "hello", + "/path/to/a/deeper/file.txt": "hello", + }, + ) + + assert file_system.exists(file_name) == expected + + @pytest.mark.parametrize( + "file_name, expected_contents", + ( + ("/path/to/mypackage/readme.txt", "Lorem"), + ("/path/to/mypackage/foo/one.txt", "Ipsum"), + # Listed in contents, but not in content_map. + ("/path/to/mypackage/foo/two/green.txt", ""), + # Listed in content_map, but not in contents. + ("/path/to/mypackage/foo/two/blue.txt", "Dolor sic"), + ( + "/path/to/mypackage/indented.txt", + "This is indented text.\n We should dedent it.", + ), + ("/path/to/mypackage/nonexistent", FileNotFoundError), + ("/path/to/mypackage/foo/three/nonexistent", FileNotFoundError), + # TODO - should we raise an exception if we attempt to read a directory? + ), + ) + def test_read(self, file_name, expected_contents): + file_system = self.file_system_cls( + contents=""" + /path/to/mypackage/ + readme.txt + foo/ + one.txt + two/ + green.txt + """, + content_map={ + "/path/to/mypackage/readme.txt": "Lorem", + "/path/to/mypackage/foo/one.txt": "Ipsum", + "/path/to/mypackage/foo/two/blue.txt": "Dolor sic", + "/path/to/mypackage/indented.txt": """ + This is indented text. + We should dedent it. 
+ """, + }, + ) + if isinstance(expected_contents, type) and issubclass(expected_contents, Exception): + with pytest.raises(expected_contents): + file_system.read(file_name) + else: + assert file_system.read(file_name) == expected_contents + +class TestFakeFileSystem(_Base): + file_system_cls = FakeFileSystem -class TestFakeFileSystem: def test_walk(self): - file_system = FakeFileSystem( + file_system = self.file_system_cls( """ /path/to/mypackage/ __init__.py @@ -29,7 +164,7 @@ def test_walk(self): ] == list(file_system.walk("/path/to/mypackage")) def test_empty_if_directory_does_not_exist(self): - file_system = FakeFileSystem( + file_system = self.file_system_cls( """ /path/to/mypackage/ __init__.py @@ -38,18 +173,9 @@ def test_empty_if_directory_does_not_exist(self): assert [] == list(file_system.walk("/path/to/nonexistent/package")) def test_dirname(self): - file_system = FakeFileSystem() + file_system = self.file_system_cls() assert "/path/to" == file_system.dirname("/path/to/file.txt") - @pytest.mark.parametrize("path", ("/path/to", "/path/to/")) - def test_join(self, path): - file_system = FakeFileSystem() - assert "/path/to/mypackage/file.py" == file_system.join(path, "mypackage", "file.py") - - def test_split(self): - file_system = FakeFileSystem() - assert ("/path/to/mypackage", "file.py") == file_system.split("/path/to/mypackage/file.py") - def test_dirnames_can_be_modified_in_place(self): """ From the os.walk docs: @@ -59,7 +185,7 @@ def test_dirnames_can_be_modified_in_place(self): of visiting, or even to inform walk() about directories the caller creates or renames before it resumes walk() again. 
""" - file_system = FakeFileSystem( + file_system = self.file_system_cls( """ /path/to/mypackage/ foo/ @@ -90,38 +216,6 @@ def test_dirnames_can_be_modified_in_place(self): assert expected_tuples == actual_tuples - @pytest.mark.parametrize( - "file_name, expected_contents", - ( - ("/path/to/mypackage/readme.txt", "Lorem"), - ("/path/to/mypackage/foo/one.txt", "Ipsum"), - # Listed in contents, but not in content_map. - ("/path/to/mypackage/foo/two/green.txt", ""), - # Listed in content_map, but not in contents. - ("/path/to/mypackage/foo/two/blue.txt", "Dolor sic"), - ("/path/to/mypackage/nonexistent", FileNotFoundError), - ("/path/to/mypackage/foo/three/nonexistent", FileNotFoundError), - # TODO - should we raise an exception if we attempt to read a directory? - ), - ) - def test_read(self, file_name, expected_contents): - file_system = FakeFileSystem( - contents=""" - /path/to/mypackage/ - readme.txt - foo/ - one.txt - two/ - green.txt - """, - content_map={ - "/path/to/mypackage/readme.txt": "Lorem", - "/path/to/mypackage/foo/one.txt": "Ipsum", - "/path/to/mypackage/foo/two/blue.txt": "Dolor sic", - }, - ) - if isinstance(expected_contents, type) and issubclass(expected_contents, Exception): - with pytest.raises(expected_contents): - file_system.read(file_name) - else: - assert expected_contents == file_system.read(file_name) + +class TestFakeBasicFileSystem(_Base): + file_system_cls = rust.FakeBasicFileSystem From 7dd7b21f6de009fb7bd4554f2fb4c809d1089217 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Tue, 10 Jun 2025 12:47:08 +0100 Subject: [PATCH 04/11] Use FakeBasicFileSystem in unit test --- tests/unit/adaptors/test_importscanner.py | 25 ++++++++++++----------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/tests/unit/adaptors/test_importscanner.py b/tests/unit/adaptors/test_importscanner.py index 54d3cccf..e8315960 100644 --- a/tests/unit/adaptors/test_importscanner.py +++ b/tests/unit/adaptors/test_importscanner.py @@ -6,7 +6,8 @@ from 
grimp.adaptors.importscanner import ImportScanner from grimp.application.ports.modulefinder import FoundPackage, ModuleFile from grimp.domain.valueobjects import DirectImport, Module -from tests.adaptors.filesystem import FakeFileSystem + +from grimp import _rustgrimp as rust # type: ignore[attr-defined] @pytest.mark.parametrize( @@ -50,7 +51,7 @@ ) def test_absolute_imports(include_external_packages, expected_result): all_modules = {Module("foo.one"), Module("foo.two")} - file_system = FakeFileSystem( + file_system = rust.FakeBasicFileSystem( content_map={ "/path/to/foo/one.py": """ import foo.two @@ -152,7 +153,7 @@ def test_single_namespace_package_portion(): MODULE_ALPHA, } - file_system = FakeFileSystem( + file_system = rust.FakeBasicFileSystem( content_map={ "/path/to/namespace/foo/one.py": """ import namespace.foo.two @@ -235,7 +236,7 @@ def test_import_of_portion_not_in_graph(include_external_packages): MODULE_FOO_ONE, MODULE_FOO_BLUE, } - file_system = FakeFileSystem( + file_system = rust.FakeBasicFileSystem( content_map={ "/path/to/namespace/foo/one.py": """ import namespace.bar.one.orange @@ -415,7 +416,7 @@ def test_absolute_from_imports(include_external_packages, expected_result): Module("foo.two.yellow"), Module("foo.three"), } - file_system = FakeFileSystem( + file_system = rust.FakeBasicFileSystem( contents=""" /path/to/foo/ __init__.py @@ -467,7 +468,7 @@ def test_relative_from_imports(): Module("foo.two.yellow"), Module("foo.three"), } - file_system = FakeFileSystem( + file_system = rust.FakeBasicFileSystem( contents=""" /path/to/foo/ __init__.py @@ -537,7 +538,7 @@ def test_trims_to_known_modules(import_source): Module("foo.two"), Module("foo.two.yellow"), } - file_system = FakeFileSystem( + file_system = rust.FakeBasicFileSystem( contents=""" /path/to/foo/ __init__.py @@ -580,7 +581,7 @@ def test_trims_to_known_modules_within_init_file(): Module("foo.one.blue"), Module("foo.one.blue.alpha"), } - file_system = FakeFileSystem( + file_system = 
rust.FakeBasicFileSystem( contents=""" /path/to/foo/ __init__.py @@ -633,7 +634,7 @@ def test_trims_to_known_modules_within_init_file(): def test_trims_whitespace_from_start_of_line_contents(): all_modules = {Module("foo"), Module("foo.one"), Module("foo.two")} - file_system = FakeFileSystem( + file_system = rust.FakeBasicFileSystem( contents=""" /path/to/foo/ __init__.py @@ -706,7 +707,7 @@ def my_function(): def test_external_package_imports_for_namespace_packages(statement, expected_module_name): module_to_scan = Module("namespace.foo.blue.alpha") - file_system = FakeFileSystem( + file_system = rust.FakeBasicFileSystem( content_map={ "/path/to/namespace/foo/blue/alpha.py": statement, } @@ -751,7 +752,7 @@ def test_external_package_imports_for_namespace_packages(statement, expected_mod def test_scans_multiple_packages(statement): foo_modules = {Module("foo"), Module("foo.one"), Module("foo.two")} bar_modules = {Module("bar"), Module("bar.green"), Module("bar.blue")} - file_system = FakeFileSystem( + file_system = rust.FakeBasicFileSystem( content_map={ "/path/to/foo/one.py": f""" import foo.two @@ -819,7 +820,7 @@ def test_exclude_type_checking_imports( Module("foo.four"), Module("foo.five"), } - file_system = FakeFileSystem( + file_system = rust.FakeBasicFileSystem( content_map={ "/path/to/foo/one.py": f""" import foo.two From b51ccc8e54d96e9a82a589b07a31d74242b83316 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Tue, 10 Jun 2025 14:29:16 +0100 Subject: [PATCH 05/11] Pass BasicFileSystem to ImportScanner from use case --- src/grimp/adaptors/filesystem.py | 5 ++++- src/grimp/application/ports/filesystem.py | 8 ++++++++ src/grimp/application/usecases.py | 7 ++++--- tests/adaptors/filesystem.py | 14 +++++++++++++- 4 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/grimp/adaptors/filesystem.py b/src/grimp/adaptors/filesystem.py index 9f637982..b69abe19 100644 --- a/src/grimp/adaptors/filesystem.py +++ b/src/grimp/adaptors/filesystem.py @@ -2,7 
+2,7 @@ import tokenize from typing import Iterator, List, Tuple -from grimp.application.ports.filesystem import AbstractFileSystem +from grimp.application.ports.filesystem import AbstractFileSystem, BasicFileSystem class FileSystem(AbstractFileSystem): @@ -44,3 +44,6 @@ def write(self, file_name: str, contents: str) -> None: os.makedirs(dirname) with open(file_name, "w") as file: print(contents, file=file) + + def convert_to_basic(self) -> BasicFileSystem: + return self # FileSystem already corresponds to the BasicFileSystem protocol. diff --git a/src/grimp/application/ports/filesystem.py b/src/grimp/application/ports/filesystem.py index d2966255..bfe129a2 100644 --- a/src/grimp/application/ports/filesystem.py +++ b/src/grimp/application/ports/filesystem.py @@ -1,3 +1,4 @@ +from __future__ import annotations import abc from typing import Iterator, List, Tuple from typing import Protocol @@ -82,6 +83,13 @@ def write(self, file_name: str, contents: str) -> None: """ raise NotImplementedError + @abc.abstractmethod + def convert_to_basic(self) -> BasicFileSystem: + """ + Convert this file system to a BasicFileSystem. 
+ """ + raise NotImplementedError + class BasicFileSystem(Protocol): """ diff --git a/src/grimp/application/usecases.py b/src/grimp/application/usecases.py index eee3286c..f403e344 100644 --- a/src/grimp/application/usecases.py +++ b/src/grimp/application/usecases.py @@ -7,6 +7,7 @@ import joblib # type: ignore +from .ports.filesystem import BasicFileSystem from ..application.ports import caching from ..application.ports.filesystem import AbstractFileSystem from ..application.ports.graph import ImportGraph @@ -145,7 +146,7 @@ def _scan_packages( imports_by_module_file.update( _scan_imports( remaining_module_files_to_scan, - file_system=file_system, + file_system=file_system.convert_to_basic(), found_packages=found_packages, include_external_packages=include_external_packages, exclude_type_checking_imports=exclude_type_checking_imports, @@ -213,7 +214,7 @@ def _read_imports_from_cache( def _scan_imports( module_files: Collection[ModuleFile], *, - file_system: AbstractFileSystem, + file_system: BasicFileSystem, found_packages: Set[FoundPackage], include_external_packages: bool, exclude_type_checking_imports: bool, @@ -258,7 +259,7 @@ def _decide_number_of_processes(number_of_module_files: int) -> int: def _scan_chunks( chunks: Collection[Collection[ModuleFile]], - file_system: AbstractFileSystem, + file_system: BasicFileSystem, found_packages: Set[FoundPackage], include_external_packages: bool, exclude_type_checking_imports: bool, diff --git a/tests/adaptors/filesystem.py b/tests/adaptors/filesystem.py index 6eac8cca..74854da0 100644 --- a/tests/adaptors/filesystem.py +++ b/tests/adaptors/filesystem.py @@ -1,8 +1,10 @@ from typing import Any, Dict, Generator, List, Optional, Tuple + import yaml -from grimp.application.ports.filesystem import AbstractFileSystem +from grimp.application.ports.filesystem import AbstractFileSystem, BasicFileSystem +from grimp import _rustgrimp as rust # type: ignore[attr-defined] DEFAULT_MTIME = 10000.0 @@ -43,6 +45,7 @@ def __init__( 
i.e. last modified times. """ self.contents = self._parse_contents(contents) + self._raw_contents = contents self.content_map = content_map if content_map else {} self.mtime_map: Dict[str, float] = mtime_map if mtime_map else {} @@ -186,3 +189,12 @@ def get_mtime(self, file_name: str) -> float: def write(self, file_name: str, contents: str) -> None: self.content_map[file_name] = contents self.mtime_map[file_name] = DEFAULT_MTIME + + def convert_to_basic(self) -> BasicFileSystem: + """ + Convert this file system to a BasicFileSystem. + """ + return rust.FakeBasicFileSystem( + contents=self._raw_contents, + content_map=self.content_map, + ) From 781fcabc6bec8da14fe003d095ea9a1d9b282b00 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Tue, 10 Jun 2025 15:22:31 +0100 Subject: [PATCH 06/11] Implement RealBasicFileSystem --- rust/src/filesystem.rs | 58 +++++++++++++++++++++++++++++++++++++++++- rust/src/lib.rs | 3 ++- 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/rust/src/filesystem.rs b/rust/src/filesystem.rs index 9203f4f8..fd15970e 100644 --- a/rust/src/filesystem.rs +++ b/rust/src/filesystem.rs @@ -1,8 +1,64 @@ -use pyo3::exceptions::PyFileNotFoundError; +use pyo3::exceptions::{PyFileNotFoundError,PyRuntimeError}; use pyo3::prelude::*; use std::collections::HashMap; type FileSystemContents = HashMap; use unindent::unindent; +use std::path::{Path,PathBuf}; +use std::fs; + +#[pyclass] +pub struct RealBasicFileSystem {} + +#[pymethods] +impl RealBasicFileSystem { + #[new] + fn new() -> Self { + RealBasicFileSystem {} + } + + #[getter] + fn sep(&self) -> String { + std::path::MAIN_SEPARATOR.to_string() + } + + #[pyo3(signature = (*components))] + fn join(&self, components: Vec) -> String { + let mut path = PathBuf::new(); + for component in components { + path.push(component); + } + path.to_str().unwrap().to_string() + } + + fn split(&self, file_name: &str) -> (String, String) { + let path = Path::new(file_name); + + // Get the "tail" part (the file 
name or last directory) + let tail = match path.file_name() { + Some(name) => PathBuf::from(name), + None => PathBuf::new(), // If there's no file name (e.g., path is a root), return empty + }; + + // Get the "head" part (the parent directory) + let head = match path.parent() { + Some(parent_path) => parent_path.to_path_buf(), + None => PathBuf::new(), // If there's no parent (e.g., just a filename), return empty + }; + + (head.to_str().unwrap().to_string(), tail.to_str().unwrap().to_string()) + } + + fn exists(&self, file_name: &str) -> bool { + Path::new(file_name).is_file() + } + + fn read(&self, file_name: &str) -> PyResult { + // TODO: is this good enough for handling non-UTF8 encodings? + fs::read_to_string(file_name).map_err( + |_| PyRuntimeError::new_err(format!("Could not read {}", file_name)) + ) + } +} #[pyclass] pub struct FakeBasicFileSystem { diff --git a/rust/src/lib.rs b/rust/src/lib.rs index d18c9df6..9016bc63 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -19,12 +19,13 @@ use pyo3::types::{IntoPyDict, PyDict, PyFrozenSet, PyList, PySet, PyString, PyTu use rayon::prelude::*; use rustc_hash::FxHashSet; use std::collections::HashSet; -use crate::filesystem::FakeBasicFileSystem; +use crate::filesystem::{RealBasicFileSystem,FakeBasicFileSystem}; #[pymodule] fn _rustgrimp(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(parse_imported_objects_from_code))?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add("ModuleNotPresent", py.get_type::())?; m.add("NoSuchContainer", py.get_type::())?; From 99729e621f729325f9ad32fe078bed91c8021b54 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Tue, 10 Jun 2025 15:23:28 +0100 Subject: [PATCH 07/11] Use RealBasicFileSystem in prod This necessitates us turning off multiprocessing, but we'll switch to multithreading in a subsequent commit. 
--- src/grimp/adaptors/filesystem.py | 3 +- src/grimp/application/usecases.py | 15 ++--- tests/unit/application/test_usecases.py | 77 +------------------------ 3 files changed, 7 insertions(+), 88 deletions(-) diff --git a/src/grimp/adaptors/filesystem.py b/src/grimp/adaptors/filesystem.py index b69abe19..83b54436 100644 --- a/src/grimp/adaptors/filesystem.py +++ b/src/grimp/adaptors/filesystem.py @@ -3,6 +3,7 @@ from typing import Iterator, List, Tuple from grimp.application.ports.filesystem import AbstractFileSystem, BasicFileSystem +from grimp import _rustgrimp as rust # type: ignore[attr-defined] class FileSystem(AbstractFileSystem): @@ -46,4 +47,4 @@ def write(self, file_name: str, contents: str) -> None: print(contents, file=file) def convert_to_basic(self) -> BasicFileSystem: - return self # FileSystem already corresponds to the BasicFileSystem protocol. + return rust.RealBasicFileSystem() diff --git a/src/grimp/application/usecases.py b/src/grimp/application/usecases.py index f403e344..9ee6e558 100644 --- a/src/grimp/application/usecases.py +++ b/src/grimp/application/usecases.py @@ -16,7 +16,6 @@ from ..application.ports.packagefinder import AbstractPackageFinder from ..domain.valueobjects import DirectImport, Module from .config import settings -import os class NotSupplied: @@ -245,16 +244,10 @@ def _create_chunks(module_files: Collection[ModuleFile]) -> tuple[tuple[ModuleFi def _decide_number_of_processes(number_of_module_files: int) -> int: - min_number_of_modules = int( - os.environ.get( - MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING_ENV_NAME, - DEFAULT_MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING, - ) - ) - if number_of_module_files < min_number_of_modules: - # Don't incur the overhead of multiple processes. - return 1 - return min(joblib.cpu_count(), number_of_module_files) + # The Rust-based RealBasicFileSystem doesn't support pickling, + # which means we can't use multiprocessing. 
But we won't need to use + # multiprocessing shortly so we can live with this. + return 1 def _scan_chunks( diff --git a/tests/unit/application/test_usecases.py b/tests/unit/application/test_usecases.py index a8f14207..d6d19fcb 100644 --- a/tests/unit/application/test_usecases.py +++ b/tests/unit/application/test_usecases.py @@ -1,8 +1,6 @@ -import os from typing import Dict, Optional, Set -from unittest.mock import sentinel, patch +from unittest.mock import sentinel -import joblib # type: ignore import pytest # type: ignore from grimp.application import usecases @@ -11,7 +9,6 @@ from grimp.domain.valueobjects import DirectImport, Module from tests.adaptors.filesystem import FakeFileSystem from tests.adaptors.packagefinder import BaseFakePackageFinder -from tests.adaptors.modulefinder import BaseFakeModuleFinder from tests.config import override_settings SOME_CPU_COUNT = 8 @@ -136,75 +133,3 @@ def write( if supplied_cache_dir is not sentinel.not_supplied: kwargs["cache_dir"] = supplied_cache_dir usecases.build_graph("mypackage", **kwargs) - - @patch.object(usecases, "_scan_chunks", return_value={}) - @patch.object(joblib, "cpu_count", return_value=SOME_CPU_COUNT) - @pytest.mark.parametrize( - "number_of_modules, fake_environ, expected_number_of_chunks", - [ - ( - usecases.DEFAULT_MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING - 1, - {}, - 1, - ), - ( - usecases.DEFAULT_MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING, - {}, - SOME_CPU_COUNT, - ), - ( - usecases.DEFAULT_MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING + 1, - {}, - SOME_CPU_COUNT, - ), - ( - 149, - {usecases.MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING_ENV_NAME: 150}, - 1, - ), - ( - 150, - {usecases.MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING_ENV_NAME: 150}, - SOME_CPU_COUNT, - ), - ( - 151, - {usecases.MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING_ENV_NAME: 150}, - SOME_CPU_COUNT, - ), - ], - ) - def test_scanning_multiprocessing_respects_min_number_of_modules( - 
self, - mock_cpu_count, - mock_scan_chunks, - number_of_modules, - fake_environ, - expected_number_of_chunks, - ): - class FakePackageFinder(BaseFakePackageFinder): - directory_map = {"mypackage": "/path/to/mypackage"} - - class FakeModuleFinder(BaseFakeModuleFinder): - module_files_by_package_name = { - "mypackage": frozenset( - { - ModuleFile( - module=Module(f"mypackage.mod_{i}"), - mtime=999, - ) - for i in range(number_of_modules) - } - ) - } - - with override_settings( - FILE_SYSTEM=FakeFileSystem(), - PACKAGE_FINDER=FakePackageFinder(), - MODULE_FINDER=FakeModuleFinder(), - ), patch.object(os, "environ", fake_environ): - usecases.build_graph("mypackage", cache_dir=None) - - [call] = mock_scan_chunks.call_args_list - chunks = call.args[0] - assert len(chunks) == expected_number_of_chunks From 43cf3b247e9cc98a6f4dfb1baf534601d31c8a02 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Tue, 10 Jun 2025 15:30:47 +0100 Subject: [PATCH 08/11] Strip out multiprocessing code --- docs/usage.rst | 1 + pyproject.toml | 1 - src/grimp/application/usecases.py | 76 +++----------------- tests/functional/test_build_and_use_graph.py | 29 -------- tox.ini | 10 +-- 5 files changed, 10 insertions(+), 107 deletions(-) diff --git a/docs/usage.rst b/docs/usage.rst index c342f06d..f97c412e 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -84,6 +84,7 @@ Building the graph :return: An import graph that you can use to analyse the package. :rtype: ``ImportGraph`` + # TODO correct this. This method uses multiple operating system processes to build the graph, if the number of modules to scan (not including modules in the cache) is 50 or more. This threshold can be adjusted by setting the ``GRIMP_MIN_MULTIPROCESSING_MODULES`` environment variable to a different number. 
To disable multiprocessing altogether, set it to a large number (more than diff --git a/pyproject.toml b/pyproject.toml index 0496671e..61c8eb06 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,6 @@ authors = [ ] requires-python = ">=3.9" dependencies = [ - "joblib>=1.3.0", "typing-extensions>=3.10.0.0", ] classifiers = [ diff --git a/src/grimp/application/usecases.py b/src/grimp/application/usecases.py index 9ee6e558..21ea2a0a 100644 --- a/src/grimp/application/usecases.py +++ b/src/grimp/application/usecases.py @@ -3,9 +3,7 @@ """ from typing import Dict, Sequence, Set, Type, Union, cast, Iterable, Collection -import math -import joblib # type: ignore from .ports.filesystem import BasicFileSystem from ..application.ports import caching @@ -22,14 +20,6 @@ class NotSupplied: pass -# Calling code can set this environment variable if it wants to tune when to switch to -# multiprocessing, or set it to a large number to disable it altogether. -MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING_ENV_NAME = "GRIMP_MIN_MULTIPROCESSING_MODULES" -# This is an arbitrary number, but setting it too low slows down our functional tests considerably. -# If you change this, update docs/usage.rst too! -DEFAULT_MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING = 50 - - def build_graph( package_name, *additional_package_names, @@ -217,72 +207,22 @@ def _scan_imports( found_packages: Set[FoundPackage], include_external_packages: bool, exclude_type_checking_imports: bool, -) -> Dict[ModuleFile, Set[DirectImport]]: - chunks = _create_chunks(module_files) - return _scan_chunks( - chunks, - file_system, - found_packages, - include_external_packages, - exclude_type_checking_imports, - ) - - -def _create_chunks(module_files: Collection[ModuleFile]) -> tuple[tuple[ModuleFile, ...], ...]: - """ - Split the module files into chunks, each to be worked on by a separate OS process. 
- """ - module_files_tuple = tuple(module_files) - - number_of_module_files = len(module_files_tuple) - n_chunks = _decide_number_of_processes(number_of_module_files) - chunk_size = math.ceil(number_of_module_files / n_chunks) - - return tuple( - module_files_tuple[i * chunk_size : (i + 1) * chunk_size] for i in range(n_chunks) - ) - - -def _decide_number_of_processes(number_of_module_files: int) -> int: - # The Rust-based RealBasicFileSystem doesn't support pickling, - # which means we can't use multiprocessing. But we won't need to use - # multiprocessing shortly so we can live with this. - return 1 - - -def _scan_chunks( - chunks: Collection[Collection[ModuleFile]], - file_system: BasicFileSystem, - found_packages: Set[FoundPackage], - include_external_packages: bool, - exclude_type_checking_imports: bool, ) -> Dict[ModuleFile, Set[DirectImport]]: import_scanner: AbstractImportScanner = settings.IMPORT_SCANNER_CLASS( file_system=file_system, found_packages=found_packages, include_external_packages=include_external_packages, ) - - number_of_processes = len(chunks) - import_scanning_jobs = joblib.Parallel(n_jobs=number_of_processes)( - joblib.delayed(_scan_chunk)(import_scanner, exclude_type_checking_imports, chunk) - for chunk in chunks - ) - - imports_by_module_file = {} - for chunk_imports_by_module_file in import_scanning_jobs: - imports_by_module_file.update(chunk_imports_by_module_file) - return imports_by_module_file - - -def _scan_chunk( - import_scanner: AbstractImportScanner, - exclude_type_checking_imports: bool, - chunk: Iterable[ModuleFile], -) -> Dict[ModuleFile, Set[DirectImport]]: return { module_file: import_scanner.scan_for_imports( module_file.module, exclude_type_checking_imports=exclude_type_checking_imports ) - for module_file in chunk + for module_file in module_files } + + +def _decide_number_of_processes(number_of_module_files: int) -> int: + # The Rust-based RealBasicFileSystem doesn't support pickling, + # which means we can't use 
multiprocessing. But we won't need to use + # multiprocessing shortly so we can live with this. + return 1 diff --git a/tests/functional/test_build_and_use_graph.py b/tests/functional/test_build_and_use_graph.py index e061a82e..35261567 100644 --- a/tests/functional/test_build_and_use_graph.py +++ b/tests/functional/test_build_and_use_graph.py @@ -1,8 +1,6 @@ from grimp import build_graph from typing import Set, Tuple, Optional import pytest -from unittest.mock import patch -from grimp.application import usecases """ @@ -56,33 +54,6 @@ def test_modules(): } -@patch.object(usecases, "DEFAULT_MIN_NUMBER_OF_MODULES_TO_SCAN_USING_MULTIPROCESSING", 0) -def test_modules_multiprocessing(): - """ - This test runs relatively slowly, but it's important we cover the multiprocessing code. - """ - graph = build_graph("testpackage", cache_dir=None) - - assert graph.modules == { - "testpackage", - "testpackage.one", - "testpackage.one.alpha", - "testpackage.one.beta", - "testpackage.one.gamma", - "testpackage.one.delta", - "testpackage.one.delta.blue", - "testpackage.two", - "testpackage.two.alpha", - "testpackage.two.beta", - "testpackage.two.gamma", - "testpackage.utils", - "testpackage.three", - "testpackage.three.beta", - "testpackage.three.gamma", - "testpackage.three.alpha", - } - - def test_add_module(): graph = build_graph("testpackage", cache_dir=None) number_of_modules = len(graph.modules) diff --git a/tox.ini b/tox.ini index 3027d938..19c2553d 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,6 @@ envlist = check, docs, {py39,py310,py311,py312,py13}, - py13-joblib-earliest, [base] deps = @@ -26,7 +25,6 @@ basepython = py311: {env:TOXPYTHON:python3.11} py312: {env:TOXPYTHON:python3.12} py313: {env:TOXPYTHON:python3.13} - py313-joblib-earliest: {env:TOXPYTHON:python3.13} {clean,check,docs,report}: {env:TOXPYTHON:python3} setenv = PYTHONPATH={toxinidir}/tests @@ -36,15 +34,9 @@ passenv = usedevelop = false deps = {[base]deps} - joblib==1.4.2 commands = {posargs:pytest --cov 
--cov-report=term-missing --benchmark-skip -vv tests} -[testenv:py313-joblib-earliest] -deps = - {[base]deps} - joblib==1.3.0 - [testenv:check] basepython = py313 deps = @@ -119,4 +111,4 @@ python = 3.10: py310, report 3.11: py311, report 3.12: py312, report - 3.13: py313, py313-joblib-earliest, report, check, docs + 3.13: py313, report, check, docs From f17a172a1315cd4bb227104575cae618a52d7127 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Tue, 10 Jun 2025 15:44:19 +0100 Subject: [PATCH 09/11] Make AbstractImportScanner a Protocol --- src/grimp/adaptors/importscanner.py | 29 +++++++++++++++++--- src/grimp/application/ports/importscanner.py | 15 ++-------- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/src/grimp/adaptors/importscanner.py b/src/grimp/adaptors/importscanner.py index a405facc..50239930 100644 --- a/src/grimp/adaptors/importscanner.py +++ b/src/grimp/adaptors/importscanner.py @@ -6,8 +6,8 @@ from typing import Dict, Optional, Set from grimp import exceptions -from grimp.application.ports.importscanner import AbstractImportScanner from grimp.application.ports.modulefinder import FoundPackage +from grimp.application.ports.filesystem import BasicFileSystem from grimp.domain.valueobjects import DirectImport, Module from grimp import _rustgrimp as rust # type: ignore[attr-defined] @@ -25,9 +25,30 @@ class _ImportedObject: typechecking_only: bool -class ImportScanner(AbstractImportScanner): - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) +class ImportScanner: + def __init__( + self, + file_system: BasicFileSystem, + found_packages: Set[FoundPackage], + include_external_packages: bool = False, + ) -> None: + """ + Args: + - found_packages: Set of FoundPackages containing all the modules + for analysis. + - file_system: The file system interface to use. + - include_external_packages: Whether to include imports of external modules (i.e. 
+ modules not contained in modules_by_package_directory) + in the results. + """ + self.file_system = file_system + self.include_external_packages = include_external_packages + self.found_packages = found_packages + + # Flatten all the modules into a set. + self.modules: Set[Module] = set() + for package in self.found_packages: + self.modules |= {mf.module for mf in package.module_files} self._found_packages_by_module: Dict[Module, FoundPackage] = { module_file.module: package diff --git a/src/grimp/application/ports/importscanner.py b/src/grimp/application/ports/importscanner.py index 26df8aa3..82cb21bc 100644 --- a/src/grimp/application/ports/importscanner.py +++ b/src/grimp/application/ports/importscanner.py @@ -1,12 +1,11 @@ -import abc -from typing import Set +from typing import Set, Protocol from grimp.application.ports.filesystem import BasicFileSystem from grimp.application.ports.modulefinder import FoundPackage from grimp.domain.valueobjects import DirectImport, Module -class AbstractImportScanner(abc.ABC): +class AbstractImportScanner(Protocol): """ Statically analyses some Python modules for import statements within their shared package. """ @@ -26,16 +25,7 @@ def __init__( modules not contained in modules_by_package_directory) in the results. """ - self.file_system = file_system - self.include_external_packages = include_external_packages - self.found_packages = found_packages - # Flatten all the modules into a set. - self.modules: Set[Module] = set() - for package in self.found_packages: - self.modules |= {mf.module for mf in package.module_files} - - @abc.abstractmethod def scan_for_imports( self, module: Module, *, exclude_type_checking_imports: bool = False ) -> Set[DirectImport]: @@ -43,4 +33,3 @@ def scan_for_imports( Statically analyses the given module and returns an iterable of Modules that it imports. 
""" - raise NotImplementedError From a0564fec89c11d18ea8b094e6165dcabcda37b32 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Tue, 10 Jun 2025 18:18:57 +0100 Subject: [PATCH 10/11] Define ImportScanner in Rust --- rust/src/filesystem.rs | 129 ++++++-- rust/src/import_scanning.rs | 365 ++++++++++++++++++++++ rust/src/lib.rs | 12 +- rust/src/module_finding.rs | 60 ++++ src/grimp/adaptors/importscanner.py | 251 +-------------- src/grimp/main.py | 2 +- tests/unit/adaptors/test_importscanner.py | 49 +-- 7 files changed, 567 insertions(+), 301 deletions(-) create mode 100644 rust/src/import_scanning.rs create mode 100644 rust/src/module_finding.rs diff --git a/rust/src/filesystem.rs b/rust/src/filesystem.rs index fd15970e..d93eb4e7 100644 --- a/rust/src/filesystem.rs +++ b/rust/src/filesystem.rs @@ -1,27 +1,36 @@ -use pyo3::exceptions::{PyFileNotFoundError,PyRuntimeError}; +use pyo3::exceptions::{PyFileNotFoundError, PyRuntimeError}; use pyo3::prelude::*; use std::collections::HashMap; type FileSystemContents = HashMap; -use unindent::unindent; -use std::path::{Path,PathBuf}; use std::fs; +use std::path::{Path, PathBuf}; +use unindent::unindent; + +pub trait FileSystem: Send + Sync { + fn sep(&self) -> String; + + fn join(&self, components: Vec) -> String; + + fn split(&self, file_name: &str) -> (String, String); -#[pyclass] + fn exists(&self, file_name: &str) -> bool; + + fn read(&self, file_name: &str) -> PyResult; +} + +#[derive(Clone)] pub struct RealBasicFileSystem {} -#[pymethods] -impl RealBasicFileSystem { - #[new] - fn new() -> Self { - RealBasicFileSystem {} - } +#[pyclass(name = "RealBasicFileSystem")] +pub struct PyRealBasicFileSystem { + pub inner: RealBasicFileSystem, +} - #[getter] +impl FileSystem for RealBasicFileSystem { fn sep(&self) -> String { std::path::MAIN_SEPARATOR.to_string() } - #[pyo3(signature = (*components))] fn join(&self, components: Vec) -> String { let mut path = PathBuf::new(); for component in components { @@ -45,30 +54,65 @@ impl 
RealBasicFileSystem { None => PathBuf::new(), // If there's no parent (e.g., just a filename), return empty }; - (head.to_str().unwrap().to_string(), tail.to_str().unwrap().to_string()) + ( + head.to_str().unwrap().to_string(), + tail.to_str().unwrap().to_string(), + ) } - + fn exists(&self, file_name: &str) -> bool { Path::new(file_name).is_file() } fn read(&self, file_name: &str) -> PyResult { // TODO: is this good enough for handling non-UTF8 encodings? - fs::read_to_string(file_name).map_err( - |_| PyRuntimeError::new_err(format!("Could not read {}", file_name)) - ) + fs::read_to_string(file_name) + .map_err(|_| PyRuntimeError::new_err(format!("Could not read {}", file_name))) } } -#[pyclass] +#[pymethods] +impl PyRealBasicFileSystem { + #[new] + fn new() -> Self { + PyRealBasicFileSystem { + inner: RealBasicFileSystem {}, + } + } + + #[getter] + fn sep(&self) -> String { + self.inner.sep() + } + + #[pyo3(signature = (*components))] + fn join(&self, components: Vec) -> String { + self.inner.join(components) + } + + fn split(&self, file_name: &str) -> (String, String) { + self.inner.split(file_name) + } + + fn exists(&self, file_name: &str) -> bool { + self.inner.exists(file_name) + } + + fn read(&self, file_name: &str) -> PyResult { + self.inner.read(file_name) + } +} +#[derive(Clone)] pub struct FakeBasicFileSystem { - contents: FileSystemContents, + contents: Box, +} + +#[pyclass(name = "FakeBasicFileSystem")] +pub struct PyFakeBasicFileSystem { + pub inner: FakeBasicFileSystem, } -#[pymethods] impl FakeBasicFileSystem { - #[pyo3(signature = (contents=None, content_map=None))] - #[new] fn new(contents: Option<&str>, content_map: Option>) -> PyResult { let mut parsed_contents = match contents { Some(contents) => parse_indented_fs_string(contents), @@ -82,16 +126,16 @@ impl FakeBasicFileSystem { parsed_contents.extend(unindented_map); }; Ok(FakeBasicFileSystem { - contents: parsed_contents, + contents: Box::new(parsed_contents), }) } +} - #[getter] +impl 
FileSystem for FakeBasicFileSystem { fn sep(&self) -> String { "/".to_string() } - #[pyo3(signature = (*components))] fn join(&self, components: Vec) -> String { let sep = self.sep(); // Get the separator from the getter method components @@ -131,7 +175,6 @@ impl FakeBasicFileSystem { (head, tail.to_string()) } - /// Checks if a file or directory exists within the file system. fn exists(&self, file_name: &str) -> bool { self.contents.contains_key(file_name) } @@ -144,6 +187,40 @@ impl FakeBasicFileSystem { } } +#[pymethods] +impl PyFakeBasicFileSystem { + #[pyo3(signature = (contents=None, content_map=None))] + #[new] + fn new(contents: Option<&str>, content_map: Option>) -> PyResult { + Ok(PyFakeBasicFileSystem { + inner: FakeBasicFileSystem::new(contents, content_map)?, + }) + } + + #[getter] + fn sep(&self) -> String { + self.inner.sep() + } + + #[pyo3(signature = (*components))] + fn join(&self, components: Vec) -> String { + self.inner.join(components) + } + + fn split(&self, file_name: &str) -> (String, String) { + self.inner.split(file_name) + } + + /// Checks if a file or directory exists within the file system. + fn exists(&self, file_name: &str) -> bool { + self.inner.exists(file_name) + } + + fn read(&self, file_name: &str) -> PyResult { + self.inner.read(file_name) + } +} + /// Parses an indented string representing a file system structure /// into a HashMap where keys are full file paths. 
/// diff --git a/rust/src/import_scanning.rs b/rust/src/import_scanning.rs new file mode 100644 index 00000000..21d8e6b6 --- /dev/null +++ b/rust/src/import_scanning.rs @@ -0,0 +1,365 @@ +use crate::filesystem::{FileSystem, PyFakeBasicFileSystem, PyRealBasicFileSystem}; +use crate::import_parsing; +use crate::module_finding::{FoundPackage, Module}; +use itertools::Itertools; +use pyo3::exceptions::{PyFileNotFoundError, PyTypeError}; +use pyo3::prelude::*; +use pyo3::types::{PyDict, PySet}; +use std::collections::HashSet; +use crate::errors::GrimpError; + +#[pyclass] +pub struct ImportScanner { + file_system: Box, + found_packages: HashSet, + include_external_packages: bool, + modules: HashSet, +} + +#[derive(Debug, Hash, Eq, PartialEq)] +struct DirectImport { + importer: String, + imported: String, + line_number: usize, + line_contents: String, +} + +#[pymethods] +impl ImportScanner { + #[allow(unused_variables)] + #[new] + #[pyo3(signature = (file_system, found_packages, include_external_packages=false))] + fn new( + py: Python, + file_system: Bound<'_, PyAny>, + found_packages: Bound<'_, PyAny>, + include_external_packages: bool, + ) -> PyResult { + let file_system_boxed: Box; + + if let Ok(py_real) = file_system.extract::>() { + file_system_boxed = Box::new(py_real.inner.clone()); + } else if let Ok(py_fake) = file_system.extract::>() { + file_system_boxed = Box::new(py_fake.inner.clone()); + } else { + return Err(PyTypeError::new_err( + "file_system must be an instance of RealBasicFileSystem or FakeBasicFileSystem", + )); + } + let found_packages_rust = _py_found_packages_to_rust(found_packages); + let modules = _get_modules_from_found_packages(&found_packages_rust); + + Ok(ImportScanner { + file_system: file_system_boxed, + found_packages: found_packages_rust, + modules, + include_external_packages, + }) + } + + #[pyo3(signature = (module, exclude_type_checking_imports=false))] + fn scan_for_imports<'a>( + &self, + py: Python<'a>, + module: Bound<'_, PyAny>, + 
exclude_type_checking_imports: bool, + ) -> PyResult> { + let module_rust = module.extract().unwrap(); + let found_package_for_module = self._lookup_found_package_for_module(&module_rust); + let module_filename = + self._determine_module_filename(&module_rust, found_package_for_module)?; + let module_contents = self.file_system.read(&module_filename).unwrap(); + + let parse_result = import_parsing::parse_imports_from_code(&module_contents); + match parse_result { + Err(GrimpError::ParseError { line_number, text, ..}) => { + // TODO: define SourceSyntaxError using pyo3. + let exceptions_pymodule = + PyModule::import(py, "grimp.exceptions").unwrap(); + let py_exception_class = exceptions_pymodule.getattr("SourceSyntaxError").unwrap(); + let exception = py_exception_class.call1((module_filename, line_number, text,)).unwrap(); + return Err(PyErr::from_value(exception)); + }, + Err(e) => { + return Err(e.into()); + }, + _ => () + } + let imported_objects = parse_result.unwrap(); + let is_package = self._module_is_package(&module_filename); + + let mut imports: HashSet = HashSet::new(); + for imported_object in imported_objects { + // Don't include type checking imports, if specified. + if exclude_type_checking_imports && imported_object.typechecking_only { + continue; + } + + // Resolve relative imports. + let imported_object_name = self._get_absolute_imported_object_name( + &module_rust, + is_package, + &imported_object.name, + ); + + // Resolve imported module. + match self._get_internal_module(&imported_object_name) { + Some(imported_module) => { + // It's an internal import. + imports.insert(DirectImport { + importer: module_rust.name.to_string(), + imported: imported_module.name.to_string(), + line_number: imported_object.line_number, + line_contents: imported_object.line_contents, + }); + } + None => { + // It's an external import. 
+ if self.include_external_packages { + if let Some(imported_module) = + self._distill_external_module(&imported_object_name) + { + imports.insert(DirectImport { + importer: module_rust.name.to_string(), + imported: imported_module, + line_number: imported_object.line_number, + line_contents: imported_object.line_contents, + }); + } + } + } + } + } + + Ok(self._to_py_direct_imports(py, &imports)) + } +} + +impl ImportScanner { + fn _lookup_found_package_for_module(&self, module: &Module) -> &FoundPackage { + // TODO: it's probably inefficient to do this every time we look up a module. + for found_package in &self.found_packages { + for module_file in &found_package.module_files { + if module_file.module == *module { + return found_package; + } + } + } + panic!("Could not lookup found package for module {}", module); + } + + fn _determine_module_filename( + &self, + module: &Module, + found_package: &FoundPackage, + ) -> PyResult { + // TODO: could we do all this with &str instead of String? 
+ let top_level_components: Vec = + found_package.name.split(".").map(str::to_string).collect(); + let module_components: Vec = module.name.split(".").map(str::to_string).collect(); + let leaf_components = module_components[top_level_components.len()..].to_vec(); + + let mut filename_root_components: Vec = vec![found_package.directory.clone()]; + filename_root_components.extend(leaf_components); + let filename_root = self.file_system.join(filename_root_components); + let normal_module_path = format!("{}.py", filename_root); + let init_module_path = self + .file_system + .join(vec![filename_root, "__init__.py".to_string()]); + let candidate_filenames = [normal_module_path, init_module_path]; + for candidate_filename in candidate_filenames { + if self.file_system.exists(&candidate_filename) { + return Ok(candidate_filename); + } + } + Err(PyFileNotFoundError::new_err(format!( + "Could not find module {}.", + module + ))) + } + + fn _module_is_package(&self, module_filename: &str) -> bool { + self.file_system.split(module_filename).1 == "__init__.py" + } + + #[allow(unused_variables)] + fn _get_absolute_imported_object_name( + &self, + module: &Module, + is_package: bool, + imported_object_name: &str, + ) -> String { + let leading_dots_count = count_leading_dots(imported_object_name); + if leading_dots_count == 0 { + return imported_object_name.to_string(); + } + let imported_object_name_base: String; + if is_package { + if leading_dots_count == 1 { + imported_object_name_base = module.name.clone(); + } else { + let parts: Vec<&str> = module.name.split('.').collect(); + imported_object_name_base = + parts[0..parts.len() - leading_dots_count + 1].join("."); + } + } else { + let parts: Vec<&str> = module.name.split('.').collect(); + imported_object_name_base = parts[0..parts.len() - leading_dots_count].join("."); + } + + format!( + "{}.{}", + imported_object_name_base, + &imported_object_name[leading_dots_count..] 
+ ) + } + + #[allow(unused_variables)] + fn _get_internal_module(&self, imported_object_name: &str) -> Option { + let candidate_module = Module { + name: imported_object_name.to_string(), + }; + if self.modules.contains(&candidate_module) { + return Some(candidate_module); + } + if let Some((parent_name, _)) = imported_object_name.rsplit_once('.') { + let parent = Module { + name: parent_name.to_string(), + }; + if self.modules.contains(&parent) { + return Some(parent); + } + } + None + } + + /// Given a module that we already know is external, turn it into a module to add to the graph. + // + // The 'distillation' process involves removing any unwanted subpackages. For example, + // django.models.db should be turned into simply django. + // The process is more complex for potential namespace packages, as it's not possible to + // determine the portion package simply from name. Rather than adding the overhead of a + // filesystem read, we just get the shallowest component that does not clash with an internal + // module namespace. Take, for example, foo.blue.alpha.one. If one of the found + // packages is foo.blue.beta, the module will be distilled to foo.blue.alpha. + // Alternatively, if the found package is foo.green, the distilled module will + // be foo.blue. + #[allow(unused_variables)] + fn _distill_external_module(&self, module_name: &str) -> Option { + let module_root = module_name.split(".").next().unwrap(); + // If it's a module that is a parent of one of the internal packages, return None + // as it doesn't make sense and is probably an import of a namespace package. + for package in &self.found_packages { + if module_is_descendant(&package.name, module_name) { + return None; + } + } + + // If it shares a namespace with an internal module, get the shallowest component that does + // not clash with an internal module namespace. 
+ let mut candidate_portions: HashSet = HashSet::new(); + let mut sorted_found_packages: Vec<&FoundPackage> = self.found_packages.iter().collect(); + sorted_found_packages.sort_by_key(|package| &package.name); + sorted_found_packages.reverse(); + + for found_package in sorted_found_packages { + let root_module = &found_package.name; + if module_is_descendant(root_module, module_root) { + let mut internal_path_components: Vec<&str> = root_module.split(".").collect(); + let mut external_path_components: Vec<&str> = module_name.split(".").collect(); + let mut external_namespace_components: Vec<&str> = vec![]; + while external_path_components[0] == internal_path_components[0] { + external_namespace_components.push(external_path_components.remove(0)); + internal_path_components.remove(0); + } + external_namespace_components.push(external_path_components[0]); + + candidate_portions.insert(Module { + name: external_namespace_components.join("."), + }); + } + } + + if !candidate_portions.is_empty() { + // If multiple found packages share a namespace with this module, use the deepest one + // as we know that that will be a namespace too. + let deepest_candidate_portion = candidate_portions + .iter() + .sorted_by_key(|portion| portion.name.split(".").collect::>().len()) + .next_back() + .unwrap(); + Some(deepest_candidate_portion.name.clone()) + } else { + Some(module_name.split('.').next().unwrap().to_string()) + } + } + + fn _to_py_direct_imports<'a>( + &self, + py: Python<'a>, + rust_imports: &HashSet, + ) -> Bound<'a, PySet> { + // TODO: do this in the constructor. 
+ let valueobjects_pymodule = + PyModule::import(py, "grimp.domain.valueobjects").unwrap(); + let py_module_class = valueobjects_pymodule.getattr("Module").unwrap(); + let py_direct_import_class = valueobjects_pymodule.getattr("DirectImport").unwrap(); + + let pyset = PySet::empty(py).unwrap(); + + for rust_import in rust_imports { + let importer = py_module_class.call1((&rust_import.importer,)).unwrap(); + let imported = py_module_class.call1((&rust_import.imported,)).unwrap(); + let kwargs = PyDict::new(py); + kwargs.set_item("importer", &importer).unwrap(); + kwargs.set_item("imported", &imported).unwrap(); + kwargs + .set_item("line_number", rust_import.line_number) + .unwrap(); + kwargs + .set_item("line_contents", &rust_import.line_contents) + .unwrap(); + let py_direct_import = py_direct_import_class.call((), Some(&kwargs)).unwrap(); + pyset.add(&py_direct_import).unwrap(); + } + + pyset + } +} +fn _py_found_packages_to_rust(py_found_packages: Bound<'_, PyAny>) -> HashSet { + let py_set = py_found_packages + .downcast::() + .expect("Expected py_found_packages to be a Python set."); + + let mut rust_found_packages = HashSet::new(); + // Iterate over the elements in the Python set. + for py_found_package_any in py_set.iter() { + // Extract each Python 'FoundPackage' object into a Rust 'FoundPackage' struct. + // Panics if extraction fails, as specified. 
+ let rust_found_package: FoundPackage = py_found_package_any + .extract() + .expect("Failed to extract Python FoundPackage to Rust FoundPackage."); + rust_found_packages.insert(rust_found_package); + } + rust_found_packages +} + +fn _get_modules_from_found_packages(found_packages: &HashSet) -> HashSet { + let mut modules = HashSet::new(); + for package in found_packages { + for module_file in &package.module_files { + modules.insert(module_file.module.clone()); + } + } + modules +} + +fn count_leading_dots(s: &str) -> usize { + s.chars() + .take_while(|&c| c == '.') // Take characters as long as they are dots + .count() // Count how many were taken +} + +fn module_is_descendant(module_name: &str, potential_ancestor: &str) -> bool { + module_name.starts_with(&format!("{}.", potential_ancestor)) +} diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 9016bc63..1a772fab 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -1,14 +1,17 @@ pub mod errors; pub mod exceptions; +mod filesystem; pub mod graph; pub mod import_parsing; +mod import_scanning; pub mod module_expressions; -mod filesystem; use crate::errors::{GrimpError, GrimpResult}; use crate::exceptions::{InvalidModuleExpression, ModuleNotPresent, NoSuchContainer, ParseError}; +use crate::filesystem::{PyFakeBasicFileSystem, PyRealBasicFileSystem}; use crate::graph::higher_order_queries::Level; use crate::graph::{Graph, Module, ModuleIterator, ModuleTokenIterator}; +use crate::import_scanning::ImportScanner; use crate::module_expressions::ModuleExpression; use derive_new::new; use itertools::Itertools; @@ -19,14 +22,15 @@ use pyo3::types::{IntoPyDict, PyDict, PyFrozenSet, PyList, PySet, PyString, PyTu use rayon::prelude::*; use rustc_hash::FxHashSet; use std::collections::HashSet; -use crate::filesystem::{RealBasicFileSystem,FakeBasicFileSystem}; +mod module_finding; #[pymodule] fn _rustgrimp(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { 
m.add_wrapped(wrap_pyfunction!(parse_imported_objects_from_code))?; m.add_class::()?; - m.add_class::()?; - m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; m.add("ModuleNotPresent", py.get_type::())?; m.add("NoSuchContainer", py.get_type::())?; m.add( diff --git a/rust/src/module_finding.rs b/rust/src/module_finding.rs new file mode 100644 index 00000000..3eb2016d --- /dev/null +++ b/rust/src/module_finding.rs @@ -0,0 +1,60 @@ +use pyo3::{prelude::*, types::PyFrozenSet}; +use std::collections::BTreeSet; +use std::fmt; + +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, FromPyObject)] +pub struct ModuleFile { + pub module: Module, +} + +/// A Python module. +/// The string is the importable name, e.g. "mypackage.foo". +#[derive(Debug, Clone, PartialEq, PartialOrd, Ord, Eq, Hash, FromPyObject)] +pub struct Module { + pub name: String, +} + +impl fmt::Display for Module { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.name) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +/// Set of modules found under a single package, together with metadata. +pub struct FoundPackage { + pub name: String, + pub directory: String, + // BTreeSet rather than HashSet seems to be necessary to make FoundPackage hashable. + pub module_files: BTreeSet, +} + +/// Implements conversion from a Python 'FoundPackage' object to the Rust 'FoundPackage' struct. +/// It extracts 'name', 'directory', and converts the 'module_files' frozenset into a Rust BTreeSet. +impl<'py> FromPyObject<'py> for FoundPackage { + fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult { + // Extract the 'name' attribute. + let name: String = ob.getattr("name")?.extract()?; + // Extract the 'directory' attribute. + let directory: String = ob.getattr("directory")?.extract()?; + + // Access the 'module_files' attribute. 
+ let module_files_py = ob.getattr("module_files")?; + // Downcast the PyAny object to a PyFrozenSet, as Python 'FrozenSet' maps to 'PyFrozenSet'. + let py_frozen_set = module_files_py.downcast::()?; + + let mut module_files = BTreeSet::new(); + // Iterate over the Python frozenset. + for py_module_file_any in py_frozen_set.iter() { + // Extract each element (PyAny) into a Rust 'ModuleFile'. + let module_file: ModuleFile = py_module_file_any.extract()?; + module_files.insert(module_file); + } + + Ok(FoundPackage { + name, + directory, + module_files, + }) + } +} diff --git a/src/grimp/adaptors/importscanner.py b/src/grimp/adaptors/importscanner.py index 50239930..655782de 100644 --- a/src/grimp/adaptors/importscanner.py +++ b/src/grimp/adaptors/importscanner.py @@ -1,252 +1,3 @@ -from __future__ import annotations - -import re -import logging -from dataclasses import dataclass -from typing import Dict, Optional, Set - -from grimp import exceptions -from grimp.application.ports.modulefinder import FoundPackage -from grimp.application.ports.filesystem import BasicFileSystem -from grimp.domain.valueobjects import DirectImport, Module from grimp import _rustgrimp as rust # type: ignore[attr-defined] - -logger = logging.getLogger(__name__) - -_LEADING_DOT_REGEX = re.compile(r"^(\.+)\w") - - -@dataclass(frozen=True) -class _ImportedObject: - name: str - line_number: int - line_contents: str - typechecking_only: bool - - -class ImportScanner: - def __init__( - self, - file_system: BasicFileSystem, - found_packages: Set[FoundPackage], - include_external_packages: bool = False, - ) -> None: - """ - Args: - - found_packages: Set of FoundPackages containing all the modules - for analysis. - - file_system: The file system interface to use. - - include_external_packages: Whether to include imports of external modules (i.e. - modules not contained in modules_by_package_directory) - in the results. 
- """ - self.file_system = file_system - self.include_external_packages = include_external_packages - self.found_packages = found_packages - - # Flatten all the modules into a set. - self.modules: Set[Module] = set() - for package in self.found_packages: - self.modules |= {mf.module for mf in package.module_files} - - self._found_packages_by_module: Dict[Module, FoundPackage] = { - module_file.module: package - for package in self.found_packages - for module_file in package.module_files - } - - def scan_for_imports( - self, module: Module, *, exclude_type_checking_imports: bool = False - ) -> Set[DirectImport]: - """ - Note: this method only analyses the module in question and will not load any other - code, so it relies on self.modules to deduce which modules it imports. (This is - because you can't know whether "from foo.bar import baz" is importing a module - called `baz`, or a function `baz` from the module `bar`.) - """ - found_package = self._found_package_for_module(module) - module_filename = self._determine_module_filename(module, found_package) - module_contents = self._read_module_contents(module_filename) - - try: - imported_objects = self._get_raw_imported_objects(module_contents) - except rust.ParseError as e: - raise exceptions.SourceSyntaxError( - filename=module_filename, - lineno=e.line_number, - text=e.text, - ) - - is_package = self._module_is_package(module_filename) - - imports = set() - for imported_object in imported_objects: - # Filter on `exclude_type_checking_imports`. - if exclude_type_checking_imports and imported_object.typechecking_only: - continue - - # Resolve relative imports. - imported_object_name = self._get_absolute_imported_object_name( - module=module, is_package=is_package, imported_object_name=imported_object.name - ) - - # Resolve imported module. - imported_module = self._get_internal_module(imported_object_name, modules=self.modules) - if imported_module is None: - # => External import. 
- - # Filter on `self.include_external_packages`. - if not self.include_external_packages: - continue - - # Distill module. - imported_module = self._distill_external_module( - Module(imported_object_name), found_packages=self.found_packages - ) - if imported_module is None: - continue - - imports.add( - DirectImport( - importer=module, - imported=imported_module, - line_number=imported_object.line_number, - line_contents=imported_object.line_contents, - ) - ) - return imports - - def _found_package_for_module(self, module: Module) -> FoundPackage: - try: - return self._found_packages_by_module[module] - except KeyError: - raise ValueError(f"No found package for module {module}.") - - def _determine_module_filename(self, module: Module, found_package: FoundPackage) -> str: - """ - Work out the full filename of the given module. - - Any given module can either be a straight Python file (foo.py) or else a package - (in which case the file is an __init__.py within a directory). - """ - top_level_components = found_package.name.split(".") - module_components = module.name.split(".") - leaf_components = module_components[len(top_level_components) :] - package_directory = found_package.directory - - filename_root = self.file_system.join(package_directory, *leaf_components) - candidate_filenames = ( - f"{filename_root}.py", - self.file_system.join(filename_root, "__init__.py"), - ) - for candidate_filename in candidate_filenames: - if self.file_system.exists(candidate_filename): - return candidate_filename - raise FileNotFoundError(f"Could not find module {module}.") - - def _read_module_contents(self, module_filename: str) -> str: - """ - Read the file contents of the module. - """ - return self.file_system.read(module_filename) - - def _module_is_package(self, module_filename: str) -> bool: - """ - Whether or not the supplied module filename is a package. 
- """ - return self.file_system.split(module_filename)[-1] == "__init__.py" - - @staticmethod - def _get_raw_imported_objects(module_contents: str) -> Set[_ImportedObject]: - imported_object_dicts = rust.parse_imported_objects_from_code(module_contents) - return {_ImportedObject(**d) for d in imported_object_dicts} - - @staticmethod - def _get_absolute_imported_object_name( - *, module: Module, is_package: bool, imported_object_name: str - ) -> str: - leading_dot_match = _LEADING_DOT_REGEX.match(imported_object_name) - if leading_dot_match is None: - return imported_object_name - - n_leading_dots = len(leading_dot_match.group(1)) - if is_package: - if n_leading_dots == 1: - imported_object_name_base = module.name - else: - imported_object_name_base = ".".join( - module.name.split(".")[: -(n_leading_dots - 1)] - ) - else: - imported_object_name_base = ".".join(module.name.split(".")[:-n_leading_dots]) - return imported_object_name_base + "." + imported_object_name[n_leading_dots:] - - @staticmethod - def _get_internal_module(object_name: str, *, modules: Set[Module]) -> Optional[Module]: - candidate_module = Module(object_name) - if candidate_module in modules: - return candidate_module - - try: - candidate_module = candidate_module.parent - except ValueError: - return None - else: - if candidate_module in modules: - return candidate_module - else: - return None - - @staticmethod - def _distill_external_module( - module: Module, *, found_packages: Set[FoundPackage] - ) -> Optional[Module]: - """ - Given a module that we already know is external, turn it into a module to add to the graph. - - The 'distillation' process involves removing any unwanted subpackages. For example, - Module("django.models.db") should be turned into simply Module("django"). - - The process is more complex for potential namespace packages, as it's not possible to - determine the portion package simply from name. 
Rather than adding the overhead of a - filesystem read, we just get the shallowest component that does not clash with an internal - module namespace. Take, for example, a Module("foo.blue.alpha.one"). If one of the found - packages is foo.blue.beta, the module will be distilled to Module("foo.blue.alpha"). - Alternatively, if the found package is foo.green, the distilled module will - be Module("foo.blue"). - """ - # If it's a module that is a parent of one of the internal packages, return None - # as it doesn't make sense and is probably an import of a namespace package. - if any(Module(package.name).is_descendant_of(module) for package in found_packages): - return None - - # If it shares a namespace with an internal module, get the shallowest component that does - # not clash with an internal module namespace. - candidate_portions: Set[Module] = set() - for found_package in sorted(found_packages, key=lambda p: p.name, reverse=True): - root_module = Module(found_package.name) - if root_module.is_descendant_of(module.root): - ( - internal_path_components, - external_path_components, - ) = root_module.name.split( - "." - ), module.name.split(".") - external_namespace_components = [] - while external_path_components[0] == internal_path_components[0]: - external_namespace_components.append(external_path_components[0]) - external_path_components = external_path_components[1:] - internal_path_components = internal_path_components[1:] - external_namespace_components.append(external_path_components[0]) - candidate_portions.add(Module(".".join(external_namespace_components))) - - if candidate_portions: - # If multiple found packages share a namespace with this module, use the deepest one - # as we know that that will be a namespace too. 
- deepest_candidate_portion = sorted( - candidate_portions, key=lambda p: len(p.name.split(".")) - )[-1] - return deepest_candidate_portion - else: - return module.root +ImportScanner = rust.ImportScanner diff --git a/src/grimp/main.py b/src/grimp/main.py index 70622b05..d60533a8 100644 --- a/src/grimp/main.py +++ b/src/grimp/main.py @@ -13,7 +13,7 @@ settings.configure( MODULE_FINDER=ModuleFinder(), FILE_SYSTEM=FileSystem(), - IMPORT_SCANNER_CLASS=ImportScanner, + IMPORT_SCANNER_CLASS=ImportScanner, # type: ignore[has-type] IMPORT_GRAPH_CLASS=ImportGraph, PACKAGE_FINDER=ImportLibPackageFinder(), CACHE_CLASS=Cache, diff --git a/tests/unit/adaptors/test_importscanner.py b/tests/unit/adaptors/test_importscanner.py index e8315960..4307ca3a 100644 --- a/tests/unit/adaptors/test_importscanner.py +++ b/tests/unit/adaptors/test_importscanner.py @@ -3,7 +3,6 @@ import pytest # type: ignore -from grimp.adaptors.importscanner import ImportScanner from grimp.application.ports.modulefinder import FoundPackage, ModuleFile from grimp.domain.valueobjects import DirectImport, Module @@ -62,7 +61,7 @@ def test_absolute_imports(include_external_packages, expected_result): } ) - import_scanner = ImportScanner( + import_scanner = rust.ImportScanner( found_packages={ FoundPackage( name="foo", @@ -82,7 +81,7 @@ def test_absolute_imports(include_external_packages, expected_result): def test_non_ascii(): blue_module = Module("mypackage.blue") non_ascii_modules = {Module("mypackage.ñon_ascii_变"), Module("mypackage.ñon_ascii_变.ラーメン")} - file_system = FakeFileSystem( + file_system = rust.FakeBasicFileSystem( content_map={ "/path/to/mypackage/blue.py": """ from ñon_ascii_变 import * @@ -94,7 +93,7 @@ def test_non_ascii(): }, ) - import_scanner = ImportScanner( + import_scanner = rust.ImportScanner( found_packages={ FoundPackage( name="mypackage", @@ -167,7 +166,7 @@ def test_single_namespace_package_portion(): } ) - import_scanner = ImportScanner( + import_scanner = rust.ImportScanner( 
found_packages={ FoundPackage( name="namespace.foo", @@ -254,7 +253,7 @@ def test_import_of_portion_not_in_graph(include_external_packages): } ) - import_scanner = ImportScanner( + import_scanner = rust.ImportScanner( found_packages={ FoundPackage( name="namespace.foo", @@ -443,7 +442,7 @@ def test_absolute_from_imports(include_external_packages, expected_result): }, ) - import_scanner = ImportScanner( + import_scanner = rust.ImportScanner( found_packages={ FoundPackage( name="foo", @@ -460,14 +459,24 @@ def test_absolute_from_imports(include_external_packages, expected_result): assert expected_result == result -def test_relative_from_imports(): +@pytest.mark.parametrize("module_to_scan_is_package", (True, False)) +def test_relative_from_imports(module_to_scan_is_package): all_modules = { + Module("foo"), + Module("foo.one"), Module("foo.one.blue"), Module("foo.one.green"), Module("foo.two.brown"), Module("foo.two.yellow"), Module("foo.three"), } + if module_to_scan_is_package: + module_to_scan = Module("foo.one") + module_filename = "/path/to/foo/one/__init__.py" + else: + module_to_scan = Module("foo.one.blue") + module_filename = "/path/to/foo/one/blue.py" + file_system = rust.FakeBasicFileSystem( contents=""" /path/to/foo/ @@ -483,7 +492,7 @@ def test_relative_from_imports(): three.py """, content_map={ - "/path/to/foo/one/blue.py": """ + module_filename: """ from . import green from ..two import yellow from .. 
import three @@ -492,7 +501,7 @@ def test_relative_from_imports(): }, ) - import_scanner = ImportScanner( + import_scanner = rust.ImportScanner( found_packages={ FoundPackage( name="foo", @@ -503,23 +512,23 @@ def test_relative_from_imports(): file_system=file_system, ) - result = import_scanner.scan_for_imports(Module("foo.one.blue")) + result = import_scanner.scan_for_imports(module_to_scan) assert result == { DirectImport( - importer=Module("foo.one.blue"), + importer=module_to_scan, imported=Module("foo.one.green"), line_number=1, line_contents="from . import green", ), DirectImport( - importer=Module("foo.one.blue"), + importer=module_to_scan, imported=Module("foo.two.yellow"), line_number=2, line_contents="from ..two import yellow", ), DirectImport( - importer=Module("foo.one.blue"), + importer=module_to_scan, imported=Module("foo.three"), line_number=3, line_contents="from .. import three", @@ -550,7 +559,7 @@ def test_trims_to_known_modules(import_source): content_map={"/path/to/foo/one.py": import_source}, ) - import_scanner = ImportScanner( + import_scanner = rust.ImportScanner( found_packages={ FoundPackage( name="foo", @@ -598,7 +607,7 @@ def test_trims_to_known_modules_within_init_file(): }, ) - import_scanner = ImportScanner( + import_scanner = rust.ImportScanner( found_packages={ FoundPackage( name="foo", @@ -649,7 +658,7 @@ def my_function(): }, ) - import_scanner = ImportScanner( + import_scanner = rust.ImportScanner( found_packages={ FoundPackage( name="foo", @@ -713,7 +722,7 @@ def test_external_package_imports_for_namespace_packages(statement, expected_mod } ) - import_scanner = ImportScanner( + import_scanner = rust.ImportScanner( found_packages={ FoundPackage( name="namespace.foo.blue", @@ -764,7 +773,7 @@ def test_scans_multiple_packages(statement): } ) - import_scanner = ImportScanner( + import_scanner = rust.ImportScanner( found_packages={ FoundPackage( name="foo", @@ -833,7 +842,7 @@ def test_exclude_type_checking_imports( } ) - 
import_scanner = ImportScanner( + import_scanner = rust.ImportScanner( found_packages={ FoundPackage( name="foo", From 0cfe29392776ca572ca6162e459e11f1b8c43a48 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Wed, 11 Jun 2025 17:18:07 +0100 Subject: [PATCH 11/11] Handle include_external_packages=None --- src/grimp/application/usecases.py | 5 ++-- tests/unit/application/test_usecases.py | 34 +++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/src/grimp/application/usecases.py b/src/grimp/application/usecases.py index 21ea2a0a..1447fa4f 100644 --- a/src/grimp/application/usecases.py +++ b/src/grimp/application/usecases.py @@ -49,7 +49,6 @@ def build_graph( "mypackage", "anotherpackage", "onemore", include_external_packages=True, ) """ - file_system: AbstractFileSystem = settings.FILE_SYSTEM found_packages = _find_packages( @@ -211,7 +210,9 @@ def _scan_imports( import_scanner: AbstractImportScanner = settings.IMPORT_SCANNER_CLASS( file_system=file_system, found_packages=found_packages, - include_external_packages=include_external_packages, + # Ensure that the passed include_external_packages is definitely a boolean, + # otherwise the Rust class will error. 
+ include_external_packages=bool(include_external_packages), ) return { module_file: import_scanner.scan_for_imports( diff --git a/tests/unit/application/test_usecases.py b/tests/unit/application/test_usecases.py index d6d19fcb..0c417f01 100644 --- a/tests/unit/application/test_usecases.py +++ b/tests/unit/application/test_usecases.py @@ -133,3 +133,37 @@ def write( if supplied_cache_dir is not sentinel.not_supplied: kwargs["cache_dir"] = supplied_cache_dir usecases.build_graph("mypackage", **kwargs) + + def test_forgives_wrong_type_being_passed_to_include_external_packages(self): + file_system = FakeFileSystem( + contents=""" + /path/to/mypackage/ + __init__.py + foo/ + __init__.py + one.py + """, + content_map={ + "/path/to/mypackage/foo/one.py": ( + "import mypackage.foo.two.green\nfrom .. import Something" + ), + }, + ) + + class FakePackageFinder(BaseFakePackageFinder): + directory_map = {"mypackage": "/path/to/mypackage"} + + with override_settings(FILE_SYSTEM=file_system, PACKAGE_FINDER=FakePackageFinder()): + graph = usecases.build_graph( + "mypackage", + # Note: this should be a bool, but we want to tolerate it, + # as Import Linter currently has a bug where it will pass it as None. + include_external_packages=None, + ) + + expected_modules = { + "mypackage", + "mypackage.foo", + "mypackage.foo.one", + } + assert expected_modules == graph.modules