From 9ba14b1271888c2f8cb8f34993d2af918b369e33 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Mon, 9 Jun 2025 13:48:23 +0100 Subject: [PATCH 01/15] Add uv.lock to .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 8898f647..95d38542 100644 --- a/.gitignore +++ b/.gitignore @@ -79,3 +79,6 @@ rust/debug rust/target rust/flamegraph.svg target + +# UV +uv.lock From 023ff2515262b36fb943af8c845f7aa5ce0058d9 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Fri, 27 Jun 2025 17:38:15 +0100 Subject: [PATCH 02/15] Run cargo clippy --fix with latest rust version --- rust/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 18327cd2..8c603133 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -543,7 +543,7 @@ impl GraphWrapper { .unwrap() .into_iter() .map(|name| match container.clone() { - Some(container) => format!("{}.{}", container, name), + Some(container) => format!("{container}.{name}"), None => name, }) .filter_map(|name| match self.get_visible_module_by_name(&name) { From afb76a80f64527607825ed0ed01ad7544384a1b2 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Tue, 10 Jun 2025 10:29:57 +0100 Subject: [PATCH 03/15] Define BasicFileSystem protocol --- src/grimp/application/ports/filesystem.py | 26 ++++++++++++++++++++ src/grimp/application/ports/importscanner.py | 4 +-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/grimp/application/ports/filesystem.py b/src/grimp/application/ports/filesystem.py index b11125be..d2966255 100644 --- a/src/grimp/application/ports/filesystem.py +++ b/src/grimp/application/ports/filesystem.py @@ -1,5 +1,6 @@ import abc from typing import Iterator, List, Tuple +from typing import Protocol class AbstractFileSystem(abc.ABC): @@ -80,3 +81,28 @@ def write(self, file_name: str, contents: str) -> None: Write the contents to a file. """ raise NotImplementedError + + +class BasicFileSystem(Protocol): + """ + A more limited file system, used by the ImportScanner. + + Having two different file system APIs is an interim approach, allowing us to + implement BasicFileSystem in Rust, for use with the ImportScanner, without needing + to implement the full range of methods used by other parts of Grimp. + + Eventually we'll move the full implementation to Rust, and have a single + FileSystem interface. + """ + + def join(self, *components: str) -> str: + ... + + def split(self, file_name: str) -> Tuple[str, str]: + ... + + def read(self, file_name: str) -> str: + ... + + def exists(self, file_name: str) -> bool: + ... 
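Note that because BasicFileSystem is a typing.Protocol, implementations do not need to inherit from it: any object exposing these four methods type-checks against it, which is what lets the Rust-backed classes introduced later in this series slot in. A minimal sketch of a conforming class (InMemoryFileSystem is a hypothetical name used for illustration, not part of this series):

    from typing import Dict, Tuple

    from grimp.application.ports.filesystem import BasicFileSystem


    class InMemoryFileSystem:
        """Toy class that satisfies BasicFileSystem structurally, without inheriting from it."""

        def __init__(self, files: Dict[str, str]) -> None:
            self._files = files

        def join(self, *components: str) -> str:
            return "/".join(component.rstrip("/") for component in components)

        def split(self, file_name: str) -> Tuple[str, str]:
            head, _, tail = file_name.rpartition("/")
            return head, tail

        def read(self, file_name: str) -> str:
            try:
                return self._files[file_name]
            except KeyError:
                raise FileNotFoundError(file_name)

        def exists(self, file_name: str) -> bool:
            return file_name in self._files


    file_system: BasicFileSystem = InMemoryFileSystem({"/path/to/foo/__init__.py": ""})
    assert file_system.exists("/path/to/foo/__init__.py")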
diff --git a/src/grimp/application/ports/importscanner.py b/src/grimp/application/ports/importscanner.py index 34da5be9..26df8aa3 100644 --- a/src/grimp/application/ports/importscanner.py +++ b/src/grimp/application/ports/importscanner.py @@ -1,7 +1,7 @@ import abc from typing import Set -from grimp.application.ports.filesystem import AbstractFileSystem +from grimp.application.ports.filesystem import BasicFileSystem from grimp.application.ports.modulefinder import FoundPackage from grimp.domain.valueobjects import DirectImport, Module @@ -13,7 +13,7 @@ class AbstractImportScanner(abc.ABC): def __init__( self, - file_system: AbstractFileSystem, + file_system: BasicFileSystem, found_packages: Set[FoundPackage], include_external_packages: bool = False, ) -> None: From 979e7fa40eedeebc9f63eefddee5ae33dd20d383 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Tue, 10 Jun 2025 10:58:52 +0100 Subject: [PATCH 04/15] Implement FakeBasicFileSystem Much of the Rust code was generated by an LLM, possibly it could be simplified. The Python tests are adapted so some of the same tests can be run on the Python FakeFileSystem and the Rust-based FakeBasicFileSystem. --- rust/Cargo.lock | 22 +++ rust/Cargo.toml | 3 + rust/src/filesystem.rs | 177 ++++++++++++++++++++++ rust/src/lib.rs | 3 + tests/unit/adaptors/test_filesystem.py | 196 ++++++++++++++++++------- 5 files changed, 350 insertions(+), 51 deletions(-) create mode 100644 rust/src/filesystem.rs diff --git a/rust/Cargo.lock b/rust/Cargo.lock index f4ae21c1..9a2b2796 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -21,11 +21,14 @@ dependencies = [ "ruff_python_parser", "ruff_source_file", "rustc-hash 2.1.1", + "serde", "serde_json", + "serde_yaml", "slotmap", "string-interner", "tap", "thiserror", + "unindent", ] [[package]] @@ -666,6 +669,19 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_yaml" +version = "0.9.34+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" +dependencies = [ + "indexmap 2.9.0", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + [[package]] name = "siphasher" version = "1.0.1" @@ -810,6 +826,12 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" +[[package]] +name = "unsafe-libyaml" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" + [[package]] name = "version_check" version = "0.9.5" diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 8e5acab2..845daeb7 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -25,6 +25,9 @@ const_format = "0.2.34" ruff_python_parser = { git = "https://github.com/astral-sh/ruff.git", tag = "v0.4.10" } ruff_python_ast = { git = "https://github.com/astral-sh/ruff.git", tag = "v0.4.10" } ruff_source_file = { git = "https://github.com/astral-sh/ruff.git", tag = "v0.4.10" } +serde = { version = "1.0", features = ["derive"] } +serde_yaml = "0.9" +unindent = "0.2.4" [dependencies.pyo3] version = "0.24.1" diff --git a/rust/src/filesystem.rs b/rust/src/filesystem.rs new file mode 100644 index 00000000..34332a8d --- /dev/null +++ b/rust/src/filesystem.rs @@ -0,0 +1,177 @@ +use pyo3::exceptions::PyFileNotFoundError; +use pyo3::prelude::*; +use std::collections::HashMap; +use unindent::unindent; + +type FileSystemContents = HashMap; + +// Implements 
BasicFileSystem (defined in grimp.application.ports.filesystem.BasicFileSystem). +#[pyclass] +pub struct FakeBasicFileSystem { + contents: FileSystemContents, +} + +#[pymethods] +impl FakeBasicFileSystem { + #[pyo3(signature = (contents=None, content_map=None))] + #[new] + fn new(contents: Option<&str>, content_map: Option>) -> PyResult { + let mut parsed_contents = match contents { + Some(contents) => parse_indented_file_system_string(contents), + None => HashMap::new(), + }; + if let Some(content_map) = content_map { + let unindented_map: HashMap = content_map + .into_iter() + .map(|(key, val)| (key, unindent(&val).trim().to_string())) + .collect(); + parsed_contents.extend(unindented_map); + }; + Ok(FakeBasicFileSystem { + contents: parsed_contents, + }) + } + + #[getter] + fn sep(&self) -> String { + "/".to_string() + } + + #[pyo3(signature = (*components))] + fn join(&self, components: Vec) -> String { + let sep = self.sep(); + components + .into_iter() + .map(|c| c.trim_end_matches(&sep).to_string()) + .collect::>() + .join(&sep) + } + + fn split(&self, file_name: &str) -> (String, String) { + let components: Vec<&str> = file_name.split('/').collect(); + + if components.is_empty() { + return ("".to_string(), "".to_string()); + } + + let tail = components.last().unwrap_or(&""); // Last component, or empty if components is empty (shouldn't happen from split) + + let head_components = &components[..components.len() - 1]; // All components except the last + + let head = if head_components.is_empty() { + // Case for single component paths like "filename.txt" or empty string "" + "".to_string() + } else if file_name.starts_with('/') + && head_components.len() == 1 + && head_components[0].is_empty() + { + // Special handling for paths starting with '/', e.g., "/" or "/filename.txt" + // If components were ["", ""], head_components is [""] -> should be "/" + // If components were ["", "file.txt"], head_components is [""] -> should be "/" + "/".to_string() + } else { + // Default joining for multiple components + head_components.join("/") + }; + + (head, tail.to_string()) + } + + /// Checks if a file or directory exists within the file system. + fn exists(&self, file_name: &str) -> bool { + self.contents.contains_key(file_name) + } + + fn read(&self, file_name: &str) -> PyResult { + match self.contents.get(file_name) { + Some(file_name) => Ok(file_name.clone()), + None => Err(PyFileNotFoundError::new_err("")), + } + } +} + +/// Parses an indented string representing a file system structure +/// into a HashMap where keys are full file paths. +/// See tests.adaptors.filesystem.FakeFileSystem for the API. 
+pub fn parse_indented_file_system_string(file_system_string: &str) -> HashMap { + let mut file_paths_map: HashMap = HashMap::new(); + let mut path_stack: Vec = Vec::new(); // Stores current directory path components + let mut first_line = true; // Flag to handle the very first path component + + // Normalize newlines and split into lines + let buffer = file_system_string.replace("\r\n", "\n"); + let lines: Vec<&str> = buffer.split('\n').collect(); + + for line_raw in lines.clone() { + let line = line_raw.trim_end(); // Remove trailing whitespace + if line.is_empty() { + continue; // Skip empty lines + } + + let current_indent = line.chars().take_while(|&c| c.is_whitespace()).count(); + let trimmed_line = line.trim_start(); + + // Assuming 4 spaces per indentation level for calculating depth + // Adjust this if your indentation standard is different (e.g., 2 spaces, tabs) + let current_depth = current_indent / 4; + + if first_line { + // The first non-empty line sets the base path. + // It might be absolute (/a/b/) or relative (a/b/). + let root_component = trimmed_line.trim_end_matches('/').to_string(); + path_stack.push(root_component); + first_line = false; + } else { + // Adjust the path_stack based on indentation level + // Pop elements from the stack until we reach the correct parent directory depth + while path_stack.len() > current_depth { + path_stack.pop(); + } + + // If the current line is a file, append it to the path for inserting into map, + // then pop it off so that subsequent siblings are correctly handled. + // If it's a directory, append it and it stays on the stack for its children. + let component_to_add = trimmed_line.trim_end_matches('/').to_string(); + if !component_to_add.is_empty() { + // Avoid pushing empty strings due to lines like just "/" + path_stack.push(component_to_add); + } + } + + // Construct the full path + // Join components on the stack. If the first component started with '/', + // ensure the final path also starts with '/'. + let full_path = if !path_stack.is_empty() { + let mut joined = path_stack.join("/"); + // If the original root started with a slash, ensure the final path does too. + // But be careful not to double-slash if a component is e.g. "/root" + if lines[0].trim().starts_with('/') && !joined.starts_with('/') { + joined = format!("/{joined}"); + } + joined + } else { + "".to_string() + }; + + // If it's a file (doesn't end with '/'), add it to the map + // A file is not a directory, so its name should be removed from the stack after processing + // so that sibling items are at the correct level. + if !trimmed_line.ends_with('/') { + file_paths_map.insert(full_path, String::new()); // Value can be empty or actual content + if !path_stack.is_empty() { + path_stack.pop(); // Pop the file name off the stack + } + } + } + + // Edge case: If the very first line was a file and it ended up on the stack, it needs to be processed. 
+ // This handles single-file inputs like "myfile.txt" + if !path_stack.is_empty() + && !path_stack.last().unwrap().ends_with('/') + && !file_paths_map.contains_key(&path_stack.join("/")) + { + file_paths_map.insert(path_stack.join("/"), String::new()); + } + + file_paths_map +} diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 8c603133..502e5fdf 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -3,6 +3,7 @@ pub mod exceptions; pub mod graph; pub mod import_parsing; pub mod module_expressions; +mod filesystem; use crate::errors::{GrimpError, GrimpResult}; use crate::exceptions::{InvalidModuleExpression, ModuleNotPresent, NoSuchContainer, ParseError}; @@ -18,11 +19,13 @@ use pyo3::types::{IntoPyDict, PyDict, PyFrozenSet, PyList, PySet, PyString, PyTu use rayon::prelude::*; use rustc_hash::FxHashSet; use std::collections::HashSet; +use crate::filesystem::FakeBasicFileSystem; #[pymodule] fn _rustgrimp(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(parse_imported_objects_from_code))?; m.add_class::()?; + m.add_class::()?; m.add("ModuleNotPresent", py.get_type::())?; m.add("NoSuchContainer", py.get_type::())?; m.add( diff --git a/tests/unit/adaptors/test_filesystem.py b/tests/unit/adaptors/test_filesystem.py index a59a173c..899d10f6 100644 --- a/tests/unit/adaptors/test_filesystem.py +++ b/tests/unit/adaptors/test_filesystem.py @@ -1,13 +1,148 @@ from copy import copy - +from typing import Type import pytest # type: ignore - +from grimp.application.ports.filesystem import BasicFileSystem from tests.adaptors.filesystem import FakeFileSystem +from grimp import _rustgrimp as rust # type: ignore[attr-defined] + + +class _Base: + """ + Tests for methods that AbstractFileSystem and BasicFileSystem share. + """ + + file_system_cls: Type[BasicFileSystem] + + @pytest.mark.parametrize("path", ("/path/to", "/path/to/")) + def test_join(self, path): + file_system = self.file_system_cls() + assert "/path/to/mypackage/file.py" == file_system.join(path, "mypackage", "file.py") + + def test_split(self): + file_system = self.file_system_cls() + assert ("/path/to/mypackage", "file.py") == file_system.split("/path/to/mypackage/file.py") + + @pytest.mark.parametrize( + "file_name, expected", + [ + ("/path/to/mypackage", False), + ("/path/to/mypackage/", False), + ("/path/to/mypackage/readme.txt", True), + ("/path/to/mypackage/foo/one.txt", True), + ("/path/to/mypackage/foo/two/green.txt", True), + ("/path/to/nonexistent.txt", False), + ("/path/to/mypackage/purple.txt", False), + ], + ) + def test_exists_content_only(self, file_name, expected): + file_system = self.file_system_cls( + contents=""" + /path/to/mypackage/ + readme.txt + foo/ + one.txt + two/ + green.txt + """ + ) + + assert file_system.exists(file_name) == expected + + @pytest.mark.parametrize( + "file_name, expected", + [ + ("/path/to/", False), + ("/path/to/file.txt", True), + ("/path/to/a/deeper/file.txt", True), + ("/path/to/nonexistent.txt", False), + ], + ) + def test_exists_content_map_only(self, file_name, expected): + file_system = self.file_system_cls( + content_map={ + "/path/to/file.txt": "hello", + "/path/to/a/deeper/file.txt": "hello", + } + ) + + assert file_system.exists(file_name) == expected + + @pytest.mark.parametrize( + "file_name, expected", + [ + ("/path/to/file.txt", True), + ("/path/to/mypackage/foo/two/green.txt", True), + ("/path/to/nonexistent.txt", False), + ], + ) + def test_exists_content_map_and_content(self, file_name, expected): + file_system = self.file_system_cls( + 
contents=""" + /path/to/mypackage/ + readme.txt + foo/ + one.txt + two/ + green.txt + """, + content_map={ + "/path/to/file.txt": "hello", + "/path/to/a/deeper/file.txt": "hello", + }, + ) + + assert file_system.exists(file_name) == expected + + @pytest.mark.parametrize( + "file_name, expected_contents", + ( + ("/path/to/mypackage/readme.txt", "Lorem"), + ("/path/to/mypackage/foo/one.txt", "Ipsum"), + # Listed in contents, but not in content_map. + ("/path/to/mypackage/foo/two/green.txt", ""), + # Listed in content_map, but not in contents. + ("/path/to/mypackage/foo/two/blue.txt", "Dolor sic"), + ( + "/path/to/mypackage/indented.txt", + "This is indented text.\n We should dedent it.", + ), + ("/path/to/mypackage/nonexistent", FileNotFoundError), + ("/path/to/mypackage/foo/three/nonexistent", FileNotFoundError), + # TODO - should we raise an exception if we attempt to read a directory? + ), + ) + def test_read(self, file_name, expected_contents): + file_system = self.file_system_cls( + contents=""" + /path/to/mypackage/ + readme.txt + foo/ + one.txt + two/ + green.txt + """, + content_map={ + "/path/to/mypackage/readme.txt": "Lorem", + "/path/to/mypackage/foo/one.txt": "Ipsum", + "/path/to/mypackage/foo/two/blue.txt": "Dolor sic", + "/path/to/mypackage/indented.txt": """ + This is indented text. + We should dedent it. + """, + }, + ) + if isinstance(expected_contents, type) and issubclass(expected_contents, Exception): + with pytest.raises(expected_contents): + file_system.read(file_name) + else: + assert file_system.read(file_name) == expected_contents + +class TestFakeFileSystem(_Base): + file_system_cls = FakeFileSystem -class TestFakeFileSystem: def test_walk(self): - file_system = FakeFileSystem( + file_system = self.file_system_cls( """ /path/to/mypackage/ __init__.py @@ -29,7 +164,7 @@ def test_walk(self): ] == list(file_system.walk("/path/to/mypackage")) def test_empty_if_directory_does_not_exist(self): - file_system = FakeFileSystem( + file_system = self.file_system_cls( """ /path/to/mypackage/ __init__.py @@ -38,18 +173,9 @@ def test_empty_if_directory_does_not_exist(self): assert [] == list(file_system.walk("/path/to/nonexistent/package")) def test_dirname(self): - file_system = FakeFileSystem() + file_system = self.file_system_cls() assert "/path/to" == file_system.dirname("/path/to/file.txt") - @pytest.mark.parametrize("path", ("/path/to", "/path/to/")) - def test_join(self, path): - file_system = FakeFileSystem() - assert "/path/to/mypackage/file.py" == file_system.join(path, "mypackage", "file.py") - - def test_split(self): - file_system = FakeFileSystem() - assert ("/path/to/mypackage", "file.py") == file_system.split("/path/to/mypackage/file.py") - def test_dirnames_can_be_modified_in_place(self): """ From the os.walk docs: @@ -59,7 +185,7 @@ def test_dirnames_can_be_modified_in_place(self): of visiting, or even to inform walk() about directories the caller creates or renames before it resumes walk() again. """ - file_system = FakeFileSystem( + file_system = self.file_system_cls( """ /path/to/mypackage/ foo/ @@ -90,38 +216,6 @@ def test_dirnames_can_be_modified_in_place(self): assert expected_tuples == actual_tuples - @pytest.mark.parametrize( - "file_name, expected_contents", - ( - ("/path/to/mypackage/readme.txt", "Lorem"), - ("/path/to/mypackage/foo/one.txt", "Ipsum"), - # Listed in contents, but not in content_map. - ("/path/to/mypackage/foo/two/green.txt", ""), - # Listed in content_map, but not in contents. 
- ("/path/to/mypackage/foo/two/blue.txt", "Dolor sic"), - ("/path/to/mypackage/nonexistent", FileNotFoundError), - ("/path/to/mypackage/foo/three/nonexistent", FileNotFoundError), - # TODO - should we raise an exception if we attempt to read a directory? - ), - ) - def test_read(self, file_name, expected_contents): - file_system = FakeFileSystem( - contents=""" - /path/to/mypackage/ - readme.txt - foo/ - one.txt - two/ - green.txt - """, - content_map={ - "/path/to/mypackage/readme.txt": "Lorem", - "/path/to/mypackage/foo/one.txt": "Ipsum", - "/path/to/mypackage/foo/two/blue.txt": "Dolor sic", - }, - ) - if isinstance(expected_contents, type) and issubclass(expected_contents, Exception): - with pytest.raises(expected_contents): - file_system.read(file_name) - else: - assert expected_contents == file_system.read(file_name) + +class TestFakeBasicFileSystem(_Base): + file_system_cls = rust.FakeBasicFileSystem From 75c6e9dd47d48766cfbee07a86bd1ed0f6a5ee2c Mon Sep 17 00:00:00 2001 From: David Seddon Date: Tue, 10 Jun 2025 12:47:08 +0100 Subject: [PATCH 05/15] Use FakeBasicFileSystem in unit test --- tests/unit/adaptors/test_importscanner.py | 27 ++++++++++++----------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/tests/unit/adaptors/test_importscanner.py b/tests/unit/adaptors/test_importscanner.py index f4d1795f..ee71873e 100644 --- a/tests/unit/adaptors/test_importscanner.py +++ b/tests/unit/adaptors/test_importscanner.py @@ -6,7 +6,8 @@ from grimp.adaptors.importscanner import ImportScanner from grimp.application.ports.modulefinder import FoundPackage, ModuleFile from grimp.domain.valueobjects import DirectImport, Module -from tests.adaptors.filesystem import FakeFileSystem + +from grimp import _rustgrimp as rust # type: ignore[attr-defined] @pytest.mark.parametrize( @@ -50,7 +51,7 @@ ) def test_absolute_imports(include_external_packages, expected_result): all_modules = {Module("foo.one"), Module("foo.two")} - file_system = FakeFileSystem( + file_system = rust.FakeBasicFileSystem( content_map={ "/path/to/foo/one.py": """ import foo.two @@ -81,7 +82,7 @@ def test_absolute_imports(include_external_packages, expected_result): def test_non_ascii(): blue_module = Module("mypackage.blue") non_ascii_modules = {Module("mypackage.ñon_ascii_变"), Module("mypackage.ñon_ascii_变.ラーメン")} - file_system = FakeFileSystem( + file_system = rust.FakeBasicFileSystem( content_map={ "/path/to/mypackage/blue.py": """ from ñon_ascii_变 import * @@ -152,7 +153,7 @@ def test_single_namespace_package_portion(): MODULE_ALPHA, } - file_system = FakeFileSystem( + file_system = rust.FakeBasicFileSystem( content_map={ "/path/to/namespace/foo/one.py": """ import namespace.foo.two @@ -235,7 +236,7 @@ def test_import_of_portion_not_in_graph(include_external_packages): MODULE_FOO_ONE, MODULE_FOO_BLUE, } - file_system = FakeFileSystem( + file_system = rust.FakeBasicFileSystem( content_map={ "/path/to/namespace/foo/one.py": """ import namespace.bar.one.orange @@ -415,7 +416,7 @@ def test_absolute_from_imports(include_external_packages, expected_result): Module("foo.two.yellow"), Module("foo.three"), } - file_system = FakeFileSystem( + file_system = rust.FakeBasicFileSystem( contents=""" /path/to/foo/ __init__.py @@ -467,7 +468,7 @@ def test_relative_from_imports(): Module("foo.two.yellow"), Module("foo.three"), } - file_system = FakeFileSystem( + file_system = rust.FakeBasicFileSystem( contents=""" /path/to/foo/ __init__.py @@ -537,7 +538,7 @@ def test_trims_to_known_modules(import_source): Module("foo.two"), 
Module("foo.two.yellow"), } - file_system = FakeFileSystem( + file_system = rust.FakeBasicFileSystem( contents=""" /path/to/foo/ __init__.py @@ -580,7 +581,7 @@ def test_trims_to_known_modules_within_init_file(): Module("foo.one.blue"), Module("foo.one.blue.alpha"), } - file_system = FakeFileSystem( + file_system = rust.FakeBasicFileSystem( contents=""" /path/to/foo/ __init__.py @@ -633,7 +634,7 @@ def test_trims_to_known_modules_within_init_file(): def test_trims_whitespace_from_start_of_line_contents(): all_modules = {Module("foo"), Module("foo.one"), Module("foo.two")} - file_system = FakeFileSystem( + file_system = rust.FakeBasicFileSystem( contents=""" /path/to/foo/ __init__.py @@ -706,7 +707,7 @@ def my_function(): def test_external_package_imports_for_namespace_packages(statement, expected_module_name): module_to_scan = Module("namespace.foo.blue.alpha") - file_system = FakeFileSystem( + file_system = rust.FakeBasicFileSystem( content_map={ "/path/to/namespace/foo/blue/alpha.py": statement, } @@ -751,7 +752,7 @@ def test_external_package_imports_for_namespace_packages(statement, expected_mod def test_scans_multiple_packages(statement): foo_modules = {Module("foo"), Module("foo.one"), Module("foo.two")} bar_modules = {Module("bar"), Module("bar.green"), Module("bar.blue")} - file_system = FakeFileSystem( + file_system = rust.FakeBasicFileSystem( content_map={ "/path/to/foo/one.py": f""" import foo.two @@ -819,7 +820,7 @@ def test_exclude_type_checking_imports( Module("foo.four"), Module("foo.five"), } - file_system = FakeFileSystem( + file_system = rust.FakeBasicFileSystem( content_map={ "/path/to/foo/one.py": f""" import foo.two From 96392323fedfc1ad96e7ad38d324c14bdf3a3823 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Tue, 10 Jun 2025 14:29:16 +0100 Subject: [PATCH 06/15] Pass BasicFileSystem to ImportScanner from use case --- src/grimp/adaptors/filesystem.py | 5 ++++- src/grimp/application/ports/filesystem.py | 8 ++++++++ src/grimp/application/usecases.py | 7 ++++--- tests/adaptors/filesystem.py | 14 +++++++++++++- 4 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/grimp/adaptors/filesystem.py b/src/grimp/adaptors/filesystem.py index 9f637982..b69abe19 100644 --- a/src/grimp/adaptors/filesystem.py +++ b/src/grimp/adaptors/filesystem.py @@ -2,7 +2,7 @@ import tokenize from typing import Iterator, List, Tuple -from grimp.application.ports.filesystem import AbstractFileSystem +from grimp.application.ports.filesystem import AbstractFileSystem, BasicFileSystem class FileSystem(AbstractFileSystem): @@ -44,3 +44,6 @@ def write(self, file_name: str, contents: str) -> None: os.makedirs(dirname) with open(file_name, "w") as file: print(contents, file=file) + + def convert_to_basic(self) -> BasicFileSystem: + return self # FileSystem already corresponds to the BasicFileSystem protocol. diff --git a/src/grimp/application/ports/filesystem.py b/src/grimp/application/ports/filesystem.py index d2966255..bfe129a2 100644 --- a/src/grimp/application/ports/filesystem.py +++ b/src/grimp/application/ports/filesystem.py @@ -1,3 +1,4 @@ +from __future__ import annotations import abc from typing import Iterator, List, Tuple from typing import Protocol @@ -82,6 +83,13 @@ def write(self, file_name: str, contents: str) -> None: """ raise NotImplementedError + @abc.abstractmethod + def convert_to_basic(self) -> BasicFileSystem: + """ + Convert this file system to a BasicFileSystem. 
+ """ + raise NotImplementedError + class BasicFileSystem(Protocol): """ diff --git a/src/grimp/application/usecases.py b/src/grimp/application/usecases.py index eee3286c..f403e344 100644 --- a/src/grimp/application/usecases.py +++ b/src/grimp/application/usecases.py @@ -7,6 +7,7 @@ import joblib # type: ignore +from .ports.filesystem import BasicFileSystem from ..application.ports import caching from ..application.ports.filesystem import AbstractFileSystem from ..application.ports.graph import ImportGraph @@ -145,7 +146,7 @@ def _scan_packages( imports_by_module_file.update( _scan_imports( remaining_module_files_to_scan, - file_system=file_system, + file_system=file_system.convert_to_basic(), found_packages=found_packages, include_external_packages=include_external_packages, exclude_type_checking_imports=exclude_type_checking_imports, @@ -213,7 +214,7 @@ def _read_imports_from_cache( def _scan_imports( module_files: Collection[ModuleFile], *, - file_system: AbstractFileSystem, + file_system: BasicFileSystem, found_packages: Set[FoundPackage], include_external_packages: bool, exclude_type_checking_imports: bool, @@ -258,7 +259,7 @@ def _decide_number_of_processes(number_of_module_files: int) -> int: def _scan_chunks( chunks: Collection[Collection[ModuleFile]], - file_system: AbstractFileSystem, + file_system: BasicFileSystem, found_packages: Set[FoundPackage], include_external_packages: bool, exclude_type_checking_imports: bool, diff --git a/tests/adaptors/filesystem.py b/tests/adaptors/filesystem.py index 6eac8cca..74854da0 100644 --- a/tests/adaptors/filesystem.py +++ b/tests/adaptors/filesystem.py @@ -1,8 +1,10 @@ from typing import Any, Dict, Generator, List, Optional, Tuple + import yaml -from grimp.application.ports.filesystem import AbstractFileSystem +from grimp.application.ports.filesystem import AbstractFileSystem, BasicFileSystem +from grimp import _rustgrimp as rust # type: ignore[attr-defined] DEFAULT_MTIME = 10000.0 @@ -43,6 +45,7 @@ def __init__( i.e. last modified times. """ self.contents = self._parse_contents(contents) + self._raw_contents = contents self.content_map = content_map if content_map else {} self.mtime_map: Dict[str, float] = mtime_map if mtime_map else {} @@ -186,3 +189,12 @@ def get_mtime(self, file_name: str) -> float: def write(self, file_name: str, contents: str) -> None: self.content_map[file_name] = contents self.mtime_map[file_name] = DEFAULT_MTIME + + def convert_to_basic(self) -> BasicFileSystem: + """ + Convert this file system to a BasicFileSystem. 
+ """ + return rust.FakeBasicFileSystem( + contents=self._raw_contents, + content_map=self.content_map, + ) From 1e5d688974653dcec5e441c03cea9ccae460933f Mon Sep 17 00:00:00 2001 From: David Seddon Date: Tue, 10 Jun 2025 15:22:31 +0100 Subject: [PATCH 07/15] Implement RealBasicFileSystem --- rust/Cargo.lock | 10 ++++ rust/Cargo.toml | 1 + rust/src/filesystem.rs | 109 ++++++++++++++++++++++++++++++++++++++++- rust/src/lib.rs | 3 +- 4 files changed, 121 insertions(+), 2 deletions(-) diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 9a2b2796..d5c457f5 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -9,6 +9,7 @@ dependencies = [ "bimap", "const_format", "derive-new", + "encoding_rs", "getset", "indexmap 2.9.0", "itertools 0.14.0", @@ -137,6 +138,15 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + [[package]] name = "equivalent" version = "1.0.2" diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 845daeb7..1b346c8a 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -28,6 +28,7 @@ ruff_source_file = { git = "https://github.com/astral-sh/ruff.git", tag = "v0.4. serde = { version = "1.0", features = ["derive"] } serde_yaml = "0.9" unindent = "0.2.4" +encoding_rs = "0.8.35" [dependencies.pyo3] version = "0.24.1" diff --git a/rust/src/filesystem.rs b/rust/src/filesystem.rs index 34332a8d..4f7c6273 100644 --- a/rust/src/filesystem.rs +++ b/rust/src/filesystem.rs @@ -1,8 +1,115 @@ -use pyo3::exceptions::PyFileNotFoundError; +use pyo3::exceptions::{PyFileNotFoundError, PyUnicodeDecodeError}; use pyo3::prelude::*; +use regex::Regex; use std::collections::HashMap; +use std::fs; +use std::path::{Path, PathBuf}; use unindent::unindent; +// Implements a BasicFileSystem (defined in grimp.application.ports.filesystem.BasicFileSystem) +// that actually reads files. +#[pyclass] +pub struct RealBasicFileSystem {} + +#[pymethods] +impl RealBasicFileSystem { + #[new] + fn new() -> Self { + RealBasicFileSystem {} + } + + #[getter] + fn sep(&self) -> String { + std::path::MAIN_SEPARATOR.to_string() + } + + #[pyo3(signature = (*components))] + fn join(&self, components: Vec) -> String { + let mut path = PathBuf::new(); + for component in components { + path.push(component); + } + path.to_str().unwrap().to_string() + } + + fn split(&self, file_name: &str) -> (String, String) { + let path = Path::new(file_name); + + // Get the "tail" part (the file name or last directory) + let tail = match path.file_name() { + Some(name) => PathBuf::from(name), + None => PathBuf::new(), // If there's no file name (e.g., path is a root), return empty + }; + + // Get the "head" part (the parent directory) + let head = match path.parent() { + Some(parent_path) => parent_path.to_path_buf(), + None => PathBuf::new(), // If there's no parent (e.g., just a filename), return empty + }; + + ( + head.to_str().unwrap().to_string(), + tail.to_str().unwrap().to_string(), + ) + } + + fn exists(&self, file_name: &str) -> bool { + Path::new(file_name).is_file() + } + + fn read(&self, file_name: &str) -> PyResult { + // Python files are assumed UTF-8 by default (PEP 686), but they can specify an alternative + // encoding, which we need to take into account here. 
+ // See https://peps.python.org/pep-0263/ + + // This method was authored primarily by an LLM. + + let path = Path::new(file_name); + let bytes = fs::read(path).map_err(|e| { + PyFileNotFoundError::new_err(format!("Failed to read file {file_name}: {e}")) + })?; + + let s = String::from_utf8_lossy(&bytes); + let encoding_re = Regex::new(r"^[ \t\f]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)").unwrap(); + + let mut detected_encoding: Option = None; + + // Coding specification needs to be in the first two lines, or it's ignored. + for line in s.lines().take(2) { + if let Some(captures) = encoding_re.captures(line) { + if let Some(encoding_name) = captures.get(1) { + detected_encoding = Some(encoding_name.as_str().to_string()); + break; + } + } + } + + if let Some(enc_name) = detected_encoding { + let encoding = + encoding_rs::Encoding::for_label(enc_name.as_bytes()).ok_or_else(|| { + PyUnicodeDecodeError::new_err(format!( + "Failed to decode file {file_name} (unknown encoding '{enc_name}')" + )) + })?; + let (decoded_s, _, had_errors) = encoding.decode(&bytes); + if had_errors { + Err(PyUnicodeDecodeError::new_err(format!( + "Failed to decode file {file_name} with encoding '{enc_name}'" + ))) + } else { + Ok(decoded_s.into_owned()) + } + } else { + // Default to UTF-8 if no encoding is specified + String::from_utf8(bytes).map_err(|e| { + PyUnicodeDecodeError::new_err(format!( + "Failed to decode file {file_name} as UTF-8: {e}" + )) + }) + } + } +} + type FileSystemContents = HashMap; // Implements BasicFileSystem (defined in grimp.application.ports.filesystem.BasicFileSystem). diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 502e5fdf..31757ac6 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -19,12 +19,13 @@ use pyo3::types::{IntoPyDict, PyDict, PyFrozenSet, PyList, PySet, PyString, PyTu use rayon::prelude::*; use rustc_hash::FxHashSet; use std::collections::HashSet; -use crate::filesystem::FakeBasicFileSystem; +use crate::filesystem::{RealBasicFileSystem,FakeBasicFileSystem}; #[pymodule] fn _rustgrimp(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(parse_imported_objects_from_code))?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add("ModuleNotPresent", py.get_type::())?; m.add("NoSuchContainer", py.get_type::())?; From 1475c553fdbb993afc316e40a1e111c6b56b0b55 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Mon, 23 Jun 2025 08:34:25 +0100 Subject: [PATCH 08/15] Instantiate ImportScanner from within _scan_chunk This means we don't need to pickle the FileSystem - making it possible to use the Rust-based file system classes while doing multiprocessing. 
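joblib's process-based backend serialises the arguments of each delayed call, so anything passed to a worker must be picklable; by constructing the scanner (and its file system) inside the worker, only plain data - the chunk of module files - crosses the process boundary. A standalone sketch of the pattern, using illustrative names rather than the exact grimp code:

    import joblib


    def scan_chunk(chunk):
        # Build the scanner inside the worker process; only the chunk of module
        # names (plain data) is sent to the worker.
        scanner = build_scanner()
        return {name: scanner(name) for name in chunk}


    def build_scanner():
        # Stand-in for settings.IMPORT_SCANNER_CLASS(file_system=..., ...).
        return lambda name: {f"{name} imports something"}


    if __name__ == "__main__":
        chunks = [["foo.one", "foo.two"], ["foo.three"]]
        results = joblib.Parallel(n_jobs=2)(
            joblib.delayed(scan_chunk)(chunk) for chunk in chunks
        )
        imports_by_module = {}
        for chunk_result in results:
            imports_by_module.update(chunk_result)
        print(imports_by_module)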
--- src/grimp/application/usecases.py | 40 +++++++++++++++---------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/src/grimp/application/usecases.py b/src/grimp/application/usecases.py index f403e344..71e97e65 100644 --- a/src/grimp/application/usecases.py +++ b/src/grimp/application/usecases.py @@ -146,7 +146,6 @@ def _scan_packages( imports_by_module_file.update( _scan_imports( remaining_module_files_to_scan, - file_system=file_system.convert_to_basic(), found_packages=found_packages, include_external_packages=include_external_packages, exclude_type_checking_imports=exclude_type_checking_imports, @@ -214,7 +213,6 @@ def _read_imports_from_cache( def _scan_imports( module_files: Collection[ModuleFile], *, - file_system: BasicFileSystem, found_packages: Set[FoundPackage], include_external_packages: bool, exclude_type_checking_imports: bool, @@ -222,7 +220,6 @@ def _scan_imports( chunks = _create_chunks(module_files) return _scan_chunks( chunks, - file_system, found_packages, include_external_packages, exclude_type_checking_imports, @@ -257,22 +254,36 @@ def _decide_number_of_processes(number_of_module_files: int) -> int: return min(joblib.cpu_count(), number_of_module_files) -def _scan_chunks( - chunks: Collection[Collection[ModuleFile]], - file_system: BasicFileSystem, +def _scan_chunk( found_packages: Set[FoundPackage], include_external_packages: bool, exclude_type_checking_imports: bool, + chunk: Iterable[ModuleFile], ) -> Dict[ModuleFile, Set[DirectImport]]: + file_system: AbstractFileSystem = settings.FILE_SYSTEM + basic_file_system = file_system.convert_to_basic() import_scanner: AbstractImportScanner = settings.IMPORT_SCANNER_CLASS( - file_system=file_system, + file_system=basic_file_system, found_packages=found_packages, include_external_packages=include_external_packages, ) + return { + module_file: import_scanner.scan_for_imports( + module_file.module, exclude_type_checking_imports=exclude_type_checking_imports + ) + for module_file in chunk + } + +def _scan_chunks( + chunks: Collection[Collection[ModuleFile]], + found_packages: Set[FoundPackage], + include_external_packages: bool, + exclude_type_checking_imports: bool, +) -> Dict[ModuleFile, Set[DirectImport]]: number_of_processes = len(chunks) import_scanning_jobs = joblib.Parallel(n_jobs=number_of_processes)( - joblib.delayed(_scan_chunk)(import_scanner, exclude_type_checking_imports, chunk) + joblib.delayed(_scan_chunk)(found_packages, include_external_packages, exclude_type_checking_imports, chunk) for chunk in chunks ) @@ -280,16 +291,3 @@ def _scan_chunks( for chunk_imports_by_module_file in import_scanning_jobs: imports_by_module_file.update(chunk_imports_by_module_file) return imports_by_module_file - - -def _scan_chunk( - import_scanner: AbstractImportScanner, - exclude_type_checking_imports: bool, - chunk: Iterable[ModuleFile], -) -> Dict[ModuleFile, Set[DirectImport]]: - return { - module_file: import_scanner.scan_for_imports( - module_file.module, exclude_type_checking_imports=exclude_type_checking_imports - ) - for module_file in chunk - } From 6c57ea1545ec9379f93ea79e2108935b18988220 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Wed, 11 Jun 2025 17:18:07 +0100 Subject: [PATCH 09/15] Handle include_external_packages=None --- src/grimp/application/usecases.py | 10 +++++--- tests/unit/application/test_usecases.py | 34 +++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/src/grimp/application/usecases.py b/src/grimp/application/usecases.py index 
71e97e65..35e830f3 100644 --- a/src/grimp/application/usecases.py +++ b/src/grimp/application/usecases.py @@ -7,7 +7,6 @@ import joblib # type: ignore -from .ports.filesystem import BasicFileSystem from ..application.ports import caching from ..application.ports.filesystem import AbstractFileSystem from ..application.ports.graph import ImportGraph @@ -60,7 +59,6 @@ def build_graph( "mypackage", "anotherpackage", "onemore", include_external_packages=True, ) """ - file_system: AbstractFileSystem = settings.FILE_SYSTEM found_packages = _find_packages( @@ -265,7 +263,9 @@ def _scan_chunk( import_scanner: AbstractImportScanner = settings.IMPORT_SCANNER_CLASS( file_system=basic_file_system, found_packages=found_packages, - include_external_packages=include_external_packages, + # Ensure that the passed exclude_type_checking_imports is definitely a boolean, + # otherwise the Rust class will error. + include_external_packages=bool(include_external_packages), ) return { module_file: import_scanner.scan_for_imports( @@ -283,7 +283,9 @@ def _scan_chunks( ) -> Dict[ModuleFile, Set[DirectImport]]: number_of_processes = len(chunks) import_scanning_jobs = joblib.Parallel(n_jobs=number_of_processes)( - joblib.delayed(_scan_chunk)(found_packages, include_external_packages, exclude_type_checking_imports, chunk) + joblib.delayed(_scan_chunk)( + found_packages, include_external_packages, exclude_type_checking_imports, chunk + ) for chunk in chunks ) diff --git a/tests/unit/application/test_usecases.py b/tests/unit/application/test_usecases.py index a8f14207..0b76e98e 100644 --- a/tests/unit/application/test_usecases.py +++ b/tests/unit/application/test_usecases.py @@ -208,3 +208,37 @@ class FakeModuleFinder(BaseFakeModuleFinder): [call] = mock_scan_chunks.call_args_list chunks = call.args[0] assert len(chunks) == expected_number_of_chunks + + def test_forgives_wrong_type_being_passed_to_include_external_packages(self): + file_system = FakeFileSystem( + contents=""" + /path/to/mypackage/ + __init__.py + foo/ + __init__.py + one.py + """, + content_map={ + "/path/to/mypackage/foo/one.py": ( + "import mypackage.foo.two.green\nfrom .. import Something" + ), + }, + ) + + class FakePackageFinder(BaseFakePackageFinder): + directory_map = {"mypackage": "/path/to/mypackage"} + + with override_settings(FILE_SYSTEM=file_system, PACKAGE_FINDER=FakePackageFinder()): + graph = usecases.build_graph( + "mypackage", + # Note: this should be a bool, but we want to tolerate it, + # as Import Linter currently has a bug where it will pass it as None. 
+ include_external_packages=None, + ) + + expected_modules = { + "mypackage", + "mypackage.foo", + "mypackage.foo.one", + } + assert expected_modules == graph.modules From 067664002b7f437d1745920c402d732ec3d6b586 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Mon, 23 Jun 2025 08:37:27 +0100 Subject: [PATCH 10/15] Use RealBasicFileSystem in prod --- src/grimp/adaptors/filesystem.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/grimp/adaptors/filesystem.py b/src/grimp/adaptors/filesystem.py index b69abe19..83b54436 100644 --- a/src/grimp/adaptors/filesystem.py +++ b/src/grimp/adaptors/filesystem.py @@ -3,6 +3,7 @@ from typing import Iterator, List, Tuple from grimp.application.ports.filesystem import AbstractFileSystem, BasicFileSystem +from grimp import _rustgrimp as rust # type: ignore[attr-defined] class FileSystem(AbstractFileSystem): @@ -46,4 +47,4 @@ def write(self, file_name: str, contents: str) -> None: print(contents, file=file) def convert_to_basic(self) -> BasicFileSystem: - return self # FileSystem already corresponds to the BasicFileSystem protocol. + return rust.RealBasicFileSystem() From a9f62102b1481b6a91e21b9ff9424b09a9143315 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Tue, 10 Jun 2025 15:44:19 +0100 Subject: [PATCH 11/15] Make AbstractImportScanner a Protocol --- src/grimp/adaptors/importscanner.py | 29 +++++++++++++++++--- src/grimp/application/ports/importscanner.py | 15 ++-------- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/src/grimp/adaptors/importscanner.py b/src/grimp/adaptors/importscanner.py index a405facc..50239930 100644 --- a/src/grimp/adaptors/importscanner.py +++ b/src/grimp/adaptors/importscanner.py @@ -6,8 +6,8 @@ from typing import Dict, Optional, Set from grimp import exceptions -from grimp.application.ports.importscanner import AbstractImportScanner from grimp.application.ports.modulefinder import FoundPackage +from grimp.application.ports.filesystem import BasicFileSystem from grimp.domain.valueobjects import DirectImport, Module from grimp import _rustgrimp as rust # type: ignore[attr-defined] @@ -25,9 +25,30 @@ class _ImportedObject: typechecking_only: bool -class ImportScanner(AbstractImportScanner): - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) +class ImportScanner: + def __init__( + self, + file_system: BasicFileSystem, + found_packages: Set[FoundPackage], + include_external_packages: bool = False, + ) -> None: + """ + Args: + - found_packages: Set of FoundPackages containing all the modules + for analysis. + - file_system: The file system interface to use. + - include_external_packages: Whether to include imports of external modules (i.e. + modules not contained in modules_by_package_directory) + in the results. + """ + self.file_system = file_system + self.include_external_packages = include_external_packages + self.found_packages = found_packages + + # Flatten all the modules into a set. 
+ self.modules: Set[Module] = set() + for package in self.found_packages: + self.modules |= {mf.module for mf in package.module_files} self._found_packages_by_module: Dict[Module, FoundPackage] = { module_file.module: package diff --git a/src/grimp/application/ports/importscanner.py b/src/grimp/application/ports/importscanner.py index 26df8aa3..82cb21bc 100644 --- a/src/grimp/application/ports/importscanner.py +++ b/src/grimp/application/ports/importscanner.py @@ -1,12 +1,11 @@ -import abc -from typing import Set +from typing import Set, Protocol from grimp.application.ports.filesystem import BasicFileSystem from grimp.application.ports.modulefinder import FoundPackage from grimp.domain.valueobjects import DirectImport, Module -class AbstractImportScanner(abc.ABC): +class AbstractImportScanner(Protocol): """ Statically analyses some Python modules for import statements within their shared package. """ @@ -26,16 +25,7 @@ def __init__( modules not contained in modules_by_package_directory) in the results. """ - self.file_system = file_system - self.include_external_packages = include_external_packages - self.found_packages = found_packages - # Flatten all the modules into a set. - self.modules: Set[Module] = set() - for package in self.found_packages: - self.modules |= {mf.module for mf in package.module_files} - - @abc.abstractmethod def scan_for_imports( self, module: Module, *, exclude_type_checking_imports: bool = False ) -> Set[DirectImport]: @@ -43,4 +33,3 @@ def scan_for_imports( Statically analyses the given module and returns an iterable of Modules that it imports. """ - raise NotImplementedError From f9fbfa5965b32dbd597885d6c693f8cedd0e6615 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Fri, 27 Jun 2025 17:07:19 +0100 Subject: [PATCH 12/15] Push down Rust-specific file systems with Python wrapper classes RealBasicFileSystem is renamed to PyRealBasicFileSystem, likewise with FakeBasicFileSystem. These then wrap inner Rust structs. We do this so we can box a file system in a Rust-based ImportScanner class, and then interact with it polymorphically. (See an upcoming commit.) --- rust/src/filesystem.rs | 116 ++++++++++++++++++++++++++++++++++------- rust/src/lib.rs | 6 +-- 2 files changed, 100 insertions(+), 22 deletions(-) diff --git a/rust/src/filesystem.rs b/rust/src/filesystem.rs index 4f7c6273..8d9c7848 100644 --- a/rust/src/filesystem.rs +++ b/rust/src/filesystem.rs @@ -6,24 +6,34 @@ use std::fs; use std::path::{Path, PathBuf}; use unindent::unindent; -// Implements a BasicFileSystem (defined in grimp.application.ports.filesystem.BasicFileSystem) -// that actually reads files. +pub trait FileSystem: Send + Sync { + fn sep(&self) -> String; + + fn join(&self, components: Vec) -> String; + + fn split(&self, file_name: &str) -> (String, String); + + fn exists(&self, file_name: &str) -> bool; + + fn read(&self, file_name: &str) -> PyResult; +} + +#[derive(Clone)] #[pyclass] pub struct RealBasicFileSystem {} -#[pymethods] -impl RealBasicFileSystem { - #[new] - fn new() -> Self { - RealBasicFileSystem {} - } +// Implements a BasicFileSystem (defined in grimp.application.ports.filesystem.BasicFileSystem) +// that actually reads files. 
+#[pyclass(name = "RealBasicFileSystem")] +pub struct PyRealBasicFileSystem { + pub inner: RealBasicFileSystem, +} - #[getter] +impl FileSystem for RealBasicFileSystem { fn sep(&self) -> String { std::path::MAIN_SEPARATOR.to_string() } - #[pyo3(signature = (*components))] fn join(&self, components: Vec) -> String { let mut path = PathBuf::new(); for component in components { @@ -110,18 +120,52 @@ impl RealBasicFileSystem { } } +#[pymethods] +impl PyRealBasicFileSystem { + #[new] + fn new() -> Self { + PyRealBasicFileSystem { + inner: RealBasicFileSystem {}, + } + } + + #[getter] + fn sep(&self) -> String { + self.inner.sep() + } + + #[pyo3(signature = (*components))] + fn join(&self, components: Vec) -> String { + self.inner.join(components) + } + + fn split(&self, file_name: &str) -> (String, String) { + self.inner.split(file_name) + } + + fn exists(&self, file_name: &str) -> bool { + self.inner.exists(file_name) + } + + fn read(&self, file_name: &str) -> PyResult { + self.inner.read(file_name) + } +} + type FileSystemContents = HashMap; -// Implements BasicFileSystem (defined in grimp.application.ports.filesystem.BasicFileSystem). -#[pyclass] +#[derive(Clone)] pub struct FakeBasicFileSystem { - contents: FileSystemContents, + contents: Box, +} + +// Implements BasicFileSystem (defined in grimp.application.ports.filesystem.BasicFileSystem). +#[pyclass(name = "FakeBasicFileSystem")] +pub struct PyFakeBasicFileSystem { + pub inner: FakeBasicFileSystem, } -#[pymethods] impl FakeBasicFileSystem { - #[pyo3(signature = (contents=None, content_map=None))] - #[new] fn new(contents: Option<&str>, content_map: Option>) -> PyResult { let mut parsed_contents = match contents { Some(contents) => parse_indented_file_system_string(contents), @@ -135,16 +179,16 @@ impl FakeBasicFileSystem { parsed_contents.extend(unindented_map); }; Ok(FakeBasicFileSystem { - contents: parsed_contents, + contents: Box::new(parsed_contents), }) } +} - #[getter] +impl FileSystem for FakeBasicFileSystem { fn sep(&self) -> String { "/".to_string() } - #[pyo3(signature = (*components))] fn join(&self, components: Vec) -> String { let sep = self.sep(); components @@ -197,6 +241,40 @@ impl FakeBasicFileSystem { } } +#[pymethods] +impl PyFakeBasicFileSystem { + #[pyo3(signature = (contents=None, content_map=None))] + #[new] + fn new(contents: Option<&str>, content_map: Option>) -> PyResult { + Ok(PyFakeBasicFileSystem { + inner: FakeBasicFileSystem::new(contents, content_map)?, + }) + } + + #[getter] + fn sep(&self) -> String { + self.inner.sep() + } + + #[pyo3(signature = (*components))] + fn join(&self, components: Vec) -> String { + self.inner.join(components) + } + + fn split(&self, file_name: &str) -> (String, String) { + self.inner.split(file_name) + } + + /// Checks if a file or directory exists within the file system. + fn exists(&self, file_name: &str) -> bool { + self.inner.exists(file_name) + } + + fn read(&self, file_name: &str) -> PyResult { + self.inner.read(file_name) + } +} + /// Parses an indented string representing a file system structure /// into a HashMap where keys are full file paths. /// See tests.adaptors.filesystem.FakeFileSystem for the API. 
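From the Python side the rename is invisible: the #[pyclass(name = ...)] attributes keep exporting RealBasicFileSystem and FakeBasicFileSystem, and both continue to satisfy the BasicFileSystem protocol, so they remain interchangeable wherever one is expected. A small usage sketch under that assumption (the describe helper is purely illustrative):

    from grimp import _rustgrimp as rust
    from grimp.application.ports.filesystem import BasicFileSystem


    def describe(file_system: BasicFileSystem, path: str) -> str:
        # Works with either implementation: both expose join, split, read and exists.
        head, tail = file_system.split(path)
        status = "exists" if file_system.exists(path) else "missing"
        return f"{tail} in {head}: {status}"


    fake: BasicFileSystem = rust.FakeBasicFileSystem(
        content_map={"/path/to/foo/one.py": "import foo.two"}
    )
    real: BasicFileSystem = rust.RealBasicFileSystem()

    print(describe(fake, "/path/to/foo/one.py"))  # one.py in /path/to/foo: exists
    print(describe(real, "/no/such/file.py"))     # file.py in /no/such: missing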
diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 31757ac6..221703fe 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -19,14 +19,14 @@ use pyo3::types::{IntoPyDict, PyDict, PyFrozenSet, PyList, PySet, PyString, PyTu use rayon::prelude::*; use rustc_hash::FxHashSet; use std::collections::HashSet; -use crate::filesystem::{RealBasicFileSystem,FakeBasicFileSystem}; +use crate::filesystem::{PyRealBasicFileSystem,PyFakeBasicFileSystem}; #[pymodule] fn _rustgrimp(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(parse_imported_objects_from_code))?; m.add_class::()?; - m.add_class::()?; - m.add_class::()?; + m.add_class::()?; + m.add_class::()?; m.add("ModuleNotPresent", py.get_type::())?; m.add("NoSuchContainer", py.get_type::())?; m.add( From 3bbefa5fcbcbfdc402e5bbcc36dfef57749c2797 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Fri, 27 Jun 2025 17:16:46 +0100 Subject: [PATCH 13/15] Define some Rust equivalents for module finding data structures --- rust/src/module_finding.rs | 60 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 rust/src/module_finding.rs diff --git a/rust/src/module_finding.rs b/rust/src/module_finding.rs new file mode 100644 index 00000000..4034b265 --- /dev/null +++ b/rust/src/module_finding.rs @@ -0,0 +1,60 @@ +use pyo3::{prelude::*, types::PyFrozenSet}; +use std::collections::BTreeSet; +use std::fmt; + +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, FromPyObject)] +pub struct ModuleFile { + pub module: Module, +} + +/// A Python module. +/// The string is the importable name, e.g. "mypackage.foo". +#[derive(Debug, Clone, PartialEq, PartialOrd, Ord, Eq, Hash, FromPyObject)] +pub struct Module { + pub name: String, +} + +impl fmt::Display for Module { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.name) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +/// Set of modules found under a single package, together with metadata. +pub struct FoundPackage { + pub name: String, + pub directory: String, + // BTreeSet rather than HashSet is necessary to make FoundPackage hashable. + pub module_files: BTreeSet, +} + +/// Implements conversion from a Python 'FoundPackage' object to the Rust 'FoundPackage' struct. +/// It extracts 'name', 'directory', and converts the 'module_files' frozenset into a Rust BTreeSet. +impl<'py> FromPyObject<'py> for FoundPackage { + fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult { + // Extract the 'name' attribute. + let name: String = ob.getattr("name")?.extract()?; + // Extract the 'directory' attribute. + let directory: String = ob.getattr("directory")?.extract()?; + + // Access the 'module_files' attribute. + let module_files_py = ob.getattr("module_files")?; + // Downcast the PyAny object to a PyFrozenSet, as Python 'FrozenSet' maps to 'PyFrozenSet'. + let py_frozen_set = module_files_py.downcast::()?; + + let mut module_files = BTreeSet::new(); + // Iterate over the Python frozenset. + for py_module_file_any in py_frozen_set.iter() { + // Extract each element (PyAny) into a Rust 'ModuleFile'. 
+ let module_file: ModuleFile = py_module_file_any.extract()?; + module_files.insert(module_file); + } + + Ok(FoundPackage { + name, + directory, + module_files, + }) + } +} From 92299c551de0f06f8f2e94fadeadcdfef2ba9549 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Fri, 27 Jun 2025 17:21:31 +0100 Subject: [PATCH 14/15] Define ImportScanner in Rust --- rust/src/import_scanning.rs | 364 ++++++++++++++++++++++ rust/src/lib.rs | 4 + tests/unit/adaptors/test_importscanner.py | 47 +-- 3 files changed, 396 insertions(+), 19 deletions(-) create mode 100644 rust/src/import_scanning.rs diff --git a/rust/src/import_scanning.rs b/rust/src/import_scanning.rs new file mode 100644 index 00000000..66f07a62 --- /dev/null +++ b/rust/src/import_scanning.rs @@ -0,0 +1,364 @@ +use crate::errors::GrimpError; +use crate::filesystem::{FileSystem, PyFakeBasicFileSystem, PyRealBasicFileSystem}; +use crate::import_parsing; +use crate::module_finding::{FoundPackage, Module}; +use itertools::Itertools; +use pyo3::exceptions::{PyFileNotFoundError, PyTypeError}; +use pyo3::prelude::*; +use pyo3::types::{PyDict, PySet}; +use std::collections::HashSet; + +#[pyclass] +pub struct ImportScanner { + file_system: Box, + found_packages: HashSet, + include_external_packages: bool, + modules: HashSet, +} + +#[derive(Debug, Hash, Eq, PartialEq)] +struct DirectImport { + importer: String, + imported: String, + line_number: usize, + line_contents: String, +} + +#[pymethods] +impl ImportScanner { + #[allow(unused_variables)] + #[new] + #[pyo3(signature = (file_system, found_packages, include_external_packages=false))] + fn new( + py: Python, + file_system: Bound<'_, PyAny>, + found_packages: Bound<'_, PyAny>, + include_external_packages: bool, + ) -> PyResult { + let file_system_boxed: Box; + + if let Ok(py_real) = file_system.extract::>() { + file_system_boxed = Box::new(py_real.inner.clone()); + } else if let Ok(py_fake) = file_system.extract::>() { + file_system_boxed = Box::new(py_fake.inner.clone()); + } else { + return Err(PyTypeError::new_err( + "file_system must be an instance of RealBasicFileSystem or FakeBasicFileSystem", + )); + } + let found_packages_rust = _py_found_packages_to_rust(found_packages); + let modules = _get_modules_from_found_packages(&found_packages_rust); + + Ok(ImportScanner { + file_system: file_system_boxed, + found_packages: found_packages_rust, + modules, + include_external_packages, + }) + } + + #[pyo3(signature = (module, exclude_type_checking_imports=false))] + fn scan_for_imports<'a>( + &self, + py: Python<'a>, + module: Bound<'_, PyAny>, + exclude_type_checking_imports: bool, + ) -> PyResult> { + let module_rust = module.extract().unwrap(); + let found_package_for_module = self._lookup_found_package_for_module(&module_rust); + let module_filename = + self._determine_module_filename(&module_rust, found_package_for_module)?; + let module_contents = self.file_system.read(&module_filename).unwrap(); + + let parse_result = import_parsing::parse_imports_from_code(&module_contents); + match parse_result { + Err(GrimpError::ParseError { + line_number, text, .. + }) => { + // TODO: define SourceSyntaxError using pyo3. 
+ let exceptions_pymodule = PyModule::import(py, "grimp.exceptions").unwrap(); + let py_exception_class = exceptions_pymodule.getattr("SourceSyntaxError").unwrap(); + let exception = py_exception_class + .call1((module_filename, line_number, text)) + .unwrap(); + return Err(PyErr::from_value(exception)); + } + Err(e) => { + return Err(e.into()); + } + _ => (), + } + let imported_objects = parse_result.unwrap(); + let is_package = self._module_is_package(&module_filename); + + let mut imports: HashSet = HashSet::new(); + for imported_object in imported_objects { + // Don't include type checking imports, if specified. + if exclude_type_checking_imports && imported_object.typechecking_only { + continue; + } + + // Resolve relative imports. + let imported_object_name = self._get_absolute_imported_object_name( + &module_rust, + is_package, + &imported_object.name, + ); + + // Resolve imported module. + match self._get_internal_module(&imported_object_name) { + Some(imported_module) => { + // It's an internal import. + imports.insert(DirectImport { + importer: module_rust.name.to_string(), + imported: imported_module.name.to_string(), + line_number: imported_object.line_number, + line_contents: imported_object.line_contents, + }); + } + None => { + // It's an external import. + if self.include_external_packages { + if let Some(imported_module) = + self._distill_external_module(&imported_object_name) + { + imports.insert(DirectImport { + importer: module_rust.name.to_string(), + imported: imported_module, + line_number: imported_object.line_number, + line_contents: imported_object.line_contents, + }); + } + } + } + } + } + + Ok(self._to_py_direct_imports(py, &imports)) + } +} + +impl ImportScanner { + fn _lookup_found_package_for_module(&self, module: &Module) -> &FoundPackage { + // TODO: it's probably inefficient to do this every time we look up a module. + for found_package in &self.found_packages { + for module_file in &found_package.module_files { + if module_file.module == *module { + return found_package; + } + } + } + panic!("Could not lookup found package for module {module}"); + } + + fn _determine_module_filename( + &self, + module: &Module, + found_package: &FoundPackage, + ) -> PyResult { + // TODO: could we do all this with &str instead of String? + let top_level_components: Vec = + found_package.name.split(".").map(str::to_string).collect(); + let module_components: Vec = module.name.split(".").map(str::to_string).collect(); + let leaf_components = module_components[top_level_components.len()..].to_vec(); + + let mut filename_root_components: Vec = vec![found_package.directory.clone()]; + filename_root_components.extend(leaf_components); + let filename_root = self.file_system.join(filename_root_components); + let normal_module_path = format!("{filename_root}.py"); + let init_module_path = self + .file_system + .join(vec![filename_root, "__init__.py".to_string()]); + let candidate_filenames = [normal_module_path, init_module_path]; + for candidate_filename in candidate_filenames { + if self.file_system.exists(&candidate_filename) { + return Ok(candidate_filename); + } + } + Err(PyFileNotFoundError::new_err(format!( + "Could not find module {module}." 
+        )))
+    }
+
+    fn _module_is_package(&self, module_filename: &str) -> bool {
+        self.file_system.split(module_filename).1 == "__init__.py"
+    }
+
+    #[allow(unused_variables)]
+    fn _get_absolute_imported_object_name(
+        &self,
+        module: &Module,
+        is_package: bool,
+        imported_object_name: &str,
+    ) -> String {
+        let leading_dots_count = count_leading_dots(imported_object_name);
+        if leading_dots_count == 0 {
+            return imported_object_name.to_string();
+        }
+        let imported_object_name_base: String;
+        if is_package {
+            if leading_dots_count == 1 {
+                imported_object_name_base = module.name.clone();
+            } else {
+                let parts: Vec<&str> = module.name.split('.').collect();
+                imported_object_name_base =
+                    parts[0..parts.len() - leading_dots_count + 1].join(".");
+            }
+        } else {
+            let parts: Vec<&str> = module.name.split('.').collect();
+            imported_object_name_base = parts[0..parts.len() - leading_dots_count].join(".");
+        }
+
+        format!(
+            "{}.{}",
+            imported_object_name_base,
+            &imported_object_name[leading_dots_count..]
+        )
+    }
+
+    #[allow(unused_variables)]
+    fn _get_internal_module(&self, imported_object_name: &str) -> Option<Module> {
+        let candidate_module = Module {
+            name: imported_object_name.to_string(),
+        };
+        if self.modules.contains(&candidate_module) {
+            return Some(candidate_module);
+        }
+        if let Some((parent_name, _)) = imported_object_name.rsplit_once('.') {
+            let parent = Module {
+                name: parent_name.to_string(),
+            };
+            if self.modules.contains(&parent) {
+                return Some(parent);
+            }
+        }
+        None
+    }
+
+    /// Given a module that we already know is external, turn it into a module to add to the graph.
+    //
+    // The 'distillation' process involves removing any unwanted subpackages. For example,
+    // django.models.db should be turned into simply django.
+    // The process is more complex for potential namespace packages, as it's not possible to
+    // determine the portion package simply from name. Rather than adding the overhead of a
+    // filesystem read, we just get the shallowest component that does not clash with an internal
+    // module namespace. Take, for example, foo.blue.alpha.one. If one of the found
+    // packages is foo.blue.beta, the module will be distilled to foo.blue.alpha.
+    // Alternatively, if the found package is foo.green, the distilled module will
+    // be foo.blue.
+    #[allow(unused_variables)]
+    fn _distill_external_module(&self, module_name: &str) -> Option<String> {
+        let module_root = module_name.split(".").next().unwrap();
+        // If it's a module that is a parent of one of the internal packages, return None
+        // as it doesn't make sense and is probably an import of a namespace package.
+        for package in &self.found_packages {
+            if module_is_descendant(&package.name, module_name) {
+                return None;
+            }
+        }
+
+        // If it shares a namespace with an internal module, get the shallowest component that does
+        // not clash with an internal module namespace.
+        let mut candidate_portions: HashSet<Module> = HashSet::new();
+        let mut sorted_found_packages: Vec<&FoundPackage> = self.found_packages.iter().collect();
+        sorted_found_packages.sort_by_key(|package| &package.name);
+        sorted_found_packages.reverse();
+
+        for found_package in sorted_found_packages {
+            let root_module = &found_package.name;
+            if module_is_descendant(root_module, module_root) {
+                let mut internal_path_components: Vec<&str> = root_module.split(".").collect();
+                let mut external_path_components: Vec<&str> = module_name.split(".").collect();
+                let mut external_namespace_components: Vec<&str> = vec![];
+                while external_path_components[0] == internal_path_components[0] {
+                    external_namespace_components.push(external_path_components.remove(0));
+                    internal_path_components.remove(0);
+                }
+                external_namespace_components.push(external_path_components[0]);
+
+                candidate_portions.insert(Module {
+                    name: external_namespace_components.join("."),
+                });
+            }
+        }
+
+        if !candidate_portions.is_empty() {
+            // If multiple found packages share a namespace with this module, use the deepest one
+            // as we know that that will be a namespace too.
+            let deepest_candidate_portion = candidate_portions
+                .iter()
+                .sorted_by_key(|portion| portion.name.split(".").collect::<Vec<_>>().len())
+                .next_back()
+                .unwrap();
+            Some(deepest_candidate_portion.name.clone())
+        } else {
+            Some(module_name.split('.').next().unwrap().to_string())
+        }
+    }
+
+    fn _to_py_direct_imports<'a>(
+        &self,
+        py: Python<'a>,
+        rust_imports: &HashSet<DirectImport>,
+    ) -> Bound<'a, PySet> {
+        // TODO: do this in the constructor.
+        let valueobjects_pymodule = PyModule::import(py, "grimp.domain.valueobjects").unwrap();
+        let py_module_class = valueobjects_pymodule.getattr("Module").unwrap();
+        let py_direct_import_class = valueobjects_pymodule.getattr("DirectImport").unwrap();
+
+        let pyset = PySet::empty(py).unwrap();
+
+        for rust_import in rust_imports {
+            let importer = py_module_class.call1((&rust_import.importer,)).unwrap();
+            let imported = py_module_class.call1((&rust_import.imported,)).unwrap();
+            let kwargs = PyDict::new(py);
+            kwargs.set_item("importer", &importer).unwrap();
+            kwargs.set_item("imported", &imported).unwrap();
+            kwargs
+                .set_item("line_number", rust_import.line_number)
+                .unwrap();
+            kwargs
+                .set_item("line_contents", &rust_import.line_contents)
+                .unwrap();
+            let py_direct_import = py_direct_import_class.call((), Some(&kwargs)).unwrap();
+            pyset.add(&py_direct_import).unwrap();
+        }
+
+        pyset
+    }
+}
+fn _py_found_packages_to_rust(py_found_packages: Bound<'_, PyAny>) -> HashSet<FoundPackage> {
+    let py_set = py_found_packages
+        .downcast::<PySet>()
+        .expect("Expected py_found_packages to be a Python set.");
+
+    let mut rust_found_packages = HashSet::new();
+    // Iterate over the elements in the Python set.
+    for py_found_package_any in py_set.iter() {
+        // Extract each Python 'FoundPackage' object into a Rust 'FoundPackage' struct.
+        // Panics if extraction fails, as specified.
+        let rust_found_package: FoundPackage = py_found_package_any
+            .extract()
+            .expect("Failed to extract Python FoundPackage to Rust FoundPackage.");
+        rust_found_packages.insert(rust_found_package);
+    }
+    rust_found_packages
+}
+
+fn _get_modules_from_found_packages(found_packages: &HashSet<FoundPackage>) -> HashSet<Module> {
+    let mut modules = HashSet::new();
+    for package in found_packages {
+        for module_file in &package.module_files {
+            modules.insert(module_file.module.clone());
+        }
+    }
+    modules
+}
+
+fn count_leading_dots(s: &str) -> usize {
+    s.chars().take_while(|&c| c == '.').count()
+}
+
+fn module_is_descendant(module_name: &str, potential_ancestor: &str) -> bool {
+    module_name.starts_with(&format!("{potential_ancestor}."))
+}
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
index 221703fe..7a102930 100644
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -4,11 +4,14 @@ pub mod graph;
 pub mod import_parsing;
 pub mod module_expressions;
 mod filesystem;
+mod import_scanning;
+mod module_finding;
 
 use crate::errors::{GrimpError, GrimpResult};
 use crate::exceptions::{InvalidModuleExpression, ModuleNotPresent, NoSuchContainer, ParseError};
 use crate::graph::higher_order_queries::Level;
 use crate::graph::{Graph, Module, ModuleIterator, ModuleTokenIterator};
+use crate::import_scanning::ImportScanner;
 use crate::module_expressions::ModuleExpression;
 use derive_new::new;
 use itertools::Itertools;
@@ -27,6 +30,7 @@ fn _rustgrimp(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::()?;
     m.add_class::()?;
     m.add_class::()?;
+    m.add_class::<ImportScanner>()?;
     m.add("ModuleNotPresent", py.get_type::<ModuleNotPresent>())?;
     m.add("NoSuchContainer", py.get_type::<NoSuchContainer>())?;
     m.add(
diff --git a/tests/unit/adaptors/test_importscanner.py b/tests/unit/adaptors/test_importscanner.py
index ee71873e..fe55a704 100644
--- a/tests/unit/adaptors/test_importscanner.py
+++ b/tests/unit/adaptors/test_importscanner.py
@@ -3,7 +3,6 @@
 
 import pytest  # type: ignore
 
-from grimp.adaptors.importscanner import ImportScanner
 from grimp.application.ports.modulefinder import FoundPackage, ModuleFile
 from grimp.domain.valueobjects import DirectImport, Module
 
@@ -62,7 +61,7 @@ def test_absolute_imports(include_external_packages, expected_result):
         }
     )
 
-    import_scanner = ImportScanner(
+    import_scanner = rust.ImportScanner(
         found_packages={
             FoundPackage(
                 name="foo",
@@ -94,7 +93,7 @@ def test_non_ascii():
         },
    )
 
-    import_scanner = ImportScanner(
+    import_scanner = rust.ImportScanner(
         found_packages={
             FoundPackage(
                 name="mypackage",
@@ -167,7 +166,7 @@ def test_single_namespace_package_portion():
         }
     )
 
-    import_scanner = ImportScanner(
+    import_scanner = rust.ImportScanner(
         found_packages={
             FoundPackage(
                 name="namespace.foo",
@@ -254,7 +253,7 @@ def test_import_of_portion_not_in_graph(include_external_packages):
         }
     )
 
-    import_scanner = ImportScanner(
+    import_scanner = rust.ImportScanner(
         found_packages={
             FoundPackage(
                 name="namespace.foo",
@@ -443,7 +442,7 @@ def test_absolute_from_imports(include_external_packages, expected_result):
         },
     )
 
-    import_scanner = ImportScanner(
+    import_scanner = rust.ImportScanner(
         found_packages={
             FoundPackage(
                 name="foo",
@@ -460,14 +459,24 @@ def test_absolute_from_imports(include_external_packages, expected_result):
     assert expected_result == result
 
 
-def test_relative_from_imports():
+@pytest.mark.parametrize("module_to_scan_is_package", (True, False))
+def test_relative_from_imports(module_to_scan_is_package):
     all_modules = {
+        Module("foo"),
+        Module("foo.one"),
         Module("foo.one.blue"),
         Module("foo.one.green"),
         Module("foo.two.brown"),
Module("foo.two.yellow"), Module("foo.three"), } + if module_to_scan_is_package: + module_to_scan = Module("foo.one") + module_filename = "/path/to/foo/one/__init__.py" + else: + module_to_scan = Module("foo.one.blue") + module_filename = "/path/to/foo/one/blue.py" + file_system = rust.FakeBasicFileSystem( contents=""" /path/to/foo/ @@ -483,7 +492,7 @@ def test_relative_from_imports(): three.py """, content_map={ - "/path/to/foo/one/blue.py": """ + module_filename: """ from . import green from ..two import yellow from .. import three @@ -492,7 +501,7 @@ def test_relative_from_imports(): }, ) - import_scanner = ImportScanner( + import_scanner = rust.ImportScanner( found_packages={ FoundPackage( name="foo", @@ -503,23 +512,23 @@ def test_relative_from_imports(): file_system=file_system, ) - result = import_scanner.scan_for_imports(Module("foo.one.blue")) + result = import_scanner.scan_for_imports(module_to_scan) assert result == { DirectImport( - importer=Module("foo.one.blue"), + importer=module_to_scan, imported=Module("foo.one.green"), line_number=1, line_contents="from . import green", ), DirectImport( - importer=Module("foo.one.blue"), + importer=module_to_scan, imported=Module("foo.two.yellow"), line_number=2, line_contents="from ..two import yellow", ), DirectImport( - importer=Module("foo.one.blue"), + importer=module_to_scan, imported=Module("foo.three"), line_number=3, line_contents="from .. import three", @@ -550,7 +559,7 @@ def test_trims_to_known_modules(import_source): content_map={"/path/to/foo/one.py": import_source}, ) - import_scanner = ImportScanner( + import_scanner = rust.ImportScanner( found_packages={ FoundPackage( name="foo", @@ -598,7 +607,7 @@ def test_trims_to_known_modules_within_init_file(): }, ) - import_scanner = ImportScanner( + import_scanner = rust.ImportScanner( found_packages={ FoundPackage( name="foo", @@ -649,7 +658,7 @@ def my_function(): }, ) - import_scanner = ImportScanner( + import_scanner = rust.ImportScanner( found_packages={ FoundPackage( name="foo", @@ -713,7 +722,7 @@ def test_external_package_imports_for_namespace_packages(statement, expected_mod } ) - import_scanner = ImportScanner( + import_scanner = rust.ImportScanner( found_packages={ FoundPackage( name="namespace.foo.blue", @@ -764,7 +773,7 @@ def test_scans_multiple_packages(statement): } ) - import_scanner = ImportScanner( + import_scanner = rust.ImportScanner( found_packages={ FoundPackage( name="foo", @@ -833,7 +842,7 @@ def test_exclude_type_checking_imports( } ) - import_scanner = ImportScanner( + import_scanner = rust.ImportScanner( found_packages={ FoundPackage( name="foo", From 80b2234a138ea661b20f4c97a7d4a6734b6568d8 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Fri, 27 Jun 2025 17:24:52 +0100 Subject: [PATCH 15/15] Use Rust-based ImportScanner in production --- src/grimp/adaptors/importscanner.py | 251 +--------------------------- src/grimp/main.py | 2 +- 2 files changed, 2 insertions(+), 251 deletions(-) diff --git a/src/grimp/adaptors/importscanner.py b/src/grimp/adaptors/importscanner.py index 50239930..655782de 100644 --- a/src/grimp/adaptors/importscanner.py +++ b/src/grimp/adaptors/importscanner.py @@ -1,252 +1,3 @@ -from __future__ import annotations - -import re -import logging -from dataclasses import dataclass -from typing import Dict, Optional, Set - -from grimp import exceptions -from grimp.application.ports.modulefinder import FoundPackage -from grimp.application.ports.filesystem import BasicFileSystem -from grimp.domain.valueobjects import DirectImport, Module 
from grimp import _rustgrimp as rust # type: ignore[attr-defined] - -logger = logging.getLogger(__name__) - -_LEADING_DOT_REGEX = re.compile(r"^(\.+)\w") - - -@dataclass(frozen=True) -class _ImportedObject: - name: str - line_number: int - line_contents: str - typechecking_only: bool - - -class ImportScanner: - def __init__( - self, - file_system: BasicFileSystem, - found_packages: Set[FoundPackage], - include_external_packages: bool = False, - ) -> None: - """ - Args: - - found_packages: Set of FoundPackages containing all the modules - for analysis. - - file_system: The file system interface to use. - - include_external_packages: Whether to include imports of external modules (i.e. - modules not contained in modules_by_package_directory) - in the results. - """ - self.file_system = file_system - self.include_external_packages = include_external_packages - self.found_packages = found_packages - - # Flatten all the modules into a set. - self.modules: Set[Module] = set() - for package in self.found_packages: - self.modules |= {mf.module for mf in package.module_files} - - self._found_packages_by_module: Dict[Module, FoundPackage] = { - module_file.module: package - for package in self.found_packages - for module_file in package.module_files - } - - def scan_for_imports( - self, module: Module, *, exclude_type_checking_imports: bool = False - ) -> Set[DirectImport]: - """ - Note: this method only analyses the module in question and will not load any other - code, so it relies on self.modules to deduce which modules it imports. (This is - because you can't know whether "from foo.bar import baz" is importing a module - called `baz`, or a function `baz` from the module `bar`.) - """ - found_package = self._found_package_for_module(module) - module_filename = self._determine_module_filename(module, found_package) - module_contents = self._read_module_contents(module_filename) - - try: - imported_objects = self._get_raw_imported_objects(module_contents) - except rust.ParseError as e: - raise exceptions.SourceSyntaxError( - filename=module_filename, - lineno=e.line_number, - text=e.text, - ) - - is_package = self._module_is_package(module_filename) - - imports = set() - for imported_object in imported_objects: - # Filter on `exclude_type_checking_imports`. - if exclude_type_checking_imports and imported_object.typechecking_only: - continue - - # Resolve relative imports. - imported_object_name = self._get_absolute_imported_object_name( - module=module, is_package=is_package, imported_object_name=imported_object.name - ) - - # Resolve imported module. - imported_module = self._get_internal_module(imported_object_name, modules=self.modules) - if imported_module is None: - # => External import. - - # Filter on `self.include_external_packages`. - if not self.include_external_packages: - continue - - # Distill module. 
- imported_module = self._distill_external_module( - Module(imported_object_name), found_packages=self.found_packages - ) - if imported_module is None: - continue - - imports.add( - DirectImport( - importer=module, - imported=imported_module, - line_number=imported_object.line_number, - line_contents=imported_object.line_contents, - ) - ) - return imports - - def _found_package_for_module(self, module: Module) -> FoundPackage: - try: - return self._found_packages_by_module[module] - except KeyError: - raise ValueError(f"No found package for module {module}.") - - def _determine_module_filename(self, module: Module, found_package: FoundPackage) -> str: - """ - Work out the full filename of the given module. - - Any given module can either be a straight Python file (foo.py) or else a package - (in which case the file is an __init__.py within a directory). - """ - top_level_components = found_package.name.split(".") - module_components = module.name.split(".") - leaf_components = module_components[len(top_level_components) :] - package_directory = found_package.directory - - filename_root = self.file_system.join(package_directory, *leaf_components) - candidate_filenames = ( - f"{filename_root}.py", - self.file_system.join(filename_root, "__init__.py"), - ) - for candidate_filename in candidate_filenames: - if self.file_system.exists(candidate_filename): - return candidate_filename - raise FileNotFoundError(f"Could not find module {module}.") - - def _read_module_contents(self, module_filename: str) -> str: - """ - Read the file contents of the module. - """ - return self.file_system.read(module_filename) - - def _module_is_package(self, module_filename: str) -> bool: - """ - Whether or not the supplied module filename is a package. - """ - return self.file_system.split(module_filename)[-1] == "__init__.py" - - @staticmethod - def _get_raw_imported_objects(module_contents: str) -> Set[_ImportedObject]: - imported_object_dicts = rust.parse_imported_objects_from_code(module_contents) - return {_ImportedObject(**d) for d in imported_object_dicts} - - @staticmethod - def _get_absolute_imported_object_name( - *, module: Module, is_package: bool, imported_object_name: str - ) -> str: - leading_dot_match = _LEADING_DOT_REGEX.match(imported_object_name) - if leading_dot_match is None: - return imported_object_name - - n_leading_dots = len(leading_dot_match.group(1)) - if is_package: - if n_leading_dots == 1: - imported_object_name_base = module.name - else: - imported_object_name_base = ".".join( - module.name.split(".")[: -(n_leading_dots - 1)] - ) - else: - imported_object_name_base = ".".join(module.name.split(".")[:-n_leading_dots]) - return imported_object_name_base + "." + imported_object_name[n_leading_dots:] - - @staticmethod - def _get_internal_module(object_name: str, *, modules: Set[Module]) -> Optional[Module]: - candidate_module = Module(object_name) - if candidate_module in modules: - return candidate_module - - try: - candidate_module = candidate_module.parent - except ValueError: - return None - else: - if candidate_module in modules: - return candidate_module - else: - return None - - @staticmethod - def _distill_external_module( - module: Module, *, found_packages: Set[FoundPackage] - ) -> Optional[Module]: - """ - Given a module that we already know is external, turn it into a module to add to the graph. - - The 'distillation' process involves removing any unwanted subpackages. For example, - Module("django.models.db") should be turned into simply Module("django"). 
- - The process is more complex for potential namespace packages, as it's not possible to - determine the portion package simply from name. Rather than adding the overhead of a - filesystem read, we just get the shallowest component that does not clash with an internal - module namespace. Take, for example, a Module("foo.blue.alpha.one"). If one of the found - packages is foo.blue.beta, the module will be distilled to Module("foo.blue.alpha"). - Alternatively, if the found package is foo.green, the distilled module will - be Module("foo.blue"). - """ - # If it's a module that is a parent of one of the internal packages, return None - # as it doesn't make sense and is probably an import of a namespace package. - if any(Module(package.name).is_descendant_of(module) for package in found_packages): - return None - - # If it shares a namespace with an internal module, get the shallowest component that does - # not clash with an internal module namespace. - candidate_portions: Set[Module] = set() - for found_package in sorted(found_packages, key=lambda p: p.name, reverse=True): - root_module = Module(found_package.name) - if root_module.is_descendant_of(module.root): - ( - internal_path_components, - external_path_components, - ) = root_module.name.split( - "." - ), module.name.split(".") - external_namespace_components = [] - while external_path_components[0] == internal_path_components[0]: - external_namespace_components.append(external_path_components[0]) - external_path_components = external_path_components[1:] - internal_path_components = internal_path_components[1:] - external_namespace_components.append(external_path_components[0]) - candidate_portions.add(Module(".".join(external_namespace_components))) - - if candidate_portions: - # If multiple found packages share a namespace with this module, use the deepest one - # as we know that that will be a namespace too. - deepest_candidate_portion = sorted( - candidate_portions, key=lambda p: len(p.name.split(".")) - )[-1] - return deepest_candidate_portion - else: - return module.root +ImportScanner = rust.ImportScanner diff --git a/src/grimp/main.py b/src/grimp/main.py index 70622b05..d60533a8 100644 --- a/src/grimp/main.py +++ b/src/grimp/main.py @@ -13,7 +13,7 @@ settings.configure( MODULE_FINDER=ModuleFinder(), FILE_SYSTEM=FileSystem(), - IMPORT_SCANNER_CLASS=ImportScanner, + IMPORT_SCANNER_CLASS=ImportScanner, # type: ignore[has-type] IMPORT_GRAPH_CLASS=ImportGraph, PACKAGE_FINDER=ImportLibPackageFinder(), CACHE_CLASS=Cache,
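
For reference, the Rust-backed scanner is driven from Python in the same way as the adaptor it replaces. Below is a minimal sketch modelled on the tests in this series; the ModuleFile(module=..., mtime=...) constructor and the _rustgrimp import alias are assumptions rather than something shown in these patches.

    from grimp import _rustgrimp as rust  # Rust extension module exposing ImportScanner.
    from grimp.application.ports.modulefinder import FoundPackage, ModuleFile
    from grimp.domain.valueobjects import Module

    # Fake file system: a directory listing plus explicit file contents.
    file_system = rust.FakeBasicFileSystem(
        contents="""
            /path/to/foo/
                __init__.py
                one.py
                two.py
        """,
        content_map={
            "/path/to/foo/one.py": "from foo import two\nimport django.db.models",
        },
    )

    modules = {Module("foo"), Module("foo.one"), Module("foo.two")}
    import_scanner = rust.ImportScanner(
        file_system=file_system,
        found_packages={
            FoundPackage(
                name="foo",
                directory="/path/to/foo",
                # ModuleFile fields assumed here; only `module` matters for scanning.
                module_files=frozenset(ModuleFile(module=m, mtime=0.0) for m in modules),
            )
        },
        include_external_packages=True,
    )

    # Internal imports resolve to known modules; external imports are distilled
    # to their top-level package (django.db.models -> django).
    imports = import_scanner.scan_for_imports(Module("foo.one"))
    assert {(i.importer.name, i.imported.name) for i in imports} == {
        ("foo.one", "foo.two"),
        ("foo.one", "django"),
    }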