From 4f94a900c1d754cd91505c1f07994fe531731eb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lars=20Gr=C3=BCter?= Date: Thu, 15 May 2025 22:23:04 +0200 Subject: [PATCH 1/5] Don't leave line after rule opening empty While lark didn't complain at all about this, leaving the first line after the ":" empty was matching "nothing". Obviously not what is intended! So while it would be more readable to be able to wrap everything inside a rule with an indent, it doesn't seem to be supported currently [1]. [1] www.github.com/lark-parser/lark/issues/155 --- src/docstub/doctype.lark | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/docstub/doctype.lark b/src/docstub/doctype.lark index 111173c..1bd6ba9 100644 --- a/src/docstub/doctype.lark +++ b/src/docstub/doctype.lark @@ -13,8 +13,7 @@ annotation_with_meta : type ("," optional)? ("," extra_info)? // Just the docstring type annotation without meta information. -?type : - | qualname +?type: qualname | rst_role | literal_expression | subscription_expression @@ -37,8 +36,7 @@ or_expression : type (("or" | "|") type)+ // An expression where an object is subscribed with "A[v, ...]". We extend this // syntax with a natural language variant `A of (v, ...)` and `A of {k : v}`. -subscription_expression: - | qualname "[" type ("," type)* ("," ELLIPSES)? "]" +subscription_expression: qualname "[" type ("," type)* ("," ELLIPSES)? "]" | qualname "of" type // TODO allow plural somehow, e.g. "list of int(s)"? | qualname "of" "(" type ("," type)* ("," ELLIPSES)? ")" | qualname "of" "{" type ":" type "}" @@ -50,16 +48,14 @@ literal_expression : "{" literal_item ("," literal_item)* "}" // An single item in a literal expression (or `optional`). We must also allow // for qualified names, since a "class" can be used as a literal too. -?literal_item : - | ELLIPSES +?literal_item: ELLIPSES | STRING | NUMBER | qualname // TODO should rst_role too? make combined `type qualname | rst_role`? // A natural language alternative to describe arrays with a dtype and shape -array_expression : - | array_name "of dtype" dtype ("and shape" shape)? +array_expression: array_name "of dtype" dtype ("and shape" shape)? | array_name "of shape" shape ("and dtype" dtype)? | shape array_name ("of" dtype)? | shape? array_name "of" dtype @@ -80,8 +76,7 @@ ARRAY_NAME : "array" | "ndarray" | "array-like" | "array_like" // The shape used in an array expression. Possibly to liberal right now in // what it allows. Since there is currently no support to type the shape of // NumPy arrays, this information is dropped during the transformation. -shape : - | "(" dim ",)" +shape: "(" dim ",)" | "(" leading_optional_dim? dim (("," dim | insert_optional_dim))* ")" | NUMBER "-"? "D" From dc28946fb1e2a01d86f3b6fe37ef44a87fdf5ca5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lars=20Gr=C3=BCter?= Date: Thu, 15 May 2025 22:23:48 +0200 Subject: [PATCH 2/5] Remove space before colons. --- src/docstub/doctype.lark | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/docstub/doctype.lark b/src/docstub/doctype.lark index 1bd6ba9..dbf9e66 100644 --- a/src/docstub/doctype.lark +++ b/src/docstub/doctype.lark @@ -3,13 +3,13 @@ // Reading and introduction order of rules starts at the top of the tree. -?start : annotation_with_meta +?start: annotation_with_meta // The basic structure of a full docstring annotation as it comes after the // `name : `. It includes additional meta information that is optional and // currently ignored. -annotation_with_meta : type ("," optional)? ("," extra_info)? +annotation_with_meta: type ("," optional)? ("," extra_info)? // Just the docstring type annotation without meta information. @@ -22,20 +22,20 @@ annotation_with_meta : type ("," optional)? ("," extra_info)? // Name with leading dot separated path -qualname : (/~/ ".")? (NAME ".")* NAME +qualname: (/~/ ".")? (NAME ".")* NAME // A qualname can be wrapped in a reStructuredText role, e.g, as used by Sphinx. // https://docutils.sourceforge.io/docs/ref/rst/roles.html -rst_role : (":" (NAME ":")? NAME ":")? "`" qualname "`" +rst_role: (":" (NAME ":")? NAME ":")? "`" qualname "`" // An union of different types, joined either by "or" or "|" -or_expression : type (("or" | "|") type)+ +or_expression: type (("or" | "|") type)+ // An expression where an object is subscribed with "A[v, ...]". We extend this -// syntax with a natural language variant `A of (v, ...)` and `A of {k : v}`. +// syntax with a natural language variants, e.g., `A of (v, ...)` and `A of {k : v}`. subscription_expression: qualname "[" type ("," type)* ("," ELLIPSES)? "]" | qualname "of" type // TODO allow plural somehow, e.g. "list of int(s)"? | qualname "of" "(" type ("," type)* ("," ELLIPSES)? ")" @@ -43,7 +43,7 @@ subscription_expression: qualname "[" type ("," type)* ("," ELLIPSES)? "]" // An expression combining multiple literals inside curly braces `{l1, l2, ...}` -literal_expression : "{" literal_item ("," literal_item)* "}" +literal_expression: "{" literal_item ("," literal_item)* "}" // An single item in a literal expression (or `optional`). We must also allow @@ -67,11 +67,11 @@ array_expression: array_name "of dtype" dtype ("and shape" shape)? // are using a hack here, that only allows specific names in `array_name`. In // the transformer we alias this to qualname. // TODO figure out less hacky way & allow users to set other array names -array_name : ARRAY_NAME -ARRAY_NAME : "array" | "ndarray" | "array-like" | "array_like" +array_name: ARRAY_NAME +ARRAY_NAME: "array" | "ndarray" | "array-like" | "array_like" // The dtype used in an array expression. -?dtype : qualname +?dtype: qualname // The shape used in an array expression. Possibly to liberal right now in // what it allows. Since there is currently no support to type the shape of @@ -82,29 +82,29 @@ shape: "(" dim ",)" // Optional dimensions in a `shape` expression placed at the start, // e.g., `([3 ,] N)`. -?leading_optional_dim : "[" dim ("," dim)* ",]" +?leading_optional_dim: "[" dim ("," dim)* ",]" // Optional dimensions in a `shape` expression placed anywhere but the start, // e.g., `(A[, B], C[, D])`. -?insert_optional_dim : "[," dim ("," dim)* "]" +?insert_optional_dim: "[," dim ("," dim)* "]" // Dimension can be a number, ellipses ('...') or a simple name. A simple name // can be bound to a specific number, e.g. `N=3`. -?dim : NUMBER | ELLIPSES | NAME ("=" NUMBER)? +?dim: NUMBER | ELLIPSES | NAME ("=" NUMBER)? // Optional information about a parameter has a default value, added after the // docstring annotation. Currently dropped during transformation. -optional : "optional" | "default" ("=" | ":")? literal_item +optional: "optional" | "default" ("=" | ":")? literal_item // Extra meta information added after the docstring annotation. // Currently dropped during transformation. -extra_info : /[^\r\n]+/ +extra_info: /[^\r\n]+/ // Allow Python's ellipses object -ELLIPSES : "..." +ELLIPSES: "..." // A simple name. Can start with a number or character. Can be delimited by "_" // or "-" but not by ".". From 3d9215f4b6e8dbdfb8b2c8023eaae185e3d7c783 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lars=20Gr=C3=BCter?= Date: Fri, 16 May 2025 13:26:11 +0200 Subject: [PATCH 3/5] Refactor & improve grammar Still not perfect, but the names and structure make a lot more sense to me now. This also allows using a plural "(s)" in "container of type" style expressions. --- src/docstub/_docstrings.py | 47 +++++++++++--------- src/docstub/doctype.lark | 87 +++++++++++++++++++++++--------------- tests/test_docstrings.py | 68 +++++++++++++++++++++++++---- 3 files changed, 141 insertions(+), 61 deletions(-) diff --git a/src/docstub/_docstrings.py b/src/docstub/_docstrings.py index 9a92bfe..ebcf0ee 100644 --- a/src/docstub/_docstrings.py +++ b/src/docstub/_docstrings.py @@ -24,7 +24,7 @@ with grammar_path.open() as file: _grammar = file.read() -_lark = lark.Lark(_grammar, propagate_positions=True) +_lark = lark.Lark(_grammar, propagate_positions=True, strict=True) def _find_one_token(tree: lark.Tree, *, name: str) -> lark.Token: @@ -295,19 +295,6 @@ def doctype_to_annotation(self, doctype): self._collected_imports = None self._unknown_qualnames = None - def annotation_with_meta(self, tree): - """ - Parameters - ---------- - tree : lark.Tree - - Returns - ------- - out : str - """ - out = " | ".join(tree.children) - return out - def qualname(self, tree): """ Parameters @@ -352,7 +339,7 @@ def rst_role(self, tree): qualname = _find_one_token(tree, name="QUALNAME") return qualname - def or_expression(self, tree): + def union(self, tree): """ Parameters ---------- @@ -365,7 +352,7 @@ def or_expression(self, tree): out = " | ".join(tree.children) return out - def subscription_expression(self, tree): + def subscription(self, tree): """ Parameters ---------- @@ -381,7 +368,7 @@ def subscription_expression(self, tree): out = f"{_container}[{_content}]" return out - def literal_expression(self, tree): + def natlang_literal(self, tree): """ Parameters ---------- @@ -393,13 +380,34 @@ def literal_expression(self, tree): """ out = ", ".join(tree.children) out = f"Literal[{out}]" + + if len(tree.children): + logger.warning( + "natural language literal with one item `%s`, " + "consider using `%s` to improve readability", + tree.children[0], + out, + ) + if self.types_db is not None: _, known_import = self.types_db.query("Literal") if known_import: self._collected_imports.add(known_import) return out - def array_expression(self, tree): + def natlang_container(self, tree): + """ + Parameters + ---------- + tree : lark.Tree + + Returns + ------- + out : str + """ + return self.subscription(tree) + + def natlang_array(self, tree): """ Parameters ---------- @@ -491,7 +499,8 @@ def __default__(self, data, children, meta): """ if isinstance(children, list) and len(children) == 1: out = children[0] - out.type = data.upper() # Turn rule into "token" + if hasattr(out, "type"): + out.type = data.upper() # Turn rule into "token" else: out = children return out diff --git a/src/docstub/doctype.lark b/src/docstub/doctype.lark index dbf9e66..6d8c997 100644 --- a/src/docstub/doctype.lark +++ b/src/docstub/doctype.lark @@ -9,69 +9,86 @@ // The basic structure of a full docstring annotation as it comes after the // `name : `. It includes additional meta information that is optional and // currently ignored. -annotation_with_meta: type ("," optional)? ("," extra_info)? +?annotation_with_meta: type ("," optional)? ("," extra_info)? // Just the docstring type annotation without meta information. ?type: qualname - | rst_role - | literal_expression - | subscription_expression - | array_expression - | or_expression + | union + | subscription + | natlang_literal + | natlang_container + | natlang_array + + +// A qualified name which can contain multiple parts separated by a ".". +// Optionally, "~." can be prefixed to abbreviate a leading part of the name. +// Optionally, a qualname can be wrapped in the style of a reStructuredText +// role [1], e.g, as used by Sphinx. +// [1] https://docutils.sourceforge.io/docs/ref/rst/roles.html +// +qualname: (/~/ ".")? (NAME ".")* NAME + | (":" (NAME ":")? NAME ":")? "`" qualname "`" -> rst_role -// Name with leading dot separated path -qualname: (/~/ ".")? (NAME ".")* NAME +// An union of different types, joined either by "or" or "|". +union: type (_OR type)+ -// A qualname can be wrapped in a reStructuredText role, e.g, as used by Sphinx. -// https://docutils.sourceforge.io/docs/ref/rst/roles.html -rst_role: (":" (NAME ":")? NAME ":")? "`" qualname "`" +// An expression where an object is subscribed with "A[v, ...]". +subscription: qualname "[" type ("," type)* ("," ELLIPSES)? "]" -// An union of different types, joined either by "or" or "|" -or_expression: type (("or" | "|") type)+ +// A natural language version that combines one or multiple literals inside +// curly braces `{l1, l2, ...}` +natlang_literal: "{" literal_item ("," literal_item)* "}" -// An expression where an object is subscribed with "A[v, ...]". We extend this -// syntax with a natural language variants, e.g., `A of (v, ...)` and `A of {k : v}`. -subscription_expression: qualname "[" type ("," type)* ("," ELLIPSES)? "]" - | qualname "of" type // TODO allow plural somehow, e.g. "list of int(s)"? - | qualname "of" "(" type ("," type)* ("," ELLIPSES)? ")" - | qualname "of" "{" type ":" type "}" +// An single item in a literal expression (or `optional`). We must also allow +// for qualified names, since a "class" can be used as a literal too. +?literal_item: ELLIPSES | STRING | NUMBER | qualname -// An expression combining multiple literals inside curly braces `{l1, l2, ...}` -literal_expression: "{" literal_item ("," literal_item)* "}" +// A natural language form of the subscription expression for containers. +// This variant explicitly doesn't allow nesting more complex types inside it +// to maintain readability. However, unions with with simple qualnames are +// allowed, e.g, `list of (int or float)`. +natlang_container: qualname "of" qualname "(s)"? + | qualname "of" "(" union ")" + | _natlang_tuple + | _natlang_mapping -// An single item in a literal expression (or `optional`). We must also allow -// for qualified names, since a "class" can be used as a literal too. -?literal_item: ELLIPSES - | STRING - | NUMBER - | qualname // TODO should rst_role too? make combined `type qualname | rst_role`? +// Special behavior for tuples [1]. +// [1] https://typing.python.org/en/latest/spec/tuples.html#tuple-type-form +_natlang_tuple: qualname "of" "(" type "," ELLIPSES ")" + | qualname "of" "(" type ("," type)+ ")" + + +// Natural language container variant for mappings. +_natlang_mapping: qualname "of" "{" type ":" (type | union) "}" // A natural language alternative to describe arrays with a dtype and shape -array_expression: array_name "of dtype" dtype ("and shape" shape)? +natlang_array: array_name "of dtype" dtype ("and shape" shape)? | array_name "of shape" shape ("and dtype" dtype)? | shape array_name ("of" dtype)? | shape? array_name "of" dtype | shape dtype array_name | dtype array_name -// Currently a bit of a hack. Since the `array_expression` is currently so -// ambiguous, we want to make sure it only works for real arrays. For now, we -// are using a hack here, that only allows specific names in `array_name`. In -// the transformer we alias this to qualname. + +// Currently a bit of a hack. Since the `array_expression` is ambiguous, we +// want to make sure it only works for real arrays. For now, we are using a +// hack here, that only allows specific names in `array_name`. In the +// transformer we alias this to qualname. +// // TODO figure out less hacky way & allow users to set other array names array_name: ARRAY_NAME ARRAY_NAME: "array" | "ndarray" | "array-like" | "array_like" // The dtype used in an array expression. -?dtype: qualname +?dtype: qualname | "(" union ")" // The shape used in an array expression. Possibly to liberal right now in // what it allows. Since there is currently no support to type the shape of @@ -103,6 +120,10 @@ optional: "optional" | "default" ("=" | ":")? literal_item extra_info: /[^\r\n]+/ +// Operator used in unions. +_OR: "or" | "|" + + // Allow Python's ellipses object ELLIPSES: "..." diff --git a/tests/test_docstrings.py b/tests/test_docstrings.py index cebfcb0..9af432c 100644 --- a/tests/test_docstrings.py +++ b/tests/test_docstrings.py @@ -1,5 +1,6 @@ from textwrap import dedent +import lark import pytest from docstub._analysis import KnownImport @@ -35,28 +36,72 @@ def test_unexpected_value(self): class Test_DoctypeTransformer: - # fmt: off @pytest.mark.parametrize( ("doctype", "expected"), [ + # Conventional ("list[float]", "list[float]"), ("dict[str, Union[int, str]]", "dict[str, Union[int, str]]"), ("tuple[int, ...]", "tuple[int, ...]"), - + ("Sequence[int | float]", "Sequence[int | float]"), + # Natural language variant with "of" and optional plural "(s)" ("list of int", "list[int]"), - ("tuple of float", "tuple[float]"), + ("list of int(s)", "list[int]"), + # Natural tuple variant + ("tuple of (float, int, str)", "tuple[float, int, str]"), ("tuple of (float, ...)", "tuple[float, ...]"), + # Natural dict variant + ("dict of {str: int}", "dict[str, int]"), + ("dict of {str: int | float}", "dict[str, int | float]"), + ("dict of {str: int or float}", "dict[str, int | float]"), + ("dict[list of str]", "dict[list[str]]"), + ], + ) + def test_subscription(self, doctype, expected): + transformer = DoctypeTransformer() + annotation, _ = transformer.doctype_to_annotation(doctype) + assert annotation.value == expected - ("Sequence[int | float]", "Sequence[int | float]"), - + @pytest.mark.parametrize( + ("doctype", "expected"), + [ + # Natural language variant with "of" and optional plural "(s)" + ("list of int", "list[int]"), + ("list of int(s)", "list[int]"), + ("list of (int or float)", "list[int | float]"), + # Natural tuple variant + ("tuple of (float, int, str)", "tuple[float, int, str]"), + ("tuple of (float, ...)", "tuple[float, ...]"), + # Natural dict variant ("dict of {str: int}", "dict[str, int]"), + ("dict of {str: int | float}", "dict[str, int | float]"), + ("dict of {str: int or float}", "dict[str, int | float]"), + ("dict[list of str]", "dict[list[str]]"), ], ) - def test_container(self, doctype, expected): + def test_natlang_container(self, doctype, expected): transformer = DoctypeTransformer() annotation, _ = transformer.doctype_to_annotation(doctype) assert annotation.value == expected - # fmt: on + + @pytest.mark.parametrize( + "doctype", + [ + "list of (float)", + "list of (float,)", + "list of (, )", + "list of ...", + "list of (..., ...)", + "dict of {}", + "dict of {:}", + "dict of {a:}", + "dict of {:b}", + ], + ) + def test_subscription_error(self, doctype): + transformer = DoctypeTransformer() + with pytest.raises(lark.exceptions.UnexpectedInput): + transformer.doctype_to_annotation(doctype) @pytest.mark.parametrize( ("doctype", "expected"), @@ -64,6 +109,9 @@ def test_container(self, doctype, expected): ("{'a', 1, None, False}", "Literal['a', 1, None, False]"), ("dict[{'a', 'b'}, int]", "dict[Literal['a', 'b'], int]"), ("{SomeEnum.FIRST}", "Literal[SomeEnum_FIRST]"), + ("{`SomeEnum.FIRST`, 1}", "Literal[SomeEnum_FIRST, 1]"), + ("{:ref:`SomeEnum.FIRST`, 2}", "Literal[SomeEnum_FIRST, 2]"), + ("{:py:ref:`SomeEnum.FIRST`, 3}", "Literal[SomeEnum_FIRST, 3]"), ], ) def test_literals(self, doctype, expected): @@ -97,10 +145,12 @@ def test_optional_extra_info(self, doctype, expected, extra_info): ("`Generator`", "Generator"), (":class:`Generator`", "Generator"), (":py:class:`Generator`", "Generator"), + (":py:class:`Generator`[int]", "Generator[int]"), + (":py:ref:`~.Foo`[int]", "_Foo[int]"), ("list[:py:class:`Generator`]", "list[Generator]"), ], ) - def test_sphinx_ref(self, doctype, expected): + def test_rst_role(self, doctype, expected): transformer = DoctypeTransformer() annotation, _ = transformer.doctype_to_annotation(doctype) assert annotation.value == expected @@ -120,7 +170,7 @@ def test_sphinx_ref(self, doctype, expected): @pytest.mark.parametrize("name", ["array", "ndarray", "array-like", "array_like"]) @pytest.mark.parametrize("dtype", ["int", "np.int8"]) @pytest.mark.parametrize("shape", ["(2, 3)", "(N, m)", "3D", "2-D", "(N, ...)"]) - def test_shape_n_dtype(self, fmt, expected_fmt, name, dtype, shape): + def test_natlang_array(self, fmt, expected_fmt, name, dtype, shape): def escape(name: str) -> str: return name.replace("-", "_").replace(".", "_") From c99b216bbf5782051b14fc3310e53aa3888e4372 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lars=20Gr=C3=BCter?= Date: Fri, 16 May 2025 13:58:16 +0200 Subject: [PATCH 4/5] Disallow space in front of plural "(s)" --- src/docstub/doctype.lark | 52 +++++++++++++++++++++++++--------------- tests/test_docstrings.py | 1 + 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/src/docstub/doctype.lark b/src/docstub/doctype.lark index 6d8c997..1243a73 100644 --- a/src/docstub/doctype.lark +++ b/src/docstub/doctype.lark @@ -1,6 +1,9 @@ // Grammar defining the syntax for docstring type descriptions // // Reading and introduction order of rules starts at the top of the tree. +// +// Reference for Lark grammars: +// https://lark-parser.readthedocs.io/en/latest/grammar.html ?start: annotation_with_meta @@ -12,7 +15,8 @@ ?annotation_with_meta: type ("," optional)? ("," extra_info)? -// Just the docstring type annotation without meta information. +// A type annotation. Can range from a simple qualified name to a complex +// nested construct of types. ?type: qualname | union | subscription @@ -35,30 +39,43 @@ qualname: (/~/ ".")? (NAME ".")* NAME union: type (_OR type)+ +// Operator used in unions. +_OR: "or" | "|" + + // An expression where an object is subscribed with "A[v, ...]". subscription: qualname "[" type ("," type)* ("," ELLIPSES)? "]" -// A natural language version that combines one or multiple literals inside +// Allow Python's ellipses object +ELLIPSES: "..." + + +// A natural language expression that combines one or multiple literals inside // curly braces `{l1, l2, ...}` natlang_literal: "{" literal_item ("," literal_item)* "}" // An single item in a literal expression (or `optional`). We must also allow -// for qualified names, since a "class" can be used as a literal too. +// for qualified names, since a "class" or enum can be used as a literal too. ?literal_item: ELLIPSES | STRING | NUMBER | qualname -// A natural language form of the subscription expression for containers. -// This variant explicitly doesn't allow nesting more complex types inside it -// to maintain readability. However, unions with with simple qualnames are -// allowed, e.g, `list of (int or float)`. -natlang_container: qualname "of" qualname "(s)"? +// Natural language forms of the subscription expression for containers. +// These forms allow nesting allow nesting in and with other expressions. But +// it's discouraged to do so extensively to maintain readability. +natlang_container: qualname "of" qualname _PLURAL_S? | qualname "of" "(" union ")" | _natlang_tuple | _natlang_mapping +// Indicate the plural version of a qualname by appending "(s)". +// The negative lookbehind in this regex disallows whitespace directly in front +// of this. +_PLURAL_S: /(? Date: Fri, 16 May 2025 14:00:18 +0200 Subject: [PATCH 5/5] Test natural language literal with 1 item --- tests/test_docstrings.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_docstrings.py b/tests/test_docstrings.py index 299874d..503045f 100644 --- a/tests/test_docstrings.py +++ b/tests/test_docstrings.py @@ -107,6 +107,7 @@ def test_subscription_error(self, doctype): @pytest.mark.parametrize( ("doctype", "expected"), [ + ("{0}", "Literal[0]"), ("{'a', 1, None, False}", "Literal['a', 1, None, False]"), ("dict[{'a', 'b'}, int]", "dict[Literal['a', 'b'], int]"), ("{SomeEnum.FIRST}", "Literal[SomeEnum_FIRST]"),