Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-108580: Structure for importlib metadata idents #108585

Open
wants to merge 18 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
13 changes: 13 additions & 0 deletions Doc/library/importlib.metadata.rst
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,16 @@ all the metadata in a JSON-compatible form per :PEP:`566`::
>>> wheel_metadata.json['requires_python']
'>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*'

The attributes ``authors`` and ``maintainers`` unify and provide minimal
parsing for the respective core metadata fields ("Author", "Author-email")
and ("Maintainer", "Maintainer-email")::

>>> wheel_metadata.authors
[Ident(name='Daniel Holth', email='dholth@fastmail.fm')]

>>> wheel_metadata.maintainers
[Ident(name='Alex Grönholm', email='alex.gronholm@nextday.fi')]

.. note::

The actual type of the object returned by ``metadata()`` is an
Expand All @@ -220,6 +230,9 @@ all the metadata in a JSON-compatible form per :PEP:`566`::
.. versionadded:: 3.10
The ``json`` attribute was added.

.. versionadded:: 3.13
The ``authors`` and ``maintainers`` attributes were added.


.. _version:

Expand Down
2 changes: 2 additions & 0 deletions Lib/importlib/metadata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from ._functools import method_cache, pass_none
from ._itertools import always_iterable, unique_everseen
from ._meta import PackageMetadata, SimplePath
from ._adapters import Ident

from contextlib import suppress
from importlib import import_module
Expand All @@ -31,6 +32,7 @@
__all__ = [
'Distribution',
'DistributionFinder',
'Ident',
'PackageMetadata',
'PackageNotFoundError',
'distribution',
Expand Down
253 changes: 253 additions & 0 deletions Lib/importlib/metadata/_adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import re
import textwrap
import email.message
import dataclasses

from ._text import FoldedCase

Expand All @@ -15,6 +16,232 @@
stacklevel=2,
)

# The formatting of the identity fields ("Author", "Maintainer", "Author-email"
# and "Maintainer-email") in the core metadata specification and related
# 'pyproject.toml' specification is inspired by RFC5233 but not precisely
# defined. In practice conflicting definitions are used by many packages and
# even examples from the specification. For a permissive parser the key
# takeaway from RFC5233 is that special characters such as "," and "@" must be
# quoted when used as text.


def _entries_findall(string):
"""
Return a list of entries given an RFC5233-inspired string. Entries are
separated by ", " and contents of quoted strings are ignored. Each
entry will be a non-empty string.

>>> _entries_findall('a, b , c, "d, e, f"')
['a', 'b ', ' c', '"d, e, f"']

>>> _entries_findall('a')
['a']

>>> _entries_findall('')
[]

>>> _entries_findall(", ")
[]
"""

# Split an RFC5233-ish list:
# 1. Require a list separator, or beginning-of-string.
# 2. Alt 1: match single or double quotes and handle escape characters.
# 3. Alt 2: match anything except ',' followed by a space. If quote
# characters are unbalanced, they will be matched here.
# 4. Match the alternatives at least once, in any order...
# 5. ... and capture them.
# Result:
# group 1 (list entry): None or non-empty string.
_entries = re.compile(
r"""
(?: (?<=,\ ) | (?<=^) ) # 1
( (?: (["']) (?:(?!\2|\\).|\\.)* \2 # 2
| (?!,\ ). # 3
)+ # 4
) # 5
""",
re.VERBOSE,
)

return [entry[0] for entry in _entries.findall(string)]


def _name_email_split(string):
"""
Split an RFC5233-inspired entry into a name and email address tuple. Each
component will be either None or a non-empty string. Split the form "name
local@domain" on the first unquoted "@" such that:

* local may not be empty and may not contain any unquoted spaces
* domain may not be empty
* spaces between name and address are consumed
* space between name and address is optional if name ends in "@"
* first opening "<" of local is consumed only if local remains non-empty
* last closing ">" of domain is consumed only if domain remains non-empty

>>> _name_email_split("name local@domain")
('name', 'local@domain')

>>> _name_email_split('@"unlocal@undomain" @ <loc"al@dom\\'ain')
('@"unlocal@undomain" @', 'loc"al@dom\\'ain')

>>> _name_email_split('@@ local@domain')
('@@', 'local@domain')

>>> _name_email_split('@nameonly@')
('@nameonly@', None)

>>> _name_email_split('@domain@ ')
('@', 'domain@ ')

>>> _name_email_split(' domain@only')
(None, 'domain@only')

>>> _name_email_split(' ')
(' ', None)

>>> _name_email_split('')
(None, None)
"""

# Split an RFC5233-inspired name-address entry:
# 01. Start at the beginning.
# 02. Capture at least one name component, but optionally so the result
# will be 'None' rather than an empty string.
# 03. Stop matching against name components if the lookahead matches an
# address. An address can be preceded by spaces, which are optional if
# the name is missing.
# 04. Simulate a possessive quantifier for Python < 3.11 given the
# equivalence between "(...)++" and "(?=( (...)+ ))\1". The contained
# alternatives are not exclusive and the possessive quantifier prevents
# the second alternative from stealing quoted components during
# backtracking.
# 05. Alt 1.1: Match single-quoted or double-quoted components and handle
# escape characters.
# 06. Alt 1.2: Match any character except the local component delimiters
# " " or "@". If quote characters are unbalanced, they will be matched
# here.
# 07. Match the alternatives at least once - the local part of the address
# cannot be empty.
# 08. (See 04)
# 09. Match "@" followed by something - the domain cannot be empty either.
# 10. (See 03)
# 11. Alt 2.1: Match a quoted component...
# 12. Alt 2.2: ... or match a single character.
# ...
# 14. (See 02)
# 15. (See 02)
# 16. If the name portion is missing or ends with an "@", there may or may
# not be whitespace before the address. The opening angle bracket is
# always optional.
# ...
# 20. Match everything after "@" with a non-greedy quantifier to allow for
# the optional closing angle bracket.
# 21. Allow for no address component.
# 22. Match the optional closing angle bracket.
# 23. Finish at the end.
# Summary:
# ^ ( ( not: space* (quote | not:space-or-at)++ @ anything
# quote | anything
# )+
# )?
# space* <? ( (quote | not:space-or-at)+ @ anything+? )? >? $
# Result:
# group 1 (name): None or non-empty string.
# group 5 (email): None or non-empty string.
_name_email = re.compile(
r"""
^ # 01
( (?: # 02
(?! \ * # 03
(?=( # 04
(?: (["']) (?:(?!\3|\\).|\\.)* \3 # 05
| [^ @] # 06
)+ # 07
))\2 # 08
@ . # 09
) # 10
(?: (["']) (?:(?!\4|\\).|\\.)* \4 # 11
| . # 12
) # 13
)+ # 14
)? # 15
\ * <? # 16
( (?: (["']) (?:(?!\6|\\).|\\.)* \6 # 17
| [^ @] # 18
)+ # 19
@ .+? # 20
)? # 21
>? # 22
$ # 23
""",
re.VERBOSE,
)

# Equivalent, simpler, version using possessive quantifiers, for
# Python >= 3.11.
# _name_email = re.compile(
# r"""
# ^ ( (?: (?! \ *
# (?: (["']) (?:(?!\2|\\).|\\.)* \2
# | [^ @]
# )++
# @ .
# )
# (?: (["']) (?:(?!\3|\\).|\\.)* \3
# | .
# )
# )+
# )?
# \ * <?
# ( (?: (["']) (?:(?!\5|\\).|\\.)* \5
# | [^ @]
# )+
# @ .+?
# )?
# >?
# $
# """,
# re.VERBOSE,
# )

return _name_email.match(string).groups()[::4]


def _uniq(values):
"""
Return a list omitting duplicate values.

>>> _uniq([1, 2, 1, 2, 3, 1, 4])
[1, 2, 3, 4]

>>> _uniq(())
[]
"""
unique = set()
result = []
for value in values:
if value in unique:
continue
unique.add(value)
result.append(value)
return result


@dataclasses.dataclass(eq=True, frozen=True)
class Ident:
"""
A container for identity attributes, used by the author or
maintainer fields.
"""

name: str | None
email: str | None

def __iter__(self):
return (getattr(self, field.name) for field in dataclasses.fields(self))


class Message(email.message.Message):
multiple_use_keys = set(
Expand Down Expand Up @@ -87,3 +314,29 @@ def transform(key):
return tk, value

return dict(map(transform, map(FoldedCase, self)))

def _parse_idents(self, string):
entries = (_name_email_split(entry) for entry in _entries_findall(string))
return _uniq(Ident(*entry) for entry in entries if entry != (None, None))

def _parse_names(self, string):
return _uniq(Ident(entry, None) for entry in _entries_findall(string))

def _parse_names_idents(self, names_field, idents_field):
names = self._parse_names(self.get(names_field, ""))
idents = self._parse_idents(self.get(idents_field, ""))
return _uniq((*names, *idents))

@property
def authors(self):
"""
Minimal parsing for "Author" and "Author-email" fields.
"""
return self._parse_names_idents("Author", "Author-email")

@property
def maintainers(self):
"""
Minimal parsing for "Maintainer" and "Maintainer-email" fields.
"""
return self._parse_names_idents("Maintainer", "Maintainer-email")
14 changes: 14 additions & 0 deletions Lib/importlib/metadata/_meta.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from typing import Protocol
from typing import Any, Dict, Iterator, List, Optional, TypeVar, Union, overload

from ._adapters import Ident


_T = TypeVar("_T")

Expand Down Expand Up @@ -43,6 +45,18 @@ def json(self) -> Dict[str, Union[str, List[str]]]:
A JSON-compatible form of the metadata.
"""

@property
def authors(self) -> list[Ident]:
"""
Minimal parsing for "Author" and "Author-email" fields.
"""

@property
def maintainers(self) -> list[Ident]:
"""
Minimal parsing for "Maintainer" and "Maintainer-email" fields.
"""


class SimplePath(Protocol[_T]):
"""
Expand Down
5 changes: 5 additions & 0 deletions Lib/test/support/_hypothesis_stubs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
"strategies",
"HealthCheck",
"settings",
"note",
"Verbosity",
]

Expand Down Expand Up @@ -82,6 +83,10 @@ def settings(*args, **kwargs):
return lambda f: f # pragma: nocover


def note(*args, **kwargs):
pass # pragma: no cover


class HealthCheck(Enum):
data_too_large = 1
filter_too_much = 2
Expand Down