From 3a097a38c6dc6ff15e82663b7005650e158ce641 Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Sun, 15 Jan 2023 17:32:07 -0800 Subject: [PATCH 01/28] Implement RawMetadata Co-authored-by: Donald Stufft --- src/packaging/metadata.py | 369 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 369 insertions(+) create mode 100644 src/packaging/metadata.py diff --git a/src/packaging/metadata.py b/src/packaging/metadata.py new file mode 100644 index 00000000..dd3b7460 --- /dev/null +++ b/src/packaging/metadata.py @@ -0,0 +1,369 @@ +import email.feedparser +import email.header +import email.message +import email.parser +import email.policy +from typing import Dict, List, Tuple, TypedDict, Union, cast + + +# The RawMetadata class attempts to make as few assumptions about the underlying +# serialization formats as possible. The idea is that as long as a serialization +# formats offer some very basic primitives in *some* way then we can support +# serializing to and from that format. +class RawMetadata(TypedDict, total=False): + # Metadata 1.0 - PEP 241 + metadata_version: str + name: str + version: str + platforms: List[str] + summary: str + description: str + keywords: List[str] + home_page: str + author: str + author_email: str + license: str + + # Metadata 1.1 - PEP 314 + supported_platforms: List[str] + download_url: str + classifiers: List[str] + requires: List[str] + provides: List[str] + obsoletes: List[str] + + # Metadata 1.2 - PEP 345 + maintainer: str + maintainer_email: str + requires_dist: List[str] + provides_dist: List[str] + obsoletes_dist: List[str] + requires_python: str + requires_external: List[str] + project_urls: Dict[str, str] + + # Metadata 2.0 + # PEP 426 attempted to completely revamp the metadata format + # but got stuck without ever being able to build consensus on + # it and ultimately ended up withdrawn. 
+ # + # However, a number of tools had started emitting METADATA with + # `2.0` Metadata-Version, so for historical reasons, this version + # was skipped. + + # Metadata 2.1 - PEP 566 + description_content_type: str + provides_extra: List[str] + + # Metadata 2.2 - PEP 643 + dynamic: List[str] + + # Metadata 2.3 - PEP 685 + # No new fields were added in PEP 685, just some edge cases were + # tightened up to provide better interoperability. + + +_STRING_FIELDS = { + "author", + "author_email", + "description", + "description_content_type", + "download_url", + "home_page", + "license", + "maintainer", + "maintainer_email", + "metadata_version", + "name", + "requires_python", + "summary", + "version", +} + +_LIST_STRING_FIELDS = { + "classifiers", + "dynamic", + "obsoletes", + "obsoletes_dist", + "platforms", + "provides", + "provides_dist", + "provides_extra", + "requires", + "requires_dist", + "requires_external", + "supported_platforms", +} + + +def _parse_keywords(data: str) -> List[str]: + """Split a string of comma-separated keywords into a list of keywords.""" + return [k.strip() for k in data.split(",")] + + +def _parse_project_urls(data: List[str]) -> Dict[str, str]: + """Parse a list of label/URL string pairings separated by a comma.""" + urls = {} + for pair in data: + # Our logic is slightly tricky here as we want to try and do + # *something* reasonable with malformed data. + # + # The main thing that we have to worry about, is data that does + # not have a ',' at all to split the label from the Value. There + # isn't a singular right answer here, and we will fail validation + # later on (if the caller is validating) so it doesn't *really* + # matter, but since the missing value has to be an empty str + # and our return value is dict[str, str], if we let the key + # be the missing value, then they'd have multiple '' values that + # overwrite each other in an accumulating dict. 
+ # + # The other potential issue is that it's possible to have the + # same label multiple times in the metadata, with no solid "right" + # answer with what to do in that case. As such, we'll do the only + # thing we can, which is treat the field as unparseable and add it + # to our list of unparsed fields. + parts = [p.strip() for p in pair.split(",", 1)] + parts.extend([""] * (max(0, 2 - len(parts)))) # Ensure 2 items + + # TODO: The spec doesn't say anything about if the keys should be + # considered case sensitive or not... logically they should + # be case preserving, but case insensitive, but doing that + # would open up more cases where we might have duplicated + # entries. + label, url = parts + if label in urls: + # The label already exists in our set of urls, so this field + # is unparseable, and we can just add the whole thing to our + # unparseable data and stop processing it. + raise KeyError("duplicate labels in project urls") + urls[label] = url + + return urls + + +def _get_payload(msg: email.message.Message, source: Union[bytes, str]) -> str: + """Get the body of the message.""" + # If our source is a str, then our caller has managed encodings for us, + # and we don't need to deal with it. + if isinstance(source, str): + payload: Union[List[str], str] = msg.get_payload() + if isinstance(payload, list): + raise ValueError("payload is a multi-part") + return payload + # If our source is a bytes, then we're managing the encoding and we need + # to deal with it. + else: + bpayload: Union[List[bytes], bytes] = msg.get_payload(decode=True) + if isinstance(bpayload, list): + raise ValueError("payload is a multi-part") + + try: + return bpayload.decode("utf8", "strict") + except UnicodeDecodeError: + raise ValueError("payload in an invalid encoding") + + +# The various parse_FORMAT functions here are intended to be as lenient as +# possible in their parsing, while still returning a correctly typed +# RawMetadata. 
+# +# To aid in this, we also generally want to do as little touching of the +# data as possible, except where there are possibly some historic holdovers +# that make valid data awkward to work with. +# +# While this is a lower level, intermediate format than our ``Metadata`` +# class, some light touch ups can make a massive different in usability. + +# Map METADATA fields to RawMetadata. +_EMAIL_FIELD_MAPPING = { + "author": "author", + "author-email": "author_email", + "classifier": "classifiers", + "description": "description", + "description-content-type": "description_content_type", + "download-url": "download_url", + "dynamic": "dynamic", + "home-page": "home_page", + "keywords": "keywords", + "license": "license", + "maintainer": "maintainer", + "maintainer-email": "maintainer_email", + "metadata-version": "metadata_version", + "name": "name", + "obsoletes": "obsoletes", + "obsoletes-dist": "obsoletes_dist", + "platform": "platforms", + "project-url": "project_urls", + "provides": "provides", + "provides-dist": "provides_dist", + "provides-extra": "provides_extra", + "requires": "requires", + "requires-dist": "requires_dist", + "requires-external": "requires_external", + "requires-python": "requires_python", + "summary": "summary", + "supported-platform": "supported_platforms", + "version": "version", +} + + +def parse_email(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[str, List[str]]]: + raw: Dict[str, Union[str, List[str], Dict[str, str]]] = {} + unparsed: Dict[str, List[str]] = {} + + if isinstance(data, str): + parsed = email.parser.Parser(policy=email.policy.compat32).parsestr(data) + else: + parsed = email.parser.BytesParser(policy=email.policy.compat32).parsebytes(data) + + # We have to wrap parsed.keys() in a set, because in the case of multiple + # values for a key (a list), the key will appear multiple times in the + # list of keys, but we're avoiding that by using get_all(). 
+ for name in frozenset(parsed.keys()): + # Header names in RFC are case insensitive, so we'll normalize to all + # lower case to make comparisons easier. + name = name.lower() + + # We use get_all() here, even for fields that aren't multiple use, + # because otherwise someone could have e.g. two Name fields, and we + # would just silently ignore it rather than doing something about it. + headers = parsed.get_all(name) + + # The way the email module works when parsing bytes is that it + # unconditionally decodes the bytes as ascii using the surrogateescape + # handler. When you pull that data back out (such as with get_all() ), + # it looks to see if the str has any surrogate escapes, and if it does + # it wraps it in a Header object instead of returning the string. + # + # As such, we'll look for those Header objects, and fix up the encoding. + value = [] + # Flag if we have run into any issues processing the headers, thus + # signalling that the data belongs in 'unparsed'. + valid_encoding = True + for h in headers: + # It's unclear if this can return more types than just a Header or + # a str, so we'll just assert here to make sure. + assert isinstance(h, (email.header.Header, str)) + + # If it's a header object, we need to do our little dance to get + # the real data out of it. In cases where there is invalid data + # we're going to end up with mojibake, but there's no obvious, good + # way around that without reimplementing parts of the Header object + # ourselves. + # + # That should be fine since, if mojibake happens, this key is + # going into the unparsed dict anyways. + if isinstance(h, email.header.Header): + # The Header object stores its data as chunks, and each chunk + # can be independently encoded, so we'll need to check each + # of them. + chunks = [] + for bin, encoding in email.header.decode_header(h): + # This means it found a surrogate escape that could be + # valid data (if the source was utf8), or invalid. 
+ if encoding == "unknown-8bit": + try: + bin.decode("utf8", "strict") + except UnicodeDecodeError: + # Enable mojibake. + encoding = "latin1" + valid_encoding = False + else: + encoding = "utf8" + chunks.append((bin, encoding)) + + # Turn our chunks back into a Header object, then let that + # Header object do the right thing to turn them into a + # string for us. + value.append(str(email.header.make_header(chunks))) + # This is already a string, so just add it + else: + value.append(h) + + # We've processed all of our values to get them into a list of str, + # but we may have mojibake data, in which case this is an unparsed + # field. + if not valid_encoding: + unparsed[name] = value + continue + + raw_name = _EMAIL_FIELD_MAPPING.get(name) + if raw_name is None: + # This is a bit of a weird situation, we've encountered a key that + # we don't know what it means, so we don't know whether it's meant + # to be a list or not. + # + # Since we can't really tell one way or another, we'll just leave it + # as a list, even though it may be a single item list, because that's + # what makes the most sense for email headers. + unparsed[name] = value + continue + + # If this is one of our string fields, then we'll check to see if our + # value is a list of a single item. If it is then we'll assume that + # it was emitted as a single string, and unwrap the str from inside + # the list. + # + # If it's any other kind of data, then we haven't the faintest clue + # what we should parse it as, and we have to just add it to our list + # of unparsed stuff. + if raw_name in _STRING_FIELDS and len(value) == 1: + raw[raw_name] = value[0] + # If this is one our list of string fields, then we can just assign + # the value, since email *only* has strings, and our get_all() call + # above ensures that this is a list. 
+ elif raw_name in _LIST_STRING_FIELDS: + raw[raw_name] = value + # Special Case: Keywords + # The keywords field is implemented in the metadata spec as a str, + # but it conceptually is a list of strings, and is serialized using + # ", ".join(keywords), so we'll do some light data massaging to turn + # this into what it logically is. + elif raw_name == "keywords" and len(value) == 1: + raw[raw_name] = _parse_keywords(value[0]) + # Special Case: Project-URL + # The project urls is implemented in the metadata spec as a list of + # specially-formatted strings that represent a key and a value, which + # is fundamentally a mapping, however the email format doesn't support + # mappings in a sane way, so it was crammed into a list of strings + # instead. + # + # We will do a little light data massaging to turn this into a map as + # it logically should be. + elif raw_name == "project_urls": + try: + raw[raw_name] = _parse_project_urls(value) + except ValueError: + unparsed[name] = value + # Nothing that we've done has managed to parse this, so it'll just + # throw it in our unparseable data and move on. + else: + unparsed[name] = value + + # We need to support getting the Description from the message payload in + # addition to getting it from the headers. This does mean, though, there + # is the possibility of it being set both ways, in which case we put both + # in 'unparsed' since we don't know which is right. + try: + payload = _get_payload(parsed, data) + except ValueError: + unparsed.setdefault("Description", []).append( + parsed.get_payload(decode=isinstance(data, bytes)) + ) + else: + if payload: + # Check to see if we've already got a description, if so then both + # it, and this body move to unparseable. 
+ if "description" in raw: + description_header = cast(str, raw.pop("description")) + unparsed.setdefault("Description", []).extend( + [description_header, payload] + ) + else: + raw["description"] = payload + + # We need to cast our `raw` to a metadata, because a TypedDict only support + # literal key names, but we're computing our key names on purpose, but the + # way this function is implemented, our `TypedDict` can only have valid key + # names. + return cast(RawMetadata, raw), unparsed From 4be6034f108410a206fa7ac27d863d0b939aa09e Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Sun, 15 Jan 2023 18:01:30 -0800 Subject: [PATCH 02/28] Minor tweaks --- src/packaging/metadata.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/packaging/metadata.py b/src/packaging/metadata.py index dd3b7460..7a0f6214 100644 --- a/src/packaging/metadata.py +++ b/src/packaging/metadata.py @@ -175,7 +175,7 @@ def _get_payload(msg: email.message.Message, source: Union[bytes, str]) -> str: # class, some light touch ups can make a massive different in usability. # Map METADATA fields to RawMetadata. -_EMAIL_FIELD_MAPPING = { +_EMAIL_TO_RAW_MAPPING = { "author": "author", "author-email": "author_email", "classifier": "classifiers", @@ -276,7 +276,7 @@ def parse_email(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[str, List[st # Header object do the right thing to turn them into a # string for us. value.append(str(email.header.make_header(chunks))) - # This is already a string, so just add it + # This is already a string, so just add it. 
else: value.append(h) @@ -287,7 +287,7 @@ def parse_email(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[str, List[st unparsed[name] = value continue - raw_name = _EMAIL_FIELD_MAPPING.get(name) + raw_name = _EMAIL_TO_RAW_MAPPING.get(name) if raw_name is None: # This is a bit of a weird situation, we've encountered a key that # we don't know what it means, so we don't know whether it's meant @@ -347,7 +347,7 @@ def parse_email(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[str, List[st try: payload = _get_payload(parsed, data) except ValueError: - unparsed.setdefault("Description", []).append( + unparsed.setdefault("description", []).append( parsed.get_payload(decode=isinstance(data, bytes)) ) else: @@ -356,7 +356,7 @@ def parse_email(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[str, List[st # it, and this body move to unparseable. if "description" in raw: description_header = cast(str, raw.pop("description")) - unparsed.setdefault("Description", []).extend( + unparsed.setdefault("description", []).extend( [description_header, payload] ) else: From f5174f474c97813efb09497c48d402800453becc Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Sun, 15 Jan 2023 18:01:43 -0800 Subject: [PATCH 03/28] Add tests for field frequency handling --- tests/test_metadata.py | 54 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 tests/test_metadata.py diff --git a/tests/test_metadata.py b/tests/test_metadata.py new file mode 100644 index 00000000..f2a3d985 --- /dev/null +++ b/tests/test_metadata.py @@ -0,0 +1,54 @@ +import pytest +from packaging import metadata + +_RAW_TO_EMAIL_MAPPING = { + raw: email for email, raw in metadata._EMAIL_TO_RAW_MAPPING.items() +} + + +class TestRawMetadata: + @pytest.mark.parametrize("raw_field", metadata._STRING_FIELDS) + def test_non_repeating_fields_only_once(self, raw_field): + data = "VaLuE" + header_field = _RAW_TO_EMAIL_MAPPING[raw_field] + single_header = f"{header_field}: {data}" + 
raw, unparsed = metadata.parse_email(single_header) + assert not unparsed + assert len(raw) == 1 + assert raw_field in raw + assert raw[raw_field] == data + + @pytest.mark.parametrize("raw_field", metadata._STRING_FIELDS) + def test_non_repeating_fields_repeated(self, raw_field): + header_field = _RAW_TO_EMAIL_MAPPING[raw_field] + data = "VaLuE" + single_header = f"{header_field}: {data}" + repeated_header = "\n".join([single_header] * 2) + raw, unparsed = metadata.parse_email(repeated_header) + assert not raw + assert len(unparsed) == 1 + assert header_field in unparsed + assert unparsed[header_field] == [data] * 2 + + @pytest.mark.parametrize("raw_field", metadata._LIST_STRING_FIELDS) + def test_repeating_fields_only_once(self, raw_field): + data = "VaLuE" + header_field = _RAW_TO_EMAIL_MAPPING[raw_field] + single_header = f"{header_field}: {data}" + raw, unparsed = metadata.parse_email(single_header) + assert not unparsed + assert len(raw) == 1 + assert raw_field in raw + assert raw[raw_field] == [data] + + @pytest.mark.parametrize("raw_field", metadata._LIST_STRING_FIELDS) + def test_repeating_fields_repeated(self, raw_field): + header_field = _RAW_TO_EMAIL_MAPPING[raw_field] + data = "VaLuE" + single_header = f"{header_field}: {data}" + repeated_header = "\n".join([single_header] * 2) + raw, unparsed = metadata.parse_email(repeated_header) + assert not unparsed + assert len(raw) == 1 + assert raw_field in raw + assert raw[raw_field] == [data] * 2 From 1a1dc8084cd7370e73c90833d13736639e856b34 Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Sun, 15 Jan 2023 18:17:19 -0800 Subject: [PATCH 04/28] Test `keywords` --- tests/test_metadata.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index f2a3d985..d982eec3 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -52,3 +52,36 @@ def test_repeating_fields_repeated(self, raw_field): assert len(raw) == 1 assert 
raw_field in raw assert raw[raw_field] == [data] * 2 + + @pytest.mark.parametrize( + ["given", "expected"], + [ + ("A", ["A"]), + ("A ", ["A"]), + (" A", ["A"]), + ("A, B", ["A", "B"]), + ("A,B", ["A", "B"]), + (" A, B", ["A", "B"]), + ("A,B ", ["A", "B"]), + ("A B", ["A B"]), + ], + ) + def test_keywords(self, given, expected): + header = f"Keywords: {given}" + raw, unparsed = metadata.parse_email(header) + assert not unparsed + assert len(raw) == 1 + assert "keywords" in raw + assert raw["keywords"] == expected + + +# _parse_project_urls +# _get_payload +# _EMAIL_FIELD_MAPPING +# str input +# bytes input +# surrogate escape that isn't UTF-8 +# Description header +# Description header and body +# Multiple Description headers and body +# Keys all lower case From fe70df343a5da717dfeb9e090844bf1e21ae490d Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Sun, 15 Jan 2023 18:35:43 -0800 Subject: [PATCH 05/28] Add tests for `Project-URL` parsing --- src/packaging/metadata.py | 6 +++--- tests/test_metadata.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/packaging/metadata.py b/src/packaging/metadata.py index 7a0f6214..2e3b3fb2 100644 --- a/src/packaging/metadata.py +++ b/src/packaging/metadata.py @@ -127,8 +127,8 @@ def _parse_project_urls(data: List[str]) -> Dict[str, str]: # TODO: The spec doesn't say anything about if the keys should be # considered case sensitive or not... logically they should - # be case preserving, but case insensitive, but doing that - # would open up more cases where we might have duplicated + # be case-preserving and case-insensitive, but doing that + # would open up more cases where we might have duplicate # entries. 
label, url = parts if label in urls: @@ -333,7 +333,7 @@ def parse_email(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[str, List[st elif raw_name == "project_urls": try: raw[raw_name] = _parse_project_urls(value) - except ValueError: + except KeyError: unparsed[name] = value # Nothing that we've done has managed to parse this, so it'll just # throw it in our unparseable data and move on. diff --git a/tests/test_metadata.py b/tests/test_metadata.py index d982eec3..7af34501 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -74,6 +74,34 @@ def test_keywords(self, given, expected): assert "keywords" in raw assert raw["keywords"] == expected + @pytest.mark.parametrize( + ["given", "expected"], + [ + ("", {"": ""}), + ("A", {"A": ""}), + ("A,B", {"A": "B"}), + ("A, B", {"A": "B"}), + (" A,B", {"A": "B"}), + ("A,B ", {"A": "B"}), + ("A,B,C", {"A": "B,C"}), + ], + ) + def test_project_urls_parsing(self, given, expected): + header = f"project-url: {given}" + raw, unparsed = metadata.parse_email(header) + assert not unparsed + assert len(raw) == 1 + assert "project_urls" in raw + assert raw["project_urls"] == expected + + def test_duplicate_project_urls(self): + header = "project-url: A, B\nproject-url: A, C" + raw, unparsed = metadata.parse_email(header) + assert not raw + assert len(unparsed) == 1 + assert "project-url" in unparsed + assert unparsed["project-url"] == ["A, B", "A, C"] + # _parse_project_urls # _get_payload From 62cb64fda1208740c0a1cc8a9fa854d7b68c52dc Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Sun, 15 Jan 2023 18:45:03 -0800 Subject: [PATCH 06/28] Test `str` and `bytes` input --- tests/test_metadata.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 7af34501..a35983d0 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -102,13 +102,27 @@ def test_duplicate_project_urls(self): assert "project-url" in unparsed assert 
unparsed["project-url"] == ["A, B", "A, C"] + def test_str_input(self): + name = "Tarek Ziadé" + header = f"author: {name}" + raw, unparsed = metadata.parse_email(header) + assert not unparsed + assert len(raw) == 1 + assert "author" in raw + assert raw["author"] == name + + def test_bytes_input(self): + name = "Tarek Ziadé" + header = f"author: {name}".encode("utf-8") + raw, unparsed = metadata.parse_email(header) + assert not unparsed + assert len(raw) == 1 + assert "author" in raw + assert raw["author"] == name + -# _parse_project_urls # _get_payload -# _EMAIL_FIELD_MAPPING -# str input -# bytes input -# surrogate escape that isn't UTF-8 +# surrogate escapes # Description header # Description header and body # Multiple Description headers and body From bd86a215cef8cd9f94a9692385ea3973e877cd85 Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Mon, 16 Jan 2023 20:46:15 -0800 Subject: [PATCH 07/28] Test handling of `description` --- src/packaging/metadata.py | 2 ++ tests/test_metadata.py | 52 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/src/packaging/metadata.py b/src/packaging/metadata.py index 2e3b3fb2..4ac58ebd 100644 --- a/src/packaging/metadata.py +++ b/src/packaging/metadata.py @@ -359,6 +359,8 @@ def parse_email(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[str, List[st unparsed.setdefault("description", []).extend( [description_header, payload] ) + elif "description" in unparsed: + unparsed["description"].append(payload) else: raw["description"] = payload diff --git a/tests/test_metadata.py b/tests/test_metadata.py index a35983d0..2cf262fe 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -120,10 +120,54 @@ def test_bytes_input(self): assert "author" in raw assert raw["author"] == name + def test_header_mojibake(self): + value = "\xc0msterdam" + header_name = "value" + header_bytes = f"{header_name}: {value}".encode("latin1") + raw, unparsed = 
metadata.parse_email(header_bytes) + # Sanity check + with pytest.raises(UnicodeDecodeError): + header_bytes.decode("utf-8") + assert not raw + assert len(unparsed) == 1 + assert header_name in unparsed + assert unparsed[header_name] == [value] + + @pytest.mark.parametrize( + ["given"], [("hello",), ("description: hello",), ("hello".encode("utf-8"),)] + ) + def test_description(self, given): + raw, unparsed = metadata.parse_email(given) + assert not unparsed + assert len(raw) == 1 + assert "description" in raw + assert raw["description"] == "hello" + + def test_description_non_utf8(self): + header = "\xc0msterdam" + header_bytes = header.encode("latin1") + raw, unparsed = metadata.parse_email(header_bytes) + assert not raw + assert len(unparsed) == 1 + assert "description" in unparsed + assert unparsed["description"] == [header_bytes] + + @pytest.mark.parametrize( + ["given", "expected"], + [ + ("description: 1\ndescription: 2", ["1", "2"]), + ("description: 1\n\n2", ["1", "2"]), + ("description: 1\ndescription: 2\n\n3", ["1", "2", "3"]), + ], + ) + def test_description_multiple(self, given, expected): + raw, unparsed = metadata.parse_email(given) + assert not raw + assert len(unparsed) == 1 + assert "description" in unparsed + assert unparsed["description"] == expected + # _get_payload -# surrogate escapes -# Description header -# Description header and body -# Multiple Description headers and body +# multipart # Keys all lower case From 25b320a629d1e848b1182ff4f8b721a0258f2338 Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Sat, 21 Jan 2023 14:45:21 -0800 Subject: [PATCH 08/28] Don't worry about multi-part email bodies Can't figure out how to even trigger the situation. 
--- src/packaging/metadata.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/packaging/metadata.py b/src/packaging/metadata.py index 4ac58ebd..58225509 100644 --- a/src/packaging/metadata.py +++ b/src/packaging/metadata.py @@ -146,17 +146,12 @@ def _get_payload(msg: email.message.Message, source: Union[bytes, str]) -> str: # If our source is a str, then our caller has managed encodings for us, # and we don't need to deal with it. if isinstance(source, str): - payload: Union[List[str], str] = msg.get_payload() - if isinstance(payload, list): - raise ValueError("payload is a multi-part") + payload: str = msg.get_payload() return payload # If our source is a bytes, then we're managing the encoding and we need # to deal with it. else: - bpayload: Union[List[bytes], bytes] = msg.get_payload(decode=True) - if isinstance(bpayload, list): - raise ValueError("payload is a multi-part") - + bpayload: bytes = msg.get_payload(decode=True) try: return bpayload.decode("utf8", "strict") except UnicodeDecodeError: From f0e5271a4f19a0ddeeee7d46a3afa292b8ef7160 Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Sat, 21 Jan 2023 14:45:30 -0800 Subject: [PATCH 09/28] Test that keys are lower-cased --- tests/test_metadata.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 2cf262fe..66c8c865 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -167,7 +167,13 @@ def test_description_multiple(self, given, expected): assert "description" in unparsed assert unparsed["description"] == expected + def test_lowercase_keys(self): + header = "AUTHOR: Tarek Ziadé\nWhatever: Else" + raw, unparsed = metadata.parse_email(header) + assert len(raw) == 1 + assert "author" in raw + assert len(unparsed) == 1 + assert "whatever" in unparsed + -# _get_payload -# multipart -# Keys all lower case +# A single, exhaustive test of every field From 
167032827dc505282acf5d7c5b5ef91cf708a827 Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Sat, 21 Jan 2023 18:57:34 -0800 Subject: [PATCH 10/28] Support using `pathlib.Path` in `_manylinux` --- src/packaging/_manylinux.py | 6 ++++-- tests/test_manylinux.py | 8 ++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/packaging/_manylinux.py b/src/packaging/_manylinux.py index 2f0cc743..14aae088 100644 --- a/src/packaging/_manylinux.py +++ b/src/packaging/_manylinux.py @@ -5,7 +5,7 @@ import re import sys import warnings -from typing import Dict, Generator, Iterator, NamedTuple, Optional, Tuple +from typing import Dict, Generator, Iterator, NamedTuple, Optional, Tuple, Union from ._elffile import EIClass, EIData, ELFFile, EMachine @@ -15,7 +15,9 @@ @contextlib.contextmanager -def _parse_elf(path: str) -> Generator[Optional[ELFFile], None, None]: +def _parse_elf( + path: Union[str, os.PathLike[str]] +) -> Generator[Optional[ELFFile], None, None]: try: with open(path, "rb") as f: yield ELFFile(f) diff --git a/tests/test_manylinux.py b/tests/test_manylinux.py index dafdfc3d..9945242f 100644 --- a/tests/test_manylinux.py +++ b/tests/test_manylinux.py @@ -3,6 +3,7 @@ except ImportError: ctypes = None import os +import pathlib import platform import sys import types @@ -10,7 +11,6 @@ import pretend import pytest - from packaging import _manylinux from packaging._manylinux import ( _get_glibc_version, @@ -169,11 +169,7 @@ def test_glibc_version_string_none(monkeypatch): ) def test_parse_elf_bad_executable(monkeypatch, content): if content: - path = os.path.join( - os.path.dirname(__file__), - "manylinux", - f"hello-world-{content}", - ) + path = pathlib.Path(__file__).parent / "manylinux" / f"hello-world-{content}" else: path = None with _parse_elf(path) as ef: From ac3190f7b9db259539527cc060079145f00afc66 Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Sun, 22 Jan 2023 20:23:51 -0800 Subject: [PATCH 11/28] Add a complete test --- 
tests/metadata/everything.metadata | 42 ++++++++++++++++++ tests/test_metadata.py | 70 +++++++++++++++++++++++++++++- 2 files changed, 111 insertions(+), 1 deletion(-) create mode 100644 tests/metadata/everything.metadata diff --git a/tests/metadata/everything.metadata b/tests/metadata/everything.metadata new file mode 100644 index 00000000..ee49805c --- /dev/null +++ b/tests/metadata/everything.metadata @@ -0,0 +1,42 @@ +Metadata-Version: 2.3 +Name: BeagleVote +Version: 1.0a2 +Platform: ObscureUnix +Platform: RareDOS +Supported-Platform: RedHat 7.2 +Supported-Platform: i386-win32-2791 +Summary: A module for collecting votes from beagles. +Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM +Keywords: dog,puppy,voting,election +Home-page: http://www.example.com/~cschultz/bvote/ +Download-URL: …/BeagleVote-0.45.tgz +Author: C. Schultz, Universal Features Syndicate, + Los Angeles, CA +Author-email: "C. Schultz" +Maintainer: C. Schultz, Universal Features Syndicate, + Los Angeles, CA +Maintainer-email: "C. Schultz" +License: This software may only be obtained by sending the + author a postcard, and then the user promises not + to redistribute it. +Classifier: Development Status :: 4 - Beta +Classifier: Environment :: Console (Text Based) +Provides-Extra: pdf +Requires-Dist: reportlab; extra == 'pdf' +Requires-Dist: pkginfo +Requires-Dist: PasteDeploy +Requires-Dist: zope.interface (>3.5.0) +Requires-Dist: pywin32 >1.0; sys_platform == 'win32' +Requires-Python: >=3 +Requires-External: C +Requires-External: libpng (>=1.5) +Requires-External: make; sys_platform != "win32" +Project-URL: Bug Tracker, http://bitbucket.org/tarek/distribute/issues/ +Project-URL: Documentation, https://example.com/BeagleVote +Provides-Dist: OtherProject +Provides-Dist: AnotherProject (3.4) +Provides-Dist: virtual_package; python_version >= "3.4" +Dynamic: Obsoletes-Dist +ThisIsNotReal: Hello! + +This description intentionaly left blank. 
diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 66c8c865..2c57c394 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -1,3 +1,5 @@ +import pathlib + import pytest from packaging import metadata @@ -175,5 +177,71 @@ def test_lowercase_keys(self): assert len(unparsed) == 1 assert "whatever" in unparsed + def test_complete(self): + """Test all fields (except `Obsoletes-Dist`). -# A single, exhaustive test of every field + `Obsoletes-Dist` was sacrificed to provide a value for `Dynamic`. + """ + path = pathlib.Path(__file__).parent / "metadata" / "everything.metadata" + with path.open("r", encoding="utf-8") as file: + metadata_contents = file.read() + raw, unparsed = metadata.parse_email(metadata_contents) + assert len(unparsed) == 1 + assert unparsed["thisisnotreal"] == ["Hello!"] + assert len(raw) == 24 + assert raw["metadata_version"] == "2.3" + assert raw["name"] == "BeagleVote" + assert raw["version"] == "1.0a2" + assert raw["platforms"] == ["ObscureUnix", "RareDOS"] + assert raw["supported_platforms"] == ["RedHat 7.2", "i386-win32-2791"] + assert raw["summary"] == "A module for collecting votes from beagles." + assert ( + raw["description_content_type"] + == "text/markdown; charset=UTF-8; variant=GFM" + ) + assert raw["keywords"] == ["dog", "puppy", "voting", "election"] + assert raw["home_page"] == "http://www.example.com/~cschultz/bvote/" + assert raw["download_url"] == "…/BeagleVote-0.45.tgz" + assert ( + raw["author"] + == "C. Schultz, Universal Features Syndicate,\n Los Angeles, CA " + ) + assert raw["author_email"] == '"C. Schultz" ' + assert ( + raw["maintainer"] + == "C. Schultz, Universal Features Syndicate,\n Los Angeles, CA " + ) + assert raw["maintainer_email"] == '"C. Schultz" ' + assert ( + raw["license"] + == "This software may only be obtained by sending the\n author a postcard, and then the user promises not\n to redistribute it." 
+ ) + assert raw["classifiers"] == [ + "Development Status :: 4 - Beta", + "Environment :: Console (Text Based)", + ] + assert raw["provides_extra"] == ["pdf"] + assert raw["requires_dist"] == [ + "reportlab; extra == 'pdf'", + "pkginfo", + "PasteDeploy", + "zope.interface (>3.5.0)", + "pywin32 >1.0; sys_platform == 'win32'", + ] + assert raw["requires_python"] == ">=3" + assert raw["requires_external"] == [ + "C", + "libpng (>=1.5)", + 'make; sys_platform != "win32"', + ] + assert raw["project_urls"] == { + "Bug Tracker": "http://bitbucket.org/tarek/distribute/issues/", + "Documentation": "https://example.com/BeagleVote", + } + assert raw["provides_dist"] == [ + "OtherProject", + "AnotherProject (3.4)", + 'virtual_package; python_version >= "3.4"', + ] + assert raw["dynamic"] == ["Obsoletes-Dist"] + assert raw["description"] == "This description intentionaly left blank.\n" From 28a948083b85e236f688a9557661cb06cc3a2cfa Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Sun, 22 Jan 2023 20:34:19 -0800 Subject: [PATCH 12/28] `abc.ABC` is not subscriptable until Python 3.9 --- src/packaging/_manylinux.py | 8 ++++---- tests/test_manylinux.py | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/packaging/_manylinux.py b/src/packaging/_manylinux.py index 14aae088..449c655b 100644 --- a/src/packaging/_manylinux.py +++ b/src/packaging/_manylinux.py @@ -5,7 +5,7 @@ import re import sys import warnings -from typing import Dict, Generator, Iterator, NamedTuple, Optional, Tuple, Union +from typing import Dict, Generator, Iterator, NamedTuple, Optional, Tuple from ._elffile import EIClass, EIData, ELFFile, EMachine @@ -14,10 +14,10 @@ EF_ARM_ABI_FLOAT_HARD = 0x00000400 +# `os.PathLike` not a generic type until Python 3.9, so sticking with `str` +# as the type for `path` until then. 
@contextlib.contextmanager -def _parse_elf( - path: Union[str, os.PathLike[str]] -) -> Generator[Optional[ELFFile], None, None]: +def _parse_elf(path: str) -> Generator[Optional[ELFFile], None, None]: try: with open(path, "rb") as f: yield ELFFile(f) diff --git a/tests/test_manylinux.py b/tests/test_manylinux.py index 9945242f..0cc4cbf6 100644 --- a/tests/test_manylinux.py +++ b/tests/test_manylinux.py @@ -170,6 +170,7 @@ def test_glibc_version_string_none(monkeypatch): def test_parse_elf_bad_executable(monkeypatch, content): if content: path = pathlib.Path(__file__).parent / "manylinux" / f"hello-world-{content}" + path = os.fsdecode(path) else: path = None with _parse_elf(path) as ef: From ae014d6c1cce5508a1b76c88a09e0d52529df19d Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Sun, 22 Jan 2023 20:35:00 -0800 Subject: [PATCH 13/28] `TypedDict` was introduced in Python 3.8 --- noxfile.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/noxfile.py b/noxfile.py index 6c480595..826d92a0 100644 --- a/noxfile.py +++ b/noxfile.py @@ -21,9 +21,7 @@ nox.options.reuse_existing_virtualenvs = True -@nox.session( - python=["3.7", "3.8", "3.9", "3.10", "3.11", "pypy3.7", "pypy3.8", "pypy3.9"] -) +@nox.session(python=["3.8", "3.9", "3.10", "3.11", "pypy3.8", "pypy3.9"]) def tests(session): def coverage(*args): session.run("python", "-m", "coverage", *args) From 4f9aa0873cd05ebd1fba85b3a083adb0ff6f78e0 Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Sun, 22 Jan 2023 21:08:52 -0800 Subject: [PATCH 14/28] Linting touch-ups --- tests/test_manylinux.py | 1 + tests/test_metadata.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_manylinux.py b/tests/test_manylinux.py index 0cc4cbf6..3561bb99 100644 --- a/tests/test_manylinux.py +++ b/tests/test_manylinux.py @@ -11,6 +11,7 @@ import pretend import pytest + from packaging import _manylinux from packaging._manylinux import ( _get_glibc_version, diff --git a/tests/test_metadata.py 
b/tests/test_metadata.py index 2c57c394..bb0818d5 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -1,6 +1,7 @@ import pathlib import pytest + from packaging import metadata _RAW_TO_EMAIL_MAPPING = { @@ -115,7 +116,7 @@ def test_str_input(self): def test_bytes_input(self): name = "Tarek Ziadé" - header = f"author: {name}".encode("utf-8") + header = f"author: {name}".encode() raw, unparsed = metadata.parse_email(header) assert not unparsed assert len(raw) == 1 @@ -136,7 +137,7 @@ def test_header_mojibake(self): assert unparsed[header_name] == [value] @pytest.mark.parametrize( - ["given"], [("hello",), ("description: hello",), ("hello".encode("utf-8"),)] + ["given"], [("hello",), ("description: hello",), (b"hello",)] ) def test_description(self, given): raw, unparsed = metadata.parse_email(given) From fb5db930e21152058b4c577fcffa469b293a9516 Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Sun, 22 Jan 2023 21:09:05 -0800 Subject: [PATCH 15/28] Remove code path that never seems to be exercised --- src/packaging/metadata.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/packaging/metadata.py b/src/packaging/metadata.py index 58225509..b03826c0 100644 --- a/src/packaging/metadata.py +++ b/src/packaging/metadata.py @@ -254,17 +254,14 @@ def parse_email(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[str, List[st # of them. chunks = [] for bin, encoding in email.header.decode_header(h): - # This means it found a surrogate escape that could be - # valid data (if the source was utf8), or invalid. - if encoding == "unknown-8bit": - try: - bin.decode("utf8", "strict") - except UnicodeDecodeError: - # Enable mojibake. - encoding = "latin1" - valid_encoding = False - else: - encoding = "utf8" + try: + bin.decode("utf8", "strict") + except UnicodeDecodeError: + # Enable mojibake. 
+ encoding = "latin1" + valid_encoding = False + else: + encoding = "utf8" chunks.append((bin, encoding)) # Turn our chunks back into a Header object, then let that From a81ede78512a8d781ca1a0f0d5589fd8b97c13d5 Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Sun, 22 Jan 2023 21:11:02 -0800 Subject: [PATCH 16/28] Linting touch-ups --- tests/test_metadata.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index bb0818d5..b9f5dedf 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -203,19 +203,20 @@ def test_complete(self): assert raw["keywords"] == ["dog", "puppy", "voting", "election"] assert raw["home_page"] == "http://www.example.com/~cschultz/bvote/" assert raw["download_url"] == "…/BeagleVote-0.45.tgz" - assert ( - raw["author"] - == "C. Schultz, Universal Features Syndicate,\n Los Angeles, CA " + assert raw["author"] == ( + "C. Schultz, Universal Features Syndicate,\n" + " Los Angeles, CA " ) assert raw["author_email"] == '"C. Schultz" ' - assert ( - raw["maintainer"] - == "C. Schultz, Universal Features Syndicate,\n Los Angeles, CA " + assert raw["maintainer"] == ( + "C. Schultz, Universal Features Syndicate,\n" + " Los Angeles, CA " ) assert raw["maintainer_email"] == '"C. Schultz" ' - assert ( - raw["license"] - == "This software may only be obtained by sending the\n author a postcard, and then the user promises not\n to redistribute it." + assert raw["license"] == ( + "This software may only be obtained by sending the\n" + " author a postcard, and then the user promises not\n" + " to redistribute it." ) assert raw["classifiers"] == [ "Development Status :: 4 - Beta", From d466ddb9736d196dc8a8f3b1f1071b29f163b26a Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Sun, 22 Jan 2023 21:15:13 -0800 Subject: [PATCH 17/28] Make mypy happy Apparently `List` is invariant, so its types must be **exact**. 
--- src/packaging/metadata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/packaging/metadata.py b/src/packaging/metadata.py index b03826c0..36e96b0a 100644 --- a/src/packaging/metadata.py +++ b/src/packaging/metadata.py @@ -3,7 +3,7 @@ import email.message import email.parser import email.policy -from typing import Dict, List, Tuple, TypedDict, Union, cast +from typing import Dict, List, Optional, Tuple, TypedDict, Union, cast # The RawMetadata class attempts to make as few assumptions about the underlying @@ -252,7 +252,7 @@ def parse_email(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[str, List[st # The Header object stores it's data as chunks, and each chunk # can be independently encoded, so we'll need to check each # of them. - chunks = [] + chunks: List[Tuple[bytes, Optional[str]]] = [] for bin, encoding in email.header.decode_header(h): try: bin.decode("utf8", "strict") From d1bd707566e111497b405b2d4d35fe5f93dd7914 Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Sun, 22 Jan 2023 22:22:11 -0800 Subject: [PATCH 18/28] Drop 3.7 support --- .github/workflows/test.yml | 3 +-- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b4b4066a..f002e1ad 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -22,8 +22,7 @@ jobs: fail-fast: false matrix: os: [Ubuntu, Windows, macOS] - python_version: - ["3.7", "3.8", "3.9", "3.10", "3.11", "pypy3.7", "pypy3.8", "pypy3.9"] + python_version: ["3.8", "3.9", "3.10", "3.11", "pypy3.8", "pypy3.9"] steps: - uses: actions/checkout@v3 diff --git a/pyproject.toml b/pyproject.toml index 7cdb62db..45eea021 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ name = "packaging" description = "Core utilities for Python packages" dynamic = ["version"] readme = "README.rst" -requires-python = ">=3.7" +requires-python = ">=3.8" authors = [{name = "Donald Stufft", email = 
"donald@stufft.io"}] classifiers = [ "Development Status :: 5 - Production/Stable", From 8561eceeea4fbb646642f978d6e3dcde65bd70cb Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Mon, 23 Jan 2023 20:46:31 -0800 Subject: [PATCH 19/28] Add docs --- docs/index.rst | 1 + docs/metadata.rst | 42 +++++++++++++++++++++++++++++++++++++++ src/packaging/metadata.py | 29 +++++++++++++++++++++++++++ 3 files changed, 72 insertions(+) create mode 100644 docs/metadata.rst diff --git a/docs/index.rst b/docs/index.rst index aafdae83..6850e9e8 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -25,6 +25,7 @@ You can install packaging with ``pip``: specifiers markers requirements + metadata tags utils diff --git a/docs/metadata.rst b/docs/metadata.rst new file mode 100644 index 00000000..b87574cb --- /dev/null +++ b/docs/metadata.rst @@ -0,0 +1,42 @@ +Metadata +======== + +.. currentmodule:: packaging.metadata + + +Both `source distributions`_ and `binary distributions`_ +(*sdists* and *wheels*, respectively) contain files recording the +`core metadata`_ for the distribution. This information is used for +everything from recording the name of the distribution to the +installation dependencies. + + +Usage +----- + +.. doctest:: + + >>> from packaging.metadata import parse_email + >>> metadata = "Metadata-Version: 2.3\nName: packaging\nVersion: 24.0" + >>> raw, unparsed = parse_email(metadata) + >>> raw["metadata_version"] + '2.3' + >>> raw["name"] + 'packaging' + >>> raw["version"] + '24.0' + + +Reference +--------- + +Low Level Interface +''''''''''''''''''' + +.. automodule:: packaging.metadata + :members: + + +.. _source distributions: https://packaging.python.org/en/latest/specifications/source-distribution-format/ +.. _binary distributions: https://packaging.python.org/en/latest/specifications/binary-distribution-format/ +.. 
_core metadata: https://packaging.python.org/en/latest/specifications/core-metadata/ diff --git a/src/packaging/metadata.py b/src/packaging/metadata.py index 36e96b0a..1962e09f 100644 --- a/src/packaging/metadata.py +++ b/src/packaging/metadata.py @@ -11,6 +11,20 @@ # formats offer some very basic primitives in *some* way then we can support # serializing to and from that format. class RawMetadata(TypedDict, total=False): + """A dictionary of raw core metadata. + + Each field in core metadata maps to a key of this dictionary (when data is + provided). The key is lower-case and underscores are used instead of dashes + compared to the equivalent core metadata field. Any core metadata field that + can be specified multiple times or can hold multiple values in a single + field have a key with a plural name. + + Core metadata fields that can be specified multiple times are stored as a + list or dict depending on which is appropriate for the field. Any fields + which hold multiple values in a single field are stored as a list. + + """ + # Metadata 1.0 - PEP 241 metadata_version: str name: str @@ -203,6 +217,21 @@ def _get_payload(msg: email.message.Message, source: Union[bytes, str]) -> str: def parse_email(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[str, List[str]]]: + """Parse a distribution's metadata. + + This function returns a two-item tuple of dicts. The first dict is of + recognized fields from the core metadata specification. Fields that can be + parsed and translated into Python's built-in types are converted + appropriately. All other fields are last as-is. Fields that are allowed to + appear multiple times are stored as lists. + + The second dict contains all other fields from the metadata. This includes + any unrecognized fields. It also includes any fields which are expected to + be parsed into a built-in type were not formatted appropriately. Finally, + any fields that are expected to appear only once but are repeated are + included in this dict. 
+ + """ raw: Dict[str, Union[str, List[str], Dict[str, str]]] = {} unparsed: Dict[str, List[str]] = {} From 4d97abedadf20f3749f8b3f65e6dd0a674739b9c Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Tue, 24 Jan 2023 15:29:15 -0800 Subject: [PATCH 20/28] Apply suggestions from code review Co-authored-by: Donald Stufft Co-authored-by: Paul Moore --- src/packaging/metadata.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/packaging/metadata.py b/src/packaging/metadata.py index 1962e09f..16067b25 100644 --- a/src/packaging/metadata.py +++ b/src/packaging/metadata.py @@ -181,7 +181,7 @@ def _get_payload(msg: email.message.Message, source: Union[bytes, str]) -> str: # that make valid data awkward to work with. # # While this is a lower level, intermediate format than our ``Metadata`` -# class, some light touch ups can make a massive different in usability. +# class, some light touch ups can make a massive difference in usability. # Map METADATA fields to RawMetadata. _EMAIL_TO_RAW_MAPPING = { @@ -222,12 +222,12 @@ def parse_email(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[str, List[st This function returns a two-item tuple of dicts. The first dict is of recognized fields from the core metadata specification. Fields that can be parsed and translated into Python's built-in types are converted - appropriately. All other fields are last as-is. Fields that are allowed to + appropriately. All other fields are left as-is. Fields that are allowed to appear multiple times are stored as lists. The second dict contains all other fields from the metadata. This includes any unrecognized fields. It also includes any fields which are expected to - be parsed into a built-in type were not formatted appropriately. Finally, + be parsed into a built-in type but were not formatted appropriately. Finally, any fields that are expected to appear only once but are repeated are included in this dict. 
@@ -330,7 +330,7 @@ def parse_email(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[str, List[st # of unparsed stuff. if raw_name in _STRING_FIELDS and len(value) == 1: raw[raw_name] = value[0] - # If this is one our list of string fields, then we can just assign + # If this is one of our list of string fields, then we can just assign # the value, since email *only* has strings, and our get_all() call # above ensures that this is a list. elif raw_name in _LIST_STRING_FIELDS: From 76062f3f4672a38ed99149bd61d51d61b0641167 Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Tue, 24 Jan 2023 15:29:43 -0800 Subject: [PATCH 21/28] Update tests/metadata/everything.metadata --- tests/metadata/everything.metadata | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/metadata/everything.metadata b/tests/metadata/everything.metadata index ee49805c..5412a083 100644 --- a/tests/metadata/everything.metadata +++ b/tests/metadata/everything.metadata @@ -39,4 +39,4 @@ Provides-Dist: virtual_package; python_version >= "3.4" Dynamic: Obsoletes-Dist ThisIsNotReal: Hello! -This description intentionaly left blank. +This description intentionally left blank. 
From 004f136f3d30c51271fdd677e59fe73e33b39565 Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Tue, 24 Jan 2023 17:04:41 -0800 Subject: [PATCH 22/28] Update a test --- tests/test_metadata.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index b9f5dedf..f8a667bd 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -1,7 +1,6 @@ import pathlib import pytest - from packaging import metadata _RAW_TO_EMAIL_MAPPING = { @@ -246,4 +245,4 @@ def test_complete(self): 'virtual_package; python_version >= "3.4"', ] assert raw["dynamic"] == ["Obsoletes-Dist"] - assert raw["description"] == "This description intentionaly left blank.\n" + assert raw["description"] == "This description intentionally left blank.\n" From 2674e550ddd9b2975c587f2482341b6cfe8e96ad Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Wed, 25 Jan 2023 15:39:42 -0800 Subject: [PATCH 23/28] Make isort happy --- tests/test_metadata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index f8a667bd..22fe76ba 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -1,6 +1,7 @@ import pathlib import pytest + from packaging import metadata _RAW_TO_EMAIL_MAPPING = { From 4164911bfa9ac0fd89622898b23bb113eebdab44 Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Mon, 30 Jan 2023 17:08:44 -0800 Subject: [PATCH 24/28] Be compatible with Python 3.7 --- noxfile.py | 6 +- pyproject.toml | 2 +- src/packaging/metadata.py | 142 ++++++++++++++++++++------------------ 3 files changed, 78 insertions(+), 72 deletions(-) diff --git a/noxfile.py b/noxfile.py index 826d92a0..a713fa6c 100644 --- a/noxfile.py +++ b/noxfile.py @@ -21,7 +21,7 @@ nox.options.reuse_existing_virtualenvs = True -@nox.session(python=["3.8", "3.9", "3.10", "3.11", "pypy3.8", "pypy3.9"]) +@nox.session(python=["3.7", "3.8", "3.9", "3.10", "3.11", "pypy3.8", "pypy3.9"]) def tests(session): def coverage(*args): 
session.run("python", "-m", "coverage", *args) @@ -52,7 +52,7 @@ def coverage(*args): ) -@nox.session(python="3.9") +@nox.session(python="3.11") def lint(session): # Run the linters (via pre-commit) session.install("pre-commit") @@ -64,7 +64,7 @@ def lint(session): session.run("twine", "check", *glob.glob("dist/*")) -@nox.session(python="3.9") +@nox.session(python="3.11") def docs(session): shutil.rmtree("docs/_build", ignore_errors=True) session.install("-r", "docs/requirements.txt") diff --git a/pyproject.toml b/pyproject.toml index 45eea021..7cdb62db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ name = "packaging" description = "Core utilities for Python packages" dynamic = ["version"] readme = "README.rst" -requires-python = ">=3.8" +requires-python = ">=3.7" authors = [{name = "Donald Stufft", email = "donald@stufft.io"}] classifiers = [ "Development Status :: 5 - Production/Stable", diff --git a/src/packaging/metadata.py b/src/packaging/metadata.py index 16067b25..6f27da8b 100644 --- a/src/packaging/metadata.py +++ b/src/packaging/metadata.py @@ -3,78 +3,84 @@ import email.message import email.parser import email.policy -from typing import Dict, List, Optional, Tuple, TypedDict, Union, cast - - -# The RawMetadata class attempts to make as few assumptions about the underlying -# serialization formats as possible. The idea is that as long as a serialization -# formats offer some very basic primitives in *some* way then we can support -# serializing to and from that format. -class RawMetadata(TypedDict, total=False): - """A dictionary of raw core metadata. +import typing +from typing import Dict, List, Optional, Tuple, Union, cast + +if typing.TYPE_CHECKING: # pragma: no cover + from typing_extensions import TypedDict + + # The RawMetadata class attempts to make as few assumptions about the underlying + # serialization formats as possible. 
The idea is that as long as a serialization + # formats offer some very basic primitives in *some* way then we can support + # serializing to and from that format. + class RawMetadata(TypedDict, total=False): + """A dictionary of raw core metadata. + + Each field in core metadata maps to a key of this dictionary (when data is + provided). The key is lower-case and underscores are used instead of dashes + compared to the equivalent core metadata field. Any core metadata field that + can be specified multiple times or can hold multiple values in a single + field have a key with a plural name. + + Core metadata fields that can be specified multiple times are stored as a + list or dict depending on which is appropriate for the field. Any fields + which hold multiple values in a single field are stored as a list. + + """ + + # Metadata 1.0 - PEP 241 + metadata_version: str + name: str + version: str + platforms: List[str] + summary: str + description: str + keywords: List[str] + home_page: str + author: str + author_email: str + license: str + + # Metadata 1.1 - PEP 314 + supported_platforms: List[str] + download_url: str + classifiers: List[str] + requires: List[str] + provides: List[str] + obsoletes: List[str] + + # Metadata 1.2 - PEP 345 + maintainer: str + maintainer_email: str + requires_dist: List[str] + provides_dist: List[str] + obsoletes_dist: List[str] + requires_python: str + requires_external: List[str] + project_urls: Dict[str, str] + + # Metadata 2.0 + # PEP 426 attempted to completely revamp the metadata format + # but got stuck without ever being able to build consensus on + # it and ultimately ended up withdrawn. + # + # However, a number of tools had started emiting METADATA with + # `2.0` Metadata-Version, so for historical reasons, this version + # was skipped. - Each field in core metadata maps to a key of this dictionary (when data is - provided). 
The key is lower-case and underscores are used instead of dashes - compared to the equivalent core metadata field. Any core metadata field that - can be specified multiple times or can hold multiple values in a single - field have a key with a plural name. + # Metadata 2.1 - PEP 566 + description_content_type: str + provides_extra: List[str] - Core metadata fields that can be specified multiple times are stored as a - list or dict depending on which is appropriate for the field. Any fields - which hold multiple values in a single field are stored as a list. + # Metadata 2.2 - PEP 643 + dynamic: List[str] - """ + # Metadata 2.3 - PEP 685 + # No new fields were added in PEP 685, just some edge case were + # tightened up to provide better interoptability. - # Metadata 1.0 - PEP 241 - metadata_version: str - name: str - version: str - platforms: List[str] - summary: str - description: str - keywords: List[str] - home_page: str - author: str - author_email: str - license: str - - # Metadata 1.1 - PEP 314 - supported_platforms: List[str] - download_url: str - classifiers: List[str] - requires: List[str] - provides: List[str] - obsoletes: List[str] - - # Metadata 1.2 - PEP 345 - maintainer: str - maintainer_email: str - requires_dist: List[str] - provides_dist: List[str] - obsoletes_dist: List[str] - requires_python: str - requires_external: List[str] - project_urls: Dict[str, str] - - # Metadata 2.0 - # PEP 426 attempted to completely revamp the metadata format - # but got stuck without ever being able to build consensus on - # it and ultimately ended up withdrawn. - # - # However, a number of tools had started emiting METADATA with - # `2.0` Metadata-Version, so for historical reasons, this version - # was skipped. 
- - # Metadata 2.1 - PEP 566 - description_content_type: str - provides_extra: List[str] - - # Metadata 2.2 - PEP 643 - dynamic: List[str] - - # Metadata 2.3 - PEP 685 - # No new fields were added in PEP 685, just some edge case were - # tightened up to provide better interoptability. +else: + RawMetadata = Dict[str, Union[str, List[str], Dict[str, str]]] _STRING_FIELDS = { From 31e1b223663fce7d82ba471cd318ab7b67b0639a Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Mon, 30 Jan 2023 17:10:55 -0800 Subject: [PATCH 25/28] More 3.7 support --- .github/workflows/docs.yml | 2 +- .github/workflows/lint.yml | 2 +- .github/workflows/test.yml | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 53bca63b..3c73f1b6 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -25,7 +25,7 @@ jobs: - uses: actions/setup-python@v4 name: Install Python with: - python-version: "3.9" + python-version: "3.11" cache: "pip" - name: Build documentation diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 712704c1..ab6a2920 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -27,7 +27,7 @@ jobs: - uses: actions/setup-python@v4 name: Install Python with: - python-version: "3.9" + python-version: "3.11" cache: "pip" - name: Run `nox -s lint` diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f002e1ad..d3d38710 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -22,7 +22,8 @@ jobs: fail-fast: false matrix: os: [Ubuntu, Windows, macOS] - python_version: ["3.8", "3.9", "3.10", "3.11", "pypy3.8", "pypy3.9"] + python_version: + ["3.7", "3.8", "3.9", "3.10", "3.11", "pypy3.8", "pypy3.9"] steps: - uses: actions/checkout@v3 From f8ded2fefce679d6613bd04f5d65e8e660f546f9 Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Mon, 30 Jan 2023 17:17:17 -0800 Subject: [PATCH 26/28] Poetry via pip via pep517 isn't happy 
--- .github/workflows/docs.yml | 2 +- .github/workflows/lint.yml | 2 +- noxfile.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 3c73f1b6..53bca63b 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -25,7 +25,7 @@ jobs: - uses: actions/setup-python@v4 name: Install Python with: - python-version: "3.11" + python-version: "3.9" cache: "pip" - name: Build documentation diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index ab6a2920..712704c1 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -27,7 +27,7 @@ jobs: - uses: actions/setup-python@v4 name: Install Python with: - python-version: "3.11" + python-version: "3.9" cache: "pip" - name: Run `nox -s lint` diff --git a/noxfile.py b/noxfile.py index a713fa6c..da5abc73 100644 --- a/noxfile.py +++ b/noxfile.py @@ -52,7 +52,7 @@ def coverage(*args): ) -@nox.session(python="3.11") +@nox.session(python="3.9") def lint(session): # Run the linters (via pre-commit) session.install("pre-commit") @@ -64,7 +64,7 @@ def lint(session): session.run("twine", "check", *glob.glob("dist/*")) -@nox.session(python="3.11") +@nox.session(python="3.9") def docs(session): shutil.rmtree("docs/_build", ignore_errors=True) session.install("-r", "docs/requirements.txt") From 6ce6b976264c4c581b119f797625cd1acc5fcf5a Mon Sep 17 00:00:00 2001 From: Brett Cannon Date: Mon, 30 Jan 2023 17:21:22 -0800 Subject: [PATCH 27/28] Try to get around isort triggering a failure via Poetry --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 22f21130..a63a68dc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,7 +27,7 @@ repos: - id: black - repo: https://github.com/PyCQA/isort - rev: 5.10.1 + rev: 5.12.0 hooks: - id: isort From f0c6c61bad230248150cbb081d7f1a42e127fa89 Mon Sep 17 00:00:00 
2001 From: Brett Cannon Date: Tue, 31 Jan 2023 18:13:42 -0800 Subject: [PATCH 28/28] Try another suggestion on how to make `TypedDict` not error out in Python 3.7 --- src/packaging/metadata.py | 152 ++++++++++++++++++++------------------ 1 file changed, 81 insertions(+), 71 deletions(-) diff --git a/src/packaging/metadata.py b/src/packaging/metadata.py index 6f27da8b..e76a60c3 100644 --- a/src/packaging/metadata.py +++ b/src/packaging/metadata.py @@ -3,84 +3,94 @@ import email.message import email.parser import email.policy +import sys import typing from typing import Dict, List, Optional, Tuple, Union, cast -if typing.TYPE_CHECKING: # pragma: no cover - from typing_extensions import TypedDict - - # The RawMetadata class attempts to make as few assumptions about the underlying - # serialization formats as possible. The idea is that as long as a serialization - # formats offer some very basic primitives in *some* way then we can support - # serializing to and from that format. - class RawMetadata(TypedDict, total=False): - """A dictionary of raw core metadata. - - Each field in core metadata maps to a key of this dictionary (when data is - provided). The key is lower-case and underscores are used instead of dashes - compared to the equivalent core metadata field. Any core metadata field that - can be specified multiple times or can hold multiple values in a single - field have a key with a plural name. - - Core metadata fields that can be specified multiple times are stored as a - list or dict depending on which is appropriate for the field. Any fields - which hold multiple values in a single field are stored as a list. 
- - """ - - # Metadata 1.0 - PEP 241 - metadata_version: str - name: str - version: str - platforms: List[str] - summary: str - description: str - keywords: List[str] - home_page: str - author: str - author_email: str - license: str - - # Metadata 1.1 - PEP 314 - supported_platforms: List[str] - download_url: str - classifiers: List[str] - requires: List[str] - provides: List[str] - obsoletes: List[str] - - # Metadata 1.2 - PEP 345 - maintainer: str - maintainer_email: str - requires_dist: List[str] - provides_dist: List[str] - obsoletes_dist: List[str] - requires_python: str - requires_external: List[str] - project_urls: Dict[str, str] - - # Metadata 2.0 - # PEP 426 attempted to completely revamp the metadata format - # but got stuck without ever being able to build consensus on - # it and ultimately ended up withdrawn. - # - # However, a number of tools had started emiting METADATA with - # `2.0` Metadata-Version, so for historical reasons, this version - # was skipped. +if sys.version_info >= (3, 8): # pragma: no cover + from typing import TypedDict +else: # pragma: no cover + if typing.TYPE_CHECKING: + from typing_extensions import TypedDict + else: + try: + from typing_extensions import TypedDict + except ImportError: + + class TypedDict: + def __init_subclass__(*_args, **_kwargs): + pass - # Metadata 2.1 - PEP 566 - description_content_type: str - provides_extra: List[str] - # Metadata 2.2 - PEP 643 - dynamic: List[str] +# The RawMetadata class attempts to make as few assumptions about the underlying +# serialization formats as possible. The idea is that as long as a serialization +# formats offer some very basic primitives in *some* way then we can support +# serializing to and from that format. +class RawMetadata(TypedDict, total=False): + """A dictionary of raw core metadata. - # Metadata 2.3 - PEP 685 - # No new fields were added in PEP 685, just some edge case were - # tightened up to provide better interoptability. 
+ Each field in core metadata maps to a key of this dictionary (when data is + provided). The key is lower-case and underscores are used instead of dashes + compared to the equivalent core metadata field. Any core metadata field that + can be specified multiple times or can hold multiple values in a single + field has a key with a plural name. + + Core metadata fields that can be specified multiple times are stored as a + list or dict depending on which is appropriate for the field. Any fields + which hold multiple values in a single field are stored as a list. + + """ -else: - RawMetadata = Dict[str, Union[str, List[str], Dict[str, str]]] + # Metadata 1.0 - PEP 241 + metadata_version: str + name: str + version: str + platforms: List[str] + summary: str + description: str + keywords: List[str] + home_page: str + author: str + author_email: str + license: str + + # Metadata 1.1 - PEP 314 + supported_platforms: List[str] + download_url: str + classifiers: List[str] + requires: List[str] + provides: List[str] + obsoletes: List[str] + + # Metadata 1.2 - PEP 345 + maintainer: str + maintainer_email: str + requires_dist: List[str] + provides_dist: List[str] + obsoletes_dist: List[str] + requires_python: str + requires_external: List[str] + project_urls: Dict[str, str] + + # Metadata 2.0 + # PEP 426 attempted to completely revamp the metadata format + # but got stuck without ever being able to build consensus on + # it and ultimately ended up withdrawn. + # + # However, a number of tools had started emitting METADATA with + # `2.0` Metadata-Version, so for historical reasons, this version + # was skipped. + + # Metadata 2.1 - PEP 566 + description_content_type: str + provides_extra: List[str] + + # Metadata 2.2 - PEP 643 + dynamic: List[str] + + # Metadata 2.3 - PEP 685 + # No new fields were added in PEP 685, just some edge cases were + # tightened up to provide better interoperability. _STRING_FIELDS = {