From 02bbde5026f8307b2216a5479753846010287ddf Mon Sep 17 00:00:00 2001 From: Dahan Gong Date: Tue, 4 Mar 2025 11:00:08 +0800 Subject: [PATCH] Update _create_gnu_long_header to align with GNU Tar --- Doc/whatsnew/3.14.rst | 9 +++ Lib/tarfile.py | 55 +++++++++++++------ Lib/test/test_tarfile.py | 24 ++++++++ Misc/ACKS | 1 + ...-03-04-03-14-44.gh-issue-130819.Dphgb6.rst | 3 + 5 files changed, 76 insertions(+), 16 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-03-04-03-14-44.gh-issue-130819.Dphgb6.rst diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index 894f011ec86a30..4e0e7983b77b17 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -1708,6 +1708,15 @@ sysconfig (Contributed by Xuehai Pan in :gh:`131799`.) +tarfile +------- + +* Emit ``mode``, ``uname`` and ``gname`` fields for long paths in + :mod:`tarfile` archives, providing better bit-for-bit compatibility with GNU + :manpage:`tar(1)`. + (Contributed by Dahan Gong in :gh:`130819`.) + + threading --------- diff --git a/Lib/tarfile.py b/Lib/tarfile.py index c0f5a609b9f42f..44c6bb7ec2f13a 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -895,6 +895,9 @@ class TarInfo(object): _link_target = None, ) + _name_uid0 = None # Cached uname of uid=0 + _name_gid0 = None # Cached gname of gid=0 + def __init__(self, name=""): """Construct a TarInfo object. name is the optional name of the member. @@ -1202,6 +1205,13 @@ def _create_gnu_long_header(cls, name, type, encoding, errors): info["type"] = type info["size"] = len(name) info["magic"] = GNU_MAGIC + info["mode"] = 0o100644 + if cls._name_uid0 is None or cls._name_gid0 is None: + user_group_names = TarFile._get_user_group_names(0, 0, {}, {}) + cls._name_uid0 = user_group_names[0] or "" + cls._name_gid0 = user_group_names[1] or "" + info["uname"] = cls._name_uid0 + info["gname"] = cls._name_gid0 # create extended header + name blocks. 
return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \ @@ -2202,22 +2212,12 @@ def gettarinfo(self, name=None, arcname=None, fileobj=None): tarinfo.type = type tarinfo.linkname = linkname - # Calls to pwd.getpwuid() and grp.getgrgid() tend to be expensive. To - # speed things up, cache the resolved usernames and group names. - if pwd: - if tarinfo.uid not in self._unames: - try: - self._unames[tarinfo.uid] = pwd.getpwuid(tarinfo.uid)[0] - except KeyError: - self._unames[tarinfo.uid] = '' - tarinfo.uname = self._unames[tarinfo.uid] - if grp: - if tarinfo.gid not in self._gnames: - try: - self._gnames[tarinfo.gid] = grp.getgrgid(tarinfo.gid)[0] - except KeyError: - self._gnames[tarinfo.gid] = '' - tarinfo.gname = self._gnames[tarinfo.gid] + uname, gname = TarFile._get_user_group_names(tarinfo.uid, tarinfo.gid, + self._unames, self._gnames) + if uname is not None: + tarinfo.uname = uname + if gname is not None: + tarinfo.gname = gname if type in (CHRTYPE, BLKTYPE): if hasattr(os, "major") and hasattr(os, "minor"): @@ -2560,6 +2560,29 @@ def _extract_member(self, tarinfo, targetpath, set_attrs=True, self.chmod(tarinfo, targetpath) self.utime(tarinfo, targetpath) + def _get_user_group_names(uid, gid, unames_cache, gnames_cache): + # Calls to pwd.getpwuid() and grp.getgrgid() tend to be expensive. + # To speed things up, cache the resolved usernames and group names. + if pwd: + if uid not in unames_cache: + try: + unames_cache[uid] = pwd.getpwuid(uid)[0] + except KeyError: + unames_cache[uid] = '' + uname = unames_cache[uid] + else: + uname = None + if grp: + if gid not in gnames_cache: + try: + gnames_cache[gid] = grp.getgrgid(gid)[0] + except KeyError: + gnames_cache[gid] = '' + gname = gnames_cache[gid] + else: + gname = None + return uname, gname + #-------------------------------------------------------------------------- # Below are the different file methods. They are called via # _extract_member() when extract() is called. 
They can be replaced in a diff --git a/Lib/test/test_tarfile.py b/Lib/test/test_tarfile.py index 2d9649237a9382..dc70a7f08669e6 100644 --- a/Lib/test/test_tarfile.py +++ b/Lib/test/test_tarfile.py @@ -1908,6 +1908,30 @@ def test_longnamelink_1025(self): self._test(("longnam/" * 127) + "longname_", ("longlnk/" * 127) + "longlink_") + def test_hidden_header_for_gnulong(self): + # Regression test for gh-130819. + memory_file = io.BytesIO() + with tarfile.open(mode="w", fileobj=memory_file, format=tarfile.GNU_FORMAT) as tar: + tar_info = tarfile.TarInfo("abcdef" * 20) + tar_info.type = tarfile.DIRTYPE + tar.addfile(tar_info, None) + tar.close() + + class RawTabInfo(tarfile.TarInfo): + + def _proc_member(self, tar_file): + if self.type in (tarfile.GNUTYPE_LONGNAME, tarfile.GNUTYPE_LONGLINK): + tester.assertEqual(self.mode, 0o644) + tester.assertEqual(self.uname, RawTabInfo._name_uid0) + tester.assertEqual(self.gname, RawTabInfo._name_gid0) + return super()._proc_member(tar_file) + + tester = self + memory_file.seek(0) + with tarfile.open(fileobj=memory_file, mode="r", tarinfo=RawTabInfo) as tar: + members = tar.getmembers() + self.assertEqual(len(members), 1) + class DeviceHeaderTest(WriteTestBase, unittest.TestCase): diff --git a/Misc/ACKS b/Misc/ACKS index 610dcf9f4238de..1b2a11e1ba4de9 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -665,6 +665,7 @@ Mikhail Golubev Marta Gómez Macías Guilherme Gonçalves Tiago Gonçalves +Dahan Gong Chris Gonnerman Shelley Gooch David Goodger diff --git a/Misc/NEWS.d/next/Library/2025-03-04-03-14-44.gh-issue-130819.Dphgb6.rst b/Misc/NEWS.d/next/Library/2025-03-04-03-14-44.gh-issue-130819.Dphgb6.rst new file mode 100644 index 00000000000000..df665b2b4fcd07 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-03-04-03-14-44.gh-issue-130819.Dphgb6.rst @@ -0,0 +1,3 @@ +Emit ``mode``, ``uname`` and ``gname`` fields for long paths in +:mod:`tarfile` archives, providing better bit-for-bit compatibility with GNU +``tar(1)``.