From 979e79e37d2f1d4f19cc8be7a26d7a6635974422 Mon Sep 17 00:00:00 2001 From: Thomas Pohl Date: Tue, 26 Oct 2021 07:09:44 +0200 Subject: [PATCH 1/7] bpo-45466: add download feature to urllib.request Similar to http.server, urllib.request now offers a command-line download feature: python -m urllib.request https://python.org/ --output file.html --- Doc/library/urllib.request.rst | 17 ++++++++++++ Lib/urllib/request.py | 27 +++++++++++++++++++ .../2021-10-26-07-02-51.bpo-45466.DOzSv2.rst | 3 +++ 3 files changed, 47 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2021-10-26-07-02-51.bpo-45466.DOzSv2.rst diff --git a/Doc/library/urllib.request.rst b/Doc/library/urllib.request.rst index 88e93ba6b002eb..15ba22b78f5c17 100644 --- a/Doc/library/urllib.request.rst +++ b/Doc/library/urllib.request.rst @@ -1354,6 +1354,23 @@ The following example uses no proxies at all, overriding environment settings:: ... f.read().decode('utf-8') ... +.. _urllib-request-cli: + +:mod:`urllib.request` can also be invoked directly using the :option:`-m` +switch of the interpreter with a ``URL`` argument:: + + python -m urllib.request https://python.org/ + +By default, the downloaded data is printed to stdout. The option ``-o/--output`` +specifies an output file where the downloaded data is stored instead of being +printed:: + + python -m urllib.request https://python.org/ --output python.html + +If the output file already exists, its content is overwritten. + +.. 
versionadded:: 3.11 + Legacy interface ---------------- diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index fd6fc36aee04b3..4e1f83ec228dda 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -2781,3 +2781,30 @@ def proxy_bypass(host): # By default use environment variables getproxies = getproxies_environment proxy_bypass = proxy_bypass_environment + + +if __name__ == "__main__": + from argparse import ArgumentParser + from sys import stdout + + parser = ArgumentParser( + description="Download the provided URL (FTP/HTTP/HTTPS supported) " + "and print it to stdout by default. If specified, write to OUTPUT " + "instead." + ) + parser.add_argument("URL", help="(encoded) URL to download") + parser.add_argument( + "-o", + "--output", + type=str, + help="write to OUTPUT instead of stdout" + ) + args = parser.parse_args() + out = stdout.buffer if args.output is None else open(args.output, "wb") + + with urlopen(args.URL) as response: + while data := response.read(1024 * 1024): + out.write(data) + + if out is not stdout.buffer: + out.close() diff --git a/Misc/NEWS.d/next/Library/2021-10-26-07-02-51.bpo-45466.DOzSv2.rst b/Misc/NEWS.d/next/Library/2021-10-26-07-02-51.bpo-45466.DOzSv2.rst new file mode 100644 index 00000000000000..4a221cecf14a4a --- /dev/null +++ b/Misc/NEWS.d/next/Library/2021-10-26-07-02-51.bpo-45466.DOzSv2.rst @@ -0,0 +1,3 @@ +The :mod:`urllib.request` module can download files (e.g. ``python -m +urrlib.request https://python.org/``). For more info, call ``python -m +urrlib.request -h``. Patch by Thomas Pohl. 
From 7c2f0ca0f0a2b076a681c073d6f3a078be811e3f Mon Sep 17 00:00:00 2001 From: Thomas Pohl Date: Tue, 26 Oct 2021 09:55:00 +0200 Subject: [PATCH 2/7] test: add tests for download in urllib.request --- Lib/test/test_urllib2_localnet.py | 35 +++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/Lib/test/test_urllib2_localnet.py b/Lib/test/test_urllib2_localnet.py index 36fb05d3db0e2a..4a9b56d72e6539 100644 --- a/Lib/test/test_urllib2_localnet.py +++ b/Lib/test/test_urllib2_localnet.py @@ -7,6 +7,9 @@ import threading import unittest import hashlib +import subprocess +import sys +import tempfile from test.support import hashlib_helper from test.support import threading_helper @@ -660,6 +663,38 @@ def test_line_iteration(self): (index, len(lines[index]), len(line))) self.assertEqual(index + 1, len(lines)) + def test_download_to_stdout(self): + content = b"My hovercraft is full of eels." + handler = self.start_server([(200, [], content)]) + proc = subprocess.run( + [sys.executable, "-m", "urllib.request", + "http://localhost:%s" % handler.port], + capture_output=True + ) + self.assertEqual(proc.stdout, content) + self.assertEqual(proc.stderr, b"") + + def test_download_to_file(self): + content = b"I will not buy this record; it is scratched." 
+ handler = self.start_server([(200, [], content)]*2) + + with tempfile.TemporaryDirectory() as directory: + for option in ["--output", "-o"]: + filename = os.path.join( + directory, "download-test%s.txt" % option + ) + proc = subprocess.run( + [sys.executable, "-m", "urllib.request", + "http://localhost:%s" % handler.port, + option, filename], + capture_output=True + ) + with open(filename, "rb") as f: + file_content = f.read() + self.assertEqual(proc.stdout, b"") + self.assertEqual(proc.stderr, b"") + self.assertEqual(file_content, content) + def setUpModule(): thread_info = threading_helper.threading_setup() From 33859f4a21e91566adceb9d7f3dc2613f37937c7 Mon Sep 17 00:00:00 2001 From: Thomas Pohl Date: Tue, 26 Oct 2021 10:23:13 +0200 Subject: [PATCH 3/7] chore: add my name to ACKS And some housekeeping. --- Misc/ACKS | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Misc/ACKS b/Misc/ACKS index 23c92abb4d02a7..0698028900605c 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -76,6 +76,7 @@ Jason Asbahr David Ascher Ammar Askar Neil Aspinall +Peter Åstrand Chris AtLee Aymeric Augustin Andres Ayala @@ -491,6 +492,7 @@ Daniel Ellis Phil Elson David Ely Victor van den Elzen +Vlad Emelianov Jeff Epler Tom Epperly Gökcen Eraslan @@ -1384,6 +1386,7 @@ Jean-François Piéronne Oleg Plakhotnyuk Anatoliy Platonov Marcel Plch +Thomas Pohl Remi Pointel Jon Poler Ariel Poliak @@ -1998,8 +2001,5 @@ Tarek Ziadé Jelle Zijlstra Gennadiy Zlobin Doug Zongker -Peter Åstrand -Vlad Emelianov -Andrey Doroschenko (Entries should be added in rough alphabetical order by last names) From 9cbf931c6f6c44c88e4c0d2f7464c81e93eccae5 Mon Sep 17 00:00:00 2001 From: Thomas Pohl Date: Wed, 27 Oct 2021 14:07:53 +0200 Subject: [PATCH 4/7] fix: typos in news blurb --- .../next/Library/2021-10-26-07-02-51.bpo-45466.DOzSv2.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2021-10-26-07-02-51.bpo-45466.DOzSv2.rst 
b/Misc/NEWS.d/next/Library/2021-10-26-07-02-51.bpo-45466.DOzSv2.rst index 4a221cecf14a4a..6f8f9cf2fda30b 100644 --- a/Misc/NEWS.d/next/Library/2021-10-26-07-02-51.bpo-45466.DOzSv2.rst +++ b/Misc/NEWS.d/next/Library/2021-10-26-07-02-51.bpo-45466.DOzSv2.rst @@ -1,3 +1,3 @@ -The :mod:`urllib.request` module can download files (e.g. ``python -m -urrlib.request https://python.org/``). For more info, call ``python -m -urrlib.request -h``. Patch by Thomas Pohl. +The :mod:`urllib.request` module can now download files (e.g. +``python -m urllib.request https://python.org/``). For more +info, see ``python -m urllib.request -h``. Patch by Thomas Pohl. From 7cb622745943e17b030fbe4d799c0cdff2af9064 Mon Sep 17 00:00:00 2001 From: Thomas Pohl Date: Mon, 1 Nov 2021 20:20:46 +0100 Subject: [PATCH 5/7] test: use f-strings --- Lib/test/test_urllib2_localnet.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_urllib2_localnet.py b/Lib/test/test_urllib2_localnet.py index 4a9b56d72e6539..8d408d93e2bf5e 100644 --- a/Lib/test/test_urllib2_localnet.py +++ b/Lib/test/test_urllib2_localnet.py @@ -668,7 +668,7 @@ def test_download_to_stdout(self): handler = self.start_server([(200, [], content)]) proc = subprocess.run( [sys.executable, "-m", "urllib.request", - "http://localhost:%s" % handler.port], + f"http://localhost:{handler.port}"], capture_output=True ) self.assertEqual(proc.stdout, content) @@ -681,11 +681,11 @@ def test_download_to_file(self): with tempfile.TemporaryDirectory() as directory: for option in ["--output", "-o"]: filename = os.path.join( - directory, "download-test%s.txt" % option + directory, f"download-test{option}.txt" ) proc = subprocess.run( [sys.executable, "-m", "urllib.request", - "http://localhost:%s" % handler.port, + f"http://localhost:{handler.port}", option, filename], capture_output=True ) From aefeb80032a87857c36a4fce875e74acfcc416ab Mon Sep 17 00:00:00 2001 From: Thomas Pohl Date: Mon, 1 Nov 2021 20:25:40 +0100 Subject: 
[PATCH 6/7] refactor: use helper function for download --- Lib/urllib/request.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index 4e1f83ec228dda..2ec0136c3ba921 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -2783,7 +2783,7 @@ def proxy_bypass(host): proxy_bypass = proxy_bypass_environment -if __name__ == "__main__": +def _download(): from argparse import ArgumentParser from sys import stdout @@ -2808,3 +2808,7 @@ def proxy_bypass(host): if out is not stdout.buffer: out.close() + + +if __name__ == "__main__": + _download() From 2c72f86e47f99bc1ba9713981f4de85aafc81ee7 Mon Sep 17 00:00:00 2001 From: Thomas Pohl Date: Mon, 1 Nov 2021 20:46:51 +0100 Subject: [PATCH 7/7] refactor: according to comments * use buffer to avoid buffer reallocations * catch URLError and print error output * use argparse file handling --- Lib/urllib/request.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index 2ec0136c3ba921..9dd6b12c52104b 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -2784,10 +2784,9 @@ def proxy_bypass(host): def _download(): - from argparse import ArgumentParser - from sys import stdout + import argparse - parser = ArgumentParser( + parser = argparse.ArgumentParser( description="Download the provided URL (FTP/HTTP/HTTPS supported) " "and print it to stdout by default. If specified, write to OUTPUT " "instead." 
@@ -2796,18 +2795,21 @@ def _download(): parser.add_argument( "-o", "--output", - type=str, + type=argparse.FileType('wb'), default=sys.stdout.buffer, help="write to OUTPUT instead of stdout" ) args = parser.parse_args() - out = stdout.buffer if args.output is None else open(args.output, "wb") - with urlopen(args.URL) as response: - while data := response.read(1024 * 1024): - out.write(data) - - if out is not stdout.buffer: - out.close() + buffer = memoryview(bytearray(32768)) + try: + with urlopen(args.URL) as response: + while n_bytes_read := response.readinto(buffer): + args.output.write(buffer[:n_bytes_read]) + except URLError as exc: + print(f"Error while downloading '{args.URL}': {exc.reason}") + + if args.output is not sys.stdout.buffer: + args.output.close() if __name__ == "__main__":