Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change URI verification in command line tools #4603

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
18 changes: 13 additions & 5 deletions scrapy/commands/fetch.py
@@ -1,25 +1,26 @@
import sys
from w3lib.url import is_url

from scrapy.commands import ScrapyCommand
from scrapy.http import Request
from scrapy.exceptions import UsageError
from scrapy.utils.datatypes import SequenceExclude
from scrapy.utils.spider import spidercls_for_request, DefaultSpider
from scrapy.utils.url import is_uri
from scrapy.utils.project import get_project_settings


class Command(ScrapyCommand):

requires_project = False

def syntax(self):
    """Return the command-line usage pattern shown in help output."""
    return "[options] <uri>"

def short_desc(self):
    """Return the one-line description listed by ``scrapy -h``."""
    return "Fetch a URI using the Scrapy downloader"

def long_desc(self):
    """Return the full description shown by ``scrapy fetch -h``."""
    return "Fetch a URI using the Scrapy downloader and print its content " \
           "to stdout. You may want to use --nolog to disable logging"

def add_options(self, parser):
Expand Down Expand Up @@ -47,8 +48,15 @@ def _print_bytes(self, bytes_):
sys.stdout.buffer.write(bytes_ + b'\n')

def run(self, args, opts):
if len(args) != 1 or not is_url(args[0]):
if len(args) != 1:
raise UsageError()

settings = get_project_settings()
supported_protocols = settings.getdict('DOWNLOAD_HANDLERS_BASE').keys()
supported_protocols |= settings.getdict('DOWNLOAD_HANDLERS').keys()
Comment on lines +55 to +56
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See settings.getwithbase. Using that, extracting this code into a function is less important, as you could simplify the line below as:

        if not is_uri(args[0], settings.getwithbase('DOWNLOAD_HANDLERS')):

if not is_uri(args[0], supported_protocols):
raise UsageError()

request = Request(args[0], callback=self._print_response,
cb_kwargs={"opts": opts}, dont_filter=True)
# by default, let the framework handle redirects,
Expand Down
19 changes: 12 additions & 7 deletions scrapy/commands/parse.py
@@ -1,14 +1,14 @@
import json
import logging

from w3lib.url import is_url

from scrapy.commands import ScrapyCommand
from scrapy.http import Request
from scrapy.item import _BaseItem
from scrapy.utils import display
from scrapy.utils.conf import arglist_to_dict
from scrapy.utils.spider import iterate_spider_output, spidercls_for_request
from scrapy.utils.url import is_uri
from scrapy.utils.project import get_project_settings
from scrapy.exceptions import UsageError

logger = logging.getLogger(__name__)
Expand All @@ -25,10 +25,10 @@ class Command(ScrapyCommand):
first_response = None

def syntax(self):
    """Return the command-line usage pattern shown in help output."""
    return "[options] <uri>"

def short_desc(self):
    """Return the one-line description listed by ``scrapy -h``."""
    return "Parse URI (using its spider) and print the results"

def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
Expand Down Expand Up @@ -251,10 +251,15 @@ def process_request_cb_kwargs(self, opts):

def run(self, args, opts):
# parse arguments
if not len(args) == 1 or not is_url(args[0]):
if not len(args) == 1:
raise UsageError()
else:
url = args[0]

settings = get_project_settings()
supported_protocols = settings.getdict('DOWNLOAD_HANDLERS_BASE').keys()
supported_protocols |= settings.getdict('DOWNLOAD_HANDLERS').keys()
if not is_uri(args[0], supported_protocols):
raise UsageError()
url = args[0]

# prepare spidercls
self.set_spidercls(url, opts)
Expand Down
12 changes: 12 additions & 0 deletions scrapy/utils/url.py
Expand Up @@ -133,3 +133,15 @@ def strip_url(url, strip_credentials=True, strip_default_port=True, origin_only=
'' if origin_only else parsed_url.query,
'' if strip_fragment else parsed_url.fragment
))


def is_uri(text, protocols=frozenset({'http', 'https', 'file'})):
    """Return True if *text* is a generic URI with an accepted scheme.

    :param text: string to test.
    :param protocols: collection of acceptable scheme names (compared
        lowercase), or ``None`` to accept any syntactically valid scheme.
        The default is immutable (frozenset) to avoid the shared
        mutable-default-argument pitfall.
    """
    # Generic URI shape per RFC 3986: scheme ':' [ '//' authority ]
    # path [ '?' query ] [ '#' fragment ].  fullmatch() anchors both
    # ends, so no explicit ^/$ anchors are needed; the re module caches
    # the compiled pattern internally.
    match = re.fullmatch(
        r'(([a-zA-Z][-+.\w]*):)(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?',
        text,
    )
    if match is None:
        return False
    # group(2) is the scheme without the trailing colon.
    scheme = match.group(2).lower()
    return protocols is None or scheme in protocols
14 changes: 13 additions & 1 deletion tests/test_utils_url.py
Expand Up @@ -2,7 +2,8 @@

from scrapy.spiders import Spider
from scrapy.utils.url import (url_is_from_any_domain, url_is_from_spider,
add_http_if_no_scheme, guess_scheme, strip_url)
add_http_if_no_scheme, guess_scheme, strip_url,
is_uri)


__doctests__ = ['scrapy.utils.url']
Expand Down Expand Up @@ -75,6 +76,17 @@ class MySpider(Spider):
self.assertTrue(url_is_from_spider('http://www.example.net/some/page.html', MySpider))
self.assertFalse(url_is_from_spider('http://www.example.us/some/page.html', MySpider))

def test_is_uri(self):
    # is_uri() accepts an RFC 3986-style URI whose scheme is in the
    # default set (http/https/file) when no second argument is given.
    self.assertTrue(is_uri('http://example.com/some/page.html'))
    self.assertTrue(is_uri('file://some/path/'))
    # A caller-supplied scheme set replaces the default entirely.
    self.assertTrue(is_uri('s3://name/key', {'s3', 'asdf'}))
    self.assertFalse(is_uri('s3://name/key', {'not', 'on', 'the', 'list'}))
    # Scheme comparison is case-insensitive.
    self.assertTrue(is_uri('HTTPS://example.com'))
    # Userinfo, port, path, query and fragment are all allowed.
    self.assertTrue(is_uri('http://admin@example.com:123/some/path?query#fragment'))
    # Inputs without a syntactically valid scheme are rejected.
    self.assertFalse(is_uri('://no/scheme'))
    self.assertFalse(is_uri('#?$://weird/characters'))
    self.assertFalse(is_uri('some random text'))


class AddHttpIfNoScheme(unittest.TestCase):

Expand Down