Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change URI verification in command line tools #4603

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
18 changes: 13 additions & 5 deletions scrapy/commands/fetch.py
@@ -1,25 +1,26 @@
import sys
from w3lib.url import is_url

from scrapy.commands import ScrapyCommand
from scrapy.http import Request
from scrapy.exceptions import UsageError
from scrapy.utils.datatypes import SequenceExclude
from scrapy.utils.spider import spidercls_for_request, DefaultSpider
from scrapy.utils.url import is_uri
from scrapy.utils.project import get_project_settings


class Command(ScrapyCommand):

requires_project = False

def syntax(self):
    """Return the command-line usage pattern shown in help output."""
    return "[options] <uri>"

def short_desc(self):
    """Return the one-line description listed by ``scrapy -h``."""
    return "Fetch a URI using the Scrapy downloader"

def long_desc(self):
    """Return the full description shown by ``scrapy fetch -h``."""
    return "Fetch a URI using the Scrapy downloader and print its content " \
           "to stdout. You may want to use --nolog to disable logging"

def add_options(self, parser):
Expand Down Expand Up @@ -47,8 +48,15 @@ def _print_bytes(self, bytes_):
sys.stdout.buffer.write(bytes_ + b'\n')

def run(self, args, opts):
if len(args) != 1 or not is_url(args[0]):
if len(args) != 1:
raise UsageError()

settings = get_project_settings()
supported_protocols = settings.getdict('DOWNLOAD_HANDLERS_BASE').keys()
supported_protocols |= settings.getdict('DOWNLOAD_HANDLERS').keys()
Comment on lines +55 to +56
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See settings.getwithbase. Using that, extracting this code into a function is less important, as you could simplify the line below as:

        if not is_uri(args[0], settings.getwithbase('DOWNLOAD_HANDLERS')):

if not is_uri(args[0], supported_protocols):
raise UsageError()

request = Request(args[0], callback=self._print_response,
cb_kwargs={"opts": opts}, dont_filter=True)
# by default, let the framework handle redirects,
Expand Down
19 changes: 12 additions & 7 deletions scrapy/commands/parse.py
@@ -1,14 +1,14 @@
import json
import logging

from w3lib.url import is_url

from scrapy.commands import ScrapyCommand
from scrapy.http import Request
from scrapy.item import _BaseItem
from scrapy.utils import display
from scrapy.utils.conf import arglist_to_dict
from scrapy.utils.spider import iterate_spider_output, spidercls_for_request
from scrapy.utils.url import is_uri
from scrapy.utils.project import get_project_settings
from scrapy.exceptions import UsageError

logger = logging.getLogger(__name__)
Expand All @@ -25,10 +25,10 @@ class Command(ScrapyCommand):
first_response = None

def syntax(self):
    """Return the command-line usage pattern shown in help output."""
    return "[options] <uri>"

def short_desc(self):
    """Return the one-line description listed by ``scrapy -h``."""
    return "Parse URI (using its spider) and print the results"

def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
Expand Down Expand Up @@ -251,10 +251,15 @@ def process_request_cb_kwargs(self, opts):

def run(self, args, opts):
# parse arguments
if not len(args) == 1 or not is_url(args[0]):
if not len(args) == 1:
raise UsageError()
else:
url = args[0]

settings = get_project_settings()
supported_protocols = settings.getdict('DOWNLOAD_HANDLERS_BASE').keys()
supported_protocols |= settings.getdict('DOWNLOAD_HANDLERS').keys()
if not is_uri(args[0], supported_protocols):
raise UsageError()
url = args[0]

# prepare spidercls
self.set_spidercls(url, opts)
Expand Down
12 changes: 12 additions & 0 deletions scrapy/utils/url.py
Expand Up @@ -133,3 +133,15 @@ def strip_url(url, strip_credentials=True, strip_default_port=True, origin_only=
'' if origin_only else parsed_url.query,
'' if strip_fragment else parsed_url.fragment
))


def is_uri(text, protocols=frozenset({'http', 'https', 'file'})):
    """Return True if *text* is a generic URI with an accepted scheme.

    :param text: string to test.
    :param protocols: collection of acceptable scheme names (compared
        lowercase), or ``None`` to accept any syntactically valid scheme.
        The default is immutable (frozenset) to avoid the shared
        mutable-default-argument pitfall.
    """
    # Generic URI shape per RFC 3986: scheme ':' [ '//' authority ]
    # path [ '?' query ] [ '#' fragment ].  fullmatch() anchors both
    # ends, so no explicit ^/$ anchors are needed; the re module caches
    # the compiled pattern internally.
    match = re.fullmatch(
        r'(([a-zA-Z][-+.\w]*):)(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?',
        text,
    )
    if match is None:
        return False
    # group(2) is the scheme without the trailing colon.
    scheme = match.group(2).lower()
    return protocols is None or scheme in protocols
14 changes: 13 additions & 1 deletion tests/test_utils_url.py
Expand Up @@ -2,7 +2,8 @@

from scrapy.spiders import Spider
from scrapy.utils.url import (url_is_from_any_domain, url_is_from_spider,
add_http_if_no_scheme, guess_scheme, strip_url)
add_http_if_no_scheme, guess_scheme, strip_url,
is_uri)


__doctests__ = ['scrapy.utils.url']
Expand Down Expand Up @@ -75,6 +76,17 @@ class MySpider(Spider):
self.assertTrue(url_is_from_spider('http://www.example.net/some/page.html', MySpider))
self.assertFalse(url_is_from_spider('http://www.example.us/some/page.html', MySpider))

def test_is_uri(self):
    # is_uri() accepts an RFC 3986-style URI whose scheme is in the
    # default set (http/https/file) when no second argument is given.
    self.assertTrue(is_uri('http://example.com/some/page.html'))
    self.assertTrue(is_uri('file://some/path/'))
    # A caller-supplied scheme set replaces the default entirely.
    self.assertTrue(is_uri('s3://name/key', {'s3', 'asdf'}))
    self.assertFalse(is_uri('s3://name/key', {'not', 'on', 'the', 'list'}))
    # Scheme comparison is case-insensitive.
    self.assertTrue(is_uri('HTTPS://example.com'))
    # Userinfo, port, path, query and fragment are all allowed.
    self.assertTrue(is_uri('http://admin@example.com:123/some/path?query#fragment'))
    # Inputs without a syntactically valid scheme are rejected.
    self.assertFalse(is_uri('://no/scheme'))
    self.assertFalse(is_uri('#?$://weird/characters'))
    self.assertFalse(is_uri('some random text'))


class AddHttpIfNoScheme(unittest.TestCase):

Expand Down