Skip to content

Commit

Permalink
Support genspider with HTTPS (#5808)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexpdev committed Mar 14, 2023
1 parent 9411cf4 commit 101a0c3
Show file tree
Hide file tree
Showing 7 changed files with 74 additions and 16 deletions.
3 changes: 0 additions & 3 deletions docs/topics/commands.rst
Original file line number Diff line number Diff line change
Expand Up @@ -238,9 +238,6 @@ genspider

Create a new spider in the current folder or in the current project's ``spiders`` folder, if called from inside a project. The ``<name>`` parameter is set as the spider's ``name``, while ``<domain or URL>`` is used to generate the ``allowed_domains`` and ``start_urls`` spider's attributes.

.. note:: Even if an HTTPS URL is specified, the protocol used in
``start_urls`` is always HTTP. This is a known issue: :issue:`3553`.

Usage example::

$ scrapy genspider -l
Expand Down
16 changes: 13 additions & 3 deletions scrapy/commands/genspider.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,14 @@ def extract_domain(url):
return o.netloc


def verify_url_scheme(url):
    """Return *url* unchanged if it has a scheme, else prepend ``https://``.

    ``urlparse`` treats a bare "domain[/path]" as all-path (empty scheme and
    netloc), and mis-reads "domain:port" as scheme="domain" — in both cases
    the host must be re-parsed behind a ``//`` netloc marker so ``geturl()``
    can rebuild a proper absolute URL with an https default.
    """
    parsed = urlparse(url)
    # A missing netloc with either no scheme ("test.com/path") or a dotted
    # pseudo-scheme ("test.com:8080" — no real URL scheme contains a dot)
    # means the scheme was absent and the host was swallowed elsewhere.
    if parsed.netloc == "" and (parsed.scheme == "" or "." in parsed.scheme):
        parsed = urlparse("//" + url)._replace(scheme="https")
    return parsed.geturl()


class Command(ScrapyCommand):
requires_project = False
default_settings = {"LOG_ENABLED": False}
Expand Down Expand Up @@ -91,7 +99,7 @@ def run(self, args, opts):
raise UsageError()

name, url = args[0:2]
domain = extract_domain(url)
url = verify_url_scheme(url)
module = sanitize_module_name(name)

if self.settings.get("BOT_NAME") == module:
Expand All @@ -103,18 +111,20 @@ def run(self, args, opts):

template_file = self._find_template(opts.template)
if template_file:
self._genspider(module, name, domain, opts.template, template_file)
self._genspider(module, name, url, opts.template, template_file)
if opts.edit:
self.exitcode = os.system(f'scrapy edit "{name}"')

def _genspider(self, module, name, domain, template_name, template_file):
def _genspider(self, module, name, url, template_name, template_file):
"""Generate the spider module, based on the given template"""
capitalized_module = "".join(s.capitalize() for s in module.split("_"))
domain = extract_domain(url)
tvars = {
"project_name": self.settings.get("BOT_NAME"),
"ProjectName": string_camelcase(self.settings.get("BOT_NAME")),
"module": module,
"name": name,
"url": url,
"domain": domain,
"classname": f"{capitalized_module}Spider",
}
Expand Down
2 changes: 1 addition & 1 deletion scrapy/templates/spiders/basic.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import scrapy
class $classname(scrapy.Spider):
name = "$name"
allowed_domains = ["$domain"]
start_urls = ["http://$domain/"]
start_urls = ["$url"]

def parse(self, response):
pass
2 changes: 1 addition & 1 deletion scrapy/templates/spiders/crawl.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ from scrapy.spiders import CrawlSpider, Rule
class $classname(CrawlSpider):
name = "$name"
allowed_domains = ["$domain"]
start_urls = ["http://$domain/"]
start_urls = ["$url"]

rules = (Rule(LinkExtractor(allow=r"Items/"), callback="parse_item", follow=True),)

Expand Down
2 changes: 1 addition & 1 deletion scrapy/templates/spiders/csvfeed.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ from scrapy.spiders import CSVFeedSpider
class $classname(CSVFeedSpider):
name = "$name"
allowed_domains = ["$domain"]
start_urls = ["http://$domain/feed.csv"]
start_urls = ["$url"]
#headers = ["id", "name", "description", "image_link"]
#delimiter = "\t"

Expand Down
2 changes: 1 addition & 1 deletion scrapy/templates/spiders/xmlfeed.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ from scrapy.spiders import XMLFeedSpider
class $classname(XMLFeedSpider):
name = "$name"
allowed_domains = ["$domain"]
start_urls = ["http://$domain/feed.xml"]
start_urls = ["$url"]
iterator = "iternodes" # you can change this; see the docs
itertag = "item" # change it accordingly

Expand Down
63 changes: 57 additions & 6 deletions tests/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -541,21 +541,72 @@ def test_url(self, url="test.com", domain="test.com"):
).group(1),
)
self.assertEqual(
f"http://{domain}/",
f"https://{domain}",
self.find_in_file(
Path(self.proj_mod_path, "spiders", "test_name.py"),
r"start_urls\s*=\s*\[['\"](.+)['\"]\]",
).group(1),
)

def test_url_schema(self):
self.test_url("http://test.com", "test.com")
self.test_url("https://test.com", "test.com")

def test_url_path(self):
self.test_url("test.com/some/other/page", "test.com")
def test_template_start_urls(
    self, url="test.com", expected="https://test.com", template="basic"
):
    """Generate a spider from *template* for *url* and assert that the
    generated file's ``start_urls`` entry equals *expected*."""
    retcode = self.call("genspider", "-t", template, "--force", "test_name", url)
    self.assertEqual(0, retcode)
    spider_file = Path(self.proj_mod_path, "spiders", "test_name.py")
    match = self.find_in_file(
        spider_file, r"start_urls\s*=\s*\[['\"](.+)['\"]\]"
    )
    self.assertEqual(expected, match.group(1))

def test_genspider_basic_start_urls(self):
    """The basic template keeps an explicit scheme and defaults to https."""
    cases = [
        ("https://test.com", "https://test.com"),
        ("http://test.com", "http://test.com"),
        ("http://test.com/other/path", "http://test.com/other/path"),
        ("test.com/other/path", "https://test.com/other/path"),
    ]
    for url, expected in cases:
        self.test_template_start_urls(url, expected, "basic")

def test_url_schema_path(self):
self.test_url("https://test.com/some/other/page", "test.com")
def test_genspider_crawl_start_urls(self):
    """The crawl template keeps an explicit scheme and defaults to https."""
    cases = [
        ("https://test.com", "https://test.com"),
        ("http://test.com", "http://test.com"),
        ("http://test.com/other/path", "http://test.com/other/path"),
        ("test.com/other/path", "https://test.com/other/path"),
        ("test.com", "https://test.com"),
    ]
    for url, expected in cases:
        self.test_template_start_urls(url, expected, "crawl")

def test_genspider_xmlfeed_start_urls(self):
    """The xmlfeed template keeps an explicit scheme and defaults to https."""
    cases = [
        ("https://test.com/feed.xml", "https://test.com/feed.xml"),
        ("http://test.com/feed.xml", "http://test.com/feed.xml"),
        ("test.com/feed.xml", "https://test.com/feed.xml"),
    ]
    for url, expected in cases:
        self.test_template_start_urls(url, expected, "xmlfeed")

def test_genspider_csvfeed_start_urls(self):
    """The csvfeed template keeps an explicit scheme and defaults to https.

    The http case previously used a ``feed.xml`` URL — a copy-paste slip
    from the xmlfeed test; all three cases now exercise a ``.csv`` feed so
    the csvfeed template is tested consistently.
    """
    cases = [
        ("https://test.com/feed.csv", "https://test.com/feed.csv"),
        ("http://test.com/feed.csv", "http://test.com/feed.csv"),
        ("test.com/feed.csv", "https://test.com/feed.csv"),
    ]
    for url, expected in cases:
        self.test_template_start_urls(url, expected, "csvfeed")


class GenspiderStandaloneCommandTest(ProjectTest):
Expand Down

0 comments on commit 101a0c3

Please sign in to comment.