Skip to content

Commit

Permalink
Support genspider with HTTPS (#5808)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexpdev committed Mar 14, 2023
1 parent 9411cf4 commit 101a0c3
Show file tree
Hide file tree
Showing 7 changed files with 74 additions and 16 deletions.
3 changes: 0 additions & 3 deletions docs/topics/commands.rst
Original file line number Diff line number Diff line change
Expand Up @@ -238,9 +238,6 @@ genspider

Create a new spider in the current folder or in the current project's ``spiders`` folder, if called from inside a project. The ``<name>`` parameter is set as the spider's ``name``, while ``<domain or URL>`` is used to generate the ``allowed_domains`` and ``start_urls`` spider's attributes.

.. note:: Even if an HTTPS URL is specified, the protocol used in
``start_urls`` is always HTTP. This is a known issue: :issue:`3553`.

Usage example::

$ scrapy genspider -l
Expand Down
16 changes: 13 additions & 3 deletions scrapy/commands/genspider.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,14 @@ def extract_domain(url):
return o.netloc


def verify_url_scheme(url):
    """Return *url* unchanged if it has a scheme, else prepend ``https://``.

    ``urlparse`` treats a bare "domain[/path]" as all-path (empty scheme and
    netloc), and mis-reads "domain:port" as scheme="domain" — in both cases
    the host must be re-parsed behind a ``//`` netloc marker so ``geturl()``
    can rebuild a proper absolute URL with an https default.
    """
    parsed = urlparse(url)
    # A missing netloc with either no scheme ("test.com/path") or a dotted
    # pseudo-scheme ("test.com:8080" — no real URL scheme contains a dot)
    # means the scheme was absent and the host was swallowed elsewhere.
    if parsed.netloc == "" and (parsed.scheme == "" or "." in parsed.scheme):
        parsed = urlparse("//" + url)._replace(scheme="https")
    return parsed.geturl()


class Command(ScrapyCommand):
requires_project = False
default_settings = {"LOG_ENABLED": False}
Expand Down Expand Up @@ -91,7 +99,7 @@ def run(self, args, opts):
raise UsageError()

name, url = args[0:2]
domain = extract_domain(url)
url = verify_url_scheme(url)
module = sanitize_module_name(name)

if self.settings.get("BOT_NAME") == module:
Expand All @@ -103,18 +111,20 @@ def run(self, args, opts):

template_file = self._find_template(opts.template)
if template_file:
self._genspider(module, name, domain, opts.template, template_file)
self._genspider(module, name, url, opts.template, template_file)
if opts.edit:
self.exitcode = os.system(f'scrapy edit "{name}"')

def _genspider(self, module, name, domain, template_name, template_file):
def _genspider(self, module, name, url, template_name, template_file):
"""Generate the spider module, based on the given template"""
capitalized_module = "".join(s.capitalize() for s in module.split("_"))
domain = extract_domain(url)
tvars = {
"project_name": self.settings.get("BOT_NAME"),
"ProjectName": string_camelcase(self.settings.get("BOT_NAME")),
"module": module,
"name": name,
"url": url,
"domain": domain,
"classname": f"{capitalized_module}Spider",
}
Expand Down
2 changes: 1 addition & 1 deletion scrapy/templates/spiders/basic.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import scrapy
class $classname(scrapy.Spider):
name = "$name"
allowed_domains = ["$domain"]
start_urls = ["http://$domain/"]
start_urls = ["$url"]

def parse(self, response):
pass
2 changes: 1 addition & 1 deletion scrapy/templates/spiders/crawl.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ from scrapy.spiders import CrawlSpider, Rule
class $classname(CrawlSpider):
name = "$name"
allowed_domains = ["$domain"]
start_urls = ["http://$domain/"]
start_urls = ["$url"]

rules = (Rule(LinkExtractor(allow=r"Items/"), callback="parse_item", follow=True),)

Expand Down
2 changes: 1 addition & 1 deletion scrapy/templates/spiders/csvfeed.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ from scrapy.spiders import CSVFeedSpider
class $classname(CSVFeedSpider):
name = "$name"
allowed_domains = ["$domain"]
start_urls = ["http://$domain/feed.csv"]
start_urls = ["$url"]
#headers = ["id", "name", "description", "image_link"]
#delimiter = "\t"

Expand Down
2 changes: 1 addition & 1 deletion scrapy/templates/spiders/xmlfeed.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ from scrapy.spiders import XMLFeedSpider
class $classname(XMLFeedSpider):
name = "$name"
allowed_domains = ["$domain"]
start_urls = ["http://$domain/feed.xml"]
start_urls = ["$url"]
iterator = "iternodes" # you can change this; see the docs
itertag = "item" # change it accordingly

Expand Down
63 changes: 57 additions & 6 deletions tests/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -541,21 +541,72 @@ def test_url(self, url="test.com", domain="test.com"):
).group(1),
)
self.assertEqual(
f"http://{domain}/",
f"https://{domain}",
self.find_in_file(
Path(self.proj_mod_path, "spiders", "test_name.py"),
r"start_urls\s*=\s*\[['\"](.+)['\"]\]",
).group(1),
)

def test_url_schema(self):
self.test_url("http://test.com", "test.com")
self.test_url("https://test.com", "test.com")

def test_url_path(self):
self.test_url("test.com/some/other/page", "test.com")
def test_template_start_urls(
    self, url="test.com", expected="https://test.com", template="basic"
):
    """Generate a spider from *template* for *url* and assert that the
    generated file's ``start_urls`` entry equals *expected*."""
    retcode = self.call("genspider", "-t", template, "--force", "test_name", url)
    self.assertEqual(0, retcode)
    spider_file = Path(self.proj_mod_path, "spiders", "test_name.py")
    match = self.find_in_file(
        spider_file, r"start_urls\s*=\s*\[['\"](.+)['\"]\]"
    )
    self.assertEqual(expected, match.group(1))

def test_genspider_basic_start_urls(self):
    """The basic template keeps an explicit scheme and defaults to https."""
    cases = [
        ("https://test.com", "https://test.com"),
        ("http://test.com", "http://test.com"),
        ("http://test.com/other/path", "http://test.com/other/path"),
        ("test.com/other/path", "https://test.com/other/path"),
    ]
    for url, expected in cases:
        self.test_template_start_urls(url, expected, "basic")

def test_url_schema_path(self):
self.test_url("https://test.com/some/other/page", "test.com")
def test_genspider_crawl_start_urls(self):
    """The crawl template keeps an explicit scheme and defaults to https."""
    cases = [
        ("https://test.com", "https://test.com"),
        ("http://test.com", "http://test.com"),
        ("http://test.com/other/path", "http://test.com/other/path"),
        ("test.com/other/path", "https://test.com/other/path"),
        ("test.com", "https://test.com"),
    ]
    for url, expected in cases:
        self.test_template_start_urls(url, expected, "crawl")

def test_genspider_xmlfeed_start_urls(self):
    """The xmlfeed template keeps an explicit scheme and defaults to https."""
    cases = [
        ("https://test.com/feed.xml", "https://test.com/feed.xml"),
        ("http://test.com/feed.xml", "http://test.com/feed.xml"),
        ("test.com/feed.xml", "https://test.com/feed.xml"),
    ]
    for url, expected in cases:
        self.test_template_start_urls(url, expected, "xmlfeed")

def test_genspider_csvfeed_start_urls(self):
    """The csvfeed template keeps an explicit scheme and defaults to https.

    The http case previously used a ``feed.xml`` URL — a copy-paste slip
    from the xmlfeed test; all three cases now exercise a ``.csv`` feed so
    the csvfeed template is tested consistently.
    """
    cases = [
        ("https://test.com/feed.csv", "https://test.com/feed.csv"),
        ("http://test.com/feed.csv", "http://test.com/feed.csv"),
        ("test.com/feed.csv", "https://test.com/feed.csv"),
    ]
    for url, expected in cases:
        self.test_template_start_urls(url, expected, "csvfeed")


class GenspiderStandaloneCommandTest(ProjectTest):
Expand Down

0 comments on commit 101a0c3

Please sign in to comment.