Add typing for scrapy/commands (#6268)
wRAR committed Mar 6, 2024
1 parent bf14935 commit 6ecc9e0
Showing 18 changed files with 357 additions and 187 deletions.
48 changes: 28 additions & 20 deletions scrapy/commands/__init__.py
@@ -3,61 +3,62 @@
 """

 import argparse
+import builtins
 import os
 from pathlib import Path
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Iterable, List, Optional

 from twisted.python import failure

-from scrapy.crawler import CrawlerProcess
+from scrapy.crawler import Crawler, CrawlerProcess
 from scrapy.exceptions import UsageError
 from scrapy.utils.conf import arglist_to_dict, feed_process_params_from_cli


 class ScrapyCommand:
-    requires_project = False
+    requires_project: bool = False
     crawler_process: Optional[CrawlerProcess] = None

     # default settings to be used for this command instead of global defaults
     default_settings: Dict[str, Any] = {}

-    exitcode = 0
+    exitcode: int = 0

     def __init__(self) -> None:
         self.settings: Any = None  # set in scrapy.cmdline

-    def set_crawler(self, crawler):
+    def set_crawler(self, crawler: Crawler) -> None:
         if hasattr(self, "_crawler"):
             raise RuntimeError("crawler already set")
-        self._crawler = crawler
+        self._crawler: Crawler = crawler

-    def syntax(self):
+    def syntax(self) -> str:
         """
         Command syntax (preferably one-line). Do not include command name.
         """
         return ""

-    def short_desc(self):
+    def short_desc(self) -> str:
         """
         A short description of the command
         """
         return ""

-    def long_desc(self):
+    def long_desc(self) -> str:
         """A long description of the command. Return short description when not
         available. It cannot contain newlines since contents will be formatted
         by optparser which removes newlines and wraps text.
         """
         return self.short_desc()

-    def help(self):
+    def help(self) -> str:
         """An extensive help for the command. It will be shown when using the
         "help" command. It can contain newlines since no post-formatting will
         be applied to its contents.
         """
         return self.long_desc()

-    def add_options(self, parser):
+    def add_options(self, parser: argparse.ArgumentParser) -> None:
         """
         Populate option parse with options available for this command
         """
@@ -92,7 +93,7 @@ def add_options(self, parser):
         )
         group.add_argument("--pdb", action="store_true", help="enable pdb on failure")

-    def process_options(self, args, opts):
+    def process_options(self, args: List[str], opts: argparse.Namespace) -> None:
         try:
             self.settings.setdict(arglist_to_dict(opts.set), priority="cmdline")
         except ValueError:
@@ -129,8 +130,8 @@ class BaseRunSpiderCommand(ScrapyCommand):
     Common class used to share functionality between the crawl, parse and runspider commands
     """

-    def add_options(self, parser):
-        ScrapyCommand.add_options(self, parser)
+    def add_options(self, parser: argparse.ArgumentParser) -> None:
+        super().add_options(parser)
         parser.add_argument(
             "-a",
             dest="spargs",
@@ -162,8 +163,8 @@ def add_options(self, parser):
             help="format to use for dumping items",
         )

-    def process_options(self, args, opts):
-        ScrapyCommand.process_options(self, args, opts)
+    def process_options(self, args: List[str], opts: argparse.Namespace) -> None:
+        super().process_options(args, opts)
         try:
             opts.spargs = arglist_to_dict(opts.spargs)
         except ValueError:
@@ -183,19 +184,26 @@ class ScrapyHelpFormatter(argparse.HelpFormatter):
     Help Formatter for scrapy command line help messages.
     """

-    def __init__(self, prog, indent_increment=2, max_help_position=24, width=None):
+    def __init__(
+        self,
+        prog: str,
+        indent_increment: int = 2,
+        max_help_position: int = 24,
+        width: Optional[int] = None,
+    ):
         super().__init__(
             prog,
             indent_increment=indent_increment,
             max_help_position=max_help_position,
             width=width,
         )

-    def _join_parts(self, part_strings):
-        parts = self.format_part_strings(part_strings)
+    def _join_parts(self, part_strings: Iterable[str]) -> str:
+        # scrapy.commands.list shadows builtins.list
+        parts = self.format_part_strings(builtins.list(part_strings))
        return super()._join_parts(parts)

-    def format_part_strings(self, part_strings):
+    def format_part_strings(self, part_strings: List[str]) -> List[str]:
         """
         Underline and title case command line help message headers.
         """
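Note on the builtins.list workaround above: importing scrapy.commands.list (the module behind the "list" command) binds the name "list" in the scrapy.commands package namespace, shadowing the built-in type for code in __init__.py. A minimal sketch of the same situation, not Scrapy code, using hypothetical names (mypkg, to_list) and assuming a package that contains a module named list.py:

# mypkg/__init__.py -- hypothetical package with a submodule named list.py
import builtins

def to_list(items):
    # After "import mypkg.list" runs anywhere, the global name "list" in this
    # module refers to the submodule, so a bare list(items) would raise
    # TypeError: 'module' object is not callable. builtins.list is unambiguous.
    return builtins.list(items)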
18 changes: 12 additions & 6 deletions scrapy/commands/bench.py
@@ -1,10 +1,14 @@
+import argparse
 import subprocess  # nosec
 import sys
 import time
+from typing import Any, Iterable, List
 from urllib.parse import urlencode

 import scrapy
+from scrapy import Request
 from scrapy.commands import ScrapyCommand
+from scrapy.http import Response
 from scrapy.linkextractors import LinkExtractor


@@ -15,26 +19,28 @@ class Command(ScrapyCommand):
         "CLOSESPIDER_TIMEOUT": 10,
     }

-    def short_desc(self):
+    def short_desc(self) -> str:
         return "Run quick benchmark test"

-    def run(self, args, opts):
+    def run(self, args: List[str], opts: argparse.Namespace) -> None:
         with _BenchServer():
+            assert self.crawler_process
             self.crawler_process.crawl(_BenchSpider, total=100000)
             self.crawler_process.start()


 class _BenchServer:
-    def __enter__(self):
+    def __enter__(self) -> None:
         from scrapy.utils.test import get_testenv

         pargs = [sys.executable, "-u", "-m", "scrapy.utils.benchserver"]
         self.proc = subprocess.Popen(
             pargs, stdout=subprocess.PIPE, env=get_testenv()
         )  # nosec
+        assert self.proc.stdout
         self.proc.stdout.readline()

-    def __exit__(self, exc_type, exc_value, traceback):
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
         self.proc.kill()
         self.proc.wait()
         time.sleep(0.2)
@@ -49,11 +55,11 @@ class _BenchSpider(scrapy.Spider):
     baseurl = "http://localhost:8998"
     link_extractor = LinkExtractor()

-    def start_requests(self):
+    def start_requests(self) -> Iterable[Request]:
         qargs = {"total": self.total, "show": self.show}
         url = f"{self.baseurl}?{urlencode(qargs, doseq=True)}"
         return [scrapy.Request(url, dont_filter=True)]

-    def parse(self, response):
+    def parse(self, response: Response) -> Any:  # type: ignore[override]
         for link in self.link_extractor.extract_links(response):
             yield scrapy.Request(link.url, callback=self.parse)
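The "# type: ignore[override]" on _BenchSpider.parse is needed because narrowing a parameter type in an override violates the Liskov substitution principle, which mypy enforces. A minimal sketch of the pattern, not Scrapy code, with hypothetical names (Base, Derived):

from typing import Any

class Base:
    def parse(self, response: object) -> Any:
        return response

class Derived(Base):
    # Requiring a narrower parameter type than the base class breaks
    # substitutability (a Derived can't accept everything a Base can), so
    # mypy reports [override]; the targeted ignore records that the
    # mismatch is deliberate.
    def parse(self, response: str) -> Any:  # type: ignore[override]
        return response.upper()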
18 changes: 11 additions & 7 deletions scrapy/commands/check.py
@@ -1,5 +1,7 @@
+import argparse
 import time
 from collections import defaultdict
+from typing import List
 from unittest import TextTestResult as _TextTestResult
 from unittest import TextTestRunner

@@ -10,9 +12,10 @@


 class TextTestResult(_TextTestResult):
-    def printSummary(self, start, stop):
+    def printSummary(self, start: float, stop: float) -> None:
         write = self.stream.write
-        writeln = self.stream.writeln
+        # _WritelnDecorator isn't implemented in typeshed yet
+        writeln = self.stream.writeln  # type: ignore[attr-defined]

         run = self.testsRun
         plural = "s" if run != 1 else ""
@@ -42,14 +45,14 @@ class Command(ScrapyCommand):
     requires_project = True
     default_settings = {"LOG_ENABLED": False}

-    def syntax(self):
+    def syntax(self) -> str:
         return "[options] <spider>"

-    def short_desc(self):
+    def short_desc(self) -> str:
         return "Check spider contracts"

-    def add_options(self, parser):
-        ScrapyCommand.add_options(self, parser)
+    def add_options(self, parser: argparse.ArgumentParser) -> None:
+        super().add_options(parser)
         parser.add_argument(
             "-l",
             "--list",
@@ -66,7 +69,7 @@ def add_options(self, parser):
             help="print contract tests for all spiders",
         )

-    def run(self, args, opts):
+    def run(self, args: List[str], opts: argparse.Namespace) -> None:
         # load contracts
         contracts = build_component_list(self.settings.getwithbase("SPIDER_CONTRACTS"))
         conman = ContractsManager(load_object(c) for c in contracts)
@@ -76,6 +79,7 @@ def run(self, args, opts):
         # contract requests
         contract_reqs = defaultdict(list)

+        assert self.crawler_process
         spider_loader = self.crawler_process.spider_loader

         with set_environ(SCRAPY_CHECK="true"):
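The "assert self.crawler_process" lines added throughout this commit follow a common mypy pattern: crawler_process is declared Optional[CrawlerProcess] on ScrapyCommand because it is assigned later by scrapy.cmdline, so each use site narrows it before attribute access. A minimal sketch, not Scrapy code, with hypothetical names (Service, worker):

from typing import Optional

class Service:
    # Assigned by external bootstrap code after construction, hence Optional.
    worker: Optional[str] = None

    def run(self) -> str:
        # Without the assert, mypy reports: Item "None" of "Optional[str]"
        # has no attribute "upper". The assert narrows the type to str.
        assert self.worker is not None
        return self.worker.upper()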
14 changes: 10 additions & 4 deletions scrapy/commands/crawl.py
@@ -1,17 +1,22 @@
+import argparse
+from typing import List, cast
+
+from twisted.python.failure import Failure
+
 from scrapy.commands import BaseRunSpiderCommand
 from scrapy.exceptions import UsageError


 class Command(BaseRunSpiderCommand):
     requires_project = True

-    def syntax(self):
+    def syntax(self) -> str:
         return "[options] <spider>"

-    def short_desc(self):
+    def short_desc(self) -> str:
         return "Run a spider"

-    def run(self, args, opts):
+    def run(self, args: List[str], opts: argparse.Namespace) -> None:
         if len(args) < 1:
             raise UsageError()
         elif len(args) > 1:
@@ -20,10 +25,11 @@ def run(self, args, opts):
             )
         spname = args[0]

+        assert self.crawler_process
         crawl_defer = self.crawler_process.crawl(spname, **opts.spargs)

         if getattr(crawl_defer, "result", None) is not None and issubclass(
-            crawl_defer.result.type, Exception
+            cast(Failure, crawl_defer.result).type, Exception
         ):
             self.exitcode = 1
         else:
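The cast(Failure, ...) above tells the checker the concrete type of a value it only knows loosely, once the getattr() guard has established it is set at runtime. typing.cast performs no conversion or validation. A minimal sketch, not the Twisted API, with hypothetical names (Box, unwrap):

from typing import Optional, cast

class Box:
    result: Optional[object] = None

def unwrap(box: Box) -> str:
    if getattr(box, "result", None) is not None:
        # The guard proves result is set at runtime; cast() only informs the
        # type checker, with zero runtime cost or checking.
        return cast(str, box.result)
    return ""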
14 changes: 9 additions & 5 deletions scrapy/commands/edit.py
@@ -1,5 +1,7 @@
+import argparse
 import os
 import sys
+from typing import List

 from scrapy.commands import ScrapyCommand
 from scrapy.exceptions import UsageError
@@ -9,32 +11,34 @@ class Command(ScrapyCommand):
     requires_project = True
     default_settings = {"LOG_ENABLED": False}

-    def syntax(self):
+    def syntax(self) -> str:
         return "<spider>"

-    def short_desc(self):
+    def short_desc(self) -> str:
         return "Edit spider"

-    def long_desc(self):
+    def long_desc(self) -> str:
         return (
             "Edit a spider using the editor defined in the EDITOR environment"
             " variable or else the EDITOR setting"
         )

-    def _err(self, msg):
+    def _err(self, msg: str) -> None:
         sys.stderr.write(msg + os.linesep)
         self.exitcode = 1

-    def run(self, args, opts):
+    def run(self, args: List[str], opts: argparse.Namespace) -> None:
         if len(args) != 1:
             raise UsageError()

         editor = self.settings["EDITOR"]
+        assert self.crawler_process
         try:
             spidercls = self.crawler_process.spider_loader.load(args[0])
         except KeyError:
             return self._err(f"Spider not found: {args[0]}")

         sfile = sys.modules[spidercls.__module__].__file__
+        assert sfile
         sfile = sfile.replace(".pyc", ".py")
         self.exitcode = os.system(f'{editor} "{sfile}"')  # nosec
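The "assert sfile" narrowing is needed because a module's __file__ attribute is typed Optional[str]: it can be None for modules without a source file on disk, such as namespace packages. A minimal sketch of the pattern with a hypothetical helper (module_source):

import sys

def module_source(name: str) -> str:
    sfile = sys.modules[name].__file__
    # __file__ is Optional[str]; the assert narrows it to str for the checker
    # and fails fast if the module has no file on disk.
    assert sfile
    return sfile.replace(".pyc", ".py")

print(module_source("os"))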
