Skip to content

Commit

Permalink
Merge pull request #1166 from scrapy/spider-loader
Browse files Browse the repository at this point in the history
[MRG+1] rename SpiderManager to SpiderLoader
  • Loading branch information
dangra committed Apr 21, 2015
2 parents 06e1ca9 + ad587ea commit e034947
Show file tree
Hide file tree
Showing 25 changed files with 179 additions and 127 deletions.
20 changes: 10 additions & 10 deletions docs/topics/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -344,22 +344,22 @@ Settings API

Alias for a :meth:`~freeze` call in the object returned by :meth:`copy`

.. _topics-api-spidermanager:
.. _topics-api-spiderloader:

SpiderManager API
=================
SpiderLoader API
================

.. module:: scrapy.spidermanager
:synopsis: The spider manager
.. module:: scrapy.spiderloader
:synopsis: The spider loader

.. class:: SpiderManager
.. class:: SpiderLoader

This class is in charge of retrieving and handling the spider classes
defined across the project.

Custom spider managers can be employed by specifying their path in the
:setting:`SPIDER_MANAGER_CLASS` project setting. They must fully implement
the :class:`scrapy.interfaces.ISpiderManager` interface to guarantee an
Custom spider loaders can be employed by specifying their path in the
:setting:`SPIDER_LOADER_CLASS` project setting. They must fully implement
the :class:`scrapy.interfaces.ISpiderLoader` interface to guarantee an
errorless execution.

.. method:: from_settings(settings)
Expand Down Expand Up @@ -486,7 +486,7 @@ class (which they all inherit from).

Set the given value for the given key only if current value for the
same key is lower than value. If there is no current value for the
given key, the value is always set.
given key, the value is always set.

.. method:: min_value(key, value)

Expand Down
12 changes: 6 additions & 6 deletions docs/topics/settings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -853,15 +853,15 @@ A dict containing the scrapy contracts enabled by default in Scrapy. You should
never modify this setting in your project, modify :setting:`SPIDER_CONTRACTS`
instead. For more info see :ref:`topics-contracts`.

.. setting:: SPIDER_MANAGER_CLASS
.. setting:: SPIDER_LOADER_CLASS

SPIDER_MANAGER_CLASS
--------------------
SPIDER_LOADER_CLASS
-------------------

Default: ``'scrapy.spidermanager.SpiderManager'``
Default: ``'scrapy.spiderloader.SpiderLoader'``

The class that will be used for handling spiders, which must implement the
:ref:`topics-api-spidermanager`.
The class that will be used for loading spiders, which must implement the
:ref:`topics-api-spiderloader`.

.. setting:: SPIDER_MIDDLEWARES

Expand Down
6 changes: 3 additions & 3 deletions scrapy/commands/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,10 @@ def run(self, args, opts):
# contract requests
contract_reqs = defaultdict(list)

spiders = self.crawler_process.spiders
spider_loader = self.crawler_process.spider_loader

for spidername in args or spiders.list():
spidercls = spiders.load(spidername)
for spidername in args or spider_loader.list():
spidercls = spider_loader.load(spidername)
spidercls.start_requests = lambda s: conman.from_spider(s, result)

tested_methods = conman.tested_methods_from_spidercls(spidercls)
Expand Down
2 changes: 1 addition & 1 deletion scrapy/commands/edit.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def run(self, args, opts):

editor = self.settings['EDITOR']
try:
spidercls = self.crawler_process.spiders.load(args[0])
spidercls = self.crawler_process.spider_loader.load(args[0])
except KeyError:
return self._err("Spider not found: %s" % args[0])

Expand Down
6 changes: 3 additions & 3 deletions scrapy/commands/fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,10 @@ def run(self, args, opts):
request.meta['handle_httpstatus_all'] = True

spidercls = DefaultSpider
spiders = self.crawler_process.spiders
spider_loader = self.crawler_process.spider_loader
if opts.spider:
spidercls = spiders.load(opts.spider)
spidercls = spider_loader.load(opts.spider)
else:
spidercls = spidercls_for_request(spiders, request, spidercls)
spidercls = spidercls_for_request(spider_loader, request, spidercls)
self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
self.crawler_process.start()
2 changes: 1 addition & 1 deletion scrapy/commands/genspider.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def run(self, args, opts):
return

try:
spidercls = self.crawler_process.spiders.load(name)
spidercls = self.crawler_process.spider_loader.load(name)
except KeyError:
pass
else:
Expand Down
2 changes: 1 addition & 1 deletion scrapy/commands/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,5 @@ def short_desc(self):
return "List available spiders"

def run(self, args, opts):
for s in sorted(self.crawler_process.spiders.list()):
for s in sorted(self.crawler_process.spider_loader.list()):
print(s)
6 changes: 3 additions & 3 deletions scrapy/commands/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,15 +124,15 @@ def get_callback_from_rules(self, spider, response):
level=log.ERROR, spider=spider.name)

def set_spidercls(self, url, opts):
spiders = self.crawler_process.spiders
spider_loader = self.crawler_process.spider_loader
if opts.spider:
try:
self.spidercls = spiders.load(opts.spider)
self.spidercls = spider_loader.load(opts.spider)
except KeyError:
log.msg(format='Unable to find spider: %(spider)s',
level=log.ERROR, spider=opts.spider)
else:
self.spidercls = spidercls_for_request(spiders, Request(url))
self.spidercls = spidercls_for_request(spider_loader, Request(url))
if not self.spidercls:
log.msg(format='Unable to find spider for: %(url)s',
level=log.ERROR, url=url)
Expand Down
6 changes: 3 additions & 3 deletions scrapy/commands/shell.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,13 @@ def update_vars(self, vars):

def run(self, args, opts):
url = args[0] if args else None
spiders = self.crawler_process.spiders
spider_loader = self.crawler_process.spider_loader

spidercls = DefaultSpider
if opts.spider:
spidercls = spiders.load(opts.spider)
spidercls = spider_loader.load(opts.spider)
elif url:
spidercls = spidercls_for_request(spiders, Request(url),
spidercls = spidercls_for_request(spider_loader, Request(url),
spidercls, log_multiple=True)

# The crawler is created this way since the Shell manually handles the
Expand Down
37 changes: 28 additions & 9 deletions scrapy/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from scrapy.core.engine import ExecutionEngine
from scrapy.resolver import CachingThreadedResolver
from scrapy.interfaces import ISpiderManager
from scrapy.interfaces import ISpiderLoader
from scrapy.extension import ExtensionManager
from scrapy.settings import Settings
from scrapy.signalmanager import SignalManager
Expand Down Expand Up @@ -43,12 +43,11 @@ def __init__(self, spidercls, settings):
def spiders(self):
if not hasattr(self, '_spiders'):
warnings.warn("Crawler.spiders is deprecated, use "
"CrawlerRunner.spiders or instantiate "
"scrapy.spidermanager.SpiderManager with your "
"CrawlerRunner.spider_loader or instantiate "
"scrapy.spiderloader.SpiderLoader with your "
"settings.",
category=ScrapyDeprecationWarning, stacklevel=2)
spman_cls = load_object(self.settings['SPIDER_MANAGER_CLASS'])
self._spiders = spman_cls.from_settings(self.settings)
self._spiders = _get_spider_loader(self.settings.frozencopy())
return self._spiders

@defer.inlineCallbacks
Expand Down Expand Up @@ -85,12 +84,17 @@ def __init__(self, settings):
if isinstance(settings, dict):
settings = Settings(settings)
self.settings = settings
smcls = load_object(settings['SPIDER_MANAGER_CLASS'])
verifyClass(ISpiderManager, smcls)
self.spiders = smcls.from_settings(settings.frozencopy())
self.spider_loader = _get_spider_loader(settings)
self.crawlers = set()
self._active = set()

@property
def spiders(self):
warnings.warn("CrawlerRunner.spiders attribute is renamed to "
"CrawlerRunner.spider_loader.",
category=ScrapyDeprecationWarning, stacklevel=2)
return self.spider_loader

def crawl(self, crawler_or_spidercls, *args, **kwargs):
crawler = crawler_or_spidercls
if not isinstance(crawler_or_spidercls, Crawler):
Expand All @@ -110,7 +114,7 @@ def _done(result):

def _create_crawler(self, spidercls):
if isinstance(spidercls, six.string_types):
spidercls = self.spiders.load(spidercls)
spidercls = self.spider_loader.load(spidercls)
return Crawler(spidercls, self.settings)

def _setup_crawler_logging(self, crawler):
Expand Down Expand Up @@ -178,3 +182,18 @@ def _stop_reactor(self, _=None):
reactor.stop()
except RuntimeError: # raised if already stopped or in shutdown stage
pass


def _get_spider_loader(settings):
    """Instantiate the spider loader configured in *settings*.

    Honors the deprecated ``SPIDER_MANAGER_CLASS`` option (with a
    deprecation warning) before falling back to ``SPIDER_LOADER_CLASS``.
    The resulting class is verified against the ``ISpiderLoader``
    interface and built via its ``from_settings`` factory.
    """
    if settings.get('SPIDER_MANAGER_CLASS'):
        warnings.warn(
            'SPIDER_MANAGER_CLASS option is deprecated. '
            'Please use SPIDER_LOADER_CLASS.',
            category=ScrapyDeprecationWarning, stacklevel=2
        )
    # Look up the deprecated key FIRST: SPIDER_LOADER_CLASS always has a
    # default value in default_settings, so the reverse order would make
    # a user-supplied SPIDER_MANAGER_CLASS silently ineffective.
    cls_path = settings.get('SPIDER_MANAGER_CLASS',
                            settings.get('SPIDER_LOADER_CLASS'))
    loader_cls = load_object(cls_path)
    verifyClass(ISpiderLoader, loader_cls)
    return loader_cls.from_settings(settings.frozencopy())
13 changes: 9 additions & 4 deletions scrapy/interfaces.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,22 @@
from zope.interface import Interface

class ISpiderManager(Interface):
class ISpiderLoader(Interface):

def from_settings(settings):
"""Returns an instance of the class for the given settings"""
"""Return an instance of the class for the given settings"""

def load(spider_name):
"""Returns the Spider class for the given spider name. If the spider
"""Return the Spider class for the given spider name. If the spider
name is not found, it must raise a KeyError."""

def list():
"""Return a list with the names of all spiders available in the
project"""

def find_by_request(request):
"""Returns the list of spiders names that can handle the given request"""
"""Return the list of spiders names that can handle the given request"""


# ISpiderManager is deprecated, don't use it!
# An alias is kept for backwards compatibility.
ISpiderManager = ISpiderLoader
2 changes: 1 addition & 1 deletion scrapy/settings/default_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@
SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleLifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.LifoMemoryQueue'

SPIDER_MANAGER_CLASS = 'scrapy.spidermanager.SpiderManager'
SPIDER_LOADER_CLASS = 'scrapy.spiderloader.SpiderLoader'

SPIDER_MIDDLEWARES = {}

Expand Down
8 changes: 5 additions & 3 deletions scrapy/spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,9 @@ def __init__(self, message):
def __getattr__(self, name):
raise AttributeError(self.message)

spiders = ObsoleteClass("""
"from scrapy.spider import spiders" no longer works - use "from scrapy.spidermanager import SpiderManager" and instantiate it with your project settings"
""")
spiders = ObsoleteClass(
'"from scrapy.spider import spiders" no longer works - use '
'"from scrapy.spiderloader import SpiderLoader" and instantiate '
'it with your project settings"'
)

53 changes: 53 additions & 0 deletions scrapy/spiderloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import

from zope.interface import implementer

from scrapy.interfaces import ISpiderLoader
from scrapy.utils.misc import walk_modules
from scrapy.utils.spider import iter_spider_classes


@implementer(ISpiderLoader)
class SpiderLoader(object):
    """Locate and load the spider classes defined in a Scrapy project.

    Walks every module listed in the ``SPIDER_MODULES`` setting and keeps
    an internal name -> class registry of the spiders found there.
    """

    def __init__(self, settings):
        self.spider_modules = settings.getlist('SPIDER_MODULES')
        self._spiders = {}
        for modname in self.spider_modules:
            for submodule in walk_modules(modname):
                self._load_spiders(submodule)

    def _load_spiders(self, module):
        # Register every spider class found in *module*, keyed by its name.
        self._spiders.update(
            (spcls.name, spcls) for spcls in iter_spider_classes(module))

    @classmethod
    def from_settings(cls, settings):
        """Build a loader instance from a Settings object."""
        return cls(settings)

    def load(self, spider_name):
        """Return the Spider class registered under *spider_name*.

        Raises KeyError when no such spider exists.
        """
        if spider_name not in self._spiders:
            raise KeyError("Spider not found: {}".format(spider_name))
        return self._spiders[spider_name]

    def find_by_request(self, request):
        """Return the names of all spiders able to handle *request*."""
        return [name for name, spcls in self._spiders.items()
                if spcls.handles_request(request)]

    def list(self):
        """Return the names of every spider available in the project."""
        return list(self._spiders.keys())
44 changes: 4 additions & 40 deletions scrapy/spidermanager.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,7 @@
"""
SpiderManager is the class which locates and manages all website-specific
spiders
Backwards compatibility shim. Use scrapy.spiderloader instead.
"""
from scrapy.spiderloader import SpiderLoader
from scrapy.utils.deprecate import create_deprecated_class

from zope.interface import implementer
import six

from scrapy.interfaces import ISpiderManager
from scrapy.utils.misc import walk_modules
from scrapy.utils.spider import iter_spider_classes


@implementer(ISpiderManager)
class SpiderManager(object):

def __init__(self, settings):
self.spider_modules = settings.getlist('SPIDER_MODULES')
self._spiders = {}
for name in self.spider_modules:
for module in walk_modules(name):
self._load_spiders(module)

def _load_spiders(self, module):
for spcls in iter_spider_classes(module):
self._spiders[spcls.name] = spcls

@classmethod
def from_settings(cls, settings):
return cls(settings)

def load(self, spider_name):
try:
return self._spiders[spider_name]
except KeyError:
raise KeyError("Spider not found: {}".format(spider_name))

def find_by_request(self, request):
return [name for name, cls in six.iteritems(self._spiders)
if cls.handles_request(request)]

def list(self):
return list(self._spiders.keys())
SpiderManager = create_deprecated_class('SpiderManager', SpiderLoader)
1 change: 0 additions & 1 deletion scrapy/telnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ def _get_telnet_vars(self):
'crawler': self.crawler,
'extensions': self.crawler.extensions,
'stats': self.crawler.stats,
'spiders': self.crawler.spiders,
'settings': self.crawler.settings,
'est': lambda: print_engine_status(self.crawler.engine),
'p': pprint.pprint,
Expand Down
Loading

0 comments on commit e034947

Please sign in to comment.