diff --git a/conftest.py b/conftest.py index 21f152e456b..783a5a2502b 100644 --- a/conftest.py +++ b/conftest.py @@ -1,6 +1,5 @@ import six import pytest -from twisted.python import log collect_ignore = ["scrapy/stats.py", "scrapy/project.py"] @@ -10,35 +9,6 @@ if len(file_path) > 0 and file_path[0] != '#': collect_ignore.append(file_path) -class LogObservers: - """Class for keeping track of log observers across test modules""" - - def __init__(self): - self.observers = [] - - def add(self, logfile='test.log'): - fileobj = open(logfile, 'wb') - observer = log.FileLogObserver(fileobj) - log.startLoggingWithObserver(observer.emit, 0) - self.observers.append((fileobj, observer)) - - def remove(self): - fileobj, observer = self.observers.pop() - log.removeObserver(observer.emit) - fileobj.close() - - -@pytest.fixture(scope='module') -def log_observers(): - return LogObservers() - - -@pytest.fixture() -def setlog(request, log_observers): - """Attach test.log file observer to twisted log, for trial compatibility""" - log_observers.add() - request.addfinalizer(log_observers.remove) - @pytest.fixture() def chdir(tmpdir): diff --git a/docs/index.rst b/docs/index.rst index 97ec9539d48..6a1571a699e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -119,7 +119,7 @@ Built-in services topics/webservice :doc:`topics/logging` - Understand the simple logging facility provided by Scrapy. + Learn how to use Python's builtin logging on Scrapy. :doc:`topics/stats` Collect statistics about your scraping crawler. diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst index ab2302154f5..479b17f99d1 100644 --- a/docs/intro/tutorial.rst +++ b/docs/intro/tutorial.rst @@ -154,15 +154,14 @@ similar to this:: 2014-01-23 18:13:07-0400 [scrapy] INFO: Enabled downloader middlewares: ... 2014-01-23 18:13:07-0400 [scrapy] INFO: Enabled spider middlewares: ... 2014-01-23 18:13:07-0400 [scrapy] INFO: Enabled item pipelines: ... - 2014-01-23 18:13:07-0400 [dmoz] INFO: Spider opened - 2014-01-23 18:13:08-0400 [dmoz] DEBUG: Crawled (200) (referer: None) - 2014-01-23 18:13:09-0400 [dmoz] DEBUG: Crawled (200) (referer: None) - 2014-01-23 18:13:09-0400 [dmoz] INFO: Closing spider (finished) + 2014-01-23 18:13:07-0400 [scrapy] INFO: Spider opened + 2014-01-23 18:13:08-0400 [scrapy] DEBUG: Crawled (200) (referer: None) + 2014-01-23 18:13:09-0400 [scrapy] DEBUG: Crawled (200) (referer: None) + 2014-01-23 18:13:09-0400 [scrapy] INFO: Closing spider (finished) .. note:: - The lines containing ``[dmoz]`` correspond to the log messages for - our spider. You can see a log line for each URL defined in ``start_urls``. + At the end you can see a log line for each URL defined in ``start_urls``. Because these URLs are the starting ones, they have no referrers, which is shown at the end of the log line, where it says ``(referer: None)``. @@ -268,7 +267,7 @@ This is what the shell looks like:: [ ... Scrapy log here ... ] - 2014-01-23 17:11:42-0400 [default] DEBUG: Crawled (200) (referer: None) + 2014-01-23 17:11:42-0400 [scrapy] DEBUG: Crawled (200) (referer: None) [s] Available Scrapy objects: [s] crawler [s] item {} @@ -424,11 +423,11 @@ Spider would be like this:: Now crawling dmoz.org yields ``DmozItem`` objects:: - [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> + [scrapy] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> {'desc': [u' - By David Mertz; Addison Wesley. Book in progress, full text, ASCII format. Asks for feedback. 
[author website, Gnosis Software, Inc.\n], 'link': [u'http://gnosis.cx/TPiP/'], 'title': [u'Text Processing in Python']} - [dmoz] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> + [scrapy] DEBUG: Scraped from <200 http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> {'desc': [u' - By Sean McGrath; Prentice Hall PTR, 2000, ISBN 0130211192, has CD-ROM. Methods to build XML applications fast, Python tutorial, DOM and SAX, new Pyxie open source XML processing library. [Prentice Hall PTR]\n'], 'link': [u'http://www.informit.com/store/product.aspx?isbn=0130211192'], 'title': [u'XML Processing with Python']} diff --git a/docs/topics/benchmarking.rst b/docs/topics/benchmarking.rst index 9ae6dd2cb20..632190067aa 100644 --- a/docs/topics/benchmarking.rst +++ b/docs/topics/benchmarking.rst @@ -19,20 +19,20 @@ To run it use:: You should see an output like this:: 2013-05-16 13:08:46-0300 [scrapy] INFO: Scrapy 0.17.0 started (bot: scrapybot) - 2013-05-16 13:08:47-0300 [follow] INFO: Spider opened - 2013-05-16 13:08:47-0300 [follow] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) - 2013-05-16 13:08:48-0300 [follow] INFO: Crawled 74 pages (at 4440 pages/min), scraped 0 items (at 0 items/min) - 2013-05-16 13:08:49-0300 [follow] INFO: Crawled 143 pages (at 4140 pages/min), scraped 0 items (at 0 items/min) - 2013-05-16 13:08:50-0300 [follow] INFO: Crawled 210 pages (at 4020 pages/min), scraped 0 items (at 0 items/min) - 2013-05-16 13:08:51-0300 [follow] INFO: Crawled 274 pages (at 3840 pages/min), scraped 0 items (at 0 items/min) - 2013-05-16 13:08:52-0300 [follow] INFO: Crawled 343 pages (at 4140 pages/min), scraped 0 items (at 0 items/min) - 2013-05-16 13:08:53-0300 [follow] INFO: Crawled 410 pages (at 4020 pages/min), scraped 0 items (at 0 items/min) - 2013-05-16 13:08:54-0300 [follow] INFO: Crawled 474 pages (at 3840 pages/min), scraped 0 items (at 0 items/min) - 2013-05-16 13:08:55-0300 [follow] INFO: Crawled 538 pages (at 3840 pages/min), scraped 0 items (at 0 items/min) - 2013-05-16 13:08:56-0300 [follow] INFO: Crawled 602 pages (at 3840 pages/min), scraped 0 items (at 0 items/min) - 2013-05-16 13:08:57-0300 [follow] INFO: Closing spider (closespider_timeout) - 2013-05-16 13:08:57-0300 [follow] INFO: Crawled 666 pages (at 3840 pages/min), scraped 0 items (at 0 items/min) - 2013-05-16 13:08:57-0300 [follow] INFO: Dumping Scrapy stats: + 2013-05-16 13:08:47-0300 [scrapy] INFO: Spider opened + 2013-05-16 13:08:47-0300 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) + 2013-05-16 13:08:48-0300 [scrapy] INFO: Crawled 74 pages (at 4440 pages/min), scraped 0 items (at 0 items/min) + 2013-05-16 13:08:49-0300 [scrapy] INFO: Crawled 143 pages (at 4140 pages/min), scraped 0 items (at 0 items/min) + 2013-05-16 13:08:50-0300 [scrapy] INFO: Crawled 210 pages (at 4020 pages/min), scraped 0 items (at 0 items/min) + 2013-05-16 13:08:51-0300 [scrapy] INFO: Crawled 274 pages (at 3840 pages/min), scraped 0 items (at 0 items/min) + 2013-05-16 13:08:52-0300 [scrapy] INFO: Crawled 343 pages (at 4140 pages/min), scraped 0 items (at 0 items/min) + 2013-05-16 13:08:53-0300 [scrapy] INFO: Crawled 410 pages (at 4020 pages/min), scraped 0 items (at 0 items/min) + 2013-05-16 13:08:54-0300 [scrapy] INFO: Crawled 474 pages (at 3840 pages/min), scraped 0 items (at 0 items/min) + 2013-05-16 13:08:55-0300 [scrapy] INFO: Crawled 538 pages (at 3840 pages/min), scraped 0 items (at 0 items/min) + 2013-05-16 13:08:56-0300 
[scrapy] INFO: Crawled 602 pages (at 3840 pages/min), scraped 0 items (at 0 items/min) + 2013-05-16 13:08:57-0300 [scrapy] INFO: Closing spider (closespider_timeout) + 2013-05-16 13:08:57-0300 [scrapy] INFO: Crawled 666 pages (at 3840 pages/min), scraped 0 items (at 0 items/min) + 2013-05-16 13:08:57-0300 [scrapy] INFO: Dumping Scrapy stats: {'downloader/request_bytes': 231508, 'downloader/request_count': 682, 'downloader/request_method_count/GET': 682, @@ -49,7 +49,7 @@ You should see an output like this:: 'scheduler/enqueued': 12767, 'scheduler/enqueued/memory': 12767, 'start_time': datetime.datetime(2013, 5, 16, 16, 8, 47, 676539)} - 2013-05-16 13:08:57-0300 [follow] INFO: Spider closed (closespider_timeout) + 2013-05-16 13:08:57-0300 [scrapy] INFO: Spider closed (closespider_timeout) That tells you that Scrapy is able to crawl about 3900 pages per minute in the hardware where you run it. Note that this is a very simple spider intended to diff --git a/docs/topics/debug.rst b/docs/topics/debug.rst index 5739d9e1b8f..a3e72097c39 100644 --- a/docs/topics/debug.rst +++ b/docs/topics/debug.rst @@ -132,16 +132,13 @@ Logging is another useful option for getting information about your spider run. Although not as convenient, it comes with the advantage that the logs will be available in all future runs should they be necessary again:: - from scrapy import log - def parse_details(self, response): item = response.meta.get('item', None) if item: # populate more `item` fields return item else: - self.log('No item received for %s' % response.url, - level=log.WARNING) + self.logger.warning('No item received for %s', response.url) For more information, check the :ref:`topics-logging` section. diff --git a/docs/topics/downloader-middleware.rst b/docs/topics/downloader-middleware.rst index df679daea1d..3d24b29ad2a 100644 --- a/docs/topics/downloader-middleware.rst +++ b/docs/topics/downloader-middleware.rst @@ -232,14 +232,14 @@ header) and all cookies received in responses (ie. ``Set-Cookie`` header). Here's an example of a log with :setting:`COOKIES_DEBUG` enabled:: - 2011-04-06 14:35:10-0300 [diningcity] INFO: Spider opened - 2011-04-06 14:35:10-0300 [diningcity] DEBUG: Sending cookies to: + 2011-04-06 14:35:10-0300 [scrapy] INFO: Spider opened + 2011-04-06 14:35:10-0300 [scrapy] DEBUG: Sending cookies to: Cookie: clientlanguage_nl=en_EN - 2011-04-06 14:35:14-0300 [diningcity] DEBUG: Received cookies from: <200 http://www.diningcity.com/netherlands/index.html> + 2011-04-06 14:35:14-0300 [scrapy] DEBUG: Received cookies from: <200 http://www.diningcity.com/netherlands/index.html> Set-Cookie: JSESSIONID=B~FA4DC0C496C8762AE4F1A620EAB34F38; Path=/ Set-Cookie: ip_isocode=US Set-Cookie: clientlanguage_nl=en_EN; Expires=Thu, 07-Apr-2011 21:21:34 GMT; Path=/ - 2011-04-06 14:49:50-0300 [diningcity] DEBUG: Crawled (200) (referer: None) + 2011-04-06 14:49:50-0300 [scrapy] DEBUG: Crawled (200) (referer: None) [...] diff --git a/docs/topics/extensions.rst b/docs/topics/extensions.rst index 8cd588c4af1..19c29665136 100644 --- a/docs/topics/extensions.rst +++ b/docs/topics/extensions.rst @@ -102,9 +102,12 @@ number of items will be specified through the ``MYEXT_ITEMCOUNT`` setting. 
Here is the code of such extension:: + import logging from scrapy import signals from scrapy.exceptions import NotConfigured + logger = logging.getLogger(__name__) + class SpiderOpenCloseLogging(object): def __init__(self, item_count): @@ -133,15 +136,15 @@ Here is the code of such extension:: return ext def spider_opened(self, spider): - spider.log("opened spider %s" % spider.name) + logger.info("opened spider %s", spider.name) def spider_closed(self, spider): - spider.log("closed spider %s" % spider.name) + logger.info("closed spider %s", spider.name) def item_scraped(self, item, spider): self.items_scraped += 1 if self.items_scraped % self.item_count == 0: - spider.log("scraped %d items" % self.items_scraped) + logger.info("scraped %d items", self.items_scraped) .. _topics-extensions-ref: diff --git a/docs/topics/logging.rst b/docs/topics/logging.rst index e9266cd6af2..2cb719998dd 100644 --- a/docs/topics/logging.rst +++ b/docs/topics/logging.rst @@ -4,126 +4,232 @@ Logging ======= -Scrapy provides a logging facility which can be used through the -:mod:`scrapy.log` module. The current underlying implementation uses `Twisted -logging`_ but this may change in the future. +.. note:: + :mod:`scrapy.log` has been deprecated alongside its functions in favor of + explicit calls to the Python standard logging. Keep reading to learn more + about the new logging system. -.. _Twisted logging: http://twistedmatrix.com/documents/current/core/howto/logging.html +Scrapy uses `Python's builtin logging system +`_ for event logging. We'll +provide some simple examples to get you started, but for more advanced +use cases it's strongly suggested that you read its documentation thoroughly. -The logging service must be explicitly started through the -:func:`scrapy.log.start` function to catch the top level Scrapy's log messages. -On top of that, each crawler has its own independent log observer -(automatically attached when it's created) that intercepts its spider's log -messages. +Logging works out of the box, and can be configured to some extent with the +Scrapy settings listed in :ref:`topics-logging-settings`. + +Scrapy calls :func:`scrapy.utils.log.configure_logging` to set some reasonable +defaults and handle those settings in :ref:`topics-logging-settings` when +running commands, so it's recommended to manually call it if you're running +Scrapy from scripts as described in :ref:`run-from-script`. .. _topics-logging-levels: Log levels ========== -Scrapy provides 5 logging levels: - -1. :data:`~scrapy.log.CRITICAL` - for critical errors -2. :data:`~scrapy.log.ERROR` - for regular errors -3. :data:`~scrapy.log.WARNING` - for warning messages -4. :data:`~scrapy.log.INFO` - for informational messages -5. :data:`~scrapy.log.DEBUG` - for debugging messages - -How to set the log level -======================== +Python's builtin logging defines 5 different levels to indicate the severity of a +given log message. Here are the standard ones, listed in decreasing order: -You can set the log level using the `--loglevel/-L` command line option, or -using the :setting:`LOG_LEVEL` setting. +1. ``logging.CRITICAL`` - for critical errors (highest severity) +2. ``logging.ERROR`` - for regular errors +3. ``logging.WARNING`` - for warning messages +4. ``logging.INFO`` - for informational messages +5. 
``logging.DEBUG`` - for debugging messages (lowest severity) How to log messages =================== -Here's a quick example of how to log a message using the ``WARNING`` level:: +Here's a quick example of how to log a message using the ``logging.WARNING`` +level:: - from scrapy import log - log.msg("This is a warning", level=log.WARNING) + import logging + logging.warning("This is a warning") -Logging from Spiders -==================== +There are shortcuts for issuing log messages on any of the standard 5 levels, +and there's also a general ``logging.log`` method which takes a given level as +argument. If needed, the last example could be rewritten as:: + + import logging + logging.log(logging.WARNING, "This is a warning") + +On top of that, you can create different "loggers" to encapsulate messages (for +example, a common practice is to create a different logger for every module). +These loggers can be configured independently, and they allow hierarchical +structures. + +The previous examples use the root logger behind the scenes, which is a top level +logger to which all messages are propagated (unless otherwise specified). Using +the ``logging`` helpers is merely a shortcut for getting the root logger +explicitly, so this is also equivalent to the previous snippets:: + + import logging + logger = logging.getLogger() + logger.warning("This is a warning") -The recommended way to log from spiders is by using the Spider -:meth:`~scrapy.spider.Spider.log` method, which already populates the -``spider`` argument of the :func:`scrapy.log.msg` function. The other arguments -are passed directly to the :func:`~scrapy.log.msg` function. +You can use a different logger just by retrieving it by name with the +``logging.getLogger`` function:: -scrapy.log module -================= + import logging + logger = logging.getLogger('mycustomlogger') + logger.warning("This is a warning") -.. module:: scrapy.log - :synopsis: Logging facility +Finally, you can ensure you have a custom logger for any module you're working on +by using the ``__name__`` variable, which is populated with the current module's +path:: -.. function:: start(logfile=None, loglevel=None, logstdout=None) + import logging + logger = logging.getLogger(__name__) + logger.warning("This is a warning") - Start the top level Scrapy logger. This must be called before actually - logging any top level messages (those logged using this module's - :func:`~scrapy.log.msg` function instead of the :meth:`Spider.log - ` method). Otherwise, messages logged before this - call will get lost. +.. seealso:: - :param logfile: the file path to use for logging output. If omitted, the - :setting:`LOG_FILE` setting will be used. If both are ``None``, the log - will be sent to standard error. - :type logfile: str + Module logging, `HowTo `_ + Basic Logging Tutorial - :param loglevel: the minimum logging level to log. Available values are: - :data:`CRITICAL`, :data:`ERROR`, :data:`WARNING`, :data:`INFO` and - :data:`DEBUG`. + Module logging, `Loggers `_ + Further documentation on loggers - :param logstdout: if ``True``, all standard output (and error) of your - application will be logged instead. For example if you "print 'hello'" - it will appear in the Scrapy log. If omitted, the :setting:`LOG_STDOUT` - setting will be used. - :type logstdout: boolean +.. _topics-logging-from-spiders: -.. 
function:: msg(message, level=INFO, spider=None) Logging from Spiders ==================== + +Scrapy provides a :data:`~scrapy.spider.Spider.logger` within each Spider +instance, which can be accessed and used like this:: - Log a message + import scrapy - :param message: the message to log - :type message: str + class MySpider(scrapy.Spider): - :param level: the log level for this message. See - :ref:`topics-logging-levels`. + name = 'myspider' + start_urls = ['http://scrapinghub.com'] - :param spider: the spider to use for logging this message. This parameter - should always be used when logging things related to a particular - spider. - :type spider: :class:`~scrapy.spider.Spider` object + def parse(self, response): + self.logger.info('Parse function called on %s', response.url) -.. data:: CRITICAL +That logger is created using the Spider's name, but you can use any custom +Python logger you want. For example:: - Log level for critical errors + import logging + import scrapy -.. data:: ERROR + logger = logging.getLogger('mycustomlogger') - Log level for errors + class MySpider(scrapy.Spider): -.. data:: WARNING + name = 'myspider' + start_urls = ['http://scrapinghub.com'] - Log level for warnings + def parse(self, response): + logger.info('Parse function called on %s', response.url) -.. data:: INFO +.. _topics-logging-configuration: - Log level for informational messages (recommended level for production - deployments) +Logging configuration +===================== -.. data:: DEBUG +Loggers on their own don't manage how messages sent through them are displayed. +For this task, different "handlers" can be attached to any logger instance and +they will redirect those messages to appropriate destinations, such as the +standard output, files, emails, etc. - Log level for debugging messages (recommended level for development) +By default, Scrapy sets and configures a handler for the root logger, based on +the settings below. + +.. _topics-logging-settings: Logging settings -================ +---------------- These settings can be used to configure the logging: +* :setting:`LOG_FILE` * :setting:`LOG_ENABLED` * :setting:`LOG_ENCODING` -* :setting:`LOG_FILE` * :setting:`LOG_LEVEL` +* :setting:`LOG_FORMAT` +* :setting:`LOG_DATEFORMAT` * :setting:`LOG_STDOUT` +The first couple of settings define a destination for log messages. If +:setting:`LOG_FILE` is set, messages sent through the root logger will be +redirected to a file named :setting:`LOG_FILE` with encoding +:setting:`LOG_ENCODING`. If unset and :setting:`LOG_ENABLED` is ``True``, log +messages will be displayed on the standard error. Lastly, if +:setting:`LOG_ENABLED` is ``False``, there won't be any visible log output. + +:setting:`LOG_LEVEL` determines the minimum level of severity to display; those +messages with lower severity will be filtered out. It ranges through the +possible levels listed in :ref:`topics-logging-levels`. + +:setting:`LOG_FORMAT` and :setting:`LOG_DATEFORMAT` specify formatting strings +used as layouts for all messages. Those strings can contain any placeholders +listed in `logging's logrecord attributes docs +`_ and +`datetime's strftime and strptime directives +`_ +respectively. + +Command-line options +-------------------- + +There are command-line arguments, available for all commands, that you can use +to override some of the Scrapy settings regarding logging. 
+ +* ``--logfile FILE`` + Overrides :setting:`LOG_FILE` +* ``--loglevel/-L LEVEL`` + Overrides :setting:`LOG_LEVEL` +* ``--nolog`` + Sets :setting:`LOG_ENABLED` to ``False`` + +.. seealso:: + + Module `logging.handlers `_ + Further documentation on available handlers + +scrapy.utils.log module +======================= + +.. module:: scrapy.utils.log + :synopsis: Logging utils + +.. function:: configure_logging(settings=None) + + This function initializes logging defaults for Scrapy. + + It's automatically called when using Scrapy commands, but not when + running custom scripts; in that case, calling it explicitly is + not strictly required, but it is recommended. + + This function does: + - Route warnings and Twisted logging through Python standard logging + - Set a filter on Scrapy logger for formatting Twisted failures + - Assign DEBUG and ERROR levels to Scrapy and Twisted loggers + respectively + + If `settings` is not ``None``, it will also create a root handler based on + the settings listed in :ref:`topics-logging-settings`. + + If you plan on configuring the handlers yourself, it is still recommended that you + call this function, keeping `settings` as ``None``. Bear in mind there + won't be any log output set by default in that case. + + To get you started on manually configuring logging's output, you can use + `logging.basicConfig()`_ to set a basic root handler. This is an example of + how to redirect ``INFO`` or higher messages to a file:: + + import logging + from scrapy.utils.log import configure_logging + + configure_logging() # Note we aren't providing settings in this case + logging.basicConfig(filename='log.txt', format='%(levelname)s: %(message)s', level=logging.INFO) + + Refer to :ref:`run-from-script` for more details about using Scrapy this + way. + + :param settings: settings used to create and configure a handler for the + root logger. + :type settings: :class:`~scrapy.settings.Settings` object or ``None`` + +.. _logging.basicConfig(): https://docs.python.org/2/library/logging.html#logging.basicConfig diff --git a/docs/topics/practices.rst b/docs/topics/practices.rst index 2f848e6d916..6bd74c794b1 100644 --- a/docs/topics/practices.rst +++ b/docs/topics/practices.rst @@ -30,9 +30,12 @@ project as example. from twisted.internet import reactor from scrapy.crawler import CrawlerRunner + from scrapy.utils.log import configure_logging from scrapy.utils.project import get_project_settings - runner = CrawlerRunner(get_project_settings()) + settings = get_project_settings() + configure_logging(settings) + runner = CrawlerRunner(settings) # 'followall' is the name of one of the spiders of the project. d = runner.crawl('followall', domain='scrapinghub.com') @@ -54,11 +57,13 @@ the spider class as first argument in the :meth:`CrawlerRunner.crawl from twisted.internet import reactor import scrapy from scrapy.crawler import CrawlerRunner + from scrapy.utils.log import configure_logging class MySpider(scrapy.Spider): # Your spider definition ... 
+ configure_logging() runner = CrawlerRunner({ 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)' }) @@ -85,9 +90,12 @@ Here is an example that runs multiple spiders simultaneously, using the from twisted.internet import reactor, defer from scrapy.crawler import CrawlerRunner + from scrapy.utils.log import configure_logging from scrapy.utils.project import get_project_settings - runner = CrawlerRunner(get_project_settings()) + settings = get_project_settings() + configure_logging(settings) + runner = CrawlerRunner(settings) dfs = set() for domain in ['scrapinghub.com', 'insophia.com']: d = runner.crawl('followall', domain=domain) @@ -102,9 +110,12 @@ Same example but running the spiders sequentially by chaining the deferreds: from twisted.internet import reactor, defer from scrapy.crawler import CrawlerRunner + from scrapy.utils.log import configure_logging from scrapy.utils.project import get_project_settings - runner = CrawlerRunner(get_project_settings()) + settings = get_project_settings() + configure_logging(settings) + runner = CrawlerRunner(settings) @defer.inlineCallbacks def crawl(): diff --git a/docs/topics/request-response.rst b/docs/topics/request-response.rst index 5b4ced99275..1d695a5f22b 100644 --- a/docs/topics/request-response.rst +++ b/docs/topics/request-response.rst @@ -189,7 +189,7 @@ Example:: def parse_page2(self, response): # this would log http://www.example.com/some_page.html - self.log("Visited %s" % response.url) + self.logger.info("Visited %s", response.url) In some cases you may be interested in passing arguments to those callback functions so you can receive the arguments later, in the second callback. You @@ -382,7 +382,7 @@ method for this job. Here's an example spider which uses it:: def after_login(self, response): # check login succeed before going on if "authentication failed" in response.body: - self.log("Login failed", level=log.ERROR) + self.logger.error("Login failed") return # continue scraping with authenticated session... diff --git a/docs/topics/settings.rst b/docs/topics/settings.rst index 7deb6e0c606..3936c04b859 100644 --- a/docs/topics/settings.rst +++ b/docs/topics/settings.rst @@ -612,6 +612,31 @@ Default: ``None`` File name to use for logging output. If None, standard error will be used. +.. setting:: LOG_FORMAT + +LOG_FORMAT +---------- + +Default: ``'%(asctime)s [%(name)s] %(levelname)s: %(message)s'`` + +String for formatting log messages. Refer to the `Python logging documentation`_ for the whole list of available +placeholders. + +.. _Python logging documentation: https://docs.python.org/2/library/logging.html#logrecord-attributes + +.. setting:: LOG_DATEFORMAT + +LOG_DATEFORMAT +-------------- + +Default: ``'%Y-%m-%d %H:%M:%S%z'`` + +String for formatting date/time, expansion of the ``%(asctime)s`` placeholder +in :setting:`LOG_FORMAT`. Refer to the `Python datetime documentation`_ for the whole list of available +directives. + +.. _Python datetime documentation: https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior + .. 
setting:: LOG_LEVEL LOG_LEVEL diff --git a/docs/topics/shell.rst b/docs/topics/shell.rst index 8fce0cea18a..3b875fec566 100644 --- a/docs/topics/shell.rst +++ b/docs/topics/shell.rst @@ -187,8 +187,8 @@ Here's an example of how you would call it from your spider:: When you run the spider, you will get something similar to this:: - 2014-01-23 17:48:31-0400 [myspider] DEBUG: Crawled (200) (referer: None) - 2014-01-23 17:48:31-0400 [myspider] DEBUG: Crawled (200) (referer: None) + 2014-01-23 17:48:31-0400 [scrapy] DEBUG: Crawled (200) (referer: None) + 2014-01-23 17:48:31-0400 [scrapy] DEBUG: Crawled (200) (referer: None) [s] Available Scrapy objects: [s] crawler ... @@ -211,7 +211,7 @@ Finally you hit Ctrl-D (or Ctrl-Z in Windows) to exit the shell and resume the crawling:: >>> ^D - 2014-01-23 17:50:03-0400 [myspider] DEBUG: Crawled (200) (referer: None) + 2014-01-23 17:50:03-0400 [scrapy] DEBUG: Crawled (200) (referer: None) ... Note that you can't use the ``fetch`` shortcut here since the Scrapy engine is diff --git a/docs/topics/spiders.rst b/docs/topics/spiders.rst index ddeb17d07be..7c7d5d731de 100644 --- a/docs/topics/spiders.rst +++ b/docs/topics/spiders.rst @@ -112,6 +112,12 @@ scrapy.Spider :class:`~scrapy.settings.Settings` instance, see the :ref:`topics-settings` topic for a detailed introduction on this subject. + .. attribute:: logger + + Python logger created with the Spider's :attr:`name`. You can use it to + send log messages through it as described on + :ref:`topics-logging-from-spiders`. + .. method:: from_crawler(crawler, \*args, \**kwargs) This is the class method used by Scrapy to create your spiders. @@ -194,9 +200,9 @@ scrapy.Spider .. method:: log(message, [level, component]) - Log a message using the :func:`scrapy.log.msg` function, automatically - populating the spider argument with the :attr:`name` of this - spider. For more information see :ref:`topics-logging`. + Wrapper that sends a log message through the Spider's :attr:`logger`, + kept for backwards compatibility. For more information see + :ref:`topics-logging-from-spiders`. .. method:: closed(reason) @@ -218,7 +224,7 @@ Let's see an example:: ] def parse(self, response): - self.log('A response from %s just arrived!' % response.url) + self.logger.info('A response from %s just arrived!', response.url) Return multiple Requests and items from a single callback:: @@ -406,7 +412,7 @@ Let's now take a look at an example CrawlSpider with rules:: ) def parse_item(self, response): - self.log('Hi, this is an item page! %s' % response.url) + self.logger.info('Hi, this is an item page! 
%s', response.url) item = scrapy.Item() item['id'] = response.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)') item['name'] = response.xpath('//td[@id="item_name"]/text()').extract() @@ -509,7 +515,6 @@ XMLFeedSpider example These spiders are pretty easy to use, let's have a look at one example:: - from scrapy import log from scrapy.contrib.spiders import XMLFeedSpider from myproject.items import TestItem @@ -521,7 +526,7 @@ These spiders are pretty easy to use, let's have a look at one example:: itertag = 'item' def parse_node(self, response, node): - log.msg('Hi, this is a <%s> node!: %s' % (self.itertag, ''.join(node.extract()))) + self.logger.info('Hi, this is a <%s> node!: %s', self.itertag, ''.join(node.extract())) item = TestItem() item['id'] = node.xpath('@id').extract() @@ -570,7 +575,6 @@ CSVFeedSpider example Let's see an example similar to the previous one, but using a :class:`CSVFeedSpider`:: - from scrapy import log from scrapy.contrib.spiders import CSVFeedSpider from myproject.items import TestItem @@ -583,7 +587,7 @@ Let's see an example similar to the previous one, but using a headers = ['id', 'name', 'description'] def parse_row(self, response, row): - log.msg('Hi, this is a row!: %r' % row) + self.logger.info('Hi, this is a row!: %r', row) item = TestItem() item['id'] = row['id'] diff --git a/pytest.ini b/pytest.ini index 355ee31c3ae..3046dfd8588 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,5 +1,5 @@ [pytest] -usefixtures = chdir setlog +usefixtures = chdir python_files=test_*.py __init__.py addopts = --doctest-modules --assert=plain twisted = 1 diff --git a/scrapy/commands/parse.py b/scrapy/commands/parse.py index 3e006ede39e..245b3fdf0cd 100644 --- a/scrapy/commands/parse.py +++ b/scrapy/commands/parse.py @@ -1,5 +1,8 @@ from __future__ import print_function +import logging + from w3lib.url import is_url + from scrapy.command import ScrapyCommand from scrapy.http import Request from scrapy.item import BaseItem @@ -7,7 +10,9 @@ from scrapy.utils.conf import arglist_to_dict from scrapy.utils.spider import iterate_spider_output, spidercls_for_request from scrapy.exceptions import UsageError -from scrapy import log + +logger = logging.getLogger(__name__) + class Command(ScrapyCommand): @@ -119,9 +124,9 @@ def get_callback_from_rules(self, spider, response): if rule.link_extractor.matches(response.url) and rule.callback: return rule.callback else: - log.msg(format='No CrawlSpider rules found in spider %(spider)r, ' - 'please specify a callback to use for parsing', - level=log.ERROR, spider=spider.name) + logger.error('No CrawlSpider rules found in spider %(spider)r, ' + 'please specify a callback to use for parsing', + {'spider': spider.name}) def set_spidercls(self, url, opts): spider_loader = self.crawler_process.spider_loader @@ -129,13 +134,13 @@ def set_spidercls(self, url, opts): try: self.spidercls = spider_loader.load(opts.spider) except KeyError: - log.msg(format='Unable to find spider: %(spider)s', - level=log.ERROR, spider=opts.spider) + logger.error('Unable to find spider: %(spider)s', + {'spider': opts.spider}) else: self.spidercls = spidercls_for_request(spider_loader, Request(url)) if not self.spidercls: - log.msg(format='Unable to find spider for: %(url)s', - level=log.ERROR, url=url) + logger.error('Unable to find spider for: %(url)s', + {'url': url}) request = Request(url, opts.callback) _start_requests = lambda s: [self.prepare_request(s, request, opts)] @@ -148,8 +153,8 @@ def start_parsing(self, url, opts): self.crawler_process.start() if not 
self.first_response: - log.msg(format='No response downloaded for: %(url)s', - level=log.ERROR, url=url) + logger.error('No response downloaded for: %(url)s', + {'url': url}) def prepare_request(self, spider, request, opts): def callback(response): @@ -170,8 +175,8 @@ def callback(response): if callable(cb_method): cb = cb_method else: - log.msg(format='Cannot find callback %(callback)r in spider: %(spider)s', - callback=callback, spider=spider.name, level=log.ERROR) + logger.error('Cannot find callback %(callback)r in spider: %(spider)s', + {'callback': callback, 'spider': spider.name}) return # parse items and requests diff --git a/scrapy/commands/shell.py b/scrapy/commands/shell.py index f8ad8a4911d..cf99865c475 100644 --- a/scrapy/commands/shell.py +++ b/scrapy/commands/shell.py @@ -9,7 +9,6 @@ from scrapy.command import ScrapyCommand from scrapy.shell import Shell from scrapy.http import Request -from scrapy import log from scrapy.utils.spider import spidercls_for_request, DefaultSpider @@ -54,7 +53,6 @@ def run(self, args, opts): # The crawler is created this way since the Shell manually handles the # crawling engine, so the set up in the crawl method won't work crawler = self.crawler_process._create_crawler(spidercls) - self.crawler_process._setup_crawler_logging(crawler) # The Shell class needs a persistent engine in the crawler crawler.engine = crawler._create_engine() crawler.engine.start() diff --git a/scrapy/contrib/debug.py b/scrapy/contrib/debug.py index 18a746d31fe..625e132499f 100644 --- a/scrapy/contrib/debug.py +++ b/scrapy/contrib/debug.py @@ -6,13 +6,15 @@ import sys import signal +import logging import traceback import threading from pdb import Pdb from scrapy.utils.engine import format_engine_status from scrapy.utils.trackref import format_live_refs -from scrapy import log + +logger = logging.getLogger(__name__) class StackTraceDump(object): @@ -31,12 +33,14 @@ def from_crawler(cls, crawler): return cls(crawler) def dump_stacktrace(self, signum, frame): - stackdumps = self._thread_stacks() - enginestatus = format_engine_status(self.crawler.engine) - liverefs = format_live_refs() - msg = "Dumping stack trace and engine status" \ - "\n{0}\n{1}\n{2}".format(enginestatus, liverefs, stackdumps) - log.msg(msg) + log_args = { + 'stackdumps': self._thread_stacks(), + 'enginestatus': format_engine_status(self.crawler.engine), + 'liverefs': format_live_refs(), + } + logger.info("Dumping stack trace and engine status\n" + "%(enginestatus)s\n%(liverefs)s\n%(stackdumps)s", + log_args, extra={'crawler': self.crawler}) def _thread_stacks(self): id2name = dict((th.ident, th.name) for th in threading.enumerate()) diff --git a/scrapy/contrib/downloadermiddleware/ajaxcrawl.py b/scrapy/contrib/downloadermiddleware/ajaxcrawl.py index 6c037169186..6b543b8230c 100644 --- a/scrapy/contrib/downloadermiddleware/ajaxcrawl.py +++ b/scrapy/contrib/downloadermiddleware/ajaxcrawl.py @@ -1,14 +1,19 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import import re +import logging + import six from w3lib import html -from scrapy import log + from scrapy.exceptions import NotConfigured from scrapy.http import HtmlResponse from scrapy.utils.response import _noscript_re, _script_re +logger = logging.getLogger(__name__) + + class AjaxCrawlMiddleware(object): """ Handle 'AJAX crawlable' pages marked as crawlable via meta tag. @@ -46,9 +51,9 @@ def process_response(self, request, response, spider): # scrapy already handles #! 
links properly ajax_crawl_request = request.replace(url=request.url+'#!') - log.msg(format="Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s", - level=log.DEBUG, spider=spider, - ajax_crawl_request=ajax_crawl_request, request=request) + logger.debug("Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s", + {'ajax_crawl_request': ajax_crawl_request, 'request': request}, + extra={'spider': spider}) ajax_crawl_request.meta['ajax_crawlable'] = True return ajax_crawl_request diff --git a/scrapy/contrib/downloadermiddleware/cookies.py b/scrapy/contrib/downloadermiddleware/cookies.py index 4b63b8112c5..270d621cd03 100644 --- a/scrapy/contrib/downloadermiddleware/cookies.py +++ b/scrapy/contrib/downloadermiddleware/cookies.py @@ -1,11 +1,13 @@ import os import six +import logging from collections import defaultdict from scrapy.exceptions import NotConfigured from scrapy.http import Response from scrapy.http.cookies import CookieJar -from scrapy import log + +logger = logging.getLogger(__name__) class CookiesMiddleware(object): @@ -54,7 +56,7 @@ def _debug_cookie(self, request, spider): if cl: msg = "Sending cookies to: %s" % request + os.linesep msg += os.linesep.join("Cookie: %s" % c for c in cl) - log.msg(msg, spider=spider, level=log.DEBUG) + logger.debug(msg, extra={'spider': spider}) def _debug_set_cookie(self, response, spider): if self.debug: @@ -62,7 +64,7 @@ def _debug_set_cookie(self, response, spider): if cl: msg = "Received cookies from: %s" % response + os.linesep msg += os.linesep.join("Set-Cookie: %s" % c for c in cl) - log.msg(msg, spider=spider, level=log.DEBUG) + logger.debug(msg, extra={'spider': spider}) def _format_cookie(self, cookie): # build cookie string diff --git a/scrapy/contrib/downloadermiddleware/decompression.py b/scrapy/contrib/downloadermiddleware/decompression.py index c08f50b5ff3..49313cc04b0 100644 --- a/scrapy/contrib/downloadermiddleware/decompression.py +++ b/scrapy/contrib/downloadermiddleware/decompression.py @@ -1,11 +1,12 @@ """ This module implements the DecompressionMiddleware which tries to recognise -and extract the potentially compressed responses that may arrive. +and extract the potentially compressed responses that may arrive. 
""" import bz2 import gzip import zipfile import tarfile +import logging from tempfile import mktemp import six @@ -15,9 +16,10 @@ except ImportError: from io import BytesIO -from scrapy import log from scrapy.responsetypes import responsetypes +logger = logging.getLogger(__name__) + class DecompressionMiddleware(object): """ This middleware tries to recognise and extract the possibly compressed @@ -80,7 +82,7 @@ def process_response(self, request, response, spider): for fmt, func in six.iteritems(self._formats): new_response = func(response) if new_response: - log.msg(format='Decompressed response with format: %(responsefmt)s', - level=log.DEBUG, spider=spider, responsefmt=fmt) + logger.debug('Decompressed response with format: %(responsefmt)s', + {'responsefmt': fmt}, extra={'spider': spider}) return new_response return response diff --git a/scrapy/contrib/downloadermiddleware/redirect.py b/scrapy/contrib/downloadermiddleware/redirect.py index cfb10d4dba4..f439f43ae8c 100644 --- a/scrapy/contrib/downloadermiddleware/redirect.py +++ b/scrapy/contrib/downloadermiddleware/redirect.py @@ -1,10 +1,12 @@ +import logging from six.moves.urllib.parse import urljoin -from scrapy import log from scrapy.http import HtmlResponse from scrapy.utils.response import get_meta_refresh from scrapy.exceptions import IgnoreRequest, NotConfigured +logger = logging.getLogger(__name__) + class BaseRedirectMiddleware(object): @@ -32,13 +34,13 @@ def _redirect(self, redirected, request, spider, reason): [request.url] redirected.dont_filter = request.dont_filter redirected.priority = request.priority + self.priority_adjust - log.msg(format="Redirecting (%(reason)s) to %(redirected)s from %(request)s", - level=log.DEBUG, spider=spider, request=request, - redirected=redirected, reason=reason) + logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s", + {'reason': reason, 'redirected': redirected, 'request': request}, + extra={'spider': spider}) return redirected else: - log.msg(format="Discarding %(request)s: max redirections reached", - level=log.DEBUG, spider=spider, request=request) + logger.debug("Discarding %(request)s: max redirections reached", + {'request': request}, extra={'spider': spider}) raise IgnoreRequest("max redirections reached") def _redirect_request_using_get(self, request, redirect_url): diff --git a/scrapy/contrib/downloadermiddleware/retry.py b/scrapy/contrib/downloadermiddleware/retry.py index f72f39431d5..714103ba069 100644 --- a/scrapy/contrib/downloadermiddleware/retry.py +++ b/scrapy/contrib/downloadermiddleware/retry.py @@ -17,17 +17,19 @@ protocol. 
It's included by default because it's a common code used to indicate server overload, which would be something we want to retry """ +import logging from twisted.internet import defer from twisted.internet.error import TimeoutError, DNSLookupError, \ ConnectionRefusedError, ConnectionDone, ConnectError, \ ConnectionLost, TCPTimedOutError -from scrapy import log from scrapy.exceptions import NotConfigured from scrapy.utils.response import response_status_message from scrapy.xlib.tx import ResponseFailed +logger = logging.getLogger(__name__) + class RetryMiddleware(object): @@ -66,13 +68,15 @@ def _retry(self, request, reason, spider): retries = request.meta.get('retry_times', 0) + 1 if retries <= self.max_retry_times: - log.msg(format="Retrying %(request)s (failed %(retries)d times): %(reason)s", - level=log.DEBUG, spider=spider, request=request, retries=retries, reason=reason) + logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s", + {'request': request, 'retries': retries, 'reason': reason}, + extra={'spider': spider}) retryreq = request.copy() retryreq.meta['retry_times'] = retries retryreq.dont_filter = True retryreq.priority = request.priority + self.priority_adjust return retryreq else: - log.msg(format="Gave up retrying %(request)s (failed %(retries)d times): %(reason)s", - level=log.DEBUG, spider=spider, request=request, retries=retries, reason=reason) + logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s", + {'request': request, 'retries': retries, 'reason': reason}, + extra={'spider': spider}) diff --git a/scrapy/contrib/downloadermiddleware/robotstxt.py b/scrapy/contrib/downloadermiddleware/robotstxt.py index a58ecca8eaf..9083482f05c 100644 --- a/scrapy/contrib/downloadermiddleware/robotstxt.py +++ b/scrapy/contrib/downloadermiddleware/robotstxt.py @@ -4,13 +4,16 @@ """ +import logging + from six.moves.urllib import robotparser -from scrapy import signals, log from scrapy.exceptions import NotConfigured, IgnoreRequest from scrapy.http import Request from scrapy.utils.httpobj import urlparse_cached +logger = logging.getLogger(__name__) + class RobotsTxtMiddleware(object): DOWNLOAD_PRIORITY = 1000 @@ -32,8 +35,8 @@ def process_request(self, request, spider): return rp = self.robot_parser(request, spider) if rp and not rp.can_fetch(self._useragent, request.url): - log.msg(format="Forbidden by robots.txt: %(request)s", - level=log.DEBUG, request=request) + logger.debug("Forbidden by robots.txt: %(request)s", + {'request': request}, extra={'spider': spider}) raise IgnoreRequest def robot_parser(self, request, spider): @@ -54,8 +57,9 @@ def robot_parser(self, request, spider): def _logerror(self, failure, request, spider): if failure.type is not IgnoreRequest: - log.msg(format="Error downloading %%(request)s: %s" % failure.value, - level=log.ERROR, request=request, spider=spider) + logger.error("Error downloading %(request)s: %(f_exception)s", + {'request': request, 'f_exception': failure.value}, + extra={'spider': spider, 'failure': failure}) def _parse_robots(self, response): rp = robotparser.RobotFileParser(response.url) diff --git a/scrapy/contrib/feedexport.py b/scrapy/contrib/feedexport.py index a8404146ba0..7c6849a7362 100644 --- a/scrapy/contrib/feedexport.py +++ b/scrapy/contrib/feedexport.py @@ -4,7 +4,10 @@ See documentation in docs/topics/feed-exports.rst """ -import sys, os, posixpath +import os +import sys +import logging +import posixpath from tempfile import TemporaryFile from datetime import datetime from 
six.moves.urllib.parse import urlparse @@ -14,12 +17,14 @@ from twisted.internet import defer, threads from w3lib.url import file_uri_to_path -from scrapy import log, signals +from scrapy import signals from scrapy.utils.ftp import ftp_makedirs_cwd from scrapy.exceptions import NotConfigured from scrapy.utils.misc import load_object from scrapy.utils.python import get_func_args +logger = logging.getLogger(__name__) + class IFeedStorage(Interface): """Interface that all Feed Storages must implement""" @@ -171,11 +176,15 @@ def close_spider(self, spider): if not slot.itemcount and not self.store_empty: return slot.exporter.finish_exporting() - logfmt = "%%s %s feed (%d items) in: %s" % (self.format, \ - slot.itemcount, slot.uri) + logfmt = "%%s %(format)s feed (%(itemcount)d items) in: %(uri)s" + log_args = {'format': self.format, + 'itemcount': slot.itemcount, + 'uri': slot.uri} d = defer.maybeDeferred(slot.storage.store, slot.file) - d.addCallback(lambda _: log.msg(logfmt % "Stored", spider=spider)) - d.addErrback(log.err, logfmt % "Error storing", spider=spider) + d.addCallback(lambda _: logger.info(logfmt % "Stored", log_args, + extra={'spider': spider})) + d.addErrback(lambda f: logger.error(logfmt % "Error storing", log_args, + extra={'spider': spider, 'failure': f})) return d def item_scraped(self, item, spider): @@ -198,7 +207,7 @@ def _load_components(self, setting_prefix): def _exporter_supported(self, format): if format in self.exporters: return True - log.msg("Unknown feed format: %s" % format, log.ERROR) + logger.error("Unknown feed format: %(format)s", {'format': format}) def _storage_supported(self, uri): scheme = urlparse(uri).scheme @@ -207,9 +216,11 @@ def _storage_supported(self, uri): self._get_storage(uri) return True except NotConfigured: - log.msg("Disabled feed storage scheme: %s" % scheme, log.ERROR) + logger.error("Disabled feed storage scheme: %(scheme)s", + {'scheme': scheme}) else: - log.msg("Unknown feed storage scheme: %s" % scheme, log.ERROR) + logger.error("Unknown feed storage scheme: %(scheme)s", + {'scheme': scheme}) def _get_exporter(self, *args, **kwargs): return self.exporters[self.format](*args, **kwargs) diff --git a/scrapy/contrib/logstats.py b/scrapy/contrib/logstats.py index 4f2567c3ff3..647e50f8d22 100644 --- a/scrapy/contrib/logstats.py +++ b/scrapy/contrib/logstats.py @@ -1,7 +1,11 @@ +import logging + from twisted.internet import task from scrapy.exceptions import NotConfigured -from scrapy import log, signals +from scrapy import signals + +logger = logging.getLogger(__name__) class LogStats(object): @@ -35,9 +39,12 @@ def log(self, spider): irate = (items - self.itemsprev) * self.multiplier prate = (pages - self.pagesprev) * self.multiplier self.pagesprev, self.itemsprev = pages, items - msg = "Crawled %d pages (at %d pages/min), scraped %d items (at %d items/min)" \ - % (pages, prate, items, irate) - log.msg(msg, spider=spider) + + msg = ("Crawled %(pages)d pages (at %(pagerate)d pages/min), " + "scraped %(items)d items (at %(itemrate)d items/min)") + log_args = {'pages': pages, 'pagerate': prate, + 'items': items, 'itemrate': irate} + logger.info(msg, log_args, extra={'spider': spider}) def spider_closed(self, spider, reason): if self.task.running: diff --git a/scrapy/contrib/memusage.py b/scrapy/contrib/memusage.py index 6bcba8e119d..402c78b68b1 100644 --- a/scrapy/contrib/memusage.py +++ b/scrapy/contrib/memusage.py @@ -5,16 +5,20 @@ """ import sys import socket +import logging from pprint import pformat from importlib import import_module 
from twisted.internet import task -from scrapy import signals, log +from scrapy import signals from scrapy.exceptions import NotConfigured from scrapy.mail import MailSender from scrapy.utils.engine import get_engine_status +logger = logging.getLogger(__name__) + + class MemoryUsage(object): def __init__(self, crawler): @@ -74,8 +78,8 @@ def _check_limit(self): if self.get_virtual_size() > self.limit: self.crawler.stats.set_value('memusage/limit_reached', 1) mem = self.limit/1024/1024 - log.msg(format="Memory usage exceeded %(memusage)dM. Shutting down Scrapy...", - level=log.ERROR, memusage=mem) + logger.error("Memory usage exceeded %(memusage)dM. Shutting down Scrapy...", + {'memusage': mem}, extra={'crawler': self.crawler}) if self.notify_mails: subj = "%s terminated: memory usage exceeded %dM at %s" % \ (self.crawler.settings['BOT_NAME'], mem, socket.gethostname()) @@ -95,8 +99,8 @@ def _check_warning(self): if self.get_virtual_size() > self.warning: self.crawler.stats.set_value('memusage/warning_reached', 1) mem = self.warning/1024/1024 - log.msg(format="Memory usage reached %(memusage)dM", - level=log.WARNING, memusage=mem) + logger.warning("Memory usage reached %(memusage)dM", + {'memusage': mem}, extra={'crawler': self.crawler}) if self.notify_mails: subj = "%s warning: memory usage reached %dM at %s" % \ (self.crawler.settings['BOT_NAME'], mem, socket.gethostname()) diff --git a/scrapy/contrib/pipeline/files.py b/scrapy/contrib/pipeline/files.py index 6086148654b..4809005d174 100644 --- a/scrapy/contrib/pipeline/files.py +++ b/scrapy/contrib/pipeline/files.py @@ -9,6 +9,7 @@ import os.path import rfc822 import time +import logging from six.moves.urllib.parse import urlparse from collections import defaultdict import six @@ -20,12 +21,13 @@ from twisted.internet import defer, threads -from scrapy import log from scrapy.contrib.pipeline.media import MediaPipeline from scrapy.exceptions import NotConfigured, IgnoreRequest from scrapy.http import Request from scrapy.utils.misc import md5sum +logger = logging.getLogger(__name__) + class FileException(Exception): """General media error exception""" @@ -192,9 +194,13 @@ def _onsuccess(result): return # returning None force download referer = request.headers.get('Referer') - log.msg(format='File (uptodate): Downloaded %(medianame)s from %(request)s referred in <%(referer)s>', - level=log.DEBUG, spider=info.spider, - medianame=self.MEDIA_NAME, request=request, referer=referer) + logger.debug( + 'File (uptodate): Downloaded %(medianame)s from %(request)s ' + 'referred in <%(referer)s>', + {'medianame': self.MEDIA_NAME, 'request': request, + 'referer': referer}, + extra={'spider': info.spider} + ) self.inc_stats(info.spider, 'uptodate') checksum = result.get('checksum', None) @@ -203,17 +209,23 @@ def _onsuccess(result): path = self.file_path(request, info=info) dfd = defer.maybeDeferred(self.store.stat_file, path, info) dfd.addCallbacks(_onsuccess, lambda _: None) - dfd.addErrback(log.err, self.__class__.__name__ + '.store.stat_file') + dfd.addErrback( + lambda f: + logger.error(self.__class__.__name__ + '.store.stat_file', + extra={'spider': info.spider, 'failure': f}) + ) return dfd def media_failed(self, failure, request, info): if not isinstance(failure.value, IgnoreRequest): referer = request.headers.get('Referer') - log.msg(format='File (unknown-error): Error downloading ' - '%(medianame)s from %(request)s referred in ' - '<%(referer)s>: %(exception)s', - level=log.WARNING, spider=info.spider, exception=failure.value, - 
medianame=self.MEDIA_NAME, request=request, referer=referer) + logger.warning( + 'File (unknown-error): Error downloading %(medianame)s from ' + '%(request)s referred in <%(referer)s>: %(exception)s', + {'medianame': self.MEDIA_NAME, 'request': request, + 'referer': referer, 'exception': failure.value}, + extra={'spider': info.spider} + ) raise FileException @@ -221,34 +233,51 @@ def media_downloaded(self, response, request, info): referer = request.headers.get('Referer') if response.status != 200: - log.msg(format='File (code: %(status)s): Error downloading file from %(request)s referred in <%(referer)s>', - level=log.WARNING, spider=info.spider, - status=response.status, request=request, referer=referer) + logger.warning( + 'File (code: %(status)s): Error downloading file from ' + '%(request)s referred in <%(referer)s>', + {'status': response.status, + 'request': request, 'referer': referer}, + extra={'spider': info.spider} + ) raise FileException('download-error') if not response.body: - log.msg(format='File (empty-content): Empty file from %(request)s referred in <%(referer)s>: no-content', - level=log.WARNING, spider=info.spider, - request=request, referer=referer) + logger.warning( + 'File (empty-content): Empty file from %(request)s referred ' + 'in <%(referer)s>: no-content', + {'request': request, 'referer': referer}, + extra={'spider': info.spider} + ) raise FileException('empty-content') status = 'cached' if 'cached' in response.flags else 'downloaded' - log.msg(format='File (%(status)s): Downloaded file from %(request)s referred in <%(referer)s>', - level=log.DEBUG, spider=info.spider, - status=status, request=request, referer=referer) + logger.debug( + 'File (%(status)s): Downloaded file from %(request)s referred in ' + '<%(referer)s>', + {'status': status, 'request': request, 'referer': referer}, + extra={'spider': info.spider} + ) self.inc_stats(info.spider, status) try: path = self.file_path(request, response=response, info=info) checksum = self.file_downloaded(response, request, info) except FileException as exc: - whyfmt = 'File (error): Error processing file from %(request)s referred in <%(referer)s>: %(errormsg)s' - log.msg(format=whyfmt, level=log.WARNING, spider=info.spider, - request=request, referer=referer, errormsg=str(exc)) + logger.warning( + 'File (error): Error processing file from %(request)s ' + 'referred in <%(referer)s>: %(errormsg)s', + {'request': request, 'referer': referer, 'errormsg': str(exc)}, + extra={'spider': info.spider}, exc_info=True + ) raise except Exception as exc: - whyfmt = 'File (unknown-error): Error processing file from %(request)s referred in <%(referer)s>' - log.err(None, whyfmt % {'request': request, 'referer': referer}, spider=info.spider) + logger.exception( + 'File (unknown-error): Error processing file from %(request)s ' + 'referred in <%(referer)s>', + {'request': request, 'referer': referer}, + extra={'spider': info.spider} + ) raise FileException(str(exc)) return {'url': request.url, 'path': path, 'checksum': checksum} diff --git a/scrapy/contrib/pipeline/media.py b/scrapy/contrib/pipeline/media.py index 012b7979af3..55ef05ad4f8 100644 --- a/scrapy/contrib/pipeline/media.py +++ b/scrapy/contrib/pipeline/media.py @@ -1,13 +1,16 @@ from __future__ import print_function + +import logging from collections import defaultdict from twisted.internet.defer import Deferred, DeferredList from twisted.python.failure import Failure from scrapy.utils.defer import mustbe_deferred, defer_result -from scrapy import log from 
scrapy.utils.request import request_fingerprint from scrapy.utils.misc import arg_to_iter +logger = logging.getLogger(__name__) + class MediaPipeline(object): @@ -66,7 +69,9 @@ def _process_request(self, request, info): dfd = mustbe_deferred(self.media_to_download, request, info) dfd.addCallback(self._check_media_to_download, request, info) dfd.addBoth(self._cache_result_and_execute_waiters, fp, info) - dfd.addErrback(log.err, spider=info.spider) + dfd.addErrback(lambda f: logger.error( + f.value, extra={'spider': info.spider, 'failure': f}) + ) return dfd.addBoth(lambda _: wad) # it must return wad at last def _check_media_to_download(self, result, request, info): @@ -117,8 +122,11 @@ def media_failed(self, failure, request, info): def item_completed(self, results, item, info): """Called per item when all media requests has been processed""" if self.LOG_FAILED_RESULTS: - msg = '%s found errors processing %s' % (self.__class__.__name__, item) for ok, value in results: if not ok: - log.err(value, msg, spider=info.spider) + logger.error( + '%(class)s found errors processing %(item)s', + {'class': self.__class__.__name__, 'item': item}, + extra={'spider': info.spider, 'failure': value} + ) return item diff --git a/scrapy/contrib/spidermiddleware/depth.py b/scrapy/contrib/spidermiddleware/depth.py index 5ccfc86ed33..795b60eb47b 100644 --- a/scrapy/contrib/spidermiddleware/depth.py +++ b/scrapy/contrib/spidermiddleware/depth.py @@ -4,9 +4,13 @@ See documentation in docs/topics/spider-middleware.rst """ -from scrapy import log +import logging + from scrapy.http import Request +logger = logging.getLogger(__name__) + + class DepthMiddleware(object): def __init__(self, maxdepth, stats=None, verbose_stats=False, prio=1): @@ -31,9 +35,9 @@ def _filter(request): if self.prio: request.priority -= depth * self.prio if self.maxdepth and depth > self.maxdepth: - log.msg(format="Ignoring link (depth > %(maxdepth)d): %(requrl)s ", - level=log.DEBUG, spider=spider, - maxdepth=self.maxdepth, requrl=request.url) + logger.debug("Ignoring link (depth > %(maxdepth)d): %(requrl)s ", + {'maxdepth': self.maxdepth, 'requrl': request.url}, + extra={'spider': spider}) return False elif self.stats: if self.verbose_stats: diff --git a/scrapy/contrib/spidermiddleware/httperror.py b/scrapy/contrib/spidermiddleware/httperror.py index 7fb7aa97c8f..6b7c50fce81 100644 --- a/scrapy/contrib/spidermiddleware/httperror.py +++ b/scrapy/contrib/spidermiddleware/httperror.py @@ -3,8 +3,12 @@ See documentation in docs/topics/spider-middleware.rst """ +import logging + from scrapy.exceptions import IgnoreRequest -from scrapy import log + +logger = logging.getLogger(__name__) + class HttpError(IgnoreRequest): """A non-200 response was filtered""" @@ -42,10 +46,8 @@ def process_spider_input(self, response, spider): def process_spider_exception(self, response, exception, spider): if isinstance(exception, HttpError): - log.msg( - format="Ignoring response %(response)r: HTTP status code is not handled or not allowed", - level=log.DEBUG, - spider=spider, - response=response + logger.debug( + "Ignoring response %(response)r: HTTP status code is not handled or not allowed", + {'response': response}, extra={'spider': spider}, ) return [] diff --git a/scrapy/contrib/spidermiddleware/offsite.py b/scrapy/contrib/spidermiddleware/offsite.py index 136714508f6..a90f9f1e073 100644 --- a/scrapy/contrib/spidermiddleware/offsite.py +++ b/scrapy/contrib/spidermiddleware/offsite.py @@ -5,11 +5,13 @@ """ import re +import logging from scrapy import signals 
from scrapy.http import Request from scrapy.utils.httpobj import urlparse_cached -from scrapy import log + +logger = logging.getLogger(__name__) class OffsiteMiddleware(object): @@ -31,8 +33,8 @@ def process_spider_output(self, response, result, spider): domain = urlparse_cached(x).hostname if domain and domain not in self.domains_seen: self.domains_seen.add(domain) - log.msg(format="Filtered offsite request to %(domain)r: %(request)s", - level=log.DEBUG, spider=spider, domain=domain, request=x) + logger.debug("Filtered offsite request to %(domain)r: %(request)s", + {'domain': domain, 'request': x}, extra={'spider': spider}) self.stats.inc_value('offsite/domains', spider=spider) self.stats.inc_value('offsite/filtered', spider=spider) else: diff --git a/scrapy/contrib/spidermiddleware/urllength.py b/scrapy/contrib/spidermiddleware/urllength.py index fa6f2c909fc..a904635d8d7 100644 --- a/scrapy/contrib/spidermiddleware/urllength.py +++ b/scrapy/contrib/spidermiddleware/urllength.py @@ -4,10 +4,14 @@ See documentation in docs/topics/spider-middleware.rst """ -from scrapy import log +import logging + from scrapy.http import Request from scrapy.exceptions import NotConfigured +logger = logging.getLogger(__name__) + + class UrlLengthMiddleware(object): def __init__(self, maxlength): @@ -23,9 +27,9 @@ def from_settings(cls, settings): def process_spider_output(self, response, result, spider): def _filter(request): if isinstance(request, Request) and len(request.url) > self.maxlength: - log.msg(format="Ignoring link (url length > %(maxlength)d): %(url)s ", - level=log.DEBUG, spider=spider, - maxlength=self.maxlength, url=request.url) + logger.debug("Ignoring link (url length > %(maxlength)d): %(url)s ", + {'maxlength': self.maxlength, 'url': request.url}, + extra={'spider': spider}) return False else: return True diff --git a/scrapy/contrib/spiders/sitemap.py b/scrapy/contrib/spiders/sitemap.py index 84ae04d0811..90fb9fb4e68 100644 --- a/scrapy/contrib/spiders/sitemap.py +++ b/scrapy/contrib/spiders/sitemap.py @@ -1,10 +1,13 @@ import re +import logging from scrapy.spider import Spider from scrapy.http import Request, XmlResponse from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots from scrapy.utils.gz import gunzip, is_gzipped -from scrapy import log + +logger = logging.getLogger(__name__) + class SitemapSpider(Spider): @@ -32,8 +35,8 @@ def _parse_sitemap(self, response): else: body = self._get_sitemap_body(response) if body is None: - log.msg(format="Ignoring invalid sitemap: %(response)s", - level=log.WARNING, spider=self, response=response) + logger.warning("Ignoring invalid sitemap: %(response)s", + {'response': response}, extra={'spider': self}) return s = Sitemap(body) diff --git a/scrapy/contrib/throttle.py b/scrapy/contrib/throttle.py index a5601bcd0e0..a60b0cd833c 100644 --- a/scrapy/contrib/throttle.py +++ b/scrapy/contrib/throttle.py @@ -1,7 +1,10 @@ import logging + from scrapy.exceptions import NotConfigured from scrapy import signals +logger = logging.getLogger(__name__) + class AutoThrottle(object): @@ -47,9 +50,17 @@ def _response_downloaded(self, response, request, spider): diff = slot.delay - olddelay size = len(response.body) conc = len(slot.transferring) - msg = "slot: %s | conc:%2d | delay:%5d ms (%+d) | latency:%5d ms | size:%6d bytes" % \ - (key, conc, slot.delay * 1000, diff * 1000, latency * 1000, size) - spider.log(msg, level=logging.INFO) + logger.info( + "slot: %(slot)s | conc:%(concurrency)2d | " + "delay:%(delay)5d ms (%(delaydiff)+d) | " + 
"latency:%(latency)5d ms | size:%(size)6d bytes", + { + 'slot': key, 'concurrency': conc, + 'delay': slot.delay * 1000, 'delaydiff': diff * 1000, + 'latency': latency * 1000, 'size': size + }, + extra={'spider': spider} + ) def _get_slot(self, request, spider): key = request.meta.get('download_slot') diff --git a/scrapy/core/downloader/handlers/http11.py b/scrapy/core/downloader/handlers/http11.py index 634c6398ba1..31412a0f45f 100644 --- a/scrapy/core/downloader/handlers/http11.py +++ b/scrapy/core/downloader/handlers/http11.py @@ -1,7 +1,7 @@ """Download handlers for http and https schemes""" import re - +import logging from io import BytesIO from time import time from six.moves.urllib.parse import urldefrag @@ -19,7 +19,9 @@ from scrapy.responsetypes import responsetypes from scrapy.core.downloader.webclient import _parse from scrapy.utils.misc import load_object -from scrapy import log, twisted_version +from scrapy import twisted_version + +logger = logging.getLogger(__name__) class HTTP11DownloadHandler(object): @@ -237,14 +239,16 @@ def _cb_bodyready(self, txresponse, request): expected_size = txresponse.length if txresponse.length != UNKNOWN_LENGTH else -1 if maxsize and expected_size > maxsize: - log.msg("Expected response size (%s) larger than download max size (%s)." % (expected_size, maxsize), - logLevel=log.ERROR) + logger.error("Expected response size (%(size)s) larger than " + "download max size (%(maxsize)s).", + {'size': expected_size, 'maxsize': maxsize}) txresponse._transport._producer.loseConnection() raise defer.CancelledError() if warnsize and expected_size > warnsize: - log.msg("Expected response size (%s) larger than downlod warn size (%s)." % (expected_size, warnsize), - logLevel=log.WARNING) + logger.warning("Expected response size (%(size)s) larger than " + "download warn size (%(warnsize)s).", + {'size': expected_size, 'warnsize': warnsize}) def _cancel(_): txresponse._transport._producer.loseConnection() @@ -295,13 +299,17 @@ def dataReceived(self, bodyBytes): self._bytes_received += len(bodyBytes) if self._maxsize and self._bytes_received > self._maxsize: - log.msg("Received (%s) bytes larger than download max size (%s)." % (self._bytes_received, self._maxsize), - logLevel=log.ERROR) + logger.error("Received (%(bytes)s) bytes larger than download " + "max size (%(maxsize)s).", + {'bytes': self._bytes_received, + 'maxsize': self._maxsize}) self._finished.cancel() if self._warnsize and self._bytes_received > self._warnsize: - log.msg("Received (%s) bytes larger than download warn size (%s)." 
% (self._bytes_received, self._warnsize), - logLevel=log.WARNING) + logger.warning("Received (%(bytes)s) bytes larger than download " + "warn size (%(warnsize)s).", + {'bytes': self._bytes_received, + 'warnsize': self._warnsize}) def connectionLost(self, reason): if self._finished.called: diff --git a/scrapy/core/engine.py b/scrapy/core/engine.py index b009898a38c..63267f208a6 100644 --- a/scrapy/core/engine.py +++ b/scrapy/core/engine.py @@ -4,17 +4,21 @@ For more information see docs/topics/architecture.rst """ +import logging from time import time from twisted.internet import defer from twisted.python.failure import Failure -from scrapy import log, signals +from scrapy import signals from scrapy.core.scraper import Scraper from scrapy.exceptions import DontCloseSpider from scrapy.http import Response, Request from scrapy.utils.misc import load_object from scrapy.utils.reactor import CallLaterOnce +from scrapy.utils.log import logformatter_adapter + +logger = logging.getLogger(__name__) class Slot(object): @@ -106,10 +110,10 @@ def _next_request(self, spider): request = next(slot.start_requests) except StopIteration: slot.start_requests = None - except Exception as exc: + except Exception: slot.start_requests = None - log.err(None, 'Obtaining request from start requests', \ - spider=spider) + logger.exception('Error while obtaining start requests', + extra={'spider': spider}) else: self.crawl(request, spider) @@ -130,11 +134,14 @@ def _next_request_from_scheduler(self, spider): return d = self._download(request, spider) d.addBoth(self._handle_downloader_output, request, spider) - d.addErrback(log.msg, spider=spider) + d.addErrback(lambda f: logger.info('Error while handling downloader output', + extra={'spider': spider, 'failure': f})) d.addBoth(lambda _: slot.remove_request(request)) - d.addErrback(log.msg, spider=spider) + d.addErrback(lambda f: logger.info('Error while removing request from slot', + extra={'spider': spider, 'failure': f})) d.addBoth(lambda _: slot.nextcall.schedule()) - d.addErrback(log.msg, spider=spider) + d.addErrback(lambda f: logger.info('Error while scheduling new request', + extra={'spider': spider, 'failure': f})) return d def _handle_downloader_output(self, response, request, spider): @@ -145,7 +152,8 @@ def _handle_downloader_output(self, response, request, spider): return # response is a Response or Failure d = self.scraper.enqueue_scrape(response, request, spider) - d.addErrback(log.err, spider=spider) + d.addErrback(lambda f: logger.error('Error while enqueuing downloader output', + extra={'spider': spider, 'failure': f})) return d def spider_is_idle(self, spider): @@ -197,7 +205,7 @@ def _on_success(response): if isinstance(response, Response): response.request = request # tie request to response received logkws = self.logformatter.crawled(request, response, spider) - log.msg(spider=spider, **logkws) + logger.log(*logformatter_adapter(logkws), extra={'spider': spider}) self.signals.send_catch_log(signal=signals.response_received, \ response=response, request=request, spider=spider) return response @@ -215,7 +223,7 @@ def _on_complete(_): def open_spider(self, spider, start_requests=(), close_if_idle=True): assert self.has_capacity(), "No free spider slot when opening %r" % \ spider.name - log.msg("Spider opened", spider=spider) + logger.info("Spider opened", extra={'spider': spider}) nextcall = CallLaterOnce(self._next_request, spider) scheduler = self.scheduler_cls.from_crawler(self.crawler) start_requests = yield 
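# Hedged sketch of the errback pattern used in engine.py above: instead of
# ``d.addErrback(log.err, spider=spider)`` the Failure is handed to a stdlib
# logger via ``extra={'failure': f}``. The small filter below mimics what the
# FailureFormatter added in scrapy/utils/log.py (further down in this patch)
# does with that key, turning it into ``exc_info`` so the traceback is shown.
# ``demo_logger`` and the other names are illustrative only.
import logging

from twisted.internet import defer
from twisted.python.failure import Failure

demo_logger = logging.getLogger('demo')


class FailureToExcInfo(logging.Filter):
    def filter(self, record):
        failure = getattr(record, 'failure', None)
        if isinstance(failure, Failure):
            record.exc_info = (failure.type, failure.value,
                               failure.getTracebackObject())
        return True


def log_failure(message):
    def errback(f):
        demo_logger.error(message, extra={'failure': f})
    return errback


if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    demo_logger.addFilter(FailureToExcInfo())
    d = defer.Deferred()
    d.addCallback(lambda _: 1 / 0)
    d.addErrback(log_failure('Error while handling downloader output'))
    d.callback(None)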
self.scraper.spidermw.process_start_requests(start_requests, spider) @@ -252,33 +260,42 @@ def close_spider(self, spider, reason='cancelled'): slot = self.slot if slot.closing: return slot.closing - log.msg(format="Closing spider (%(reason)s)", reason=reason, spider=spider) + logger.info("Closing spider (%(reason)s)", + {'reason': reason}, + extra={'spider': spider}) dfd = slot.close() + def log_failure(msg): + def errback(failure): + logger.error(msg, extra={'spider': spider, 'failure': failure}) + return errback + dfd.addBoth(lambda _: self.downloader.close()) - dfd.addErrback(log.err, spider=spider) + dfd.addErrback(log_failure('Downloader close failure')) dfd.addBoth(lambda _: self.scraper.close_spider(spider)) - dfd.addErrback(log.err, spider=spider) + dfd.addErrback(log_failure('Scraper close failure')) dfd.addBoth(lambda _: slot.scheduler.close(reason)) - dfd.addErrback(log.err, spider=spider) + dfd.addErrback(log_failure('Scheduler close failure')) dfd.addBoth(lambda _: self.signals.send_catch_log_deferred( signal=signals.spider_closed, spider=spider, reason=reason)) - dfd.addErrback(log.err, spider=spider) + dfd.addErrback(log_failure('Error while sending spider_close signal')) dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason)) - dfd.addErrback(log.err, spider=spider) + dfd.addErrback(log_failure('Stats close failure')) - dfd.addBoth(lambda _: log.msg(format="Spider closed (%(reason)s)", reason=reason, spider=spider)) + dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)", + {'reason': reason}, + extra={'spider': spider})) dfd.addBoth(lambda _: setattr(self, 'slot', None)) - dfd.addErrback(log.err, spider=spider) + dfd.addErrback(log_failure('Error while unassigning slot')) dfd.addBoth(lambda _: setattr(self, 'spider', None)) - dfd.addErrback(log.err, spider=spider) + dfd.addErrback(log_failure('Error while unassigning spider')) dfd.addBoth(lambda _: self._spider_closed_callback(spider)) diff --git a/scrapy/core/scheduler.py b/scrapy/core/scheduler.py index 232bc6a401f..f199711ab5f 100644 --- a/scrapy/core/scheduler.py +++ b/scrapy/core/scheduler.py @@ -1,12 +1,15 @@ import os import json +import logging from os.path import join, exists from queuelib import PriorityQueue from scrapy.utils.reqser import request_to_dict, request_from_dict from scrapy.utils.misc import load_object from scrapy.utils.job import job_dir -from scrapy import log + +logger = logging.getLogger(__name__) + class Scheduler(object): @@ -80,9 +83,9 @@ def _dqpush(self, request): self.dqs.push(reqd, -request.priority) except ValueError as e: # non serializable request if self.logunser: - log.msg(format="Unable to serialize request: %(request)s - reason: %(reason)s", - level=log.ERROR, spider=self.spider, - request=request, reason=e) + logger.exception("Unable to serialize request: %(request)s - reason: %(reason)s", + {'request': request, 'reason': e}, + extra={'spider': self.spider}) return else: return True @@ -111,8 +114,8 @@ def _dq(self): prios = () q = PriorityQueue(self._newdq, startprios=prios) if q: - log.msg(format="Resuming crawl (%(queuesize)d requests scheduled)", - spider=self.spider, queuesize=len(q)) + logger.info("Resuming crawl (%(queuesize)d requests scheduled)", + {'queuesize': len(q)}, extra={'spider': self.spider}) return q def _dqdir(self, jobdir): diff --git a/scrapy/core/scraper.py b/scrapy/core/scraper.py index b301aa962a4..e5d8acea26f 100644 --- a/scrapy/core/scraper.py +++ b/scrapy/core/scraper.py @@ -1,6 +1,7 @@ """This module implements the 
Scraper component which parses responses and extracts information from them""" +import logging from collections import deque from twisted.python.failure import Failure @@ -9,12 +10,14 @@ from scrapy.utils.defer import defer_result, defer_succeed, parallel, iter_errback from scrapy.utils.spider import iterate_spider_output from scrapy.utils.misc import load_object +from scrapy.utils.log import logformatter_adapter from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest from scrapy import signals from scrapy.http import Request, Response from scrapy.item import BaseItem from scrapy.core.spidermw import SpiderMiddlewareManager -from scrapy import log + +logger = logging.getLogger(__name__) class Slot(object): @@ -102,7 +105,9 @@ def finish_scraping(_): return _ dfd.addBoth(finish_scraping) dfd.addErrback( - log.err, 'Scraper bug processing %s' % request, spider=spider) + lambda f: logger.error('Scraper bug processing %(request)s', + {'request': request}, + extra={'spider': spider, 'failure': f})) self._scrape_next(spider, slot) return dfd @@ -145,10 +150,10 @@ def handle_spider_error(self, _failure, request, response, spider): self.crawler.engine.close_spider(spider, exc.reason or 'cancelled') return referer = request.headers.get('Referer') - log.err( - _failure, - "Spider error processing %s (referer: %s)" % (request, referer), - spider=spider + logger.error( + "Spider error processing %(request)s (referer: %(referer)s)", + {'request': request, 'referer': referer}, + extra={'spider': spider, 'failure': _failure} ) self.signals.send_catch_log( signal=signals.spider_error, @@ -183,9 +188,10 @@ def _process_spidermw_output(self, output, request, response, spider): pass else: typename = type(output).__name__ - log.msg(format='Spider must return Request, BaseItem, dict or None, ' - 'got %(typename)r in %(request)s', - level=log.ERROR, spider=spider, request=request, typename=typename) + logger.error('Spider must return Request, BaseItem, dict or None, ' + 'got %(typename)r in %(request)s', + {'request': request, 'typename': typename}, + extra={'spider': spider}) def _log_download_errors(self, spider_failure, download_failure, request, spider): """Log and silence errors that come from the engine (typically download @@ -194,14 +200,15 @@ def _log_download_errors(self, spider_failure, download_failure, request, spider if (isinstance(download_failure, Failure) and not download_failure.check(IgnoreRequest)): if download_failure.frames: - log.err(download_failure, 'Error downloading %s' % request, - spider=spider) + logger.error('Error downloading %(request)s', + {'request': request}, + extra={'spider': spider, 'failure': download_failure}) else: errmsg = download_failure.getErrorMessage() if errmsg: - log.msg(format='Error downloading %(request)s: %(errmsg)s', - level=log.ERROR, spider=spider, request=request, - errmsg=errmsg) + logger.error('Error downloading %(request)s: %(errmsg)s', + {'request': request, 'errmsg': errmsg}, + extra={'spider': spider}) if spider_failure is not download_failure: return spider_failure @@ -214,15 +221,16 @@ def _itemproc_finished(self, output, item, response, spider): ex = output.value if isinstance(ex, DropItem): logkws = self.logformatter.dropped(item, ex, response, spider) - log.msg(spider=spider, **logkws) + logger.log(*logformatter_adapter(logkws), extra={'spider': spider}) return self.signals.send_catch_log_deferred( signal=signals.item_dropped, item=item, response=response, spider=spider, exception=output.value) else: - log.err(output, 'Error 
processing %s' % item, spider=spider) + logger.error('Error processing %(item)s', {'item': item}, + extra={'spider': spider, 'failure': output}) else: logkws = self.logformatter.scraped(output, response, spider) - log.msg(spider=spider, **logkws) + logger.log(*logformatter_adapter(logkws), extra={'spider': spider}) return self.signals.send_catch_log_deferred( signal=signals.item_scraped, item=output, response=response, spider=spider) diff --git a/scrapy/crawler.py b/scrapy/crawler.py index b4706919ae2..932fbd16809 100644 --- a/scrapy/crawler.py +++ b/scrapy/crawler.py @@ -1,5 +1,6 @@ import six import signal +import logging import warnings from twisted.internet import reactor, defer @@ -14,7 +15,10 @@ from scrapy.exceptions import ScrapyDeprecationWarning from scrapy.utils.ossignal import install_shutdown_handlers, signal_names from scrapy.utils.misc import load_object -from scrapy import log, signals +from scrapy.utils.log import LogCounterHandler, configure_logging, log_scrapy_info +from scrapy import signals + +logger = logging.getLogger(__name__) class Crawler(object): @@ -28,6 +32,12 @@ def __init__(self, spidercls, settings): self.signals = SignalManager(self) self.stats = load_object(self.settings['STATS_CLASS'])(self) + + handler = LogCounterHandler(self, level=settings.get('LOG_LEVEL')) + logging.root.addHandler(handler) + self.signals.connect(lambda: logging.root.removeHandler(handler), + signals.engine_stopped) + lf_cls = load_object(self.settings['LOG_FORMATTER']) self.logformatter = lf_cls.from_crawler(self) self.extensions = ExtensionManager.from_crawler(self) @@ -99,7 +109,6 @@ def crawl(self, crawler_or_spidercls, *args, **kwargs): crawler = crawler_or_spidercls if not isinstance(crawler_or_spidercls, Crawler): crawler = self._create_crawler(crawler_or_spidercls) - self._setup_crawler_logging(crawler) self.crawlers.add(crawler) d = crawler.crawl(*args, **kwargs) @@ -117,11 +126,6 @@ def _create_crawler(self, spidercls): spidercls = self.spider_loader.load(spidercls) return Crawler(spidercls, self.settings) - def _setup_crawler_logging(self, crawler): - log_observer = log.start_from_crawler(crawler) - if log_observer: - crawler.signals.connect(log_observer.stop, signals.engine_stopped) - def stop(self): return defer.DeferredList([c.stop() for c in list(self.crawlers)]) @@ -139,22 +143,21 @@ def __init__(self, settings): super(CrawlerProcess, self).__init__(settings) install_shutdown_handlers(self._signal_shutdown) self.stopping = False - self.log_observer = log.start_from_settings(self.settings) - log.scrapy_info(settings) + configure_logging(settings) + log_scrapy_info(settings) def _signal_shutdown(self, signum, _): install_shutdown_handlers(self._signal_kill) signame = signal_names[signum] - log.msg(format="Received %(signame)s, shutting down gracefully. Send again to force ", - level=log.INFO, signame=signame) + logger.info("Received %(signame)s, shutting down gracefully. 
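# Hedged sketch of the LogCounterHandler hook-up added to Crawler.__init__
# above: while attached to the root logger it counts every emitted record into
# crawler stats under ``log_count/<LEVEL>``, and it is removed again on
# engine_stopped. The Fake* classes stand in for a real crawler so the snippet
# is self-contained; LogCounterHandler itself comes from scrapy/utils/log.py,
# which is added later in this patch.
import logging

from scrapy.utils.log import LogCounterHandler


class FakeStats(object):
    def __init__(self):
        self.values = {}

    def inc_value(self, key):
        self.values[key] = self.values.get(key, 0) + 1


class FakeCrawler(object):
    def __init__(self):
        self.stats = FakeStats()


if __name__ == '__main__':
    logging.root.setLevel(logging.DEBUG)
    crawler = FakeCrawler()
    handler = LogCounterHandler(crawler, level=logging.INFO)
    logging.root.addHandler(handler)
    try:
        logging.getLogger('scrapy.demo').info('counted')
        logging.getLogger('scrapy.demo').debug('below the handler level, not counted')
    finally:
        logging.root.removeHandler(handler)  # the crawler does this on engine_stopped
    print(crawler.stats.values)              # {'log_count/INFO': 1}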
Send again to force ", + {'signame': signame}) reactor.callFromThread(self.stop) def _signal_kill(self, signum, _): install_shutdown_handlers(signal.SIG_IGN) signame = signal_names[signum] - log.msg(format='Received %(signame)s twice, forcing unclean shutdown', - level=log.INFO, signame=signame) - self._stop_logging() + logger.info('Received %(signame)s twice, forcing unclean shutdown', + {'signame': signame}) reactor.callFromThread(self._stop_reactor) def start(self, stop_after_crawl=True): @@ -173,10 +176,6 @@ def start(self, stop_after_crawl=True): reactor.addSystemEventTrigger('before', 'shutdown', self.stop) reactor.run(installSignalHandlers=False) # blocking call - def _stop_logging(self): - if self.log_observer: - self.log_observer.stop() - def _stop_reactor(self, _=None): try: reactor.stop() diff --git a/scrapy/dupefilter.py b/scrapy/dupefilter.py index 9bd6a6e058b..739ba922f2c 100644 --- a/scrapy/dupefilter.py +++ b/scrapy/dupefilter.py @@ -1,7 +1,7 @@ from __future__ import print_function import os +import logging -from scrapy import log from scrapy.utils.job import job_dir from scrapy.utils.request import request_fingerprint @@ -33,6 +33,7 @@ def __init__(self, path=None, debug=False): self.fingerprints = set() self.logdupes = True self.debug = debug + self.logger = logging.getLogger(__name__) if path: self.file = open(os.path.join(path, 'requests.seen'), 'a+') self.fingerprints.update(x.rstrip() for x in self.file) @@ -59,13 +60,13 @@ def close(self, reason): def log(self, request, spider): if self.debug: - fmt = "Filtered duplicate request: %(request)s" - log.msg(format=fmt, request=request, level=log.DEBUG, spider=spider) + msg = "Filtered duplicate request: %(request)s" + self.logger.debug(msg, {'request': request}, extra={'spider': spider}) elif self.logdupes: - fmt = ("Filtered duplicate request: %(request)s" + msg = ("Filtered duplicate request: %(request)s" " - no more duplicates will be shown" " (see DUPEFILTER_DEBUG to show all duplicates)") - log.msg(format=fmt, request=request, level=log.DEBUG, spider=spider) + self.logger.debug(msg, {'request': request}, extra={'spider': spider}) self.logdupes = False spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider) diff --git a/scrapy/log.py b/scrapy/log.py index aa53e357487..c3f9c4227f9 100644 --- a/scrapy/log.py +++ b/scrapy/log.py @@ -1,19 +1,24 @@ """ -Scrapy logging facility - -See documentation in docs/topics/logging.rst +This module is kept to provide a helpful warning about its removal. """ -import sys + import logging import warnings -from twisted.python import log +from twisted.python.failure import Failure + +from scrapy.exceptions import ScrapyDeprecationWarning + +logger = logging.getLogger(__name__) + +warnings.warn("Module `scrapy.log` has been deprecated, Scrapy now relies on " + "the builtin Python library for logging. 
Read the updated " + "logging entry in the documentation to learn more.", + ScrapyDeprecationWarning, stacklevel=2) + -import scrapy -from scrapy.utils.python import unicode_to_str -from scrapy.settings import overridden_settings +# Imports kept for backwards-compatibility -# Logging levels DEBUG = logging.DEBUG INFO = logging.INFO WARNING = logging.WARNING @@ -21,141 +26,26 @@ CRITICAL = logging.CRITICAL SILENT = CRITICAL + 1 -level_names = { - logging.DEBUG: "DEBUG", - logging.INFO: "INFO", - logging.WARNING: "WARNING", - logging.ERROR: "ERROR", - logging.CRITICAL: "CRITICAL", - SILENT: "SILENT", -} - -class ScrapyFileLogObserver(log.FileLogObserver): - - def __init__(self, f, level=INFO, encoding='utf-8', crawler=None): - self.level = level - self.encoding = encoding - self.crawler = crawler - if crawler: - self.emit = self._emit_with_crawler - else: - self.emit = self._emit - log.FileLogObserver.__init__(self, f) - - def _emit(self, eventDict): - ev = _adapt_eventdict(eventDict, self.level, self.encoding, - self.crawler) - if ev is not None: - log.FileLogObserver.emit(self, ev) - return ev - - def _emit_with_crawler(self, eventDict): - ev = self._emit(eventDict) - if ev: - level = ev['logLevel'] - sname = 'log_count/%s' % level_names.get(level, level) - self.crawler.stats.inc_value(sname) - -def _adapt_eventdict(eventDict, log_level=INFO, encoding='utf-8', - crawler=None, prepend_level=True): - """Adapt Twisted log eventDict making it suitable for logging with a Scrapy - log observer. It may return None to indicate that the event should be - ignored by a Scrapy log observer. - - `log_level` is the minimum level being logged, and `encoding` is the log - encoding. - """ - ev = eventDict.copy() - if ev['isError']: - ev.setdefault('logLevel', ERROR) - # ignore non-error messages from outside scrapy - if ev.get('system') != 'scrapy' and not ev['isError']: - return +def msg(message=None, _level=logging.INFO, **kw): + warnings.warn('log.msg has been deprecated, create a python logger and ' + 'log through it instead', + ScrapyDeprecationWarning, stacklevel=2) - level = ev.get('logLevel') - if level < log_level: - return + level = kw.pop('level', _level) + message = kw.pop('format', message) + # NOTE: logger.log doesn't handle well passing empty dictionaries with format + # arguments because of some weird use-case: + # https://hg.python.org/cpython/file/648dcafa7e5f/Lib/logging/__init__.py#l269 + logger.log(level, message, *[kw] if kw else []) - spider = ev.get('spider') - if spider: - ev['system'] = unicode_to_str(spider.name, encoding) - if crawler and (not spider or spider.crawler is not crawler): - # ignore events not triggered by own spiders in crawlers' observers - return - if not crawler and spider: - # ignore spiders' events in observers without crawler - return - - lvlname = level_names.get(level, 'NOLEVEL') - message = ev.get('message') - if message: - message = [unicode_to_str(x, encoding) for x in message] - if prepend_level: - message[0] = "%s: %s" % (lvlname, message[0]) - ev['message'] = message - - why = ev.get('why') - if why: - why = unicode_to_str(why, encoding) - if prepend_level: - why = "%s: %s" % (lvlname, why) - ev['why'] = why - - fmt = ev.get('format') - if fmt: - fmt = unicode_to_str(fmt, encoding) - if prepend_level: - fmt = "%s: %s" % (lvlname, fmt) - ev['format'] = fmt - - return ev - -def _get_log_level(level_name_or_id): - if isinstance(level_name_or_id, int): - return level_name_or_id - elif isinstance(level_name_or_id, basestring): - return 
globals()[level_name_or_id] - else: - raise ValueError("Unknown log level: %r" % level_name_or_id) - -def start(logfile=None, loglevel='INFO', logstdout=True, logencoding='utf-8', crawler=None): - loglevel = _get_log_level(loglevel) - file = open(logfile, 'a') if logfile else sys.stderr - log_observer = ScrapyFileLogObserver(file, loglevel, logencoding, crawler) - _oldshowwarning = warnings.showwarning - log.startLoggingWithObserver(log_observer.emit, setStdout=logstdout) - # restore warnings, wrongly silenced by Twisted - warnings.showwarning = _oldshowwarning - return log_observer - -def msg(message=None, _level=INFO, **kw): - kw['logLevel'] = kw.pop('level', _level) - kw.setdefault('system', 'scrapy') - if message is None: - log.msg(**kw) - else: - log.msg(message, **kw) def err(_stuff=None, _why=None, **kw): - kw['logLevel'] = kw.pop('level', ERROR) - kw.setdefault('system', 'scrapy') - log.err(_stuff, _why, **kw) - -def start_from_settings(settings, crawler=None): - if settings.getbool('LOG_ENABLED'): - return start(settings['LOG_FILE'], settings['LOG_LEVEL'], settings['LOG_STDOUT'], - settings['LOG_ENCODING'], crawler) - -def scrapy_info(settings): - msg("Scrapy %s started (bot: %s)" % (scrapy.__version__, - settings['BOT_NAME'])) - - msg("Optional features available: %s" % ", ".join(scrapy.optional_features), - level=INFO) - - d = dict(overridden_settings(settings)) - msg(format="Overridden settings: %(settings)r", settings=d, level=INFO) - -def start_from_crawler(crawler): - return start_from_settings(crawler.settings, crawler) + warnings.warn('log.err has been deprecated, create a python logger and ' + 'use its error method instead', + ScrapyDeprecationWarning, stacklevel=2) + + level = kw.pop('level', logging.ERROR) + failure = kw.pop('failure', _stuff) or Failure() + message = kw.pop('why', _why) or failure.value + logger.log(level, message, *[kw] if kw else [], extra={'failure': failure}) diff --git a/scrapy/logformatter.py b/scrapy/logformatter.py index d03d2d07f0e..a0508e0b76c 100644 --- a/scrapy/logformatter.py +++ b/scrapy/logformatter.py @@ -1,46 +1,67 @@ import os +import logging from twisted.python.failure import Failure -from scrapy import log +SCRAPEDMSG = u"Scraped from %(src)s" + os.linesep + "%(item)s" +DROPPEDMSG = u"Dropped: %(exception)s" + os.linesep + "%(item)s" +CRAWLEDMSG = u"Crawled (%(status)s) %(request)s (referer: %(referer)s)%(flags)s" -SCRAPEDFMT = u"Scraped from %(src)s" + os.linesep + "%(item)s" -DROPPEDFMT = u"Dropped: %(exception)s" + os.linesep + "%(item)s" -CRAWLEDFMT = u"Crawled (%(status)s) %(request)s (referer: %(referer)s)%(flags)s" class LogFormatter(object): - """Class for generating log messages for different actions. All methods - must return a plain string which doesn't include the log level or the - timestamp + """Class for generating log messages for different actions. + + All methods must return a dictionary listing the parameters `level`, `msg` + and `args` which are going to be used for constructing the log message when + calling logging.log. + + Dictionary keys for the method outputs: + * `level` should be the log level for that action, you can use those + from the python logging library: logging.DEBUG, logging.INFO, + logging.WARNING, logging.ERROR and logging.CRITICAL. + + * `msg` should be a string that can contain different formatting + placeholders. This string, formatted with the provided `args`, is going + to be the log message for that action. 
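# Usage sketch for the backwards-compatible shim kept in scrapy/log.py above:
# the old calls still work (each now emits a ScrapyDeprecationWarning) and
# simply forward to a stdlib logger. The preferred replacement is shown in the
# trailing comments; this assumes the patched scrapy package is importable.
import logging

from scrapy import log   # importing the module itself warns about deprecation

logging.basicConfig(level=logging.DEBUG)

log.msg("Spider opened")                     # logging.getLogger(__name__).info("Spider opened")

log.msg(format="Crawled %(n)d pages", n=10,  # logger.info("Crawled %(n)d pages", {'n': 10})
        level=log.INFO)

try:
    1 / 0
except ZeroDivisionError:
    log.err(None, 'Something went wrong')    # logger.exception('Something went wrong')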
+ + * `args` should be a tuple or dict with the formatting placeholders for + `msg`. The final log message is computed as output['msg'] % + output['args']. """ def crawled(self, request, response, spider): flags = ' %s' % str(response.flags) if response.flags else '' return { - 'level': log.DEBUG, - 'format': CRAWLEDFMT, - 'status': response.status, - 'request': request, - 'referer': request.headers.get('Referer'), - 'flags': flags, + 'level': logging.DEBUG, + 'msg': CRAWLEDMSG, + 'args': { + 'status': response.status, + 'request': request, + 'referer': request.headers.get('Referer'), + 'flags': flags, + } } def scraped(self, item, response, spider): src = response.getErrorMessage() if isinstance(response, Failure) else response return { - 'level': log.DEBUG, - 'format': SCRAPEDFMT, - 'src': src, - 'item': item, + 'level': logging.DEBUG, + 'msg': SCRAPEDMSG, + 'args': { + 'src': src, + 'item': item, + } } def dropped(self, item, exception, response, spider): return { - 'level': log.WARNING, - 'format': DROPPEDFMT, - 'exception': exception, - 'item': item, + 'level': logging.WARNING, + 'msg': DROPPEDMSG, + 'args': { + 'exception': exception, + 'item': item, + } } @classmethod diff --git a/scrapy/mail.py b/scrapy/mail.py index e1d7c44f672..2b4c579802d 100644 --- a/scrapy/mail.py +++ b/scrapy/mail.py @@ -3,6 +3,8 @@ See documentation in docs/topics/email.rst """ +import logging + from six.moves import cStringIO as StringIO import six @@ -20,7 +22,8 @@ from twisted.internet import defer, reactor, ssl from twisted.mail.smtp import ESMTPSenderFactory -from scrapy import log +logger = logging.getLogger(__name__) + class MailSender(object): @@ -71,8 +74,10 @@ def send(self, to, subject, body, cc=None, attachs=(), mimetype='text/plain', _c _callback(to=to, subject=subject, body=body, cc=cc, attach=attachs, msg=msg) if self.debug: - log.msg(format='Debug mail sent OK: To=%(mailto)s Cc=%(mailcc)s Subject="%(mailsubject)s" Attachs=%(mailattachs)d', - level=log.DEBUG, mailto=to, mailcc=cc, mailsubject=subject, mailattachs=len(attachs)) + logger.debug('Debug mail sent OK: To=%(mailto)s Cc=%(mailcc)s ' + 'Subject="%(mailsubject)s" Attachs=%(mailattachs)d', + {'mailto': to, 'mailcc': cc, 'mailsubject': subject, + 'mailattachs': len(attachs)}) return dfd = self._sendmail(rcpts, msg.as_string()) @@ -83,17 +88,18 @@ def send(self, to, subject, body, cc=None, attachs=(), mimetype='text/plain', _c return dfd def _sent_ok(self, result, to, cc, subject, nattachs): - log.msg(format='Mail sent OK: To=%(mailto)s Cc=%(mailcc)s ' - 'Subject="%(mailsubject)s" Attachs=%(mailattachs)d', - mailto=to, mailcc=cc, mailsubject=subject, mailattachs=nattachs) + logger.info('Mail sent OK: To=%(mailto)s Cc=%(mailcc)s ' + 'Subject="%(mailsubject)s" Attachs=%(mailattachs)d', + {'mailto': to, 'mailcc': cc, 'mailsubject': subject, + 'mailattachs': nattachs}) def _sent_failed(self, failure, to, cc, subject, nattachs): errstr = str(failure.value) - log.msg(format='Unable to send mail: To=%(mailto)s Cc=%(mailcc)s ' - 'Subject="%(mailsubject)s" Attachs=%(mailattachs)d' - '- %(mailerr)s', - level=log.ERROR, mailto=to, mailcc=cc, mailsubject=subject, - mailattachs=nattachs, mailerr=errstr) + logger.error('Unable to send mail: To=%(mailto)s Cc=%(mailcc)s ' + 'Subject="%(mailsubject)s" Attachs=%(mailattachs)d' + '- %(mailerr)s', + {'mailto': to, 'mailcc': cc, 'mailsubject': subject, + 'mailattachs': nattachs, 'mailerr': errstr}) def _sendmail(self, to_addrs, msg): msg = StringIO(msg) diff --git a/scrapy/middleware.py b/scrapy/middleware.py 
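# Sketch of the new LogFormatter contract documented above: each method returns
# a dict with ``level``, ``msg`` and ``args``, and callers render it either as
# ``logkws['msg'] % logkws['args']`` or via the logformatter_adapter helper
# added in scrapy/utils/log.py. ``QuietDropFormatter`` and the sample item are
# hypothetical; a custom formatter is enabled through the LOG_FORMATTER setting.
import logging

from scrapy.logformatter import LogFormatter


class QuietDropFormatter(LogFormatter):
    """Report dropped items at DEBUG instead of WARNING."""

    def dropped(self, item, exception, response, spider):
        return {
            'level': logging.DEBUG,
            'msg': u"Dropped: %(exception)s -- %(item)s",
            'args': {'exception': exception, 'item': item},
        }


if __name__ == '__main__':
    logkws = QuietDropFormatter().dropped(
        {'name': u'example'}, Exception('missing price'), response=None, spider=None)
    print(logkws['msg'] % logkws['args'])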
index b1494b13781..a7adc39e3a0 100644 --- a/scrapy/middleware.py +++ b/scrapy/middleware.py @@ -1,10 +1,13 @@ +import logging from collections import defaultdict -from scrapy import log from scrapy.exceptions import NotConfigured from scrapy.utils.misc import load_object from scrapy.utils.defer import process_parallel, process_chain, process_chain_both +logger = logging.getLogger(__name__) + + class MiddlewareManager(object): """Base class for implementing middleware managers""" @@ -37,12 +40,15 @@ def from_settings(cls, settings, crawler=None): except NotConfigured as e: if e.args: clsname = clspath.split('.')[-1] - log.msg(format="Disabled %(clsname)s: %(eargs)s", - level=log.WARNING, clsname=clsname, eargs=e.args[0]) + logger.warning("Disabled %(clsname)s: %(eargs)s", + {'clsname': clsname, 'eargs': e.args[0]}, + extra={'crawler': crawler}) enabled = [x.__class__.__name__ for x in middlewares] - log.msg(format="Enabled %(componentname)ss: %(enabledlist)s", level=log.INFO, - componentname=cls.component_name, enabledlist=', '.join(enabled)) + logger.info("Enabled %(componentname)ss: %(enabledlist)s", + {'componentname': cls.component_name, + 'enabledlist': ', '.join(enabled)}, + extra={'crawler': crawler}) return cls(*middlewares) @classmethod diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py index 49addbc8146..0c3d7c5bd44 100644 --- a/scrapy/settings/default_settings.py +++ b/scrapy/settings/default_settings.py @@ -167,6 +167,8 @@ LOG_ENABLED = True LOG_ENCODING = 'utf-8' LOG_FORMATTER = 'scrapy.logformatter.LogFormatter' +LOG_FORMAT = '%(asctime)s [%(name)s] %(levelname)s: %(message)s' +LOG_DATEFORMAT = '%Y-%m-%d %H:%M:%S%z' LOG_STDOUT = False LOG_LEVEL = 'DEBUG' LOG_FILE = None diff --git a/scrapy/spider.py b/scrapy/spider.py index a2d091804a6..36623b6e228 100644 --- a/scrapy/spider.py +++ b/scrapy/spider.py @@ -3,9 +3,9 @@ See documentation in docs/topics/spiders.rst """ +import logging import warnings -from scrapy import log from scrapy import signals from scrapy.http import Request from scrapy.utils.trackref import object_ref @@ -31,11 +31,19 @@ def __init__(self, name=None, **kwargs): if not hasattr(self, 'start_urls'): self.start_urls = [] - def log(self, message, level=log.DEBUG, **kw): - """Log the given messages at the given log level. Always use this - method to send log messages from your spider + @property + def logger(self): + logger = logging.getLogger(self.name) + return logging.LoggerAdapter(logger, {'spider': self}) + + def log(self, message, level=logging.DEBUG, **kw): + """Log the given message at the given log level + + This helper wraps a log call to the logger within the spider, but you + can use it directly (e.g. Spider.logger.info('msg')) or use any other + Python logger too. 
""" - log.msg(message, spider=self, level=level, **kw) + self.logger.log(level, message, **kw) @classmethod def from_crawler(cls, crawler, *args, **kwargs): diff --git a/scrapy/statscol.py b/scrapy/statscol.py index 8a7eed14963..62b037f3623 100644 --- a/scrapy/statscol.py +++ b/scrapy/statscol.py @@ -2,8 +2,10 @@ Scrapy extension for collecting scraping stats """ import pprint +import logging + +logger = logging.getLogger(__name__) -from scrapy import log class StatsCollector(object): @@ -41,8 +43,8 @@ def open_spider(self, spider): def close_spider(self, spider, reason): if self._dump: - log.msg("Dumping Scrapy stats:\n" + pprint.pformat(self._stats), \ - spider=spider) + logger.info("Dumping Scrapy stats:\n" + pprint.pformat(self._stats), + extra={'spider': spider}) self._persist_stats(self._stats, spider) def _persist_stats(self, stats, spider): diff --git a/scrapy/telnet.py b/scrapy/telnet.py index d7cd601a2fa..d9add1d97f6 100644 --- a/scrapy/telnet.py +++ b/scrapy/telnet.py @@ -5,6 +5,7 @@ """ import pprint +import logging from twisted.internet import protocol try: @@ -15,7 +16,7 @@ TWISTED_CONCH_AVAILABLE = False from scrapy.exceptions import NotConfigured -from scrapy import log, signals +from scrapy import signals from scrapy.utils.trackref import print_live_refs from scrapy.utils.engine import print_engine_status from scrapy.utils.reactor import listen_tcp @@ -26,6 +27,8 @@ except ImportError: hpy = None +logger = logging.getLogger(__name__) + # signal to update telnet variables # args: telnet_vars update_telnet_vars = object() @@ -52,8 +55,9 @@ def from_crawler(cls, crawler): def start_listening(self): self.port = listen_tcp(self.portrange, self.host, self) h = self.port.getHost() - log.msg(format="Telnet console listening on %(host)s:%(port)d", - level=log.DEBUG, host=h.host, port=h.port) + logger.debug("Telnet console listening on %(host)s:%(port)d", + {'host': h.host, 'port': h.port}, + extra={'crawler': self.crawler}) def stop_listening(self): self.port.stopListening() diff --git a/scrapy/utils/iterators.py b/scrapy/utils/iterators.py index a889114d5c0..58ddf621528 100644 --- a/scrapy/utils/iterators.py +++ b/scrapy/utils/iterators.py @@ -1,15 +1,20 @@ -import re, csv, six +import re +import csv +import logging try: from cStringIO import StringIO as BytesIO except ImportError: from io import BytesIO +import six + from scrapy.http import TextResponse, Response from scrapy.selector import Selector -from scrapy import log from scrapy.utils.python import re_rsearch, str_to_unicode +logger = logging.getLogger(__name__) + def xmliter(obj, nodename): """Return a iterator of Selector's over all nodes of a XML document, @@ -108,8 +113,10 @@ def _getrow(csv_r): while True: row = _getrow(csv_r) if len(row) != len(headers): - log.msg(format="ignoring row %(csvlnum)d (length: %(csvrow)d, should be: %(csvheader)d)", - level=log.WARNING, csvlnum=csv_r.line_num, csvrow=len(row), csvheader=len(headers)) + logger.warning("ignoring row %(csvlnum)d (length: %(csvrow)d, " + "should be: %(csvheader)d)", + {'csvlnum': csv_r.line_num, 'csvrow': len(row), + 'csvheader': len(headers)}) continue else: yield dict(zip(headers, row)) diff --git a/scrapy/utils/log.py b/scrapy/utils/log.py new file mode 100644 index 00000000000..c9250d7a105 --- /dev/null +++ b/scrapy/utils/log.py @@ -0,0 +1,181 @@ +# -*- coding: utf-8 -*- + +import os +import sys +import logging +import warnings +from logging.config import dictConfig + +from twisted.python.failure import Failure +from twisted.python import log as 
twisted_log + +import scrapy +from scrapy.settings import overridden_settings +from scrapy.exceptions import ScrapyDeprecationWarning + +logger = logging.getLogger(__name__) + + +class FailureFormatter(logging.Filter): + """Extract exc_info from Failure instances provided as contextual data + + This filter mimics Twisted log.err formatting for its first `_stuff` + argument, which means that reprs of non Failure objects are appended to the + log messages. + """ + + def filter(self, record): + failure = record.__dict__.get('failure') + if failure: + if isinstance(failure, Failure): + record.exc_info = (failure.type, failure.value, failure.tb) + else: + record.msg += os.linesep + repr(failure) + return True + + +class TopLevelFormatter(logging.Filter): + """Keep only top level loggers's name (direct children from root) from + records. + + This filter will replace Scrapy loggers' names with 'scrapy'. This mimics + the old Scrapy log behaviour and helps shortening long names. + + Since it can't be set for just one logger (it won't propagate for its + children), it's going to be set in the root handler, with a parametrized + `loggers` list where it should act. + """ + + def __init__(self, loggers=None): + self.loggers = loggers or [] + + def filter(self, record): + if any(record.name.startswith(l + '.') for l in self.loggers): + record.name = record.name.split('.', 1)[0] + return True + + +DEFAULT_LOGGING = { + 'version': 1, + 'disable_existing_loggers': False, + 'filters': { + 'failure_formatter': { + '()': 'scrapy.utils.log.FailureFormatter', + }, + }, + 'loggers': { + 'scrapy': { + 'level': 'DEBUG', + 'filters': ['failure_formatter'], + }, + 'twisted': { + 'level': 'ERROR', + }, + } +} + + +def configure_logging(settings=None): + """Initialize and configure default loggers + + This function does: + - Route warnings and twisted logging through Python standard logging + - Set FailureFormatter filter on Scrapy logger + - Assign DEBUG and ERROR level to Scrapy and Twisted loggers respectively + - Create a handler for the root logger according to given settings + """ + if not sys.warnoptions: + # Route warnings through python logging + logging.captureWarnings(True) + + observer = twisted_log.PythonLoggingObserver('twisted') + observer.start() + + dictConfig(DEFAULT_LOGGING) + if settings: + logging.root.setLevel(logging.NOTSET) + + if settings.getbool('LOG_STDOUT'): + sys.stdout = StreamLogger(logging.getLogger('stdout')) + + # Set up the default log handler + filename = settings.get('LOG_FILE') + if filename: + encoding = settings.get('LOG_ENCODING') + handler = logging.FileHandler(filename, encoding=encoding) + elif settings.getbool('LOG_ENABLED'): + handler = logging.StreamHandler() + else: + handler = logging.NullHandler() + + formatter = logging.Formatter( + fmt=settings.get('LOG_FORMAT'), + datefmt=settings.get('LOG_DATEFORMAT') + ) + handler.setFormatter(formatter) + handler.setLevel(settings.get('LOG_LEVEL')) + handler.addFilter(TopLevelFormatter(['scrapy'])) + logging.root.addHandler(handler) + + +def log_scrapy_info(settings): + logger.info("Scrapy %(version)s started (bot: %(bot)s)", + {'version': scrapy.__version__, 'bot': settings['BOT_NAME']}) + + logger.info("Optional features available: %(features)s", + {'features': ", ".join(scrapy.optional_features)}) + + d = dict(overridden_settings(settings)) + logger.info("Overridden settings: %(settings)r", {'settings': d}) + + +class StreamLogger(object): + """Fake file-like stream object that redirects writes to a logger instance + + Taken 
from: + http://www.electricmonk.nl/log/2011/08/14/redirect-stdout-and-stderr-to-a-logger-in-python/ + """ + def __init__(self, logger, log_level=logging.INFO): + self.logger = logger + self.log_level = log_level + self.linebuf = '' + + def write(self, buf): + for line in buf.rstrip().splitlines(): + self.logger.log(self.log_level, line.rstrip()) + + +class LogCounterHandler(logging.Handler): + """Record log levels count into a crawler stats""" + + def __init__(self, crawler, *args, **kwargs): + super(LogCounterHandler, self).__init__(*args, **kwargs) + self.crawler = crawler + + def emit(self, record): + sname = 'log_count/{}'.format(record.levelname) + self.crawler.stats.inc_value(sname) + + +def logformatter_adapter(logkws): + """ + Helper that takes the dictionary output from the methods in LogFormatter + and adapts it into a tuple of positional arguments for logger.log calls, + handling backward compatibility as well. + """ + if not {'level', 'msg', 'args'} <= set(logkws): + warnings.warn('Missing keys in LogFormatter method', + ScrapyDeprecationWarning) + + if 'format' in logkws: + warnings.warn('`format` key in LogFormatter methods has been ' + 'deprecated, use `msg` instead', + ScrapyDeprecationWarning) + + level = logkws.get('level', logging.INFO) + message = logkws.get('format', logkws.get('msg')) + # NOTE: This also handles 'args' being an empty dict, that case doesn't + # play well in logger.log calls + args = logkws if not logkws.get('args') else logkws['args'] + + return (level, message, args) diff --git a/scrapy/utils/signal.py b/scrapy/utils/signal.py index 724f3a89269..cab172e4cf0 100644 --- a/scrapy/utils/signal.py +++ b/scrapy/utils/signal.py @@ -1,5 +1,7 @@ """Helper functions for working with signals""" +import logging + from twisted.internet.defer import maybeDeferred, DeferredList, Deferred from twisted.python.failure import Failure @@ -7,7 +9,8 @@ getAllReceivers, disconnect from scrapy.xlib.pydispatch.robustapply import robustApply -from scrapy import log +logger = logging.getLogger(__name__) + def send_catch_log(signal=Any, sender=Anonymous, *arguments, **named): """Like pydispatcher.robust.sendRobust but it also logs errors and returns @@ -21,14 +24,14 @@ def send_catch_log(signal=Any, sender=Anonymous, *arguments, **named): response = robustApply(receiver, signal=signal, sender=sender, *arguments, **named) if isinstance(response, Deferred): - log.msg(format="Cannot return deferreds from signal handler: %(receiver)s", - level=log.ERROR, spider=spider, receiver=receiver) + logger.error("Cannot return deferreds from signal handler: %(receiver)s", + {'receiver': receiver}, extra={'spider': spider}) except dont_log: result = Failure() except Exception: result = Failure() - log.err(result, "Error caught on signal handler: %s" % receiver, \ - spider=spider) + logger.exception("Error caught on signal handler: %(receiver)s", + {'receiver': receiver}, extra={'spider': spider}) else: result = response responses.append((receiver, result)) @@ -41,8 +44,9 @@ def send_catch_log_deferred(signal=Any, sender=Anonymous, *arguments, **named): """ def logerror(failure, recv): if dont_log is None or not isinstance(failure.value, dont_log): - log.err(failure, "Error caught on signal handler: %s" % recv, \ - spider=spider) + logger.error("Error caught on signal handler: %(receiver)s", + {'receiver': recv}, + extra={'spider': spider, 'failure': failure}) return failure dont_log = named.pop('dont_log', None) diff --git a/scrapy/utils/spider.py b/scrapy/utils/spider.py index 
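# Standalone sketch of the helpers defined in scrapy/utils/log.py above, as a
# script might use them: configure_logging() routes warnings and Twisted
# messages through stdlib logging, installs the Scrapy/Twisted logger levels
# and builds a root handler from the settings (LOG_FILE, LOG_ENABLED,
# LOG_FORMAT, LOG_DATEFORMAT, LOG_LEVEL), which is what CrawlerProcess now does
# in its __init__. logformatter_adapter turns a LogFormatter dict into the
# positional arguments for logger.log. The settings and values are illustrative.
import logging

from scrapy.settings import Settings
from scrapy.utils.log import configure_logging, logformatter_adapter

configure_logging(Settings())   # default settings: stderr handler, DEBUG level

logger = logging.getLogger('scrapy.demo')
logger.info("handlers now come from the settings")   # shown as [scrapy] by TopLevelFormatter

logkws = {'level': logging.DEBUG,
          'msg': u"Crawled (%(status)s) %(request)s",
          'args': {'status': 200, 'request': '<GET http://example.com>'}}
logger.log(*logformatter_adapter(logkws))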
44f098f052c..7ed2d0c3b65 100644 --- a/scrapy/utils/spider.py +++ b/scrapy/utils/spider.py @@ -1,11 +1,13 @@ +import logging import inspect import six -from scrapy import log from scrapy.spider import Spider from scrapy.utils.misc import arg_to_iter +logger = logging.getLogger(__name__) + def iterate_spider_output(result): return arg_to_iter(result) @@ -43,12 +45,12 @@ def spidercls_for_request(spider_loader, request, default_spidercls=None, return spider_loader.load(snames[0]) if len(snames) > 1 and log_multiple: - log.msg(format='More than one spider can handle: %(request)s - %(snames)s', - level=log.ERROR, request=request, snames=', '.join(snames)) + logger.error('More than one spider can handle: %(request)s - %(snames)s', + {'request': request, 'snames': ', '.join(snames)}) if len(snames) == 0 and log_none: - log.msg(format='Unable to find spider that handles: %(request)s', - level=log.ERROR, request=request) + logger.error('Unable to find spider that handles: %(request)s', + {'request': request}) return default_spidercls diff --git a/scrapy/utils/test.py b/scrapy/utils/test.py index a4b769970c9..ad4a6aa7c4d 100644 --- a/scrapy/utils/test.py +++ b/scrapy/utils/test.py @@ -46,19 +46,6 @@ def get_testenv(): env['PYTHONPATH'] = get_pythonpath() return env -def get_testlog(): - """Get Scrapy log of current test, ignoring the rest""" - with open("test.log", "rb") as fp: - loglines = fp.readlines() - - thistest = [] - for line in loglines[::-1]: - thistest.append(line) - if "[-] -->" in line: - break - return "".join(thistest[::-1]) - - def assert_samelines(testcase, text1, text2, msg=None): """Asserts text1 and text2 have the same lines, ignoring differences in line endings between platforms diff --git a/tests/py3-ignores.txt b/tests/py3-ignores.txt index 7a150b28173..d27d5dd3c6d 100644 --- a/tests/py3-ignores.txt +++ b/tests/py3-ignores.txt @@ -31,7 +31,6 @@ tests/test_http_cookies.py tests/test_http_request.py tests/test_http_response.py tests/test_logformatter.py -tests/test_log.py tests/test_mail.py tests/test_middleware.py tests/test_pipeline_files.py @@ -59,6 +58,7 @@ tests/test_stats.py tests/test_utils_defer.py tests/test_utils_iterators.py tests/test_utils_jsonrpc.py +tests/test_utils_log.py tests/test_utils_python.py tests/test_utils_reqser.py tests/test_utils_request.py diff --git a/tests/requirements.txt b/tests/requirements.txt index 4da8e30ea6c..4f1520021dc 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -4,3 +4,4 @@ mitmproxy==0.10.1 netlib==0.10.1 pytest-twisted jmespath +testfixtures diff --git a/tests/spiders.py b/tests/spiders.py index 86ace9d6e42..5484fc5b923 100644 --- a/tests/spiders.py +++ b/tests/spiders.py @@ -74,7 +74,7 @@ def __init__(self, url="http://localhost:8998", *args, **kwargs): self.start_urls = [url] def parse(self, response): - self.log("Got response %d" % response.status) + self.logger.info("Got response %d" % response.status) class ItemSpider(FollowAllSpider): diff --git a/tests/test_closespider.py b/tests/test_closespider.py index 1700a861ea6..fa0b489989b 100644 --- a/tests/test_closespider.py +++ b/tests/test_closespider.py @@ -39,7 +39,6 @@ def test_closespider_errorcount(self): close_on = 5 crawler = get_crawler(ErrorSpider, {'CLOSESPIDER_ERRORCOUNT': close_on}) yield crawler.crawl(total=1000000) - self.flushLoggedErrors(crawler.spider.exception_cls) reason = crawler.spider.meta['close_reason'] self.assertEqual(reason, 'closespider_errorcount') key = 'spider_exceptions/{name}'\ diff --git a/tests/test_commands.py 
b/tests/test_commands.py index 68f76d00298..b95d0b0ccb3 100644 --- a/tests/test_commands.py +++ b/tests/test_commands.py @@ -137,21 +137,20 @@ def test_runspider(self): with open(fname, 'w') as f: f.write(""" import scrapy -from scrapy import log class MySpider(scrapy.Spider): name = 'myspider' def start_requests(self): - self.log("It Works!") + self.logger.debug("It Works!") return [] """) p = self.proc('runspider', fname) log = p.stderr.read() - self.assertIn("[myspider] DEBUG: It Works!", log) - self.assertIn("[myspider] INFO: Spider opened", log) - self.assertIn("[myspider] INFO: Closing spider (finished)", log) - self.assertIn("[myspider] INFO: Spider closed (finished)", log) + self.assertIn("DEBUG: It Works!", log) + self.assertIn("INFO: Spider opened", log) + self.assertIn("INFO: Closing spider (finished)", log) + self.assertIn("INFO: Spider closed (finished)", log) def test_runspider_no_spider_found(self): tmpdir = self.mktemp() @@ -159,7 +158,6 @@ def test_runspider_no_spider_found(self): fname = abspath(join(tmpdir, 'myspider.py')) with open(fname, 'w') as f: f.write(""" -from scrapy import log from scrapy.spider import Spider """) p = self.proc('runspider', fname) @@ -192,7 +190,6 @@ def setUp(self): fname = abspath(join(self.proj_mod_path, 'spiders', 'myspider.py')) with open(fname, 'w') as f: f.write(""" -from scrapy import log import scrapy class MySpider(scrapy.Spider): @@ -200,20 +197,20 @@ class MySpider(scrapy.Spider): def parse(self, response): if getattr(self, 'test_arg', None): - self.log('It Works!') + self.logger.debug('It Works!') return [scrapy.Item(), dict(foo='bar')] """.format(self.spider_name)) fname = abspath(join(self.proj_mod_path, 'pipelines.py')) with open(fname, 'w') as f: f.write(""" -from scrapy import log +import logging class MyPipeline(object): component_name = 'my_pipeline' def process_item(self, item, spider): - log.msg('It Works!') + logging.info('It Works!') return item """) @@ -229,7 +226,7 @@ def test_spider_arguments(self): '-a', 'test_arg=1', '-c', 'parse', self.url('/html')]) - self.assertIn("[parse_spider] DEBUG: It Works!", stderr) + self.assertIn("DEBUG: It Works!", stderr) @defer.inlineCallbacks def test_pipelines(self): @@ -237,7 +234,7 @@ def test_pipelines(self): '--pipelines', '-c', 'parse', self.url('/html')]) - self.assertIn("[scrapy] INFO: It Works!", stderr) + self.assertIn("INFO: It Works!", stderr) @defer.inlineCallbacks def test_parse_items(self): diff --git a/tests/test_crawl.py b/tests/test_crawl.py index 0fcc7c31ebc..f2ebf9c6996 100644 --- a/tests/test_crawl.py +++ b/tests/test_crawl.py @@ -1,8 +1,12 @@ import json import socket +import logging + +from testfixtures import LogCapture from twisted.internet import defer from twisted.trial.unittest import TestCase -from scrapy.utils.test import get_crawler, get_testlog + +from scrapy.utils.test import get_crawler from tests import mock from tests.spiders import FollowAllSpider, DelaySpider, SimpleSpider, \ BrokenStartRequestsSpider, SingleRequestSpider, DuplicateStartRequestsSpider @@ -72,36 +76,47 @@ def test_timeout_failure(self): @defer.inlineCallbacks def test_retry_503(self): crawler = get_crawler(SimpleSpider) - yield crawler.crawl("http://localhost:8998/status?n=503") - self._assert_retried() + with LogCapture() as l: + yield crawler.crawl("http://localhost:8998/status?n=503") + self._assert_retried(l) @defer.inlineCallbacks def test_retry_conn_failed(self): crawler = get_crawler(SimpleSpider) - yield crawler.crawl("http://localhost:65432/status?n=503") - 
self._assert_retried() + with LogCapture() as l: + yield crawler.crawl("http://localhost:65432/status?n=503") + self._assert_retried(l) @defer.inlineCallbacks def test_retry_dns_error(self): with mock.patch('socket.gethostbyname', side_effect=socket.gaierror(-5, 'No address associated with hostname')): crawler = get_crawler(SimpleSpider) - yield crawler.crawl("http://example.com/") - self._assert_retried() + with LogCapture() as l: + yield crawler.crawl("http://example.com/") + self._assert_retried(l) @defer.inlineCallbacks def test_start_requests_bug_before_yield(self): - crawler = get_crawler(BrokenStartRequestsSpider) - yield crawler.crawl(fail_before_yield=1) - errors = self.flushLoggedErrors(ZeroDivisionError) - self.assertEqual(len(errors), 1) + with LogCapture('scrapy', level=logging.ERROR) as l: + crawler = get_crawler(BrokenStartRequestsSpider) + yield crawler.crawl(fail_before_yield=1) + + self.assertEqual(len(l.records), 1) + record = l.records[0] + self.assertIsNotNone(record.exc_info) + self.assertIs(record.exc_info[0], ZeroDivisionError) @defer.inlineCallbacks def test_start_requests_bug_yielding(self): - crawler = get_crawler(BrokenStartRequestsSpider) - yield crawler.crawl(fail_yielding=1) - errors = self.flushLoggedErrors(ZeroDivisionError) - self.assertEqual(len(errors), 1) + with LogCapture('scrapy', level=logging.ERROR) as l: + crawler = get_crawler(BrokenStartRequestsSpider) + yield crawler.crawl(fail_yielding=1) + + self.assertEqual(len(l.records), 1) + record = l.records[0] + self.assertIsNotNone(record.exc_info) + self.assertIs(record.exc_info[0], ZeroDivisionError) @defer.inlineCallbacks def test_start_requests_lazyness(self): @@ -145,28 +160,29 @@ def test_unbounded_response(self): with multiples lines '''}) crawler = get_crawler(SimpleSpider) - yield crawler.crawl("http://localhost:8998/raw?{0}".format(query)) - log = get_testlog() - self.assertEqual(log.count("Got response 200"), 1) + with LogCapture() as l: + yield crawler.crawl("http://localhost:8998/raw?{0}".format(query)) + self.assertEqual(str(l).count("Got response 200"), 1) @defer.inlineCallbacks def test_retry_conn_lost(self): # connection lost after receiving data crawler = get_crawler(SimpleSpider) - yield crawler.crawl("http://localhost:8998/drop?abort=0") - self._assert_retried() + with LogCapture() as l: + yield crawler.crawl("http://localhost:8998/drop?abort=0") + self._assert_retried(l) @defer.inlineCallbacks def test_retry_conn_aborted(self): # connection lost before receiving data crawler = get_crawler(SimpleSpider) - yield crawler.crawl("http://localhost:8998/drop?abort=1") - self._assert_retried() + with LogCapture() as l: + yield crawler.crawl("http://localhost:8998/drop?abort=1") + self._assert_retried(l) - def _assert_retried(self): - log = get_testlog() - self.assertEqual(log.count("Retrying"), 2) - self.assertEqual(log.count("Gave up retrying"), 1) + def _assert_retried(self, log): + self.assertEqual(str(log).count("Retrying"), 2) + self.assertEqual(str(log).count("Gave up retrying"), 1) @defer.inlineCallbacks def test_referer_header(self): diff --git a/tests/test_log.py b/tests/test_log.py deleted file mode 100644 index 113d1000451..00000000000 --- a/tests/test_log.py +++ /dev/null @@ -1,169 +0,0 @@ -from io import BytesIO - -from twisted.python import log as txlog, failure -from twisted.trial import unittest - -from scrapy import log -from scrapy.spider import Spider -from scrapy.settings import default_settings -from scrapy.utils.test import get_crawler - -class 
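# Minimal illustration of the testfixtures.LogCapture pattern the tests above
# move to (``testfixtures`` is added to tests/requirements.txt): records are
# captured in memory and asserted on directly, replacing the removed
# get_testlog() helper that re-read a test.log file. Standalone example, not
# part of the Scrapy test suite.
import logging
import unittest

from testfixtures import LogCapture


class LogCaptureExample(unittest.TestCase):

    def test_capture_and_assert(self):
        logger = logging.getLogger('scrapy.demo')
        with LogCapture('scrapy', level=logging.INFO) as l:
            logger.info('Retrying %(request)s', {'request': '<GET http://example.com>'})
            logger.debug('below the capture level, not recorded')
        self.assertEqual(len(l.records), 1)
        self.assertIn('Retrying', str(l))


if __name__ == '__main__':
    unittest.main()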
LogTest(unittest.TestCase): - - def test_get_log_level(self): - default_log_level = getattr(log, default_settings.LOG_LEVEL) - self.assertEqual(log._get_log_level('WARNING'), log.WARNING) - self.assertEqual(log._get_log_level(log.WARNING), log.WARNING) - self.assertRaises(ValueError, log._get_log_level, object()) - -class ScrapyFileLogObserverTest(unittest.TestCase): - - level = log.INFO - encoding = 'utf-8' - - def setUp(self): - self.f = BytesIO() - self.log_observer = log.ScrapyFileLogObserver(self.f, self.level, self.encoding) - self.log_observer.start() - - def tearDown(self): - self.flushLoggedErrors() - self.log_observer.stop() - - def logged(self): - return self.f.getvalue().strip()[25:] - - def first_log_line(self): - logged = self.logged() - return logged.splitlines()[0] if logged else '' - - def test_msg_basic(self): - log.msg("Hello") - self.assertEqual(self.logged(), "[scrapy] INFO: Hello") - - def test_msg_ignore_spider(self): - spider = Spider("myspider") - log.msg("Hello", spider=spider) - self.failIf(self.logged()) - - def test_msg_level1(self): - log.msg("Hello", level=log.WARNING) - self.assertEqual(self.logged(), "[scrapy] WARNING: Hello") - - def test_msg_level2(self): - log.msg("Hello", log.WARNING) - self.assertEqual(self.logged(), "[scrapy] WARNING: Hello") - - def test_msg_wrong_level(self): - log.msg("Hello", level=9999) - self.assertEqual(self.logged(), "[scrapy] NOLEVEL: Hello") - - def test_msg_encoding(self): - log.msg(u"Price: \xa3100") - self.assertEqual(self.logged(), "[scrapy] INFO: Price: \xc2\xa3100") - - def test_msg_ignore_level(self): - log.msg("Hello", level=log.DEBUG) - log.msg("World", level=log.INFO) - self.assertEqual(self.logged(), "[scrapy] INFO: World") - - def test_msg_ignore_system(self): - txlog.msg("Hello") - self.failIf(self.logged()) - - def test_msg_ignore_system_err(self): - txlog.msg("Hello") - self.failIf(self.logged()) - - def test_err_noargs(self): - try: - a = 1/0 - except: - log.err() - self.assertIn('Traceback', self.logged()) - self.assertIn('ZeroDivisionError', self.logged()) - - def test_err_why(self): - log.err(TypeError("bad type"), "Wrong type") - self.assertEqual(self.first_log_line(), "[scrapy] ERROR: Wrong type") - self.assertIn('TypeError', self.logged()) - self.assertIn('bad type', self.logged()) - - def test_error_outside_scrapy(self): - """Scrapy logger should still print outside errors""" - txlog.err(TypeError("bad type"), "Wrong type") - self.assertEqual(self.first_log_line(), "[-] ERROR: Wrong type") - self.assertIn('TypeError', self.logged()) - self.assertIn('bad type', self.logged()) - -# this test fails in twisted trial observer, not in scrapy observer -# def test_err_why_encoding(self): -# log.err(TypeError("bad type"), u"\xa3") -# self.assertEqual(self.first_log_line(), "[scrapy] ERROR: \xc2\xa3") - - def test_err_exc(self): - log.err(TypeError("bad type")) - self.assertIn('Unhandled Error', self.logged()) - self.assertIn('TypeError', self.logged()) - self.assertIn('bad type', self.logged()) - - def test_err_failure(self): - log.err(failure.Failure(TypeError("bad type"))) - self.assertIn('Unhandled Error', self.logged()) - self.assertIn('TypeError', self.logged()) - self.assertIn('bad type', self.logged()) - - -class Latin1ScrapyFileLogObserverTest(ScrapyFileLogObserverTest): - - encoding = 'latin-1' - - def test_msg_encoding(self): - log.msg(u"Price: \xa3100") - logged = self.f.getvalue().strip()[25:] - self.assertEqual(self.logged(), "[scrapy] INFO: Price: \xa3100") - -# this test fails in twisted trial 
observer, not in scrapy observer -# def test_err_why_encoding(self): -# log.err(TypeError("bad type"), u"\xa3") -# self.assertEqual(self.first_log_line(), "[scrapy] ERROR: \xa3") - - -class CrawlerScrapyFileLogObserverTest(unittest.TestCase): - - def setUp(self): - self.f = BytesIO() - self.crawler = get_crawler(Spider) - self.spider = self.crawler.spider = self.crawler._create_spider('test') - self.log_observer = log.ScrapyFileLogObserver(self.f, log.INFO, - 'utf-8', self.crawler) - self.log_observer.start() - - def tearDown(self): - self.flushLoggedErrors() - self.log_observer.stop() - - def logged(self): - return self.f.getvalue().strip()[25:] - - def test_msg_basic(self): - log.msg("Hello", spider=self.spider) - self.assertEqual(self.logged(), "[test] INFO: Hello") - - def test_msg_ignore_scrapy_channel(self): - log.msg("Hello") - self.failIf(self.logged()) - - def test_msg_ignore_another_crawler(self): - crawler = get_crawler(Spider) - log.msg("Hello", spider=crawler._create_spider('test')) - self.failIf(self.logged()) - - def test_msg_stats_log(self): - assert self.crawler.stats.get_value('log_count/INFO', 0) == 0 - log.msg("Hello", spider=self.spider) - self.assertEqual(self.crawler.stats.get_value('log_count/INFO'), 1) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_logformatter.py b/tests/test_logformatter.py index adec19afa4c..8446fd646c8 100644 --- a/tests/test_logformatter.py +++ b/tests/test_logformatter.py @@ -24,14 +24,14 @@ def test_crawled(self): req = Request("http://www.example.com") res = Response("http://www.example.com") logkws = self.formatter.crawled(req, res, self.spider) - logline = logkws['format'] % logkws + logline = logkws['msg'] % logkws['args'] self.assertEqual(logline, "Crawled (200) (referer: None)") req = Request("http://www.example.com", headers={'referer': 'http://example.com'}) res = Response("http://www.example.com", flags=['cached']) logkws = self.formatter.crawled(req, res, self.spider) - logline = logkws['format'] % logkws + logline = logkws['msg'] % logkws['args'] self.assertEqual(logline, "Crawled (200) (referer: http://example.com) ['cached']") @@ -40,7 +40,7 @@ def test_dropped(self): exception = Exception(u"\u2018") response = Response("http://www.example.com") logkws = self.formatter.dropped(item, exception, response, self.spider) - logline = logkws['format'] % logkws + logline = logkws['msg'] % logkws['args'] lines = logline.splitlines() assert all(isinstance(x, unicode) for x in lines) self.assertEqual(lines, [u"Dropped: \u2018", '{}']) @@ -50,7 +50,7 @@ def test_scraped(self): item['name'] = u'\xa3' response = Response("http://www.example.com") logkws = self.formatter.scraped(item, response, self.spider) - logline = logkws['format'] % logkws + logline = logkws['msg'] % logkws['args'] lines = logline.splitlines() assert all(isinstance(x, unicode) for x in lines) self.assertEqual(lines, [u"Scraped from <200 http://www.example.com>", u'name: \xa3']) diff --git a/tests/test_pipeline_media.py b/tests/test_pipeline_media.py index e4fae39836e..0e946303be4 100644 --- a/tests/test_pipeline_media.py +++ b/tests/test_pipeline_media.py @@ -1,9 +1,9 @@ from __future__ import print_function +from testfixtures import LogCapture from twisted.trial import unittest from twisted.python.failure import Failure from twisted.internet import reactor from twisted.internet.defer import Deferred, inlineCallbacks -from twisted.python import log as txlog from scrapy.http import Request, Response from scrapy.spider import Spider @@ -11,7 +11,6 
@@ from scrapy.contrib.pipeline.media import MediaPipeline from scrapy.utils.signal import disconnect_all from scrapy import signals -from scrapy import log def _mocked_download_func(request, info): @@ -60,26 +59,21 @@ def test_default_item_completed(self): fail = Failure(Exception()) results = [(True, 1), (False, fail)] - events = [] - txlog.addObserver(events.append) - new_item = self.pipe.item_completed(results, item, self.info) - txlog.removeObserver(events.append) - self.flushLoggedErrors() + with LogCapture() as l: + new_item = self.pipe.item_completed(results, item, self.info) assert new_item is item - assert len(events) == 1 - assert events[0]['logLevel'] == log.ERROR - assert events[0]['failure'] is fail + assert len(l.records) == 1 + record = l.records[0] + assert record.levelname == 'ERROR' + assert record.failure is fail # disable failure logging and check again self.pipe.LOG_FAILED_RESULTS = False - events = [] - txlog.addObserver(events.append) - new_item = self.pipe.item_completed(results, item, self.info) - txlog.removeObserver(events.append) - self.flushLoggedErrors() + with LogCapture() as l: + new_item = self.pipe.item_completed(results, item, self.info) assert new_item is item - assert len(events) == 0 + assert len(l.records) == 0 @inlineCallbacks def test_default_process_item(self): diff --git a/tests/test_proxy_connect.py b/tests/test_proxy_connect.py index 5ce48ebf8c8..0f06fd53dea 100644 --- a/tests/test_proxy_connect.py +++ b/tests/test_proxy_connect.py @@ -5,10 +5,11 @@ from threading import Thread from libmproxy import controller, proxy from netlib import http_auth +from testfixtures import LogCapture from twisted.internet import defer from twisted.trial.unittest import TestCase -from scrapy.utils.test import get_testlog, get_crawler +from scrapy.utils.test import get_crawler from scrapy.http import Request from tests.spiders import SimpleSpider, SingleRequestSpider from tests.mockserver import MockServer @@ -50,39 +51,44 @@ def tearDown(self): @defer.inlineCallbacks def test_https_connect_tunnel(self): crawler = get_crawler(SimpleSpider) - yield crawler.crawl("https://localhost:8999/status?n=200") - self._assert_got_response_code(200) + with LogCapture() as l: + yield crawler.crawl("https://localhost:8999/status?n=200") + self._assert_got_response_code(200, l) @defer.inlineCallbacks def test_https_noconnect(self): os.environ['https_proxy'] = 'http://scrapy:scrapy@localhost:8888?noconnect' crawler = get_crawler(SimpleSpider) - yield crawler.crawl("https://localhost:8999/status?n=200") - self._assert_got_response_code(200) + with LogCapture() as l: + yield crawler.crawl("https://localhost:8999/status?n=200") + self._assert_got_response_code(200, l) os.environ['https_proxy'] = 'http://scrapy:scrapy@localhost:8888' @defer.inlineCallbacks def test_https_connect_tunnel_error(self): crawler = get_crawler(SimpleSpider) - yield crawler.crawl("https://localhost:99999/status?n=200") - self._assert_got_tunnel_error() + with LogCapture() as l: + yield crawler.crawl("https://localhost:99999/status?n=200") + self._assert_got_tunnel_error(l) @defer.inlineCallbacks def test_https_tunnel_auth_error(self): os.environ['https_proxy'] = 'http://wrong:wronger@localhost:8888' crawler = get_crawler(SimpleSpider) - yield crawler.crawl("https://localhost:8999/status?n=200") + with LogCapture() as l: + yield crawler.crawl("https://localhost:8999/status?n=200") # The proxy returns a 407 error code but it does not reach the client; # he just sees a TunnelError. 
- self._assert_got_tunnel_error() + self._assert_got_tunnel_error(l) os.environ['https_proxy'] = 'http://scrapy:scrapy@localhost:8888' @defer.inlineCallbacks def test_https_tunnel_without_leak_proxy_authorization_header(self): request = Request("https://localhost:8999/echo") crawler = get_crawler(SingleRequestSpider) - yield crawler.crawl(seed=request) - self._assert_got_response_code(200) + with LogCapture() as l: + yield crawler.crawl(seed=request) + self._assert_got_response_code(200, l) echo = json.loads(crawler.spider.meta['responses'][0].body) self.assertTrue('Proxy-Authorization' not in echo['headers']) @@ -90,13 +96,12 @@ def test_https_tunnel_without_leak_proxy_authorization_header(self): def test_https_noconnect_auth_error(self): os.environ['https_proxy'] = 'http://wrong:wronger@localhost:8888?noconnect' crawler = get_crawler(SimpleSpider) - yield crawler.crawl("https://localhost:8999/status?n=200") - self._assert_got_response_code(407) + with LogCapture() as l: + yield crawler.crawl("https://localhost:8999/status?n=200") + self._assert_got_response_code(407, l) - def _assert_got_response_code(self, code): - log = get_testlog() - self.assertEqual(log.count('Crawled (%d)' % code), 1) + def _assert_got_response_code(self, code, log): + self.assertEqual(str(log).count('Crawled (%d)' % code), 1) - def _assert_got_tunnel_error(self): - log = get_testlog() - self.assertEqual(log.count('TunnelError'), 1) + def _assert_got_tunnel_error(self, log): + self.assertEqual(str(log).count('TunnelError'), 1) diff --git a/tests/test_spider.py b/tests/test_spider.py index 56bcb2b1e35..517fc09950e 100644 --- a/tests/test_spider.py +++ b/tests/test_spider.py @@ -2,6 +2,8 @@ import inspect import warnings from io import BytesIO + +from testfixtures import LogCapture from twisted.trial import unittest from scrapy import signals @@ -102,6 +104,23 @@ def test_update_settings(self): self.assertEqual(settings.get('TEST2'), 'spider') self.assertEqual(settings.get('TEST3'), 'project') + def test_logger(self): + spider = self.spider_class('example.com') + with LogCapture() as l: + spider.logger.info('test log msg') + l.check(('example.com', 'INFO', 'test log msg')) + + record = l.records[0] + self.assertIn('spider', record.__dict__) + self.assertIs(record.spider, spider) + + def test_log(self): + spider = self.spider_class('example.com') + with mock.patch('scrapy.spider.Spider.logger') as mock_logger: + spider.log('test log msg', 'INFO') + mock_logger.log.assert_called_once_with('INFO', 'test log msg') + + class InitSpiderTest(SpiderTest): spider_class = InitSpider diff --git a/tests/test_spidermiddleware_httperror.py b/tests/test_spidermiddleware_httperror.py index 5394f0eeea1..503865f6305 100644 --- a/tests/test_spidermiddleware_httperror.py +++ b/tests/test_spidermiddleware_httperror.py @@ -1,9 +1,10 @@ from unittest import TestCase +from testfixtures import LogCapture from twisted.trial.unittest import TestCase as TrialTestCase from twisted.internet import defer -from scrapy.utils.test import get_crawler, get_testlog +from scrapy.utils.test import get_crawler from tests.mockserver import MockServer from scrapy.http import Response, Request from scrapy.spider import Spider @@ -174,14 +175,13 @@ def test_middleware_works(self): @defer.inlineCallbacks def test_logging(self): crawler = get_crawler(_HttpErrorSpider) - yield crawler.crawl(bypass_status_codes={402}) - # print(get_testlog()) + with LogCapture() as log: + yield crawler.crawl(bypass_status_codes={402}) self.assertEqual(crawler.spider.parsed, 
{'200', '402'}) self.assertEqual(crawler.spider.skipped, {'402'}) self.assertEqual(crawler.spider.failed, {'404', '500'}) - log = get_testlog() - self.assertIn('Ignoring response <404', log) - self.assertIn('Ignoring response <500', log) - self.assertNotIn('Ignoring response <200', log) - self.assertNotIn('Ignoring response <402', log) + self.assertIn('Ignoring response <404', str(log)) + self.assertIn('Ignoring response <500', str(log)) + self.assertNotIn('Ignoring response <200', str(log)) + self.assertNotIn('Ignoring response <402', str(log)) diff --git a/tests/test_utils_defer.py b/tests/test_utils_defer.py index b9cb3ab44f4..260ed6c2e3c 100644 --- a/tests/test_utils_defer.py +++ b/tests/test_utils_defer.py @@ -74,7 +74,6 @@ def test_process_parallel(self): def test_process_parallel_failure(self): d = process_parallel([cb1, cb_fail, cb3], 'res', 'v1', 'v2') self.failUnlessFailure(d, TypeError) - self.flushLoggedErrors() return d diff --git a/tests/test_utils_log.py b/tests/test_utils_log.py new file mode 100644 index 00000000000..e6b8c77d929 --- /dev/null +++ b/tests/test_utils_log.py @@ -0,0 +1,132 @@ +# -*- coding: utf-8 -*- +from __future__ import print_function +import os +import sys +import logging +import unittest + +from testfixtures import LogCapture +from twisted.python.failure import Failure + +from scrapy.utils.log import (FailureFormatter, TopLevelFormatter, + LogCounterHandler, StreamLogger) +from scrapy.utils.test import get_crawler + + +class FailureFormatterTest(unittest.TestCase): + + def setUp(self): + self.logger = logging.getLogger('test') + self.filter = FailureFormatter() + self.logger.addFilter(self.filter) + + def tearDown(self): + self.logger.removeFilter(self.filter) + + def test_failure_format(self): + with LogCapture() as l: + try: + 0/0 + except ZeroDivisionError: + self.logger.exception('test log msg') + failure = Failure() + + self.logger.error('test log msg', extra={'failure': failure}) + + self.assertEqual(len(l.records), 2) + exc_record, failure_record = l.records + self.assertTupleEqual(failure_record.exc_info, exc_record.exc_info) + + formatter = logging.Formatter() + self.assertMultiLineEqual(formatter.format(failure_record), + formatter.format(exc_record)) + + def test_non_failure_format(self): + with LogCapture() as l: + self.logger.error('test log msg', extra={'failure': 3}) + + self.assertEqual(len(l.records), 1) + self.assertMultiLineEqual(l.records[0].getMessage(), + 'test log msg' + os.linesep + '3') + + +class TopLevelFormatterTest(unittest.TestCase): + + def setUp(self): + self.handler = LogCapture() + self.handler.addFilter(TopLevelFormatter(['test'])) + + def test_top_level_logger(self): + logger = logging.getLogger('test') + with self.handler as l: + logger.warning('test log msg') + + l.check(('test', 'WARNING', 'test log msg')) + + def test_children_logger(self): + logger = logging.getLogger('test.test1') + with self.handler as l: + logger.warning('test log msg') + + l.check(('test', 'WARNING', 'test log msg')) + + def test_overlapping_name_logger(self): + logger = logging.getLogger('test2') + with self.handler as l: + logger.warning('test log msg') + + l.check(('test2', 'WARNING', 'test log msg')) + + def test_different_name_logger(self): + logger = logging.getLogger('different') + with self.handler as l: + logger.warning('test log msg') + + l.check(('different', 'WARNING', 'test log msg')) + + +class LogCounterHandlerTest(unittest.TestCase): + + def setUp(self): + self.logger = logging.getLogger('test') + 
self.logger.setLevel(logging.NOTSET) + self.logger.propagate = False + self.crawler = get_crawler(settings_dict={'LOG_LEVEL': 'WARNING'}) + self.handler = LogCounterHandler(self.crawler) + self.logger.addHandler(self.handler) + + def tearDown(self): + self.logger.propagate = True + self.logger.removeHandler(self.handler) + + def test_init(self): + self.assertIsNone(self.crawler.stats.get_value('log_count/DEBUG')) + self.assertIsNone(self.crawler.stats.get_value('log_count/INFO')) + self.assertIsNone(self.crawler.stats.get_value('log_count/WARNING')) + self.assertIsNone(self.crawler.stats.get_value('log_count/ERROR')) + self.assertIsNone(self.crawler.stats.get_value('log_count/CRITICAL')) + + def test_accepted_level(self): + self.logger.error('test log msg') + self.assertEqual(self.crawler.stats.get_value('log_count/ERROR'), 1) + + def test_filtered_out_level(self): + self.logger.debug('test log msg') + self.assertIsNone(self.crawler.stats.get_value('log_count/INFO')) + + +class StreamLoggerTest(unittest.TestCase): + + def setUp(self): + self.stdout = sys.stdout + logger = logging.getLogger('test') + logger.setLevel(logging.WARNING) + sys.stdout = StreamLogger(logger, logging.ERROR) + + def tearDown(self): + sys.stdout = self.stdout + + def test_redirect(self): + with LogCapture() as l: + print('test log msg') + l.check(('test', 'ERROR', 'test log msg')) diff --git a/tests/test_utils_signal.py b/tests/test_utils_signal.py index a9f377dabf0..57107391115 100644 --- a/tests/test_utils_signal.py +++ b/tests/test_utils_signal.py @@ -1,11 +1,11 @@ +from testfixtures import LogCapture from twisted.trial import unittest -from twisted.python import log as txlog from twisted.python.failure import Failure from twisted.internet import defer, reactor from scrapy.xlib.pydispatch import dispatcher from scrapy.utils.signal import send_catch_log, send_catch_log_deferred -from scrapy import log + class SendCatchLogTest(unittest.TestCase): @@ -14,26 +14,24 @@ def test_send_catch_log(self): test_signal = object() handlers_called = set() - def log_received(event): - handlers_called.add(log_received) - assert "error_handler" in event['message'][0] - assert event['logLevel'] == log.ERROR - - txlog.addObserver(log_received) dispatcher.connect(self.error_handler, signal=test_signal) dispatcher.connect(self.ok_handler, signal=test_signal) - result = yield defer.maybeDeferred(self._get_result, test_signal, arg='test', \ - handlers_called=handlers_called) + with LogCapture() as l: + result = yield defer.maybeDeferred( + self._get_result, test_signal, arg='test', + handlers_called=handlers_called + ) assert self.error_handler in handlers_called assert self.ok_handler in handlers_called - assert log_received in handlers_called + self.assertEqual(len(l.records), 1) + record = l.records[0] + self.assertIn('error_handler', record.getMessage()) + self.assertEqual(record.levelname, 'ERROR') self.assertEqual(result[0][0], self.error_handler) self.assert_(isinstance(result[0][1], Failure)) self.assertEqual(result[1], (self.ok_handler, "OK")) - txlog.removeObserver(log_received) - self.flushLoggedErrors() dispatcher.disconnect(self.error_handler, signal=test_signal) dispatcher.disconnect(self.ok_handler, signal=test_signal) @@ -73,12 +71,9 @@ class SendCatchLogTest2(unittest.TestCase): def test_error_logged_if_deferred_not_supported(self): test_signal = object() test_handler = lambda: defer.Deferred() - log_events = [] - txlog.addObserver(log_events.append) dispatcher.connect(test_handler, test_signal) - 
send_catch_log(test_signal) - self.assertTrue(log_events) - self.assertIn("Cannot return deferreds from signal handler", str(log_events)) - txlog.removeObserver(log_events.append) - self.flushLoggedErrors() + with LogCapture() as l: + send_catch_log(test_signal) + self.assertEqual(len(l.records), 1) + self.assertIn("Cannot return deferreds from signal handler", str(l)) dispatcher.disconnect(test_handler, test_signal) diff --git a/tox.ini b/tox.ini index ae2ef9e07a8..5385cb87cff 100644 --- a/tox.ini +++ b/tox.ini @@ -50,6 +50,7 @@ deps = # tests requirements pytest>=2.6.0 pytest-twisted + testfixtures [testenv:py34] basepython = python3.4
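
A few sketches of the logging idioms this patch standardizes on, starting with the test-side pattern: with get_testlog() removed, tests capture output through testfixtures.LogCapture and assert against the captured records. A minimal standalone version of the idiom behind _assert_retried() and _assert_got_response_code() above; the 'myproject' logger name and the messages are invented for illustration::

    import logging

    from testfixtures import LogCapture


    def retry_twice():
        # stand-in for the downloader's retry logging; purely illustrative
        logger = logging.getLogger('myproject')
        logger.debug('Retrying request (failed 1 times)')
        logger.debug('Retrying request (failed 2 times)')
        logger.error('Gave up retrying request')


    with LogCapture() as l:
        retry_twice()

    # same style of assertions as _assert_retried() above
    assert str(l).count('Retrying') == 2
    assert str(l).count('Gave up retrying') == 1
    l.check(
        ('myproject', 'DEBUG', 'Retrying request (failed 1 times)'),
        ('myproject', 'DEBUG', 'Retrying request (failed 2 times)'),
        ('myproject', 'ERROR', 'Gave up retrying request'),
    )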
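
test_spider.py now pins down the spider-facing surface: each spider gets a logger named after the spider, records carry the spider instance as an extra, and the old Spider.log() is kept as a thin wrapper that forwards to that logger. A sketch of how spider code is expected to use it; the spider, callback body, and messages are invented::

    import logging

    import scrapy


    class ExampleSpider(scrapy.Spider):
        name = 'example.com'

        def parse(self, response):
            # emitted through logging.getLogger('example.com'); test_logger
            # above also checks that the record carries a 'spider' attribute
            self.logger.info('parsed %s', response.url)
            # legacy helper, forwarded to self.logger.log(level, message)
            self.log('still supported', logging.DEBUG)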
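
The new TopLevelFormatterTest covers the filter that reports records from child loggers under their top-level name. Roughly, reusing the LogCapture-plus-addFilter setup from that test; the 'scrapy' prefix and the child/unrelated logger names here are only example choices::

    import logging

    from testfixtures import LogCapture

    from scrapy.utils.log import TopLevelFormatter

    capture = LogCapture()
    capture.addFilter(TopLevelFormatter(['scrapy']))

    with capture as l:
        logging.getLogger('scrapy.core.engine').info('Spider opened')
        logging.getLogger('somelib').info('unrelated message')

    l.check(
        ('scrapy', 'INFO', 'Spider opened'),       # child logger reported as 'scrapy'
        ('somelib', 'INFO', 'unrelated message'),  # other loggers pass through untouched
    )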
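
LogCounterHandlerTest ties the same machinery back to crawler stats: records that reach the handler increment a log_count/<LEVEL> stat value. Restated outside the test class, mirroring its setUp; the logger name and messages are arbitrary::

    import logging

    from scrapy.utils.log import LogCounterHandler
    from scrapy.utils.test import get_crawler

    crawler = get_crawler(settings_dict={'LOG_LEVEL': 'WARNING'})

    logger = logging.getLogger('test')
    logger.setLevel(logging.NOTSET)
    logger.propagate = False
    logger.addHandler(LogCounterHandler(crawler))

    logger.error('something went wrong')   # counted
    logger.debug('chatty details')         # below the effective level, not counted

    assert crawler.stats.get_value('log_count/ERROR') == 1
    assert crawler.stats.get_value('log_count/DEBUG') is None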