Permalink
Browse files

scrapyd: added support for storing items by default

Items are stored the same way as logs, in jsonlines format.

Also renamed the logs_to_keep setting to jobs_to_keep, since it now controls retention of both.
  • Loading branch information...
1 parent 0693694 commit dbda33efa6ddfde96421724622f0ff0c15b02b57 @pablohoffman pablohoffman committed Jan 4, 2012
@@ -189,10 +189,16 @@ logs_dir
The directory where the Scrapy processes logs will be stored.
-logs_to_keep
+items_dir
+---------
+
+The directory where the Scrapy items will be stored.
+
+jobs_to_keep
------------
-The number of logs to keep per spider. Defaults to ``5``.
+The number of finished jobs to keep per spider. Defaults to ``5``. This
+includes logs and items.
runner
------
@@ -1,7 +1,8 @@
[scrapyd]
eggs_dir = eggs
logs_dir = logs
-logs_to_keep = 5
+items_dir = items
+jobs_to_keep = 5
dbs_dir = dbs
max_proc = 0
max_proc_per_cpu = 4
View
@@ -11,7 +11,8 @@ class Environment(object):
def __init__(self, config, initenv=os.environ):
self.dbs_dir = config.get('dbs_dir', 'dbs')
self.logs_dir = config.get('logs_dir', 'logs')
- self.logs_to_keep = config.getint('logs_to_keep', 5)
+ self.items_dir = config.get('items_dir', 'items')
+ self.jobs_to_keep = config.getint('jobs_to_keep', 5)
if config.cp.has_section('settings'):
self.settings = dict(config.cp.items('settings'))
else:
@@ -29,15 +30,17 @@ def get_environment(self, message, slot):
env['SCRAPY_SETTINGS_MODULE'] = self.settings[project]
dbpath = os.path.join(self.dbs_dir, '%s.db' % project)
env['SCRAPY_SQLITE_DB'] = dbpath
- env['SCRAPY_LOG_FILE'] = self._get_log_file(message)
+ env['SCRAPY_LOG_FILE'] = self._get_file(message, self.logs_dir, 'log')
+ env['SCRAPY_FEED_URI'] = self._get_file(message, self.items_dir, 'jl')
return env
- def _get_log_file(self, message):
- logsdir = os.path.join(self.logs_dir, message['_project'], \
+ def _get_file(self, message, dir, ext):
+ logsdir = os.path.join(dir, message['_project'], \
message['_spider'])
if not os.path.exists(logsdir):
os.makedirs(logsdir)
- to_delete = sorted((os.path.join(logsdir, x) for x in os.listdir(logsdir)), key=os.path.getmtime)[:-self.logs_to_keep]
+ to_delete = sorted((os.path.join(logsdir, x) for x in \
+ os.listdir(logsdir)), key=os.path.getmtime)[:-self.jobs_to_keep]
for x in to_delete:
os.remove(x)
- return os.path.join(logsdir, "%s.log" % message['_job'])
+ return os.path.join(logsdir, "%s.%s" % (message['_job'], ext))
View
@@ -68,6 +68,7 @@ def __init__(self, slot, project, spider, job, env):
self.end_time = None
self.env = env
self.logfile = env['SCRAPY_LOG_FILE']
+ self.itemsfile = env['SCRAPY_FEED_URI']
self.deferred = defer.Deferred()
def outReceived(self, data):
@@ -88,6 +89,6 @@ def processEnded(self, status):
self.deferred.callback(self)
def log(self, msg):
- msg += "project=%r spider=%r job=%r pid=%r log=%r" % (self.project, \
- self.spider, self.job, self.pid, self.logfile)
+ msg += "project=%r spider=%r job=%r pid=%r log=%r items=%r" % (self.project, \
+ self.spider, self.job, self.pid, self.logfile, self.itemsfile)
log.msg(msg, system="Launcher")
View
@@ -18,16 +18,18 @@ def _get_config():
conf = {
'eggs_dir': os.path.join(datadir, 'eggs'),
'logs_dir': os.path.join(datadir, 'logs'),
+ 'items_dir': os.path.join(datadir, 'items'),
'dbs_dir': os.path.join(datadir, 'dbs'),
}
- for k in ['eggs_dir', 'logs_dir', 'dbs_dir']: # create dirs
+ for k in ['eggs_dir', 'logs_dir', 'items_dir', 'dbs_dir']: # create dirs
d = conf[k]
if not os.path.exists(d):
os.makedirs(d)
scrapyd_conf = """
[scrapyd]
eggs_dir = %(eggs_dir)s
logs_dir = %(logs_dir)s
+items_dir = %(items_dir)s
dbs_dir = %(dbs_dir)s
""" % conf
return Config(extra_sources=[StringIO(scrapyd_conf)])
@@ -31,4 +31,5 @@ def test_get_environment_with_eggfile(self):
self.assertEqual(env['SCRAPY_JOB'], 'ID')
self.assert_(env['SCRAPY_SQLITE_DB'].endswith('mybot.db'))
self.assert_(env['SCRAPY_LOG_FILE'].endswith(os.path.join('mybot', 'myspider', 'ID.log')))
+ self.assert_(env['SCRAPY_FEED_URI'].endswith(os.path.join('mybot', 'myspider', 'ID.jl')))
self.failIf('SCRAPY_SETTINGS_MODULE' in env)
View
@@ -13,6 +13,7 @@ def __init__(self, config, app):
self.debug = config.getboolean('debug', False)
self.runner = config.get('runner')
logsdir = config.get('logs_dir')
+ itemsdir = config.get('items_dir')
self.app = app
self.putChild('', Home(self))
self.putChild('schedule.json', webservice.Schedule(self))
@@ -25,6 +26,7 @@ def __init__(self, config, app):
self.putChild('delversion.json', webservice.DeleteVersion(self))
self.putChild('listjobs.json', webservice.ListJobs(self))
self.putChild('logs', static.File(logsdir, 'text/plain'))
+ self.putChild('items', static.File(itemsdir, 'text/plain'))
self.putChild('jobs', Jobs(self))
self.update_projects()
@@ -68,6 +70,7 @@ def render_GET(self, txrequest):
<p>Available projects: <b>%(projects)s</b></p>
<ul>
<li><a href="/jobs">Jobs</a></li>
+<li><a href="/items/">Items</a></li>
<li><a href="/logs/">Logs</a></li>
<li><a href="http://doc.scrapy.org/en/latest/topics/scrapyd.html">Documentation</a></li>
</ul>
@@ -98,24 +101,25 @@ def render(self, txrequest):
s += "<h1>Jobs</h1>"
s += "<p><a href='..'>Go back</a></p>"
s += "<table border='1'>"
- s += "<th>Project</th><th>Spider</th><th>Job</th><th>PID</th><th>Runtime</th><th>Log</th>"
- s += "<tr><th colspan='6' style='background-color: #ddd'>Pending</th></tr>"
+ s += "<th>Project</th><th>Spider</th><th>Job</th><th>PID</th><th>Runtime</th><th>Log</th><th>Items</th>"
+ s += "<tr><th colspan='7' style='background-color: #ddd'>Pending</th></tr>"
for project, queue in self.root.poller.queues.items():
for m in queue.list():
s += "<tr>"
s += "<td>%s</td>" % project
s += "<td>%s</td>" % str(m['name'])
s += "<td>%s</td>" % str(m['_job'])
s += "</tr>"
- s += "<tr><th colspan='6' style='background-color: #ddd'>Running</th></tr>"
+ s += "<tr><th colspan='7' style='background-color: #ddd'>Running</th></tr>"
for p in self.root.launcher.processes.values():
s += "<tr>"
for a in ['project', 'spider', 'job', 'pid']:
s += "<td>%s</td>" % getattr(p, a)
s += "<td>%s</td>" % (datetime.now() - p.start_time)
s += "<td><a href='/logs/%s/%s/%s.log'>Log</a></td>" % (p.project, p.spider, p.job)
+ s += "<td><a href='/items/%s/%s/%s.jl'>Items</a></td>" % (p.project, p.spider, p.job)
s += "</tr>"
- s += "<tr><th colspan='6' style='background-color: #ddd'>Finished</th></tr>"
+ s += "<tr><th colspan='7' style='background-color: #ddd'>Finished</th></tr>"
for p in self.root.launcher.finished:
s += "<tr>"
for a in ['project', 'spider', 'job']:

0 comments on commit dbda33e

Please sign in to comment.