
Initial project structure with tox, travis, and initial commands (#2)

* init tests and project structure

* readme

* added spider loader

* get_project_root() raises an exception if not in project
1 parent d8c94d1 · commit bb94c458053a4392e3560a79157de8f01e10f9d3 · aron-bordin committed with Raul Gallegos on May 23, 2016
@@ -60,3 +60,6 @@ target/
#Ipython Notebook
.ipynb_checkpoints
+
+# IDEs
+.idea/
@@ -0,0 +1,14 @@
+language: python
+python: 3.5
+sudo: false
+env:
+ - TOXENV=py27
+ - TOXENV=py35
+install:
+ - pip install -U tox twine wheel codecov
+script: tox
+after_success:
+ - codecov
+cache:
+ directories:
+ - $HOME/.cache/pip
@@ -1 +1,9 @@
-# scrapy-streaming
+# Scrapy Streaming (WIP)
+
+[![Build Status](https://travis-ci.org/scrapy-plugins/scrapy-streaming.svg?branch=master)](https://travis-ci.org/scrapy-plugins/scrapy-streaming)
+[![codecov](https://codecov.io/gh/scrapy-plugins/scrapy-streaming/branch/master/graph/badge.svg)](https://codecov.io/gh/scrapy-plugins/scrapy-streaming)
+
+Scrapy Streaming provides an interface for writing Scrapy spiders in any programming language, using JSON objects to make requests, parse web content, extract data, and more.
+
+We also officially provide helper libraries for developing spiders in Java, JavaScript, and R.
+
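As an early sketch of how external spiders are declared in this commit, the format below is taken from the ``external.json`` fixture used in the tests further down; the field names match the ``ExternalSpider`` class, and ``args`` is optional:

```json
[
    {
        "name": "PythonSpider",
        "command": "scripts/dmoz.py"
    },
    {
        "name": "JavaSpider",
        "command": "java",
        "args": ["MyClass"]
    }
]
```

With such a file in the project root, `scrapy list` reports these names alongside the regular Scrapy spiders, and `scrapy streaming <path of executable>` is the entry point for running an external spider directly (still a stub in this commit).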
@@ -0,0 +1 @@
+scrapy
@@ -0,0 +1,8 @@
+from scrapy.commands.crawl import Command
+
+
+class CrawlCommand(Command):
+ """
+ Extends the Scrapy crawl command, adding the possibility to start an external spider through the crawl command
+ """
+ pass
@@ -0,0 +1,19 @@
+from scrapy.commands.list import Command
+
+from scrapy_streaming.external_spiderloader import ExternalSpiderLoader
+
+
+class ListCommand(Command):
+ """
+ Extends the Scrapy list command, adding external spiders to the list
+ """
+
+ def run(self, args, opts):
+ print('[Scrapy Spiders]')
+ super(ListCommand, self).run(args, opts)
+
+ spiders = [spider.name for spider in ExternalSpiderLoader.from_settings(self.settings).list()]
+ if spiders:
+ print('[External Spiders]')
+ for spider in sorted(spiders):
+ print(spider)
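Given the ``external.json`` sketched above, the intended output of ``scrapy list`` would look roughly like this (the ``[Scrapy Spiders]`` section comes from the inherited command, the rest from the loop above; the ``...`` stands for the project's own spiders):

```
[Scrapy Spiders]
...
[External Spiders]
JavaSpider
PythonSpider
```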
@@ -0,0 +1,27 @@
+import os
+
+from scrapy.commands import ScrapyCommand
+from scrapy.exceptions import UsageError
+
+
+class StreamingCommand(ScrapyCommand):
+ """
+ Command to start stand-alone external spider executables with Scrapy Streaming
+ """
+
+ requires_project = False
+
+ def syntax(self):
+ return "[options] <path of executable>"
+
+ def short_desc(self):
+ return "Run a external spider using Scrapy Streaming given its path (doesn't require a project)"
+
+ def run(self, args, opts):
+ if len(args) != 1:
+ raise UsageError()
+ filename = args[0]
+ if not os.path.exists(filename):
+ raise UsageError("File not found: %s\n" % filename)
+
+ raise NotImplementedError()
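So in this commit ``scrapy streaming scripts/dmoz.py`` only checks that exactly one argument was given and that the file exists; the actual handling of the external process is left as ``NotImplementedError`` for a later change.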
@@ -0,0 +1,71 @@
+import json
+import os
+
+from scrapy_streaming.utils import get_project_root
+
+
+class ExternalSpider(object):
+ """
+ Object representing an external spider defined in ``external.json``.
+ """
+
+ def __init__(self, name, command, args=None):
+ if args is not None and not isinstance(args, list):
+ raise ValueError("'args' must be defined as an array of strings")
+ self.name = name
+ self.command = command
+ self.args = args
+
+ @classmethod
+ def from_dict(cls, spider):
+ return cls(**spider)
+
+
+class ExternalSpiderLoader(object):
+ """
+ This class manages the external spiders defined in the ``external.json`` file
+ """
+
+ def __init__(self, settings):
+ path = settings.get('EXTERNAL_SPIDERS_PATH', get_project_root())
+ # TODO add EXTERNAL_SPIDERS_PATH in docs
+ path = os.path.abspath(path)
+ self.external = os.path.join(path, 'external.json')
+ self._spiders = {}
+ self._fetch_spiders()
+
+ @classmethod
+ def from_settings(cls, settings):
+ return cls(settings)
+
+ def _fetch_spiders(self):
+ """
+ Loads the content of the ``external.json`` file and generates a mapping of external spiders.
+ Raises an exception if the file cannot be found (see ``_read_json``) and a
+ ValueError if it does not contain valid JSON.
+ """
+ for spider in _read_json(self.external):
+ if not isinstance(spider, dict):
+ raise ValueError('External spiders must be defined as json objects.'
+ ' Read the docs for more information')
+
+ external_spider = ExternalSpider.from_dict(spider)
+ self._spiders[external_spider.name] = external_spider
+ return self._spiders
+
+ def list(self):
+ """
+ Returns a list with instances of the loaded spiders (ExternalSpider objects)
+ """
+ return list(self._spiders.values())
+
+
+def _read_json(path):
+ """
+ Parses the JSON file at the given path. Raises an exception if the file doesn't exist.
+ """
+ if os.path.isfile(path):
+ return json.loads(open(path).read())
+ else:
+ raise Exception('Could not find the "%s" file. Please check that it is in your project '
+ 'root or in the path defined by the EXTERNAL_SPIDERS_PATH setting.' % path)
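A minimal sketch of using the loader programmatically, assuming the working directory is inside a Scrapy project whose root contains ``external.json``; a plain dict is enough here because the constructor only calls ``settings.get()``, which is also how the tests below build it:

```python
from scrapy_streaming.external_spiderloader import ExternalSpiderLoader

# {} works in place of a Settings object because only .get() is used
loader = ExternalSpiderLoader.from_settings({})
for spider in loader.list():
    print(spider.name, spider.command, spider.args)
```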
@@ -0,0 +1,15 @@
+import os
+
+from scrapy.utils.conf import closest_scrapy_cfg
+from scrapy.utils.project import inside_project
+
+
+def get_project_root():
+ """
+ Returns the absolute path of the project root, and raises an exception
+ if the current directory is not inside a Scrapy project
+ """
+ if inside_project():
+ return os.path.dirname(closest_scrapy_cfg())
+ raise Exception("%s does not belong to a Scrapy project" % os.getcwd())
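For illustration, a hypothetical session with this helper, matching the behaviour exercised in ``UtilsTest`` below:

```python
from scrapy_streaming.utils import get_project_root

try:
    # returns the directory containing scrapy.cfg when run inside a project
    print(get_project_root())
except Exception as exc:
    # raised when the current directory is not part of a Scrapy project
    print('not in a project:', exc)
```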
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+from setuptools import setup
+
+setup(
+ name='scrapy-streaming',
+ version='0.1',
+ url='https://github.com/scrapy-plugins/scrapy-streaming',
+ description='Develop Spiders using any Programming Language',
+ author='Scrapy developers',
+ packages=['scrapy_streaming'],
+ requires=['scrapy'],
+
+ entry_points={
+ 'scrapy.commands': [
+ 'streaming=scrapy_streaming.commands.streaming:StreamingCommand',
+ 'list=scrapy_streaming.commands.list:ListCommand',
+ 'crawl=scrapy_streaming.commands.crawl:CrawlCommand'
+ ],
+ },
+)
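The ``scrapy.commands`` entry points are what make Scrapy pick up the new ``streaming`` command (and the overridden ``list`` and ``crawl``) once the package is installed. A quick, hedged way to inspect the registration after ``pip install -e .``, using ``pkg_resources`` from setuptools:

```python
import pkg_resources

# list every command exposed to Scrapy through the scrapy.commands group;
# with this package installed it should include streaming, list and crawl
for entry_point in pkg_resources.iter_entry_points('scrapy.commands'):
    print(entry_point.name, '->', entry_point.module_name)
```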
@@ -0,0 +1,85 @@
+import os
+import subprocess
+import tempfile
+from tempfile import mkdtemp
+
+from os.path import join
+
+import sys
+from time import sleep
+
+from scrapy.utils.python import to_native_str
+from scrapy.utils.test import get_testenv
+from shutil import rmtree
+from twisted.trial import unittest
+
+
+class ProjectTest(unittest.TestCase):
+ project_name = 'testproject'
+
+ def setUp(self):
+ self.temp_path = mkdtemp()
+ self.cwd = self.temp_path
+ self.proj_path = join(self.temp_path, self.project_name)
+ self.proj_mod_path = join(self.proj_path, self.project_name)
+ self.env = get_testenv()
+
+ self.call('startproject', self.project_name)
+ self.cwd = join(self.temp_path, self.project_name)
+ os.chdir(self.cwd)
+ self.env['SCRAPY_SETTINGS_MODULE'] = '%s.settings' % self.project_name
+ self.external_path = join(self.cwd, 'external.json')
+ with open(self.external_path, 'w') as external:
+ external.write('''
+[
+ {
+ "name": "PythonSpider",
+ "command": "scripts/dmoz.py"
+ },
+
+ {
+ "name": "JavaSpider",
+ "command": "java",
+ "args": ["MyClass"]
+ }
+]
+''')
+
+ def tearDown(self):
+ rmtree(self.temp_path)
+
+ def call(self, *new_args, **kwargs):
+ with tempfile.NamedTemporaryFile() as out:
+ args = (sys.executable, '-m', 'scrapy.cmdline') + new_args
+ return subprocess.call(args, stdout=out, stderr=out, cwd=self.cwd,
+ env=self.env, **kwargs)
+
+ def proc(self, *new_args, **kwargs):
+ args = (sys.executable, '-m', 'scrapy.cmdline') + new_args
+ p = subprocess.Popen(args, cwd=self.cwd, env=self.env,
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+ **kwargs)
+
+ waited = 0
+ interval = 0.2
+ while p.poll() is None:
+ sleep(interval)
+ waited += interval
+ if waited > 15:
+ p.kill()
+ assert False, 'Command took too much time to complete'
+
+ return p
+
+
+class ListCommandTest(ProjectTest):
+
+ def test_list_is_running(self):
+ self.assertEqual(0, self.call('list'))
+
+ def test_external_spiders(self):
+ p = self.proc('list')
+ out = to_native_str(p.stdout.read())
+
+ self.assertIn("JavaSpider", out)
+ self.assertIn("PythonSpider", out)
@@ -0,0 +1,41 @@
+from tests.test_commands import ProjectTest
+from twisted.trial import unittest
+
+from scrapy_streaming.external_spiderloader import ExternalSpider, ExternalSpiderLoader, _read_json
+
+
+class ExternalSpiderTest(unittest.TestCase):
+
+ def test_wrong_arg_type(self):
+ params = {'name': 'Name', 'command': 'python', 'args': {'a': 'b'}}
+ self.assertRaises(ValueError, ExternalSpider.from_dict, params)
+
+
+class ExternalSpiderLoaderTest(ProjectTest):
+
+ def test_list(self):
+ e = ExternalSpiderLoader({})
+
+ self.assertEqual(2, len(e.list()))
+
+ def test_invalid_json(self):
+ open(self.external_path, 'w').write('''
+[
+ {
+ "name": "PythonSpider",
+ "command": "scripts/dmoz.py"
+ },
+''')
+ self.assertRaises(ValueError, ExternalSpiderLoader.from_settings, {})
+
+ def test_invalid_json_content(self):
+ open(self.external_path, 'w').write('''
+{
+ "name": "PythonSpider",
+ "command": "scripts/dmoz.py"
+}
+''')
+ self.assertRaises(ValueError, ExternalSpiderLoader.from_settings, {})
+
+ def test_invalid_file(self):
+ self.assertRaises(Exception, _read_json, '/home')
@@ -0,0 +1,16 @@
+import os
+
+from tests.test_commands import ProjectTest
+
+
+from scrapy_streaming.utils import get_project_root
+
+
+class UtilsTest(ProjectTest):
+
+ def test_get_project(self):
+ self.assertEqual(get_project_root(), self.cwd)
+
+ def test_get_project_default(self):
+ os.chdir('../')
+ self.assertRaises(Exception, get_project_root)
@@ -0,0 +1,27 @@
+# Tox (http://tox.testrun.org/) is a tool for running tests
+# in multiple virtualenvs. This configuration file will run the
+# test suite on all supported python versions. To use it, "pip install tox"
+# and then run "tox" from this directory.
+
+[tox]
+envlist = py27,py35
+
+[testenv]
+deps =
+ -rrequirements.txt
+ pytest
+ pytest-cov
+ hypothesis
+ hypothesis-pytest
+commands =
+ pip install -e .
+ py.test --doctest-modules --cov=scrapy_streaming {posargs:scrapy_streaming tests}
+
+[testenv:py33]
+basepython = python3.3
+
+[testenv:py34]
+basepython = python3.4
+
+[testenv:py35]
+basepython = python3.5
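Locally, ``pip install tox`` followed by ``tox`` runs the whole matrix, while ``tox -e py27`` or ``tox -e py35`` restricts the run to a single interpreter; the Travis configuration above drives the same environments through the ``TOXENV`` variable.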
