Skip to content

Commit

Permalink
modified cli 4 spiderhost.[sh|py] & rm spiders.sh
Browse files Browse the repository at this point in the history
  • Loading branch information
simonsdave committed Jan 24, 2016
1 parent 17c48d2 commit 8307098
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 126 deletions.
19 changes: 12 additions & 7 deletions CHANGELOG.md
Expand Up @@ -3,18 +3,23 @@ All notable changes to this project will be documented in this file.
Format of this file follows [these](http://keepachangelog.com/) guidelines.
This project adheres to [Semantic Versioning](http://semver.org/).

## [Unreleased] - [unreleased]

### Added

- ...
## [0.6.0] - [2016-01-24]

### Changed

- colorama is now required to be at least version 0.3.5 instead of exactly 0.3.5

### Fixed
- ...
- command line args to bin/spiderhost.sh have been simplified - it now just
takes a spider name and spider args, just as you'd expect - no more url
encoding of args and no more ----- to indicate no spider args

- like the changes to bin/spiderhost.sh, bin/spiderhost.py now just accepts
regular command line arguments of a spider name and spider args - much easier

### Removed

- bin/spiders.sh is no longer needed - callers now access bin/spiders.py
directly rather than getting at bin/spiders.py through bin/spiders.sh

## [0.5.0] - [2015-05-10]

Expand Down
62 changes: 13 additions & 49 deletions bin/spiderhost.py
Expand Up @@ -4,69 +4,32 @@
import logging
import optparse
import time
import urlparse

from cloudfeaster.spider import SpiderCrawler


def parse_urlencoded_spider_args_option(option, opt, value):
if not value:
# since urlparse.parse_qs() fails on zero length string
return []
try:
parsed_value = urlparse.parse_qs(
value,
keep_blank_values=True,
strict_parsing=True)
return [parsed_value[str(i)][0] for i in range(0, len(parsed_value))]
except ValueError:
msg = "option %s: must be url encoded query string" % opt
raise optparse.OptionValueError(msg)


class CommandLineParserOption(optparse.Option):
new_types = (
"urlencoded_spider_args",
)
TYPES = optparse.Option.TYPES + new_types
TYPE_CHECKER = optparse.Option.TYPE_CHECKER.copy()
TYPE_CHECKER["urlencoded_spider_args"] = parse_urlencoded_spider_args_option


class CommandLineParser(optparse.OptionParser):

def __init__(self):
optparse.OptionParser.__init__(self)

description = (
"The Spider Host ..."
)
optparse.OptionParser.__init__(
self,
"usage: %prog [options]",
description=description,
option_class=CommandLineParserOption)

help = "spider - required"
self.add_option(
"--spider",
action="store",
dest="spider",
default=None,
type="string",
help=help)
"usage: %prog <spider> [<arg1> ... <argN>]",
description=description)

help = "args - required"
self.add_option(
"--args",
action="store",
dest="args",
default=[],
type="urlencoded_spider_args",
help=help)
self.spider = None
self.args = None

def parse_args(self, *args, **kwargs):
(clo, cla) = optparse.OptionParser.parse_args(self, *args, **kwargs)
if not clo.spider:
self.error("'--spider' is required")
if not cla:
self.error("spider is required")
self.spider = cla[0]
self.args = cla[1:]
return (clo, cla)


Expand All @@ -81,6 +44,7 @@ def parse_args(self, *args, **kwargs):
# configure logging ...
#
# remember gmt = utc
#
logging.Formatter.converter = time.gmtime
logging.basicConfig(
level=logging.INFO,
Expand All @@ -91,6 +55,6 @@ def parse_args(self, *args, **kwargs):
#
# Run the spider and dump results to stdout
#
spider_crawler = SpiderCrawler(clo.spider)
crawl_result = spider_crawler.crawl(*clo.args)
spider_crawler = SpiderCrawler(clp.spider)
crawl_result = spider_crawler.crawl(*clp.args)
print json.dumps(crawl_result)
33 changes: 5 additions & 28 deletions bin/spiderhost.sh
Expand Up @@ -3,31 +3,14 @@
# Execute this script to run a spider inside a docker container.
#
# This script is a wrapper around spiderhost.py that simply makes
# sure Xvfb is running before spiderhost.py executes and also wraps
# the interaction with etcd to store the results of running the
# spider.
# sure Xvfb is running before spiderhost.py executes if this script
# is being run on a Linux OS.

if [ $# -ne 4 ]; then
echo "usage: `basename $0` <spider output url> <spider> <args> <ttl>" >&2
if [ "$#" == 0 ]; then
echo "usage: `basename $0` <spider> <arg1> ... <argN>" >&2
exit 1
fi

SPIDER_OUTPUT_URL=$1
SPIDER=$2
#
# :TRICKY: The odd if statement below is here because when Fleet runs this
# script and args is a zero length string, Fleet seems to get confused and
# not supply the right number of arguments. So, the checking for ----- is
# there to ensure that Fleet is never put in the position where it has to
# pass an argument that's zero length.
#
if [ "-----" == "$3" ]; then
ARGS=""
else
ARGS=$3
fi
TTL=$4

if [ "Linux" == "$(uname -s)" ]; then
if [ "" == "$DISPLAY" ]; then
export DISPLAY=:99
Expand All @@ -37,15 +20,9 @@ if [ "Linux" == "$(uname -s)" ]; then
fi
fi

SPIDER_OUTPUT=$(mktemp 2> /dev/null || mktemp -t DAS)
spiderhost.py --spider="$SPIDER" --args="$ARGS" >& "$SPIDER_OUTPUT"
spiderhost.py ${@}
if [ "$?" != "0" ]; then
exit 2
fi

HTTP_STATUS_CODE=$(curl -s -L -o /dev/null -w "%{http_code}" -X PUT --data-urlencode value@$SPIDER_OUTPUT -d ttl=$TTL $SPIDER_OUTPUT_URL)
if [ "$?" != "0" ] || [ "$HTTP_STATUS_CODE" != "201" ]; then
exit 3
fi

exit 0
41 changes: 0 additions & 41 deletions bin/spiders.sh

This file was deleted.

1 change: 0 additions & 1 deletion setup.py
Expand Up @@ -45,7 +45,6 @@
"bin/spiderhost.py",
"bin/spiderhost.sh",
"bin/spiders.py",
"bin/spiders.sh",
],
install_requires=[
"colorama>=0.3.5",
Expand Down

0 comments on commit 8307098

Please sign in to comment.