From 8307098eb42341a6772e847c253826df80d780a6 Mon Sep 17 00:00:00 2001 From: Dave Simons Date: Sun, 24 Jan 2016 13:16:59 -0500 Subject: [PATCH] modified cli 4 spiderhost.[sh|py] & rm spiders.sh --- CHANGELOG.md | 19 +++++++++------ bin/spiderhost.py | 62 ++++++++++------------------------------------- bin/spiderhost.sh | 33 ++++--------------------- bin/spiders.sh | 41 ------------------------------- setup.py | 1 - 5 files changed, 30 insertions(+), 126 deletions(-) delete mode 100755 bin/spiders.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index bda6dde..c9125dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,18 +3,23 @@ All notable changes to this project will be documented in this file. Format of this file follows [these](http://keepachangelog.com/) guidelines. This project adheres to [Semantic Versioning](http://semver.org/). -## [Unreleased] - [unreleased] - -### Added - -- ... +## [0.6.0] - [2016-01-24] ### Changed - colorama now req'd to be @ least version 0.3.5 instead of only 0.3.5 -### Fixed -- ... 
+- command line args to bin/spiderhost.sh have been simplified - now just + take spider name and spider args just as you'd expect - no more url + encoding of args and ----- indicating no spider args + +- like the changes to bin/spiderhost.sh, bin/spiderhost.py now just accepts + regular command line arguments of a spider name and spider args - much easier + +### Removed + +- bin/spiders.sh is no longer needed - callers now access bin/spiders.py + directly rather than getting at bin/spiders.py through bin/spiders.sh ## [0.5.0] - [2015-05-10] diff --git a/bin/spiderhost.py b/bin/spiderhost.py index 952199e..bd6ec38 100755 --- a/bin/spiderhost.py +++ b/bin/spiderhost.py @@ -4,69 +4,32 @@ import logging import optparse import time -import urlparse from cloudfeaster.spider import SpiderCrawler -def parse_urlencoded_spider_args_option(option, opt, value): - if not value: - # since urlparse.parse_qs() fails on zero length string - return [] - try: - parsed_value = urlparse.parse_qs( - value, - keep_blank_values=True, - strict_parsing=True) - return [parsed_value[str(i)][0] for i in range(0, len(parsed_value))] - except ValueError: - msg = "option %s: must be url encoded query string" % opt - raise optparse.OptionValueError(msg) - - -class CommandLineParserOption(optparse.Option): - new_types = ( - "urlencoded_spider_args", - ) - TYPES = optparse.Option.TYPES + new_types - TYPE_CHECKER = optparse.Option.TYPE_CHECKER.copy() - TYPE_CHECKER["urlencoded_spider_args"] = parse_urlencoded_spider_args_option - - class CommandLineParser(optparse.OptionParser): def __init__(self): + optparse.OptionParser.__init__(self) + description = ( "The Spider Host ..." ) optparse.OptionParser.__init__( self, - "usage: %prog [options]", - description=description, - option_class=CommandLineParserOption) - - help = "spider - required" - self.add_option( - "--spider", - action="store", - dest="spider", - default=None, - type="string", - help=help) + "usage: %prog [ ... 
]", + description=description) - help = "args - required" - self.add_option( - "--args", - action="store", - dest="args", - default=[], - type="urlencoded_spider_args", - help=help) + self.spider = None + self.args = None def parse_args(self, *args, **kwargs): (clo, cla) = optparse.OptionParser.parse_args(self, *args, **kwargs) - if not clo.spider: - self.error("'--spider' is required") + if not cla: + self.error("spider is required") + self.spider = cla[0] + self.args = cla[1:] return (clo, cla) @@ -81,6 +44,7 @@ def parse_args(self, *args, **kwargs): # configure logging ... # # remember gmt = utc + # logging.Formatter.converter = time.gmtime logging.basicConfig( level=logging.INFO, @@ -91,6 +55,6 @@ def parse_args(self, *args, **kwargs): # # Run the spider and dump results to stdout # - spider_crawler = SpiderCrawler(clo.spider) - crawl_result = spider_crawler.crawl(*clo.args) + spider_crawler = SpiderCrawler(clp.spider) + crawl_result = spider_crawler.crawl(*clp.args) print json.dumps(crawl_result) diff --git a/bin/spiderhost.sh b/bin/spiderhost.sh index f7ac084..65b91e6 100755 --- a/bin/spiderhost.sh +++ b/bin/spiderhost.sh @@ -3,31 +3,14 @@ # Execute this script to run a spider inside a docker container. # # This script is a wrapper around spiderhost.py that simply makes -# sure Xvfb is running before spiderhost.py executes and also wraps -# the interaction with etcd to store the results of running the -# spider. +# sure Xvfb is running before spiderhost.py executes if this script +# is being run on a linux OS. -if [ $# -ne 4 ]; then - echo "usage: `basename $0` " >&2 +if [ "$#" == 0 ]; then - echo "usage: `basename $0` ... " >&2 exit 1 fi -SPIDER_OUTPUT_URL=$1 -SPIDER=$2 -# -# :TRICKY: The odd if statement below is here because when Fleet runs this -# script and args is a zero length string, Fleet seems to get confused and -# not supply the right number of arguments. 
So, the checking for ----- is -# there to ensure that Fleet is never put in the position where it has to -# pass an argument that's zero length. -# -if [ "-----" == "$3" ]; then - ARGS="" -else - ARGS=$3 -fi -TTL=$4 - if [ "Linux" == "$(uname -s)" ]; then if [ "" == "$DISPLAY" ]; then export DISPLAY=:99 @@ -37,15 +20,9 @@ if [ "Linux" == "$(uname -s)" ]; then fi fi -SPIDER_OUTPUT=$(mktemp 2> /dev/null || mktemp -t DAS) -spiderhost.py --spider="$SPIDER" --args="$ARGS" >& "$SPIDER_OUTPUT" +spiderhost.py ${@} if [ "$?" != "0" ]; then exit 2 fi -HTTP_STATUS_CODE=$(curl -s -L -o /dev/null -w "%{http_code}" -X PUT --data-urlencode value@$SPIDER_OUTPUT -d ttl=$TTL $SPIDER_OUTPUT_URL) -if [ "$?" != "0" ] || [ "$HTTP_STATUS_CODE" != "201" ]; then - exit 3 -fi - exit 0 diff --git a/bin/spiders.sh b/bin/spiders.sh deleted file mode 100755 index 0499b18..0000000 --- a/bin/spiders.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env bash - -# Execute this script to discover all the spiders that are -# available to run in a docker image. -# -# This script is a wrapper around spiders.py that simply seperates -# the responsibility of spider discovery (spiders.py) from saving -# the discovered spider results (spiders.sh). -# -# Assuming etcd is running, the follow will verify this script -# is working correctly: -# -# curl -s -L -o /dev/null -w "%{http_code}\n" http://127.0.0.1:4001/v2/keys/0456d10966d742eca749fc9226b77456 -# <<>> -# spiders.sh http://127.0.0.1:4001/v2/keys/0456d10966d742eca749fc9226b77456 20 -# <<>> -# curl -s -L -o /dev/null -w "%{http_code}\n" http://127.0.0.1:4001/v2/keys/0456d10966d742eca749fc9226b77456 -# <<>> -# curl -s -L http://127.0.0.1:4001/v2/keys/0456d10966d742eca749fc9226b77456 -# <<< should return {} or more if spider modules are installed>>> - -if [ $# -ne 2 ]; then - echo "usage: `basename $0` " >&2 - exit 1 -fi - -OUTPUT_URL=$1 -TTL=$2 - -OUTPUT=$(mktemp 2> /dev/null || mktemp -t DAS) -spiders.py >& "$OUTPUT" -if [ "$?" 
!= "0" ]; then - exit 1 -fi - -HTTP_STATUS_CODE=$(curl -s -L -o /dev/null -w "%{http_code}" -X PUT --data-urlencode value@$OUTPUT -d ttl=$TTL $OUTPUT_URL) -if [ "$?" != "0" ] || [ "$HTTP_STATUS_CODE" != "201" ]; then - exit 2 -fi - -exit 0 diff --git a/setup.py b/setup.py index 9e4fa73..4e529fe 100644 --- a/setup.py +++ b/setup.py @@ -45,7 +45,6 @@ "bin/spiderhost.py", "bin/spiderhost.sh", "bin/spiders.py", - "bin/spiders.sh", ], install_requires=[ "colorama>=0.3.5",