@@ -0,0 +1,59 @@
#!/usr/bin/python

from time import sleep
import requests
from bs4 import BeautifulSoup
import os
from math import ceil
import re
import codecs

# --- Scrape the list of tags from the bandcamp tag cloud ----------------
# Human-readable tag names ("hip hop/rap") must be converted to the
# machine form the discover API expects ("hip-hop-rap").
# http://stackoverflow.com/questions/6116978/python-replace-multiple-strings
REPLACEMENTS = {" & ": "-", "&": "-", " ": "-", "/": "-"}
# Compile once; this is loop-invariant (the original rebuilt it per tag).
# Works on both Python 2 and 3 (the original used py2-only iteritems()).
TAG_PATTERN = re.compile("|".join(re.escape(k) for k in REPLACEMENTS))

tagList = []
tagPage = requests.get("http://bandcamp.com/tags")
soup = BeautifulSoup(tagPage.text, "lxml")
for item in soup.select("#tags_cloud > a"):
    human_tag = item.get_text()
    if human_tag == "":
        continue
    machine_tag = TAG_PATTERN.sub(lambda m: REPLACEMENTS[m.group(0)], human_tag)
    # BUGFIX: the original compared the *human* name against a list of
    # *machine* names, so duplicates were never actually filtered.
    if machine_tag in tagList:
        continue
    tagList.append(machine_tag)
    try:
        os.mkdir(machine_tag)  # directory to save json pages for this tag
        with codecs.open(machine_tag + "/" + "human.txt", 'w+', "utf-8-sig") as f:
            f.write(human_tag)
    except Exception as e:
        # mkdir fails if the directory already exists; report and move on
        print(e)

# --- Download every discover page for every tag --------------------------
for index, tag in enumerate(tagList):
    print(tag + " " + str(index + 1) + "/" + str(len(tagList)))

    # A fresh Response has status_code None, so the loop always runs once;
    # retry until the API answers 200.
    req = requests.Response()
    while (req.status_code != 200):
        req = requests.get(
            "http://bandcamp.com/api/discover/3/get_web?t=" +
            tag + "&s=top&p=0")
    jsonObj = req.json()

    # each page contains at most 48 entries:
    # find total num entries in page 0 and calculate num pages
    totalCount = jsonObj.get("total_count")
    print(totalCount)
    totalPages = int(ceil(totalCount / 48.0))

    # iterate through pages, write raw JSON responses to disk as tag/<page>
    for page in range(0, totalPages):
        with open(tag + '/' + str(page), 'w+') as f:
            req = requests.Response()
            while (req.status_code != 200):  # same retry-until-200 pattern
                req = requests.get(
                    "http://bandcamp.com/api/discover/3/get_web?t=" +
                    tag + "&s=top&p=" + str(page))
            # NOTE(review): writing encoded bytes to a text-mode file is a
            # Python 2 idiom (shebang suggests py2); on py3 drop .encode().
            f.write(req.text.encode("utf8"))
            print(str(page + 1) + "/" + str(totalPages) + " " +
                  str(req.status_code) + " " + str(len(req.text)))
    print(tag + " complete.")
@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item


class ArbitraryItem(Item):
    """Scrapy Item subclass that accepts arbitrary field names.

    A stock Item only allows keys declared as Field() on the class;
    this override registers unknown keys on the fly (with empty field
    metadata) so raw API responses can be stored without declaring
    every key in advance.
    """

    def __setitem__(self, key, value):
        # Store the value and lazily declare the field's metadata dict.
        self._values[key] = value
        self.fields[key] = {}

# class BandcampTag(scrapy.Item):
# machine_name = scrapy.Field()
# human_name = scrapy.Field()
# albums = scrapy.Field()
# total_albums = scrapy.Field()
#
#
#class BandcampAlbum(scrapy.Item):
# field = scrapy.Field()
@@ -0,0 +1,117 @@
# Copyright 2014 Michael Malocha <michael@knockrentals.com>
#
# Expanded from the work by Julien Duponchelle <julien@duponchelle.info>.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Elastic Search Pipeline for scrappy expanded with support for multiple items"""

from datetime import datetime
from elasticsearch import Elasticsearch, helpers
import logging
import hashlib
import types

from transportNTLM import TransportNTLM


class InvalidSettingsException(Exception):
    """Raised when a required ELASTICSEARCH_* setting is missing from settings.py."""
    pass

class ElasticSearchPipeline(object):
    """Scrapy item pipeline that buffers items and bulk-indexes them into
    Elasticsearch.

    Actions accumulate in ``items_buffer`` and are flushed with
    ``elasticsearch.helpers.bulk`` once the buffer reaches
    ELASTICSEARCH_BUFFER_LENGTH (default 500), and again when the spider
    closes.
    """

    settings = None  # crawler settings, populated in from_crawler()
    es = None        # Elasticsearch client, populated in from_crawler()
    items_buffer = []

    @classmethod
    def validate_settings(cls, settings):
        """Raise InvalidSettingsException if any required setting is unset."""
        def validate_setting(setting_key):
            if settings[setting_key] is None:
                raise InvalidSettingsException('%s is not defined in settings.py' % setting_key)

        required_settings = {'ELASTICSEARCH_SERVERS', 'ELASTICSEARCH_INDEX', 'ELASTICSEARCH_TYPE'}

        for required_setting in required_settings:
            validate_setting(required_setting)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from crawler settings and connect to Elasticsearch.

        Uses the NTLM transport when ELASTICSEARCH_AUTH == 'NTLM',
        otherwise a plain client.
        """
        ext = cls()
        ext.settings = crawler.settings

        cls.validate_settings(ext.settings)

        es_servers = ext.settings['ELASTICSEARCH_SERVERS']
        es_servers = es_servers if isinstance(es_servers, list) else [es_servers]

        auth_type = ext.settings['ELASTICSEARCH_AUTH']
        if auth_type == 'NTLM':
            ext.es = Elasticsearch(hosts=es_servers,
                                   transport_class=TransportNTLM,
                                   ntlm_user=ext.settings['ELASTICSEARCH_USERNAME'],
                                   ntlm_pass=ext.settings['ELASTICSEARCH_PASSWORD'],
                                   timeout=ext.settings.get('ELASTICSEARCH_TIMEOUT', 60))
        else:
            ext.es = Elasticsearch(hosts=es_servers,
                                   timeout=ext.settings.get('ELASTICSEARCH_TIMEOUT', 60))
        return ext

    def get_unique_key(self, unique_key):
        """Return the unique-key value as a scalar.

        Accepts a bare str/bytes or a list (Scrapy's extract() often yields
        lists), in which case the first element is used.

        Raises Exception if the resulting value is not str/bytes.
        """
        if isinstance(unique_key, list):
            unique_key = unique_key[0]
        if not isinstance(unique_key, (str, bytes)):
            raise Exception('unique key must be str')

        return unique_key

    def index_item(self, item):
        """Convert *item* into a bulk index action and buffer it."""
        index_name = self.settings['ELASTICSEARCH_INDEX']
        index_suffix_format = self.settings.get('ELASTICSEARCH_INDEX_DATE_FORMAT', None)

        if index_suffix_format:
            # e.g. "scrapy2-2016.01.31" for time-partitioned indices
            index_name += "-" + datetime.strftime(datetime.now(), index_suffix_format)

        index_action = {
            '_index': index_name,
            '_type': self.settings['ELASTICSEARCH_TYPE'],
            '_source': dict(item)
        }

        if self.settings['ELASTICSEARCH_UNIQ_KEY'] is not None:
            item_unique_key = item[self.settings['ELASTICSEARCH_UNIQ_KEY']]
            unique_key = self.get_unique_key(item_unique_key)
            if not isinstance(unique_key, bytes):
                # BUGFIX: hashlib.sha1 requires bytes on Python 3;
                # text keys are UTF-8 encoded first (no-op on Python 2 str).
                unique_key = unique_key.encode('utf-8')
            item_id = hashlib.sha1(unique_key).hexdigest()
            index_action['_id'] = item_id
            logging.debug('Generated unique key %s' % item_id)

        self.items_buffer.append(index_action)

        # '>=' (was '==') so the buffer can never silently grow past the
        # threshold if the configured length changes between calls.
        if len(self.items_buffer) >= self.settings.get('ELASTICSEARCH_BUFFER_LENGTH', 500):
            self.send_items()
            self.items_buffer = []

    def send_items(self):
        """Flush the buffered actions to Elasticsearch in one bulk call."""
        helpers.bulk(self.es, self.items_buffer)

    def process_item(self, item, spider):
        """Index one item, or recurse over a generator/list of items."""
        # BUGFIX: types.ListType exists only on Python 2; a plain `list`
        # isinstance check works on both Python 2 and 3.
        if isinstance(item, (types.GeneratorType, list)):
            for each in item:
                self.process_item(each, spider)
        else:
            self.index_item(item)
            logging.debug('Item sent to Elastic Search %s' % self.settings['ELASTICSEARCH_INDEX'])
        return item

    def close_spider(self, spider):
        """Flush any remaining buffered items when the spider finishes."""
        if len(self.items_buffer):
            self.send_items()
@@ -0,0 +1,100 @@
# -*- coding: utf-8 -*-

# Scrapy settings for scrapytest project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapytest'

SPIDER_MODULES = ['scrapytest.spiders']
NEWSPIDER_MODULE = 'scrapytest.spiders'

# Route every scraped item through the Elasticsearch pipeline (order 1 =
# runs before any other pipeline).
ITEM_PIPELINES = {
    'scrapytest.pipelines.ElasticSearchPipeline': 1
}

# Elasticsearch pipeline configuration: target cluster, index and doc type
# to write to, and the item field used to derive a stable document _id.
ELASTICSEARCH_SERVERS = ['localhost:9200']
ELASTICSEARCH_INDEX = 'scrapy2'
ELASTICSEARCH_TYPE = 'items'
ELASTICSEARCH_UNIQ_KEY = 'id'

LOG_ENABLED = True

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapytest (+http://www.yourdomain.com)'

# Obey robots.txt rules
# robots.txt deliberately ignored; this crawl only hits the discover API
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'scrapytest.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'scrapytest.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'scrapytest.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay (start at full speed)
AUTOTHROTTLE_START_DELAY = 0.0
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60.0
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 20.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = True

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
@@ -0,0 +1,42 @@
import scrapy
import json
from math import ceil

from scrapytest.items import ArbitraryItem

api_url_format = "http://151.101.65.28/api/discover/3/get_web?t=%s&s=top&p=%s"
# bandcamp.com ip address to save dns lookups

class BandcampSpider(scrapy.Spider):
    """Crawl bandcamp's tag cloud and yield one item per album from the
    discover JSON API, tagged with its machine and human tag names."""

    name = "bandcamp"
    allowed_domains = ['bandcamp.com', '151.101.65.28']
    start_urls = [
        "http://bandcamp.com/tags"
    ]

    def parse(self, response):
        """Extract every tag from the tag cloud and request its page 0."""
        for link in response.css("#tags_cloud > a"):
            # the href is "/tag/<machine-name>"
            machine_tag_name = link.css("::attr('href')").extract()[0].replace("/tag/", "")
            human_tag_name = link.xpath("text()").extract()
            url = api_url_format % (machine_tag_name, 0)
            meta = {"machine": machine_tag_name, "human": human_tag_name}
            yield scrapy.Request(url, meta=meta, callback=self.parse_tag)

    def parse_tag(self, response):
        """Parse page 0 of a tag, then request the remaining pages."""
        machine_tag_name = response.meta["machine"]
        json_obj = json.loads(response.body_as_unicode())
        total_count = json_obj.get("total_count")
        # each API page holds at most 48 entries
        num_pages = int(ceil(total_count / 48.0))
        # BUGFIX: page 0 was already fetched by this request; re-requesting
        # the same URL would be dropped by Scrapy's duplicate filter and its
        # items lost. Parse this response directly and only request pages 1+.
        for album in self.parse_album(response):
            yield album
        for i in range(1, num_pages):
            url = api_url_format % (machine_tag_name, i)
            yield scrapy.Request(url, meta=response.meta, callback=self.parse_album)

    def parse_album(self, response):
        """Yield one ArbitraryItem per album in a discover API response."""
        json_obj = json.loads(response.body_as_unicode())
        items = json_obj.get('items')
        for item in items:
            album = ArbitraryItem(item)
            album['id'] = str(album['id'])
            album['machine_tag_name'] = response.meta['machine']
            album['human_tag_name'] = response.meta['human']
            yield album
@@ -0,0 +1,358 @@
import time
from itertools import chain

from elasticsearch.connection import RequestsHttpConnection
from elasticsearch.connection_pool import ConnectionPool, DummyConnectionPool
from elasticsearch.serializer import JSONSerializer, Deserializer, DEFAULT_SERIALIZERS
from elasticsearch.exceptions import ConnectionError, TransportError, SerializationError, \
ConnectionTimeout, ImproperlyConfigured
import requests
from requests_ntlm import HttpNtlmAuth

def get_host_info(node_info, host):
    """Default node filter for sniffing.

    Given one node's info from `/_cluster/nodes` and the (host, port)
    connection information parsed from it, return the connection
    information to use, or `None` to skip the node. Master-only nodes
    (no data, no client role) are skipped since they shouldn't normally
    serve API operations.

    :arg node_info: node information from `/_cluster/nodes`
    :arg host: connection information (host, port) extracted from the node info
    """
    attributes = node_info.get('attributes', {})

    master_only = (
        attributes.get('data', 'true') == 'false'
        and attributes.get('client', 'false') == 'false'
        and attributes.get('master', 'true') == 'true'
    )
    return None if master_only else host

class TransportNTLM(object):
    """
    Encapsulation of transport-related logic. Handles instantiation of the
    individual connections as well as creating a connection pool to hold them.
    Main interface is the `perform_request` method.

    Variant of elasticsearch-py's Transport whose only behavioural change
    is in `set_connections`: every connection is created with NTLM
    authentication (requests_ntlm.HttpNtlmAuth) built from the
    ``ntlm_user``/``ntlm_pass`` keyword arguments.
    """
    def __init__(self, hosts, connection_class=RequestsHttpConnection,
                 connection_pool_class=ConnectionPool, host_info_callback=get_host_info,
                 sniff_on_start=False, sniffer_timeout=None, sniff_timeout=.1,
                 sniff_on_connection_fail=False, serializer=JSONSerializer(), serializers=None,
                 default_mimetype='application/json', max_retries=3, retry_on_status=(503, 504, ),
                 retry_on_timeout=False, send_get_body_as='GET', **kwargs):
        """
        :arg hosts: list of dictionaries, each containing keyword arguments to
            create a `connection_class` instance
        :arg connection_class: subclass of :class:`~elasticsearch.Connection` to use
        :arg connection_pool_class: subclass of :class:`~elasticsearch.ConnectionPool` to use
        :arg host_info_callback: callback responsible for taking the node information from
            `/_cluster/nodes`, along with already extracted information, and
            producing a list of arguments (same as `hosts` parameter)
        :arg sniff_on_start: flag indicating whether to obtain a list of nodes
            from the cluster at startup time
        :arg sniffer_timeout: number of seconds between automatic sniffs
        :arg sniff_on_connection_fail: flag controlling if connection failure triggers a sniff
        :arg sniff_timeout: timeout used for the sniff request - it should be a
            fast api call and we are talking potentially to more nodes so we want
            to fail quickly. Not used during initial sniffing (if
            ``sniff_on_start`` is on) when the connection still isn't
            initialized.
        :arg serializer: serializer instance
        :arg serializers: optional dict of serializer instances that will be
            used for deserializing data coming from the server. (key is the mimetype)
        :arg default_mimetype: when no mimetype is specified by the server
            response assume this mimetype, defaults to `'application/json'`
        :arg max_retries: maximum number of retries before an exception is propagated
        :arg retry_on_status: set of HTTP status codes on which we should retry
            on a different node. defaults to ``(503, 504, )``
        :arg retry_on_timeout: should timeout trigger a retry on different
            node? (default `False`)
        :arg send_get_body_as: for GET requests with body this option allows
            you to specify an alternate way of execution for environments that
            don't support passing bodies with GET requests. If you set this to
            'POST' a POST method will be used instead, if to 'source' then the body
            will be serialized and passed as a query parameter `source`.
        Any extra keyword arguments will be passed to the `connection_class`
        when creating an instance unless overridden by that connection's
        options provided as part of the hosts parameter.
        """

        # serialization config
        _serializers = DEFAULT_SERIALIZERS.copy()
        # if a serializer has been specified, use it for deserialization as well
        _serializers[serializer.mimetype] = serializer
        # if custom serializers map has been supplied, override the defaults with it
        if serializers:
            _serializers.update(serializers)
        # create a deserializer with our config
        self.deserializer = Deserializer(_serializers, default_mimetype)

        self.max_retries = max_retries
        self.retry_on_timeout = retry_on_timeout
        self.retry_on_status = retry_on_status
        self.send_get_body_as = send_get_body_as

        # data serializer
        self.serializer = serializer

        # store all strategies...
        self.connection_pool_class = connection_pool_class
        self.connection_class = connection_class

        # ...save kwargs to be passed to the connections
        self.kwargs = kwargs
        self.hosts = hosts

        # ...and instantiate them
        self.set_connections(hosts)
        # retain the original connection instances for sniffing
        self.seed_connections = self.connection_pool.connections[:]

        # sniffing data
        self.sniffer_timeout = sniffer_timeout
        self.sniff_on_connection_fail = sniff_on_connection_fail
        self.last_sniff = time.time()
        self.sniff_timeout = sniff_timeout

        # callback to construct host dict from data in /_cluster/nodes
        self.host_info_callback = host_info_callback

        if sniff_on_start:
            self.sniff_hosts(True)

    def add_connection(self, host):
        """
        Create a new :class:`~elasticsearch.Connection` instance and add it to the pool.
        :arg host: kwargs that will be used to create the instance
        """
        self.hosts.append(host)
        self.set_connections(self.hosts)

    def set_connections(self, hosts):
        """
        Instantiate all the connections and create a new connection pool to hold
        them. Tries to identify unchanged hosts and re-use existing
        :class:`~elasticsearch.Connection` instances.

        Every newly created connection authenticates with NTLM credentials
        taken from the ``ntlm_user``/``ntlm_pass`` kwargs (empty strings when
        absent).

        :arg hosts: same as `__init__`
        """
        # construct the connections
        def _create_connection(host):
            # if this is not the initial setup look at the existing connection
            # options and identify connections that haven't changed and can be
            # kept around.
            if hasattr(self, 'connection_pool'):
                for (connection, old_host) in self.connection_pool.connection_opts:
                    if old_host == host:
                        return connection

            # previously unseen params, create new connection
            kwargs = self.kwargs.copy()
            kwargs.update(host)

            if 'scheme' in host and host['scheme'] != self.connection_class.transport_schema:
                raise ImproperlyConfigured(
                    'Scheme specified in connection (%s) is not the same as the connection class (%s) specifies (%s).' % (
                        host['scheme'], self.connection_class.__name__, self.connection_class.transport_schema
                    ))
            # the NTLM twist: attach HttpNtlmAuth to every new connection
            ntlm_auth = HttpNtlmAuth(kwargs.get('ntlm_user',''), kwargs.get('ntlm_pass',''))
            return self.connection_class(http_auth=ntlm_auth, **kwargs)
        connections = map(_create_connection, hosts)

        connections = list(zip(connections, hosts))
        if len(connections) == 1:
            self.connection_pool = DummyConnectionPool(connections)
        else:
            # pass the hosts dicts to the connection pool to optionally extract parameters from
            self.connection_pool = self.connection_pool_class(connections, **self.kwargs)

    def get_connection(self):
        """
        Retrieve a :class:`~elasticsearch.Connection` instance from the
        :class:`~elasticsearch.ConnectionPool` instance, re-sniffing the
        cluster first if the sniffer interval has elapsed.
        """
        if self.sniffer_timeout:
            if time.time() >= self.last_sniff + self.sniffer_timeout:
                self.sniff_hosts()
        return self.connection_pool.get_connection()

    def _get_sniff_data(self, initial=False):
        """
        Perform the request to get sniffing information. Returns a list of
        dictionaries (one per node) containing all the information from the
        cluster.
        It also sets the last_sniff attribute in case of a successful attempt.
        In rare cases it might be possible to override this method in your
        custom Transport class to serve data from alternative source like
        configuration management.
        """
        previous_sniff = self.last_sniff

        try:
            # reset last_sniff timestamp
            self.last_sniff = time.time()
            # go through all current connections as well as the
            # seed_connections for good measure
            for c in chain(self.connection_pool.connections, self.seed_connections):
                try:
                    # use small timeout for the sniffing request, should be a fast api call
                    _, headers, node_info = c.perform_request('GET', '/_nodes/_all/clear',
                        timeout=self.sniff_timeout if not initial else None)
                    node_info = self.deserializer.loads(node_info, headers.get('content-type'))
                    break
                except (ConnectionError, SerializationError):
                    # try the next connection
                    pass
            else:
                # no connection answered successfully
                raise TransportError("N/A", "Unable to sniff hosts.")
        except:
            # deliberately bare: keep the previous timestamp on ANY error,
            # then re-raise unchanged
            self.last_sniff = previous_sniff
            raise

        return list(node_info['nodes'].values())

    def sniff_hosts(self, initial=False):
        """
        Obtain a list of nodes from the cluster and create a new connection
        pool using the information retrieved.
        To extract the node connection parameters use the ``nodes_to_host_callback``.
        :arg initial: flag indicating if this is during startup
            (``sniff_on_start``), ignore the ``sniff_timeout`` if ``True``
        """
        node_info = self._get_sniff_data(initial)

        hosts = []
        # e.g. 'http_address' for the http transport schema
        address_key = self.connection_class.transport_schema + '_address'
        for n in node_info:
            host = {}
            address = n.get(address_key, '')
            if '/' in address:
                # "publish_host/ip:port" form
                host['host'], address = address.split('/', 1)

            # malformed address
            if ':' not in address:
                continue

            ip, port = address.rsplit(':', 1)

            # use the ip if not overridden by publish_host
            host.setdefault('host', ip)
            host['port'] = int(port)

            host = self.host_info_callback(n, host)
            if host is not None:
                hosts.append(host)

        # we weren't able to get any nodes, maybe using an incompatible
        # transport_schema or host_info_callback blocked all - raise error.
        if not hosts:
            raise TransportError("N/A", "Unable to sniff hosts - no viable hosts found.")

        self.set_connections(hosts)

    def mark_dead(self, connection):
        """
        Mark a connection as dead (failed) in the connection pool. If sniffing
        on failure is enabled this will initiate the sniffing process.
        :arg connection: instance of :class:`~elasticsearch.Connection` that failed
        """
        # mark as dead even when sniffing to avoid hitting this host during the sniff process
        self.connection_pool.mark_dead(connection)
        if self.sniff_on_connection_fail:
            self.sniff_hosts()

    def perform_request(self, method, url, params=None, body=None):
        """
        Perform the actual request. Retrieve a connection from the connection
        pool, pass all the information to its perform_request method and
        return the data.
        If an exception was raised, mark the connection as failed and retry (up
        to `max_retries` times).
        If the operation was successful and the connection used was previously
        marked as dead, mark it as live, resetting its failure count.
        :arg method: HTTP method to use
        :arg url: absolute url (without host) to target
        :arg params: dictionary of query parameters, will be handed over to the
            underlying :class:`~elasticsearch.Connection` class for serialization
        :arg body: body of the request, will be serialized using serializer and
            passed to the connection
        """
        if body is not None:
            body = self.serializer.dumps(body)

        # some clients or environments don't support sending GET with body
        if method in ('HEAD', 'GET') and self.send_get_body_as != 'GET':
            # send it as post instead
            if self.send_get_body_as == 'POST':
                method = 'POST'

            # or as source parameter
            elif self.send_get_body_as == 'source':
                if params is None:
                    params = {}
                params['source'] = body
                body = None

        if body is not None:
            try:
                body = body.encode('utf-8')
            except (UnicodeDecodeError, AttributeError):
                # bytes/str - no need to re-encode
                pass

        # 'request_timeout' and 'ignore' are transport-level options smuggled
        # in through params; pop them so they aren't sent to the server
        ignore = ()
        timeout = None
        if params:
            timeout = params.pop('request_timeout', None)
            ignore = params.pop('ignore', ())
            if isinstance(ignore, int):
                ignore = (ignore, )

        for attempt in range(self.max_retries + 1):
            connection = self.get_connection()

            try:
                status, headers, data = connection.perform_request(method, url, params, body, ignore=ignore, timeout=timeout)

            except TransportError as e:
                retry = False
                if isinstance(e, ConnectionTimeout):
                    retry = self.retry_on_timeout
                elif isinstance(e, ConnectionError):
                    retry = True
                elif e.status_code in self.retry_on_status:
                    retry = True

                if retry:
                    # only mark as dead if we are retrying
                    self.mark_dead(connection)
                    # raise exception on last retry
                    if attempt == self.max_retries:
                        raise
                else:
                    raise

            else:
                # connection didn't fail, confirm its live status
                self.connection_pool.mark_live(connection)
                if data:
                    data = self.deserializer.loads(data, headers.get('content-type'))
                return status, data

@@ -0,0 +1,10 @@
Metadata-Version: 1.0
Name: project
Version: 1.0
Summary: UNKNOWN
Home-page: UNKNOWN
Author: UNKNOWN
Author-email: UNKNOWN
License: UNKNOWN
Description: UNKNOWN
Platform: UNKNOWN
@@ -0,0 +1,14 @@
setup.py
project.egg-info/PKG-INFO
project.egg-info/SOURCES.txt
project.egg-info/dependency_links.txt
project.egg-info/entry_points.txt
project.egg-info/top_level.txt
scrapytest/__init__.py
scrapytest/bandcamp-tagscrape.py
scrapytest/items.py
scrapytest/pipelines.py
scrapytest/settings.py
scrapytest/transportNTLM.py
scrapytest/spiders/__init__.py
scrapytest/spiders/bandcamp_spider.py
@@ -0,0 +1 @@

@@ -0,0 +1,3 @@
[scrapy]
settings = scrapytest.settings

@@ -0,0 +1 @@
scrapytest
@@ -0,0 +1,2 @@
projects:
default: 73419
@@ -0,0 +1,17 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = scrapytest.settings

[deploy]
url = http://localhost:6800/
project = scrapytest

[scrapyd]
eggs_dir = scrapy_dir/eggs
logs_dir = scrapy_dir/logs
items_dir = scrapy_dir/items
dbs_dir = scrapy_dir/dbs
@@ -0,0 +1,100 @@
# The default ``config.py``
# flake8: noqa


def set_prefs(prefs):
    """Configure rope project preferences; called before the project opens."""

    # Resources matching these patterns are excluded from rope's history
    # and from `Project.get_files()`. '?' and '*' match any characters
    # except slashes; a '//' component crosses directory boundaries
    # (e.g. 'build//*.o' also matches 'build/sub/lib.o').
    prefs['ignored_resources'] = [
        '*.pyc', '*~', '.ropeproject', '.hg', '.svn', '_svn', '.git', '.tox',
    ]

    # Only files ending with ``.py`` are treated as python files by
    # default; uncomment to widen that (useful for extension-less scripts).
    #prefs['python_files'] = ['*.py']

    # Rope usually detects source folders on its own; add explicit ones
    # (relative to the project root, '/'-separated) only if detection fails.
    #prefs.add('source_folders', 'src')

    # Extra module lookup paths, if needed.
    #prefs.add('python_path', '~/python/')

    # Object-database persistence.
    prefs['save_objectdb'] = True
    prefs['compress_objectdb'] = False

    # Run static object analysis on each module save, following calls to
    # the given depth.
    prefs['automatic_soa'] = True
    prefs['soa_followed_calls'] = 0

    # Dynamic object analysis when running modules or unit tests
    # (disabling it makes runs much faster).
    prefs['perform_doa'] = True

    # Check the validity of the object DB while running.
    prefs['validate_objectdb'] = True

    # Undo history: size and cross-session persistence.
    prefs['max_history_items'] = 32
    prefs['save_history'] = True
    prefs['compress_history'] = False

    # Indentation width; 4 spaces per PEP 8.
    prefs['indent_size'] = 4

    # Builtin and C-extension modules rope may import and inspect;
    # additionally allow all standard dynamically-loaded modules.
    prefs['extension_modules'] = []
    prefs['import_dynload_stdmods'] = True

    # When False, modules with syntax errors raise
    # rope.base.exceptions.ModuleSyntaxError instead of being treated
    # as empty.
    prefs['ignore_syntax_errors'] = False

    # When False, unresolvable imports still appear in the importing
    # namespace.
    prefs['ignore_bad_imports'] = False

    # Import-organization preferences: module-from style, splitting of
    # comma-separated imports, and alphabetical sorting by module name.
    prefs['prefer_module_from_imports'] = False
    prefs['split_imports'] = False
    prefs['sort_imports_alphabetically'] = False


def project_opened(project):
    """This function is called after opening the project"""
    # Intentionally a no-op hook; add post-open actions here if needed.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Empty file.
Binary file not shown.
@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item


class ArbitraryItem(Item):
    """Scrapy Item subclass that accepts arbitrary field names.

    A stock Item only allows keys declared as Field() on the class;
    this override registers unknown keys on the fly (with empty field
    metadata) so raw API responses can be stored without declaring
    every key in advance.
    """

    def __setitem__(self, key, value):
        # Store the value and lazily declare the field's metadata dict.
        self._values[key] = value
        self.fields[key] = {}

# class BandcampTag(scrapy.Item):
# machine_name = scrapy.Field()
# human_name = scrapy.Field()
# albums = scrapy.Field()
# total_albums = scrapy.Field()
#
#
#class BandcampAlbum(scrapy.Item):
# field = scrapy.Field()
Binary file not shown.
@@ -0,0 +1,44 @@
from scrapy.conf import settings
from scrapy.downloadermiddlewares.retry import RetryMiddleware
import telnetlib
import logging
import time
import random

logger = logging.getLogger(__name__)


class ProxyMiddleware(object):
    """Downloader middleware that routes every request through the proxy
    configured in the HTTP_PROXY setting (a local Tor proxy, per the
    original comment)."""

    # Override request processing so each request carries the proxy;
    # Scrapy's HttpProxyMiddleware honours request.meta['proxy'].
    def process_request(self, request, spider):
        request.meta['proxy'] = settings.get('HTTP_PROXY')


class RetryChangeProxyMiddleware(RetryMiddleware):
    """RetryMiddleware variant that asks the local Tor control port for a
    fresh circuit (new exit IP) before retrying, rate-limited so the
    identity changes at most once every `timelimit` seconds."""

    # Shared across instances: timestamp of the last identity change and
    # the minimum number of seconds between changes.
    last = 0
    timelimit = 15

    def _retry(self, request, reason, spider):
        # NOTE(review): `basestring` is Python 2-only; `reason` may also be
        # an exception instance, which this guard skips.
        settings = spider.settings

        if isinstance(reason, basestring):
            logger.debug('Valid retry, reason: ' + reason + ' for URL ' + request.url)
            t = time.time()
            diff = t - RetryChangeProxyMiddleware.last
            if diff > RetryChangeProxyMiddleware.timelimit:
                # Authenticate against Tor's control port (9051) and send
                # SIGNAL NEWNYM to request a new circuit/exit node.
                tn = telnetlib.Telnet('127.0.0.1', 9051)
                tn.read_until("Escape character is '^]'.", 2)
                tn.write('AUTHENTICATE "267765"\r\n')
                tn.read_until("250 OK", 2)
                tn.write("signal NEWNYM\r\n")
                tn.read_until("250 OK", 2)
                tn.write("quit\r\n")
                tn.close()
                #time.sleep(3)
                RetryChangeProxyMiddleware.last = t
                logger.info('Proxy changed! New last: %s' % time.strftime("%H:%M:%S"))
            else:
                logger.debug('Proxy not changed! Time difference is %s seconds' % ("{:.2f}".format(diff)))
        # Delegate the actual retry decision/scheduling to the stock middleware.
        return RetryMiddleware._retry(self, request, reason, spider)
Binary file not shown.
@@ -0,0 +1,117 @@
# Copyright 2014 Michael Malocha <michael@knockrentals.com>
#
# Expanded from the work by Julien Duponchelle <julien@duponchelle.info>.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Elastic Search Pipeline for scrappy expanded with support for multiple items"""

from datetime import datetime
from elasticsearch import Elasticsearch, helpers
import logging
import hashlib
import types

from transportNTLM import TransportNTLM


class InvalidSettingsException(Exception):
    """Raised when a required ELASTICSEARCH_* setting is missing from settings.py."""
    pass

class ElasticSearchPipeline(object):
    """Scrapy item pipeline that buffers items and bulk-indexes them into
    Elasticsearch.

    Configuration (read from the crawler settings):

    Required:
        ELASTICSEARCH_SERVERS -- host string, or list of host strings
        ELASTICSEARCH_INDEX   -- target index name
        ELASTICSEARCH_TYPE    -- document type for indexed items
    Optional:
        ELASTICSEARCH_UNIQ_KEY          -- item field hashed into the _id
        ELASTICSEARCH_AUTH              -- 'NTLM' to use TransportNTLM
        ELASTICSEARCH_USERNAME/PASSWORD -- NTLM credentials
        ELASTICSEARCH_TIMEOUT           -- request timeout in seconds (default 60)
        ELASTICSEARCH_BUFFER_LENGTH     -- bulk flush size (default 500)
        ELASTICSEARCH_INDEX_DATE_FORMAT -- strftime suffix for dated indices
    """

    settings = None
    es = None
    items_buffer = []

    @classmethod
    def validate_settings(cls, settings):
        """Raise InvalidSettingsException if any required setting is unset."""
        def validate_setting(setting_key):
            if settings[setting_key] is None:
                raise InvalidSettingsException('%s is not defined in settings.py' % setting_key)

        required_settings = {'ELASTICSEARCH_SERVERS', 'ELASTICSEARCH_INDEX', 'ELASTICSEARCH_TYPE'}

        for required_setting in required_settings:
            validate_setting(required_setting)

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy hook: build the pipeline from the crawler settings."""
        ext = cls()
        ext.settings = crawler.settings
        # Give each instance its own buffer; the class-level list would
        # otherwise be shared between pipeline instances.
        ext.items_buffer = []

        cls.validate_settings(ext.settings)

        es_servers = ext.settings['ELASTICSEARCH_SERVERS']
        es_servers = es_servers if isinstance(es_servers, list) else [es_servers]

        auth_type = ext.settings['ELASTICSEARCH_AUTH']
        if auth_type == 'NTLM':
            ext.es = Elasticsearch(hosts=es_servers,
                                   transport_class=TransportNTLM,
                                   ntlm_user=ext.settings['ELASTICSEARCH_USERNAME'],
                                   ntlm_pass=ext.settings['ELASTICSEARCH_PASSWORD'],
                                   timeout=ext.settings.get('ELASTICSEARCH_TIMEOUT', 60))
        else:
            ext.es = Elasticsearch(hosts=es_servers, timeout=ext.settings.get('ELASTICSEARCH_TIMEOUT', 60))
        return ext

    def get_unique_key(self, unique_key):
        """Normalize the configured unique-key value to a single str/bytes.

        Accepts a plain string/bytes, or a list whose first element is used.

        :raises Exception: if the value is neither a list nor str/bytes
        """
        if isinstance(unique_key, list):
            unique_key = unique_key[0]
        elif not isinstance(unique_key, (str, bytes)):
            raise Exception('unique key must be str')

        return unique_key

    def index_item(self, item):
        """Queue one item for bulk indexing; flush once the buffer fills."""
        index_name = self.settings['ELASTICSEARCH_INDEX']
        index_suffix_format = self.settings.get('ELASTICSEARCH_INDEX_DATE_FORMAT', None)

        if index_suffix_format:
            # e.g. "scrapy4-2016.05" for time-based indices
            index_name += "-" + datetime.strftime(datetime.now(), index_suffix_format)

        index_action = {
            '_index': index_name,
            '_type': self.settings['ELASTICSEARCH_TYPE'],
            '_source': dict(item)
        }

        if self.settings['ELASTICSEARCH_UNIQ_KEY'] is not None:
            item_unique_key = item[self.settings['ELASTICSEARCH_UNIQ_KEY']]
            unique_key = self.get_unique_key(item_unique_key)
            # sha1 requires bytes on Python 3; encode text keys.
            if not isinstance(unique_key, bytes):
                unique_key = unique_key.encode('utf-8')
            item_id = hashlib.sha1(unique_key).hexdigest()
            index_action['_id'] = item_id
            logging.debug('Generated unique key %s' % item_id)

        self.items_buffer.append(index_action)

        if len(self.items_buffer) == self.settings.get('ELASTICSEARCH_BUFFER_LENGTH', 500):
            self.send_items()
            self.items_buffer = []

    def send_items(self):
        """Bulk-send all currently buffered actions to Elasticsearch."""
        helpers.bulk(self.es, self.items_buffer)

    def process_item(self, item, spider):
        """Pipeline entry point; handles single items, lists and generators."""
        # types.ListType was removed in Python 3 -- check list directly.
        if isinstance(item, (types.GeneratorType, list)):
            for each in item:
                self.process_item(each, spider)
        else:
            self.index_item(item)
            logging.debug('Item sent to Elastic Search %s' % self.settings['ELASTICSEARCH_INDEX'])
        return item

    def close_spider(self, spider):
        """Flush any remaining buffered items when the spider closes."""
        if len(self.items_buffer):
            self.send_items()
self.send_items()
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,23 @@
attrs==15.2.0
cffi==1.5.2
cryptography==1.3.1
cssselect==0.9.1
elasticsearch==2.3.0
enum34==1.1.3
idna==2.1
ipaddress==1.0.16
lxml==3.6.0
pyasn1==0.1.9
pyasn1-modules==0.0.8
pycparser==2.14
pyOpenSSL==16.0.0
queuelib==1.4.2
requests==2.10.0
requests-ntlm==0.2.0
Scrapy==1.0.5
service-identity==16.0.0
six==1.10.0
Twisted==16.1.1
urllib3==1.15.1
w3lib==1.14.2
zope.interface==4.1.3
@@ -0,0 +1,113 @@
#!/usr/bin/env bash
# Create the "scrapy4" Elasticsearch index with an explicit mapping for the
# "items" document type (must match ELASTICSEARCH_INDEX / ELASTICSEARCH_TYPE
# in the Scrapy settings). Identifier-like string fields (tag names, mp3 URL,
# publish date) are "not_analyzed" so they are stored and matched verbatim.
# NOTE(review): no Content-Type header is sent -- accepted by ES 2.x, but
# required from ES 6.0 onward.
curl -XPUT "http://localhost:9200/scrapy4" -d '
{
    "mappings" : {
      "items" : {
        "dynamic": "true",
        "properties" : {
          "art_id" : {
            "type" : "long"
          },
          "band_id" : {
            "type" : "long"
          },
          "bio_image" : {
            "properties" : {
              "height" : {
                "type" : "long"
              },
              "image_id" : {
                "type" : "long"
              },
              "width" : {
                "type" : "long"
              }
            }
          },
          "category" : {
            "type" : "string"
          },
          "featured_track" : {
            "properties" : {
              "duration" : {
                "type" : "double"
              },
              "encodings_id" : {
                "type" : "long"
              },
              "file" : {
                "properties" : {
                  "mp3-128" : {
                    "type" : "string",
                    "index": "not_analyzed"
                  }
                }
              },
              "id" : {
                "type" : "long"
              },
              "title" : {
                "type" : "string"
              }
            }
          },
          "genre_text" : {
            "type" : "string"
          },
          "human_tag_name" : {
            "type" : "string",
            "index" : "not_analyzed"
          },
          "is_preorder" : {
            "type" : "long"
          },
          "item_type_id" : {
            "type" : "string"
          },
          "location_text" : {
            "type" : "string"
          },
          "machine_tag_name" : {
            "type" : "string",
            "index" : "not_analyzed"
          },
          "primary_text" : {
            "type" : "string"
          },
          "publish_date" : {
            "type" : "string",
            "index": "not_analyzed"
          },
          "score" : {
            "type" : "double"
          },
          "secondary_text" : {
            "type" : "string"
          },
          "type" : {
            "type" : "string"
          },
          "url_hints" : {
            "properties" : {
              "custom_domain" : {
                "type" : "string"
              },
              "custom_domain_verified" : {
                "type" : "long"
              },
              "item_type" : {
                "type" : "string"
              },
              "slug" : {
                "type" : "string"
              },
              "subdomain" : {
                "type" : "string"
              }
            }
          }
        }
      }
    }
}
}'
@@ -0,0 +1,102 @@
# -*- coding: utf-8 -*-

# Scrapy settings for scrapytest project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapytest'

SPIDER_MODULES = ['scrapytest.spiders']
NEWSPIDER_MODULE = 'scrapytest.spiders'

# Send every scraped item to Elasticsearch (see pipelines.py).
ITEM_PIPELINES = {
    'scrapytest.pipelines.ElasticSearchPipeline': 1
}

# Target cluster/index/type; must match the mapping created by the
# index-setup script. ELASTICSEARCH_UNIQ_KEY selects the item field
# whose sha1 hash becomes the document _id (deduplicates re-crawls).
ELASTICSEARCH_SERVERS = ['localhost:9200']
ELASTICSEARCH_INDEX = 'scrapy4'
ELASTICSEARCH_TYPE = 'items'
ELASTICSEARCH_UNIQ_KEY = 'id'

LOG_ENABLED = True
LOG_LEVEL = 'INFO'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapytest (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'scrapytest.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # Routes requests through a local proxy (see middlewares.py).
    'scrapytest.middlewares.ProxyMiddleware': 543,
    # On retry, signals NEWNYM on the Tor control port (127.0.0.1:9051)
    # to rotate the exit node before retrying (see middlewares.py).
    'scrapytest.middlewares.RetryChangeProxyMiddleware': 600
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'scrapytest.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 2.0
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60.0
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 5.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = True

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Binary file not shown.
@@ -0,0 +1,96 @@
# The default ``config.py``


def set_prefs(prefs):
    """Configure rope project preferences; called before the project opens.

    ``prefs`` is a mapping-like object supplied by rope; every entry
    assigned below overrides the corresponding rope default.
    """
    ordered_prefs = [
        # Resources matching these patterns are excluded from history,
        # VCS tracking, and Project.get_files(). '?' and '*' match any
        # character except a slash; a '//' component lets '*' span
        # directory boundaries (e.g. 'build//*.o').
        ('ignored_resources',
         ['*.pyc', '*~', '.ropeproject', '.hg', '.svn', '_svn', '.git',
          '.tox', '.env', 'node_modules', 'bower_components']),
        # Object-database persistence.
        ('save_objectdb', True),
        ('compress_objectdb', False),
        # Analyze each module on save; follow no calls during static
        # object analysis.
        ('automatic_soa', True),
        ('soa_followed_calls', 0),
        # Keep dynamic object analysis on when running modules/tests.
        ('perform_doa', True),
        # Check object-DB validity while rope is running.
        ('validate_objectdb', True),
        # Undo history: depth and cross-session persistence.
        ('max_history_items', 32),
        ('save_history', True),
        ('compress_history', False),
        # PEP 8 indentation.
        ('indent_size', 4),
        # Builtin / C-extension module handling.
        ('extension_modules', []),
        ('import_dynload_stdmods', True),
        # Strictness: raise on syntax errors and bad imports.
        ('ignore_syntax_errors', False),
        ('ignore_bad_imports', False),
        # Import-organization behaviour.
        ('split_imports', False),
        ('sort_imports_alphabetically', False),
    ]
    for key, value in ordered_prefs:
        prefs[key] = value


def project_opened(project):
    """Hook invoked by rope right after the project has been opened.

    No post-open actions are needed for this project.
    """
Binary file not shown.
@@ -0,0 +1 @@
�]q(]q]qe.
Binary file not shown.
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Binary file not shown.
@@ -0,0 +1,42 @@
import scrapy
import json
from math import ceil

from scrapytest.items import ArbitraryItem

api_url_format = "http://151.101.65.28/api/discover/3/get_web?t=%s&s=top&p=%s"
# bandcamp.com ip address to save dns lookups

class BandcampSpider(scrapy.Spider):
    """Crawl Bandcamp's discover API, tag by tag, yielding one item per album.

    Flow:
      parse       -- scrape the tag cloud, issue one API request per tag
      parse_tag   -- read total_count from page 0, schedule remaining pages
      parse_album -- turn every entry of a result page into an ArbitraryItem
    """
    name = "bandcamp"
    allowed_domains = ['bandcamp.com', '151.101.65.28']
    start_urls = [
        "http://bandcamp.com/tags"
    ]

    def parse(self, response):
        """Extract the machine/human tag names from the tag-cloud page."""
        for link in response.css("#tags_cloud > a"):
            machine_tag_name = link.css("::attr('href')").extract()[0].replace("/tag/", "")
            # extract() returns a list; take the single text node so the
            # item field is a plain string (matches the ES string mapping).
            texts = link.xpath("text()").extract()
            human_tag_name = texts[0] if texts else ""
            url = api_url_format % (machine_tag_name, 0)
            meta = {"machine": machine_tag_name, "human": human_tag_name}
            yield scrapy.Request(url, meta=meta, callback=self.parse_tag)

    def parse_tag(self, response):
        """Parse page 0 of a tag and schedule requests for the other pages."""
        machine_tag_name = response.meta["machine"]
        json_obj = json.loads(response.body_as_unicode())
        total_count = json_obj.get("total_count") or 0
        # each page contains at most 48 entries
        num_pages = int(ceil(total_count / 48.0))
        # This response *is* page 0: re-requesting the same URL would be
        # dropped by the dupefilter and its items lost, so parse it here.
        for album in self.parse_album(response):
            yield album
        for i in range(1, num_pages):
            url = api_url_format % (machine_tag_name, i)
            yield scrapy.Request(url, meta=response.meta, callback=self.parse_album)

    def parse_album(self, response):
        """Yield one ArbitraryItem per album entry of an API result page."""
        json_obj = json.loads(response.body_as_unicode())
        items = json_obj.get('items') or []
        for item in items:
            album = ArbitraryItem(item)
            album['id'] = str(album['id'])
            album['machine_tag_name'] = response.meta['machine']
            album['human_tag_name'] = response.meta['human']
            yield album
Binary file not shown.
@@ -0,0 +1,358 @@
import time
from itertools import chain

from elasticsearch.connection import RequestsHttpConnection
from elasticsearch.connection_pool import ConnectionPool, DummyConnectionPool
from elasticsearch.serializer import JSONSerializer, Deserializer, DEFAULT_SERIALIZERS
from elasticsearch.exceptions import ConnectionError, TransportError, SerializationError, \
ConnectionTimeout, ImproperlyConfigured
import requests
from requests_ntlm import HttpNtlmAuth

def get_host_info(node_info, host):
    """Decide whether a sniffed node should be used for API calls.

    Given one node's info dict from `/_cluster/nodes` and the (host, port)
    connection info parsed from it, return the connection info to keep the
    node, or `None` to skip it. Intended as a filtering hook for
    :class:`~elasticsearch.Connection` selection; the default behaviour
    below drops master-only nodes, which shouldn't normally serve API
    traffic.

    :arg node_info: node information from `/_cluster/nodes`
    :arg host: connection information (host, port) extracted from the node info
    """
    attrs = node_info.get('attributes', {})

    # A node is "master only" when it holds no data, is not a client
    # node, and is master-eligible (all judged by string flags).
    master_only = (
        attrs.get('data', 'true') == 'false'
        and attrs.get('client', 'false') == 'false'
        and attrs.get('master', 'true') == 'true'
    )
    return None if master_only else host

class TransportNTLM(object):
    """
    Encapsulation of transport-related logic. Handles instantiation of the
    individual connections as well as creating a connection pool to hold them.
    Main interface is the `perform_request` method.

    NOTE(review): this appears to be a vendored copy of elasticsearch-py's
    Transport, modified in `set_connections` to attach NTLM authentication
    (requests_ntlm) to every connection, using the `ntlm_user`/`ntlm_pass`
    kwargs passed through from `Elasticsearch(...)`.
    """
    def __init__(self, hosts, connection_class=RequestsHttpConnection,
        connection_pool_class=ConnectionPool, host_info_callback=get_host_info,
        sniff_on_start=False, sniffer_timeout=None, sniff_timeout=.1,
        sniff_on_connection_fail=False, serializer=JSONSerializer(), serializers=None,
        default_mimetype='application/json', max_retries=3, retry_on_status=(503, 504, ),
        retry_on_timeout=False, send_get_body_as='GET', **kwargs):
        """
        :arg hosts: list of dictionaries, each containing keyword arguments to
            create a `connection_class` instance
        :arg connection_class: subclass of :class:`~elasticsearch.Connection` to use
        :arg connection_pool_class: subclass of :class:`~elasticsearch.ConnectionPool` to use
        :arg host_info_callback: callback responsible for taking the node information from
            `/_cluster/nodes`, along with already extracted information, and
            producing a list of arguments (same as `hosts` parameter)
        :arg sniff_on_start: flag indicating whether to obtain a list of nodes
            from the cluster at startup time
        :arg sniffer_timeout: number of seconds between automatic sniffs
        :arg sniff_on_connection_fail: flag controlling if connection failure triggers a sniff
        :arg sniff_timeout: timeout used for the sniff request - it should be a
            fast api call and we are talking potentially to more nodes so we want
            to fail quickly. Not used during initial sniffing (if
            ``sniff_on_start`` is on) when the connection still isn't
            initialized.
        :arg serializer: serializer instance
        :arg serializers: optional dict of serializer instances that will be
            used for deserializing data coming from the server. (key is the mimetype)
        :arg default_mimetype: when no mimetype is specified by the server
            response assume this mimetype, defaults to `'application/json'`
        :arg max_retries: maximum number of retries before an exception is propagated
        :arg retry_on_status: set of HTTP status codes on which we should retry
            on a different node. defaults to ``(503, 504, )``
        :arg retry_on_timeout: should timeout trigger a retry on different
            node? (default `False`)
        :arg send_get_body_as: for GET requests with body this option allows
            you to specify an alternate way of execution for environments that
            don't support passing bodies with GET requests. If you set this to
            'POST' a POST method will be used instead, if to 'source' then the body
            will be serialized and passed as a query parameter `source`.
        Any extra keyword arguments will be passed to the `connection_class`
        when creating an instance unless overridden by that connection's
        options provided as part of the hosts parameter.
        """

        # serialization config
        _serializers = DEFAULT_SERIALIZERS.copy()
        # if a serializer has been specified, use it for deserialization as well
        _serializers[serializer.mimetype] = serializer
        # if custom serializers map has been supplied, override the defaults with it
        if serializers:
            _serializers.update(serializers)
        # create a deserializer with our config
        self.deserializer = Deserializer(_serializers, default_mimetype)

        self.max_retries = max_retries
        self.retry_on_timeout = retry_on_timeout
        self.retry_on_status = retry_on_status
        self.send_get_body_as = send_get_body_as

        # data serializer
        self.serializer = serializer

        # store all strategies...
        self.connection_pool_class = connection_pool_class
        self.connection_class = connection_class

        # ...save kwargs to be passed to the connections
        self.kwargs = kwargs
        self.hosts = hosts

        # ...and instantiate them
        self.set_connections(hosts)
        # retain the original connection instances for sniffing
        self.seed_connections = self.connection_pool.connections[:]

        # sniffing data
        self.sniffer_timeout = sniffer_timeout
        self.sniff_on_connection_fail = sniff_on_connection_fail
        self.last_sniff = time.time()
        self.sniff_timeout = sniff_timeout

        # callback to construct host dict from data in /_cluster/nodes
        self.host_info_callback = host_info_callback

        if sniff_on_start:
            self.sniff_hosts(True)


    def add_connection(self, host):
        """
        Create a new :class:`~elasticsearch.Connection` instance and add it to the pool.
        :arg host: kwargs that will be used to create the instance
        """
        self.hosts.append(host)
        self.set_connections(self.hosts)

    def set_connections(self, hosts):
        """
        Instantiate all the connections and create new connection pool to hold
        them. Tries to identify unchanged hosts and re-use existing
        :class:`~elasticsearch.Connection` instances.
        :arg hosts: same as `__init__`
        """
        # construct the connections
        def _create_connection(host):
            # if this is not the initial setup look at the existing connection
            # options and identify connections that haven't changed and can be
            # kept around.
            if hasattr(self, 'connection_pool'):
                for (connection, old_host) in self.connection_pool.connection_opts:
                    if old_host == host:
                        return connection

            # previously unseen params, create new connection
            kwargs = self.kwargs.copy()
            kwargs.update(host)

            if 'scheme' in host and host['scheme'] != self.connection_class.transport_schema:
                raise ImproperlyConfigured(
                    'Scheme specified in connection (%s) is not the same as the connection class (%s) specifies (%s).' % (
                        host['scheme'], self.connection_class.__name__, self.connection_class.transport_schema
                    ))
            # NTLM twist: every new connection authenticates with the
            # credentials passed through as ntlm_user / ntlm_pass kwargs.
            ntlm_auth = HttpNtlmAuth(kwargs.get('ntlm_user',''), kwargs.get('ntlm_pass',''))
            return self.connection_class(http_auth=ntlm_auth, **kwargs)
        connections = map(_create_connection, hosts)

        connections = list(zip(connections, hosts))
        if len(connections) == 1:
            self.connection_pool = DummyConnectionPool(connections)
        else:
            # pass the hosts dicts to the connection pool to optionally extract parameters from
            self.connection_pool = self.connection_pool_class(connections, **self.kwargs)

    def get_connection(self):
        """
        Retrieve a :class:`~elasticsearch.Connection` instance from the
        :class:`~elasticsearch.ConnectionPool` instance.
        """
        if self.sniffer_timeout:
            if time.time() >= self.last_sniff + self.sniffer_timeout:
                self.sniff_hosts()
        return self.connection_pool.get_connection()

    def _get_sniff_data(self, initial=False):
        """
        Perform the request to get sniffing information. Returns a list of
        dictionaries (one per node) containing all the information from the
        cluster.
        It also sets the last_sniff attribute in case of a successful attempt.
        In rare cases it might be possible to override this method in your
        custom Transport class to serve data from alternative source like
        configuration management.
        """
        previous_sniff = self.last_sniff

        try:
            # reset last_sniff timestamp
            self.last_sniff = time.time()
            # go through all current connections as well as the
            # seed_connections for good measure
            for c in chain(self.connection_pool.connections, self.seed_connections):
                try:
                    # use small timeout for the sniffing request, should be a fast api call
                    _, headers, node_info = c.perform_request('GET', '/_nodes/_all/clear',
                        timeout=self.sniff_timeout if not initial else None)
                    node_info = self.deserializer.loads(node_info, headers.get('content-type'))
                    break
                except (ConnectionError, SerializationError):
                    pass
            else:
                raise TransportError("N/A", "Unable to sniff hosts.")
        except:
            # keep the previous value on error
            self.last_sniff = previous_sniff
            raise

        return list(node_info['nodes'].values())


    def sniff_hosts(self, initial=False):
        """
        Obtain a list of nodes from the cluster and create a new connection
        pool using the information retrieved.
        To extract the node connection parameters use the ``nodes_to_host_callback``.
        :arg initial: flag indicating if this is during startup
            (``sniff_on_start``), ignore the ``sniff_timeout`` if ``True``
        """
        node_info = self._get_sniff_data(initial)

        hosts = []
        address_key = self.connection_class.transport_schema + '_address'
        for n in node_info:
            host = {}
            address = n.get(address_key, '')
            if '/' in address:
                host['host'], address = address.split('/', 1)

            # malformed address
            if ':' not in address:
                continue

            ip, port = address.rsplit(':', 1)

            # use the ip if not overridden by publish_host
            host.setdefault('host', ip)
            host['port'] = int(port)

            host = self.host_info_callback(n, host)
            if host is not None:
                hosts.append(host)

        # we weren't able to get any nodes, maybe using an incompatible
        # transport_schema or host_info_callback blocked all - raise error.
        if not hosts:
            raise TransportError("N/A", "Unable to sniff hosts - no viable hosts found.")

        self.set_connections(hosts)

    def mark_dead(self, connection):
        """
        Mark a connection as dead (failed) in the connection pool. If sniffing
        on failure is enabled this will initiate the sniffing process.
        :arg connection: instance of :class:`~elasticsearch.Connection` that failed
        """
        # mark as dead even when sniffing to avoid hitting this host during the sniff process
        self.connection_pool.mark_dead(connection)
        if self.sniff_on_connection_fail:
            self.sniff_hosts()

    def perform_request(self, method, url, params=None, body=None):
        """
        Perform the actual request. Retrieve a connection from the connection
        pool, pass all the information to it's perform_request method and
        return the data.
        If an exception was raised, mark the connection as failed and retry (up
        to `max_retries` times).
        If the operation was successful and the connection used was previously
        marked as dead, mark it as live, resetting it's failure count.
        :arg method: HTTP method to use
        :arg url: absolute url (without host) to target
        :arg params: dictionary of query parameters, will be handed over to the
            underlying :class:`~elasticsearch.Connection` class for serialization
        :arg body: body of the request, will be serialized using serializer and
            passed to the connection
        """
        if body is not None:
            body = self.serializer.dumps(body)

        # some clients or environments don't support sending GET with body
        if method in ('HEAD', 'GET') and self.send_get_body_as != 'GET':
            # send it as post instead
            if self.send_get_body_as == 'POST':
                method = 'POST'

            # or as source parameter
            elif self.send_get_body_as == 'source':
                if params is None:
                    params = {}
                params['source'] = body
                body = None

        if body is not None:
            try:
                body = body.encode('utf-8')
            except (UnicodeDecodeError, AttributeError):
                # bytes/str - no need to re-encode
                pass

        ignore = ()
        timeout = None
        if params:
            timeout = params.pop('request_timeout', None)
            ignore = params.pop('ignore', ())
            if isinstance(ignore, int):
                ignore = (ignore, )

        for attempt in range(self.max_retries + 1):
            connection = self.get_connection()

            try:
                status, headers, data = connection.perform_request(method, url, params, body, ignore=ignore, timeout=timeout)

            except TransportError as e:
                retry = False
                if isinstance(e, ConnectionTimeout):
                    retry = self.retry_on_timeout
                elif isinstance(e, ConnectionError):
                    retry = True
                elif e.status_code in self.retry_on_status:
                    retry = True

                if retry:
                    # only mark as dead if we are retrying
                    self.mark_dead(connection)
                    # raise exception on last retry
                    if attempt == self.max_retries:
                        raise
                else:
                    raise

            else:
                # connection didn't fail, confirm it's live status
                self.connection_pool.mark_live(connection)
                if data:
                    data = self.deserializer.loads(data, headers.get('content-type'))
                return status, data

Binary file not shown.
@@ -0,0 +1,10 @@
# Automatically created by: scrapyd-deploy

from setuptools import setup, find_packages

setup(
    # Package metadata used by scrapyd-deploy to build the deployable egg.
    name = 'project',
    version = '1.0',
    packages = find_packages(),
    # Tells Scrapy/scrapyd which module holds the project settings.
    entry_points = {'scrapy': ['settings = scrapytest.settings']},
)