@@ -0,0 +1,59 @@
#!/usr/bin/python
# Standalone scraper: walks bandcamp.com's tag cloud, then pulls every page of
# the discover API for each tag and saves the raw JSON to disk.

from time import sleep
import requests
from bs4 import BeautifulSoup
import os
from math import ceil
import re
import codecs

# scrape list of tags
tagList = []
tagPage = requests.get("http://bandcamp.com/tags")
soup = BeautifulSoup(tagPage.text, "lxml")
for item in soup.select("#tags_cloud > a"):
    human_tag = item.get_text()
    if human_tag == "":
        continue
    # format the tag to what the api expects, e.g. "rock & roll" -> "rock-roll"
    # http://stackoverflow.com/questions/6116978/python-replace-multiple-strings
    rep = {" & ": "-", "&": "-", " ": "-", "/": "-"}  # define desired replacements here
    rep = dict((re.escape(k), v) for k, v in rep.items())
    pattern = re.compile("|".join(rep.keys()))
    machine_tag = pattern.sub(lambda m: rep[re.escape(m.group(0))], human_tag)
    # dedupe on the machine tag; the original compared the human name against
    # a list of machine names, so duplicates were never actually caught
    if machine_tag in tagList:
        continue
    tagList.append(machine_tag)
    try:
        os.mkdir(machine_tag)  # directory to save json
        with codecs.open(machine_tag + "/" + "human.txt", 'w+', "utf-8-sig") as f:
            f.write(human_tag)
    except Exception as e:
        print(e)

for index, tag in enumerate(tagList):
    print(tag + " " + str(index + 1) + "/" + str(len(tagList)))

    # retry until the API answers 200; a fresh Response has status_code None,
    # so the loop always makes at least one request
    req = requests.Response()
    while req.status_code != 200:
        req = requests.get(
            "http://bandcamp.com/api/discover/3/get_web?t=" +
            tag + "&s=top&p=0")
        if req.status_code != 200:
            sleep(1)  # back off briefly instead of hammering the API
    jsonObj = req.json()

    # each page contains at most 48 entries;
    # find total num entries in page 0 and calculate num pages
    # (e.g. total_count = 100 -> ceil(100 / 48.0) = 3 pages)
    totalCount = jsonObj.get("total_count")
    print(totalCount)
    totalPages = int(ceil(totalCount / 48.0))

    # iterate through pages, write to disk
    for page in range(0, totalPages):
        with open(tag + '/' + str(page), 'w+') as f:
            req = requests.Response()
            while req.status_code != 200:
                req = requests.get(
                    "http://bandcamp.com/api/discover/3/get_web?t=" +
                    tag + "&s=top&p=" + str(page))
                if req.status_code != 200:
                    sleep(1)
            f.write(req.text.encode("utf8"))
            print(str(page + 1) + "/" + str(totalPages) + " " +
                  str(req.status_code) + " " + str(len(req.text)))
    print(tag + " complete.")

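The paging logic above relies on one detail of the discover endpoint: its JSON carries a `total_count` plus an `items` list of at most 48 entries per page (the spider further down reads the same two keys). A minimal standalone sketch of reading one page, with the tag value purely illustrative:

import requests

page = requests.get(
    "http://bandcamp.com/api/discover/3/get_web?t=ambient&s=top&p=0").json()
print(page.get("total_count"))         # total entries available for the tag
for entry in page.get("items", []):    # at most 48 entries per page
    print(entry.get("primary_text"))   # album title, per the mapping script below
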
@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item


class ArbitraryItem(Item):
    """Item that accepts any key: setting a value also registers the key as a
    field on the fly, so the schemaless discover-API JSON can be stored
    without declaring every field up front."""

    def __setitem__(self, key, value):
        self._values[key] = value
        self.fields[key] = {}

# class BandcampTag(scrapy.Item):
#     machine_name = scrapy.Field()
#     human_name = scrapy.Field()
#     albums = scrapy.Field()
#     total_albums = scrapy.Field()
#
#
# class BandcampAlbum(scrapy.Item):
#     field = scrapy.Field()

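A stock `Item` raises `KeyError` for undeclared fields; the override above registers the field on first assignment instead, which is what lets the spider build an item straight from an API entry. A quick sketch (values illustrative):

from scrapytest.items import ArbitraryItem

album = ArbitraryItem({"id": 42, "primary_text": "Some Album"})  # dict init goes through __setitem__
album["anything_else"] = "also fine"  # a declared-fields Item would raise KeyError here
print(dict(album))
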
@@ -0,0 +1,117 @@
# Copyright 2014 Michael Malocha <michael@knockrentals.com>
#
# Expanded from the work by Julien Duponchelle <julien@duponchelle.info>.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Elasticsearch pipeline for Scrapy, expanded with support for multiple items."""

from datetime import datetime
from elasticsearch import Elasticsearch, helpers
import logging
import hashlib
import types

from transportNTLM import TransportNTLM


class InvalidSettingsException(Exception):
    pass


class ElasticSearchPipeline(object):
    settings = None
    es = None
    items_buffer = []

    @classmethod
    def validate_settings(cls, settings):
        def validate_setting(setting_key):
            if settings[setting_key] is None:
                raise InvalidSettingsException('%s is not defined in settings.py' % setting_key)

        required_settings = {'ELASTICSEARCH_SERVERS', 'ELASTICSEARCH_INDEX', 'ELASTICSEARCH_TYPE'}

        for required_setting in required_settings:
            validate_setting(required_setting)

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        ext.settings = crawler.settings

        cls.validate_settings(ext.settings)

        es_servers = ext.settings['ELASTICSEARCH_SERVERS']
        es_servers = es_servers if isinstance(es_servers, list) else [es_servers]

        auth_type = ext.settings['ELASTICSEARCH_AUTH']
        if auth_type == 'NTLM':
            ext.es = Elasticsearch(hosts=es_servers,
                                   transport_class=TransportNTLM,
                                   ntlm_user=ext.settings['ELASTICSEARCH_USERNAME'],
                                   ntlm_pass=ext.settings['ELASTICSEARCH_PASSWORD'],
                                   timeout=ext.settings.get('ELASTICSEARCH_TIMEOUT', 60))
        else:
            ext.es = Elasticsearch(hosts=es_servers, timeout=ext.settings.get('ELASTICSEARCH_TIMEOUT', 60))
        return ext

    def get_unique_key(self, unique_key):
        if isinstance(unique_key, list):
            unique_key = unique_key[0]
        if not isinstance(unique_key, basestring):
            raise Exception('unique key must be str')
        # hashlib needs bytes, so encode unicode values before hashing
        if isinstance(unique_key, unicode):
            unique_key = unique_key.encode('utf-8')

        return unique_key

    def index_item(self, item):
        index_name = self.settings['ELASTICSEARCH_INDEX']
        index_suffix_format = self.settings.get('ELASTICSEARCH_INDEX_DATE_FORMAT', None)

        if index_suffix_format:
            index_name += "-" + datetime.strftime(datetime.now(), index_suffix_format)

        index_action = {
            '_index': index_name,
            '_type': self.settings['ELASTICSEARCH_TYPE'],
            '_source': dict(item)
        }

        if self.settings['ELASTICSEARCH_UNIQ_KEY'] is not None:
            item_unique_key = item[self.settings['ELASTICSEARCH_UNIQ_KEY']]
            unique_key = self.get_unique_key(item_unique_key)
            item_id = hashlib.sha1(unique_key).hexdigest()
            index_action['_id'] = item_id
            logging.debug('Generated unique key %s' % item_id)

        self.items_buffer.append(index_action)

        # flush in bulk once the buffer is full (>= guards against overshoot)
        if len(self.items_buffer) >= self.settings.get('ELASTICSEARCH_BUFFER_LENGTH', 500):
            self.send_items()
            self.items_buffer = []

    def send_items(self):
        helpers.bulk(self.es, self.items_buffer)

    def process_item(self, item, spider):
        # a spider callback may hand back a single item, a list, or a generator
        if isinstance(item, types.GeneratorType) or isinstance(item, list):
            for each in item:
                self.process_item(each, spider)
        else:
            self.index_item(item)
            logging.debug('Item sent to Elastic Search %s' % self.settings['ELASTICSEARCH_INDEX'])
        return item

    def close_spider(self, spider):
        # flush whatever is left in the buffer
        if len(self.items_buffer):
            self.send_items()

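The NTLM branch of `from_crawler` is driven purely by settings that the settings.py files in this repo never define. A sketch of what enabling it could look like (server credentials are placeholders, not values from this project):

# hypothetical settings.py additions for an NTLM-protected cluster
ELASTICSEARCH_AUTH = 'NTLM'
ELASTICSEARCH_USERNAME = 'DOMAIN\\user'       # placeholder
ELASTICSEARCH_PASSWORD = 'secret'             # placeholder
ELASTICSEARCH_TIMEOUT = 60                    # optional; 60 is the default above
ELASTICSEARCH_BUFFER_LENGTH = 500             # optional bulk-flush threshold
ELASTICSEARCH_INDEX_DATE_FORMAT = '%Y-%m-%d'  # optional dated index suffix
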
@@ -0,0 +1,100 @@
# -*- coding: utf-8 -*-

# Scrapy settings for scrapytest project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapytest'

SPIDER_MODULES = ['scrapytest.spiders']
NEWSPIDER_MODULE = 'scrapytest.spiders'

ITEM_PIPELINES = {
    'scrapytest.pipelines.ElasticSearchPipeline': 1
}

ELASTICSEARCH_SERVERS = ['localhost:9200']
ELASTICSEARCH_INDEX = 'scrapy2'
ELASTICSEARCH_TYPE = 'items'
ELASTICSEARCH_UNIQ_KEY = 'id'

LOG_ENABLED = True

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapytest (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'scrapytest.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'scrapytest.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'scrapytest.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
AUTOTHROTTLE_START_DELAY = 0.0
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60.0
# The average number of requests Scrapy should be sending in parallel to
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 20.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = True

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

@@ -0,0 +1,42 @@
import scrapy
import json
from math import ceil

from scrapytest.items import ArbitraryItem

# bandcamp.com ip address hardcoded to save dns lookups; brittle, and may need
# updating if the site's address changes
api_url_format = "http://151.101.65.28/api/discover/3/get_web?t=%s&s=top&p=%s"


class BandcampSpider(scrapy.Spider):
    name = "bandcamp"
    allowed_domains = ['bandcamp.com', '151.101.65.28']
    start_urls = [
        "http://bandcamp.com/tags"
    ]

    def parse(self, response):
        # one discover-API request per tag in the tag cloud, carrying both
        # the machine and human tag names along in meta
        for link in response.css("#tags_cloud > a"):
            machine_tag_name = link.css("::attr(href)").extract()[0].replace("/tag/", "")
            human_tag_name = link.xpath("text()").extract()[0]
            url = api_url_format % (machine_tag_name, 0)
            meta = {"machine": machine_tag_name, "human": human_tag_name}
            yield scrapy.Request(url, meta=meta, callback=self.parse_tag)

    def parse_tag(self, response):
        machine_tag_name = response.meta["machine"]
        json_obj = json.loads(response.body_as_unicode())
        # each page holds at most 48 entries; page 0 reports the total
        total_count = json_obj.get("total_count")
        num_pages = int(ceil(total_count / 48.0))
        # this response *is* page 0, so parse its items directly instead of
        # re-requesting the same URL, which the dupe filter would drop
        for album in self.parse_album(response):
            yield album
        for i in range(1, num_pages):
            url = api_url_format % (machine_tag_name, i)
            yield scrapy.Request(url, meta=response.meta, callback=self.parse_album)

    def parse_album(self, response):
        json_obj = json.loads(response.body_as_unicode())
        items = json_obj.get('items')
        for item in items:
            album = ArbitraryItem(item)
            album['id'] = str(album['id'])  # the pipeline's unique key must be a string
            album['machine_tag_name'] = response.meta['machine']
            album['human_tag_name'] = response.meta['human']
            yield album

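Each yielded `ArbitraryItem` is a raw discover-API entry plus the two tag fields set above, so the `_source` document the pipeline ships to Elasticsearch looks roughly like this (field names taken from the mapping script further down; values invented for illustration):

# illustrative _source document
{
    "id": "1234567890",               # stringified; its sha1 becomes the ES _id
    "primary_text": "Some Album",
    "secondary_text": "by Some Artist",
    "genre_text": "electronic",
    "machine_tag_name": "vaporwave",
    "human_tag_name": "vaporwave"
}
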
@@ -0,0 +1,358 @@
import time
from itertools import chain

from elasticsearch.connection import RequestsHttpConnection
from elasticsearch.connection_pool import ConnectionPool, DummyConnectionPool
from elasticsearch.serializer import JSONSerializer, Deserializer, DEFAULT_SERIALIZERS
from elasticsearch.exceptions import ConnectionError, TransportError, SerializationError, \
    ConnectionTimeout, ImproperlyConfigured
from requests_ntlm import HttpNtlmAuth


def get_host_info(node_info, host):
    """
    Simple callback that takes the node info from `/_cluster/nodes` and the
    parsed connection information and returns the connection information. If
    `None` is returned this node will be skipped.
    Useful for filtering nodes (by proximity for example) or if additional
    information needs to be provided for the :class:`~elasticsearch.Connection`
    class. By default master only nodes are filtered out since they shouldn't
    typically be used for API operations.
    :arg node_info: node information from `/_cluster/nodes`
    :arg host: connection information (host, port) extracted from the node info
    """
    attrs = node_info.get('attributes', {})

    # ignore master only nodes
    if (attrs.get('data', 'true') == 'false' and
            attrs.get('client', 'false') == 'false' and
            attrs.get('master', 'true') == 'true'):
        return None
    return host


class TransportNTLM(object):
    """
    Encapsulation of transport-related logic. Handles instantiation of the
    individual connections as well as creating a connection pool to hold them.
    Main interface is the `perform_request` method.
    """
    def __init__(self, hosts, connection_class=RequestsHttpConnection,
                 connection_pool_class=ConnectionPool, host_info_callback=get_host_info,
                 sniff_on_start=False, sniffer_timeout=None, sniff_timeout=.1,
                 sniff_on_connection_fail=False, serializer=JSONSerializer(), serializers=None,
                 default_mimetype='application/json', max_retries=3, retry_on_status=(503, 504, ),
                 retry_on_timeout=False, send_get_body_as='GET', **kwargs):
        """
        :arg hosts: list of dictionaries, each containing keyword arguments to
            create a `connection_class` instance
        :arg connection_class: subclass of :class:`~elasticsearch.Connection` to use
        :arg connection_pool_class: subclass of :class:`~elasticsearch.ConnectionPool` to use
        :arg host_info_callback: callback responsible for taking the node information from
            `/_cluster/nodes`, along with already extracted information, and
            producing a list of arguments (same as `hosts` parameter)
        :arg sniff_on_start: flag indicating whether to obtain a list of nodes
            from the cluster at startup time
        :arg sniffer_timeout: number of seconds between automatic sniffs
        :arg sniff_on_connection_fail: flag controlling if connection failure triggers a sniff
        :arg sniff_timeout: timeout used for the sniff request - it should be a
            fast api call and we are talking potentially to more nodes so we want
            to fail quickly. Not used during initial sniffing (if
            ``sniff_on_start`` is on) when the connection still isn't
            initialized.
        :arg serializer: serializer instance
        :arg serializers: optional dict of serializer instances that will be
            used for deserializing data coming from the server. (key is the mimetype)
        :arg default_mimetype: when no mimetype is specified by the server
            response assume this mimetype, defaults to `'application/json'`
        :arg max_retries: maximum number of retries before an exception is propagated
        :arg retry_on_status: set of HTTP status codes on which we should retry
            on a different node. defaults to ``(503, 504, )``
        :arg retry_on_timeout: should timeout trigger a retry on different
            node? (default `False`)
        :arg send_get_body_as: for GET requests with body this option allows
            you to specify an alternate way of execution for environments that
            don't support passing bodies with GET requests. If you set this to
            'POST' a POST method will be used instead, if to 'source' then the body
            will be serialized and passed as a query parameter `source`.
        Any extra keyword arguments will be passed to the `connection_class`
        when creating an instance unless overridden by that connection's
        options provided as part of the hosts parameter.
        """

        # serialization config
        _serializers = DEFAULT_SERIALIZERS.copy()
        # if a serializer has been specified, use it for deserialization as well
        _serializers[serializer.mimetype] = serializer
        # if custom serializers map has been supplied, override the defaults with it
        if serializers:
            _serializers.update(serializers)
        # create a deserializer with our config
        self.deserializer = Deserializer(_serializers, default_mimetype)

        self.max_retries = max_retries
        self.retry_on_timeout = retry_on_timeout
        self.retry_on_status = retry_on_status
        self.send_get_body_as = send_get_body_as

        # data serializer
        self.serializer = serializer

        # store all strategies...
        self.connection_pool_class = connection_pool_class
        self.connection_class = connection_class

        # ...save kwargs to be passed to the connections
        self.kwargs = kwargs
        self.hosts = hosts

        # ...and instantiate them
        self.set_connections(hosts)
        # retain the original connection instances for sniffing
        self.seed_connections = self.connection_pool.connections[:]

        # sniffing data
        self.sniffer_timeout = sniffer_timeout
        self.sniff_on_connection_fail = sniff_on_connection_fail
        self.last_sniff = time.time()
        self.sniff_timeout = sniff_timeout

        # callback to construct host dict from data in /_cluster/nodes
        self.host_info_callback = host_info_callback

        if sniff_on_start:
            self.sniff_hosts(True)

    def add_connection(self, host):
        """
        Create a new :class:`~elasticsearch.Connection` instance and add it to the pool.
        :arg host: kwargs that will be used to create the instance
        """
        self.hosts.append(host)
        self.set_connections(self.hosts)

    def set_connections(self, hosts):
        """
        Instantiate all the connections and create a new connection pool to hold
        them. Tries to identify unchanged hosts and re-use existing
        :class:`~elasticsearch.Connection` instances.
        :arg hosts: same as `__init__`
        """
        # construct the connections
        def _create_connection(host):
            # if this is not the initial setup look at the existing connection
            # options and identify connections that haven't changed and can be
            # kept around.
            if hasattr(self, 'connection_pool'):
                for (connection, old_host) in self.connection_pool.connection_opts:
                    if old_host == host:
                        return connection

            # previously unseen params, create new connection
            kwargs = self.kwargs.copy()
            kwargs.update(host)

            if 'scheme' in host and host['scheme'] != self.connection_class.transport_schema:
                raise ImproperlyConfigured(
                    'Scheme specified in connection (%s) is not the same as the connection class (%s) specifies (%s).' % (
                        host['scheme'], self.connection_class.__name__, self.connection_class.transport_schema
                    ))
            # the NTLM twist: every connection authenticates with an
            # HttpNtlmAuth instance built from the ntlm_user/ntlm_pass kwargs
            ntlm_auth = HttpNtlmAuth(kwargs.get('ntlm_user', ''), kwargs.get('ntlm_pass', ''))
            return self.connection_class(http_auth=ntlm_auth, **kwargs)
        connections = map(_create_connection, hosts)

        connections = list(zip(connections, hosts))
        if len(connections) == 1:
            self.connection_pool = DummyConnectionPool(connections)
        else:
            # pass the hosts dicts to the connection pool to optionally extract parameters from
            self.connection_pool = self.connection_pool_class(connections, **self.kwargs)

    def get_connection(self):
        """
        Retrieve a :class:`~elasticsearch.Connection` instance from the
        :class:`~elasticsearch.ConnectionPool` instance.
        """
        if self.sniffer_timeout:
            if time.time() >= self.last_sniff + self.sniffer_timeout:
                self.sniff_hosts()
        return self.connection_pool.get_connection()

    def _get_sniff_data(self, initial=False):
        """
        Perform the request to get sniffing information. Returns a list of
        dictionaries (one per node) containing all the information from the
        cluster.
        It also sets the last_sniff attribute in case of a successful attempt.
        In rare cases it might be possible to override this method in your
        custom Transport class to serve data from an alternative source like
        configuration management.
        """
        previous_sniff = self.last_sniff

        try:
            # reset last_sniff timestamp
            self.last_sniff = time.time()
            # go through all current connections as well as the
            # seed_connections for good measure
            for c in chain(self.connection_pool.connections, self.seed_connections):
                try:
                    # use small timeout for the sniffing request, should be a fast api call
                    _, headers, node_info = c.perform_request('GET', '/_nodes/_all/clear',
                        timeout=self.sniff_timeout if not initial else None)
                    node_info = self.deserializer.loads(node_info, headers.get('content-type'))
                    break
                except (ConnectionError, SerializationError):
                    pass
            else:
                raise TransportError("N/A", "Unable to sniff hosts.")
        except:
            # keep the previous value on error
            self.last_sniff = previous_sniff
            raise

        return list(node_info['nodes'].values())

    def sniff_hosts(self, initial=False):
        """
        Obtain a list of nodes from the cluster and create a new connection
        pool using the information retrieved.
        To extract the node connection parameters use the ``nodes_to_host_callback``.
        :arg initial: flag indicating if this is during startup
            (``sniff_on_start``), ignore the ``sniff_timeout`` if ``True``
        """
        node_info = self._get_sniff_data(initial)

        hosts = []
        address_key = self.connection_class.transport_schema + '_address'
        for n in node_info:
            host = {}
            address = n.get(address_key, '')
            if '/' in address:
                host['host'], address = address.split('/', 1)

            # malformed address
            if ':' not in address:
                continue

            ip, port = address.rsplit(':', 1)

            # use the ip if not overridden by publish_host
            host.setdefault('host', ip)
            host['port'] = int(port)

            host = self.host_info_callback(n, host)
            if host is not None:
                hosts.append(host)

        # we weren't able to get any nodes, maybe using an incompatible
        # transport_schema or host_info_callback blocked all - raise error.
        if not hosts:
            raise TransportError("N/A", "Unable to sniff hosts - no viable hosts found.")

        self.set_connections(hosts)

    def mark_dead(self, connection):
        """
        Mark a connection as dead (failed) in the connection pool. If sniffing
        on failure is enabled this will initiate the sniffing process.
        :arg connection: instance of :class:`~elasticsearch.Connection` that failed
        """
        # mark as dead even when sniffing to avoid hitting this host during the sniff process
        self.connection_pool.mark_dead(connection)
        if self.sniff_on_connection_fail:
            self.sniff_hosts()

    def perform_request(self, method, url, params=None, body=None):
        """
        Perform the actual request. Retrieve a connection from the connection
        pool, pass all the information to its perform_request method and
        return the data.
        If an exception was raised, mark the connection as failed and retry (up
        to `max_retries` times).
        If the operation was successful and the connection used was previously
        marked as dead, mark it as live, resetting its failure count.
        :arg method: HTTP method to use
        :arg url: absolute url (without host) to target
        :arg params: dictionary of query parameters, will be handed over to the
            underlying :class:`~elasticsearch.Connection` class for serialization
        :arg body: body of the request, will be serialized using the serializer and
            passed to the connection
        """
        if body is not None:
            body = self.serializer.dumps(body)

            # some clients or environments don't support sending GET with body
            if method in ('HEAD', 'GET') and self.send_get_body_as != 'GET':
                # send it as post instead
                if self.send_get_body_as == 'POST':
                    method = 'POST'

                # or as source parameter
                elif self.send_get_body_as == 'source':
                    if params is None:
                        params = {}
                    params['source'] = body
                    body = None

        if body is not None:
            try:
                body = body.encode('utf-8')
            except (UnicodeDecodeError, AttributeError):
                # bytes/str - no need to re-encode
                pass

        ignore = ()
        timeout = None
        if params:
            timeout = params.pop('request_timeout', None)
            ignore = params.pop('ignore', ())
            if isinstance(ignore, int):
                ignore = (ignore, )

        for attempt in range(self.max_retries + 1):
            connection = self.get_connection()

            try:
                status, headers, data = connection.perform_request(method, url, params, body, ignore=ignore, timeout=timeout)
            except TransportError as e:
                retry = False
                if isinstance(e, ConnectionTimeout):
                    retry = self.retry_on_timeout
                elif isinstance(e, ConnectionError):
                    retry = True
                elif e.status_code in self.retry_on_status:
                    retry = True

                if retry:
                    # only mark as dead if we are retrying
                    self.mark_dead(connection)
                    # raise exception on last retry
                    if attempt == self.max_retries:
                        raise
                else:
                    raise
            else:
                # connection didn't fail, confirm its live status
                self.connection_pool.mark_live(connection)
                if data:
                    data = self.deserializer.loads(data, headers.get('content-type'))
                return status, data

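`Elasticsearch(...)` forwards unrecognized keyword arguments to its transport class, which is how the pipeline above hands `ntlm_user`/`ntlm_pass` through. A minimal standalone sketch (host and credentials are placeholders):

from elasticsearch import Elasticsearch
from transportNTLM import TransportNTLM

es = Elasticsearch(hosts=['es.example.internal:9200'],  # placeholder host
                   transport_class=TransportNTLM,
                   ntlm_user='DOMAIN\\user',            # placeholder credentials
                   ntlm_pass='secret',
                   timeout=60)
print(es.info())  # any client call now authenticates via NTLM
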
@@ -0,0 +1,10 @@
Metadata-Version: 1.0
Name: project
Version: 1.0
Summary: UNKNOWN
Home-page: UNKNOWN
Author: UNKNOWN
Author-email: UNKNOWN
License: UNKNOWN
Description: UNKNOWN
Platform: UNKNOWN

@@ -0,0 +1,14 @@
setup.py
project.egg-info/PKG-INFO
project.egg-info/SOURCES.txt
project.egg-info/dependency_links.txt
project.egg-info/entry_points.txt
project.egg-info/top_level.txt
scrapytest/__init__.py
scrapytest/bandcamp-tagscrape.py
scrapytest/items.py
scrapytest/pipelines.py
scrapytest/settings.py
scrapytest/transportNTLM.py
scrapytest/spiders/__init__.py
scrapytest/spiders/bandcamp_spider.py

@@ -0,0 +1 @@

@@ -0,0 +1,3 @@
[scrapy]
settings = scrapytest.settings

@@ -0,0 +1 @@
scrapytest

@@ -0,0 +1,2 @@
projects:
  default: 73419

@@ -0,0 +1,17 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = scrapytest.settings

[deploy]
url = http://localhost:6800/
project = scrapytest

[scrapyd]
eggs_dir = scrapy_dir/eggs
logs_dir = scrapy_dir/logs
items_dir = scrapy_dir/items
dbs_dir = scrapy_dir/dbs

@@ -0,0 +1,100 @@
# The default ``config.py``
# flake8: noqa


def set_prefs(prefs):
    """This function is called before opening the project"""

    # Specify which files and folders to ignore in the project.
    # Changes to ignored resources are not added to the history and
    # VCSs. Also they are not returned in `Project.get_files()`.
    # Note that ``?`` and ``*`` match all characters but slashes.
    # '*.pyc': matches 'test.pyc' and 'pkg/test.pyc'
    # 'mod*.pyc': matches 'test/mod1.pyc' but not 'mod/1.pyc'
    # '.svn': matches 'pkg/.svn' and all of its children
    # 'build/*.o': matches 'build/lib.o' but not 'build/sub/lib.o'
    # 'build//*.o': matches 'build/lib.o' and 'build/sub/lib.o'
    prefs['ignored_resources'] = ['*.pyc', '*~', '.ropeproject',
                                  '.hg', '.svn', '_svn', '.git', '.tox']

    # Specifies which files should be considered python files. It is
    # useful when you have scripts inside your project. Only files
    # ending with ``.py`` are considered to be python files by
    # default.
    #prefs['python_files'] = ['*.py']

    # Custom source folders: By default rope searches the project
    # for finding source folders (folders that should be searched
    # for finding modules). You can add paths to that list. Note
    # that rope guesses project source folders correctly most of the
    # time; use this if you have any problems.
    # The folders should be relative to project root and use '/' for
    # separating folders regardless of the platform rope is running on.
    # 'src/my_source_folder' for instance.
    #prefs.add('source_folders', 'src')

    # You can extend python path for looking up modules
    #prefs.add('python_path', '~/python/')

    # Should rope save object information or not.
    prefs['save_objectdb'] = True
    prefs['compress_objectdb'] = False

    # If `True`, rope analyzes each module when it is being saved.
    prefs['automatic_soa'] = True
    # The depth of calls to follow in static object analysis
    prefs['soa_followed_calls'] = 0

    # If `False` when running modules or unit tests "dynamic object
    # analysis" is turned off. This makes them much faster.
    prefs['perform_doa'] = True

    # Rope can check the validity of its object DB when running.
    prefs['validate_objectdb'] = True

    # How many undos to hold?
    prefs['max_history_items'] = 32

    # Shows whether to save history across sessions.
    prefs['save_history'] = True
    prefs['compress_history'] = False

    # Set the number spaces used for indenting. According to
    # :PEP:`8`, it is best to use 4 spaces. Since most of rope's
    # unit-tests use 4 spaces it is more reliable, too.
    prefs['indent_size'] = 4

    # Builtin and c-extension modules that are allowed to be imported
    # and inspected by rope.
    prefs['extension_modules'] = []

    # Add all standard c-extensions to extension_modules list.
    prefs['import_dynload_stdmods'] = True

    # If `True` modules with syntax errors are considered to be empty.
    # The default value is `False`; When `False` syntax errors raise
    # `rope.base.exceptions.ModuleSyntaxError` exception.
    prefs['ignore_syntax_errors'] = False

    # If `True`, rope ignores unresolvable imports. Otherwise, they
    # appear in the importing namespace.
    prefs['ignore_bad_imports'] = False

    # If `True`, rope will insert new module imports as
    # `from <package> import <module>` by default.
    prefs['prefer_module_from_imports'] = False

    # If `True`, rope will transform a comma list of imports into
    # multiple separate import statements when organizing
    # imports.
    prefs['split_imports'] = False

    # If `True`, rope will sort imports alphabetically by module name
    # instead of alphabetically by import statement, with from imports
    # after normal imports.
    prefs['sort_imports_alphabetically'] = False


def project_opened(project):
    """This function is called after opening the project"""
    # Do whatever you like here!

@@ -0,0 +1,44 @@
from scrapy.conf import settings
from scrapy.downloadermiddlewares.retry import RetryMiddleware
import telnetlib
import logging
import time

logger = logging.getLogger(__name__)


class ProxyMiddleware(object):

    # Override request processing so every request goes through the Tor proxy
    def process_request(self, request, spider):
        request.meta['proxy'] = settings.get('HTTP_PROXY')


class RetryChangeProxyMiddleware(RetryMiddleware):
    """Retry middleware that also asks Tor for a fresh circuit (and hence a
    new exit IP) via its control port, at most once every `timelimit` seconds."""

    last = 0
    timelimit = 15

    def _retry(self, request, reason, spider):
        if isinstance(reason, basestring):
            logger.debug('Valid retry, reason: ' + reason + ' for URL ' + request.url)
            t = time.time()
            diff = t - RetryChangeProxyMiddleware.last
            if diff > RetryChangeProxyMiddleware.timelimit:
                # talk to the Tor control port and request a new identity
                tn = telnetlib.Telnet('127.0.0.1', 9051)
                tn.read_until("Escape character is '^]'.", 2)
                tn.write('AUTHENTICATE "267765"\r\n')
                tn.read_until("250 OK", 2)
                tn.write("signal NEWNYM\r\n")
                tn.read_until("250 OK", 2)
                tn.write("quit\r\n")
                tn.close()
                #time.sleep(3)
                RetryChangeProxyMiddleware.last = t
                logger.info('Proxy changed! New last: %s' % time.strftime("%H:%M:%S"))
            else:
                logger.debug('Proxy not changed! Time difference is %s seconds' % ("{:.2f}".format(diff)))
        return RetryMiddleware._retry(self, request, reason, spider)

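`ProxyMiddleware` reads `HTTP_PROXY` from the settings, but the settings file below never sets it. The missing piece would look something like this, assuming Tor is fronted by an HTTP proxy such as Privoxy (the port is an assumption; nothing in this repo pins it down):

# assumed settings.py addition: HTTP front-end for the local Tor daemon
HTTP_PROXY = 'http://127.0.0.1:8118'
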
@@ -0,0 +1,23 @@
attrs==15.2.0
cffi==1.5.2
cryptography==1.3.1
cssselect==0.9.1
elasticsearch==2.3.0
enum34==1.1.3
idna==2.1
ipaddress==1.0.16
lxml==3.6.0
pyasn1==0.1.9
pyasn1-modules==0.0.8
pycparser==2.14
pyOpenSSL==16.0.0
queuelib==1.4.2
requests==2.10.0
requests-ntlm==0.2.0
Scrapy==1.0.5
service-identity==16.0.0
six==1.10.0
Twisted==16.1.1
urllib3==1.15.1
w3lib==1.14.2
zope.interface==4.1.3

@@ -0,0 +1,113 @@
#!/usr/bin/env bash
curl -XPUT "http://localhost:9200/scrapy4" -d '
{
  "mappings" : {
    "items" : {
      "dynamic": "true",
      "properties" : {
        "art_id" : {
          "type" : "long"
        },
        "band_id" : {
          "type" : "long"
        },
        "bio_image" : {
          "properties" : {
            "height" : {
              "type" : "long"
            },
            "image_id" : {
              "type" : "long"
            },
            "width" : {
              "type" : "long"
            }
          }
        },
        "category" : {
          "type" : "string"
        },
        "featured_track" : {
          "properties" : {
            "duration" : {
              "type" : "double"
            },
            "encodings_id" : {
              "type" : "long"
            },
            "file" : {
              "properties" : {
                "mp3-128" : {
                  "type" : "string",
                  "index": "not_analyzed"
                }
              }
            },
            "id" : {
              "type" : "long"
            },
            "title" : {
              "type" : "string"
            }
          }
        },
        "genre_text" : {
          "type" : "string"
        },
        "human_tag_name" : {
          "type" : "string",
          "index" : "not_analyzed"
        },
        "is_preorder" : {
          "type" : "long"
        },
        "item_type_id" : {
          "type" : "string"
        },
        "location_text" : {
          "type" : "string"
        },
        "machine_tag_name" : {
          "type" : "string",
          "index" : "not_analyzed"
        },
        "primary_text" : {
          "type" : "string"
        },
        "publish_date" : {
          "type" : "string",
          "index": "not_analyzed"
        },
        "score" : {
          "type" : "double"
        },
        "secondary_text" : {
          "type" : "string"
        },
        "type" : {
          "type" : "string"
        },
        "url_hints" : {
          "properties" : {
            "custom_domain" : {
              "type" : "string"
            },
            "custom_domain_verified" : {
              "type" : "long"
            },
            "item_type" : {
              "type" : "string"
            },
            "slug" : {
              "type" : "string"
            },
            "subdomain" : {
              "type" : "string"
            }
          }
        }
      }
    }
  }
}'

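With the mapping in place and the crawl done, the `not_analyzed` tag fields support exact-match term queries. A quick sketch against the index created above (tag value illustrative):

import json
import requests

query = {"query": {"term": {"machine_tag_name": "vaporwave"}}, "size": 5}
resp = requests.post("http://localhost:9200/scrapy4/items/_search",
                     data=json.dumps(query))
for hit in resp.json()["hits"]["hits"]:
    print(hit["_source"]["primary_text"])
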
@@ -0,0 +1,102 @@
# -*- coding: utf-8 -*-

# Scrapy settings for scrapytest project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapytest'

SPIDER_MODULES = ['scrapytest.spiders']
NEWSPIDER_MODULE = 'scrapytest.spiders'

ITEM_PIPELINES = {
    'scrapytest.pipelines.ElasticSearchPipeline': 1
}

ELASTICSEARCH_SERVERS = ['localhost:9200']
ELASTICSEARCH_INDEX = 'scrapy4'
ELASTICSEARCH_TYPE = 'items'
ELASTICSEARCH_UNIQ_KEY = 'id'

LOG_ENABLED = True
LOG_LEVEL = 'INFO'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapytest (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'scrapytest.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'scrapytest.middlewares.ProxyMiddleware': 543,
    'scrapytest.middlewares.RetryChangeProxyMiddleware': 600
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'scrapytest.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 2.0
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60.0
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 5.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = True

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

@@ -0,0 +1,96 @@
# The default ``config.py``


def set_prefs(prefs):
    """This function is called before opening the project"""

    # Specify which files and folders to ignore in the project.
    # Changes to ignored resources are not added to the history and
    # VCSs. Also they are not returned in `Project.get_files()`.
    # Note that ``?`` and ``*`` match all characters but slashes.
    # '*.pyc': matches 'test.pyc' and 'pkg/test.pyc'
    # 'mod*.pyc': matches 'test/mod1.pyc' but not 'mod/1.pyc'
    # '.svn': matches 'pkg/.svn' and all of its children
    # 'build/*.o': matches 'build/lib.o' but not 'build/sub/lib.o'
    # 'build//*.o': matches 'build/lib.o' and 'build/sub/lib.o'
    prefs['ignored_resources'] = [
        '*.pyc', '*~', '.ropeproject', '.hg', '.svn', '_svn', '.git',
        '.tox', '.env', 'node_modules', 'bower_components']

    # Specifies which files should be considered python files. It is
    # useful when you have scripts inside your project. Only files
    # ending with ``.py`` are considered to be python files by
    # default.
    #prefs['python_files'] = ['*.py']

    # Custom source folders: By default rope searches the project
    # for finding source folders (folders that should be searched
    # for finding modules). You can add paths to that list. Note
    # that rope guesses project source folders correctly most of the
    # time; use this if you have any problems.
    # The folders should be relative to project root and use '/' for
    # separating folders regardless of the platform rope is running on.
    # 'src/my_source_folder' for instance.
    #prefs.add('source_folders', 'src')

    # You can extend python path for looking up modules
    #prefs.add('python_path', '~/python/')

    # Should rope save object information or not.
    prefs['save_objectdb'] = True
    prefs['compress_objectdb'] = False

    # If `True`, rope analyzes each module when it is being saved.
    prefs['automatic_soa'] = True
    # The depth of calls to follow in static object analysis
    prefs['soa_followed_calls'] = 0

    # If `False` when running modules or unit tests "dynamic object
    # analysis" is turned off. This makes them much faster.
    prefs['perform_doa'] = True

    # Rope can check the validity of its object DB when running.
    prefs['validate_objectdb'] = True

    # How many undos to hold?
    prefs['max_history_items'] = 32

    # Shows whether to save history across sessions.
    prefs['save_history'] = True
    prefs['compress_history'] = False

    # Set the number spaces used for indenting. According to
    # :PEP:`8`, it is best to use 4 spaces. Since most of rope's
    # unit-tests use 4 spaces it is more reliable, too.
    prefs['indent_size'] = 4

    # Builtin and c-extension modules that are allowed to be imported
    # and inspected by rope.
    prefs['extension_modules'] = []

    # Add all standard c-extensions to extension_modules list.
    prefs['import_dynload_stdmods'] = True

    # If `True` modules with syntax errors are considered to be empty.
    # The default value is `False`; When `False` syntax errors raise
    # `rope.base.exceptions.ModuleSyntaxError` exception.
    prefs['ignore_syntax_errors'] = False

    # If `True`, rope ignores unresolvable imports. Otherwise, they
    # appear in the importing namespace.
    prefs['ignore_bad_imports'] = False

    # If `True`, rope will transform a comma list of imports into
    # multiple separate import statements when organizing
    # imports.
    prefs['split_imports'] = False

    # If `True`, rope will sort imports alphabetically by module name
    # instead of alphabetically by import statement, with from imports
    # after normal imports.
    prefs['sort_imports_alphabetically'] = False


def project_opened(project):
    """This function is called after opening the project"""
    # Do whatever you like here!

| @@ -0,0 +1,4 @@ | ||
| # This package will contain the spiders of your Scrapy project | ||
| # | ||
| # Please refer to the documentation for information on how to create and manage | ||
| # your spiders. |
| @@ -0,0 +1,42 @@ | ||
| import scrapy | ||
| import json | ||
| from math import ceil | ||
|
|
||
| from scrapytest.items import ArbitraryItem | ||
|
|
||
| api_url_format = "http://151.101.65.28/api/discover/3/get_web?t=%s&s=top&p=%s" | ||
| # bandcamp.com ip address to save dns lookups | ||
|
|
||
| class BandcampSpider(scrapy.Spider): | ||
| name = "bandcamp" | ||
| allowed_domains = ['bandcamp.com', '151.101.65.28'] | ||
| start_urls = [ | ||
| "http://bandcamp.com/tags" | ||
| ] | ||
|
|
||
| def parse(self, response): | ||
| for link in response.css("#tags_cloud > a"): | ||
| machine_tag_name = link.css("::attr('href')").extract()[0].replace("/tag/", "") | ||
| human_tag_name = link.xpath("text()").extract() | ||
| url = api_url_format % (machine_tag_name, 0) | ||
| meta = {"machine": machine_tag_name, "human": human_tag_name} | ||
| yield scrapy.Request(url, meta=meta, callback=self.parse_tag) | ||
|
|
||
| def parse_tag(self, response): | ||
| machine_tag_name = response.meta["machine"] | ||
| json_obj = json.loads(response.body_as_unicode()) | ||
| total_count = json_obj.get("total_count") | ||
| num_pages = int(ceil(total_count/48.0)) | ||
| for i in range(num_pages): | ||
| url = api_url_format % (machine_tag_name, i) | ||
| yield scrapy.Request(url, meta=response.meta, callback=self.parse_album) | ||
|
|
||
| def parse_album(self, response): | ||
| json_obj = json.loads(response.body_as_unicode()) | ||
| items = json_obj.get('items') | ||
| for item in items: | ||
| album = ArbitraryItem(item) | ||
| album['id'] = str(album['id']) | ||
| album['machine_tag_name'] = response.meta['machine'] | ||
| album['human_tag_name'] = response.meta['human'] | ||
| yield album |
| @@ -0,0 +1,358 @@ | ||
| import time | ||
| from itertools import chain | ||
|
|
||
| from elasticsearch.connection import RequestsHttpConnection | ||
| from elasticsearch.connection_pool import ConnectionPool, DummyConnectionPool | ||
| from elasticsearch.serializer import JSONSerializer, Deserializer, DEFAULT_SERIALIZERS | ||
| from elasticsearch.exceptions import ConnectionError, TransportError, SerializationError, \ | ||
| ConnectionTimeout, ImproperlyConfigured | ||
| import requests | ||
| from requests_ntlm import HttpNtlmAuth | ||
|
|
||
| def get_host_info(node_info, host): | ||
| """ | ||
| Simple callback that takes the node info from `/_cluster/nodes` and a | ||
| parsed connection information and return the connection information. If | ||
| `None` is returned this node will be skipped. | ||
| Useful for filtering nodes (by proximity for example) or if additional | ||
| information needs to be provided for the :class:`~elasticsearch.Connection` | ||
| class. By default master only nodes are filtered out since they shouldn't | ||
| typically be used for API operations. | ||
| :arg node_info: node information from `/_cluster/nodes` | ||
| :arg host: connection information (host, port) extracted from the node info | ||
| """ | ||
| attrs = node_info.get('attributes', {}) | ||
|
|
||
| # ignore master only nodes | ||
| if (attrs.get('data', 'true') == 'false' and | ||
| attrs.get('client', 'false') == 'false' and | ||
| attrs.get('master', 'true') == 'true'): | ||
| return None | ||
| return host | ||
|
|
||
| class TransportNTLM(object): | ||
| """ | ||
| Encapsulation of transport-related to logic. Handles instantiation of the | ||
| individual connections as well as creating a connection pool to hold them. | ||
| Main interface is the `perform_request` method. | ||
| """ | ||
| def __init__(self, hosts, connection_class=RequestsHttpConnection, | ||
| connection_pool_class=ConnectionPool, host_info_callback=get_host_info, | ||
| sniff_on_start=False, sniffer_timeout=None, sniff_timeout=.1, | ||
| sniff_on_connection_fail=False, serializer=JSONSerializer(), serializers=None, | ||
| default_mimetype='application/json', max_retries=3, retry_on_status=(503, 504, ), | ||
| retry_on_timeout=False, send_get_body_as='GET', **kwargs): | ||
| """ | ||
| :arg hosts: list of dictionaries, each containing keyword arguments to | ||
| create a `connection_class` instance | ||
| :arg connection_class: subclass of :class:`~elasticsearch.Connection` to use | ||
| :arg connection_pool_class: subclass of :class:`~elasticsearch.ConnectionPool` to use | ||
| :arg host_info_callback: callback responsible for taking the node information from | ||
| `/_cluser/nodes`, along with already extracted information, and | ||
| producing a list of arguments (same as `hosts` parameter) | ||
| :arg sniff_on_start: flag indicating whether to obtain a list of nodes | ||
| from the cluser at startup time | ||
| :arg sniffer_timeout: number of seconds between automatic sniffs | ||
| :arg sniff_on_connection_fail: flag controlling if connection failure triggers a sniff | ||
| :arg sniff_timeout: timeout used for the sniff request - it should be a | ||
| fast API call, and we are potentially talking to more nodes, so we want | ||
| to fail quickly. Not used during initial sniffing (if | ||
| ``sniff_on_start`` is on) when the connection still isn't | ||
| initialized. | ||
| :arg serializer: serializer instance | ||
| :arg serializers: optional dict of serializer instances that will be | ||
| used for deserializing data coming from the server. (key is the mimetype) | ||
| :arg default_mimetype: when no mimetype is specified by the server | ||
| response assume this mimetype, defaults to `'application/json'` | ||
| :arg max_retries: maximum number of retries before an exception is propagated | ||
| :arg retry_on_status: set of HTTP status codes on which we should retry | ||
| on a different node. defaults to ``(503, 504, )`` | ||
| :arg retry_on_timeout: should timeout trigger a retry on different | ||
| node? (default `False`) | ||
| :arg send_get_body_as: for GET requests with body this option allows | ||
| you to specify an alternate way of execution for environments that | ||
| don't support passing bodies with GET requests. If you set this to | ||
| 'POST', a POST method will be used instead; if set to 'source', the body | ||
| will be serialized and passed as the query parameter `source`. | ||
| Any extra keyword arguments will be passed to the `connection_class` | ||
| when creating an instance, unless overridden by that connection's | ||
| options provided as part of the hosts parameter. | ||
| """ | ||
|
|
||
| # serialization config | ||
| _serializers = DEFAULT_SERIALIZERS.copy() | ||
| # if a serializer has been specified, use it for deserialization as well | ||
| _serializers[serializer.mimetype] = serializer | ||
| # if custom serializers map has been supplied, override the defaults with it | ||
| if serializers: | ||
| _serializers.update(serializers) | ||
| # create a deserializer with our config | ||
| self.deserializer = Deserializer(_serializers, default_mimetype) | ||
|
|
||
| self.max_retries = max_retries | ||
| self.retry_on_timeout = retry_on_timeout | ||
| self.retry_on_status = retry_on_status | ||
| self.send_get_body_as = send_get_body_as | ||
|
|
||
| # data serializer | ||
| self.serializer = serializer | ||
|
|
||
| # store all strategies... | ||
| self.connection_pool_class = connection_pool_class | ||
| self.connection_class = connection_class | ||
|
|
||
| # ...save kwargs to be passed to the connections | ||
| self.kwargs = kwargs | ||
| self.hosts = hosts | ||
|
|
||
| # ...and instantiate them | ||
| self.set_connections(hosts) | ||
| # retain the original connection instances for sniffing | ||
| self.seed_connections = self.connection_pool.connections[:] | ||
|
|
||
| # sniffing data | ||
| self.sniffer_timeout = sniffer_timeout | ||
| self.sniff_on_connection_fail = sniff_on_connection_fail | ||
| self.last_sniff = time.time() | ||
| self.sniff_timeout = sniff_timeout | ||
|
|
||
| # callback to construct host dict from data in /_cluster/nodes | ||
| self.host_info_callback = host_info_callback | ||
|
|
||
| if sniff_on_start: | ||
| self.sniff_hosts(True) | ||
|
|
||
|
|
||
| def add_connection(self, host): | ||
| """ | ||
| Create a new :class:`~elasticsearch.Connection` instance and add it to the pool. | ||
| :arg host: kwargs that will be used to create the instance | ||
| """ | ||
| self.hosts.append(host) | ||
| self.set_connections(self.hosts) | ||
|
|
||
| def set_connections(self, hosts): | ||
| """ | ||
| Instantiate all the connections and create a new connection pool to hold | ||
| them. Tries to identify unchanged hosts and re-use existing | ||
| :class:`~elasticsearch.Connection` instances. | ||
| :arg hosts: same as `__init__` | ||
| """ | ||
| # construct the connections | ||
| def _create_connection(host): | ||
| # if this is not the initial setup, look at the existing connection | ||
| # options and identify connections that haven't changed and can be | ||
| # kept around. | ||
| if hasattr(self, 'connection_pool'): | ||
| for (connection, old_host) in self.connection_pool.connection_opts: | ||
| if old_host == host: | ||
| return connection | ||
|
|
||
| # previously unseen params, create new connection | ||
| kwargs = self.kwargs.copy() | ||
| kwargs.update(host) | ||
|
|
||
| if 'scheme' in host and host['scheme'] != self.connection_class.transport_schema: | ||
| raise ImproperlyConfigured( | ||
| 'Scheme specified in connection (%s) is not the same as the connection class (%s) specifies (%s).' % ( | ||
| host['scheme'], self.connection_class.__name__, self.connection_class.transport_schema | ||
| )) | ||
| # consume the NTLM credentials here so they aren't forwarded to the connection class | ||
| ntlm_auth = HttpNtlmAuth(kwargs.pop('ntlm_user', ''), kwargs.pop('ntlm_pass', '')) | ||
| return self.connection_class(http_auth=ntlm_auth, **kwargs) | ||
| connections = map(_create_connection, hosts) | ||
|
|
||
| connections = list(zip(connections, hosts)) | ||
| if len(connections) == 1: | ||
| self.connection_pool = DummyConnectionPool(connections) | ||
| else: | ||
| # pass the hosts dicts to the connection pool to optionally extract parameters from | ||
| self.connection_pool = self.connection_pool_class(connections, **self.kwargs) | ||
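|
| # Design note: with a single host, DummyConnectionPool skips the round-robin | ||
| # and dead-marking machinery and always hands back that one connection. | ||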
|
|
||
| def get_connection(self): | ||
| """ | ||
| Retrieve a :class:`~elasticsearch.Connection` instance from the | ||
| :class:`~elasticsearch.ConnectionPool` instance. | ||
| """ | ||
| if self.sniffer_timeout: | ||
| if time.time() >= self.last_sniff + self.sniffer_timeout: | ||
| self.sniff_hosts() | ||
| return self.connection_pool.get_connection() | ||
|
|
||
| def _get_sniff_data(self, initial=False): | ||
| """ | ||
| Perform the request to get sniffing information. Returns a list of | ||
| dictionaries (one per node) containing all the information from the | ||
| cluster. | ||
| It also sets the last_sniff attribute in case of a successful attempt. | ||
| In rare cases it might be possible to override this method in your | ||
| custom Transport class to serve data from an alternative source, such as | ||
| configuration management. | ||
| """ | ||
| previous_sniff = self.last_sniff | ||
|
|
||
| try: | ||
| # reset last_sniff timestamp | ||
| self.last_sniff = time.time() | ||
| # go through all current connections as well as the | ||
| # seed_connections for good measure | ||
| for c in chain(self.connection_pool.connections, self.seed_connections): | ||
| try: | ||
| # use a small timeout for the sniffing request - it should be a fast API call | ||
| _, headers, node_info = c.perform_request('GET', '/_nodes/_all/clear', | ||
| timeout=self.sniff_timeout if not initial else None) | ||
| node_info = self.deserializer.loads(node_info, headers.get('content-type')) | ||
| break | ||
| except (ConnectionError, SerializationError): | ||
| pass | ||
| else: | ||
| raise TransportError("N/A", "Unable to sniff hosts.") | ||
| except: | ||
| # keep the previous value on error | ||
| self.last_sniff = previous_sniff | ||
| raise | ||
|
|
||
| return list(node_info['nodes'].values()) | ||
|
|
||
|
|
||
| def sniff_hosts(self, initial=False): | ||
| """ | ||
| Obtain a list of nodes from the cluster and create a new connection | ||
| pool using the information retrieved. | ||
| To extract the node connection parameters use the ``nodes_to_host_callback``. | ||
| :arg initial: flag indicating if this is during startup | ||
| (``sniff_on_start``), ignore the ``sniff_timeout`` if ``True`` | ||
| """ | ||
| node_info = self._get_sniff_data(initial) | ||
|
|
||
| hosts = [] | ||
| address_key = self.connection_class.transport_schema + '_address' | ||
| for n in node_info: | ||
| host = {} | ||
| address = n.get(address_key, '') | ||
| if '/' in address: | ||
| host['host'], address = address.split('/', 1) | ||
|
|
||
| # malformed address | ||
| if ':' not in address: | ||
| continue | ||
|
|
||
| ip, port = address.rsplit(':', 1) | ||
|
|
||
| # use the ip if not overridden by publish_host | ||
| host.setdefault('host', ip) | ||
| host['port'] = int(port) | ||
|
|
||
| host = self.host_info_callback(n, host) | ||
| if host is not None: | ||
| hosts.append(host) | ||
|
|
||
| # we weren't able to get any nodes - perhaps the transport_schema is | ||
| # incompatible, or host_info_callback filtered them all out - raise an error. | ||
| if not hosts: | ||
| raise TransportError("N/A", "Unable to sniff hosts - no viable hosts found.") | ||
|
|
||
| self.set_connections(hosts) | ||
|
|
||
| def mark_dead(self, connection): | ||
| """ | ||
| Mark a connection as dead (failed) in the connection pool. If sniffing | ||
| on failure is enabled this will initiate the sniffing process. | ||
| :arg connection: instance of :class:`~elasticsearch.Connection` that failed | ||
| """ | ||
| # mark as dead even when sniffing to avoid hitting this host during the sniff process | ||
| self.connection_pool.mark_dead(connection) | ||
| if self.sniff_on_connection_fail: | ||
| self.sniff_hosts() | ||
|
|
||
| def perform_request(self, method, url, params=None, body=None): | ||
| """ | ||
| Perform the actual request. Retrieve a connection from the connection | ||
| pool, pass all the information to its perform_request method and | ||
| return the data. | ||
| If an exception was raised, mark the connection as failed and retry (up | ||
| to `max_retries` times). | ||
| If the operation was successful and the connection used was previously | ||
| marked as dead, mark it as live, resetting its failure count. | ||
| :arg method: HTTP method to use | ||
| :arg url: absolute url (without host) to target | ||
| :arg params: dictionary of query parameters, will be handed over to the | ||
| underlying :class:`~elasticsearch.Connection` class for serialization | ||
| :arg body: body of the request, will be serialized using the serializer and | ||
| passed to the connection | ||
| """ | ||
| if body is not None: | ||
| body = self.serializer.dumps(body) | ||
|
|
||
| # some clients or environments don't support sending GET with body | ||
| if method in ('HEAD', 'GET') and self.send_get_body_as != 'GET': | ||
| # send it as post instead | ||
| if self.send_get_body_as == 'POST': | ||
| method = 'POST' | ||
|
|
||
| # or as source parameter | ||
| elif self.send_get_body_as == 'source': | ||
| if params is None: | ||
| params = {} | ||
| params['source'] = body | ||
| body = None | ||
|
|
||
| if body is not None: | ||
| try: | ||
| body = body.encode('utf-8') | ||
| except (UnicodeDecodeError, AttributeError): | ||
| # bytes/str - no need to re-encode | ||
| pass | ||
|
|
||
| ignore = () | ||
| timeout = None | ||
| if params: | ||
| timeout = params.pop('request_timeout', None) | ||
| ignore = params.pop('ignore', ()) | ||
| if isinstance(ignore, int): | ||
| ignore = (ignore, ) | ||
|
|
||
| for attempt in range(self.max_retries + 1): | ||
| connection = self.get_connection() | ||
|
|
||
| try: | ||
| status, headers, data = connection.perform_request(method, url, params, body, ignore=ignore, timeout=timeout) | ||
|
|
||
| except TransportError as e: | ||
| retry = False | ||
| if isinstance(e, ConnectionTimeout): | ||
| retry = self.retry_on_timeout | ||
| elif isinstance(e, ConnectionError): | ||
| retry = True | ||
| elif e.status_code in self.retry_on_status: | ||
| retry = True | ||
|
|
||
| if retry: | ||
| # only mark as dead if we are retrying | ||
| self.mark_dead(connection) | ||
| # raise exception on last retry | ||
| if attempt == self.max_retries: | ||
| raise | ||
| else: | ||
| raise | ||
|
|
||
| else: | ||
| # connection didn't fail, confirm its live status | ||
| self.connection_pool.mark_live(connection) | ||
| if data: | ||
| data = self.deserializer.loads(data, headers.get('content-type')) | ||
| return status, data | ||
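|
| # How this class is meant to be wired in - a sketch, assuming the standard | ||
| # elasticsearch-py client, which forwards extra kwargs to the transport | ||
| # (host and credentials are placeholders): | ||
| # | ||
| #   from elasticsearch import Elasticsearch | ||
| #   es = Elasticsearch(['es.example.com'], transport_class=TransportNTLM, | ||
| #                      ntlm_user='DOMAIN\\user', ntlm_pass='secret') | ||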
|
|
| @@ -0,0 +1,10 @@ | ||
| # Automatically created by: scrapyd-deploy | ||
|
|
||
| from setuptools import setup, find_packages | ||
|
|
||
| setup( | ||
| name = 'project', | ||
| version = '1.0', | ||
| packages = find_packages(), | ||
| entry_points = {'scrapy': ['settings = scrapytest.settings']}, | ||
| ) |
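|
| # Deployment sketch: scrapyd-deploy builds an egg from this setup.py and | ||
| # uploads it; scrapyd then resolves the project's settings module through | ||
| # the 'scrapy' entry point declared above. A typical invocation (the target | ||
| # name is a placeholder from scrapy.cfg): | ||
| # | ||
| #   scrapyd-deploy default -p project | ||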