Commit 9d4e5a9: initial commit

pablohoffman committed Apr 28, 2011 (0 parents)

Showing 9 changed files with 176 additions and 0 deletions.
71 changes: 71 additions & 0 deletions README.rst
@@ -0,0 +1,71 @@
======
dirbot
======

This is a Scrapy project to scrape websites from public web directories.

This project is only meant for educational purposes.

Items
=====

The items scraped by this project are websites; the item type is defined in the
class::

dirbot.items.Website

See the source code for more details.
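
For illustration, populating such an item by hand looks like this (a minimal
sketch, not part of the project code; the field values are made up)::

    from dirbot.items import Website

    item = Website()
    item['name'] = u'Python.org'
    item['url'] = u'http://www.python.org/'
    item['description'] = u'The official Python website'

    print item  # Website: name=Python.org url=http://www.python.org/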

Spiders
=======

This project contains two spiders: ``googledir`` and ``dmoz``. When in doubt,
you can check the available spiders with::

scrapy list
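
Both spiders defined in this project should show up in the output, along these
lines::

    dmoz
    googledir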

Spider: googledir
-----------------

The ``googledir`` spider crawls the entire Google Directory, so you may want to
try it first by limiting the crawl to a certain number of items.

For example, to run the ``googledir`` spider limited to scrape 20 items use::

scrapy crawl googledir --set CLOSESPIDER_ITEMPASSED=20
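
If you prefer, the same limit can live in ``dirbot/settings.py`` instead of
being passed on the command line (a sketch, assuming you want the limit applied
to every run)::

    CLOSESPIDER_ITEMPASSED = 20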

Spider: dmoz
------------

The ``dmoz`` spider scrapes the Open Directory Project (dmoz.org), and it's
based on the dmoz spider described in the `Scrapy tutorial`_.

Unlike the ``googledir`` spider, this spider doesn't crawl the entire dmoz.org
site but only a few pages by default (defined in the ``start_urls``
attribute). These pages are:

* http://www.dmoz.org/Computers/Programming/Languages/Python/Books/
* http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/

So, if you run the spider as usual (with ``scrapy crawl dmoz``) it will scrape
only those two pages. However, you can scrape any dmoz.org page by passing its
URL instead of the spider name; Scrapy picks the spider to use by matching the
URL against the allowed domains of each spider.

For example, to scrape a different URL use::

scrapy crawl http://www.dmoz.org/Computers/Programming/Languages/Erlang/

You can scrape any URL from dmoz.org using this spider; for Google Directory
pages (directory.google.com), use the ``googledir`` spider instead.

.. _Scrapy tutorial: http://doc.scrapy.org/intro/tutorial.html

Pipelines
=========

This project uses a pipeline to filter out websites containing certain
forbidden words in their description. This pipeline is defined in the class::

dirbot.pipelines.FilterWordsPipeline
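
For illustration, here is roughly how the pipeline behaves when fed an item
whose description mentions one of the filtered words (a minimal sketch, not
part of the project code; the field values are made up)::

    from scrapy.exceptions import DropItem

    from dirbot.items import Website
    from dirbot.pipelines import FilterWordsPipeline

    pipeline = FilterWordsPipeline()
    item = Website()
    item['name'] = u'Example'
    item['url'] = u'http://example.com/'
    item['description'] = u'A site about politics'

    try:
        pipeline.process_item(item, spider=None)
    except DropItem as e:
        print e  # Contains forbidden word: politics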
Empty file added dirbot/__init__.py
Empty file.
10 changes: 10 additions & 0 deletions dirbot/items.py
@@ -0,0 +1,10 @@
from scrapy.item import Item, Field

class Website(Item):

    name = Field()
    url = Field()
    description = Field()

    def __str__(self):
        return "Website: name=%s url=%s" % (self['name'], self['url'])
15 changes: 15 additions & 0 deletions dirbot/pipelines.py
@@ -0,0 +1,15 @@
from scrapy.exceptions import DropItem

class FilterWordsPipeline(object):
    """A pipeline for filtering out items which contain certain words in their
    description"""

    # put all words in lowercase
    words_to_filter = ['politics', 'religion']

    def process_item(self, item, spider):
        for word in self.words_to_filter:
            if word in unicode(item['description']).lower():
                raise DropItem("Contains forbidden word: %s" % word)
        # no forbidden word found, let the item through
        return item
12 changes: 12 additions & 0 deletions dirbot/settings.py
@@ -0,0 +1,12 @@
# Scrapy settings for dirbot project

BOT_NAME = 'dirbot'
BOT_VERSION = '1.0'

SPIDER_MODULES = ['dirbot.spiders']
NEWSPIDER_MODULE = 'dirbot.spiders'
DEFAULT_ITEM_CLASS = 'dirbot.items.Website'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)

ITEM_PIPELINES = ['dirbot.pipelines.FilterWordsPipeline']

1 change: 1 addition & 0 deletions dirbot/spiders/__init__.py
@@ -0,0 +1 @@
# Place here all your scrapy spiders
24 changes: 24 additions & 0 deletions dirbot/spiders/dmoz.py
@@ -0,0 +1,24 @@
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

from dirbot.items import Website

class DmozSpider(BaseSpider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # Every website listed on a dmoz category page sits in an <li> element
        sites = hxs.select('//ul/li')
        items = []
        for site in sites:
            item = Website()
            # extract() returns a list of all matching strings for each field
            item['name'] = site.select('a/text()').extract()
            item['url'] = site.select('a/@href').extract()
            item['description'] = site.select('text()').extract()
            items.append(item)
        return items
41 changes: 41 additions & 0 deletions dirbot/spiders/googledir.py
@@ -0,0 +1,41 @@
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.loader import XPathItemLoader

from dirbot.items import Website

class GoogledirSpider(CrawlSpider):

    name = 'googledir'
    allowed_domains = ['directory.google.com']
    start_urls = ['http://directory.google.com/']

    rules = (
        Rule(SgmlLinkExtractor(allow='directory.google.com/[A-Z][a-zA-Z_/]+$'),
            'parse_category',
            follow=True,
        ),
    )

    def parse_category(self, response):
        # The main selector we're using to extract data from the page
        main_selector = HtmlXPathSelector(response)

        # The XPath to website links in the directory page
        xpath = '//td[descendant::a[contains(@href, "#pagerank")]]/following-sibling::td/font'

        # Get a list of (sub) selectors to each website node pointed by the XPath
        sub_selectors = main_selector.select(xpath)

        # Iterate over the sub-selectors to extract data for each website
        for selector in sub_selectors:
            item = Website()

            l = XPathItemLoader(item=item, selector=selector)
            l.add_xpath('name', 'a/text()')
            l.add_xpath('url', 'a/@href')
            l.add_xpath('description', 'font[2]/text()')

            # Here we populate the item and yield it
            yield l.load_item()
2 changes: 2 additions & 0 deletions scrapy.cfg
@@ -0,0 +1,2 @@
[settings]
default = dirbot.settings
