Skip to content

Commit

Permalink
Merge pull request #21 from redapple/py3-support
Browse files: browse the repository at this point in the history.
Changes for new DMOZ layout + Python3 compatibility
  • Loading branch information
eliasdorneles committed Sep 27, 2016
2 parents 46c0b75 + 810eae1 commit ec32684
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 6 deletions.
2 changes: 1 addition & 1 deletion dirbot/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class FilterWordsPipeline(object):

def process_item(self, item, spider):
for word in self.words_to_filter:
if word in unicode(item['description']).lower():
if word in item['description'].lower():
raise DropItem("Contains forbidden word: %s" % word)
else:
return item
12 changes: 7 additions & 5 deletions dirbot/spiders/dmoz.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,17 @@ def parse(self, response):
@url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
@scrapes name
"""
sel = Selector(response)
sites = sel.xpath('//ul[@class="directory-url"]/li')
sites = response.css('#site-list-content > div.site-item > div.title-and-desc')
items = []

for site in sites:
item = Website()
item['name'] = site.xpath('a/text()').extract()
item['url'] = site.xpath('a/@href').extract()
item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')
item['name'] = site.css(
'a > div.site-title::text').extract_first().strip()
item['url'] = site.xpath(
'a/@href').extract_first().strip()
item['description'] = site.css(
'div.site-descr::text').extract_first().strip()
items.append(item)

return items

0 comments on commit ec32684

Please sign in to comment.