Skip to content

Commit

Permalink
Update scraper.py
Browse files: browse the repository at this point in the history
  • Loading branch information
rabimba committed Jun 17, 2014
1 parent ecacf68 commit 28bd43c
Showing 1 changed file with 5 additions and 6 deletions.
11 changes: 5 additions & 6 deletions scraper.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,17 +2,15 @@
import json
import re
import urlparse
import lxml.html
from lxml import etree

def scrape_laptop(url):
html = scraperwiki.scrape(url)
tree = lxml.html.fromstring(html)
tree = etree.HTML(html)
title = tree.find('.//h1')
price = tree.find('.//span[@id="fk-mprod-our-id"]')
data = {
'title': title.text if title is not None else '',
'url': url,
'price': price.text_content() if price is not None else ''
}
for row in tree.findall('.//table[@class="fk-specs-type2"]//tr'):
label = row.find('th')
Expand All @@ -25,10 +23,11 @@ def scrape_laptop(url):

start = 0
while True:
data = scraperwiki.scrape('http://www.flipkart.com/computers/laptops/all?response-type=json&inf-start=%d' % start)
data = scraperwiki.scrape('http://www.flipkart.com/laptops/all?response-type=json&inf-start=%d' % start)
data = json.loads(data)
if data['count'] <= 0:
break
tree = lxml.html.fromstring(data['html'])
tree = etree.HTML(data['html'])
for link in tree.findall('.//a[@class="prd-img"]'):
url = link.get('href', '')
if not url:
Expand Down

0 comments on commit 28bd43c

Please sign in to comment.