This repository has been archived by the owner on Jun 4, 2020. It is now read-only.

Fork of code from ScraperWiki at https://classic.scraperwiki.com/scra…
stevenday committed Feb 27, 2014 · commit 1f2265c (0 parents)
Showing 2 changed files with 88 additions and 0 deletions.
.gitignore (2 additions, 0 deletions)
@@ -0,0 +1,2 @@
# Ignore output of scraper
data.sqlite
scraper.py (86 additions, 0 deletions)
@@ -0,0 +1,86 @@
# Forked from Anna PS's Rightmove scraper, simplified because we're only
# looking in one place.
from lxml import etree
from lxml.etree import tostring
from datetime import datetime
import scraperwiki
import StringIO
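# Written for the Python 2 / classic ScraperWiki toolchain: scraperwiki.scrape()
# fetches a page over HTTP and scraperwiki.sqlite.save() writes rows to the
# local data.sqlite database ignored in .gitignore.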

MIN_PRICE = 0
MAX_PRICE = 250000
MIN_BEDROOMS = 2
RADIUS_MILES = 3.0
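# Phrases that flag a property as interesting. They are matched against the
# lower-cased description and title, so keep them lower case.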
SEARCH_PHRASES = [
    # Houses needing work
    "in need of updating",
    "in need of some updating",
    "requiring updating",
    "requiring some updating",
    "in need of modernisation",
    "in need of some modernisation",
    "requiring modernisation",
    "requiring some modernisation",
    "in need of renovation",
    "in need of some renovation",
    "requiring renovation",
    "requiring some renovation",
    "renovation project",
    # Houses with Land
    "acre",
    "additional land",
    "very large garden",
    "extremely large garden",
    "paddock"
]
DOMAIN = 'http://www.rightmove.co.uk'

def scrape_individual_house(house_url):
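    """Scrape a single property page and save it to SQLite if its description
    or title contains one of SEARCH_PHRASES.

    Relies on the Rightmove markup of the time: description in
    div.propertyDetailDescription, heading in h1#propertytype, price in
    div#amount and a thumbnail map inside a#minimapwrapper.
    """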
    HOUSE_URL = (DOMAIN + house_url).split('/svr/')[0]
    print 'Scraping %s' % HOUSE_URL
    house_html = scraperwiki.scrape(HOUSE_URL)
    house_parser = etree.HTMLParser()
    house_tree = etree.parse(StringIO.StringIO(house_html), house_parser)
    house_text = house_tree.xpath('string(//div[@class="propertyDetailDescription"])')
    title = house_tree.xpath('string(//h1[@id="propertytype"])')
    # Check the description and title for each search phrase
    for sp in SEARCH_PHRASES:
        if sp in house_text.lower() or sp in title.lower():
            house = {}
            # tostring() keeps the whole <img> element so it can be embedded
            # in the description below; guard against pages with no photo
            main_photo = house_tree.xpath('//img[@id="mainphoto"]')
            image_url = tostring(main_photo[0]) if main_photo else ''
            price = house_tree.xpath('string(//div[@id="amount"])')
            map_img = house_tree.xpath('//a[@id="minimapwrapper"]/img')
            if map_img:
                map_img = tostring(map_img[0])
            else:
                map_img = ''
            location = house_tree.xpath('string(//div[@id="addresscontainer"]/h2)')
            house['title'] = "%s - %s - %s" % (title, location, price)
            print 'HOUSE FOUND! %s, %s' % (house['title'], HOUSE_URL)
            item_text = '<a href="' + HOUSE_URL + '">' + image_url + '</a>'
            item_text += '<a href="' + HOUSE_URL + '">' + map_img + '</a>'
            item_text += house_text
            # Highlight the matched phrase in the saved description
            house['description'] = item_text.replace(sp, "<span style='font-weight:bold;color:red;'>%s</span>" % sp)
            house['link'] = HOUSE_URL
            house['pubDate'] = datetime.now()
            scraperwiki.sqlite.save(['link'], house)

# Gather the property links from a results page and scrape each one.
def scrape_results_page(results_url, initial=False):
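    """Scrape every property linked from a results page. On the first call
    (initial=True) also follow the pagination links so later pages of results
    are covered; recursive calls leave initial False to avoid looping forever.
    """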
    results_url = DOMAIN + results_url
    html = scraperwiki.scrape(results_url)
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO.StringIO(html), parser)
    house_links = tree.xpath('//ol[@id="summaries"]//a[starts-with(text(), "More details")]/@href')
    for house_link in house_links:
        scrape_individual_house(house_link)
    if initial:
        results_links = tree.xpath('//ul[@class="items"]//a/@href')
        for r in results_links:
            scrape_results_page(r)

# Do the actual scraping
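# Run one search per property type; each call follows its own pagination via
# initial=True.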
for property_type in ('houses', 'land'):
    url1 = '/property-for-sale/find.html?locationIdentifier=REGION^494&minPrice=%s&maxPrice=%s' % (MIN_PRICE, MAX_PRICE)
    url2 = '&radius=%s&displayPropertyType=%s&numberOfPropertiesPerPage=50' % (RADIUS_MILES, property_type)
    INITIAL_URL = url1 + url2
    scrape_results_page(INITIAL_URL, initial=True)
