This repository has been archived by the owner on Jun 4, 2020. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fork of code from ScraperWiki at https://classic.scraperwiki.com/scra…
- Loading branch information
0 parents
commit 1f2265c
Showing 2 changed files with 88 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Ignore output of scraper | ||
data.sqlite |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
# Forked from Anna PS's Rightmove scraper, simplified because we're only | ||
# looking in one place. | ||
from lxml import etree | ||
from lxml.etree import tostring | ||
from datetime import datetime | ||
import scraperwiki | ||
import StringIO | ||
|
||
# --- Search criteria -------------------------------------------------------
# Price bounds substituted into the search URL's minPrice / maxPrice params.
MIN_PRICE = 0
MAX_PRICE = 250000
# NOTE(review): not referenced by the search URL visible in this file --
# confirm whether a bedrooms filter was meant to be applied.
MIN_BEDROOMS = 2
# Substituted into the search URL's radius parameter.
RADIUS_MILES = 3.0

# Phrases (all lower case) suggesting a house needs work.
_NEEDS_WORK_PHRASES = [
    "in need of updating",
    "in need of some updating",
    "requiring updating",
    "requiring some updating",
    "in need of modernisation",
    "in need of some modernisation",
    "requiring modernisation",
    "requiring some modernisation",
    "in need of renovation",
    "in need of some renovation",
    "requiring renovation",
    "requiring some renovation",
    "renovation project",
]

# Phrases (all lower case) suggesting a house comes with land.
_HAS_LAND_PHRASES = [
    "acre",
    "additional land",
    "very large garden",
    "extremely large garden",
    "paddock",
]

# A listing is kept when its lower-cased description or title contains any
# of these phrases.
SEARCH_PHRASES = _NEEDS_WORK_PHRASES + _HAS_LAND_PHRASES

# Scheme and host prefixed to every site-relative URL that gets scraped.
DOMAIN = 'http://www.rightmove.co.uk'
|
||
def scrape_individual_house(house_url):
    """Scrape one property page and save it if it mentions a search phrase.

    The page's description and title are lower-cased and checked against
    every entry in SEARCH_PHRASES. When at least one phrase is present, a
    single record with 'title', 'description', 'link' and 'pubDate' keys is
    saved through scraperwiki.sqlite.save, using 'link' as the unique key.
    Pages with no matching phrase are not saved.

    Args:
        house_url: Site-relative URL of the property page. DOMAIN is
            prefixed and everything from '/svr/' onwards is dropped.
    """
    # Local import: the module's import block does not provide `re`.
    import re

    HOUSE_URL = (DOMAIN + house_url).split('/svr/')[0]
    print('Scraping %s' % HOUSE_URL)
    house_html = scraperwiki.scrape(HOUSE_URL)
    house_parser = etree.HTMLParser()
    house_tree = etree.parse(StringIO.StringIO(house_html), house_parser)
    house_text = house_tree.xpath('string(//div[@class="propertyDetailDescription"])')
    title = house_tree.xpath('string(//h1[@id="propertytype"])')

    # Collect every matching phrase up front so the house is built and
    # saved once, however many phrases it matches.
    text_lower = house_text.lower()
    title_lower = title.lower()
    matched = [sp for sp in SEARCH_PHRASES
               if sp in text_lower or sp in title_lower]
    if not matched:
        return

    # Guard the main photo the same way as the mini-map: a listing without
    # one must not abort the scrape with an IndexError.
    photo = house_tree.xpath('//img[@id="mainphoto"]')
    image_html = tostring(photo[0]) if photo else ''
    map_img = house_tree.xpath('//a[@id="minimapwrapper"]/img')
    map_html = tostring(map_img[0]) if map_img else ''

    price = house_tree.xpath('string(//div[@id="amount"])')
    location = house_tree.xpath('string(//div[@id="addresscontainer"]/h2)')

    # Highlight all matched phrases, case-insensitively (matching the
    # case-insensitive detection above) and only inside the description
    # text, so URLs and <img> markup are never rewritten. Longest phrases
    # come first so an overlapping shorter phrase cannot pre-empt them.
    by_length = sorted(matched, key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(sp) for sp in by_length),
                         re.IGNORECASE)
    highlighted = pattern.sub(
        lambda m: "<span style='font-weight:bold;color:red;'>%s</span>" % m.group(0),
        house_text)

    house = {}
    house['title'] = "%s - %s - %s" % (title, location, price)
    print('HOUSE FOUND! %s, %s ' % (house['title'], HOUSE_URL))
    item_text = '<a href="' + HOUSE_URL + '">' + image_html + '</a>'
    item_text += '<a href="' + HOUSE_URL + '">' + map_html + '</a>'
    item_text += highlighted
    house['description'] = item_text
    house['link'] = HOUSE_URL
    house['pubDate'] = datetime.now()
    scraperwiki.sqlite.save(['link'], house)
|
||
# Scrape every property linked from one page of search results.
def scrape_results_page(results_url, initial=False):
    """Scrape each property linked from a search-results page.

    Args:
        results_url: Site-relative URL of the results page; DOMAIN is
            prefixed before fetching.
        initial: When true, the pagination links found on this page are
            followed as well (each one scraped with initial=False, so the
            recursion is one level deep).
    """
    html = scraperwiki.scrape(DOMAIN + results_url)
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO.StringIO(html), parser)

    house_links = tree.xpath('//ol[@id="summaries"]//a[starts-with(text(), "More details")]/@href')
    for house_link in house_links:
        scrape_individual_house(house_link)

    if not initial:
        return

    # Follow each distinct pagination link once, in page order. The page
    # just scraped is seeded into `seen` so a link back to it is skipped,
    # and repeated hrefs do not trigger repeat fetches of the same page.
    seen = set([results_url])
    for page_link in tree.xpath('//ul[@class="items"]//a/@href'):
        if page_link in seen:
            continue
        seen.add(page_link)
        scrape_results_page(page_link)
|
||
# Run one search per property type; the first results page of each search
# also fans out to the remaining result pages (initial=True).
for property_type in ('houses', 'land'):
    INITIAL_URL = ''.join([
        '/property-for-sale/find.html?locationIdentifier=REGION^494',
        '&minPrice=%s&maxPrice=%s' % (MIN_PRICE, MAX_PRICE),
        '&radius=%s&displayPropertyType=%s' % (RADIUS_MILES, property_type),
        '&numberOfPropertiesPerPage=50',
    ])
    scrape_results_page(INITIAL_URL, initial=True)