Fork of code from ScraperWiki at https://classic.scraperwiki.com/scra…

stevenday committed Feb 27, 2014
commit 1f2265cebfcbc04a2ead2a1530f2fb0e68f765d4 (0 parents)
Showing with 88 additions and 0 deletions.
  1. +2 −0 .gitignore
  2. +86 −0 scraper.py
.gitignore
@@ -0,0 +1,2 @@
# Ignore output of scraper
data.sqlite
scraper.py
@@ -0,0 +1,86 @@
# Forked from Anna PS's Rightmove scraper, simplified because we're only
# looking in one place.
from lxml import etree
from lxml.etree import tostring
from datetime import datetime
import scraperwiki
import StringIO

MIN_PRICE = 0
MAX_PRICE = 250000
MIN_BEDROOMS = 2  # Not currently used in the search URL below
RADIUS_MILES = 3.0

# Phrases in a listing's title or description that flag it as interesting
SEARCH_PHRASES = [
    # Houses needing work
    "in need of updating",
    "in need of some updating",
    "requiring updating",
    "requiring some updating",
    "in need of modernisation",
    "in need of some modernisation",
    "requiring modernisation",
    "requiring some modernisation",
    "in need of renovation",
    "in need of some renovation",
    "requiring renovation",
    "requiring some renovation",
    "renovation project",
    # Houses with land
    "acre",
    "additional land",
    "very large garden",
    "extremely large garden",
    "paddock"
]

DOMAIN = 'http://www.rightmove.co.uk'
def scrape_individual_house(house_url):
    # Drop the '/svr/...' suffix that result links sometimes carry
    HOUSE_URL = (DOMAIN + house_url).split('/svr/')[0]
    print 'Scraping %s' % HOUSE_URL
    house_html = scraperwiki.scrape(HOUSE_URL)
    house_parser = etree.HTMLParser()
    house_tree = etree.parse(StringIO.StringIO(house_html), house_parser)
    house_text = house_tree.xpath('string(//div[@class="propertyDetailDescription"])')
    title = house_tree.xpath('string(//h1[@id="propertytype"])')
    # Check for search phrases in the description or the title
    for sp in SEARCH_PHRASES:
        if sp in house_text.lower() or sp in title.lower():
            house = {}
            image_url = tostring(house_tree.xpath('//img[@id="mainphoto"]')[0])
            price = house_tree.xpath('string(//div[@id="amount"])')
            map_img = house_tree.xpath('//a[@id="minimapwrapper"]/img')
            if map_img:
                map_img = tostring(map_img[0])
            else:
                map_img = ''
            location = house_tree.xpath('string(//div[@id="addresscontainer"]/h2)')
            house['title'] = "%s - %s - %s" % (title, location, price)
            print 'HOUSE FOUND! %s, %s' % (house['title'], HOUSE_URL)
            item_text = '<a href="' + HOUSE_URL + '">' + image_url + '</a>'
            item_text += '<a href="' + HOUSE_URL + '">' + map_img + '</a>'
            item_text += house_text
            # Highlight the matched phrase in the saved description
            house['description'] = item_text.replace(
                sp, "<span style='font-weight:bold;color:red;'>%s</span>" % sp)
            house['link'] = HOUSE_URL
            house['pubDate'] = datetime.now()
            scraperwiki.sqlite.save(['link'], house)
# Scrape each house on a results page; on the first page, also follow the
# pagination links so the remaining results pages get scraped.
def scrape_results_page(results_url, initial=False):
    results_url = DOMAIN + results_url
    html = scraperwiki.scrape(results_url)
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO.StringIO(html), parser)
    house_links = tree.xpath('//ol[@id="summaries"]//a[starts-with(text(), "More details")]/@href')
    for house_link in house_links:
        scrape_individual_house(house_link)
    if initial:
        results_links = tree.xpath('//ul[@class="items"]//a/@href')
        for r in results_links:
            scrape_results_page(r)
# Do the actual scraping, once for houses and once for land
for property_type in ('houses', 'land'):
    url1 = '/property-for-sale/find.html?locationIdentifier=REGION^494&minPrice=%s&maxPrice=%s' % (MIN_PRICE, MAX_PRICE)
    url2 = '&radius=%s&displayPropertyType=%s&numberOfPropertiesPerPage=50' % (RADIUS_MILES, property_type)
    INITIAL_URL = url1 + url2
    scrape_results_page(INITIAL_URL, initial=True)
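
For context, a minimal sketch (not part of the commit) of reading back the rows that scraperwiki.sqlite.save() writes. It assumes the library's default "swdata" table name and the local data.sqlite file ignored above; both are assumptions, not something this commit specifies.

import sqlite3

# Assumes scraperwiki's default table name ("swdata") and the data.sqlite
# file listed in .gitignore -- both are assumptions, not part of the commit.
conn = sqlite3.connect('data.sqlite')
for title, link in conn.execute('SELECT title, link FROM swdata'):
    print '%s\n  %s' % (title, link)
conn.close()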
