In [1]:
from urllib.request import urlretrieve
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import sys

Download the Edinburgh postcode table, which contains latitude and longitude.

In [2]:
# Fetch the postcode CSV for district S12000036 from doogal.co.uk and store it
# as 'EdinburghPostcodes.csv' in the current working directory.
urlretrieve(
    'https://www.doogal.co.uk/AdministrativeAreasCSV.ashx?district=S12000036',
    'EdinburghPostcodes.csv',
)

('EdinburghPostcodes.csv', <http.client.HTTPMessage at 0x7fc2ee7d19e8>)

Define functions that extract property data from the BeautifulSoup of an HTML webpage

In [3]:
def get_property_type_from_sold_property_page(http_address, timeout=30):
    """
    Download a sold-property page and return its property type text.

    http_address - str - address of the sold-property page to download
    timeout - float - seconds to wait for the server before giving up

    Returns the text of the second <h2> element found inside the element
    with id 'propertydetails'.

    Raises the exception of the underlying request when it fails, times out
    or answers with an HTTP error status; AttributeError when the page has no
    'propertydetails' element; IndexError when that element holds fewer than
    two <h2> elements.
    """
    # Without a timeout a stalled server would block the scrape indefinitely
    response = get(http_address, timeout=timeout)
    # Fail on HTTP error statuses rather than parsing an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.find(id='propertydetails').find_all('h2')[1].text


def get_property_data_from_soup(soup):
    """
    Collect sold-property records from a parsed price-list page.

    soup - BeautifulSoup - parsed html of a page listing sold properties

    Every element with class 'soldDetails' is treated as one listing. A listing
    is skipped when its 'soldAddress' element is missing or has no 'href', or
    when its 'noBed' element is missing or has empty text. For each remaining
    listing the linked page is visited to look up the property type; if that
    lookup fails, the property type is recorded as '' and the error class is
    printed.

    Returns a pandas.DataFrame with the columns date, address, bedrooms,
    property_type and price (all as scraped text), sorted by date descending
    and then by address ascending. The sort compares the raw strings, and the
    index keeps each row's position from before sorting.
    """
    columns = ['date', 'address', 'bedrooms', 'property_type', 'price']
    data = {column: [] for column in columns}
    for soup_property in soup.find_all(class_='soldDetails'):
        # Skip properties for which there is no link to post on RightMove website
        address_tag = soup_property.find(class_='soldAddress')
        if address_tag is None or not address_tag.has_attr('href'):
            continue
        # Skip properties for which there is no number of bedrooms information
        bedrooms_tag = soup_property.find(class_='noBed')
        if bedrooms_tag is None or len(bedrooms_tag.text) == 0:
            continue
        # Collect data for the property
        data['date'].append(soup_property.find(class_='soldDate').text)
        data['address'].append(address_tag.text)
        data['bedrooms'].append(bedrooms_tag.text)
        data['price'].append(soup_property.find(class_='soldPrice').text)
        # Attempt to collect property type. This is best effort: any ordinary
        # error leaves the type empty, while KeyboardInterrupt and SystemExit
        # are not subclasses of Exception and therefore still propagate.
        try:
            property_type = get_property_type_from_sold_property_page(address_tag['href'])
        except Exception as error:
            property_type = ''
            print('Error when collecting property type.')
            print(type(error))
        data['property_type'].append(property_type)
    # Format data into pandas.DataFrame
    df = pd.DataFrame(data, columns=columns)
    # Sort the DataFrame by date as well as address
    return df.sort_values(['date', 'address'], ascending=[False, True])

Create a class to manage the web scraping rate

In [4]:
from time import time, sleep

class RateManager:
    """
    Paces repeated calls by tracking the time of the previous call.

    The time of the previous call is kept in `checkpoint`, which stays None
    until continue_when_ready has been called once.
    """

    def __init__(self, min_interval, max_interval):
        """
        min_interval - float - minimum delay between calls (in seconds)
        max_interval - float - maximum delay between calls before notification (in seconds)
        """
        self.min_interval = min_interval
        self.max_interval = max_interval
        self.checkpoint = None

    def _seconds_since_checkpoint(self):
        """Return the seconds elapsed since the stored checkpoint."""
        return time() - self.checkpoint

    def continue_when_ready(self, sleep_interval=0.1, print_interval=False):
        """
        Block until at least min_interval has passed since the previous call.

        sleep_interval - float - length of each sleep while waiting (in seconds)
        print_interval - bool - print the time elapsed since the previous call

        Returns None on the very first call, 'timeout' when more than
        max_interval has passed since the previous call (no waiting is done
        in that case) and 'intime' otherwise. The checkpoint is reset to the
        current time on every call.
        """
        # The first call only starts the clock
        if self.checkpoint is None:
            self.checkpoint = time()
            return None

        overdue = self._seconds_since_checkpoint() > self.max_interval
        if print_interval:
            print('Interval duration: {}'.format(self._seconds_since_checkpoint()))

        if overdue:
            status = 'timeout'
        else:
            # Still within max_interval: hold back until min_interval is reached
            while self._seconds_since_checkpoint() < self.min_interval:
                sleep(sleep_interval)
            status = 'intime'

        self.checkpoint = time()
        return status


Acquire residential property sales prices from RightMove.

There are likely duplicates in the resulting DataFrame. These will be dealt with later.

In [5]:
# List the http addresses for different areas of interest in Edinburgh
http_addresses = {
    'Stockbridge': 'https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E66977&searchLocation=Stockbridge&propertyType=3&year=2&referrer=listChangeCriteria',
    'NewTown': 'https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E79909&searchLocation=New+Town&propertyType=3&year=2&referrer=listChangeCriteria',
    'Morningside': 'https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E86881&searchLocation=Morningside&propertyType=3&year=2&referrer=listChangeCriteria',
    'EdinburghNorth': 'https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93604&searchLocation=Edinburgh+North&propertyType=3&year=2&referrer=listChangeCriteria',
    'EdinburghEast': 'https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93601&searchLocation=Edinburgh+East&propertyType=3&year=2&referrer=listChangeCriteria',
    'EdinburghWest': 'https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93613&searchLocation=Edinburgh+West&propertyType=3&year=2&referrer=listChangeCriteria',
    'EdinburghSouth': 'https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93610&searchLocation=Edinburgh+South&propertyType=3&year=2&referrer=listChangeCriteria',
    'Edinburgh': 'https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E475&searchLocation=Edinburgh&propertyType=3&year=2&referrer=listChangeCriteria'
    }


def http_index_suffix(index):
    """Return the address suffix that specifies the property list page index."""
    return '&index={}'.format(index)


# Specify indices to work through: 0, 25, ..., 1000
indices = list(range(0, 1025, 25))

# Columns of the collected property table
property_columns = ['date', 'address', 'bedrooms', 'property_type', 'price']

# Seconds to wait for a list page before giving up on the request
request_timeout = 30

# Use RateManager to avoid overwhelming the website.
# NOTE: only the list-page requests below are paced; the per-property requests
# made inside get_property_data_from_soup do not go through rate_manager.
rate_manager = RateManager(min_interval=5, max_interval=20)
max_timeouts = 10
timeout_count = 0

# Collect one DataFrame per visited page and join them once at the end;
# DataFrame.append was removed in pandas 2.0 and re-copied all rows on each call.
property_lists = []

# Loop through all http addresses of different areas and all possible page indices
for http_address in http_addresses.values():
    # Start each area without a previous page, so the repeated-page check
    # never compares against the last page of another area
    df_prev_property_list = pd.DataFrame({})
    for index in indices:
        full_http_address = http_address + http_index_suffix(index)
        print('Visiting webpage:\n' + full_http_address)
        # Make sure webpage is not visited too often and that it is not blocking
        if rate_manager.continue_when_ready(print_interval=True) == 'timeout':
            timeout_count += 1
            if timeout_count > max_timeouts:
                raise RuntimeError('Too many timeouts.')
        # Get website html as BeautifulSoup
        response = get(full_http_address, timeout=request_timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Check if there is a property price data list on this page
        if len(soup.find_all(class_='soldDetails')) == 0:
            print('No properties listed on this page. Stopping index iteration.')
            break
        df_next_property_list = get_property_data_from_soup(soup)
        # If the new DataFrame is equal to the previous one, stop checking further indices
        if df_prev_property_list.equals(df_next_property_list):
            print('Property list repeated. Stopping index iteration.')
            break
        # Keep the new property list and store it to check against the next one
        print('Got {} properties.'.format(df_next_property_list.shape[0]))
        property_lists.append(df_next_property_list)
        df_prev_property_list = df_next_property_list

# Join the per-page lists; row labels are kept as they come from each page,
# so the combined index is not unique
if property_lists:
    df_property = pd.concat(property_lists)
else:
    df_property = pd.DataFrame({column: [] for column in property_columns},
                               columns=property_columns)

# Save collected property data to disk
df_property.to_pickle('EdinburghPropertiesRaw.p')
print('Collected total of {} properties.'.format(df_property.shape[0]))

Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E475&searchLocation=Edinburgh&propertyType=3&year=2&referrer=listChangeCriteria&index=0
Got 5 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E475&searchLocation=Edinburgh&propertyType=3&year=2&referrer=listChangeCriteria&index=25
Interval duration: 3.1751508712768555
Got 10 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E475&searchLocation=Edinburgh&propertyType=3&year=2&referrer=listChangeCriteria&index=50
Interval duration: 6.835038900375366
Got 11 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E475&searchLocation=Edinburgh&propertyType=3&year=2&referrer=listChangeCriteria&index=75
Interval duration: 7.519270181655884
Got 8 properties.

Got 14 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E475&searchLocation=Edinburgh&propertyType=3&year=2&referrer=listChangeCriteria&index=800
Interval duration: 9.02331280708313
Got 9 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E475&searchLocation=Edinburgh&propertyType=3&year=2&referrer=listChangeCriteria&index=825
Interval duration: 5.026792764663696
Got 9 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E475&searchLocation=Edinburgh&propertyType=3&year=2&referrer=listChangeCriteria&index=850
Interval duration: 4.836529970169067
Got 8 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E475&searchLocation=Edinburgh&propertyType=3&year=2&referrer=listChangeCriteria&index=8

Got 14 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E66977&searchLocation=Stockbridge&propertyType=3&year=2&referrer=listChangeCriteria&index=575
Interval duration: 6.014865398406982
Got 9 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E66977&searchLocation=Stockbridge&propertyType=3&year=2&referrer=listChangeCriteria&index=600
Interval duration: 8.931237697601318
Got 10 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E66977&searchLocation=Stockbridge&propertyType=3&year=2&referrer=listChangeCriteria&index=625
Interval duration: 4.5148844718933105
Got 4 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E66977&searchLocation=Stockbridge&propertyType=3&year=2&referrer=listCha

Got 13 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93613&searchLocation=Edinburgh+West&propertyType=3&year=2&referrer=listChangeCriteria&index=75
Interval duration: 8.83165192604065
Got 10 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93613&searchLocation=Edinburgh+West&propertyType=3&year=2&referrer=listChangeCriteria&index=100
Interval duration: 4.677427291870117
Got 15 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93613&searchLocation=Edinburgh+West&propertyType=3&year=2&referrer=listChangeCriteria&index=125
Interval duration: 9.749561786651611
Got 10 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93613&searchLocation=Edinburgh+West&propertyType=3&year=2&refer

Got 15 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93613&searchLocation=Edinburgh+West&propertyType=3&year=2&referrer=listChangeCriteria&index=850
Interval duration: 6.289226055145264
Got 12 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93613&searchLocation=Edinburgh+West&propertyType=3&year=2&referrer=listChangeCriteria&index=875
Interval duration: 5.929519414901733
Got 9 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93613&searchLocation=Edinburgh+West&propertyType=3&year=2&referrer=listChangeCriteria&index=900
Interval duration: 6.099860906600952
Got 17 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93613&searchLocation=Edinburgh+West&propertyType=3&year=2&refe

Got 10 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93601&searchLocation=Edinburgh+East&propertyType=3&year=2&referrer=listChangeCriteria&index=600
Interval duration: 7.132136821746826
Got 6 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93601&searchLocation=Edinburgh+East&propertyType=3&year=2&referrer=listChangeCriteria&index=625
Interval duration: 4.738008737564087
Got 11 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93601&searchLocation=Edinburgh+East&propertyType=3&year=2&referrer=listChangeCriteria&index=650
Interval duration: 7.3049280643463135
Got 7 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93601&searchLocation=Edinburgh+East&propertyType=3&year=2&refe

Got 15 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E79909&searchLocation=New+Town&propertyType=3&year=2&referrer=listChangeCriteria&index=350
Interval duration: 9.065736770629883
Got 9 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E79909&searchLocation=New+Town&propertyType=3&year=2&referrer=listChangeCriteria&index=375
Interval duration: 6.360553503036499
Got 11 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E79909&searchLocation=New+Town&propertyType=3&year=2&referrer=listChangeCriteria&index=400
Interval duration: 6.541202545166016
Got 11 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E79909&searchLocation=New+Town&propertyType=3&year=2&referrer=listChangeCriteria&

Got 13 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93610&searchLocation=Edinburgh+South&propertyType=3&year=2&referrer=listChangeCriteria&index=200
Interval duration: 9.480908393859863
Got 10 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93610&searchLocation=Edinburgh+South&propertyType=3&year=2&referrer=listChangeCriteria&index=225
Interval duration: 6.737649917602539
Got 8 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93610&searchLocation=Edinburgh+South&propertyType=3&year=2&referrer=listChangeCriteria&index=250
Interval duration: 6.7800962924957275
Got 9 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93610&searchLocation=Edinburgh+South&propertyType=3&year=2&

Got 13 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93610&searchLocation=Edinburgh+South&propertyType=3&year=2&referrer=listChangeCriteria&index=975
Interval duration: 6.1692047119140625
Got 9 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93610&searchLocation=Edinburgh+South&propertyType=3&year=2&referrer=listChangeCriteria&index=1000
Interval duration: 4.61824369430542
Got 12 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93604&searchLocation=Edinburgh+North&propertyType=3&year=2&referrer=listChangeCriteria&index=0
Interval duration: 6.92267370223999
Got 11 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93604&searchLocation=Edinburgh+North&propertyType=3&year=2&re

Got 14 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93604&searchLocation=Edinburgh+North&propertyType=3&year=2&referrer=listChangeCriteria&index=725
Interval duration: 9.14725112915039
Got 12 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93604&searchLocation=Edinburgh+North&propertyType=3&year=2&referrer=listChangeCriteria&index=750
Interval duration: 5.891765117645264
Got 13 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93604&searchLocation=Edinburgh+North&propertyType=3&year=2&referrer=listChangeCriteria&index=775
Interval duration: 7.354235410690308
Got 12 properties.
Visiting webpage:
https://www.rightmove.co.uk/house-prices/detail.html?country=scotland&locationIdentifier=REGION%5E93604&searchLocation=Edinburgh+North&propertyType=3&year=2&

In [6]:
# Preview the first rows only instead of rendering the whole collected table
df_property.head(10)

Unnamed: 0,date,address,bedrooms,property_type,price
0,31 Dec 2018,"184/6, Causewayside, Edinburgh, Midlothian EH9...",2 bedrooms,2 bedroom flat,"£220,000"
3,28 Dec 2018,"120/1, Willowbrae Road, Edinburgh EH8 7HW",2 bedrooms,2 bedroom flat,"£251,500"
1,28 Dec 2018,"2, Ashburnham Loan, South Queensferry, West Lo...",5 bedrooms,5 bedroom detached house,"£512,760"
2,28 Dec 2018,"6a, Considine Terrace, Edinburgh, Mid EH8 7EB",3 bedrooms,3 bedroom semi-detached house,"£268,379"
4,27 Dec 2018,"102a, Lower Granton Road, Edinburgh, Midlothia...",2 bedrooms,2 bedroom flat,"£255,000"
1,24 Dec 2018,"109, Carrick Knowe Drive, Edinburgh, Midlothia...",2 bedrooms,2 bedroom flat,"£152,500"
6,24 Dec 2018,"120a/3, Crewe Road North, Edinburgh, Midlothia...",2 bedrooms,2 bedroom apartment,"£178,000"
9,24 Dec 2018,"19, Carrick Knowe Avenue, Edinburgh, Midlothia...",2 bedrooms,2 bedroom flat,"£168,000"
5,24 Dec 2018,"201, Colinton Mains Drive, Edinburgh, Midlothi...",2 bedrooms,2 bedroom flat,"£157,500"
3,24 Dec 2018,"24, Greenbank Loan, Edinburgh, Midlothian EH10...",4 bedrooms,4 bedroom detached bungalow,"£610,000"
