In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

from pprint import pprint as pprint
from re import sub
from collections import namedtuple
from urllib2 import urlopen
from bs4 import BeautifulSoup

%matplotlib inline

In [2]:
url = "https://nethouseprices.com/house-prices/london?page=1"
html = urlopen(url)

In [3]:
soup = BeautifulSoup(html, 'lxml')
type(soup)

bs4.BeautifulSoup

In [4]:
# Identified from element inspections via the Chrome developer console.
addresses = soup.find_all("strong", class_="street-details-head-row")
prices = soup.find_all("strong", class_="street-details-price-row")
details = soup.find_all("div", class_="street-details-row")

In [5]:
# Our scraped data series are set below.
addr = [i.find("a").string.replace(u"\xa0", " ") for i in addresses]
pxs = [float(i.string.replace(u"\xa3", "").replace(u",", "")) for i in prices]
property_characteristics = [[i.strip() for i in categories.string.split(",")] for categories in details]
flat_type = [i[0] for i in property_characteristics]
lease_type = [i[1] for i in property_characteristics]
build_status = [i[2] for i in property_characteristics]

In [6]:
geodata_urls = ["https://nominatim.openstreetmap.org/search?q=\"{}\"&format=json".format(i.replace(" ", "%20")) for i in addr]
    
def get_geodata_object(openstreetmap_api_url):
    url = openstreetmap_api_url
    json_response_text = BeautifulSoup(urlopen(url), "lxml").text
    return json.loads(json_response_text)

def convert_json_to_named_tuple(json_):
    """This is solely for our convenience when referencing JSON response attributes in dataset creation"""
    return json.loads(json_, object_hook=lambda dict_: namedtuple('X', dict_.keys())(*dict_.values()))

In [7]:
json_search_results = [get_geodata_object(url) for url in geodata_urls]
top_search_results = [result if len(result) == 0 else result[0] for result in json_search_results]
# for i in top_search_results:
#     pprint(i)

In [8]:
# We now convert these top search results back to JSON to make named tuples for ease of referencing in dataseries creation.
# top_search_results_as_strings = ['[{}]'.format(str(i)) if type(i) == dict else '{}'.format(i) for i in top_search_results]
top_search_results_as_json = [json.dumps(i).replace("class", "category").replace("type", "subcategory").replace("osm_subcategory", "osm_type") for i in top_search_results]
geodata_json = [convert_json_to_named_tuple(result) for result in top_search_results_as_json]

In [9]:
def load_geodata_attributes(geodata_obj):
    try:
        return (
                geodata_obj.category,
                geodata_obj.subcategory,
                float(geodata_obj.importance),
                float(geodata_obj.lon),
                float(geodata_obj.lat))
    except AttributeError: # this handles the case where our JSON loader did not find a JSON response from the API URL.
        return (np.nan, np.nan, np.nan, np.nan, np.nan)

#TO-DO: add data parsed from display_name about borough as alternative to k Mean Clustering, to data frame.
# json_search_results = [get_geodata_object(url) for url in geodata_urls]
geodata = [load_geodata_attributes(result_named_tuple) for result_named_tuple in geodata_json]

category = [i[0] for i in geodata]
subcategory = [i[1] for i in geodata]
importance = [i[2] for i in geodata]
longitude = [i[3] for i in geodata]
latitude = [i[4] for i in geodata]

In [10]:
print geodata

[(nan, nan, nan, nan, nan), (nan, nan, nan, nan, nan), (u'highway', u'secondary', 0.61, 0.0233915, 51.5539018), (u'highway', u'residential', 0.51, 0.017585, 51.5569088), (u'highway', u'residential', 0.5199999999999999, -0.110912, 51.4785331), (u'highway', u'residential', 0.51, -0.3132106, 51.4936125), (u'highway', u'residential', 0.61, -0.1197986, 51.4176125), (u'highway', u'residential', 0.61, -0.0974695, 51.5894258), (u'highway', u'residential', 0.51, -0.3059546, 51.4947089), (nan, nan, nan, nan, nan), (u'highway', u'primary', 0.6199999999999999, -0.0699216, 51.5462797), (u'highway', u'residential', 0.61, -0.0846013, 51.633648), (u'highway', u'residential', 0.41000000000000003, 0.0153564, 51.5948356), (u'place', u'house', 0.5209999999999999, -0.17676975, 51.48630785), (u'shop', u'convenience', 0.42099999999999993, -0.1621619, 51.352929), (u'highway', u'secondary', 0.71, -0.1991401, 51.4810658), (u'highway', u'residential', 0.51, -0.2609926, 51.4991882), (u'highway', u'tertiary', 0.51

In [11]:
# We construct our dataset.
variables = [addr, pxs, flat_type, lease_type, build_status, category, subcategory, importance, longitude, latitude]
series_names = ["addresses",
                "prices",
                "flat_type",
                "lease_type",
                "build_status",
                "category",
                "subcategory",
                "importance",
                "longitude",
                "latitude"]

if len(pxs) == sum([len(dataseries) for dataseries in variables])/len(variables): # Check all series are the same length.
    dataset = {series_name : series for series_name, series in zip(series_names, variables)} # Setup dictionary for dataframe.

In [12]:
# Beware some erroneous lon-lat data due to multiple search results and improper parsing of JSON response.
# NB: ignore the above, this was fixed. Now have to find a way of removing bad search data, say data 1SD away from mean lat lon.

dataset_frame = pd.DataFrame(dataset)
dataset_frame

Unnamed: 0,addresses,build_status,category,flat_type,importance,latitude,lease_type,longitude,prices,subcategory
0,"Flat 14 Buchanan House, 7 Troubridge Square, L...",Newbuild,,Flat,,,Leasehold,,517275.0,
1,"Flat 26 Buchanan House, 7 Troubridge Square, L...",Newbuild,,Flat,,,Leasehold,,527175.0,
2,"91 Dames Road, London, E7 0DW",Non-Newbuild,highway,Terraced,0.61,51.553902,Freehold,0.023391,420000.0,secondary
3,"82 Ramsay Road, London, E7 9EW",Non-Newbuild,highway,Terraced,0.51,51.556909,Freehold,0.017585,420000.0,residential
4,"21 Mandela Street, London, SW9 6EL",Non-Newbuild,highway,Terraced,0.52,51.478533,Freehold,-0.110912,560000.0,residential
5,"115 Murray Road, London, W5 4DB",Non-Newbuild,highway,Terraced,0.51,51.493612,Leasehold,-0.313211,495000.0,residential
6,"55 Briar Avenue, London, SW16 3AB",Non-Newbuild,highway,Semi Detached,0.61,51.417612,Freehold,-0.119799,535000.0,residential
7,"26 Carlingford Road, London, N15 3EH",Non-Newbuild,highway,Terraced,0.61,51.589426,Freehold,-0.09747,500000.0,residential
8,"30 Darwin Road, London, W5 4BD",Non-Newbuild,highway,Terraced,0.51,51.494709,Freehold,-0.305955,820000.0,residential
9,"Flat 5 Peace Court, 8 Swynford Gardens, London...",Non-Newbuild,,Flat,,,Leasehold,,260000.0,


In [13]:
# We use this cell to check errneous data entries.
check = 31
print "https://nominatim.openstreetmap.org/search?q=\"{}\"&format=json".format(dataset_frame.addresses[check].replace(" ", "%20"))
print dataset_frame.latitude[check], dataset_frame.longitude[check]

https://nominatim.openstreetmap.org/search?q="83%20Ridge%20Road,%20London,%20N21%203EL"&format=json
41.6153771584 -72.0913489705
