In [None]:
import json
from collections import namedtuple
from datetime import datetime
from pprint import pprint as pprint
from re import sub
from urllib import quote
from urllib2 import urlopen, Request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup

%matplotlib inline

In [None]:
# Fetch the first page of the London sold-price listings and parse it for scraping.
url = "https://nethouseprices.com/house-prices/london?page=1"
# Disabled alternative: send the request with a browser-like User-Agent header.
# headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'}
# req = Request(url, None, headers)
# html = urlopen(req)
html = urlopen(url)
# NOTE(review): "xml" selects BeautifulSoup's XML parser, while get_geodata_object uses "lxml";
# the page is presumably HTML - confirm that this parser choice is intentional.
soup = BeautifulSoup(html, "xml")
# TO-DO: segment HTML downloading into chunks, and see if HTML can be saved to disk, and scraped by BeautifulSoup from disk.
# Also determine how much of the data to use as a dataset, and turn all the processing code into functions, so the
# processing can be run across multiple URLs to get one dataset to model on.
# The next step is to do k-Means Clustering and get dummies, then run Random Forest Regressor, and write a RESTful API.
# Candidate URL list for the full crawl:
# return ["https://nethouseprices.com/house-prices/london?page={}".format(i) for i in xrange(1,12484)]

In [None]:
# Plan (currently disabled): download the HTML for the entire dataset in chunks and write it to disk, opening the files only when we need them.
# def get_url_list(generic_url_string, start_page, end_page):
#     return ["https://nethouseprices.com/house-prices/london?page={}".format(i) for i in xrange(start_page, end_page)]

# def write_html_to_disk(url, start, end):
#     print start, end
#     f = urlopen(url)
#     data = f.read()
#     with open("{}_{}_raw.html".format(start, end), "a+") as html_file:
#         html_file.write(data)

# def write_multiple_html_to_disk_from_list(start, end, chunk_size, url_list):
#     start_indices = list(xrange(start, end, chunk_size))
#     end_index_offset = chunk_size-1
#     [[write_html_to_disk(url, s, s + end_index_offset) for url in url_list[start-1:end-1]] for s in start_indices]

    
# url_list = get_url_list("https://nethouseprices.com/house-prices/london?page={}", 1, 12484)
# write_multiple_html_to_disk_from_list(0, 10, 10, url_list)

In [None]:
# The tag names and CSS classes below were identified from element inspections
# via the Chrome developer console.
addresses, prices, details = [
    soup.find_all(tag_name, class_=css_class)
    for tag_name, css_class in (
        ("strong", "street-details-head-row"),
        ("strong", "street-details-price-row"),
        ("div", "street-details-row"),
    )
]
print(addresses)

In [None]:
# The sale date is read from the last <td> cell of every sold-price table row.
sale_dates_rows = soup.find_all("tr", class_="sold_price_row")
sale_date_strings = []
for row in sale_dates_rows:
    cells = row.findChildren('td')
    sale_date_strings.append(cells[-1].text)
def parse_date(date_string):
    """Normalise a scraped sale date such as "21st March 2017" to "21 March 2017".

    Alphabetic characters (the ordinal suffix) are removed from the first
    whitespace-separated token; the remaining tokens are kept unchanged.
    The string is split on runs of any whitespace, so leading/trailing
    padding, newlines or doubled spaces in the scraped cell text do not end
    up in the result (they would make the later strptime call fail).

    Args:
        date_string: raw date text, day first.

    Returns:
        The cleaned date with tokens joined by single spaces, or an empty
        string when the input contains no tokens.
    """
    tokens = date_string.split()
    if not tokens:
        return ""
    day = "".join(char for char in tokens[0] if not char.isalpha())
    return " ".join([day] + tokens[1:])

# Clean every scraped date string, then parse it as "<day> <month name> <year>".
cleaned_sale_date_strings = list(map(parse_date, sale_date_strings))
# print cleaned_sale_date_strings
sale_dates = []
for cleaned in cleaned_sale_date_strings:
    sale_dates.append(datetime.strptime(cleaned, "%d %B %Y"))
# sale_dates

In [None]:
# Build the cleaned data series from the scraped elements.

# Addresses: the link text, with non-breaking spaces replaced by plain spaces.
addr = [tag.find("a").string.replace(u"\xa0", " ") for tag in addresses]

# Prices: drop the pound sign and the thousands separators before converting.
pxs = []
for tag in prices:
    digits = tag.string.replace(u"\xa3", "").replace(u",", "")
    pxs.append(float(digits))

# Details: comma-separated text; the first three fields are read below as
# flat type, lease type and build status.
property_characteristics = [
    [field.strip() for field in tag.string.split(",")]
    for tag in details
]
flat_type = [fields[0] for fields in property_characteristics]
lease_type = [fields[1] for fields in property_characteristics]
build_status = [fields[2] for fields in property_characteristics]

In [None]:
# One Nominatim search URL per address. The address is percent-encoded in full
# (not just its spaces), so characters such as "&", "#", "+" or "%" cannot
# break or truncate the query string. The surrounding double quotes of the
# search phrase are sent pre-encoded as %22.
geodata_urls = [
    "https://nominatim.openstreetmap.org/search?q=%22{}%22&format=json".format(
        quote(address.encode("utf-8"), safe=""))
    for address in addr
]
    
def get_geodata_object(openstreetmap_api_url):
    """Fetch a geocoding URL and return its decoded JSON response.

    The response body is handed to json.loads directly rather than being
    passed through an HTML parser first, which could drop markup-like text or
    decode entities inside JSON string values. The response is closed once it
    has been read.

    Args:
        openstreetmap_api_url: URL to request; expected to return JSON.

    Returns:
        The object produced by json.loads (presumably a list of result dicts,
        since callers take len() of it and index element 0).

    Raises:
        ValueError: if the body is not valid JSON. Errors raised by urlopen
        propagate unchanged.
    """
    response = urlopen(openstreetmap_api_url)
    try:
        return json.loads(response.read())
    finally:
        response.close()

def convert_json_to_named_tuple(json_):
    """Decode a JSON string, turning every JSON object into a namedtuple 'X'.

    This exists purely for convenience, so that JSON response attributes can
    be referenced as fields (record.lat) while building the dataset. JSON
    arrays and scalars are decoded as usual.
    """
    def _as_record(mapping):
        # The field names are the object's keys; values are passed in matching order.
        record_type = namedtuple('X', mapping.keys())
        return record_type(*mapping.values())

    return json.loads(json_, object_hook=_as_record)

In [None]:
# Query the geocoder once per address URL.
json_search_results = []
for url in geodata_urls:
    json_search_results.append(get_geodata_object(url))

# Keep only the first hit of each response; an empty response is kept as-is.
top_search_results = []
for hits in json_search_results:
    top_search_results.append(hits if len(hits) == 0 else hits[0])
# for i in top_search_results:
#     pprint(i)

In [None]:
# We now convert these top search results back to JSON to make named tuples for ease of referencing in dataseries creation.
# top_search_results_as_strings = ['[{}]'.format(str(i)) if type(i) == dict else '{}'.format(i) for i in top_search_results]

# "class" is a Python keyword and cannot be a namedtuple field, so the "class"
# and "type" keys are renamed before serialising. The renaming is applied to
# dictionary keys only; replacing text in the serialised JSON would also
# rewrite any value (or other key) that happens to contain "class" or "type".
_GEODATA_KEY_RENAMES = {"class": "category", "type": "subcategory"}


def _rename_geodata_keys(value):
    """Return a copy of value with "class"/"type" dict keys renamed, recursing into dicts and lists."""
    if isinstance(value, dict):
        return {_GEODATA_KEY_RENAMES.get(key, key): _rename_geodata_keys(item)
                for key, item in value.items()}
    if isinstance(value, list):
        return [_rename_geodata_keys(item) for item in value]
    return value


top_search_results_as_json = [json.dumps(_rename_geodata_keys(result)) for result in top_search_results]
geodata_json = [convert_json_to_named_tuple(result) for result in top_search_results_as_json]

In [None]:
def load_geodata_attributes(geodata_obj):
    """Extract (category, subcategory, importance, lon, lat) from a geodata record.

    importance, lon and lat are converted to float. If the object lacks any
    of the attributes - which is what happens when the JSON loader did not
    find a JSON response from the API URL - a tuple of five NaNs is returned.
    """
    missing = (np.nan,) * 5
    try:
        kind = geodata_obj.category
        subkind = geodata_obj.subcategory
        weight = float(geodata_obj.importance)
        lon = float(geodata_obj.lon)
        lat = float(geodata_obj.lat)
    except AttributeError:
        return missing
    return (kind, subkind, weight, lon, lat)

# TO-DO: add data parsed from display_name about borough, as an alternative to k-Means Clustering, to the data frame.
# json_search_results = [get_geodata_object(url) for url in geodata_urls]
geodata = list(map(load_geodata_attributes, geodata_json))

# Split the 5-tuples into one list per attribute, in the order in which
# load_geodata_attributes returns them.
category = [attrs[0] for attrs in geodata]
subcategory = [attrs[1] for attrs in geodata]
importance = [attrs[2] for attrs in geodata]
longitude = [attrs[3] for attrs in geodata]
latitude = [attrs[4] for attrs in geodata]

In [None]:
# We construct our dataset.
variables = [addr, pxs, sale_dates, flat_type, lease_type, build_status, category, subcategory, importance, longitude, latitude]
series_names = ["addresses",
                "prices",
                "sale_dates",
                "flat_type",
                "lease_type",
                "build_status",
                "category",
                "subcategory",
                "importance",
                "longitude",
                "latitude"]

# Check all series are the same length. Every length is compared (an average
# can match even when individual lengths differ), and a mismatch raises
# instead of silently leaving `dataset` undefined or stale from an earlier run.
series_lengths = {series_name: len(series) for series_name, series in zip(series_names, variables)}
if len(set(series_lengths.values())) > 1:
    raise ValueError("Scraped series have mismatched lengths: {}".format(series_lengths))

# Setup dictionary for dataframe.
dataset = dict(zip(series_names, variables))

In [None]:
# Earlier note: beware some erroneous lon-lat data due to multiple search results and improper parsing of the JSON response.
# NB: ignore the above, this was fixed. What remains is to find a way of removing bad search data, e.g. data far away
# from the mean lat/lon.

# Build the data frame from the dictionary of equally long series; each key becomes a column.
dataset_frame = pd.DataFrame(dataset)
dataset_frame.head()

In [None]:
# Add a z-score column for each coordinate. These are used to exclude anomalous
# lat/lon coordinates caused by bad Open Street Map API data.
for coordinate in ("latitude", "longitude"):
    coordinate_values = dataset_frame[coordinate]
    dataset_frame[coordinate + "_z_score"] = (
        (coordinate_values - coordinate_values.mean()) / coordinate_values.std()
    )
dataset_frame.head()

In [None]:
# Create mask columns that hold NaN wherever the absolute lat/lon z-score is not
# below 3, indicating likely bad data. Assuming a normal distribution, such
# coordinates are statistically significantly different from the average
# coordinates of all other datapoints. Given all the houses should be in
# London, this is a good filter.
for z_score_column, mask_column in (("latitude_z_score", "lat_z_score_mask"),
                                    ("longitude_z_score", "lon_z_score_mask")):
    z_scores = dataset_frame[z_score_column]
    dataset_frame[mask_column] = z_scores.where(z_scores.abs() < 3)
dataset_frame.head()

In [None]:
# One-hot encode every non-numerical data series.
build_status_dummies = pd.get_dummies(dataset_frame["build_status"])
flat_type_dummies = pd.get_dummies(dataset_frame["flat_type"])
lease_type_dummies = pd.get_dummies(dataset_frame["lease_type"])
category_dummies = pd.get_dummies(dataset_frame["category"])
subcategory_dummies = pd.get_dummies(dataset_frame["subcategory"])


# Append the dummy columns to the right of the existing frame.
dataframes_set = [
    dataset_frame,
    build_status_dummies,
    flat_type_dummies,
    lease_type_dummies,
    category_dummies,
    subcategory_dummies,
]
# for dataframe in dataframes_set:
#     dataframe.reset_index(drop=True)

dataset_frame = pd.concat(dataframes_set, axis=1)
# dataset_frame = pd.concat([dataset_frame, pd.get_dummies(dataset_frame.lease_type)], axis=1)
dataset_frame.head()

In [None]:
# Drop every row that contains a NaN in any column.
dataset_frame = dataset_frame.dropna()

# Remove the original categorical columns and the z-score helper columns, then renumber the rows.
cols_to_drop = (
    ['build_status', 'flat_type', 'lease_type', 'category', 'subcategory']
    + ['latitude_z_score', 'longitude_z_score', 'lat_z_score_mask', 'lon_z_score_mask']
)
final_dataset_frame = dataset_frame.drop(cols_to_drop, axis=1).reset_index(drop=True)
final_dataset_frame.head()