In [1]:
import os
import json
import pickle
import requests
import urllib.parse
from enum import Enum
from urllib import parse
from collections import namedtuple

import folium
import xmltodict
import numpy as np
import pandas as pd
from numpy.linalg import norm
from branca import colormap


In [2]:
import warnings

# Install a blanket "ignore" filter: every warning category is silenced for the
# remainder of the kernel session (this also hides pandas/numpy deprecation notices).
warnings.simplefilter('ignore')

In [3]:
# The Zillow Web Services ID (ZWSID) is a credential. It must be supplied through
# the environment (e.g. `export ZILLOW_WSID=...` before launching Jupyter) and must
# never be written into the notebook itself. Any key that was ever committed in
# this cell should be treated as leaked and rotated.
if not os.environ.get('ZILLOW_WSID'):
    raise RuntimeError(
        'ZILLOW_WSID is not set; export your Zillow Web Services ID '
        'before running this notebook.'
    )

In [4]:
# Background reading used while building this notebook, kept as a single
# newline-delimited string (with a leading and a trailing newline).
_reference_urls = (
    'https://www.zillow.com/research/data/',
    'https://python-visualization.github.io/folium/plugins.html',
    'https://python-graph-gallery.com/292-choropleth-map-with-folium/',
    'http://colorbrewer2.org/#type=sequential&scheme=Oranges&n=3',
    'https://github.com/python-visualization/folium/blob/master/examples/GeoJSON_and_choropleth.ipynb',
    'https://nbviewer.jupyter.org/github/python-visualization/folium/blob/master/examples/GeoJSON_and_choropleth.ipynb',
    'https://towardsdatascience.com/data-101s-spatial-visualizations-and-analysis-in-python-with-folium-39730da2adf',
)
some_references = '\n' + '\n'.join(_reference_urls) + '\n'

# Create a Heatmap of Housing Prices in Atlanta, GA

In [5]:
# Immutable record for one home: Zillow property id, coordinates, and the
# valuation figure parsed from the API response.
ZillowHomeInfo = namedtuple('ZillowHomeInfo', ['zpid', 'lat', 'lon', 'value'])

In [6]:
# Single HTTP session shared by every Zillow request made in this notebook.
session = requests.Session()

def get_search_results(address, city, state):
    """Query Zillow's GetSearchResults endpoint and return the valued homes found.

    Parameters
    ----------
    address : str
        Value sent as the ``address`` query parameter.
    city, state : str
        Combined as ``'<city>, <state>'`` for the ``citystatezip`` parameter.

    Returns
    -------
    list of ZillowHomeInfo
        One entry per result that carries a zpid, a regional ``zindexValue``
        and both coordinates. The list is empty when the HTTP request fails,
        the payload does not have the expected structure, or Zillow reports a
        non-zero message code.
    """
    homes = list()

    params = {
        'zws-id': os.environ['ZILLOW_WSID'],
        'address': address,
        'citystatezip': '{}, {}'.format(city, state)
    }
    response = session.get('https://www.zillow.com/webservice/GetSearchResults.htm', params=params)
    if not response.ok:
        # No exception is active at this point, so traceback.print_exc() would
        # print nothing useful; report the HTTP status instead.
        print('GetSearchResults request failed with HTTP status {}'.format(response.status_code))
        return homes

    d = xmltodict.parse(response.text)
    try:
        results = d['SearchResults:searchresults']['response']['results']['result']
        code = int(d['SearchResults:searchresults']['message']['code'])
    except Exception:
        import traceback
        traceback.print_exc()
        return homes

    if code != 0:
        return homes

    # xmltodict produces a mapping (not a list) when there is exactly one
    # <result> element; normalise so a lone match is not silently dropped.
    if isinstance(results, dict):
        results = [results]
    if not isinstance(results, list):
        return homes

    for res in results:
        zpid = res.get('zpid')
        real_estate = res.get('localRealEstate')
        if zpid is None or real_estate is None:
            continue

        # look for the valuation and the coordinates; skip incomplete records
        value = (real_estate.get('region') or {}).get('zindexValue')
        location = res.get('address')  # separate name: do not shadow the `address` argument
        if value is None or location is None:
            continue

        lat, lon = location.get('latitude'), location.get('longitude')
        if lat is None or lon is None:
            continue

        homes.append(ZillowHomeInfo(
            zpid=zpid,
            lat=float(lat),
            lon=float(lon),
            value=float(value.replace(',', ''))  # values arrive with thousands separators
        ))

    return homes


def get_comps(zpid):
    """Yield comparable homes ("comps") for a property from Zillow's GetComps endpoint.

    Parameters
    ----------
    zpid : str or int
        Zillow property id whose comparables are requested (up to 25).

    Yields
    ------
    ZillowHomeInfo
        One record per comp that carries a zpid, both coordinates and a
        regional ``zindexValue``. Nothing is yielded when the HTTP request
        fails, the payload does not have the expected structure, or Zillow
        reports a non-zero message code.
    """

    def to_home(comp):
        """Convert one parsed <comp> mapping to ZillowHomeInfo, or None if incomplete."""
        try:
            location = comp.get('address') or {}
            region = (comp.get('localRealEstate') or {}).get('region') or {}
            fields = (
                comp.get('zpid'),
                location.get('latitude'),
                location.get('longitude'),
                region.get('zindexValue'),
            )
        except (AttributeError, TypeError):
            # the entry (or one of its children) is not a mapping
            return None

        if any(field is None for field in fields):
            return None

        comp_zpid, lat, lon, value = fields
        return ZillowHomeInfo(
            zpid=comp_zpid,
            lat=float(lat),
            lon=float(lon),
            value=float(value.replace(',', ''))  # values arrive with thousands separators
        )

    # make call to Zillow to get comps response (https: the request carries the API key)
    params = {
        'zws-id': os.environ['ZILLOW_WSID'],
        'zpid': zpid,
        'count': 25
    }
    response = session.get('https://www.zillow.com/webservice/GetComps.htm', params=params)
    if not response.ok:
        # No exception is active here, so traceback.print_exc() would print
        # nothing useful; report the HTTP status instead.
        print('GetComps request failed with HTTP status {}'.format(response.status_code))
        return

    # parse the response and locate the list of comps
    d = xmltodict.parse(response.text)
    try:
        code = int(d['Comps:comps']['message']['code'])
        if code != 0:
            return
        comp_list = d['Comps:comps']['response']['properties']['comparables']['comp']
    except (KeyError, TypeError, ValueError):
        # An unexpected payload for one property should not abort the caller's crawl.
        import traceback
        traceback.print_exc()
        return

    # xmltodict produces a mapping (not a list) when there is exactly one <comp>;
    # iterating that mapping would walk its keys and lose the record.
    if isinstance(comp_list, dict):
        comp_list = [comp_list]
    if not isinstance(comp_list, list):
        return

    for comp in comp_list:
        comp_info = to_home(comp)
        if comp_info is not None:
            yield comp_info

                
def make_criterion(x_ref, r_max):
    """Build a predicate that tells whether a home lies strictly within ``r_max`` of ``x_ref``.

    The distance is the Euclidean norm of ``[home.lat, home.lon] - x_ref``,
    i.e. it is measured in the same units as the coordinates themselves.

    Parameters
    ----------
    x_ref : array-like of two numbers
        Reference point as ``[lat, lon]``.
    r_max : float
        Exclusive upper bound on the distance.

    Returns
    -------
    callable
        Function taking an object with ``lat`` and ``lon`` attributes and
        returning a truthy value when it is closer than ``r_max``.
    """

    def is_within_radius(home):
        offset = np.array([home.lat, home.lon]) - x_ref
        distance = norm(offset)
        return distance < r_max

    return is_within_radius


In [None]:
# Build the set of homes used for the heatmap, either by crawling Zillow
# comps outward from a seed search or by reloading a previous crawl.
data_fn = 'data/data2.pkl'
# set to True to generate a new heatmap for a city
generate_new = True

# Search configuration. Defined outside the branch below so that `city`
# (used for the heatmap file name in a later cell) exists on the cached path too.
address = 'Atlanta'
city = 'Atlanta'
state = 'GA'

if generate_new:
    node_map = dict()    # zpid -> zpids of the comps returned for that home
    all_nodes = set()    # zpids that have already been queued for traversal
    node_data = dict()   # zpid -> ZillowHomeInfo
    center = np.array([33.7763, -84.3855])
    r_max = 0.1

    n_iter = 0
    size_target = 25000
    max_iter = size_target

    # get initial results to seed the graph
    homes = get_search_results(
        address=address,
        city=city,
        state=state
    )
    if len(homes) == 0:
        raise ValueError('No results found for {} in {}.'.format(address, city))
    mean_position = np.array([[home.lat, home.lon] for home in homes]).mean(axis=0)
    print('Mean results position: {}'.format(mean_position))
    meets_criteria = make_criterion(center, r_max)

    # Keep only the seeds inside the search radius. A new list is built because
    # deleting from `homes` while enumerating it skips the element that follows
    # each deletion, which let out-of-range seeds stay in the queue.
    homes = [home for home in homes if meets_criteria(home)]
    for home in homes:
        node_data[home.zpid] = home
        # mark seeds as seen so they are not queued again when returned as comps
        all_nodes.add(home.zpid)

    done = False
    while len(homes) > 0 and not done:
        n_iter += 1

        # take the most recently queued home
        home = homes.pop(-1)

        comps = get_comps(home.zpid)
        for comp in comps:

            if comp.zpid not in all_nodes and meets_criteria(comp):
                # add to set of all nodes
                all_nodes.add(comp.zpid)

                # add to the queue
                homes.append(comp)

                # add to the id -> home info mapping
                node_data[comp.zpid] = comp

            # add to the id -> comps mapping (every comp, in range or not)
            node_map.setdefault(home.zpid, list()).append(comp.zpid)

        # iterate until we have reached the desired condition
        if len(node_data) >= size_target:
            print('Desired # elements reached')
            done = True
            print(len(node_data), len(homes))
        elif n_iter == max_iter:
            print('Maximum # iterations reached')
            done = True
            print(len(node_data), len(homes))
        elif len(homes) == 0:
            print('All homes traversed/dead end reached.')
            done = True
            print(len(node_data), len(homes))
        # periodically print out how many homes we've found
        elif n_iter % 25 == 0:
            print(len(node_data))

    # make sure the target directory exists before caching the crawl
    os.makedirs(os.path.dirname(data_fn) or '.', exist_ok=True)
    with open(data_fn, 'wb') as fp:
        pickle.dump({'node_data': node_data, 'node_map': node_map}, fp)
else:
    # SECURITY: pickle.load can execute arbitrary code; only load a cache file
    # that this notebook wrote itself.
    with open(data_fn, 'rb') as fp:
        data = pickle.load(fp)
    node_data, node_map = data['node_data'], data['node_map']

print('# houses: {}'.format(len(node_data)))


Mean results position: [ 33.76791759 -84.42061576]
43
83
153
186
199
235
243
263
515
612
618
629
630
631
632
685
690
699
765
765
775
966
970
976
984
990
996
1004
1076
1076
1505
1602
1620
1631
1641
1655
1800
1861
1892
1935
1968
2013
2016
2031
2031
2052
2096
2096
2102
2125
2139
2172
2194
2206
2306
2341
2352
2358
2403
2424
2437
2437
2437
2442
2442
2509
2536
2557
2570
2590
2591
2667
2686
2712
2718
2719
2724
2728
2728
2767
2767
2800
2828
2891
2946
2946
2954
2959
3088
3089
3091
3091
3097
3101
3106
3113
3209
3272
3290
3308
3309
3338
3407
3409
3413
3413
3441
3571
3606
3615
3632
3639
3640
3646
3653
3660
3663
3663
3678
3678
3678
3707
3710
3711
3763
3766
3766
3785
3789
3849
3906
3927
3972
4005
4018
4038
4055
4058
4105
4115
4121
4151
4168
4178
4178
4230
4246
4281
4340
4373
4414
4431
4435
4435
4444
4447
4448
4448
4451
4453
4461
4478
4481
4482
4483
4508
4559
4562
4571
4573
4573
4577
4577
4577
4579
4579
4591
4603
4626
4631
4641
4650
4653
4653
4660
4665
4671
4682
4682
4682
4682
4683
4683
4683
4687
468

In [8]:
from folium.plugins import HeatMap

# One (lat, lon, value) row per collected home.
# NOTE(review): the raw valuation is passed as the heat weight without any
# scaling - confirm this is the intended weighting for the plugin.
data = np.array([
    (home.lat, home.lon, home.value)
    for home in node_data.values()
])

radius = 15

# Centre the map on the mean coordinate of all homes.
m = folium.Map(
    location=data[:, :2].mean(axis=0),
    zoom_start=12,
    control_scale=True
)

hm = HeatMap(data, radius=radius, blur=30)
hm.add_to(m)

# Write a standalone HTML copy named after the (lower-cased) city.
m.save(city.lower() + '_heatmap.html')

# last expression of the cell: renders the map inline in the notebook
m

# Visualize the Price per Sq Ft of US States

In [9]:
def local_states_geo_json():
    """Return the local path of the US-states GeoJSON file, downloading it on first use.

    The file is cached at ``data/us-states.json`` (relative to the current
    working directory). When it is missing it is fetched from the folium
    example data on GitHub and written there; the ``data`` directory is
    created if necessary.

    Returns
    -------
    str
        Relative path to the cached GeoJSON file.

    Raises
    ------
    requests.HTTPError
        If the download responds with an error status.
    """
    fn_states_geo = os.path.join('data', 'us-states.json')
    if not os.path.exists(fn_states_geo):
        url = 'https://raw.githubusercontent.com/python-visualization/folium/master/examples/data'
        us_states = '{}/us-states.json'.format(url)

        response = requests.get(us_states)
        # fail loudly on 4xx/5xx instead of trying to parse an error page as JSON
        response.raise_for_status()
        geo_json_states = response.json()

        # the cache directory may not exist on a fresh checkout
        os.makedirs(os.path.dirname(fn_states_geo), exist_ok=True)
        with open(fn_states_geo, 'w') as fp:
            json.dump(geo_json_states, fp)

    return fn_states_geo


def make_color_map(df, col_keys, col_data):
    """Build a colour scale and a folium style function for per-region values.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding one row per region.
    col_keys : str
        Column whose values identify a region; they are matched against the
        ``'id'`` entry of each GeoJSON feature passed to the style function.
    col_data : str
        Column with the numeric value to colour by.

    Returns
    -------
    (cmap, value_to_color)
        ``cmap`` is the branca colour scale spanning the 5th to 95th
        percentile of ``col_data`` (bounds truncated to integers);
        ``value_to_color`` maps a GeoJSON feature to a style dict. Features
        whose id is unknown, or whose value is missing, are coloured as 0.

    Notes
    -----
    Side effect (kept for backward compatibility): ``df`` is re-indexed in
    place by ``col_keys``, so that column is no longer a regular column of the
    caller's frame afterwards and the function cannot be called twice on the
    same frame.
    """
    keys = df[col_keys].values
    values = df[col_data].values

    # Use the column names that were passed in rather than hard-coded ones;
    # rows without a value are left out so they fall back to the default below.
    state_to_value = {
        key: value for key, value in zip(keys, values) if not pd.isna(value)
    }
    df.set_index(col_keys, drop=True, inplace=True)

    # nanpercentile equals percentile when nothing is missing, and avoids
    # int(nan) when something is.
    cmap = colormap.linear.OrRd_07.scale(
        int(np.nanpercentile(values, 5)),
        int(np.nanpercentile(values, 95))
    )
    # NOTE(review): the caption says "$ in thousands" - confirm the unit
    # against the source data before publishing.
    cmap.caption = 'Zillow Median Price Per Square Foot ($ in thousands)'

    def value_to_color(entry):
        return {
            'fillColor': cmap(state_to_value.get(entry['id'], 0)),
            'weight': 1,
            'fillOpacity': 1.0,
        }

    return cmap, value_to_color


In [12]:
# load data downloaded from Zillow
df = pd.read_csv('data/State_MedianValuePerSqft_AllHomes.csv')

# Keep the state code and the last column of the CSV, renamed to a stable name.
# rename() returns a new frame, so nothing is modified in place on a slice of
# `df` (which is what triggers pandas' SettingWithCopyWarning).
data = df[['State', df.columns[-1]]].rename(columns={df.columns[-1]: 'MedianPPSQFT'})

# create a color map
cmap, f_value_to_color = make_color_map(data, col_keys='State', col_data='MedianPPSQFT')

# add the US states Geo JSON data to the Map
state_values = folium.GeoJson(
    local_states_geo_json(),
    style_function=f_value_to_color
)

m = folium.Map(location=[37, -102], zoom_start=4)
cmap.add_to(m)
state_values.add_to(m)
m.save('ppsqft_map.html')

# last expression of the cell: renders the map inline in the notebook
m


In [11]:
# Ten most expensive states by median price per square foot. The column is
# named explicitly: `data.columns[0]` only refers to the price column when
# 'State' has already been moved into the index.
data.sort_values('MedianPPSQFT', ascending=False).head(10)

Unnamed: 0_level_0,MedianPPSQFT
State,Unnamed: 1_level_1
DC,507
HI,499
CA,355
MA,251
CO,248
WA,244
UT,236
OR,213
RI,198
NJ,191
