In [169]:
import os
import requests
import json
import yaml
import pandas as pd
from google.cloud import bigquery
import google.api_core.exceptions as google_exceptions

In [170]:
# Load notebook settings (Google Maps API key, BigQuery project and dataset) from the local YAML file.
# The context manager closes the file handle deterministically instead of leaking it.
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

In [171]:
# Google Maps API key, taken from the loaded config rather than written into the notebook.
maps_key = config['google_maps_api_key']

In [172]:
# Directory (relative to the notebook's working directory) that holds the SQL query files.
query_dir = 'queries'

In [173]:
# Read every SQL file in `query_dir` into a dict keyed by file name (e.g. 'all_buildings.sql').
# Only regular files ending in '.sql' are loaded: os.listdir() also returns sub-directories
# (such as .ipynb_checkpoints) and stray files, and open() on a directory raises.
# sorted() makes the load order deterministic across platforms.
queries = {}
for query_file in sorted(os.listdir(query_dir)):
    query_path = os.path.join(query_dir, query_file)
    if not (query_file.endswith('.sql') and os.path.isfile(query_path)):
        continue
    with open(query_path, 'r') as query:
        queries[query_file] = query.read()

In [174]:
# BigQuery client created with default arguments: no project or credentials are passed here.
# NOTE(review): presumably these are resolved from the environment — confirm they match config["project_id"].
query_client = bigquery.Client()

In [175]:
def exists_table(table_reference, client):
    """Report whether `client.get_table(table_reference)` succeeds.

    Returns False only when the lookup raises `google_exceptions.NotFound`;
    any other exception propagates to the caller.
    """
    try:
        client.get_table(table_reference)
    except google_exceptions.NotFound:
        return False
    else:
        return True

In [176]:
# Pick the query to run: once the travel_times table exists, fetch only the buildings that
# have not been processed yet; otherwise fetch all buildings.
request = queries[
    'all_buildings_that_have_not_been_processed.sql'
    if exists_table(f'{config["project_id"]}.{config["dataset"]}.travel_times', query_client)
    else 'all_buildings.sql'
]

In [177]:
# Run the selected SQL in BigQuery and load the result into a pandas DataFrame.
real_estate_raw = query_client.query(request).to_dataframe()

In [178]:
# Inspect column dtypes and non-null counts of the query result.
real_estate_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 466 entries, 0 to 465
Data columns (total 4 columns):
ad_id             466 non-null int64
new_building      466 non-null bool
property_attrs    466 non-null object
address           465 non-null object
dtypes: bool(1), int64(1), object(2)
memory usage: 11.5+ KB


In [179]:
# Index the listings by ad_id.
# Guarded so that re-running this cell is a no-op: after the first run 'ad_id' is the index and
# no longer a column, so an unguarded set_index would raise KeyError.
if real_estate_raw.index.name != 'ad_id':
    real_estate_raw = real_estate_raw.set_index(keys='ad_id', drop=True)

In [180]:
# Keep only the columns other than new_building / property_attrs, then remove repeated rows.
# drop_duplicates() compares column values only (not the ad_id index) and keeps the first
# occurrence, so ads that share an address are represented by the first ad_id seen.
addresses = (
    real_estate_raw
    .drop(columns=['new_building', 'property_attrs'])
    .drop_duplicates()
    .copy()
)

In [181]:
# Check how many distinct addresses remain and whether any address is null.
addresses.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 410 entries, 78653360 to 151063588
Data columns (total 1 columns):
address    409 non-null object
dtypes: object(1)
memory usage: 6.4+ KB


In [182]:
# Preview the first few de-duplicated addresses (indexed by ad_id).
addresses.head()

Unnamed: 0_level_0,address
ad_id,Unnamed: 1_level_1
78653360,"Strandvegen 1, 2005 Rælingen"
78866228,"FLÅTESTADVEIEN 3, 1415 Oppegård"
82173385,"Røakollen - Aslakveien 20 - hus B, 0753 Oslo"
84047772,"Røakollen - Aslakveien 20 - Hus A, 0753 Oslo"
85787019,"Kværnerdammen borettslag. Utsikt, kveldssol, f..."


In [183]:
def get_directions(origin, destination, mode, key, timeout=30):
    """Send one request to the Google Maps Directions API and return the raw response.

    Parameters
    ----------
    origin, destination : str
        Start and end points, forwarded to the API unchanged.
    mode : str
        Travel mode (this notebook passes 'transit').
    key : str
        Google Maps API key.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to `requests.get`. Without a timeout a
        single stalled connection would block the per-address loop indefinitely.

    Returns
    -------
    requests.Response
        The HTTP response as returned by `requests.get`; the status code is not checked
        and the body is not parsed here.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not respond within `timeout`.
    """
    google_maps_url = 'https://maps.googleapis.com/maps/api/directions/json'
    params = {'destination': destination, 'origin': origin, 'key': key, 'mode': mode}
    return requests.get(google_maps_url, params=params, timeout=timeout)

In [184]:
def get_min_travel_time_sec(response):
    """Return the smallest total duration among the routes of a parsed Directions response.

    Parameters
    ----------
    response : dict
        Parsed JSON body. Each entry of ``response['routes']`` is expected to carry a
        ``'legs'`` list whose items have ``['duration']['value']``.

    Returns
    -------
    int or None
        For each route the leg durations are summed; the minimum of those sums is
        returned. None is returned when there are no routes, including when the
        ``'routes'`` key is missing or null (e.g. an error payload), which previously
        raised KeyError.
    """
    route_durations = [
        sum(leg['duration']['value'] for leg in route['legs'])
        # `or []` covers both a missing key and an explicit null.
        for route in response.get('routes') or []
    ]
    return min(route_durations, default=None)

In [197]:
def get_travel_times_table(maps_responses):
    """Build an (ad_id, destination, time_s) table from stored Directions responses.

    `maps_responses` must provide the columns `ad_id`, `destination` and `maps_response`,
    the latter holding JSON strings. Rows are emitted grouped by destination, in the order
    the destinations first appear; `time_s` is whatever `get_min_travel_time_sec` returns
    for the decoded response.
    """
    records = []
    for target in maps_responses.destination.unique():
        is_target = maps_responses.destination == target
        responses_by_ad = (
            maps_responses.loc[is_target].set_index('ad_id', drop=True).maps_response
        )
        records.extend(
            [ad_id, target, get_min_travel_time_sec(json.loads(raw_json))]
            for ad_id, raw_json in responses_by_ad.items()
        )
    return pd.DataFrame(records, columns=['ad_id', 'destination', 'time_s'])

In [186]:
def get_directions_jsons(destinations, address_table):
    """Query directions for every (destination, address) pair and collect the JSON bodies.

    `address_table` must have an `address` column; its index values are used as `ad_id`.
    One 'transit' request is made per pair via `get_directions`, using the module-level
    `maps_key`. Each response body is re-serialised with `json.dumps` and returned in a
    DataFrame with columns `ad_id`, `destination`, `maps_response`.
    """
    rows = [
        [ad_id, target, json.dumps(get_directions(origin, target, 'transit', maps_key).json())]
        for target in destinations
        for ad_id, origin in address_table.address.items()
    ]
    return pd.DataFrame(rows, columns=['ad_id', 'destination', 'maps_response'])

In [187]:
def get_origin_coordinates(maps_responses):
    """Extract the start coordinates of each ad from stored Directions responses.

    Parameters
    ----------
    maps_responses : pandas.DataFrame
        Must provide the columns `ad_id`, `destination` and `maps_response` (JSON strings).

    Returns
    -------
    pandas.DataFrame
        Columns `ad_id`, `lat`, `lng`. Only the rows of the first destination are read, so
        each ad appears once even when several destinations were queried. The coordinates
        come from the first leg of the first route; when the response has no route, or
        lacks any of the expected keys, `lat` and `lng` are None. An input with no rows
        yields an empty frame with the same columns instead of raising IndexError.
    """
    cols = ['ad_id', 'lat', 'lng']
    all_dest = maps_responses.destination.unique()
    if len(all_dest) == 0:
        return pd.DataFrame(columns=cols)

    first_dest_rows = maps_responses[maps_responses.destination == all_dest[0]]
    data = []
    for ad_id, response in first_dest_rows.set_index('ad_id', drop=True).maps_response.items():
        try:
            coords = json.loads(response)['routes'][0]['legs'][0]['start_location']
            lat, lng = coords['lat'], coords['lng']
        except (IndexError, KeyError):
            # No route found (empty list) or an error payload without the expected keys.
            lat = lng = None
        data.append([ad_id, lat, lng])
    return pd.DataFrame(data, columns=cols)

In [188]:
def get_destination_coordinates():
    """Placeholder with no implementation yet: takes no arguments and returns None."""
    return None

In [189]:
# The single destination that every address is routed to in this run.
common_destination = 'Nationaltheatret, Oslo'

In [190]:
# Issues one Directions API request per (destination, address) pair, so this cell is the slow one.
all_api_results = get_directions_jsons([common_destination], addresses)

In [191]:
# Preview the collected responses (one JSON string per ad/destination pair).
all_api_results.head()

Unnamed: 0,ad_id,destination,maps_response
0,78653360,"Nationaltheatret, Oslo","{""geocoded_waypoints"": [{""geocoder_status"": ""O..."
1,78866228,"Nationaltheatret, Oslo","{""geocoded_waypoints"": [{""geocoder_status"": ""O..."
2,82173385,"Nationaltheatret, Oslo","{""geocoded_waypoints"": [{""geocoder_status"": ""O..."
3,84047772,"Nationaltheatret, Oslo","{""geocoded_waypoints"": [{""geocoder_status"": ""O..."
4,85787019,"Nationaltheatret, Oslo","{""geocoded_waypoints"": [{""geocoder_status"": ""O..."


In [192]:
# Persist the raw API responses so they never have to be requested again.
# Project and dataset come from config (as the travel_times existence check does) instead of
# being hard-coded. if_exists='append' adds rows to an existing table, so re-running this
# cell appends the same rows a second time.
all_api_results.to_gbq(destination_table=f'{config["dataset"]}.google_maps_responses',
                       project_id=config['project_id'],
                       if_exists='append')

In [198]:
# Reduce every stored response to its shortest route duration.
travel_times = get_travel_times_table(all_api_results)

In [199]:
# Preview the computed travel times.
travel_times.head()

Unnamed: 0,ad_id,destination,time_s
0,78653360,"Nationaltheatret, Oslo",2329.0
1,78866228,"Nationaltheatret, Oslo",1918.0
2,82173385,"Nationaltheatret, Oslo",1097.0
3,84047772,"Nationaltheatret, Oslo",1097.0
4,85787019,"Nationaltheatret, Oslo",1135.0


In [200]:
# Check how many ads ended up without a travel time (null time_s means no route was returned).
travel_times.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 410 entries, 0 to 409
Data columns (total 3 columns):
ad_id          410 non-null int64
destination    410 non-null object
time_s         407 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 9.7+ KB


In [201]:
# Write to the same project/dataset that the travel_times existence check reads from config;
# with hard-coded names the two could drift apart and the incremental query would never be
# selected. if_exists='append' adds rows to an existing table, so re-running this cell
# appends the same rows a second time.
travel_times.to_gbq(destination_table=f'{config["dataset"]}.travel_times',
                    project_id=config['project_id'],
                    if_exists='append')

In [202]:
# Pull each ad's start coordinates out of the stored responses.
ad_coordinates = get_origin_coordinates(all_api_results)

In [203]:
# Check how many ads have no coordinates (null lat/lng means the response had no route).
ad_coordinates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 410 entries, 0 to 409
Data columns (total 3 columns):
ad_id    410 non-null int64
lat      407 non-null float64
lng      407 non-null float64
dtypes: float64(2), int64(1)
memory usage: 9.7 KB


In [204]:
# Persist the ad coordinates. Project and dataset come from config (as the travel_times
# existence check does) instead of being hard-coded. if_exists='append' adds rows to an
# existing table, so re-running this cell appends the same rows a second time.
ad_coordinates.to_gbq(destination_table=f'{config["dataset"]}.ad_coordinates',
                      project_id=config['project_id'],
                      if_exists='append')