In [43]:
import os
import requests
import json
import yaml
import pandas as pd
from google.cloud import bigquery
import google.api_core.exceptions as google_exceptions

In [21]:
# Load notebook settings; the `with` block closes the file handle as soon as
# parsing finishes instead of leaving it open for the life of the kernel.
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

In [22]:
# Google Maps API key from the loaded config; passed as `key` to the Directions requests.
maps_key = config['google_maps_api_key']

In [23]:
# Directory whose files are read into `queries`, keyed by file name.
query_dir = 'queries'

In [24]:
# Read every file in `query_dir` into a dict mapping file name -> file contents.
queries = {}
for query_file in os.listdir(query_dir):
    query_path = os.path.join(query_dir, query_file)
    # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints): open() would
    # raise on them and abort the whole cell.
    if not os.path.isfile(query_path):
        continue
    with open(query_path, 'r') as query:
        queries[query_file] = query.read()

In [25]:
# BigQuery client used for the table-existence check and for running the selected query.
query_client = bigquery.Client()

In [26]:
def exists_table(table_reference, client):
    """Report whether `client` can look up `table_reference`.

    :param table_reference: table identifier handed to ``client.get_table``.
    :param client: object exposing ``get_table`` (a BigQuery client in this notebook).
    :return: ``False`` when the lookup raises ``NotFound``, ``True`` otherwise.
        Any other exception propagates to the caller.
    """
    try:
        client.get_table(table_reference)
    except google_exceptions.NotFound:
        return False
    return True

In [27]:
# Once a travel_times table exists, fetch only the buildings that have not
# been processed yet; otherwise fetch all buildings.
travel_times_table = f'{config["project_id"]}.{config["dataset"]}.travel_times'
request = queries[
    'all_buildings_that_have_not_been_processed.sql'
    if exists_table(travel_times_table, query_client)
    else 'all_buildings.sql'
]

In [28]:
# Run the selected query and load the full result into a DataFrame.
real_estate_raw = query_client.query(request).to_dataframe()

In [29]:
# Column dtypes and non-null counts of the fetched rows.
real_estate_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1572 entries, 0 to 1571
Data columns (total 4 columns):
ad_id             1572 non-null int64
new_building      1572 non-null bool
property_attrs    1572 non-null object
address           1572 non-null object
dtypes: bool(1), int64(1), object(2)
memory usage: 38.5+ KB


In [30]:
# Use ad_id as the index, modifying the frame in place.
# NOTE(review): this cell is not re-runnable on its own — after the first run 'ad_id' is no
# longer a column, so re-run the query cell before running this one again.
real_estate_raw.set_index(keys='ad_id', drop=True, inplace=True)

In [31]:
# Keep only the address column and drop repeated addresses.
# NOTE(review): drop_duplicates compares column values only (not the ad_id index), so ads that
# share an address collapse to the first ad_id seen — 1572 rows become 949 in the recorded output.
# Confirm that the dropped ad_ids are meant to go without their own travel-time row.
addresses = real_estate_raw.drop(['new_building', 'property_attrs'], axis=1).drop_duplicates().copy()

In [32]:
# Row count and dtypes after de-duplicating addresses.
addresses.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 949 entries, 85287718 to 150541271
Data columns (total 1 columns):
address    949 non-null object
dtypes: object(1)
memory usage: 14.8+ KB


In [33]:
# Preview the first few addresses (indexed by ad_id).
addresses.head()

Unnamed: 0_level_0,address
ad_id,Unnamed: 1_level_1
85287718,"Innspurten 6, 0663 Oslo"
90151696,"Ammerudveien 62, 0958 Oslo"
91411206,"Lillebergtunet borettslag, Hus A, trinn 1, 066..."
93367028,"Frysjaveien 42 - trinn 1, 0884 Oslo"
95247107,"Lillebergtunet borettlslag, Hus B, trinn 2, Ma..."


In [34]:
def get_directions(origin, destination, mode, key, timeout=30):
    """Request directions from the Google Maps Directions API (JSON endpoint).

    :param origin: value sent as the ``origin`` query parameter.
    :param destination: value sent as the ``destination`` query parameter.
    :param mode: value sent as the ``mode`` query parameter (the notebook uses 'transit').
    :param key: API key sent as the ``key`` query parameter.
    :param timeout: seconds passed to ``requests.get`` as its timeout. Without
        one, a stalled connection would block the whole batch of requests
        indefinitely.
    :return: the ``requests.Response`` object; the HTTP status is not checked here.
    :raises requests.exceptions.Timeout: if the server does not answer in time.
    """
    google_maps_url = 'https://maps.googleapis.com/maps/api/directions/json'
    params = {'destination': destination, 'origin': origin, 'key': key, 'mode': mode}
    return requests.get(google_maps_url, params=params, timeout=timeout)

In [52]:
def get_min_travel_time_sec(response):
    """Return the smallest total route duration in a parsed Directions response.

    A route's total is the sum of ``leg['duration']['value']`` over its legs.

    :param response: parsed response dict containing a ``'routes'`` list.
    :return: the minimum per-route total, or ``None`` when ``'routes'`` is empty.
    """
    route_totals = (
        sum(leg['duration']['value'] for leg in route['legs'])
        for route in response['routes']
    )
    return min(route_totals, default=None)

In [68]:
def get_travel_times_table(maps_responses):
    """Build a table of the shortest travel time per (ad_id, destination).

    :param maps_responses: DataFrame with columns ``ad_id``, ``destination``
        and ``maps_response`` (a JSON string), as produced by
        ``get_directions_jsons``.
    :return: DataFrame with columns ``ad_id``, ``destination`` and ``time_s``;
        ``time_s`` holds whatever ``get_min_travel_time_sec`` returns for the
        decoded response.
    """
    cols = ['ad_id', 'destination', 'time_s']
    # get_directions_jsons names the column 'destination'; reading `.dest`
    # raised AttributeError on such frames. Fall back to 'dest' so frames
    # built with that older column name keep working.
    dest_col = 'destination' if 'destination' in maps_responses.columns else 'dest'
    data = []
    for dest in maps_responses[dest_col].unique():
        sub_df = maps_responses[maps_responses[dest_col] == dest].set_index('ad_id', drop=True)
        for ad_id, response in sub_df.maps_response.items():
            min_time = get_min_travel_time_sec(json.loads(response))
            data.append([ad_id, dest, min_time])
    return pd.DataFrame(data, columns=cols)

In [39]:
def get_directions_jsons(destinations, address_table):
    """Fetch a transit Directions response for every (destination, address) pair.

    Uses the module-level ``maps_key`` as the API key.

    :param destinations: iterable of destination strings.
    :param address_table: DataFrame with an ``address`` column; its index
        values are stored as ``ad_id``.
    :return: DataFrame with columns ``ad_id``, ``destination`` and
        ``maps_response`` (the response body re-serialised as a JSON string).
    """
    rows = [
        [ad_id, dest, json.dumps(get_directions(addr, dest, 'transit', maps_key).json())]
        for dest in destinations
        for ad_id, addr in address_table.address.items()
    ]
    return pd.DataFrame(rows, columns=['ad_id', 'destination', 'maps_response'])

In [163]:
def get_origin_coordinates(maps_responses):
    """Extract each ad's origin coordinates from stored Directions responses.

    Only rows for the first destination in ``maps_responses`` are read. The
    coordinates come from the ``start_location`` of the first leg of the
    first route in each response.

    :param maps_responses: DataFrame with columns ``ad_id``, ``destination``
        and ``maps_response`` (a JSON string).
    :return: DataFrame with columns ``ad_id``, ``lat`` and ``lng``; ``lat`` and
        ``lng`` are null when the response has no route, no leg, or lacks the
        expected keys. An empty input yields an empty frame with these columns.
    """
    cols = ['ad_id', 'lat', 'lng']
    data = []
    all_dest = maps_responses.destination.unique()
    # Nothing to extract from an empty frame; indexing all_dest[0] would raise.
    if len(all_dest) == 0:
        return pd.DataFrame(data, columns=cols)
    sub_df = maps_responses[maps_responses.destination == all_dest[0]].set_index('ad_id', drop=True)
    for ad_id, response in sub_df.maps_response.items():
        response_dict = json.loads(response)
        try:
            coords = response_dict['routes'][0]['legs'][0]['start_location']
            lat, lng = coords['lat'], coords['lng']
        except (IndexError, KeyError):
            # No route/leg, or a payload without the expected keys:
            # record the ad with missing coordinates instead of failing.
            lat = lng = None
        data.append([ad_id, lat, lng])
    return pd.DataFrame(data, columns=cols)

In [84]:
def get_destination_coordinates():
    """Placeholder with no implementation yet; calling it does nothing and returns ``None``."""
    return None

In [40]:
# The single destination that directions are requested to from every address.
common_destination = 'Nationaltheatret, Oslo'

In [44]:
# One Directions API request per (destination, address) pair; with a single destination
# that is one request per row of `addresses` (949 in the recorded run).
all_api_results = get_directions_jsons([common_destination], addresses)

In [80]:
# Preview the stored raw responses (JSON strings).
all_api_results.head()

Unnamed: 0,ad_id,destination,maps_response
0,85287718,"Nationaltheatret, Oslo","{""geocoded_waypoints"": [{""geocoder_status"": ""O..."
1,90151696,"Nationaltheatret, Oslo","{""geocoded_waypoints"": [{""geocoder_status"": ""O..."
2,91411206,"Nationaltheatret, Oslo","{""geocoded_waypoints"": [{""geocoder_status"": ""O..."
3,93367028,"Nationaltheatret, Oslo","{""geocoded_waypoints"": [{""geocoder_status"": ""O..."
4,95247107,"Nationaltheatret, Oslo","{""geocoded_waypoints"": [{""geocoder_status"": ""O..."


In [81]:
# Append the raw responses to the project/dataset named in config.yaml (the
# same settings the travel_times existence check uses) rather than to
# hard-coded identifiers.
all_api_results.to_gbq(destination_table=f'{config["dataset"]}.google_maps_responses',
                       project_id=config['project_id'],
                       if_exists='append')

In [69]:
# Reduce each stored response to its shortest total route duration.
travel_times = get_travel_times_table(all_api_results)

In [70]:
# Preview the computed travel times.
travel_times.head()

Unnamed: 0,ad_id,destination,time_s
0,85287718,"Nationaltheatret, Oslo",1262.0
1,90151696,"Nationaltheatret, Oslo",2330.0
2,91411206,"Nationaltheatret, Oslo",811.0
3,93367028,"Nationaltheatret, Oslo",2075.0
4,95247107,"Nationaltheatret, Oslo",811.0


In [77]:
# time_s is null for rows whose response contained no routes
# (939 of 949 rows are non-null in the recorded output).
travel_times.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 949 entries, 0 to 948
Data columns (total 3 columns):
ad_id          949 non-null int64
destination    949 non-null object
time_s         939 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 22.3+ KB


In [82]:
# Append to the travel_times table in the project/dataset named in config.yaml.
# The existence check that picks the query looks the table up there, so writing
# to hard-coded identifiers could leave that check looking at a different table.
travel_times.to_gbq(destination_table=f'{config["dataset"]}.travel_times',
                    project_id=config['project_id'],
                    if_exists='append')

In [166]:
# Origin coordinates taken from the first leg of the first route of each response.
ad_coordinates = get_origin_coordinates(all_api_results)

In [167]:
# lat/lng are null where the response had no route
# (939 of 949 rows are non-null in the recorded output).
ad_coordinates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 949 entries, 0 to 948
Data columns (total 3 columns):
ad_id    949 non-null int64
lat      939 non-null float64
lng      939 non-null float64
dtypes: float64(2), int64(1)
memory usage: 22.3 KB


In [168]:
# Append the coordinates to the project/dataset named in config.yaml (the same
# settings the travel_times existence check uses) rather than to hard-coded
# identifiers.
ad_coordinates.to_gbq(destination_table=f'{config["dataset"]}.ad_coordinates',
                      project_id=config['project_id'],
                      if_exists='append')