In [11]:
import os
import requests
import json
import yaml
import pandas as pd
from google.cloud import bigquery
import google.api_core.exceptions as google_exceptions

In [12]:
# Load the notebook settings (API key, BigQuery project id and dataset) from
# the local YAML file. The context manager closes the file handle as soon as
# parsing is done instead of leaving it open until garbage collection.
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

In [13]:
# API key for the Google Maps Directions requests, taken from the loaded config.
maps_key = config['google_maps_api_key']

In [14]:
# Directory (relative to the working directory) whose files are read into `queries`.
query_dir = 'queries'

In [15]:
# Read every regular file in query_dir into a dict keyed by its file name.
# Sub-directories (for example Jupyter's .ipynb_checkpoints) are skipped,
# because calling open() on a directory raises IsADirectoryError.
queries = {}
for query_file in os.listdir(query_dir):
    query_path = os.path.join(query_dir, query_file)
    if not os.path.isfile(query_path):
        continue
    with open(query_path, 'r') as query:
        queries[query_file] = query.read()

In [16]:
# BigQuery client created with no explicit arguments; used below for the
# table-existence check and for running the selected query.
query_client = bigquery.Client()

In [17]:
def exists_table(table_reference, client):
    """Tell whether a table can be fetched through the given client.

    Args:
        table_reference: table identifier handed to ``client.get_table``.
        client: object exposing ``get_table`` (a BigQuery client in this notebook).

    Returns:
        True when ``get_table`` succeeds, False when it raises ``NotFound``.
        Any other exception propagates to the caller.
    """
    try:
        client.get_table(table_reference)
    except google_exceptions.NotFound:
        return False
    else:
        return True

In [18]:
# Pick the query text: when the travel_times table already exists use the
# "no google maps response" variant, otherwise the query for all buildings.
request = queries[
    'all_buildings_no_google_maps_response.sql'
    if exists_table(f'{config["project_id"]}.{config["dataset"]}.travel_times', query_client)
    else 'all_buildings.sql'
]

In [19]:
# Run the selected query and load its full result into a pandas DataFrame.
real_estate_raw = query_client.query(request).to_dataframe()

In [20]:
# Show column dtypes and non-null counts of the raw query result.
real_estate_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102 entries, 0 to 101
Data columns (total 4 columns):
ad_id             102 non-null int64
new_building      102 non-null bool
property_attrs    102 non-null object
address           98 non-null object
dtypes: bool(1), int64(1), object(2)
memory usage: 2.6+ KB


In [21]:
# Index the raw frame by ad_id. The guard keeps this cell re-runnable: once
# ad_id has become the index it is no longer a column, so calling set_index
# a second time would raise KeyError.
if real_estate_raw.index.name != 'ad_id':
    real_estate_raw = real_estate_raw.set_index(keys='ad_id', drop=True)

In [22]:
# Keep only what remains after removing the two attribute columns, collapse
# duplicate rows (the index is not part of the comparison) and take an
# independent copy so later changes cannot touch real_estate_raw.
addresses = (
    real_estate_raw
    .drop(columns=['new_building', 'property_attrs'])
    .drop_duplicates()
    .copy()
)

In [23]:
# Show dtypes and non-null counts of the de-duplicated address table.
addresses.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72 entries, 77962792 to 154024545
Data columns (total 1 columns):
address    71 non-null object
dtypes: object(1)
memory usage: 1.1+ KB


In [24]:
# Preview the first rows of the de-duplicated address table.
addresses.head()

Unnamed: 0_level_0,address
ad_id,Unnamed: 1_level_1
77962792,"Gamle Lommedalsvei 123, 1348 Rykkinn"
96760455,"Røa Torg - Vækerøveien 195, 0751 Oslo"
106540822,"Elveveien 26, Easy Living Fjellhamar, 1472 Fje..."
106544185,Vollebekk - Blomstertrappa Brl. Hus F - Brobek...
112196195,"Bjarne Haugens gate, 1475 Finstadjordet"


In [25]:
def get_directions(origin, destination, mode, key, timeout=30):
    """Send one request to the Google Maps Directions JSON endpoint.

    Args:
        origin: start point of the trip, sent as the ``origin`` query parameter.
        destination: end point of the trip, sent as ``destination``.
        mode: travel mode, sent as ``mode`` (this notebook uses ``'transit'``).
        key: API key, sent as ``key``.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled
            connection, which would hang the whole notebook run.

    Returns:
        The ``requests.Response`` object as received; the HTTP status is not
        checked here, so callers decide how to treat error responses.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    google_maps_url = 'https://maps.googleapis.com/maps/api/directions/json'
    params = {'destination': destination, 'origin': origin, 'key': key, 'mode': mode}
    return requests.get(google_maps_url, params=params, timeout=timeout)

In [26]:
def get_min_travel_time_sec(response):
    """Return the smallest total duration among all routes of a response.

    Args:
        response: parsed directions response; a dict with a ``'routes'`` list
            where every route has ``'legs'`` and every leg carries
            ``['duration']['value']``.

    Returns:
        The minimum over routes of the summed leg duration values, or None
        when the response contains no routes.
    """
    totals = [
        sum(leg['duration']['value'] for leg in route['legs'])
        for route in response['routes']
    ]
    return min(totals, default=None)

In [27]:
def get_travel_times_table(maps_responses):
    """Build a table of minimum travel times from stored API responses.

    Args:
        maps_responses: DataFrame with columns ``ad_id``, ``destination`` and
            ``maps_response`` (the response as a JSON string).

    Returns:
        DataFrame with columns ``ad_id``, ``destination`` and ``time_s``.
        Rows are grouped by destination in order of first appearance, and
        ``time_s`` is whatever ``get_min_travel_time_sec`` returns for the
        parsed response.
    """
    records = []
    for dest in maps_responses.destination.unique():
        matching = maps_responses.loc[maps_responses.destination == dest]
        for ad_id, raw_json in zip(matching.ad_id, matching.maps_response):
            shortest = get_min_travel_time_sec(json.loads(raw_json))
            records.append([ad_id, dest, shortest])
    return pd.DataFrame(records, columns=['ad_id', 'destination', 'time_s'])

In [28]:
def get_directions_jsons(destinations, address_table):
    """Fetch transit directions for every (destination, address) pair.

    Args:
        destinations: iterable of destination strings.
        address_table: DataFrame whose index holds the ad ids and whose
            ``address`` column holds the origin of each request.

    Returns:
        DataFrame with columns ``ad_id``, ``destination`` and
        ``maps_response``; the last one is the response body re-serialised
        as a JSON string. One request is made per pair, using the
        module-level ``maps_key``.
    """
    rows = [
        [ad_id, dest, json.dumps(get_directions(addr, dest, 'transit', maps_key).json())]
        for dest in destinations
        for ad_id, addr in address_table.address.items()
    ]
    return pd.DataFrame(rows, columns=['ad_id', 'destination', 'maps_response'])

In [29]:
def get_origin_coordinates(maps_responses):
    """Extract the origin coordinates of each ad from stored API responses.

    Only the rows belonging to the first destination found in
    ``maps_responses`` are used, so each ad contributes one row as long as
    it appears once per destination.

    Args:
        maps_responses: DataFrame with columns ``ad_id``, ``destination`` and
            ``maps_response`` (the response as a JSON string).

    Returns:
        DataFrame with columns ``ad_id``, ``lat`` and ``lng``. The coordinates
        come from the ``start_location`` of the first leg of the first route;
        they are missing when the response has no routes or no legs. An empty
        input yields an empty frame with the same columns.
    """
    cols = ['ad_id', 'lat', 'lng']
    all_dest = maps_responses.destination.unique()
    # Nothing to look up: indexing all_dest[0] below would raise IndexError.
    if len(all_dest) == 0:
        return pd.DataFrame(columns=cols)
    sub_df = maps_responses[maps_responses.destination == all_dest[0]].set_index('ad_id', drop=True)
    data = []
    for ad_id, response in sub_df.maps_response.items():
        response_dict = json.loads(response)
        try:
            coords = response_dict['routes'][0]['legs'][0]['start_location']
            lat = coords['lat']
            lng = coords['lng']
        except IndexError:
            # Empty 'routes' or 'legs' list: no coordinates for this ad.
            lat = None
            lng = None
        data.append([ad_id, lat, lng])
    return pd.DataFrame(data, columns=cols)

In [30]:
def get_destination_coordinates():
    """Placeholder with no implementation yet; always returns None."""
    return None

In [31]:
# The single destination passed to get_directions_jsons for every address.
common_destination = 'Nationaltheatret, Oslo'

In [32]:
# Request directions from every address to the common destination
# (one HTTP request per address) and keep the responses as JSON strings.
all_api_results = get_directions_jsons([common_destination], addresses)

In [33]:
# Preview the first stored API responses.
all_api_results.head()

Unnamed: 0,ad_id,destination,maps_response
0,77962792,"Nationaltheatret, Oslo","{""geocoded_waypoints"": [{""geocoder_status"": ""O..."
1,96760455,"Nationaltheatret, Oslo","{""geocoded_waypoints"": [{""geocoder_status"": ""O..."
2,106540822,"Nationaltheatret, Oslo","{""geocoded_waypoints"": [{""geocoder_status"": ""O..."
3,106544185,"Nationaltheatret, Oslo","{""geocoded_waypoints"": [{""geocoder_status"": ""O..."
4,112196195,"Nationaltheatret, Oslo","{""geocoded_waypoints"": [{""geocoder_status"": ""O..."


In [34]:
# Append the raw API responses to the google_maps_responses table. Project
# and dataset are read from config, the same source the notebook's
# table-existence check uses, so reads and writes cannot drift apart.
all_api_results.to_gbq(destination_table=f'{config["dataset"]}.google_maps_responses',
                       project_id=config['project_id'],
                       if_exists='append')

In [35]:
# Reduce each stored API response to its shortest total route duration
# (column `time_s`).
travel_times = get_travel_times_table(all_api_results)

In [36]:
# Preview the first computed travel times.
travel_times.head()

Unnamed: 0,ad_id,destination,time_s
0,77962792,"Nationaltheatret, Oslo",2441.0
1,96760455,"Nationaltheatret, Oslo",1056.0
2,106540822,"Nationaltheatret, Oslo",2326.0
3,106544185,"Nationaltheatret, Oslo",1522.0
4,112196195,"Nationaltheatret, Oslo",2916.0


In [37]:
# Show dtypes and non-null counts; a missing time_s means no route was found.
travel_times.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Data columns (total 3 columns):
ad_id          72 non-null int64
destination    72 non-null object
time_s         71 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 1.8+ KB


In [38]:
# Append the travel times to the travel_times table. Project and dataset are
# read from config so this write targets exactly the table whose existence
# is checked (via the same config values) when the query is selected.
travel_times.to_gbq(destination_table=f'{config["dataset"]}.travel_times',
                    project_id=config['project_id'],
                    if_exists='append')

In [39]:
# Extract, for each ad, the start-location latitude/longitude of the first
# route in its stored API response.
ad_coordinates = get_origin_coordinates(all_api_results)

In [40]:
# Show dtypes and non-null counts of the extracted coordinates.
ad_coordinates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Data columns (total 3 columns):
ad_id    72 non-null int64
lat      71 non-null float64
lng      71 non-null float64
dtypes: float64(2), int64(1)
memory usage: 1.8 KB


In [41]:
# Append the ad coordinates to the ad_coordinates table. Project and dataset
# are read from config, the same source the notebook's table-existence check
# uses, instead of being hard-coded here.
ad_coordinates.to_gbq(destination_table=f'{config["dataset"]}.ad_coordinates',
                      project_id=config['project_id'],
                      if_exists='append')