In [1]:
import os
import requests
import json
import yaml
import pandas as pd
from google.cloud import bigquery
import google.api_core.exceptions as google_exceptions

In [2]:
# Load the notebook settings from config.yaml. The context manager closes the
# file handle as soon as parsing finishes instead of leaving it open.
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

In [3]:
# Google Maps API key taken from the loaded config; it is later passed as the
# `key` argument of get_directions.
maps_key = config['google_maps_api_key']

In [4]:
# Directory, relative to the working directory, whose files are read into `queries`.
query_dir = 'queries'

In [5]:
# Read every query file in query_dir into memory, keyed by its file name.
queries = {}
for query_file in os.listdir(query_dir):
    query_path = os.path.join(query_dir, query_file)
    # Skip sub-directories (for example a Jupyter `.ipynb_checkpoints` folder):
    # open() would raise on them and abort the whole cell.
    if not os.path.isfile(query_path):
        continue
    with open(query_path, 'r') as query:
        queries[query_file] = query.read()

In [6]:
# BigQuery client, constructed with no explicit arguments; used for the
# table-existence check and for running the selected query.
query_client = bigquery.Client()

In [7]:
def exists_table(table_reference, client):
    """Report whether `client` can fetch the table named by `table_reference`.

    Returns True when `client.get_table(table_reference)` succeeds and False
    when it raises `google_exceptions.NotFound`. Any other exception propagates
    to the caller.
    """
    try:
        client.get_table(table_reference)
    except google_exceptions.NotFound:
        return False
    return True

In [8]:
# Choose which stored query to run, depending on whether the travel_times
# table already exists in the configured project and dataset.
travel_times_table = f'{config["project_id"]}.{config["dataset"]}.travel_times'
query_name = (
    'all_buildings_no_google_maps_response.sql'
    if exists_table(travel_times_table, query_client)
    else 'all_buildings.sql'
)
request = queries[query_name]

In [9]:
# Run the selected query and load its result into a pandas DataFrame.
real_estate_raw = query_client.query(request).to_dataframe()

In [10]:
# Summarise columns, dtypes and non-null counts of the raw query result.
real_estate_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245 entries, 0 to 244
Data columns (total 4 columns):
ad_id             245 non-null int64
new_building      245 non-null bool
property_attrs    245 non-null object
address           240 non-null object
dtypes: bool(1), int64(1), object(2)
memory usage: 6.1+ KB


In [11]:
# Index the raw listing rows by ad_id (the column is moved into the index).
real_estate_raw = real_estate_raw.set_index('ad_id')

In [12]:
# Keep only the columns that remain after removing the building attributes,
# then drop repeated rows.
# NOTE(review): drop_duplicates compares column values only, not the ad_id
# index, so ads that share an address keep just the first ad_id - confirm
# this is intended.
addresses = (
    real_estate_raw
    .drop(columns=['new_building', 'property_attrs'])
    .drop_duplicates()
    .copy()
)

In [13]:
# Summarise the de-duplicated address table (row count and null addresses).
addresses.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 225 entries, 84585697 to 151748367
Data columns (total 1 columns):
address    224 non-null object
dtypes: object(1)
memory usage: 3.5+ KB


In [14]:
# Preview the first rows of the address table.
addresses.head()

Unnamed: 0_level_0,address
ad_id,Unnamed: 1_level_1
84585697,"Blomsterbakken 28, 1487 Hakadal"
86180027,"Grensevegen 13, 1929 Auli"
87708616,"Furumo - Furumogrenda, 1400 Ski"
88980906,"Krydderhagen - Hus D1-D2, 0579 Oslo"
88990917,"Ensjøhøyden, Stålverksveien 1, 0661 Oslo"


In [15]:
def get_directions(origin, destination, mode, key, timeout=30):
    """Request directions from the Google Maps Directions JSON endpoint.

    Parameters
    ----------
    origin, destination : str
        Passed as the `origin` and `destination` query parameters.
    mode : str
        Passed as the `mode` query parameter (callers in this notebook use 'transit').
    key : str
        API key, passed as the `key` query parameter.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up. Without a timeout
        `requests.get` can block indefinitely on a stalled connection.

    Returns
    -------
    requests.Response
        The raw HTTP response. The HTTP status is not checked here.

    Raises
    ------
    requests.exceptions.Timeout
        When the server does not answer within `timeout`.
    """
    google_maps_url = 'https://maps.googleapis.com/maps/api/directions/json'
    params = {'destination': destination, 'origin': origin, 'key': key, 'mode': mode}
    response = requests.get(google_maps_url, params=params, timeout=timeout)
    return response

In [16]:
def get_min_travel_time_sec(response):
    """Return the smallest total route duration found in a directions response.

    `response` is the decoded response dict. Each entry of `response['routes']`
    contributes the sum of `leg['duration']['value']` over its legs, and the
    minimum of those sums is returned. Returns None when there are no routes.
    """
    route_totals = [
        sum(leg['duration']['value'] for leg in route['legs'])
        for route in response['routes']
    ]
    return min(route_totals, default=None)

In [17]:
def get_travel_times_table(maps_responses):
    """Build an (ad_id, destination, time_s) table from stored directions responses.

    `maps_responses` must provide the columns `ad_id`, `destination` and
    `maps_response` (a JSON string). Rows are emitted destination by
    destination, in order of first appearance. `time_s` is whatever
    get_min_travel_time_sec returns for the decoded response.
    """
    rows = []
    for destination in maps_responses.destination.unique():
        matching = maps_responses[maps_responses.destination == destination]
        responses_by_ad = matching.set_index('ad_id', drop=True).maps_response
        rows.extend(
            [ad_id, destination, get_min_travel_time_sec(json.loads(raw))]
            for ad_id, raw in responses_by_ad.items()
        )
    return pd.DataFrame(rows, columns=['ad_id', 'destination', 'time_s'])

In [18]:
def get_directions_jsons(destinations, address_table, mode='transit', key=None):
    """Fetch a directions response for every (destination, address) pair.

    Parameters
    ----------
    destinations : iterable of str
        Destinations to route to.
    address_table : pandas.DataFrame
        Frame indexed by ad id with an `address` column used as the origin.
    mode : str, optional
        Travel mode forwarded to get_directions. Defaults to 'transit', the
        value that was previously hard-coded.
    key : str, optional
        API key forwarded to get_directions. When omitted, the module-level
        `maps_key` is used, as before.

    Returns
    -------
    pandas.DataFrame
        Columns `ad_id`, `destination` and `maps_response`, where
        `maps_response` is the response body re-serialised as a JSON string.
        One HTTP request is made per row.
    """
    if key is None:
        # Fall back to the notebook-level key so existing calls keep working.
        key = maps_key
    cols = ['ad_id', 'destination', 'maps_response']
    data = []
    for dest in destinations:
        for ad_id, addr in address_table.address.items():
            response = get_directions(addr, dest, mode, key)
            data.append([ad_id, dest, json.dumps(response.json())])
    return pd.DataFrame(data, columns=cols)

In [19]:
def get_origin_coordinates(maps_responses):
    """Extract the start-location coordinates of each ad from stored responses.

    Only the rows for the first destination in `maps_responses` are used.

    Parameters
    ----------
    maps_responses : pandas.DataFrame
        Frame with the columns `ad_id`, `destination` and `maps_response`
        (a JSON string).

    Returns
    -------
    pandas.DataFrame
        Columns `ad_id`, `lat` and `lng`. `lat` and `lng` are missing when the
        response has no route, no leg, or no `start_location` entry. An empty
        input yields an empty frame with the same columns.
    """
    cols = ['ad_id', 'lat', 'lng']
    all_dest = maps_responses.destination.unique()
    if len(all_dest) == 0:
        # Nothing to extract; avoid indexing into an empty destination list.
        return pd.DataFrame([], columns=cols)
    sub_df = maps_responses[maps_responses.destination == all_dest[0]].set_index('ad_id', drop=True)
    data = []
    for ad_id, response in sub_df.maps_response.items():
        response_dict = json.loads(response)
        try:
            coords = response_dict['routes'][0]['legs'][0]['start_location']
            lat, lng = coords['lat'], coords['lng']
        except (IndexError, KeyError):
            # Empty route/leg lists raise IndexError; payloads without the
            # expected keys raise KeyError. Both mean "no coordinates".
            lat = lng = None
        data.append([ad_id, lat, lng])
    return pd.DataFrame(data, columns=cols)

In [20]:
def get_destination_coordinates():
    """Placeholder with no implementation yet; takes no arguments and returns None."""
    return None

In [21]:
# The single destination that every address is routed to in this notebook.
common_destination = 'Nationaltheatret, Oslo'

In [22]:
# Fetch directions from every address to the common destination.
# This issues one HTTP request per address, so the cell is slow to re-run.
all_api_results = get_directions_jsons([common_destination], addresses)

In [23]:
# Preview the first stored responses (JSON strings).
all_api_results.head()

Unnamed: 0,ad_id,destination,maps_response
0,84585697,"Nationaltheatret, Oslo","{""geocoded_waypoints"": [{""geocoder_status"": ""O..."
1,86180027,"Nationaltheatret, Oslo","{""geocoded_waypoints"": [{""geocoder_status"": ""O..."
2,87708616,"Nationaltheatret, Oslo","{""geocoded_waypoints"": [{""geocoder_status"": ""Z..."
3,88980906,"Nationaltheatret, Oslo","{""geocoded_waypoints"": [{""geocoder_status"": ""O..."
4,88990917,"Nationaltheatret, Oslo","{""geocoded_waypoints"": [{""geocoder_status"": ""Z..."


In [24]:
# Append the raw responses to BigQuery. Project and dataset come from the
# config so that writes target the same location as the table-existence
# check, rather than a hard-coded project.
all_api_results.to_gbq(destination_table=f'{config["dataset"]}.google_maps_responses',
                       project_id=config['project_id'],
                       if_exists='append')

In [25]:
# Reduce each stored response to its shortest total route duration.
travel_times = get_travel_times_table(all_api_results)

In [26]:
# Preview the travel times; a missing time_s means the response had no routes.
travel_times.head()

Unnamed: 0,ad_id,destination,time_s
0,84585697,"Nationaltheatret, Oslo",4506.0
1,86180027,"Nationaltheatret, Oslo",8294.0
2,87708616,"Nationaltheatret, Oslo",
3,88980906,"Nationaltheatret, Oslo",1375.0
4,88990917,"Nationaltheatret, Oslo",


In [27]:
# Summarise the travel-time table, including how many rows lack a time_s value.
travel_times.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225 entries, 0 to 224
Data columns (total 3 columns):
ad_id          225 non-null int64
destination    225 non-null object
time_s         220 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 5.4+ KB


In [28]:
# Append the travel times to BigQuery. The table-existence check earlier in
# the notebook looks for travel_times in the configured project and dataset,
# so the write uses the same config values instead of hard-coded ones.
travel_times.to_gbq(destination_table=f'{config["dataset"]}.travel_times',
                    project_id=config['project_id'],
                    if_exists='append')

In [29]:
# Extract each ad's start-location latitude and longitude from the stored responses.
ad_coordinates = get_origin_coordinates(all_api_results)

In [30]:
# Summarise the coordinate table, including how many ads have no coordinates.
ad_coordinates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225 entries, 0 to 224
Data columns (total 3 columns):
ad_id    225 non-null int64
lat      220 non-null float64
lng      220 non-null float64
dtypes: float64(2), int64(1)
memory usage: 5.4 KB


In [31]:
# Append the ad coordinates to BigQuery. Project and dataset come from the
# config so that writes target the same location as the table-existence
# check, rather than a hard-coded project.
ad_coordinates.to_gbq(destination_table=f'{config["dataset"]}.ad_coordinates',
                      project_id=config['project_id'],
                      if_exists='append')