# OverPass API Comparison

## Used by both

In [1]:
import overpass
import geopandas as gpd
import lib.geoscraping as geo

from lib.osmtools import overpass_bounds
import lib.osmtools as ost

# Query bounding box (the "Tuscaloosa-specific" step below refers to this area).
bounds = {
    'west': -87.57477,
    'south': 33.16558,
    'east': -87.48422,
    'north': 33.23129,
}

# Fetch every way inside the bounds; the query also recurses down to the
# member nodes, which come back as non-LineString features.
op = overpass.API()
response = op.get(
    "way" + overpass_bounds(bounds) + ";(._;>;);",
    verbosity="geom",
)

# Remove nodes before conversion, this is ~25x faster than removing from GDF
way_features = [f for f in response.features if f.geometry['type'] == "LineString"]
gdf = gpd.GeoDataFrame.from_features(way_features)
# Tuscaloosa-specific
gdf.drop([k for k in gdf.columns if 'tiger' in k], axis=1, inplace=True)

# Split the ways into buildings, roads and everything else.
buildings = gdf[gdf.building.notnull()]
buildings = ost.drop_empty_cols(buildings)
buildings.reset_index(inplace=True)

roads = gdf[gdf.highway.notnull()]
roads = ost.drop_empty_cols(roads)
roads.reset_index(inplace=True)

other = gdf[gdf.highway.isnull() & gdf.building.isnull()]
other = ost.drop_empty_cols(other)
other.reset_index(inplace=True)

# Fixed label typo: "Buidlings" -> "Buildings".
ost.gdf_overview(gdf, "Original")
ost.gdf_overview(buildings, "Buildings")
ost.gdf_overview(roads, "Roads")
ost.gdf_overview(other, "Other")

Original:
---------
- Rows: 26604
- Keys: 229

Buidlings:
----------
- Rows: 17201
- Keys: 110

Roads:
------
- Rows: 8150
- Keys: 76

Other:
------
- Rows: 1253
- Keys: 112



In [None]:
# Get raster code
import os, time

import ee
import pydrive

from pprint import pprint


## Testing Road, Building, and Misc OSM Parsing

Assuming only that a query to OSM has been made.

### Get Raw GDF

In [1]:
import os, time
import overpass
import geopandas as gpd
import lib.geoscraping as geo

import lib.misc as m
import lib.osmtools as ost

# Output location for every file written by this cell.
city_name = 'tuscaloosa'
out_folder = f'./data/sets/{city_name}'
# makedirs also creates missing parents (./data/sets) and is a no-op when the
# folder already exists; os.mkdir raised FileNotFoundError on a fresh checkout.
os.makedirs(out_folder, exist_ok=True)


# Bounding box used for both the raster export and the Overpass query.
bounds = {
    'west': -87.57477,
    'south': 33.16558,
    'east': -87.48422,
    'north': 33.23129,
}

from lib.authkit import ee_client, get_drive
import lib.imagetools as it

# The Drive export only works with a user account, so authenticate as a user.
ee_client(user=True, verbose=True)
gdrive = get_drive(settings_fp='./keys/pydrive/settings.yaml')

# Export the NAIP image for the bounds, then download it next to the other
# outputs for this city.
raster_name = it.export_naip_image(
    gdrive=gdrive,
    drive_folder='data/geo_scraping',
    filename=f'{city_name}.tif',
    bounds=bounds,
)
filepath = it.download_raster(
    gdrive=gdrive,
    name=raster_name,
    out_path=f"./data/sets/{city_name}",
    force_equals=False,
    verbose=True,
)

# Query all ways (plus their member nodes) inside the bounds.
op = overpass.API()
query_start = time.perf_counter()
response = op.get(
    "way" + ost.overpass_bounds(bounds) + ";(._;>;);",
    verbosity="geom",
)

convert_start = time.perf_counter()
# Remove nodes before conversion, this is ~25x faster than removing from GDF
way_features = [f for f in response.features if f.geometry['type'] == "LineString"]
gdf = gpd.GeoDataFrame.from_features(way_features)
#ost.gdf_overview(gdf, "Original")


# --- Buildings: perimeters (lines) first, then polygonized footprints ---
build_gdf = gdf[gdf.building.notnull()].reset_index()
build_gdf = ost.parse_osm_gdf(gdf=build_gdf,
                              main_key='building')
# set_crs returns a new frame unless inplace=True; without it the CRS was
# silently dropped and the shapefiles were written without one.
build_gdf.set_crs(epsg=4326, inplace=True)
perimeter_fp = os.path.join(out_folder, 'building_perimeters.shp')
build_gdf.to_file(perimeter_fp)

build_gdf['geometry'] = ost.gdf_polygonize(build_gdf)
footprint_fp = os.path.join(out_folder, 'building_footprints.shp')
build_gdf.to_file(footprint_fp)

# --- Roads ---
road_gdf = gdf[gdf.highway.notnull()].reset_index()
road_gdf = ost.parse_osm_gdf(gdf=road_gdf,
                             main_key='highway')
road_gdf.set_crs(epsg=4326, inplace=True)
raw_road_fp = os.path.join(out_folder, 'raw_roads.shp')
road_gdf.to_file(raw_road_fp)

# --- Everything that is neither a road nor a building ---
other_gdf = gdf[gdf.highway.isnull() & gdf.building.isnull()]
other_gdf = ost.drop_empty_cols(other_gdf)
other_gdf.reset_index(inplace=True)
other_gdf.set_crs(epsg=4326, inplace=True)
other_fp = os.path.join(out_folder, 'other.shp')
# Write the frame built above; the original wrote `other`, a leftover
# variable from a different cell (or a NameError in a fresh kernel).
other_gdf.to_file(other_fp)

Original:
---------
- Rows: 26604
- Keys: 271

time data:
----------
- Querying: 06s 
- Cleaning: 0.864s


In [3]:
import os, time
import overpass
import geopandas as gpd
import lib.geoscraping as geo

import lib.misc as m
import lib.osmtools as ost

# Output location for the Austin data set.
out_folder = './data/sets/austin'
# makedirs creates missing parent folders and tolerates an existing target;
# os.mkdir raised FileNotFoundError when ./data/sets was absent.
os.makedirs(out_folder, exist_ok=True)
# Bounding box passed to the Overpass query.
bounds = {
    'west': -97.7699,
    'south': 30.2237,
    'east': -97.7212,
    'north': 30.3040,
}

# Query all ways (plus their member nodes) inside the bounds.
op = overpass.API()
query_start = time.perf_counter()
response = op.get(
    "way" + ost.overpass_bounds(bounds) + ";(._;>;);",
    verbosity="geom",
)

convert_start = time.perf_counter()
# Remove nodes before conversion, this is ~25x faster than removing from GDF
way_features = [f for f in response.features if f.geometry['type'] == "LineString"]
gdf = gpd.GeoDataFrame.from_features(way_features)
ost.gdf_overview(gdf, "Original")


# --- Buildings: perimeters (lines) first, then polygonized footprints ---
build_gdf = gdf[gdf.building.notnull()].reset_index()
build_gdf = ost.parse_osm_gdf(gdf=build_gdf,
                              main_key='building')
# set_crs returns a new frame unless inplace=True; without it the CRS was
# never attached to build_gdf.
build_gdf.set_crs(epsg=4326, inplace=True)
perimeter_fp = os.path.join(out_folder, 'building_perimeters.shp')
build_gdf.to_file(perimeter_fp)

build_gdf['geometry'] = ost.gdf_polygonize(build_gdf)
footprint_fp = os.path.join(out_folder, 'building_footprints.shp')
build_gdf.to_file(footprint_fp)
ost.gdf_overview(build_gdf, "Buildings")

# --- Roads ---
road_gdf = gdf[gdf.highway.notnull()].reset_index()
road_gdf = ost.parse_osm_gdf(gdf=road_gdf,
                             main_key='highway')
road_gdf.set_crs(epsg=4326, inplace=True)
raw_road_fp = os.path.join(out_folder, 'raw_roads.shp')
road_gdf.to_file(raw_road_fp)
# Report the road frame (the original passed build_gdf here by mistake).
ost.gdf_overview(road_gdf, "Roads")

# --- Everything that is neither a road nor a building ---
other_gdf = gdf[gdf.highway.isnull() & gdf.building.isnull()]
other_gdf = ost.drop_empty_cols(other_gdf)
other_gdf.reset_index(inplace=True)
other_gdf.set_crs(epsg=4326, inplace=True)
other_fp = os.path.join(out_folder, 'other.shp')
# Write and report other_gdf; the original used the stale `other` variable
# for the file and build_gdf for the overview.
other_gdf.to_file(other_fp)
ost.gdf_overview(other_gdf, "Misc")

In [None]:
import os, time
import overpass
import geopandas as gpd
import lib.geoscraping as geo

import lib.misc as m
import lib.osmtools as ost

# Output location for the New York data set.
out_folder = './data/sets/new_york'
# makedirs creates missing parent folders and tolerates an existing target;
# os.mkdir raised FileNotFoundError when ./data/sets was absent.
os.makedirs(out_folder, exist_ok=True)
# Bounding box passed to the Overpass query.
# NOTE(review): the extra 'size' entry is not used anywhere in this cell;
# confirm that ost.overpass_bounds ignores it.
bounds = {
    'size': "NA",
    'west': -74.4034,
    'south': 40.3712,
    'east': -73.5918,
    'north': 40.9359,
}

# Query all ways (plus their member nodes) inside the bounds.
op = overpass.API()
query_start = time.perf_counter()
response = op.get(
    "way" + ost.overpass_bounds(bounds) + ";(._;>;);",
    verbosity="geom",
)

convert_start = time.perf_counter()
# Remove nodes before conversion, this is ~25x faster than removing from GDF
way_features = [f for f in response.features if f.geometry['type'] == "LineString"]
gdf = gpd.GeoDataFrame.from_features(way_features)
ost.gdf_overview(gdf, "Original")


# --- Buildings: perimeters (lines) first, then polygonized footprints ---
build_gdf = gdf[gdf.building.notnull()].reset_index()
build_gdf = ost.parse_osm_gdf(gdf=build_gdf,
                              main_key='building')
# set_crs returns a new frame unless inplace=True; without it the CRS was
# never attached to build_gdf.
build_gdf.set_crs(epsg=4326, inplace=True)
perimeter_fp = os.path.join(out_folder, 'building_perimeters.shp')
build_gdf.to_file(perimeter_fp)

build_gdf['geometry'] = ost.gdf_polygonize(build_gdf)
footprint_fp = os.path.join(out_folder, 'building_footprints.shp')
build_gdf.to_file(footprint_fp)
ost.gdf_overview(build_gdf, "Buildings")

# --- Roads ---
road_gdf = gdf[gdf.highway.notnull()].reset_index()
road_gdf = ost.parse_osm_gdf(gdf=road_gdf,
                             main_key='highway')
road_gdf.set_crs(epsg=4326, inplace=True)
raw_road_fp = os.path.join(out_folder, 'raw_roads.shp')
road_gdf.to_file(raw_road_fp)
# Report the road frame (the original passed build_gdf here by mistake).
ost.gdf_overview(road_gdf, "Roads")

# --- Everything that is neither a road nor a building ---
other_gdf = gdf[gdf.highway.isnull() & gdf.building.isnull()]
other_gdf = ost.drop_empty_cols(other_gdf)
other_gdf.reset_index(inplace=True)
other_gdf.set_crs(epsg=4326, inplace=True)
other_fp = os.path.join(out_folder, 'other.shp')
# Write and report other_gdf; the original used the stale `other` variable
# for the file and build_gdf for the overview.
other_gdf.to_file(other_fp)
ost.gdf_overview(other_gdf, "Misc")

### Parse Roads

In [4]:
# Parse the road ways into the reduced OSM schema and time each step.
parse_road_start = time.perf_counter()
road_gdf = gdf[gdf.highway.notnull()].reset_index()
road_gdf = ost.parse_osm_gdf(gdf=road_gdf,
                             main_key='highway')
# set_crs returns a new frame unless inplace=True; the original discarded the
# result, so the shapefile was written without a CRS.
road_gdf.set_crs(epsg=4326, inplace=True)

save_road_start = time.perf_counter()
raw_road_fp = os.path.join(out_folder, 'raw_roads.shp')
road_gdf.to_file(raw_road_fp)


# Summary of the parsed frame plus timing for the parse and save phases.
print(f"Parsed Roads ({type(road_gdf)}):")
print(f"- rows: {len(road_gdf.geometry)}")
print(f"- cols: {len(road_gdf.columns)}")
print(f"- keys: {road_gdf.columns.to_list()}")
print(f"- Label Info:")
print(f"  - Filled categories: {(road_gdf['category'].notnull().sum())}")
print(f"  - Filled names: {(road_gdf['name'].notnull().sum())}")
print("- Time Data:")
print(f"  - parse: {m.fmt_time((parse_road_start, save_road_start))}")
print(f"  - save: {m.fmt_time((save_road_start, time.perf_counter()))}")

print(f"\nSaved raw roads to '{raw_road_fp}'")

Parsed Roads (<class 'geopandas.geodataframe.GeoDataFrame'>):
- rows: 8150
- cols: 5
- keys: ['index', 'geometry', 'category', 'label', 'name']
- Label Info:
  - Filled categories: 2036
  - Filled names: 1561
- Time Data:
  - parse: 01s 
  - save: 0.392s

Saved raw roads to './data/tests/tuscaloosa/raw_roads.shp'


### Parse Buildings

In [None]:
# Parse the building ways into the reduced OSM schema and time each step.
parse_build_start = time.perf_counter()
building_gdf = gdf[gdf.building.notnull()].reset_index()

building_gdf = ost.parse_osm_gdf(gdf=building_gdf,
                                 main_key='building')

building_gdf.set_crs(epsg=4326, inplace=True)

save_build_start = time.perf_counter()
raw_build_fp = os.path.join(out_folder, 'raw_buildings.shp')
building_gdf.to_file(raw_build_fp)


# Summary of the parsed frame. The original messages said "Roads" because
# they were copied from the road cell; they now name the building data.
print(f"Parsed Buildings ({type(building_gdf)}):")
print(f"- rows: {len(building_gdf.geometry)}")
print(f"- cols: {len(building_gdf.columns)}")
print(f"- keys: {building_gdf.columns.to_list()}")
print(f"- Label Info:")
print(f"  - Filled categories: {(building_gdf['category'].notnull().sum())}")
print(f"  - Filled names: {(building_gdf['name'].notnull().sum())}")
print("- Time Data:")
print(f"  - parse: {m.fmt_time((parse_build_start, save_build_start))}")
print(f"  - save: {m.fmt_time((save_build_start, time.perf_counter()))}")

print(f"\nSaved raw buildings to '{raw_build_fp}'")

### Parse Remaining

In [None]:
# Remaining geometry: ways tagged as neither highway nor building.
neither_road_nor_building = gdf.highway.isnull() & gdf.building.isnull()
other = gdf[neither_road_nor_building].reset_index()

In [None]:
import pandas as pd
import lib.osmtools as ost

# Keep only roads that carry a name tag.
has_name = roads['name'].notnull()
named_roads = ost.drop_empty_cols(roads[has_name])
named_roads.reset_index(inplace=True)

#all_names = named_roads['name'].to_list()
#ost.gdf_value_columns(named_roads, ['name'])

# Narrow down to a single street to inspect how its segments are tagged.
is_mcfarland = named_roads['name'] == 'McFarland Boulevard East'
mcfarland = ost.drop_empty_cols(named_roads[is_mcfarland])

print(mcfarland.columns)
ost.gdf_value_columns(mcfarland, ['highway', 'surface', 'oneway', 'lanes'])

# Per-column count of non-null entries for the selected street.
print(f"Total cols: {len(mcfarland.columns)}")
print(f"Total rows: {len(mcfarland.geometry)}")
print("- col: filled")
for col in mcfarland.columns:
    filled_count = mcfarland[col].notnull().sum()
    print(f"- '{col}': {filled_count}")

In [None]:
import shapely.geometry as shp
import shapely.ops as sops
import geopandas as gpd
# Geometry-only copy of the McFarland segments.
mc = gpd.GeoDataFrame(geometry=mcfarland.geometry, crs='EPSG:4326')

lines = list(mc.geometry)

merged = sops.linemerge(lines)
# linemerge yields a single LineString when every segment joins up and a
# multi-part geometry otherwise. Multi-part geometries expose their members
# via .geoms; len()/iteration on them directly no longer works in Shapely 2.x.
merged_parts = list(merged.geoms) if hasattr(merged, 'geoms') else [merged]

print('pre-merge: ', len(lines))
print('post-merge:', len(merged_parts))

gpd.GeoDataFrame(geometry=merged_parts, crs='EPSG:4326').to_file('data/tests/tuscaloosa/mcfar_merged.shp')

In [None]:


def road_merge(gdf,):
    """Placeholder for merging the road segments held in ``gdf``.

    The cell declared this function without a body, which made the cell a
    syntax error. Until the merge logic is written, calling it fails loudly
    instead.

    Args:
        gdf: Frame whose road geometries are to be merged.
    Raises:
        NotImplementedError: Always; the merge logic does not exist yet.
    """
    raise NotImplementedError("road_merge has not been implemented yet")

In [None]:
import lib.osmtools as ost
# Roads whose top-level highway label is 'service', minus all-empty columns.
is_service = roads['highway'] == 'service'
service_roads = ost.drop_empty_cols(roads[is_service])

print("total cols", len(service_roads.columns))
print("total rows",len(service_roads.geometry))
print("Empty service values:",service_roads['service'].isnull().sum())

# List which of these names also exist as columns of the service-road frame.
candidate_keys = ['parking_aisle', 'driveway', 'drive-through', 'Busway', 'alley', 'emergency_access']
for k in candidate_keys:
    if k not in service_roads.columns:
        continue
    print('-',k)
print()

ost.gdf_value_columns(service_roads, ['service'])

In [None]:
import pandas as pd
def get_second_labels(gdf, main_key: str) -> pd.Series:
    """
    Collects the second-level label of every row in an OSM GDF.

    For each row, the value found under ``main_key`` is treated as the name
    of another column; when that column exists and is filled for the row,
    its value becomes the row's second-level label.

    Args:
        gdf (GeoDataFrame): GeoDataFrame with labels to parse.
        main_key (str):     Key holding top-level labels.
    Returns:
        pd.Series: One entry per row (in row order, default integer index):
            the second-level label, or None where there is none.
    """
    column_names = gdf.columns
    collected = []
    for _, record in gdf.iterrows():
        is_filled = record.notnull()
        top_label = record[main_key]
        # Only follow the top label when it names a column filled for this row.
        has_second = (top_label in column_names) and is_filled.get(top_label)
        collected.append(record[top_label] if has_second else None)

    return pd.Series(collected)
    
# Second-level labels of the road frame, keyed off the 'highway' column.
second_label_set = get_second_labels(gdf=roads, main_key="highway")

In [None]:
def get_categories(gdf, key):
    """
    Prints, per sub-category of a GDF, how many rows fill it.

    A row's value under ``key`` counts as a sub-category when it is also the
    name of a column. For each such label, the rows carrying it are tallied
    as 'filled' or 'empty' depending on whether that column is null there.

    Args:
        gdf (GeoDataFrame): Frame to inspect.
        key (str):          Column holding the top-level labels.
    """
    column_names = gdf.columns
    tallies = {}

    for _, record in gdf.iterrows():
        label = record[key]
        if label not in column_names:
            continue
        bucket = tallies.setdefault(label, {'filled': 0, 'empty': 0})
        state = 'empty' if record.isnull().get(label) else 'filled'
        bucket[state] += 1

    for label, bucket in tallies.items():
        print(label)
        for state, total in bucket.items():
            print(f"- {state}: {total}")

# Print the filled/empty tally for every highway sub-category of the roads.
get_categories(gdf=roads, key='highway')

In [None]:
import pandas as pd

def swap_avail_col(row):
    """
    Returns the value of the column that a row's 'X' entry points at.

    NOTE(review): the original cell stopped mid-statement (two unfinished
    ``if`` lines), so it could not run. This completion follows the same
    rule as ``get_second_labels`` -- confirm that this was the intent.

    Args:
        row (pd.Series): Row whose 'X' entry may name another entry in the row.
    Returns:
        The value stored under the label named by ``row['X']``, or None when
        'X' is null, names no entry of the row, or names a null entry.
    """
    nulls = row.isnull()
    if nulls.get('X'):
        return None

    target = row['X']
    if target in row.index and not nulls.get(target):
        return row[target]
    return None

# Small toy frame: 'X' holds labels, some of which name the other columns.
toy_columns = {
    'X': ['A', 7, 'C', 'B'],
    'A': [0, 1, 2, 3],
    'B': [0, 2, 4, 6],
    'C': [0, 3, 6, 9],
    'D': [0, 0, 0, 0],
}
my_df = pd.DataFrame(toy_columns)
print("Initial:")
print(my_df, '\n')

# Overwrite column D wholesale with column C.
my_df['D'] = my_df['C']
print("After Col Swap:")
print(my_df, '\n')

# Self-assignment kept from the original cell; it leaves D unchanged.
my_df['D'] = my_df['D']


In [None]:
# Roads that carry a cycleway tag, with all-empty columns removed.
has_cycleway = roads['cycleway'].notnull()
cycleways = ost.drop_empty_cols(roads[has_cycleway])
print(cycleways.columns)
print('total entries:', len(cycleways.geometry))
#print(ost.gdf_null_values(cycleways))
cycleway_keys = ['highway', 'cycleway', 'name', 'lanes', 'bicycle']
ost.gdf_value_columns(cycleways, cycleway_keys)

In [None]:
# Road GDF analysis: show the first ten entries of the null-value report.
null_counts = ost.gdf_null_values(roads)
top_road_nulls = null_counts.head(10)
print(top_road_nulls)
#value_counts = ost.gdf_value_columns(roads, ['highway', 'surface', 'service', 'access'])


In [None]:
# Look up the report row(s) for the 'cycleway' key.
is_cycleway_key = null_counts['key'] == 'cycleway'
print(null_counts[is_cycleway_key])

In [None]:
# Building GDF analysis: show the first ten entries of the null-value report.
null_counts = ost.gdf_null_values(buildings)
top_building_nulls = null_counts.head(10)
print(top_building_nulls)

In [None]:
# Other GDF analysis: show the first ten entries of the null-value report.
null_counts = ost.gdf_null_values(other)
top_other_nulls = null_counts.head(10)
print(top_other_nulls)

In [None]:
# Show how many rows only contain the top label
# 'smallest' starts as None instead of a hard-coded 10: with the old sentinel
# the minimum was reported as 10 (index -1) whenever every row had 10 or more
# filled values.
ninfo = {
    'smallest': None,
    'smallest_idx': -1,
    'largest': 0,
    'largest_idx': -1,
    'has_1': 0,
    'has_2': 0,
    'has_3': 0,
    'has_4': 0,
}
for r_idx, row in other.iterrows():
    # Number of non-null entries in this row.
    filled = row.notnull().sum()

    if ninfo['smallest'] is None or filled < ninfo['smallest']:
        ninfo['smallest'] = filled
        ninfo['smallest_idx'] = r_idx
    if filled > ninfo['largest']:
        ninfo['largest'] = filled
        ninfo['largest_idx'] = r_idx

    # Tally rows with exactly 1-4 filled values; other counts are not tracked.
    bucket = f"has_{filled}"
    if bucket in ninfo:
        ninfo[bucket] += 1

print("\nFilled Values by rows:")
for key, item in ninfo.items():
    print(f"- {key}: {item}")

In [None]:
# Print the index of every road that has a 'service' value although its
# highway label is not 'service', and count those rows.
count = 0
for r_idx, row in roads.iterrows():
    ns = row.notnull()
    if row['highway'] == 'service':
        continue
    if ns.get('service'):
        count += 1
        print(r_idx)

In [None]:
# Service roads that have no value in the 'service' column.
missing_service = service_roads['service'].isnull()
null_service_roads = service_roads[missing_service]

print(len(null_service_roads.geometry))
# Keep every column with 'tiger' in its name out of the null report.
tiger_keys = [k for k in service_roads.columns if 'tiger' in k]
null_counts = ost.gdf_null_values(null_service_roads, blacklist=tiger_keys)
print(null_counts.head(10))

In [None]:
# Strip the road frame down to the handful of columns still needed.
kept_road_keys = ['name', 'highway', 'service', 'geometry', 'index']
road_blacklist_keys = [key for key in roads.columns if key not in kept_road_keys]
roads.drop(columns=road_blacklist_keys, inplace=True)
print(roads.columns)

In [None]:
# Compare the converted frame with the raw response.
print(len(gdf.geometry))
print(type(gdf.geometry))
print(len(response["features"]))

# Tally the raw features by geometry type.
counts = {}
for f in response["features"]:
    type_key = f.geometry['type']
    counts[type_key] = counts.get(type_key, 0) + 1

print(counts)

## Overpass (Raw)

Resources:
- [Github](https://github.com/mvexel/overpass-api-python-wrapper)
- [doc](https://wiki.openstreetmap.org/wiki/Overpass_API)

In [None]:
from time import perf_counter
import overpass
import geopandas as gpd
op = overpass.API()

# Time the three phases separately: API query, feature filter, conversion.
api_start = perf_counter()
overpass_query = "way" + overpass_bounds(bounds) + ";(._;>;);"
response = op.get(overpass_query, verbosity="geom")

filter_start = perf_counter()
way_features = [
    feature
    for feature in response.features
    if feature.geometry['type'] == "LineString"
]

convert_start = perf_counter()
gdf = gpd.GeoDataFrame.from_features(way_features)

query_elapsed = filter_start - api_start
filter_elapsed = convert_start - filter_start
print("Time Report:")
print(f"- api Query: {query_elapsed}")
print(f"- Feature Filter: {filter_elapsed}")
print(f"- Geom Conversion: {(perf_counter() - convert_start)}")

In [None]:
import time
def count_data(data, a):
    """
    Recursively counts the occurrences of ``a`` inside nested lists/dicts.

    Lists are searched element by element and dicts value by value (keys are
    ignored). Any other object counts once when it is an instance of
    ``type(a)`` and compares equal to ``a``.

    Args:
        data: A list, a dict, or a single value to inspect.
        a:    Value to look for.
    Returns:
        int: Number of matching leaf values.
    """
    if isinstance(data, list):
        children = data
    elif isinstance(data, dict):
        children = data.values()
    else:
        # Leaf: the type check keeps e.g. 1.0 from matching the int 1.
        if isinstance(data, type(a)) and data == a:
            return 1
        return 0
    return sum(count_data(child, a) for child in children)

# One sample slot per key for a filled and an empty value; 'x' marks a slot
# that has not been sampled yet.
dtype_info = {key: {'filled': 'x', 'empty': 'x'} for key in report.keys()}
completed = 0

for index, row in gdf.iterrows():
    null_series = row.isnull()
    for key, item in dtype_info.items():
        slot = 'empty' if null_series.get(key) else 'filled'
        if item[slot] == 'x':
            item[slot] = row[key]
    # Stop scanning as soon as no placeholder is left.
    if not count_data(dtype_info, 'x'):
        break

In [None]:
# Print each key with the type and value of its filled/empty samples.
for key, data in dtype_info.items():
    print(key)
    for skey, sdata in data.items():
        type_name = type(sdata).__name__
        print(f"- {skey} ({type_name}): {sdata}")

In [None]:
# Value counts for a few selected keys; filled in by the loop below.
instance_report = {
    'building': None,
    'highway': None,
    'oneway': None,
    'service': None,
}
for key in instance_report:
    values = gdf.value_counts(key)
    # Store the counts so the report is not left as None; the original never
    # assigned them, so the final print always showed None.
    instance_report[key] = values
    print()
    print(f"{key} ({values.count()})")
    # Series.iteritems() was removed in pandas 2.0; items() is the replacement.
    for value, count in values.items():
        print(f"  - '{value}': {count}")

print(instance_report['building'])

In [None]:
# Show the stored report entry for the 'highway' key.
highway_report = report['highway']
print(highway_report)

In [None]:
# Report fill levels for a few keys of interest in the raw frame.
to_check = ["key", "highway", "amenity", "value", "area"]
for comp in to_check:
    if comp in gdf.columns:
        nulls = gdf[comp].isnull().sum()
        print(f"- found {comp} in raw keys")
        print(f"  - Filled: {(raw_total-nulls)}")
        print(f"  - Empty:  {nulls}")
        print(f"  - Total:  {raw_total}")

print("Raw:")
print(f"- Keys:  {len(raw_keys)}")
print(f"- Geoms: {len(gdf.geometry)}")
print(f"- crs:  {gdf.crs}")
print()

print("Clean:")
# The clean summary now reports the clean key count (it printed raw_keys).
print(f"- Keys:  {len(clean_keys)}")
# NOTE(review): geoms/crs still come from `gdf` because no separate clean
# frame is visible in this notebook -- confirm which frame belongs here.
print(f"- Geoms: {len(gdf.geometry)}")
print(f"- crs:  {gdf.crs}")
print()

# The two messages were swapped: a key present in clean_keys but absent from
# raw_keys is missing from the raw set, and vice versa.
for key in clean_keys:
    if key not in raw_keys:
        print(f"Raw is Missing: {key}")
for key in raw_keys:
    if key not in clean_keys:
        print(f"Clean is Missing: {key}")

## Overpy (Wrapper)

Resources:
- [Github](https://github.com/DinoTools/python-overpy)
- [doc](https://python-overpy.readthedocs.io/en/latest/)