### Extract full list of pdf maps

In [172]:
import pandas as pd
import numpy as np

import io
import requests
from PyPDF2 import PdfReader

In [137]:
ca = pd.read_csv("data/California.csv", header=None)
fl = pd.read_csv("data/Florida.csv", header=None)
mi = pd.read_csv("data/Michigan.csv", header=None)
tx = pd.read_csv("data/Texas.csv", header=None)
ny = pd.read_csv("data/New York.csv", header=None)
all_subset = pd.concat([ca, fl, mi, tx, ny])[14]

### Extract coordinates from topographical maps

In [155]:
from pdfminer.high_level import extract_text
import geocoder
import re

In [156]:
abbr = ["AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI","ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ","NM","NY","NC","ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT","VA","WA","WV","WI","WY"]
states = {"AL":"Alabama","AK":"Alaska","AZ":"Arizona","AR":"Arkansas","CA":"California","CO":"Colorado","CT":"Connecticut","DE":"Delaware","FL":"Florida","GA":"Georgia","HI":"Hawaii","ID":"Idaho","IL":"Illinois","IN":"Indiana","IA":"Iowa","KS":"Kansas","KY":"Kentucky","LA":"Louisiana","ME":"Maine","MD":"Maryland","MA":"Massachusetts","MI":"Michigan","MN":"Minnesota","MS":"Mississippi","MO":"Missouri","MT":"Montana","NE":"Nebraska","NV":"Nevada","NH":"New Hampshire","NJ":"New Jersey","NM":"New Mexico","NY":"New York","NC":"North Carolina","ND":"North Dakota","OH":"Ohio","OK":"Oklahoma","OR":"Oregon","PA":"Pennsylvania","RI":"Rhode Island","SC":"South Carolina","SD":"South Dakota","TN":"Tennessee","TX":"Texas","UT":"Utah","VT":"Vermont","VA":"Virginia","WA":"Washington","WV":"West Virginia","WI":"Wisconsin","WY":"Wyoming"}

In [291]:
def extract_coor(url):
    
    # Cite open-source pdfminer.six
    response = requests.get(url)
    pdf= BytesIO(response.content)
    text = extract_text(pdf)
    
    # Parse out the four coordinate bounds
    matches = re.findall("(-?\d+\.\d*?°)", text) 
    matches = pd.unique(np.array(matches))
    pair1 = [matches[0], matches[2]]
    pair2 = [matches[1], matches[3]]
    pair1_min = min(pair1)
    pair1_max = max(pair1)
    pair2_min = min(pair2)
    pair2_max = max(pair2)

    # Parse out state map is in for coordinate-validation
    found = ""
    for ele in abbr:
        if re.search(f",\s+{ele}\s", text) != None:
            found = ele
        else:
            continue
    state = states[found] 

    # Figure out which are the latitudes and which are the longitudes using state
    test1 = geocoder.google(f"{pair1_min}, {pair2_min}", reverse = True)
    test2 = geocoder.google(f"{pair1_max}, {pair2_max}", reverse = True)
    if state == test1 and state == test2:
        latmin, latmax, longmin, longmax = pair1_min, pair1_max, pair2_min, pair2_max
    else:
        latmin, latmax, longmin, longmax = pair2_min, pair2_max, pair1_min, pair1_max

    return float(latmin[0:-1]), float(longmin[0:-1]), float(latmax[0:-1]), float(longmax[0:-1])

### Extract ground-truth roads from OSM

In [221]:
import overpass
import folium

In [285]:
def get_roads(coords):
    bbox = (",".join(map(str, coords)))
    api = overpass.API()
    query = """way["highway"]({bbox})""".format(bbox=bbox)
    result = api.get(query)
    return results

In [280]:
def make_map(coords):
    m = folium.Map(location = coords)

In [None]:
def plot_roads(feature):

    # Get coordinates of the road
    id = feature["id"]
    query = """way({id}); out ids geom;""".format(id=id)
    for node in query["features"]:
        coords = node["geometry"]["coordinates"]
        
    way_type = feature["properties"].get("highway", "")  
    color = "grey"  
    weight = 1
    if way_type == "motorway":
        color = "red" 
        weight = 3
    elif way_type == "trunk":
        color = "orange"
        weight = 2
    elif way_type == "primary":
        color = "gold"
        weight = 2
    elif way_type == "secondary":
        color = "yellow"      
        weight = 2
    elif way_type == "tertiary":
        weight = 2
        
    folium.PolyLine(coords, color=color, weight=weight).add_to(m)

# Testing (comment out)

In [292]:
url = all_subset.iloc[0]
coords = extract_coor(url)

Status code Unknown from https://maps.googleapis.com/maps/api/geocode/json: ERROR - HTTPSConnectionPool(host='maps.googleapis.com', port=443): Max retries exceeded with url: /maps/api/geocode/json?address=-122.7500%C2%B0%2C+39.6250%C2%B0&bounds=&components=&region=&language= (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001BED9669390>: Failed to resolve 'maps.googleapis.com' ([Errno 11001] getaddrinfo failed)"))
Status code Unknown from https://maps.googleapis.com/maps/api/geocode/json: ERROR - HTTPSConnectionPool(host='maps.googleapis.com', port=443): Max retries exceeded with url: /maps/api/geocode/json?address=-122.8750%C2%B0%2C+39.7500%C2%B0&bounds=&components=&region=&language= (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001BED96C6F90>: Failed to resolve 'maps.googleapis.com' ([Errno 11001] getaddrinfo failed)"))


In [286]:
results = get_roads(coords)

OverpassSyntaxError: [out:json];way["highway"](39.6250°,-122.7500°,39.7500°,-122.8750°);out body;