### Extract full list of PDF maps

In [102]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import io
import requests
from PyPDF2 import PdfReader

import random

import networkx as nx

In [74]:
# Load the Texas map index. The CSV has no header row, so columns are
# addressed by their integer position.
tx = pd.read_csv("data/Texas.csv", header=None)

# Column 14 is later renamed to "url" and fetched as a PDF; column 16 is
# presumably the matching image URL (TODO confirm against the CSV schema).
tx_pdf, tx_img = tx[14], tx[16]

### Extract coordinates from topographical maps

In [18]:
from pdfminer.high_level import extract_text
import geocoder
import re

In [19]:
# Two-letter postal abbreviation -> full state name for the 50 US states.
states = {
    "AL": "Alabama",
    "AK": "Alaska",
    "AZ": "Arizona",
    "AR": "Arkansas",
    "CA": "California",
    "CO": "Colorado",
    "CT": "Connecticut",
    "DE": "Delaware",
    "FL": "Florida",
    "GA": "Georgia",
    "HI": "Hawaii",
    "ID": "Idaho",
    "IL": "Illinois",
    "IN": "Indiana",
    "IA": "Iowa",
    "KS": "Kansas",
    "KY": "Kentucky",
    "LA": "Louisiana",
    "ME": "Maine",
    "MD": "Maryland",
    "MA": "Massachusetts",
    "MI": "Michigan",
    "MN": "Minnesota",
    "MS": "Mississippi",
    "MO": "Missouri",
    "MT": "Montana",
    "NE": "Nebraska",
    "NV": "Nevada",
    "NH": "New Hampshire",
    "NJ": "New Jersey",
    "NM": "New Mexico",
    "NY": "New York",
    "NC": "North Carolina",
    "ND": "North Dakota",
    "OH": "Ohio",
    "OK": "Oklahoma",
    "OR": "Oregon",
    "PA": "Pennsylvania",
    "RI": "Rhode Island",
    "SC": "South Carolina",
    "SD": "South Dakota",
    "TN": "Tennessee",
    "TX": "Texas",
    "UT": "Utah",
    "VT": "Vermont",
    "VA": "Virginia",
    "WA": "Washington",
    "WV": "West Virginia",
    "WI": "Wisconsin",
    "WY": "Wyoming",
}

# Abbreviations as a list, in the same order as the keys above.
abbr = list(states)

In [37]:
def extract_coor(url):
    """Download a topographic-map PDF and return its bounding coordinates.

    The PDF text is scanned for decimal-degree values (e.g. ``-97.5000°``).
    The first four distinct values are split into two alternating pairs
    (1st/3rd and 2nd/4th). The state named in the map text is then used,
    via reverse geocoding, to decide which pair holds the latitudes.

    Parameters
    ----------
    url : str
        Address of the PDF to download.

    Returns
    -------
    tuple of float
        ``(latmin, latmax, longmin, longmax)`` in decimal degrees.

    Raises
    ------
    requests.HTTPError
        If the download returns an error status.
    ValueError
        If fewer than four distinct coordinates, or no state abbreviation
        from ``abbr``, can be found in the PDF text.
    """
    # Cite open-source pdfminer.six
    # A timeout keeps one stalled download from hanging a long batch run, and
    # raise_for_status stops an HTTP error page from being parsed as a PDF.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    text = extract_text(io.BytesIO(response.content))

    # Parse out the four coordinate bounds.
    # Values are converted to float *before* comparing: as strings they sort
    # lexicographically, which inverts min/max for negative longitudes
    # ("-97.5°" < "-97.6°").
    values = [float(number) for number in re.findall(r"(-?\d+\.\d*)°", text)]
    unique = list(dict.fromkeys(values))  # de-duplicate, keep order of appearance
    if len(unique) < 4:
        raise ValueError(
            f"Expected at least 4 distinct coordinates in {url}, found {len(unique)}"
        )
    pair1 = (unique[0], unique[2])
    pair2 = (unique[1], unique[3])
    pair1_min, pair1_max = min(pair1), max(pair1)
    pair2_min, pair2_max = min(pair2), max(pair2)

    # Parse out state map is in for coordinate-validation.
    # As before, when several abbreviations match, the last one in `abbr` wins.
    found = ""
    for ele in abbr:
        if re.search(rf",\s+{ele}\s", text) is not None:
            found = ele
    if not found:
        raise ValueError(f"No state abbreviation found in the text of {url}")
    state = states[found]

    def _in_state(lat, lng):
        """Return True when reverse-geocoding (lat, lng) lands in the map's state."""
        # Values outside the valid lat/long ranges cannot be this assignment,
        # so skip the lookup entirely.
        if not (-90 <= lat <= 90 and -180 <= lng <= 180):
            return False
        result = geocoder.google([lat, lng], method="reverse")
        # Compare against the result's state fields; comparing the result
        # object itself to a string is never equal.
        return result.state == found or result.state_long == state

    # Figure out which are the latitudes and which are the longitudes using state.
    # If the lookup does not confirm pair1 as latitudes, pair2 is used.
    if _in_state(pair1_min, pair2_min) and _in_state(pair1_max, pair2_max):
        latmin, latmax, longmin, longmax = pair1_min, pair1_max, pair2_min, pair2_max
    else:
        latmin, latmax, longmin, longmax = pair2_min, pair2_max, pair1_min, pair1_max

    return latmin, latmax, longmin, longmax

### Extract ground-truth roads from OSM

In [104]:
def get_ground_truth(coords, index):
    """Download the OSM network inside a bounding box and save it as GraphML.

    Parameters
    ----------
    coords : tuple
        Bounding box, passed unchanged to ``ox.graph_from_bbox`` as ``bbox``.
    index : int or str
        Suffix for the output file, ``data/training/tx_train_{index}.gml``.
    """
    # `ox` is used here but never imported in the notebook, so a fresh kernel
    # fails with NameError. Importing locally keeps the function self-contained.
    import osmnx as ox

    # NOTE(review): extract_coor returns (latmin, latmax, longmin, longmax).
    # The tuple order OSMnx expects for `bbox` appears to differ between
    # releases — confirm against the installed version before trusting the
    # downloaded area.
    G = ox.graph_from_bbox(bbox=coords, simplify = True, network_type="all")
    ox.io.save_graphml(G, f"data/training/tx_train_{index}.gml")

In [105]:
# Smoke test on a single bounding box; writes data/training/tx_train_test.gml.
# NOTE(review): `coords` is not assigned anywhere in the visible notebook. It
# presumably survived from an earlier kernel session — define it (e.g. from
# extract_coor) before running this cell on a fresh kernel.
get_ground_truth(coords, "test")

# Data Pipeline

In [95]:
# Draw a reproducible random sample of rows to use as the training set.
random.seed(428)
# Cap the sample size at the number of rows so a short CSV does not make
# random.sample raise; with 2000 or more rows the sample is unchanged.
train_index = random.sample(range(len(tx)), min(2000, len(tx)))

# reset_index() turns each Series into a DataFrame whose columns are "index"
# and the original column label, which is then renamed to "url".
train_pdf = tx_pdf.iloc[train_index].reset_index()
train_pdf = train_pdf.rename(columns={14:"url"})
train_img = tx_img.iloc[train_index].reset_index()
# tx_img comes from column 16, so that is the label to rename. Renaming 14
# here was a no-op that left train_img without a "url" column.
train_img = train_img.rename(columns={16:"url"})

In [97]:
# Download every sampled PDF and parse its bounding coordinates.
coord_list = list(map(extract_coor, train_pdf["url"]))

# Fetch and save the OSM road graph for each map, numbering the output files
# by the map's position in the sample.
for position, bounds in enumerate(coord_list):
    get_ground_truth(bounds, position)

KeyboardInterrupt: 