In [1]:
from urllib.parse import urljoin

import geopandas as gpd
import requests
from bs4 import BeautifulSoup
from pyproj import CRS
from shapely.geometry import box

# Projection name as written in the metadata -> (projection alias, datum).
# parse_tile_xml looks names up here; a datum of None leaves the choice of
# datum to that function.
PROJ_NAME_MAP = dict((
    ('NAD83 / Conus Albers', ('aea', 'NAD83')),
    ('Albers Conical Equal Area', ('aea', None)),
    ('Lambert Conformal Conic', ('lcc', None)),
    ('Transverse Mercator', ('tmerc', None)),
    # add further projection names here as they turn up
))
# Root folder of the project on the server (note the trailing slash: the
# sub-folder names below are appended to it directly).
gs_3dep_url = 'https://rockyweb.usgs.gov/vdelivery/Datasets/Staged/Elevation/LPC/Projects/ID_FEMAHQ_2018_D18/ID_FEMAHQ_2018/'

# Sub-folders of the project root: metadata XML, browse images, point-cloud tiles.
gs_3dep_meta_url, gs_3dep_browse_url, gs_3dep_las_url = 'metadata/', 'browse/', 'LAZ/'

# Output files for the tile index: geographic coordinates and the project CRS.
output_geojson_geo = 'ID_FEMAHQ_2018_las_tiles.geojson'
output_geojson_proj = 'ID_FEMAHQ_2018_las_tiles_alb.geojson'

In [2]:
def parse_html(gs_3dep_url):
    """Return absolute URLs of every ``.xml`` file linked from an HTML index page.

    Parameters
    ----------
    gs_3dep_url : str
        URL of the HTML page to scan. Relative links are resolved against it.

    Returns
    -------
    list of str
        Absolute URLs, in page order, of all ``<a href=...>`` targets whose
        href ends with ``.xml`` (case-insensitive).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # Bound the wait and fail loudly on HTTP errors so that an error page is
    # never silently parsed into an empty link list.
    resp = requests.get(gs_3dep_url, timeout=60)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    # urljoin leaves an already-absolute href unchanged and resolves relative
    # ones, so no manual "starts with http" test is needed.
    return [
        urljoin(gs_3dep_url, link_tag['href'])
        for link_tag in soup.find_all('a', href=True)
        if link_tag['href'].lower().endswith('.xml')
    ]

In [3]:
def _require_tag(parent, name: str, url: str):
    """Return the first ``<name>`` descendant of ``parent`` or raise a descriptive error."""
    found = parent.find(name)
    if found is None:
        raise ValueError(f'<{name}> element not found in metadata: {url}')
    return found


def parse_tile_xml(url: str) -> dict:
    """Download one tile metadata XML file and extract its bounding box and CRS.

    Parameters
    ----------
    url : str
        URL of the metadata XML document.

    Returns
    -------
    dict
        ``{'bbox': {'west', 'east', 'north', 'south'}, 'crs': pyproj.CRS}``
        where the bbox values are the floats read from ``<spdom><bounding>``.

    Raises
    ------
    requests.HTTPError
        If the download fails with an error status.
    ValueError
        If a required element is missing from the document, or a numeric
        field cannot be converted to ``float``.

    Notes
    -----
    The CRS is resolved in this order: an EPSG code under ``<refsysid>``, WKT
    text in ``<planarco>``, parameters assembled from the ``<albers>`` block
    when ``PROJ_NAME_MAP`` maps the projection name to ``'aea'``, and finally
    the projection name itself handed to ``CRS.from_user_input``.
    """
    resp = requests.get(url, timeout=60)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, 'xml')

    bounds_xml = _require_tag(_require_tag(soup, 'spdom', url), 'bounding', url)
    bbox = {
        side: float(_require_tag(bounds_xml, f'{side}bc', url).text)
        for side in ('west', 'east', 'north', 'south')
    }

    planar = _require_tag(_require_tag(soup, 'horizsys', url), 'planar', url)
    mapproj = _require_tag(planar, 'mapproj', url)
    proj_name = _require_tag(mapproj, 'mapprojn', url).text.strip()

    # prefer EPSG or WKT if present
    epsg_tag = soup.find('refsysid')
    epsg_code = epsg_tag.find('code') if epsg_tag is not None else None
    wkt_tag = mapproj.find('planarco')
    if epsg_code is not None:
        crs = CRS.from_epsg(int(epsg_code.text))
    elif wkt_tag is not None:
        crs = CRS.from_wkt(wkt_tag.text)
    else:
        proj_alias, datum = PROJ_NAME_MAP.get(proj_name, (None, None))
        params = {}
        albers = mapproj.find('albers')
        if proj_alias == 'aea' and albers is not None:
            stdpars = [float(tag.text) for tag in albers.find_all('stdparll')]
            if not stdpars:
                raise ValueError(f'<stdparll> element not found in metadata: {url}')
            # A document may list a single standard parallel; the
            # one-parallel form of the projection is lat_1 == lat_2.
            lat_1 = stdpars[0]
            lat_2 = stdpars[1] if len(stdpars) > 1 else lat_1
            params = {
                'proj': 'aea',
                'lat_1': lat_1,
                'lat_2': lat_2,
                'lat_0': float(_require_tag(albers, 'latprjo', url).text),
                'lon_0': float(_require_tag(albers, 'longcm', url).text),
                'x_0': float(_require_tag(albers, 'feast', url).text),
                'y_0': float(_require_tag(albers, 'fnorth', url).text),
                # NOTE(review): WGS84 is assumed when PROJ_NAME_MAP gives no
                # datum - confirm against the datum declared in the metadata.
                'datum': datum or 'WGS84',
                'units': 'm',
            }
        # add elif blocks for other projections (lcc, tmerc, utm...) using their tags
        if params:
            crs = CRS.from_dict(params)
        else:
            crs = CRS.from_user_input(proj_name)  # final fallback

    return {'bbox': bbox, 'crs': crs}

In [4]:
# List every metadata XML link found on the project's metadata index page.
project_xmls = parse_html(f'{gs_3dep_url}{gs_3dep_meta_url}')

In [None]:
# Build one footprint record per tile from its metadata XML, then assemble the
# records into a GeoDataFrame and reproject it to the project CRS.
if not project_xmls:
    raise ValueError('No metadata XML links were found; nothing to index.')

total_tiles = len(project_xmls)
project_meta = None  # metadata of the first tile; same CRS for all tiles in this project
records = []
for i, url in enumerate(project_xmls):
    meta = parse_tile_xml(url)
    if project_meta is None:
        # Reuse the first parse instead of downloading that file a second time.
        project_meta = meta
    bounds = meta['bbox']
    records.append(
        {
            # Slice assumes the file name ends with a 10-character tile id
            # followed by a 4-character extension such as '.xml'.
            'tile_id': url[-14:-4],
            'xml_url': url,
            'browse_url': url.replace(gs_3dep_meta_url, gs_3dep_browse_url)[0:-4] + '.jpg',
            'laz_url': url.replace(gs_3dep_meta_url, gs_3dep_las_url)[0:-4] + '.laz',
            'geometry': box(bounds['west'], bounds['south'],
                            bounds['east'], bounds['north']),
        }
    )

    # (i + 1) lets the counter reach 100%; end='' keeps the cursor on the same
    # line so the leading '\r' overwrites the previous message.
    print(f'\rCompleted {(i + 1)/total_tiles:.2%} of XML files...', end='', flush=True)
print()  # finish the progress line

project_crs = project_meta['crs']

# The bounding coordinates are treated as longitude/latitude (EPSG:4326).
tiles_gdf_geo = gpd.GeoDataFrame(records, crs='EPSG:4326')
tiles_gdf_proj = tiles_gdf_geo.to_crs(project_crs)

In [None]:
# Write both tile-index layers as GeoJSON. The driver is named explicitly so
# the output format does not depend on file-extension inference.
tiles_gdf_geo.to_file(output_geojson_geo, driver='GeoJSON')
tiles_gdf_proj.to_file(output_geojson_proj, driver='GeoJSON')


In [6]:
# Scratch check: keep the first metadata URL as a sample for trying out the URL rewriting.
test = project_xmls[0]

In [7]:
# Display the sample metadata URL.
test

'https://rockyweb.usgs.gov/vdelivery/Datasets/Staged/Elevation/LPC/Projects/ID_FEMAHQ_2018_D18/ID_FEMAHQ_2018/metadata/USGS_LPC_ID_FEMAHQ_2018_D18_w1488n2451.xml'

In [9]:
# Swap the metadata folder for the browse folder, drop the last four
# characters (the '.xml' suffix of the sample URL) and append '.jpg'.
f"{test.replace(gs_3dep_meta_url, gs_3dep_browse_url)[:-4]}.jpg"

'https://rockyweb.usgs.gov/vdelivery/Datasets/Staged/Elevation/LPC/Projects/ID_FEMAHQ_2018_D18/ID_FEMAHQ_2018/browse/USGS_LPC_ID_FEMAHQ_2018_D18_w1488n2451.jpg'