In [11]:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup

### url format for station list 
#### (search by name and data type)
https://waterdata.usgs.gov/nwis/inventory?search_station_nm=Arkansas%20river%20at&search_station_nm_match_type=beginning&data_type=rt&group_key=NONE&format=sitefile_output&sitefile_output_format=html_table&column_name=site_no&column_name=station_nm&column_name=dec_lat_va&column_name=dec_long_va&list_of_search_criteria=search_station_nm%2Cdata_type

#### <font color='blue'> If using pandas to read html directly, the site number loses "0" at the beginning</font>

In [4]:
url = 'https://waterdata.usgs.gov/nwis/inventory?search_station_nm=Arkansas%20river%20at&search_station_nm_match_type=beginning&data_type=rt&group_key=NONE&format=sitefile_output&sitefile_output_format=html_table&column_name=site_no&column_name=station_nm&column_name=dec_lat_va&column_name=dec_long_va&list_of_search_criteria=search_station_nm%2Cdata_type'

tables = pd.read_html(url) # pd.read_html() returns a list of tables from the url
df = tables[1]
df.head()

Unnamed: 0,Site Number,Site Name,Dec. Lat.,Dec. Lon,Cooraccr.,Dec.lat/longdatum
0,7086000,"ARKANSAS RIVER AT GRANITE, CO.",39.042771,-106.265855,F,NAD83
1,7087200,"ARKANSAS RIVER AT BUENA VISTA, CO.",38.849162,-106.12474,F,NAD83
2,7091200,"ARKANSAS RIVER NEAR NATHROP, CO",38.652219,-106.051126,F,NAD83
3,7094500,"ARKANSAS RIVER AT PARKDALE, CO.",38.487219,-105.373604,F,NAD83
4,7096000,"ARKANSAS RIVER AT CANON CITY, CO.",38.433887,-105.257213,F,NAD83


In [10]:
df.set_index('Site Number').to_dict(orient='index')

{7086000: {'Cooraccr.': 'F',
  'Dec. Lat.': 39.04277116,
  'Dec. Lon': -106.2658553,
  'Dec.lat/longdatum': 'NAD83',
  'Site Name': 'ARKANSAS RIVER AT GRANITE, CO.'},
 7087200: {'Cooraccr.': 'F',
  'Dec. Lat.': 38.8491622,
  'Dec. Lon': -106.1247397,
  'Dec.lat/longdatum': 'NAD83',
  'Site Name': 'ARKANSAS RIVER AT BUENA VISTA, CO.'},
 7091200: {'Cooraccr.': 'F',
  'Dec. Lat.': 38.6522188,
  'Dec. Lon': -106.051126,
  'Dec.lat/longdatum': 'NAD83',
  'Site Name': 'ARKANSAS RIVER NEAR NATHROP, CO'},
 7094500: {'Cooraccr.': 'F',
  'Dec. Lat.': 38.4872189,
  'Dec. Lon': -105.373604,
  'Dec.lat/longdatum': 'NAD83',
  'Site Name': 'ARKANSAS RIVER AT PARKDALE, CO.'},
 7096000: {'Cooraccr.': 'F',
  'Dec. Lat.': 38.43388674,
  'Dec. Lon': -105.2572128,
  'Dec.lat/longdatum': 'NAD83',
  'Site Name': 'ARKANSAS RIVER AT CANON CITY, CO.'},
 7097000: {'Cooraccr.': 'F',
  'Dec. Lat.': 38.38833418,
  'Dec. Lon': -105.0160961,
  'Dec.lat/longdatum': 'NAD83',
  'Site Name': 'ARKANSAS RIVER AT PORTLAND, 

In [7]:
dummy = tables[0]
dummy.head()

Unnamed: 0,0,1
0,Site name contains string =,Arkansas river at
1,Data type =,Current Conditions and Recent Daily Data


### <font color='blue'>Alternative to pandas</font>
#### <font color='blue'>Here it gets interesting: using Selenium to read "ghost" javascript tables, which are otherwise unseeable to beautifulsoup</font>

In [20]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

import os
chromedriver = "/Users/sealoving/Documents/GitHub/Liang_Metis/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

driver = webdriver.Chrome(chromedriver)

In [22]:
url = 'https://waterdata.usgs.gov/nwis/inventory?search_station_nm=cumberland%20river%20at&search_station_nm_match_type=beginning&data_type=rt&group_key=NONE&format=sitefile_output&sitefile_output_format=html_table&column_name=site_no&column_name=station_nm&column_name=dec_lat_va&column_name=dec_long_va&list_of_search_criteria=search_station_nm%2Cdata_type'
driver.get(url)

In [23]:
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [25]:
station_table = soup.find_all('table')[1]
#print(station_table.prettify())

In [32]:
stations = station_table.find_all('tr')[2:]
#print(stations[2].prettify()) #skip two header rows

In [33]:
# test table contents
for i in range(len(stations)):
    print(stations[i].find_all('td')[0].text.strip(),
          stations[i].find_all('td')[1].text.strip(),
          float(stations[i].find_all('td')[2].text.strip()),
          float(stations[i].find_all('td')[3].text.strip()),
          stations[i].find_all('td')[4].text.strip())

03402900 CUMBERLAND R AT PINE ST BR AT PINEVILLE, KY 36.7631412 -83.691862 F
03403500 CUMBERLAND RIVER AT BARBOURVILLE, KY 36.8623097 -83.8874248 F
03404000 CUMBERLAND RIVER AT WILLIAMSBURG, KY 36.743417 -84.1560449 F
03404500 CUMBERLAND RIVER AT CUMBERLAND FALLS, KY 36.8373036 -84.3432703 U
03414100 CUMBERLAND RIVER AT BURKESVILLE, KY 36.7867288 -85.3652406 F
03417500 CUMBERLAND RIVER AT CELINA, TN 36.55422759 -85.5144139 U
03425000 CUMBERLAND RIVER AT CARTHAGE, TN 36.24810996 -85.95526519 U
03426310 CUMBERLAND RIVER AT OLD HICKORY DAM (TW), TN 36.29711845 -86.6586324 U
03426490 CUMBERLAND RIVER AT EDENWOLD, TN 36.2875 -86.6883333 H
03430250 CUMBERLAND RIVER AT STONES RIVER NEAR HERMITAGE,TN 36.19166667 -86.6652778 H
03430320 CUMBERLAND RIVER AT BRILEY PKWY NR INGLEWOOD, TN 36.23527778 -86.7125 H
03431091 CUMBERLAND R AT OMOHUNDRO WTR PLT AT NASHVILLE, TN 36.1653343 -86.7213866 U
03431500 CUMBERLAND RIVER AT NASHVILLE, TN 36.1614984 -86.7726764 U
034315005 CUMBERLAND RIVER AT WOODLAND