In [17]:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup

#### Input information for scraping USGS water data

In [18]:
# --- Geographical scope ---------------------------------------------------
# River names used as search keywords in the USGS station search engine;
# one station list is generated per keyword.
watersheds = [
    'Missouri', 'Yellowstone', 'Platte', 'Mississippi', 'Illinois',
    'Ohio', 'Cumberland', 'Tennessee', 'Arkansas', 'Red',
]

# --- Study period ---------------------------------------------------------
# Ten water years in total (2007-2017); a "water year" starts on Oct. 1st.
start_date, end_date = '2007-10-01', '2017-09-30'

# --- Data types -----------------------------------------------------------
# Only discharge and river stage (gage) are extracted.
# NOTE(review): the inventory step further down passes ['Discharge','Gage']
# (capitalised) instead of this list - confirm which spelling is intended.
data_types = ['discharge', 'gage']

#### Step 1: generate a station list for each geographical region (watershed)

In [23]:
# NOTE: disabled draft of the station search that parsed the result page
# with pd.read_html(). Every line is commented out, so running this cell
# does nothing; the Selenium-based cell further down builds `stations`.
# stations = dict.fromkeys(watersheds, {})

# for ws in watersheds:
#     url = ('https://waterdata.usgs.gov/nwis/inventory?search_station_nm='
#            +ws.split()[0]
#            +'%20river%20at&search_station_nm_match_type=beginning&data_type=rt&group_key=NONE&format=sitefile_output&sitefile_output_format=html_table&column_name=site_no&column_name=station_nm&column_name=dec_lat_va&column_name=dec_long_va&list_of_search_criteria=search_station_nm%2Cdata_type')
#     tables = pd.read_html(url) # pd.read_html() returns a list of tables from the url
#     df = tables[1]
#     df.columns = ['site_no','site_name','site_lat','site_lon','Cooraccr','latlongdatum']
#     stations[ws] = df.drop('Cooraccr',axis=1).set_index('site_no').to_dict(orient='index')
#     print('...%d stations were added to %s' %(len(df),ws))



...40 stations were added to Missouri
...8 stations were added to Yellowstone
...6 stations were added to Platte
...35 stations were added to Mississippi
...14 stations were added to Illinois
...40 stations were added to Ohio
...18 stations were added to Cumberland
...6 stations were added to Tennessee
...31 stations were added to Arkansas
...26 stations were added to Red


In [62]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

import os
# Path to the chromedriver executable.
# Set the CHROMEDRIVER_PATH environment variable to point at a local copy;
# when it is not set, the original hard-coded location is used unchanged.
chromedriver = os.environ.get(
    "CHROMEDRIVER_PATH",
    "/Users/sealoving/Documents/GitHub/Liang_Metis/chromedriver")
os.environ["webdriver.chrome.driver"] = chromedriver

# NOTE(review): the driver path is passed positionally; newer Selenium
# releases may expect it via a Service object - confirm against the
# installed Selenium version.
driver = webdriver.Chrome(chromedriver)

In [74]:
# Stations are keyed by site number, so a site returned by more than one
# keyword search is stored once (the watershed searched last wins).
stations = {}

# Fixed parts of the NWIS site-inventory query; the river keyword goes in between.
search_prefix = 'https://waterdata.usgs.gov/nwis/inventory?search_station_nm='
search_suffix = '%20river%20at&search_station_nm_match_type=beginning&data_type=rt&group_key=NONE&format=sitefile_output&sitefile_output_format=html_table&column_name=site_no&column_name=station_nm&column_name=dec_lat_va&column_name=dec_long_va&list_of_search_criteria=search_station_nm%2Cdata_type'

for ws in watersheds:
    url = search_prefix + ws.split()[0] + search_suffix
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # The second table on the page holds the results; its first two rows are skipped.
    station_table = soup.find_all('table')[1].find_all('tr')[2:]

    for row in station_table:
        cells = row.find_all('td')
        # Column order: site number, station name, latitude, longitude
        site_no = cells[0].text.strip()
        name = cells[1].text.strip()
        lat = float(cells[2].text.strip())
        lon = float(cells[3].text.strip())
        stations[site_no] = {'watershed': ws, 'site_name': name, 'lat': lat, 'lon': lon}

    print('...%d stations were added to %s' %(len(station_table),ws))

...40 stations were added to Missouri
...8 stations were added to Yellowstone
...6 stations were added to Platte
...35 stations were added to Mississippi
...14 stations were added to Illinois
...40 stations were added to Ohio
...18 stations were added to Cumberland
...6 stations were added to Tennessee
...31 stations were added to Arkansas
...26 stations were added to Red


In [82]:
# Tabulate the station dictionary: one row per site number, one column per attribute
df = pd.DataFrame.from_dict(stations, orient='index')
df.head()

Unnamed: 0,watershed,site_name,lat,lon
2111500,Red,"REDDIES RIVER AT NORTH WILKESBORO, NC",36.175,-81.168889
3085730,Ohio,Ohio River at Emsworth Dam Upper Pool @ Emsworth,40.503889,-80.085556
3085734,Ohio,Ohio River at Emsworth Dam Lower Pool @ Emsworth,40.50525,-80.089833
3086000,Ohio,"Ohio River at Sewickley, PA",40.549234,-80.205615
3086001,Ohio,"Ohio River (lower pool) at Sewickley, PA",40.549722,-80.206944


#### Step 2: populate data inventory for each station in the lists

In [83]:
# Helper that reads one data type's entry out of a parsed inventory table.
# It works on an already-parsed BeautifulSoup object, not on a site number.
def get_data_info(soup,data_type):
    '''Grab the availability values of one data type from a USGS data inventory table.

    soup: parsed HTML (BeautifulSoup object or tag) to search in
    data_type: label to look for, e.g. 'Discharge'; it is compiled as a
        regular expression, so matching is case-sensitive

    The first text node matching ``data_type`` is located and the stripped
    text of the three elements that follow it in document order (they are
    not siblings of the match) is collected.

    Returns a list ``[start, end, count]`` of strings, or None if the label
    is not found or fewer than three elements follow it.
    '''
    node = soup.find(string=re.compile(data_type))
    if not node:
        return None

    values = []
    for _ in range(3):
        node = node.find_next()
        if node is None:
            # The document ended before all three values were seen
            return None
        values.append(node.text.strip())
    return values


    
def get_inventory(stations,data_types,timeout=30):
    '''Populate data info for each station in stations dictionary

    For every site number in ``stations`` the USGS NWIS inventory page is
    downloaded and the station's record is updated in place with:

    * 'drainage_area_sqmi': float parsed from the "Drainage" line, or None
    * 'gage_datum_ft': float parsed from the "Datum" line, or None
    * one key per entry of ``data_types``: the list returned by
      ``get_data_info`` for that data type, or None when it is not listed

    stations: dict keyed by site number (string); each value is a dict
    data_types: a list of data type to be extracted
        such as: 'Discharge','Gage','Salinity','Turbidity'
    timeout: seconds to wait for each HTTP request before giving up

    Returns the same ``stations`` dictionary that was passed in.

    Raises the exceptions of ``requests`` when a page cannot be downloaded
    (timeout, connection error, HTTP error status). A page that lacks the
    station table or one of its lists yields None values instead of an error.
    '''
    for site_no, record in stations.items():
        url = 'https://waterdata.usgs.gov/nwis/inventory/?site_no='+site_no+'&agency_cd=USGS'

        # Without a timeout a single stalled connection would block the loop forever
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text,"lxml")

        # Look the station table up once; its first <dl> holds the site
        # description and its second <dl> the data inventory
        station_table = soup.find(id='stationTable')
        dl_lists = station_table.find_all('dl') if station_table is not None else []
        description_table = dl_lists[0] if len(dl_lists) > 0 else None
        inventory_table = dl_lists[1] if len(dl_lists) > 1 else None

        drainage = datum = None
        if description_table is not None:
            drainage = description_table.find(string=re.compile('Drainage'))
            datum = description_table.find(string=re.compile('Datum'))

        # The number is the 3rd token of the drainage line and the 4th token
        # of the datum line; thousands separators are removed before conversion
        record['drainage_area_sqmi'] = (
            float(drainage.split()[2].replace(',','')) if drainage else None)
        record['gage_datum_ft'] = (
            float(datum.split()[3].replace(',','')) if datum else None)

        for data_type in data_types:
            record[data_type] = (
                get_data_info(inventory_table, data_type)
                if inventory_table is not None else None)

    return(stations)

In [84]:
# Add drainage area, gage datum and the 'Discharge' / 'Gage' inventory entries to every station
stations = get_inventory(stations, data_types=['Discharge', 'Gage'])

200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200


In [86]:
# Rebuild the table now that each station also carries its inventory fields
df = pd.DataFrame.from_dict(stations, orient='index')
df.head(20)

Unnamed: 0,watershed,site_name,lat,lon,drainage_area_sqmi,gage_datum_ft,Discharge,Gage
2111500,Red,"REDDIES RIVER AT NORTH WILKESBORO, NC",36.175,-81.168889,89.2,978.62,"[2004-10-01, 2018-01-25, 38125]","[2004-10-01, 2018-01-25, 15598]"
3085730,Ohio,Ohio River at Emsworth Dam Upper Pool @ Emsworth,40.503889,-80.085556,19500.0,694.0,,
3085734,Ohio,Ohio River at Emsworth Dam Lower Pool @ Emsworth,40.50525,-80.089833,19500.0,680.0,,
3086000,Ohio,"Ohio River at Sewickley, PA",40.549234,-80.205615,19500.0,680.0,"[1933-10-01, 2018-01-25, 30793]",
3086001,Ohio,"Ohio River (lower pool) at Sewickley, PA",40.549722,-80.206944,,,,
3108500,Ohio,"Ohio River at Montgomery Lock & Dam, Lower Pool",40.647288,-80.388675,22960.0,653.6,,
3110685,Ohio,"OHIO R AT NEW CUMBERLAND LOCK & DAM (UPPER), OH",40.528399,-80.62674,23820.0,652.06,,"[2010-09-30, 2018-01-25, 2659]"
3110690,Ohio,"OHIO R AT NEW CUMBERLAND LOCK & DAM (LOWER), OH",40.528121,-80.625629,23820.0,631.56,,"[2010-09-30, 2018-01-25, 2630]"
3111515,Ohio,"OHIO R AT PIKE ISLAND DAM NR WHEELING (UPPER), WV",40.152849,-80.699802,24600.0,631.52,,"[2010-09-30, 2018-01-25, 2662]"
3111520,Ohio,"OHIO R AT PIKE ISLAND LOCK & DAM (LOWER), WV",40.149794,-80.701469,24600.0,610.62,,"[2010-09-30, 2018-01-25, 2651]"
