# EPA Metadata Scrape

In [1]:
import pandas as pd
import numpy as np
import requests
import lxml.html as lh
from bs4 import BeautifulSoup


## Stream Data

#### The script reads a CSV that was downloaded from the EPA Water Quality Portal. It lists the stream sites within our HUCs that have at least 20 Nitrogen / Nitrate samples.

In [36]:
# Load the stream station metadata exported from the Water Quality Portal.
# A 'USGSPCode' column is passed through str instead of being type-inferred.
wqp_n_station_list_stream = pd.read_csv(
    'C://Users/robert.taylor/Downloads//N_WQP_station_meta_stream.csv',
    converters={'USGSPCode': str},
)

#### The number of stream sites is printed.

In [37]:
# Row count of the station table = number of stream sites.
print(wqp_n_station_list_stream.shape[0])

205


#### The N (nitrogen) results for the sites, previously downloaded as a CSV, are read in.

In [38]:
# Load the stream nitrogen results exported from the Water Quality Portal.
# 'USGSPCode' values are passed through str instead of being type-inferred.
wqp_n_results_stream = pd.read_csv(
    'C://Users/robert.taylor/Downloads//N_WQP_results_meta_stream.csv',
    converters={'USGSPCode': str},
)

#### From the results data, the number of samples per site is computed and stored in a table whose column is labeled "count_nu".

In [39]:
# Count result rows per monitoring location; the frame is indexed by the
# MonitoringLocationIdentifier values and has a single column of counts.
count_nu_stream = pd.DataFrame(wqp_n_results_stream['MonitoringLocationIdentifier'].value_counts())
# Name the count column "count_nu". Older pandas names the value_counts()
# result after the source column, while pandas >= 2.0 names it "count";
# mapping both keeps the rename from silently matching nothing.
count_nu_stream_df = count_nu_stream.rename(
    columns={'MonitoringLocationIdentifier': 'count_nu', 'count': 'count_nu'}
)

In [40]:
# Render the per-site sample-count table built in the previous cell.
display(count_nu_stream_df)

Unnamed: 0,count_nu
USGS-02318500,694
USGS-02314500,473
USGS-02317797,391
USGS-02320500,290
21FLGW_WQX-3538,275
...,...
21FLSUW-DEP010C1,20
21FLSUW-OLS010C1,20
21FLTLHR_WQX-G1TLHR0047,20
21FLSUW-WIT030C1,20


#### Below, a breakdown of sites per organization is shown.

In [41]:
# Number of stream sites contributed by each organization, as a one-column frame.
wqp_n_station_list_stream['OrganizationFormalName'].value_counts().to_frame()

Unnamed: 0,OrganizationFormalName
USGS Georgia Water Science Center,55
Suwannee River Water Management District (Florida),37
Georgia DNR Environmental Protection Division,30
Suwannee River Water Management District,25
USGS Florida Water Science Center,19
...,...
FL Dept. of Environmental Protection,8
HOWARD T ODUM FLORIDA SPRINGS INSTITUTE,6
"FL Dept. of Environmental Protection, Northeast District",6
FDEP TALLAHASSEE REGIONAL OPERATIONS CENTER,1


#### Using the per-site sample counts from the results file, the sample count for each site is added to the site metadata dataframe.

In [42]:
# Attach the per-site counts to the station metadata by matching the count
# table's index against the MonitoringLocationIdentifier column (inner join,
# the pandas default), then order the rows by index label.
wqp_stream_site_list_count_df = (
    count_nu_stream_df
    .merge(wqp_n_station_list_stream, left_index=True, right_on='MonitoringLocationIdentifier')
    .sort_index()
)
display(wqp_stream_site_list_count_df)

Unnamed: 0,count_nu,OrganizationIdentifier,OrganizationFormalName,MonitoringLocationIdentifier,MonitoringLocationName,...,WellDepthMeasure/MeasureValue,WellDepthMeasure/MeasureUnitCode,WellHoleDepthMeasure/MeasureValue,WellHoleDepthMeasure/MeasureUnitCode,ProviderName
0,28,USGS-FL,USGS Florida Water Science Center,USGS-02314274,"SUWANNEE RIVER (AT SILL) NEAR FARGO, GA",...,,,,,NWIS
1,26,USGS-FL,USGS Florida Water Science Center,USGS-023142741,NORTH FORK SUWANNEE RIVER AT SILL NEAR FARGO GA,...,,,,,NWIS
2,46,USGS-FL,USGS Florida Water Science Center,USGS-02314986,"ROCKY CREEK NR BELMONT,FLA.",...,,,,,NWIS
3,104,USGS-FL,USGS Florida Water Science Center,USGS-02315000,SUWANNEE R NR BENTON FLA,...,,,,,NWIS
4,80,USGS-FL,USGS Florida Water Science Center,USGS-02315005,HUNTER CREEK NEAR BELMONT FLA,...,,,,,NWIS
...,...,...,...,...,...,...,...,...,...,...,...
200,86,21GAEPD_WQX,Georgia DNR Environmental Protection Division,21GAEPD_WQX-RV_09_3181,"Suwannee River - U.S. Highway 441 near Fargo, GA",...,,,,,STORET
201,22,21GAEPD_WQX,Georgia DNR Environmental Protection Division,21GAEPD_WQX-RV_09_3209,New River - U.S. Highway 82 Near Tifton,...,,,,,STORET
202,25,21GAEPD_WQX,Georgia DNR Environmental Protection Division,21GAEPD_WQX-RV_09_3230,"Piscola Creek at State Road 38 near Dixie, GA",...,,,,,STORET
203,74,21GAEPD_WQX,Georgia DNR Environmental Protection Division,21GAEPD_WQX-RV_09_3236,Withlacoochee River at Clyattsville-Nankin Roa...,...,,,,,STORET


## Spring Data

#### The script reads a csv which was downloaded from the EPA Water Quality Portal. Spring sites within our HUCs that measured > 20 samples of Nitrogen / Nitrate were gathered.

In [43]:
# Load the spring station metadata exported from the Water Quality Portal.
# A 'USGSPCode' column is passed through str instead of being type-inferred.
wqp_n_station_list_spring = pd.read_csv(
    'C://Users/robert.taylor/Downloads//N_WQP_station_meta_spring.csv',
    converters={'USGSPCode': str},
)

#### The number of spring sites is printed.

In [44]:
# Row count of the station table = number of spring sites.
print(wqp_n_station_list_spring.shape[0])

66


#### The N (nitrogen) results for the sites, previously downloaded as a CSV, are read in.

In [45]:
# Load the spring nitrogen results exported from the Water Quality Portal.
# 'USGSPCode' values are passed through str instead of being type-inferred.
wqp_n_results_spring = pd.read_csv(
    'C://Users/robert.taylor/Downloads//N_WQP_results_meta_spring.csv',
    converters={'USGSPCode': str},
)

#### From the results data, the number of samples per site is computed and stored in a table whose column is labeled "count_nu".

In [46]:
# Count result rows per monitoring location; the frame is indexed by the
# MonitoringLocationIdentifier values and has a single column of counts.
count_nu_spring = pd.DataFrame(wqp_n_results_spring['MonitoringLocationIdentifier'].value_counts())
# Name the count column "count_nu". Older pandas names the value_counts()
# result after the source column, while pandas >= 2.0 names it "count";
# mapping both keeps the rename from silently matching nothing.
count_nu_spring_df = count_nu_spring.rename(
    columns={'MonitoringLocationIdentifier': 'count_nu', 'count': 'count_nu'}
)
display(count_nu_spring_df)

Unnamed: 0,count_nu
USGS-02322688,184
USGS-02319302,174
USGS-02320250,149
USGS-02319950,147
USGS-02323566,141
...,...
21FLSUW_WQX-ROY010C1,23
21FLGW_WQX-11407,23
21FLSUW_WQX-HOL010C1,22
21FLSUW_WQX-127898,21


#### Below, a breakdown of sites per organization is shown.

In [47]:
# Number of spring sites contributed by each organization, as a one-column frame.
wqp_n_station_list_spring['OrganizationFormalName'].value_counts().to_frame()

Unnamed: 0,OrganizationFormalName
Suwannee River Water Management District,30
FL Dept. of Environmental Protection,18
USGS Florida Water Science Center,8
HOWARD T ODUM FLORIDA SPRINGS INSTITUTE,8
"FL Dept. of Environmental Protection, Northeast District",1
FDEP TALLAHASSEE REGIONAL OPERATIONS CENTER,1


#### Using the per-site sample counts from the results file, the sample count for each site is added to the site metadata dataframe.

In [48]:
# Attach the per-site counts to the station metadata by matching the count
# table's index against the MonitoringLocationIdentifier column (inner join,
# the pandas default), then order the rows by index label.
wqp_spring_site_list_count_df = (
    count_nu_spring_df
    .merge(wqp_n_station_list_spring, left_index=True, right_on='MonitoringLocationIdentifier')
    .sort_index()
)
# NOTE(review): only the spring table is written here, although the file name
# says "All_Sites"; the stream and well cells write no file. Confirm this
# export is intended.
wqp_spring_site_list_count_df.to_csv(
    'C:/Users/robert.taylor/Documents/SuwData/EPA/EPA_All_Sites_Meta_Final.csv'
)
display(wqp_spring_site_list_count_df)

Unnamed: 0,count_nu,OrganizationIdentifier,OrganizationFormalName,MonitoringLocationIdentifier,MonitoringLocationName,...,WellDepthMeasure/MeasureValue,WellDepthMeasure/MeasureUnitCode,WellHoleDepthMeasure/MeasureValue,WellHoleDepthMeasure/MeasureUnitCode,ProviderName
0,174,USGS-FL,USGS Florida Water Science Center,USGS-02319302,"MADISON BLUE SPRING NR BLUE SPRINGS, FL",...,,,,,NWIS
1,94,USGS-FL,USGS Florida Water Science Center,USGS-02319520,"FALMOUTH SPRING AT FALMOUTH, FL",...,,,,,NWIS
2,147,USGS-FL,USGS Florida Water Science Center,USGS-02319950,"BLUE SPRINGS NEAR DELL,FL",...,,,,,NWIS
3,149,USGS-FL,USGS Florida Water Science Center,USGS-02320250,TROY SPRING NEAR BRANFORD FLA,...,,,,,NWIS
4,59,USGS-FL,USGS Florida Water Science Center,USGS-02322400,GINNIE SPRING NR HIGH SPRINGS FLA,...,,,,,NWIS
...,...,...,...,...,...,...,...,...,...,...,...
61,26,21FLSUW_WQX,Suwannee River Water Management District,21FLSUW_WQX-SSS010C1,SUWANNEE SPRINGS,...,,,,,STORET
62,37,21FLSUW_WQX,Suwannee River Water Management District,21FLSUW_WQX-SUW718971,UN-NAMED SPRING,...,,,,,STORET
63,37,21FLSUW_WQX,Suwannee River Water Management District,21FLSUW_WQX-TEL010C1,TELFORD SPRINGS,...,,,,,STORET
64,47,21FLSUW_WQX,Suwannee River Water Management District,21FLSUW_WQX-TRY010C1,TROY SPRINGS,...,,,,,STORET


## Wendy Springs

In [49]:
# Case-sensitive name prefixes of the springs of interest; both spellings are
# listed where the source data mixes upper and title case.
searchfor = ['DEVIL', 'HORNSBY', 'ICHETUCKNEE', 'Ichetucknee', 'Blue Hole', 'BLUE HOLE', 'GINNIE', 'JULY']
# Keep the spring sites whose name starts with any of the prefixes.
# na=False treats a missing name as "no match"; without it a NaN in the mask
# makes the boolean indexing raise.
name_matches = wqp_spring_site_list_count_df['MonitoringLocationName'].str.startswith(tuple(searchfor), na=False)
search_results = wqp_spring_site_list_count_df[name_matches]

In [50]:
# Show the matched spring sites.
display(search_results)
# Collect their identifiers into a tuple for filtering the results table.
LocID = tuple(search_results['MonitoringLocationIdentifier'].tolist())
print(LocID)

Unnamed: 0,count_nu,OrganizationIdentifier,OrganizationFormalName,MonitoringLocationIdentifier,MonitoringLocationName,...,WellDepthMeasure/MeasureValue,WellDepthMeasure/MeasureUnitCode,WellHoleDepthMeasure/MeasureValue,WellHoleDepthMeasure/MeasureUnitCode,ProviderName
4,59,USGS-FL,USGS Florida Water Science Center,USGS-02322400,GINNIE SPRING NR HIGH SPRINGS FLA,...,,,,,NWIS
5,184,USGS-FL,USGS Florida Water Science Center,USGS-02322688,"BLUE HOLE SPRING NR HILDRETH, FL",...,,,,,NWIS
11,23,21FLFSI_WQX,HOWARD T ODUM FLORIDA SPRINGS INSTITUTE,21FLFSI_WQX-ICH BLUE HOLE SPRING,Blue Hole Spring,...,,,,,STORET
12,25,21FLFSI_WQX,HOWARD T ODUM FLORIDA SPRINGS INSTITUTE,21FLFSI_WQX-ICHETUCKNEE HEAD SPRING,Ichetucknee Head Spring,...,,,,,STORET
19,23,21FLGW_WQX,FL Dept. of Environmental Protection,21FLGW_WQX-11407,GINNIE SPRING,...,,,,,STORET
...,...,...,...,...,...,...,...,...,...,...,...
32,87,21FLGW_WQX,FL Dept. of Environmental Protection,21FLGW_WQX-9713,ICHETUCKNEE SPRING MAIN,...,,,,,STORET
34,76,21FLGW_WQX,FL Dept. of Environmental Protection,21FLGW_WQX-9743,BLUE HOLE SPRING VENT,...,,,,,STORET
46,31,21FLSUW_WQX,Suwannee River Water Management District,21FLSUW_WQX-GIN010C1,GINNIE SPRINGS,...,,,,,STORET
49,42,21FLSUW_WQX,Suwannee River Water Management District,21FLSUW_WQX-HOR010C1,HORNSBY SPRING NR HIGH SPRINGS,...,,,,,STORET


('USGS-02322400', 'USGS-02322688', '21FLFSI_WQX-ICH BLUE HOLE SPRING', '21FLFSI_WQX-ICHETUCKNEE HEAD SPRING', '21FLGW_WQX-11407', '21FLGW_WQX-11418', '21FLGW_WQX-9677', '21FLGW_WQX-9681', '21FLGW_WQX-9713', '21FLGW_WQX-9743', '21FLSUW_WQX-GIN010C1', '21FLSUW_WQX-HOR010C1', '21FLSUW_WQX-ICH002C1')


In [51]:
# Select the result rows that belong to the sites collected in LocID.
# isin() matches identifiers exactly; the previous str.startswith(LocID) was a
# prefix test, so a site whose identifier merely begins with a selected one
# would have been pulled in as well.
spring_search = wqp_n_results_spring.loc[wqp_n_results_spring['MonitoringLocationIdentifier'].isin(LocID)]
# Restrict the selection to the columns listed below, in this order.
spring_search=spring_search[['OrganizationIdentifier', 'OrganizationFormalName', 'MonitoringLocationIdentifier',
       'ActivityIdentifier', 'ActivityTypeCode', 'ActivityMediaName',
       'ActivityMediaSubdivisionName', 'ActivityStartDate',
       'ActivityStartTime/Time', 'ActivityStartTime/TimeZoneCode',
       'ActivityEndDate', 'ActivityEndTime/Time',
       'ActivityEndTime/TimeZoneCode',
       'ActivityDepthHeightMeasure/MeasureValue',
       'ActivityDepthHeightMeasure/MeasureUnitCode',
       'ActivityDepthAltitudeReferencePointText',
       'ActivityTopDepthHeightMeasure/MeasureValue',
       'ActivityTopDepthHeightMeasure/MeasureUnitCode',
       'ActivityBottomDepthHeightMeasure/MeasureValue',
       'ActivityBottomDepthHeightMeasure/MeasureUnitCode', 'ProjectIdentifier',
       'ActivityConductingOrganizationText',
       'ActivityCommentText', 'SampleAquifer', 'HydrologicCondition',
       'HydrologicEvent', 'SampleCollectionMethod/MethodIdentifier',
       'SampleCollectionMethod/MethodIdentifierContext',
       'SampleCollectionMethod/MethodName', 'SampleCollectionEquipmentName',
       'ResultDetectionConditionText', 'CharacteristicName',
       'ResultSampleFractionText', 'ResultMeasureValue',
       'ResultMeasure/MeasureUnitCode', 'MeasureQualifierCode',
       'ResultStatusIdentifier', 'StatisticalBaseCode', 'ResultValueTypeName',
       'ResultWeightBasisText', 'ResultTimeBasisText',
       'ResultTemperatureBasisText', 'ResultParticleSizeBasisText',
       'PrecisionValue', 'ResultCommentText', 'USGSPCode',
       'ResultDepthHeightMeasure/MeasureValue',
       'ResultDepthHeightMeasure/MeasureUnitCode',
       'ResultDepthAltitudeReferencePointText', 'SubjectTaxonomicName',
       'SampleTissueAnatomyName', 'ResultAnalyticalMethod/MethodIdentifier',
       'ResultAnalyticalMethod/MethodIdentifierContext',
       'ResultAnalyticalMethod/MethodName', 'MethodDescriptionText',
       'LaboratoryName', 'AnalysisStartDate', 'ResultLaboratoryCommentText',
       'DetectionQuantitationLimitTypeName',
       'DetectionQuantitationLimitMeasure/MeasureValue',
       'DetectionQuantitationLimitMeasure/MeasureUnitCode',
       'PreparationStartDate', 'ProviderName']]


## Well Data

#### The script reads a CSV that was downloaded from the EPA Water Quality Portal. It lists the well sites within our HUCs that have at least 20 Nitrogen / Nitrate samples.

In [52]:
# Load the well station metadata exported from the Water Quality Portal.
# A 'USGSPCode' column is passed through str instead of being type-inferred.
wqp_n_station_list_well = pd.read_csv(
    'C://Users/robert.taylor/Downloads//N_WQP_station_meta_well.csv',
    converters={'USGSPCode': str},
)

#### The number of well sites is printed.

In [53]:
# Row count of the station table = number of well sites.
print(wqp_n_station_list_well.shape[0])

99


#### The N (nitrogen) results for the sites, previously downloaded as a CSV, are read in.

In [54]:
# Load the well nitrogen results exported from the Water Quality Portal.
# 'USGSPCode' values are passed through str instead of being type-inferred.
wqp_n_results_well = pd.read_csv(
    'C://Users/robert.taylor/Downloads//N_WQP_results_meta_well.csv',
    converters={'USGSPCode': str},
)

#### From the results data, the number of samples per site is computed and stored in a table whose column is labeled "count_nu".

In [55]:
# Count result rows per monitoring location; the frame is indexed by the
# MonitoringLocationIdentifier values and has a single column of counts.
count_nu_well = pd.DataFrame(wqp_n_results_well['MonitoringLocationIdentifier'].value_counts())
# Name the count column "count_nu". Older pandas names the value_counts()
# result after the source column, while pandas >= 2.0 names it "count";
# mapping both keeps the rename from silently matching nothing.
count_nu_well_df = count_nu_well.rename(
    columns={'MonitoringLocationIdentifier': 'count_nu', 'count': 'count_nu'}
)

In [56]:
# Render the per-site sample-count table built in the previous cell.
display(count_nu_well_df)

Unnamed: 0,count_nu
USGS-300800083080004,93
USGS-300200083090005,81
USGS-300800083150007,79
USGS-300200083090006,75
USGS-300800083080003,69
...,...
21FLSUW-11510003,21
21FLSUW--071419001,21
21FLSUW--0101303003,21
21FLSUW--081535002,20


#### Below, a breakdown of sites per organization is shown.

In [57]:
# Number of well sites contributed by each organization, as a one-column frame.
wqp_n_station_list_well['OrganizationFormalName'].value_counts().to_frame()

Unnamed: 0,OrganizationFormalName
Suwannee River Water Management District (Florida),59
USGS Florida Water Science Center,32
FL Dept. of Environmental Protection,6
USGS Georgia Water Science Center,2


#### Using the per-site sample counts from the results file, the sample count for each site is added to the site metadata dataframe.

In [58]:
# Attach the per-site counts to the station metadata by matching the count
# table's index against the MonitoringLocationIdentifier column (inner join,
# the pandas default), then order the rows by index label.
wqp_well_site_list_count_df = (
    count_nu_well_df
    .merge(wqp_n_station_list_well, left_index=True, right_on='MonitoringLocationIdentifier')
    .sort_index()
)
display(wqp_well_site_list_count_df)

Unnamed: 0,count_nu,OrganizationIdentifier,OrganizationFormalName,MonitoringLocationIdentifier,MonitoringLocationName,...,WellDepthMeasure/MeasureValue,WellDepthMeasure/MeasureUnitCode,WellHoleDepthMeasure/MeasureValue,WellHoleDepthMeasure/MeasureUnitCode,ProviderName
0,24,USGS-FL,USGS Florida Water Science Center,USGS-300143082565903,ROBERTS-4 W-17269,...,42.6,ft,42.6,ft,NWIS
1,63,USGS-FL,USGS Florida Water Science Center,USGS-300200083090001,TRAWICK L3-1,...,,,,,NWIS
2,54,USGS-FL,USGS Florida Water Science Center,USGS-300200083090002,TRAWICK L3-2,...,,,,,NWIS
3,36,USGS-FL,USGS Florida Water Science Center,USGS-300200083090003,TRAWICK L3-3,...,,,,,NWIS
4,42,USGS-FL,USGS Florida Water Science Center,USGS-300200083090004,TRAWICK BARN WELL,...,63.0,ft,63.0,ft,NWIS
...,...,...,...,...,...,...,...,...,...,...,...
94,21,21FLSUW,Suwannee River Water Management District (Flor...,21FLSUW-11510003,JOHN FOLKS-0DOF-0CHRISTIE TWR,...,,,,,STORET
95,23,21FLSUW,Suwannee River Water Management District (Flor...,21FLSUW-11714002,HORACE HART OLD BENTON,...,,,,,STORET
96,27,21FLSUW,Suwannee River Water Management District (Flor...,21FLSUW-21036001,WILLIAM LASSITER,...,,,,,STORET
97,22,21FLSUW,Suwannee River Water Management District (Flor...,21FLSUW-21332004,JOHN FOLKS-0DOF-0ALAPAHA TOWER,...,,,,,STORET


## All Sites

#### The stream, spring, and well sites are all added to one table of metadata which includes the number of samples per site.

In [59]:
# Stack the spring, stream and well site tables, in that order. Each table
# keeps its own row labels, so labels repeat in the combined frame.
epa_all_sites_meta_df = pd.concat([
    wqp_spring_site_list_count_df,
    wqp_stream_site_list_count_df,
    wqp_well_site_list_count_df,
])
# Save the combined site metadata.
epa_all_sites_meta_df.to_csv(
    'C:/Users/robert.taylor/Documents/SuwData/EPA/EPA_All_sites_Meta.csv'
)

In [60]:
# Render the combined spring / stream / well site metadata table.
display(epa_all_sites_meta_df)

Unnamed: 0,count_nu,OrganizationIdentifier,OrganizationFormalName,MonitoringLocationIdentifier,MonitoringLocationName,...,WellDepthMeasure/MeasureValue,WellDepthMeasure/MeasureUnitCode,WellHoleDepthMeasure/MeasureValue,WellHoleDepthMeasure/MeasureUnitCode,ProviderName
0,174,USGS-FL,USGS Florida Water Science Center,USGS-02319302,"MADISON BLUE SPRING NR BLUE SPRINGS, FL",...,,,,,NWIS
1,94,USGS-FL,USGS Florida Water Science Center,USGS-02319520,"FALMOUTH SPRING AT FALMOUTH, FL",...,,,,,NWIS
2,147,USGS-FL,USGS Florida Water Science Center,USGS-02319950,"BLUE SPRINGS NEAR DELL,FL",...,,,,,NWIS
3,149,USGS-FL,USGS Florida Water Science Center,USGS-02320250,TROY SPRING NEAR BRANFORD FLA,...,,,,,NWIS
4,59,USGS-FL,USGS Florida Water Science Center,USGS-02322400,GINNIE SPRING NR HIGH SPRINGS FLA,...,,,,,NWIS
...,...,...,...,...,...,...,...,...,...,...,...
94,21,21FLSUW,Suwannee River Water Management District (Flor...,21FLSUW-11510003,JOHN FOLKS-0DOF-0CHRISTIE TWR,...,,,,,STORET
95,23,21FLSUW,Suwannee River Water Management District (Flor...,21FLSUW-11714002,HORACE HART OLD BENTON,...,,,,,STORET
96,27,21FLSUW,Suwannee River Water Management District (Flor...,21FLSUW-21036001,WILLIAM LASSITER,...,,,,,STORET
97,22,21FLSUW,Suwannee River Water Management District (Flor...,21FLSUW-21332004,JOHN FOLKS-0DOF-0ALAPAHA TOWER,...,,,,,STORET


#### All the results for all site types are combined into one file.

In [61]:
# Stack the stream, spring and well result tables, in that order.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0. As with
# append, each table keeps its original row labels.
epa_all_results = pd.concat([
    wqp_n_results_stream,
    wqp_n_results_spring,
    wqp_n_results_well,
])
# Save the combined results.
epa_all_results.to_csv('C:/Users/robert.taylor/Documents/SuwData/EPA/EPA_All_Results.csv')

#### Below, a breakdown of sites per organization is shown.

In [62]:
# Number of sites (all site types combined) per organization, as a one-column frame.
epa_all_sites_meta_df['OrganizationFormalName'].value_counts().to_frame()

Unnamed: 0,OrganizationFormalName
Suwannee River Water Management District (Florida),96
USGS Florida Water Science Center,59
USGS Georgia Water Science Center,57
Suwannee River Water Management District,55
FL Dept. of Environmental Protection,32
...,...
Alachua County Environmental Protection Department (Florida),9
"ENVIRONMENTAL SERVICES AND PERMITTING, INC.",8
"FL Dept. of Environmental Protection, Northeast District",7
FDEP TALLAHASSEE REGIONAL OPERATIONS CENTER,2
