# Download Chesapeake Bay Program Water Quality Dataset

### Summary 
This notebook uses Python to query the Chesapeake Bay Program (CBP) API and download temperature data.

### Outputs
* A CSV file with the raw CBP data

### Notes
* URL for temperature, salinity, and dissolved oxygen: https://datahub.chesapeakebay.net/api.Tab/WaterQuality/WaterQuality/12-8-2018/12-8-2023/0/2,4,6/12,13,15,35,36,2,3,7,33,34,23,24/HUC8/2,4,6,7,8,9,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,60/31,123,83

In [1]:
import os
import requests
import tempfile
from pathlib import Path
from datetime import datetime

import pandas as pd

In [2]:
# Absolute path to the local checkout of the repository; the output CSV is
# written beneath it.
# NOTE(review): this path is machine-specific -- update it before running the
# notebook on another computer.
REPO_ROOT = Path('/Users/rwegener/repos/chesapeake_mhw/')

# First and last dates of the download. They are used as the outer bounds of
# the API requests and as the date stamps in the output filename.
start_date = datetime(2003, 1, 1)
end_date = datetime(2023, 12, 31)

In [3]:
def format_request_temponly(start, end):
    '''
    Build the Chesapeake Bay Program water quality request URL used for the
    temperature-only download.

    Parameters
    ----------
    start, end
        First and last dates of the request, already formatted as
        `month-day-year`, i.e. '%m-%d-%Y' (see https://strftime.org/).
        The values are interpolated into the URL as-is.

    Returns
    -------
    str
        URL on the `api.CSV` endpoint. Every path segment other than the two
        dates is fixed; the final segment is `123`.
    '''
    url_segments = (
        'https://datahub.chesapeakebay.net/api.CSV/WaterQuality/WaterQuality',
        f'{start}',
        f'{end}',
        '0',
        '2,4,6',
        '12,13,15,35,36,2,3,7,33,34,23,24',
        'HUC8',
        # Long id list following the HUC8 segment, split only for line length
        '2,4,6,7,8,9,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,'
        '32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,60',
        '123',
    )
    return '/'.join(url_segments)

def format_request_tempDOsal(start, end):
    '''
    Build the Chesapeake Bay Program water quality request URL used for the
    combined download (temperature, dissolved oxygen and salinity, going by
    the function name).

    Parameters
    ----------
    start, end
        First and last dates of the request, already formatted as
        `month-day-year`, i.e. '%m-%d-%Y' (see https://strftime.org/).
        The values are interpolated into the URL as-is.

    Returns
    -------
    str
        URL on the `api.CSV` endpoint. Every path segment other than the two
        dates is fixed; the final segment is `31,123,83`.
    '''
    leading_path = 'https://datahub.chesapeakebay.net/api.CSV/WaterQuality/WaterQuality/'
    dated_path = f'{start}/{end}/0/2,4,6/12,13,15,35,36,2,3,7,33,34,23,24/HUC8/'
    # Long id list following the HUC8 segment, then the final segment
    trailing_path = (
        '2,4,6,7,8,9,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31'
        ',32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,60'
        '/31,123,83'
    )
    return leading_path + dated_path + trailing_path

# Select which URL builder to use for this download.
format_request = format_request_temponly

# The API seems to get overwhelmed when requesting the full 20 years of data at once.
# The request is instead split into one request per ~6-7 years and the data frames
# are merged afterwards.

# Create temporary filepaths, one per request. The directory lives until
# `scratch_dir.cleanup()` is called.
scratch_dir = tempfile.TemporaryDirectory()
set1_path = os.path.join(scratch_dir.name, 'set1.csv')
set2_path = os.path.join(scratch_dir.name, 'set2.csv')
set3_path = os.path.join(scratch_dir.name, 'set3.csv')

# (destination file, first date, last date) for each request. Dates use the
# '%m-%d-%Y' format expected by the URL builder. Only the outermost bounds
# follow `start_date` / `end_date`; the interior boundaries are fixed.
date_chunks = [
    (set1_path, start_date.strftime('%m-%d-%Y'), '12-31-2008'),  # ~2003-2008
    (set2_path, '01-01-2009', '12-31-2015'),                     # ~2009-2015
    (set3_path, '01-01-2016', end_date.strftime('%m-%d-%Y')),    # ~2016-2023
]

for chunk_path, chunk_start, chunk_end in date_chunks:
    response = requests.get(format_request(chunk_start, chunk_end))
    response.raise_for_status()  # ensure we notice bad responses
    # Write with an explicit encoding and no newline translation so the bytes
    # on disk do not depend on the platform's locale settings.
    with open(chunk_path, 'w', encoding='utf-8', newline='') as f:
        f.write(response.text)

In [4]:
# Load each downloaded chunk into its own data frame
set1_df, set2_df, set3_df = (
    pd.read_csv(csv_path) for csv_path in (set1_path, set2_path, set3_path)
)

# Stack the three chunks into a single data frame
full_df = pd.concat([set1_df, set2_df, set3_df])

# Parse the sample dates, then order the rows chronologically with a fresh index
full_df['SampleDate'] = pd.to_datetime(full_df['SampleDate'])
full_df = full_df.sort_values('SampleDate')
full_df = full_df.reset_index(drop=True)

# Keep only the rows that have a measured value
has_measurement = full_df['MeasureValue'].notna()
full_df = full_df[has_measurement]

  set1_df = pd.read_csv(set1_path)
  set2_df = pd.read_csv(set2_path)
  set3_df = pd.read_csv(set3_path)


In [6]:
# Write the combined data frame to the raw data folder of the repository.
# The filename carries the first and last dates of the download as YYYYMMDD.
start_stamp = start_date.strftime('%Y%m%d')
end_stamp = end_date.strftime('%Y%m%d')
filename = f'WaterQuality_ChesapeakeBayProgram_{start_stamp}_{end_stamp}_Temp.csv'
output_path = os.path.join(REPO_ROOT, 'data/01_raw', filename)

# The row index is not written to the file
full_df.to_csv(output_path, index=False)

In [7]:
# Delete the temporary directory that holds the intermediate per-request CSV
# files (set1.csv, set2.csv, set3.csv). Run this only after the combined data
# has been written to its final location.
scratch_dir.cleanup()