### Reservoir and Reservoir Stations Datasets

In [1]:
import sys
sys.path.append('..')

In [2]:
from lib.reservoir import ReservoirDataset



In [3]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import requests

import datetime
import altair as alt

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

##### NOTE: The scraped data was cleaned, merged, and stored in:
- /work/milestone2_waterwells_deepnote/assets/outputs/weekly_reservoir_station_data.csv
- /work/milestone2_waterwells_deepnote/assets/outputs/reservoir_station_data.csv


In [4]:
# Build the project's reservoir dataset helper (imported from lib.reservoir).
# NOTE(review): the arguments look like (first year, last year, output directory) —
# confirm against ReservoirDataset.__init__.
reservoir_data = ReservoirDataset(2013, 2023, "../assets/outputs")

In [5]:
# Weekly reservoir readings via the library class; the returned frame is displayed below.
# NOTE(review): presumably the packaged version of the notebook-level
# scrape_weekly_reservoir_data() defined in SECTION 1 — confirm in lib/reservoir.
reservoir_data.scrape_weekly_reservoir_data()

Unnamed: 0,Reservoir Name,STATION_ID,Capacity(AF),Elevation(FT),Storage(AF),Storage Change,PCT_OF_CAPACITY,Average Storage,% of Average,Outflow(CFS),Inflow(CFS),Storage-Year Ago This Date,date,YEAR,MONTH
0,TRINITY LAKE,CLE,2447650,2333.38,1892349,23907,77,1948622,97,415,12468,2245651,2018-04-08,2018,4
1,WHISKEYTOWN,WHI,241100,1199.29,208016,1745,86,217023,96,714,1594,216744,2018-04-08,2018,4
2,LEWISTON,LEW,14660,1901.24,14091,-351,96,13745,103,752,575,13852,2018-04-08,2018,4
3,SONOMA(WARM SPRINGS),WRS,381000,443.48,225587,2536,59,233717,97,78,1362,249554,2018-04-08,2018,4
4,MENDOCINO (COYOTE),COY,122400,744.63,80519,4003,66,79957,101,32,2056,86702,2018-04-08,2018,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43,DONNER LAKE,DNN,9700,5928.29,3336,-16,34,4119,81,---,---,3134,2022-03-13,2022,3
44,CACHUMA LAKE,CCH,193305,710.87,90807,-56,47,143366,63,45,22,121510,2022-03-13,2022,3
45,PYRAMID,PYM,180000,2571.92,162170,1037,90,164362,99,---,---,154127,2022-03-13,2022,3
46,CASTAIC,CAS,325000,1444.70,187522,-530,58,267588,70,---,---,254932,2022-03-13,2022,3


In [6]:
# Station metadata (ID, elevation, latitude/longitude, county) via the library class;
# the returned frame is displayed below.
reservoir_data.get_reservoir_station_data()

Unnamed: 0,STATION,STATION_ID,ELEV(FEET),LATITUDE,LONGITUDE,COUNTY
2,DWINNELL RESERVOIR NEAR EDGEWOOD,DRE,2805,41.540894,-122.374550,SISKIYOU
3,TRINITY LAKE,CLE,2370,40.800999,-122.762001,TRINITY
4,LEWISTON,LEW,1870,40.727001,-122.792999,TRINITY
5,RUTH DAM,RTD,2675,40.367001,-123.432999,TRINITY
6,LAKE PILLSBURY NR POTTER VLY 24HR AVG,LPY,1828,39.408298,-122.958298,LAKE
...,...,...,...,...,...,...
203,TINEMAHA RESERVOIR,TNM,3882,37.057999,-118.224998,INYO
204,HAIWEE,HWE,3774,36.137001,-117.947998,INYO
205,SOUTH LAKE RESERVOIR,SKR,837,33.115040,-117.165588,INYO
206,LITTLE ROCK RESERVOIR,LRK,745,34.485001,-118.022003,LOS ANGELES


In [7]:

# Per-station, per-year PCT_OF_CAPACITY joined with station coordinates and county,
# as shown in the output below.
reservoir_data.retrieve_merge_reservoir_stations()

Unnamed: 0,STATION_ID,PCT_OF_CAPACITY,YEAR,LATITUDE,LONGITUDE,COUNTY
0,ANT,84.916667,2018,40.180000,-120.607002,PLUMAS
1,ANT,85.884615,2019,40.180000,-120.607002,PLUMAS
2,ANT,80.461538,2020,40.180000,-120.607002,PLUMAS
3,ANT,63.750000,2021,40.180000,-120.607002,PLUMAS
4,ANT,70.909091,2022,40.180000,-120.607002,PLUMAS
...,...,...,...,...,...,...
232,WRS,54.384615,2018,38.723000,-123.010002,SONOMA
233,WRS,61.769231,2019,38.723000,-123.010002,SONOMA
234,WRS,52.176471,2020,38.723000,-123.010002,SONOMA
235,WRS,35.384615,2021,38.723000,-123.010002,SONOMA


### SECTION 1 Scrape Reservoir data from daily reporting stations
- Web scraping with BeautifulSoup4
- The trick is to understand the HTML structure in order to retrieve the rows of data
- Note: it appears that there is no reservoir data prior to Apr 2018
- Data is retrieved from the daily reporting site
- We scrape the data at weekly level

##### NOTE: 
- Data is available only from April 2018. 
- The structure of the data for 2018 is different from the rest of the years
- The below function call takes about 10 mins to run


In [None]:
def _weekly_report_dates(year):
    """Return 'YYYYMMDD' strings for the weekly (freq='W') dates that fall inside `year`."""
    weekly_ticks = pd.date_range(f"{year}-01-01", periods=53, freq='W')
    # 53 periods can spill into January of the next year; keep only this year's dates.
    return [tick.strftime("%Y%m%d") for tick in weekly_ticks if tick.year == year]


def _parse_reservoir_report(html_content, report_date):
    """Parse one reservoir report page into a DataFrame, or return None if it holds no data rows."""
    soup = BeautifulSoup(html_content, "lxml")

    reservoir_table = soup.find("table", attrs={"id": "RES", "class": "data"})
    if reservoir_table is None or reservoir_table.thead is None:
        return None

    # The first <th> is not a column name for the data cells, so it is dropped.
    header = [th.text.strip() for th in reservoir_table.thead.find_all("th")][1:]

    rows = []
    for table_row in reservoir_table.find_all('tr', {'class': 'white'}):
        cells = [td.text.strip() for td in table_row.find_all("td")]
        # Rows with zero or one cell carry no reservoir reading.
        if len(cells) > 1:
            rows.append(cells)

    if not rows:
        # A table without data rows cannot be labelled with the header; skip it.
        return None

    data_table = pd.DataFrame(rows)
    data_table['date'] = pd.to_datetime(report_date, format="%Y%m%d")
    data_table.columns = header + ['date']
    return data_table


def scrape_weekly_reservoir_data(start_year=2013, end_year=2022, timeout=60):
    """
    Scrape the CDEC reservoir report at weekly intervals and combine all weeks into one DataFrame.

    For every year from `start_year` to `end_year` (inclusive) one report URL is requested per
    weekly date. Pages that contain no reservoir table (or no data rows) are skipped.

    Parameters
    ----------
    start_year, end_year : int
        First and last calendar year to scrape. The defaults reproduce the
        original hard-coded range 2013-2022.
    timeout : float
        Seconds to wait for each HTTP response before `requests` raises.

    Returns
    -------
    pandas.DataFrame
        One row per reservoir per weekly date, with '% of Capacity' renamed to
        'PCT_OF_CAPACITY', 'StaID' renamed to 'STATION_ID', rows whose
        'Reservoir Name' contains 'Total' removed, and 'date', 'YEAR' and 'MONTH'
        columns added. An empty DataFrame is returned when no page yielded data.
    """
    weekly_tables = []
    for year in range(start_year, end_year + 1):
        for one_date in _weekly_report_dates(year):
            url = f"https://cdec.water.ca.gov/reportapp/javareports?name=RES.{one_date}"

            # Without a timeout a single stalled request would hang the whole scrape.
            html_content = requests.get(url, timeout=timeout).text

            data_table = _parse_reservoir_report(html_content, one_date)
            if data_table is not None:
                weekly_tables.append(data_table)

    if not weekly_tables:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated frame on every iteration. The per-page row index
    # is kept (no ignore_index), exactly as the chained appends did.
    all_years_reservoir_data = pd.concat(weekly_tables)

    all_years_reservoir_data = all_years_reservoir_data.rename(
        columns={'% of Capacity': 'PCT_OF_CAPACITY', 'StaID': 'STATION_ID'}
    )
    # Drop the summary rows; na=False keeps rows whose name is missing instead of failing on them.
    is_total_row = all_years_reservoir_data['Reservoir Name'].str.contains('Total', na=False)
    all_years_reservoir_data = all_years_reservoir_data[~is_total_row].copy()

    # Add a year and month column
    all_years_reservoir_data['YEAR'] = all_years_reservoir_data['date'].dt.year
    all_years_reservoir_data['MONTH'] = all_years_reservoir_data['date'].dt.month
    return all_years_reservoir_data


In [None]:
# Slow cell (about 10 minutes per the note above). The result is only displayed here, not
# assigned to a variable; retrieve_merge_precipitation_stations() runs the scrape again itself.
scrape_weekly_reservoir_data()

Unnamed: 0,Reservoir Name,STATION_ID,Capacity(AF),Elevation(FT),Storage(AF),Storage Change,PCT_OF_CAPACITY,Average Storage,% of Average,Outflow(CFS),Inflow(CFS),Storage-Year Ago This Date,date,YEAR,MONTH
0,TRINITY LAKE,CLE,2447650,2333.38,1892349,23907,77,1948622,97,415,12468,2245651,2018-04-08,2018,4
1,WHISKEYTOWN,WHI,241100,1199.29,208016,1745,86,217023,96,714,1594,216744,2018-04-08,2018,4
2,LEWISTON,LEW,14660,1901.24,14091,-351,96,13745,103,752,575,13852,2018-04-08,2018,4
3,SONOMA(WARM SPRINGS),WRS,381000,443.48,225587,2536,59,233717,97,78,1362,249554,2018-04-08,2018,4
4,MENDOCINO (COYOTE),COY,122400,744.63,80519,4003,66,79957,101,32,2056,86702,2018-04-08,2018,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43,DONNER LAKE,DNN,9700,5928.29,3336,-16,34,4119,81,---,---,3134,2022-03-13,2022,3
44,CACHUMA LAKE,CCH,193305,710.87,90807,-56,47,143366,63,45,22,121510,2022-03-13,2022,3
45,PYRAMID,PYM,180000,2571.92,162170,1037,90,164362,99,---,---,154127,2022-03-13,2022,3
46,CASTAIC,CAS,325000,1444.70,187522,-530,58,267588,70,---,---,254932,2022-03-13,2022,3


### Get Daily reporting Reservoir Station locations using webscraping

In [None]:
def get_reservoir_station_data(timeout=60):
    """
    Retrieve the station location data for daily-reporting reservoirs through web scraping.

    Parameters
    ----------
    timeout : float
        Seconds to wait for the HTTP response before `requests` raises.

    Returns
    -------
    pandas.DataFrame
        One row per station. Column names come from the first scraped row, with
        'ID' renamed to 'STATION_ID' and 'OPERATOR AGENCY' removed.

    Raises
    ------
    ValueError
        If the page does not contain the expected station table.
    """
    url = "https://cdec.water.ca.gov/reportapp/javareports?name=DailyRes"

    # Make a GET request to fetch the raw HTML content
    html_content = requests.get(url, timeout=timeout).text

    # Parse the html content
    soup = BeautifulSoup(html_content, "lxml")

    station_html_table = soup.find("table", attrs={"id": "DailyRes_LIST", "class": "data"})
    if station_html_table is None:
        raise ValueError(f"No table with id 'DailyRes_LIST' and class 'data' found at {url}")

    all_rows_list = []
    for table_row in station_html_table.find_all("tr"):
        cells = [td.text.strip() for td in table_row.find_all("td")]
        # Rows with zero or one <td> carry no station record.
        if len(cells) > 1:
            all_rows_list.append(cells)

    station_table = pd.DataFrame(all_rows_list)
    # The first collected row holds the column names.
    station_table.columns = station_table.iloc[0, :]
    # Rows 0 and 1 are both dropped; row 0 is the header.
    # NOTE(review): row 1 is presumably a sub-header/separator row — confirm against the page.
    station_table = (
        station_table.iloc[2:, :]
        .rename(columns={'ID': 'STATION_ID'})
        .drop(columns=['OPERATOR AGENCY'])
    )
    return station_table


In [None]:

def save_precipitation_data(reservoir_station_df, granularity,
                            output_dir=r"/work/milestone2_waterwells_deepnote/assets/inputs/reservoir"):
    """
    Save weekly or yearly level reservoir data as a CSV file (without the index).

    Despite its name this function stores *reservoir* data; the name is kept
    so existing callers continue to work.

    Parameters
    ----------
    reservoir_station_df : pandas.DataFrame
        The data to write.
    granularity : str
        'weekly' writes 'weekly_reservoir_station_data.csv'; any other value
        writes 'reservoir_station_data.csv'.
    output_dir : str or path-like
        Directory to write into. Defaults to the original hard-coded location.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    if granularity == 'weekly':
        file_name = "weekly_reservoir_station_data.csv"
    else:
        file_name = "reservoir_station_data.csv"
    reservoir_station_df.to_csv(Path(output_dir) / file_name, index=False)

In [None]:
def retrieve_merge_precipitation_stations():
    """
    Build the yearly reservoir-capacity table with station locations.

    Steps:
      1. Scrape the weekly reservoir data and save it as the 'weekly' CSV.
      2. Scrape the station location table.
      3. Inner-join the two on STATION_ID so each reading carries its station's location.
      4. Average PCT_OF_CAPACITY per station, year and location.
      5. Save the result as the 'yearly' CSV and return it.
    """
    # Note: the scrape takes about 10 minutes to run.
    weekly_reservoir_df = scrape_weekly_reservoir_data()

    # Keep a copy of the weekly data on disk as a check.
    save_precipitation_data(weekly_reservoir_df, 'weekly')

    station_locations_df = get_reservoir_station_data()

    merged_df = weekly_reservoir_df.merge(station_locations_df, how='inner', on='STATION_ID')
    # Scraped values are text; anything non-numeric becomes NaN.
    merged_df['PCT_OF_CAPACITY'] = pd.to_numeric(merged_df['PCT_OF_CAPACITY'], errors='coerce')

    group_keys = ['STATION_ID', 'YEAR', 'LATITUDE', 'LONGITUDE', 'COUNTY']
    yearly_df = (
        merged_df
        .groupby(group_keys)
        .agg(PCT_OF_CAPACITY=('PCT_OF_CAPACITY', 'mean'))
        .reset_index()
    )

    output_columns = ['STATION_ID', 'PCT_OF_CAPACITY', 'YEAR', 'LATITUDE', 'LONGITUDE', 'COUNTY']
    yearly_df = yearly_df[output_columns].copy()

    save_precipitation_data(yearly_df, 'yearly')
    return yearly_df

  

In [None]:
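# Sanity check (left commented out): station IDs present in the weekly data but missing from the station table.
#list(set(all_years_reservoir_data.station_id) - (set(station_table.station_id)))
# Result: we have location info for all the stations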

In [None]:
# NOTE(review): on a fresh kernel `reservoir_station_df` is not defined at this point — it is
# only a local variable inside retrieve_merge_precipitation_stations() and is first assigned at
# notebook level by the pd.read_csv cell that follows. The stored output (lower-case columns,
# per-date rows) also does not match the yearly frame that function returns.
reservoir_station_df.sample(1)

Unnamed: 0,station_id,pct_of_capacity,date,year,month,LATITUDE,LONGITUDE,COUNTY
2661,ENG,98.0,2021-08-13,2021,8,39.238998,-121.266998,NEVADA


In [None]:
# Reload the yearly file; this is the same path save_precipitation_data() writes for any
# granularity other than 'weekly'.
# NOTE(review): absolute Deepnote path — the notebook only runs where /work/... exists.
reservoir_station_df = pd.read_csv(r"/work/milestone2_waterwells_deepnote/assets/inputs/reservoir/reservoir_station_data.csv")


In [None]:
# Spot-check one random row of the yearly station table.
reservoir_station_df.sample()

Unnamed: 0,STATION_ID,PCT_OF_CAPACITY,YEAR,LATITUDE,LONGITUDE,COUNTY
66,DNN,56.865385,2019,39.322777,-120.264397,NEVADA


## Drought data retrieval through download

[Source for monthly drought area and severity since 1895](https://www.drought.gov/states/california)


### [Drought years](https://water.ca.gov/water-basics/drought) :

2019-2021

2018

2012-2016 

2007-09

1987-92

1976-77


A [drought](https://droughtmonitor.unl.edu/About/AbouttheData/DroughtClassification.aspx) is a period of unusually dry weather that persists long enough to cause serious problems such as crop damage and/or water supply shortages. The severity of the drought depends upon the degree of moisture deficiency, the duration, and the size of the affected area.

D2 Severe Drought

- Crop or pasture losses likely
- Water shortages common
- Water restrictions imposed

D3 Extreme Drought

- Major crop/pasture losses
- Widespread water shortages or restrictions

D4 Exceptional Drought

- Exceptional and widespread crop/pasture losses
- Shortages of water in reservoirs, streams, and wells creating water emergencies


**Long-term effects**

Excessive groundwater pumping and aquifer depletion lead to land sinking and permanent loss of groundwater storage. As groundwater levels fall, underground water storage areas are exposed; the soil loses structural strength and may sink if the land above is heavy enough.


In [None]:
# NOTE(review): `os` is not used in any cell visible in this notebook.
import os 
# Load the downloaded drought index. The file name says "weekly", but the DATE values shown
# in the next cells fall on the first of each month.
drought_df = pd.read_csv(r"/work/milestone2_waterwells_deepnote/assets/outputs/california_weekly_drought_index.csv")

In [None]:
# Inspect the raw DATE values: they are strings of the form 'd_YYYYMMDD'.
drought_df.DATE.unique()

array(['d_18950101', 'd_18950201', 'd_18950301', ..., 'd_20211101',
       'd_20211201', 'd_20220101'], dtype=object)

In [None]:
# Convert 'd_YYYYMMDD' strings to datetimes. The dtype guard makes the cell safe to re-run:
# once DATE is already datetime, calling .str on it would raise.
if not pd.api.types.is_datetime64_any_dtype(drought_df["DATE"]):
    drought_df["DATE"] = pd.to_datetime(
        # regex=False: "d_" is a literal prefix, and the default of `regex` differs across pandas versions.
        drought_df["DATE"].str.replace("d_", "", regex=False),
        format="%Y%m%d",
    )

In [None]:
# Display the frame after the DATE conversion.
drought_df

Unnamed: 0,0,DATE,D0,D1,D2,D3,D4,-9,W0,W1,W2,W3,W4
0,0.0,1895-01-01,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1895-02-01,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1895-03-01,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1895-04-01,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1895-05-01,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1520,9.3,2021-09-01,90.6,76.6,46.2,26.7,2.4,0.0,0.0,0.0,0.0,0.0,0.0
1521,27.4,2021-10-01,72.5,55.2,27.3,9.3,1.3,0.0,0.0,0.0,0.0,0.0,0.0
1522,29.9,2021-11-01,69.9,54.6,23.1,7.4,1.3,0.0,0.1,0.0,0.0,0.0,0.0
1523,54.1,2021-12-01,11.2,3.2,0.1,0.0,0.0,0.0,34.7,16.5,2.2,0.1,0.0


In [None]:
# Keep only rows from 2001 onwards (year > 2000) and only the DATE and D0-D4 columns.
# This overwrites drought_df, so the other columns are no longer available after this cell.
drought_df = drought_df[drought_df.DATE.dt.year > 2000][['DATE', 'D0', 'D1', 'D2', 'D3', 'D4']].copy()

In [None]:
# Spot-check one random row of the filtered frame.
drought_df.sample(1)

Unnamed: 0,DATE,D0,D1,D2,D3,D4
1511,2020-12-01,84.0,72.7,40.9,24.1,8.3


In [None]:

# Area chart of the D1-D4 drought levels over time.
# The D columns are nested (in the data shown above D0 >= D1 >= D2 >= D3 >= D4), so the areas
# are overlaid (stack=None) instead of stacked; stacking would add the same area several times
# and push the total past 100.
alt.Chart(
    drought_df.melt(
        id_vars='DATE',
        value_vars=['D1', 'D2', 'D3', 'D4'],
        var_name='DROUGHT_LEVEL',
        value_name='DROUGHT_AREA',
    )
).mark_area(
    # No fixed mark colour: the colour encoding below decides the fill per drought level.
    interpolate='step-after',
    line=True
).encode(
    x=alt.X('DATE:T', title='Date'),
    y=alt.Y('DROUGHT_AREA:Q', stack=None, title='Drought area (%)'),
    color=alt.Color('DROUGHT_LEVEL:N', title='Drought level'),
    tooltip=['DATE', 'DROUGHT_LEVEL', 'DROUGHT_AREA']
).properties(
    width=1000,
    title='California drought area by drought level'
)

In [None]:
# Average of each drought level over 2001. numeric_only=True keeps the datetime DATE column
# out of the result regardless of the installed pandas version.
drought_df[drought_df.DATE.dt.year == 2001].mean(numeric_only=True)

D0    36.950000
D1    26.616667
D2    14.050000
D3     8.233333
D4     3.108333
dtype: float64

In [None]:
# Drought years from 2007 onwards, expanded from the ranges listed in the "Drought years"
# markdown section above (2019-2021, 2018, 2012-2016, 2007-09); the list is not sorted.
drought_years = [2019, 2020, 2021, 2018, 2012, 2013, 2014,2015, 2016, 2007, 2008, 2009 ]

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b042e2da-6536-449d-95b8-d85fa08825de' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>