In [51]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import requests

import datetime
import altair as alt

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## NOTE: The scraped data was cleaned, merged, and stored in
- "assets/clean_data/weekly_reservoir_station_data.csv"

### SECTION 1 Scrape Reservoir data from daily reporting stations
- Web scraping with BeautifulSoup4
- The trick is to understand the HTML structure in order to retrieve the rows of data
- Note: it appears that there is no reservoir data prior to Apr 2018
- Data is retrieved from the daily reporting site
- We scrape the data at weekly level

## The structure of the data for 2018 is different from the rest of the years


In [69]:
def _weekly_date_strings(year_start_date, n_weeks=53):
    """Return `n_weeks` dates, 7 days apart, as YYYYMMDD strings starting at `year_start_date` (YYYY-MM-DD)."""
    start_date = datetime.datetime.strptime(year_start_date, "%Y-%m-%d")
    return [
        (start_date + datetime.timedelta(weeks=week_offset)).strftime("%Y%m%d")
        for week_offset in range(n_weeks)
    ]


def _parse_reservoir_table(html_content, one_date):
    """Parse one report page into a DataFrame; return None when it has no RES table or no usable rows."""
    soup = BeautifulSoup(html_content, "lxml")

    reservoir_table = soup.find("table", attrs={"id": "RES", "class": "data"})
    if reservoir_table is None:
        return None

    # The first header cell is dropped before the names are used as columns
    # (presumably it is not a data column -- TODO confirm against the page).
    header_cells = [th.text.strip() for th in reservoir_table.thead.find_all("th")]
    reservoir_table_header = header_cells[1:]

    # Only rows with class "white" are read; rows with fewer than two cells are ignored.
    all_rows_list = []
    for table_row in reservoir_table.find_all("tr", {"class": "white"}):
        this_row = [td.text.strip() for td in table_row.find_all("td")]
        if len(this_row) > 1:
            all_rows_list.append(this_row)

    # Without rows there is nothing to label; assigning the header to an empty
    # frame would raise a length-mismatch error, so skip this date instead.
    if not all_rows_list:
        return None

    data_table = pd.DataFrame(all_rows_list)
    data_table["date"] = pd.to_datetime(one_date, format="%Y%m%d")
    data_table.columns = reservoir_table_header + ["date"]
    return data_table


def scrape_weekly_reservoir_data():
    """
    Scrape CDEC reservoir reports at weekly intervals and combine them.

    For every year from 2013 through 2022, 53 dates (January 1st plus 52
    weekly steps) are requested from
    https://cdec.water.ca.gov/reportapp/javareports?name=RES.<YYYYMMDD>.
    Dates whose page has no table with id "RES" and class "data", or whose
    table yields no usable rows, are skipped.

    Returns:
        pandas.DataFrame: all weekly tables concatenated in request order.
        Columns are the table's header cells (minus the first one) plus a
        `date` column. Each weekly table keeps its own 0..n-1 index, so index
        values repeat. An empty DataFrame is returned if nothing was scraped.

    Raises:
        requests.RequestException: if a request fails or exceeds the timeout.
    """
    year_start_dates = [f"{year}-01-01" for year in range(2013, 2023)]

    # Collect the weekly frames in a list and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0, and growing a frame inside
    # a loop copies all previous rows on every iteration.
    weekly_frames = []
    for year_start_date in year_start_dates:
        for one_date in _weekly_date_strings(year_start_date):
            url = f"https://cdec.water.ca.gov/reportapp/javareports?name=RES.{one_date}"

            # A timeout keeps one unresponsive request from hanging the whole scrape.
            html_content = requests.get(url, timeout=60).text

            data_table = _parse_reservoir_table(html_content, one_date)
            if data_table is not None:
                weekly_frames.append(data_table)

    if not weekly_frames:
        return pd.DataFrame()
    return pd.concat(weekly_frames)


In [70]:
# Scrape all weekly reports and normalise the column names.
all_years_reservoir_data = scrape_weekly_reservoir_data()
all_years_reservoir_data.columns = all_years_reservoir_data.columns.str.strip()

# The 2018 tables include "Total" rows; drop them, derive year/month from
# the report date, and rename the station id column.
all_years_reservoir_data = (
    all_years_reservoir_data
    .loc[lambda frame: ~frame['Reservoir Name'].str.contains('Total')]
    .assign(
        year=lambda frame: frame['date'].dt.year,
        month=lambda frame: frame['date'].dt.month,
    )
    .rename(columns={'StaID': 'station_id'})
)

### Get Daily reporting Reservoir Station locations using webscraping

In [78]:
def get_reservoir_station_data():
    """
    Scrape the list of daily-reporting reservoir stations from CDEC.

    Reads the table with id "DailyRes_LIST" and class "data" from
    https://cdec.water.ca.gov/reportapp/javareports?name=DailyRes.

    Returns:
        pandas.DataFrame: the station rows, with column names taken from the
        first multi-cell row of the table, `ID` renamed to `station_id`, and
        the `OPERATOR AGENCY` column removed.

    Raises:
        ValueError: if the expected table is not present in the page.
        requests.RequestException: if the request fails or exceeds the timeout.
    """
    url = "https://cdec.water.ca.gov/reportapp/javareports?name=DailyRes"

    # A timeout keeps an unresponsive server from hanging the notebook.
    html_content = requests.get(url, timeout=60).text
    soup = BeautifulSoup(html_content, "lxml")

    station_table = soup.find("table", attrs={"id": "DailyRes_LIST", "class": "data"})
    if station_table is None:
        # Fail with a clear message instead of an AttributeError on None below.
        raise ValueError(f"Table 'DailyRes_LIST' not found at {url}")

    # Keep only rows that have at least two <td> cells.
    all_rows_list = []
    for table_row in station_table.find_all("tr"):
        this_row = [td.text.strip() for td in table_row.find_all("td")]
        if len(this_row) > 1:
            all_rows_list.append(this_row)

    stations = pd.DataFrame(all_rows_list)
    # Row 0 supplies the column names. Data is taken from row 2 onward
    # (presumably row 1 is also a non-data row -- TODO confirm against the page).
    stations.columns = stations.iloc[0, :]
    stations = stations.iloc[2:, :]

    return (
        stations
        .rename(columns={'ID': 'station_id'})
        .drop(columns=['OPERATOR AGENCY'])
    )


In [79]:
# Fetch the station list; it is joined to the reservoir data on `station_id` further down.
station_table = get_reservoir_station_data()

In [73]:
#list(set(all_years_reservoir_data.station_id) - (set(station_table.station_id)))
# We have info for all the stations

[]

In [87]:
# Join reservoir readings to station details, keep the columns of interest,
# and convert the capacity percentage to numbers (unparseable values become NaN).
reservoir_station_df = (
    all_years_reservoir_data
    .merge(station_table, on='station_id', how='inner')
    .loc[:, ['station_id', '% of Capacity', 'date', 'year', 'month', 'LATITUDE', 'LONGITUDE', 'COUNTY']]
    .rename(columns={'% of Capacity': 'pct_of_capacity'})
    .assign(
        pct_of_capacity=lambda frame: pd.to_numeric(frame['pct_of_capacity'], errors='coerce')
    )
)

In [88]:
# Spot-check one random row of the merged frame.
reservoir_station_df.sample(1)

Unnamed: 0,station_id,pct_of_capacity,date,year,month,LATITUDE,LONGITUDE,COUNTY
2661,ENG,98.0,2021-08-13,2021,8,39.238998,-121.266998,NEVADA


In [89]:
# Write the merged reservoir/station frame to CSV, omitting the row index.
reservoir_station_df.to_csv(r"assets/clean_data/weekly_reservoir_station_data.csv", index=False)

In [None]:
# Reload the saved CSV into its own variable.
# NOTE(review): no parse_dates is passed, so `date` will presumably load as
# strings rather than datetimes -- confirm before using .dt accessors on it.
weekly_reservoir_station_data = pd.read_csv(r"assets/clean_data/weekly_reservoir_station_data.csv")


In [58]:
# Show one random row of the merged frame.
# NOTE(review): the saved output under this cell lists columns (e.g.
# `Reservoir Name`, `_merge`) that the column selection applied when
# `reservoir_station_df` is built does not keep, so it looks like output from
# an earlier execution -- re-run this cell to refresh it.
reservoir_station_df.sample()

Unnamed: 0,Reservoir Name,station_id,Capacity(AF),Storage(AF),% of Capacity,date,year,month,STATION,ELEV(FEET),LATITUDE,LONGITUDE,COUNTY,_merge
4010,BLACK BUTTE,BLB,143700,49377,34,2019-09-24,2019,9,BLACK BUTTE,426,39.807999,-122.329002,TEHAMA,both


## Drought data retrieval through download

[Source for monthly drought area and severity since 1895](https://www.drought.gov/states/california)


### [Drought years](https://water.ca.gov/water-basics/drought) :

2019-2021

2018

2012-2016 

2007-09

1987-92

1976-77


A [drought](https://droughtmonitor.unl.edu/About/AbouttheData/DroughtClassification.aspx) is a period of unusually dry weather that persists long enough to cause serious problems such as crop damage and/or water supply shortages. The severity of the drought depends upon the degree of moisture deficiency, the duration, and the size of the affected area.

D2 Severe Drought

- Crop or pasture losses likely
- Water shortages common
- Water restrictions imposed

D3 Extreme Drought

- Major crop/pasture losses
- Widespread water shortages or restrictions

D4 Exceptional Drought

- Exceptional and widespread crop/pasture losses
- Shortages of water in reservoirs, streams, and wells creating water emergencies


Long-term effects:
Excessive groundwater pumping and aquifer depletion lead to land sinking and a permanent loss of groundwater storage. As groundwater levels fall, underground water storage areas are left exposed; the soil structure above them loses strength and may sink if the land above is heavy enough.


In [44]:
# NOTE(review): `os` is not referenced in the cells visible here; consider
# moving this import to the import cell at the top of the notebook.
import os 
# Load the drought index CSV.
# NOTE(review): the file name says "weekly", but the DATE values shown in the
# next cell's output are month starts -- confirm the data's granularity.
drought_df = pd.read_csv(r"assets/california_weekly_drought_index.csv")

In [45]:
# List the distinct raw DATE values to see their format before parsing.
drought_df.DATE.unique()

array(['d_18950101', 'd_18950201', 'd_18950301', ..., 'd_20211101',
       'd_20211201', 'd_20220101'], dtype=object)

In [46]:
# Strip the literal "d_" prefix and parse the remaining YYYYMMDD digits.
# The dtype guard makes the cell safe to re-run: once DATE holds datetimes the
# `.str` accessor would fail. `regex=False` and an explicit `format` avoid
# depending on version-specific defaults and on format inference.
if not pd.api.types.is_datetime64_any_dtype(drought_df["DATE"]):
    drought_df["DATE"] = pd.to_datetime(
        drought_df["DATE"].str.replace("d_", "", regex=False),
        format="%Y%m%d",
    )

In [47]:
# Inspect the frame after DATE has been converted to datetimes.
drought_df

Unnamed: 0,0,DATE,D0,D1,D2,D3,D4,-9,W0,W1,W2,W3,W4
0,0.0,1895-01-01,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1895-02-01,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1895-03-01,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1895-04-01,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1895-05-01,0.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1520,9.3,2021-09-01,90.6,76.6,46.2,26.7,2.4,0.0,0.0,0.0,0.0,0.0,0.0
1521,27.4,2021-10-01,72.5,55.2,27.3,9.3,1.3,0.0,0.0,0.0,0.0,0.0,0.0
1522,29.9,2021-11-01,69.9,54.6,23.1,7.4,1.3,0.0,0.1,0.0,0.0,0.0,0.0
1523,54.1,2021-12-01,11.2,3.2,0.1,0.0,0.0,0.0,34.7,16.5,2.2,0.1,0.0


In [48]:
# Keep rows dated after the year 2000, and only the date plus the D0-D4 columns.
drought_df = drought_df.loc[
    drought_df["DATE"].dt.year > 2000,
    ["DATE", "D0", "D1", "D2", "D3", "D4"],
].copy()

In [49]:
# Spot-check one random row of the filtered frame.
drought_df.sample(1)

Unnamed: 0,DATE,D0,D1,D2,D3,D4
1352,2007-09-01,97.0,90.8,40.0,14.9,5.7


In [50]:

# Drought area over time, one filled step area per severity level.
# The D-levels look cumulative (D1 >= D2 >= D3 >= D4 in the rows displayed
# above), so the areas are overlaid with `stack=None`; Altair's default
# stacking would add the levels together and overstate the affected area.
# The mark-level colour was dropped because the colour encoding overrides it.
alt.Chart(
    drought_df.melt(
        id_vars='DATE',
        value_vars=['D1', 'D2', 'D3', 'D4'],
        var_name='DROUGHT_LEVEL',
        value_name='DROUGHT_AREA',
    )
).mark_area(
    interpolate='step-after',
    line=True,
).encode(
    x=alt.X('DATE:T', title='Date'),
    y=alt.Y('DROUGHT_AREA:Q', stack=None, title='Drought area (%)'),
    color='DROUGHT_LEVEL',
    tooltip=['DATE', 'DROUGHT_LEVEL', 'DROUGHT_AREA'],
).properties(
    width=1000,
    title='Drought area by severity level',
)

In [70]:
# Average area in each drought level across the 2001 rows.
# `numeric_only=True` keeps the datetime DATE column out of the result, so the
# output stays D0-D4 regardless of the pandas version's default.
drought_df[drought_df.DATE.dt.year == 2001].mean(numeric_only=True)

D0    36.950000
D1    26.616667
D2    14.050000
D3     8.233333
D4     3.108333
dtype: float64

In [71]:
# Years treated as drought years: 2007-2009, 2012-2016, 2018 and 2019-2021.
# These match the later periods in the "Drought years" list in the markdown
# above; the earlier periods listed there (1976-77, 1987-92) are not included.
drought_years = [2019, 2020, 2021, 2018, 2012, 2013, 2014,2015, 2016, 2007, 2008, 2009 ]