# Scrape current CA major reservoir metrics

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import datetime as dt
import matplotlib.pyplot as plt
import altair as alt
import requests
import lxml.html as html
from bs4 import BeautifulSoup

In [3]:
# Raise pandas' display limits so wide/long tables are shown in full.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

In [4]:
# Current local date as a YYYY-MM-DD string.
today = dt.date.today().isoformat()

---

In [5]:
# Data source (CDEC): http://cdec.water.ca.gov/resapp/RescondMain

In [6]:
# Reservoirs to keep. `abbr` is the CDEC station ID: it is appended to the
# `resid=` detail URLs and matched against the report's `staid` column.
reservoirs = pd.DataFrame(
    [
        {"name": "San Luis", "abbr": "SNL"},
        {"name": "Melones", "abbr": "NML"},
        {"name": "Trinity", "abbr": "CLE"},
        {"name": "Shasta", "abbr": "SHA"},
        {"name": "Oroville", "abbr": "ORO"},
        {"name": "Folsom", "abbr": "FOL"},  # fixed typo: was "Folson"
        {"name": "Don Pedro", "abbr": "DNP"},
        {"name": "McClure", "abbr": "EXC"},
        {"name": "Pine Flat", "abbr": "PNF"},
        {"name": "Castaic", "abbr": "CAS"},
        {"name": "Perris", "abbr": "PRR"},
        {"name": "Millerton", "abbr": "MIL"},
    ]
)

In [7]:
# One CDEC reservoir-detail URL per station ID listed in `reservoirs`.
urls = [
    "http://cdec.water.ca.gov/resapp/ResDetail?resid=" + station_id
    for station_id in reservoirs.abbr
]

In [8]:
# HTTP request headers for the CDEC fetch. Only a browser-like User-Agent is
# kept: the Access-Control-* entries previously listed here are CORS
# *response* headers (set by servers) and have no effect in a client request.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
}

In [9]:
# Fetch the CDEC reservoir report page and parse its HTML.
url = "https://cdec.water.ca.gov/reportapp/javareports?name=RES"

# `headers` must be passed by keyword: the second positional argument of
# requests.get() is `params`, so passing the dict positionally would append
# it to the query string instead of sending it as HTTP headers.
# The timeout keeps the notebook from hanging forever on an unresponsive host.
req = requests.get(url, headers=headers, timeout=30)
soup = BeautifulSoup(req.content, "html.parser")

In [10]:
# tables = soup.find_all("table")

In [11]:
# Read the HTML tables on the CDEC reservoir report page into DataFrames.
tables = pd.read_html(io="https://cdec.water.ca.gov/reportapp/javareports?name=RES")

In [12]:
# Work from the first table that read_html returned.
src = pd.DataFrame(data=tables[0])

In [13]:
# Collapse each column header into a single space-joined, trimmed string.
# NOTE(review): assumes the header is multi-level (tuples of strings) - confirm.
header_parts = src.columns.values
src.columns = [" ".join(parts).strip() for parts in header_parts]

In [14]:
# Normalize column names: strip the "Water Storage " prefix, lower-case them,
# then apply the literal substitutions below in order (order matters, e.g.
# spaces become underscores before "%_of_" can match).
normalized = src.columns.str.replace("Water Storage ", "", regex=False).str.lower()
for old, new in (
    (" ", "_"),
    ("af", "acrefeet"),
    ("(", "_"),
    (")", ""),
    ("%_of_", "pct_"),
):
    normalized = normalized.str.replace(old, new, regex=False)
src.columns = normalized

### Clean up the table

In [15]:
# Strip the literal "Water Storage " prefix from the column names.
# NOTE(review): the preceding cell already removes this prefix and lower-cases
# the names, so this probably matches nothing - confirm before removing.
src.columns = src.columns.str.replace(pat="Water Storage ", repl="", regex=False)

In [23]:
# Keep only rows whose station ID is one of the tracked reservoirs, excluding
# any row whose `staid` text contains " RIVER" or " CREEK".
station_ids = src["staid"]
not_river = ~station_ids.str.contains(" RIVER")
not_creek = ~station_ids.str.contains(" CREEK")
is_tracked = station_ids.isin(reservoirs.abbr.to_list())
# Copy so later column assignments do not act on a view of `src`.
df = src[(not_river & not_creek) & is_tracked].copy()

In [24]:
# Display the column names of the filtered table.
df.columns

Index(['reservoir_name', 'staid', 'capacity_acrefeet', 'elevation_ft',
       'storage_acrefeet', 'storage_change', 'pct_capacity', 'average_storage',
       'pct_average', 'outflow_cfs', 'inflow_cfs',
       'storage-year_ago_this_date'],
      dtype='object')

In [25]:
# Cast the storage / capacity / percentage columns to integers.
int_columns = [
    "capacity_acrefeet",
    "storage_acrefeet",
    "pct_capacity",
    "average_storage",
    "pct_average",
    "storage-year_ago_this_date",
]
df[int_columns] = df[int_columns].astype(int)

In [26]:
# Difference between current storage and storage on this date a year ago.
current_storage = df["storage_acrefeet"]
year_ago_storage = df["storage-year_ago_this_date"]
df["diff"] = current_storage - year_ago_storage

In [29]:
# Remove the flow-rate columns from the table in place.
df.drop(columns=["outflow_cfs", "inflow_cfs"], inplace=True)

In [30]:
# Write the table as a date-stamped JSON file (one record per row).
output_path = f"data/processed/capacity/capacity_{today}.json"
df.to_json(output_path, orient="records", indent=2)