# Precipitation data gathering and creation of precipitation dataset

In [None]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import requests

import datetime

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


import json
import geopandas as gpd
import pygeos

import altair as alt

##### NOTE: The output of this notebook, after the scraped data was cleaned and merged, is stored in 
 **"/work/milestone2_waterwells_deepnote/assets/inputs/precipitation/precipitation_stations.csv"**

### SECTION 1 Scrape Precipitation data from daily reporting stations
- Web scraping with BeautifulSoup4 
- The trick is to understand the HTML structure in order to retrieve the rows of data
- Data is retrieved from the daily reporting site
- We scrape the data at monthly level (one report page per year)

In [None]:
def scrape_monthly_precipitation_data(year_start, year_end, timeout=60):
    """
    Scrape the CDEC monthly precipitation reports for a range of years.

    One ``PRECIPMON.<year>`` report page is requested for every year in
    ``[year_start, year_end]`` (both ends inclusive). The ``<table id="data"
    class="data">`` element of each page is parsed into a dataframe, the
    twelve monthly columns (OCT..SEP) are converted to numbers, and the
    per-year dataframes are stacked into a single dataframe.

    Columns added to the scraped table:
      - ``AVERAGE_YEARLY_PRECIPITATION``: row-wise mean of the twelve monthly
        columns (missing months are ignored by the mean).
      - ``YEAR``: the year used in the report URL.
    The scraped ``STATION ID`` column is renamed to ``STATION_ID``.

    Parameters
    ----------
    year_start : int
        First year to request.
    year_end : int
        Last year to request (inclusive).
    timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` raises.
        Without a timeout a stalled connection would block forever.

    Returns
    -------
    pandas.DataFrame
        All scraped rows for the years that had a data table. An empty
        dataframe is returned when no year yielded any data.
    """
    month_columns = ['OCT', 'NOV', 'DEC', 'JAN', 'FEB', 'MAR',
                     'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP']

    yearly_frames = []
    for curr_year in range(year_start, year_end + 1):

        url = f"https://cdec.water.ca.gov/reportapp/javareports?name=PRECIPMON.{curr_year}"

        # Make a GET request to fetch the raw HTML content
        html_content = requests.get(url, timeout=timeout).text

        # Parse the html content
        soup = BeautifulSoup(html_content, "lxml")

        precipitation_table = soup.find("table", attrs={"id": "data", "class": "data"})
        if precipitation_table is None:
            # No report table on the page for this year: skip it.
            continue

        # The first header cell is dropped so the remaining headers match the
        # <td> cells collected below (presumably the first column is not
        # rendered as a <td> -- confirm against the page markup).
        header = [th.text for th in precipitation_table.thead.find_all("th")][1:]

        rows = []
        for table_row in precipitation_table.find_all('tr'):
            cells = [td.text.strip() for td in table_row.find_all("td")]
            # Header rows have no <td>; rows with a single cell are not data.
            if len(cells) > 1:
                rows.append(cells)

        if not rows:
            # A table without data rows cannot be labelled with the header;
            # treat it like a missing table instead of raising.
            continue

        data_table = pd.DataFrame(rows)
        data_table.columns = header
        for col in month_columns:
            # Non-numeric placeholders in the report become NaN.
            data_table[col] = pd.to_numeric(data_table[col], errors='coerce')
        data_table['AVERAGE_YEARLY_PRECIPITATION'] = data_table[month_columns].mean(axis='columns')
        data_table['YEAR'] = curr_year
        data_table.rename(columns={'STATION ID': 'STATION_ID'}, inplace=True)

        yearly_frames.append(data_table)

    if not yearly_frames:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated data on every iteration.
    return pd.concat(yearly_frames)


### Get Daily and Monthly Precipitation Reporting Station locations using web scraping

In [None]:
def get_precipitation_station_data(level):
    """
    Scrape a CDEC station list and return it as a dataframe.

    Parameters
    ----------
    level : str
        ``'daily'`` selects the DailyStations report; any other value selects
        the MonthlyPrecip report.

    Returns
    -------
    pandas.DataFrame
        One row per scraped table row. Column names are taken from the first
        scraped row, the first two scraped rows are excluded from the data,
        ``ID`` is renamed to ``STATION_ID`` and ``ELEV(FEET)`` is removed.
    """
    # Pick the report page and the id of the table that holds the stations.
    if level == 'daily':
        url = f"https://cdec.water.ca.gov/reportapp/javareports?name=DailyStations"
        table_id = "DLY_STNLIST"
    else:
        url = "https://cdec.water.ca.gov/reportapp/javareports?name=MonthlyPrecip"
        table_id = "REALPRECIP_LIST"

    # Fetch the page and parse it.
    page = BeautifulSoup(requests.get(url).text, "lxml")
    html_table = page.find("table", attrs={"id": table_id, "class": "data"})

    # Keep only rows that carry more than one <td> cell.
    scraped_rows = []
    for tr in html_table.find_all("tr"):
        cells = [cell.text.strip() for cell in tr.find_all("td")]
        if len(cells) > 1:
            scraped_rows.append(cells)

    raw_frame = pd.DataFrame(scraped_rows)
    # The first scraped row supplies the column labels.
    raw_frame.columns = raw_frame.iloc[0, :]

    # Data starts at the third scraped row.
    stations = raw_frame.iloc[2:, :].copy()
    stations = stations.rename(columns={'ID': 'STATION_ID'})
    stations = stations.drop(columns=['ELEV(FEET)'])
    return stations


In [None]:
def retrieve_merge_precipitation_stations():
    """
    Scrape precipitation data and station locations and merge them.

    Monthly precipitation for 2013 through 2022 is scraped with
    ``scrape_monthly_precipitation_data``; the daily and monthly station
    lists are scraped with ``get_precipitation_station_data``, stacked and
    reduced to one row per distinct (STATION, STATION_ID, LATITUDE,
    LONGITUDE, COUNTY) combination. The precipitation rows are then
    inner-joined to the stations on ``STATION_ID``, so precipitation rows
    whose station id is in neither station list are dropped.

    Returns
    -------
    pandas.DataFrame
        Precipitation rows with LATITUDE, LONGITUDE and COUNTY attached; the
        ``STATION`` column is removed from the result.
    """
    # Original author's note on this call: "#1767" (presumably a row count
    # at the time of writing -- not verified).
    all_years_precipitation_data = scrape_monthly_precipitation_data(2013, 2022)

    daily_station_table = get_precipitation_station_data('daily')
    monthly_station_table = get_precipitation_station_data('monthly')

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    full_station_table = pd.concat([daily_station_table, monthly_station_table])

    # Grouping on all descriptive columns collapses stations that appear in
    # both lists with identical details, so we do not have duplicates.
    station_columns = ['STATION', 'STATION_ID', 'LATITUDE', 'LONGITUDE', 'COUNTY']
    unique_stations = (
        full_station_table
        .groupby(station_columns)
        .agg(count_latitude=('LATITUDE', 'count'))
        .reset_index()
        .drop(columns=['count_latitude'])  # the count only served the grouping
    )

    all_years_precipitation_station = all_years_precipitation_data.merge(
        unique_stations, how='inner', on='STATION_ID'
    )
    return all_years_precipitation_station.drop(columns=['STATION'])

In [None]:
#list(set(all_years_precipitation_data.station_id) - set(monthly_station_table.station_id))
#list(set(all_years_precipitation_data.station_id) - set(full_station_table.station_id))
# There are stations not found on the site either ['MTP', 'CHL', 'LSB', 'HRR', 'GNV', 'LGB', 'LYT']
# all_years_precipitation_station = all_years_precipitation_data.merge(monthly_station_table, how='left', left_on='STATION_ID', right_on='STATION_ID', indicator=True)
# all_years_precipitation_station[all_years_precipitation_station['_merge']  != 'both'].station_id.unique()

In [None]:
#all_years_precipitation_station.describe()

In [None]:
def save_precipitation_data(
    all_years_precipitation_station,
    output_path=r"/work/milestone2_waterwells_deepnote/assets/inputs/precipitation/precipitation_stations.csv",
):
    """
    Write the merged precipitation/station dataframe to a CSV file.

    Parameters
    ----------
    all_years_precipitation_station : pandas.DataFrame
        Dataframe to save.
    output_path : str or path-like, optional
        Destination CSV file. Defaults to the project's
        ``precipitation_stations.csv`` input asset.

    The dataframe index is not written to the file.
    """
    all_years_precipitation_station.to_csv(output_path, index=False)

In [None]:
# Build the merged dataset first: the variable passed to the save step is not
# assigned anywhere else in the notebook, so a clean run would otherwise fail
# with a NameError.
all_years_precipitation_station = retrieve_merge_precipitation_stations()
save_precipitation_data(all_years_precipitation_station)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b042e2da-6536-449d-95b8-d85fa08825de' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>