In [None]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import requests

import datetime

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


import json
import geopandas as gpd
import pygeos

import altair as alt



In [None]:
# --- Well completion reports ---
# Keep only the records that carry a completion year, store that year as an
# integer, and expose it under the shorter column name `year`.
wellcompletion_plss_df = (
    pd.read_csv(r"/work/assets/outputs/well_completion_clean.csv")
    .dropna(subset=['YEARWORKENDED'])
    .astype({'YEARWORKENDED': 'int64'})
    .rename(columns={'YEARWORKENDED': 'year'})
)

# --- Precipitation stations ---
all_years_precipitation_station = pd.read_csv(r"../assets/inputs/precipitation/precipitation_stations.csv")
# Title-case the county names so they can be used as a join key
all_years_precipitation_station['COUNTY'] = all_years_precipitation_station['COUNTY'].str.title()


# --- Reservoir stations ---
weekly_reservoir_station_data = pd.read_csv(r"../assets/inputs/reservoir/weekly_reservoir_station_data.csv")
weekly_reservoir_station_data['COUNTY'] = weekly_reservoir_station_data['COUNTY'].str.title()

# Drought years determined visually as per chart in notebook drought_reservoir_data
drought_years = [2007, 2008, 2009, 2012, 2013, 2014, 2015, 2016, 2018, 2019, 2020, 2021]
# 1 when the well was completed in a drought year, 0 otherwise
wellcompletion_plss_df['drought_year'] = np.where(
    wellcompletion_plss_df['year'].isin(drought_years), 1, 0
)

  wellcompletion_plss_df = pd.read_csv(r"/work/assets/outputs/well_completion_clean.csv")


In [None]:
# Trim surrounding whitespace from the precipitation column labels, then show them
all_years_precipitation_station.columns = all_years_precipitation_station.columns.str.strip()
all_years_precipitation_station.columns

Index(['station_id', 'STATION NAME', 'OCT', 'NOV', 'DEC', 'JAN', 'FEB', 'MAR',
       'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'average_year_precip', 'year',
       'LATITUDE', 'LONGITUDE', 'COUNTY'],
      dtype='object')

In [None]:
# Load the PLSS polygons from the plss_subbasin export (per the original note,
# these only include TRS areas that are within the San Joaquin subbasin).
# NOTE(review): the recorded output of this cell is a DriverError ("No such file
# or directory") for this relative path, yet later cells still use
# SJ_subbasin_plss in spatial joins; those results presumably came from stale
# kernel state. Confirm the path / working directory so the notebook survives
# Restart & Run All.
SJ_subbasin_plss = gpd.read_file("assets/clean_data/plss_subbasin.geojson")
# Merge the polygons that share a TownshipRange value into a single row each
SJ_subbasin_plss_range = SJ_subbasin_plss.dissolve(by='TownshipRange').reset_index()
# Render the dissolved polygons on an interactive map
SJ_subbasin_plss_range.explore()

DriverError: assets/clean_data/plss_subbasin.geojson: No such file or directory

In [None]:
# Load the PLSS polygons from the california_plss export
# (presumably statewide coverage, judging by the file name; confirm)
california_plss = gpd.read_file("assets/clean_data/california_plss.geojson")
# Merge the polygons that share a TownshipRange value, then turn the
# TownshipRange index back into a regular column
california_plss_range = california_plss.dissolve(by='TownshipRange')
california_plss_range = california_plss_range.reset_index()
# Uncomment to inspect the dissolved polygons on a map:
#california_plss_range.explore()

In [None]:
# Build a GeoDataFrame of precipitation stations: one point per row, with
# longitude as the x coordinate and latitude as the y coordinate.
station_points = gpd.points_from_xy(
    all_years_precipitation_station.LONGITUDE,
    all_years_precipitation_station.LATITUDE,
)
precipitation_data_gdf = gpd.GeoDataFrame(all_years_precipitation_station, geometry=station_points)
# Declare the coordinate reference system of the points (EPSG:4326)
precipitation_data_gdf = precipitation_data_gdf.set_crs('epsg:4326')

# Spatially join the stations to the San Joaquin subbasin PLSS polygons and
# drop the stations that did not land in any polygon (no MTRS after the join)
precipitation_data_plss = precipitation_data_gdf.sjoin(SJ_subbasin_plss, how="left")
precipitation_data_plss = precipitation_data_plss[precipitation_data_plss['MTRS'].notna()].copy()

# Same join and filter against the california_plss polygons
precipitation_california_data_plss = precipitation_data_gdf.sjoin(california_plss, how="left")
precipitation_california_data_plss = precipitation_california_data_plss[
    precipitation_california_data_plss['MTRS'].notna()
].copy()


In [None]:
# (rows, columns) of the station rows matched to the subbasin PLSS polygons.
# The recorded run shows (160, 29); the earlier inline note of (144, 29) was stale.
precipitation_data_plss.shape

(160, 29)

In [None]:
# Interactive map of the precipitation stations matched to the subbasin PLSS polygons
precipitation_data_plss.explore()

In [None]:
# (rows, columns) of the station rows matched to the california_plss polygons.
# The recorded run printed (1708, 29); the earlier inline note of (1538, 29) was stale.
print(precipitation_california_data_plss.shape)
# Interactive map of those matched stations
precipitation_california_data_plss.explore()

(1708, 29)


In [None]:
# Number of TownshipRange values present in both the well records and the
# subbasin precipitation stations
len(set(wellcompletion_plss_df['TownshipRange']) & set(precipitation_data_plss['TownshipRange']))

16

In [None]:
# TownshipRange values that have both well records and a precipitation station
well_precip_tr = list(
    set(wellcompletion_plss_df['TownshipRange']) & set(precipitation_data_plss['TownshipRange'])
)

### The precipitation stations are dispersed along the length of the San Joaquin river basin (almost uniformly)
- A decision has to be made about the precipitation in the TownshipRanges where there is no station to provide data
- We can average out the entire region's precipitation

In [None]:
# Attach precipitation station readings to each well record. A well only gets a
# direct reading when a station shares its TownshipRange, COUNTY and year; every
# other well comes back with NaN precipitation (`_merge == 'left_only'`).
combined_well_precip_reser_df = wellcompletion_plss_df.merge(
    precipitation_data_plss,
    how='left',
    on=['TownshipRange', 'COUNTY', 'year'],
    indicator=True,
    suffixes=('_wellcompletion', '_precipitation_station'),
)

# Yearly fallback value for wells whose township range has no station.
# Each matched (station, year) reading is counted exactly once. Averaging
# directly over the merged rows would repeat a station's reading once per well
# it matched, skewing the regional average towards townships with many wells.
station_year_precip = (
    combined_well_precip_reser_df
    .dropna(subset=['average_year_precip'])
    .drop_duplicates(subset=['station_id', 'year'])
)
yearly_mean_precip = station_year_precip.groupby('year')['average_year_precip'].mean()
combined_well_precip_reser_df['avg_precip_all_tr_year'] = (
    combined_well_precip_reser_df['year'].map(yearly_mean_precip)
)

# Years without any matched precipitation reading have no fallback value; remove them
combined_well_precip_reser_df = combined_well_precip_reser_df[
    combined_well_precip_reser_df['avg_precip_all_tr_year'].notna()
].copy()

# Prefer the station reading of the well's own township range; otherwise use
# the yearly regional average computed above
combined_well_precip_reser_df['average_year_precip_corrected'] = (
    combined_well_precip_reser_df['average_year_precip']
    .fillna(combined_well_precip_reser_df['avg_precip_all_tr_year'])
)

In [None]:
# Compare (rows, columns) of the merged frame against its two inputs
print(combined_well_precip_reser_df.shape, wellcompletion_plss_df.shape, precipitation_data_plss.shape)


(15909, 63) (98080, 34) (160, 29)


In [None]:
# Allow up to 100 columns in DataFrame displays, then preview five random rows
pd.set_option('display.max_columns', 100)
combined_well_precip_reser_df.sample(n=5)

Unnamed: 0,LATITUDE_wellcompletion,LONGITUDE_wellcompletion,TOWNSHIP,RANGE,SECTION,WELLLOCATION,CITY,COUNTY,BOTTOMOFPERFORATEDINTERVAL,TOPOFPERFORATEDINTERVAL,GROUNDSURFACEELEVATION,STATICWATERLEVEL,RECORDTYPE,USE,WCRNUMBER,TOTALDRILLDEPTH,TOTALCOMPLETEDDEPTH,DATEWORKENDED,CASINGDIAMETER,TOTALCOMPLETEDDEPTH_CORRECTED,DATEWORKENDED_CORRECTED,year,MONTHWORKENDED,geometry_wellcompletion,index_right_wellcompletion,OBJECTID_wellcompletion,Township_wellcompletion,Range_wellcompletion,Meridian_wellcompletion,Source_wellcompletion,Section_wellcompletion,MTRS_wellcompletion,TownshipRange,drought_year,station_id,STATION NAME,OCT,NOV,DEC,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,average_year_precip,LATITUDE_precipitation_station,LONGITUDE_precipitation_station,geometry_precipitation_station,index_right_precipitation_station,OBJECTID_precipitation_station,Township_precipitation_station,Range_precipitation_station,Meridian_precipitation_station,Source_precipitation_station,Section_precipitation_station,MTRS_precipitation_station,_merge,avg_precip_all_tr_year,average_year_precip_corrected
90708,36.47886,-119.23233,17S,25E,2.0,NS AVE 376 & EO RD 144,VISALIA,Tulare,470.0,170.0,,150.0,WellCompletion/New/Production or Monitoring/NA,Public,WCR2016-014450,,500.0,2016-04-08,14.0,500.0,2016-04-08,2016,4.0,POINT (-119.23233 36.47886),8216.0,56977.0,T17S,R25E,MDM,BLM,2.0,MDM-T17S-R25E-2,T17S R25E,1,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only,0.976895,0.976895
49413,37.35321,-120.73293,07S,11E,2.0,13530 BELL DR,LIVINGSTON,Merced,,,,,WellCompletion/New/Production or Monitoring/NA,Domestic,WCR0140117,,246.0,2014-07-07,,246.0,2014-07-07,2014,7.0,POINT (-120.73293 37.35321),4070.0,29531.0,T07S,R11E,MDM,BLM,2.0,MDM-T07S-R11E-2,T07S R11E,1,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only,0.4687,0.4687
71653,37.842675,-121.379078,01S,05E,13.0,13285 S WILLOW GLEN RD,STOCKTON,San Joaquin,85.0,65.0,10.0,16.0,WellCompletion/New/Production or Monitoring/NA,Domestic,WCR2020-006231,85.0,85.0,2020-04-28,,85.0,2020-04-28,2020,4.0,POINT (-121.379078 37.842675),439.0,9448.0,T01S,R05E,MDM,DGR,13.0,MDM-T01S-R05E-13,T01S R05E,1,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only,0.746146,0.746146
12075,36.582361,-119.572111,15S,22E,34.0,12171 E Huntsman AVE,Selma,Fresno,,,,,WellCompletion/New/Production or Monitoring/NA,Agriculture,WCR2018-011936,270.0,260.0,2018-12-28,,260.0,2018-12-28,2018,12.0,POINT (-119.5721114 36.5823608),7263.0,50855.0,T15S,R22E,MDM,BLM,34.0,MDM-T15S-R22E-34,T15S R22E,1,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only,0.560585,0.560585
5954,36.995711,-119.987743,11S,18E,10.0,16994 JENNIFER COURT,MADERA,Fresno,600.0,360.0,,310.0,WellCompletion/New/Production or Monitoring/NA,Domestic,WCR2017-007956,,620.0,2017-05-10,4.0,620.0,2017-05-10,2017,5.0,POINT (-119.987743 36.995711),5290.0,39170.0,T11S,R18E,MDM,BLM,10.0,MDM-T11S-R18E-10,T11S R18E,0,,,,,,,,,,,,,,,,,,,,,,,,,,,left_only,1.384335,1.384335


In [None]:
# Keep the well attributes, the corrected precipitation value and the drought
# flag; the station-side columns brought in by the merge are discarded.
columns_after_precipitation = [
    'WCRNUMBER', 'TownshipRange', 'COUNTY',
    'BOTTOMOFPERFORATEDINTERVAL', 'TOPOFPERFORATEDINTERVAL',
    'GROUNDSURFACEELEVATION', 'average_year_precip_corrected',
    'STATICWATERLEVEL', 'RECORDTYPE', 'USE',
    'TOTALDRILLDEPTH', 'TOTALCOMPLETEDDEPTH', 'DATEWORKENDED',
    'CASINGDIAMETER', 'TOTALCOMPLETEDDEPTH_CORRECTED',
    'DATEWORKENDED_CORRECTED', 'year', 'MONTHWORKENDED',
    'geometry_wellcompletion', 'MTRS_wellcompletion',
    'drought_year',
]
combined_well_precip_reser_df = combined_well_precip_reser_df.loc[:, columns_after_precipitation].copy()

In [None]:
# (rows, columns) after trimming to the selected columns
print(combined_well_precip_reser_df.shape)

(15909, 21)


### Combine with reservoir data
- Data is at weekly level
- Group to the yearly level using the spring months (since most precipitation occurs in spring): use months 1, 2, 3 and 4, just as we did for the groundwater data

In [None]:
print(weekly_reservoir_station_data.columns)

# Keep only the weekly readings taken in months 1 through 4
in_selected_months = weekly_reservoir_station_data['month'].isin([1, 2, 3, 4])
weekly_reservoir_station_data = weekly_reservoir_station_data[in_selected_months].copy()

# Collapse to one row per station and year, averaging pct_of_capacity.
# NOTE(review): the result overwrites the weekly frame and no longer has a
# `month` column, so re-running this cell without reloading the data fails.
station_year_keys = ['station_id', 'year', 'LATITUDE', 'LONGITUDE', 'COUNTY']
weekly_reservoir_station_data = (
    weekly_reservoir_station_data
    .groupby(station_year_keys)
    .agg(avg_pct_capacity=('pct_of_capacity', 'mean'))
    .reset_index()
)
                                                                                                                                      

Index(['station_id', 'pct_of_capacity', 'date', 'year', 'month', 'LATITUDE',
       'LONGITUDE', 'COUNTY'],
      dtype='object')


In [None]:
# Preview one random row of the per-station, per-year reservoir averages
weekly_reservoir_station_data.sample(1)

Unnamed: 0,station_id,year,LATITUDE,LONGITUDE,COUNTY,avg_pct_capacity
18,BRD,2022,38.202999,-120.074997,Tuolumne,49.222222


In [None]:
# Build a GeoDataFrame of reservoir stations: one point per row, with
# longitude as the x coordinate and latitude as the y coordinate.
reservoir_points = gpd.points_from_xy(
    weekly_reservoir_station_data.LONGITUDE,
    weekly_reservoir_station_data.LATITUDE,
)
reservoir_data_gdf = gpd.GeoDataFrame(weekly_reservoir_station_data, geometry=reservoir_points)
# Declare the coordinate reference system of the points (EPSG:4326)
reservoir_data_gdf = reservoir_data_gdf.set_crs('epsg:4326')

# Spatially join the stations to the San Joaquin subbasin PLSS polygons and
# drop the stations that did not land in any polygon (no MTRS after the join)
reservoir_data_plss = reservoir_data_gdf.sjoin(SJ_subbasin_plss, how="left")
reservoir_data_plss = reservoir_data_plss[reservoir_data_plss['MTRS'].notna()].copy()

# Same join and filter against the california_plss polygons
reservoir_california_data_plss = reservoir_data_gdf.sjoin(california_plss, how="left")
reservoir_california_data_plss = reservoir_california_data_plss[
    reservoir_california_data_plss['MTRS'].notna()
].copy()


In [None]:
# Interactive map of the reservoir stations matched to the california_plss polygons
reservoir_california_data_plss.explore()

In [None]:
# Number of TownshipRange values present in both the combined well table and
# the reservoir stations matched to california_plss
len(
    set(combined_well_precip_reser_df['TownshipRange'])
    & set(reservoir_california_data_plss['TownshipRange'])
)

10

In [None]:
# Attach the yearly reservoir averages to each well record. A well only gets a
# direct value when a reservoir station shares its TownshipRange, COUNTY and
# year; every other well comes back with NaN capacity (`_merge == 'left_only'`).
combined_well_precip_reser_df = combined_well_precip_reser_df.merge(
    reservoir_california_data_plss,
    how='left',
    on=['TownshipRange', 'COUNTY', 'year'],
    indicator=True,
    suffixes=('_wellcompletion', '_reservoir_station'),
)

# Yearly fallback value for wells whose township range has no reservoir station.
# Each matched (station, year) value is counted exactly once. Averaging directly
# over the merged rows would repeat a station's value once per well it matched,
# skewing the average towards townships with many wells.
station_year_capacity = (
    combined_well_precip_reser_df
    .dropna(subset=['avg_pct_capacity'])
    .drop_duplicates(subset=['station_id', 'year'])
)
yearly_mean_capacity = station_year_capacity.groupby('year')['avg_pct_capacity'].mean()
combined_well_precip_reser_df['avg_capacity_all_tr_year'] = (
    combined_well_precip_reser_df['year'].map(yearly_mean_capacity)
)

# Years without any matched reservoir value have no fallback value; remove them
combined_well_precip_reser_df = combined_well_precip_reser_df[
    combined_well_precip_reser_df['avg_capacity_all_tr_year'].notna()
].copy()

# Prefer the reservoir value of the well's own township range; otherwise use
# the yearly average computed above
combined_well_precip_reser_df['avg_pct_capacity_year_corrected'] = (
    combined_well_precip_reser_df['avg_pct_capacity']
    .fillna(combined_well_precip_reser_df['avg_capacity_all_tr_year'])
)

In [None]:
# List the columns present after the reservoir merge
combined_well_precip_reser_df.columns

Index(['WCRNUMBER', 'TownshipRange', 'COUNTY', 'BOTTOMOFPERFORATEDINTERVAL',
       'TOPOFPERFORATEDINTERVAL', 'GROUNDSURFACEELEVATION',
       'average_year_precip_corrected', 'STATICWATERLEVEL', 'RECORDTYPE',
       'USE', 'TOTALDRILLDEPTH', 'TOTALCOMPLETEDDEPTH', 'DATEWORKENDED',
       'CASINGDIAMETER', 'TOTALCOMPLETEDDEPTH_CORRECTED',
       'DATEWORKENDED_CORRECTED', 'year', 'MONTHWORKENDED',
       'geometry_wellcompletion', 'MTRS_wellcompletion', 'drought_year',
       'station_id', 'LATITUDE', 'LONGITUDE', 'avg_pct_capacity', 'geometry',
       'index_right', 'OBJECTID', 'Township', 'Range', 'Meridian', 'Source',
       'Section', 'MTRS', '_merge', 'avg_capacity_all_tr_year',
       'avg_pct_capacity_year_corrected'],
      dtype='object')

In [None]:
# (rows, columns) after the reservoir merge and the removal of years without reservoir values
combined_well_precip_reser_df.shape

(5192, 37)

In [None]:
# Final column set: well attributes, corrected precipitation, the matched
# reservoir station fields, corrected reservoir capacity and the drought flag.
final_columns = [
    'WCRNUMBER', 'TownshipRange', 'COUNTY', 'BOTTOMOFPERFORATEDINTERVAL',
    'TOPOFPERFORATEDINTERVAL', 'GROUNDSURFACEELEVATION',
    'average_year_precip_corrected', 'STATICWATERLEVEL', 'RECORDTYPE',
    'USE', 'TOTALDRILLDEPTH', 'TOTALCOMPLETEDDEPTH', 'DATEWORKENDED',
    'CASINGDIAMETER', 'TOTALCOMPLETEDDEPTH_CORRECTED',
    'DATEWORKENDED_CORRECTED', 'year', 'MONTHWORKENDED',
    'geometry_wellcompletion', 'MTRS_wellcompletion', 'station_id',
    'LATITUDE', 'LONGITUDE', 'avg_pct_capacity', 'geometry',
    'MTRS', 'avg_pct_capacity_year_corrected', 'drought_year',
]
combined_well_precip_reser_df = combined_well_precip_reser_df.loc[:, final_columns].copy()

In [None]:
# Persist the combined well / precipitation / reservoir table as CSV.
# NOTE(review): `to_csv` writes the DataFrame index as an extra unnamed first
# column by default; confirm downstream readers expect that column.
combined_well_precip_reser_df.to_csv(r"assets/clean_data/combined_well_precip_reser_data.csv")

In [None]:
# (rows, columns) of the table that was written out
print(combined_well_precip_reser_df.shape)

(5192, 28)


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b042e2da-6536-449d-95b8-d85fa08825de' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>