In [None]:
import re
import os
import pygeos

import numpy as np
import pandas as pd

import geopandas as gpd
import altair as alt

## [PLSS documentation details](https://deepnote.com/project/Milestone-2-Water-Wells-sELi2mU2RJ2VuNhfoIgl3g/%2FExploratory%20Data%20Analysis%2Fgeopandas_deepnote.ipynb/#309c606e-64c6-4653-8eca-f2882f71012f)
- Townships (N-S) and ranges (E-W) are defined relative to a meridian and a baseline. Each township contains 36 sections of 1 x 1 mile (the section is the basic unit).

[Source](https://maps-cnra-cadoc.opendata.arcgis.com/datasets/public-land-survey-system-plss-township-and-range/explore?location=37.270100%2C-119.333000%2C6.72&showTable=true)



In [None]:
# Load the PLSS polygons for the sub-basin from GeoJSON into a GeoDataFrame.
plss_df = gpd.read_file(r"/work/assets/plss_subbasin.geojson") #14486 records
# Show one randomly chosen row to inspect the available columns
# (no random_state is set, so the row differs between runs).
plss_df.sample(1)

Unnamed: 0,OBJECTID,Township,Range,Meridian,Source,Section,MTRS,TownshipRange,geometry
8434,59531,T18S,R20E,MDM,BLM,23,MDM-T18S-R20E-23,T18S R20E,"MULTIPOLYGON (((-119.76283 36.35706, -119.7673..."


In [None]:
# Aggregate by TownshipRange: merge all polygons sharing a TownshipRange value
# into one geometry, then turn the TownshipRange index back into a column.
# The PLSS GeoDataFrame loaded earlier in this notebook is named `plss_df`.
plss_range = plss_df.dissolve(by='TownshipRange').reset_index()



In [None]:
# Render the dissolved TownshipRange polygons with GeoDataFrame.explore().
plss_range.explore()

MaxRowsError: The number of rows in your dataset is greater than the maximum allowed (5000). For information on how to plot larger datasets in Altair, see the documentation

alt.Chart(...)

## [Read in the user reported shortage report](https://data.ca.gov/dataset/household-water-supply-shortage-reporting-system-data)
- The data was retrieved with the Python `requests` module and stored as a CSV file.

In [None]:
def to_snake_case(col_name):
    '''
        Convert a camel-case (optionally space-separated) column name to snake case.

        Spaces are removed first; an underscore is then inserted in front of
        every upper-case letter that is not the first character, and the
        result is lower-cased. Consecutive capitals are each split, so
        "ID" becomes "i_d".
    '''
    compact = ''.join(col_name.split(' '))
    # Zero-width match: any position followed by a capital, except the start.
    return re.sub(r'(?<!^)(?=[A-Z])', '_', compact).lower()

#Select relevant columns
def read_shortage_data():
    '''
        Read the user-submitted water shortage reports and attach PLSS attributes.

        The reports are read from '../assets/shortage.csv'. Column names are
        normalised with to_snake_case, bookkeeping columns are dropped, rows
        without latitude/longitude are removed, and `create_year` and
        `create_month` are derived from `create_date`.

        Each report becomes a point (EPSG:4326) that is left-joined spatially
        to the module-level `plss_df`. Reports that match no PLSS polygon
        (missing MTRS) are dropped.

        Returns the joined GeoDataFrame: one row per matched report, carrying
        the report columns plus the PLSS columns of the matching polygon.
    '''
    shortage_df = pd.read_csv('../assets/shortage.csv')
    # Bring the all-caps headers into the same "Title Case" form as the rest,
    # so to_snake_case yields e.g. `create_date` rather than `c_r_e_a_t_e_...`.
    shortage_df = shortage_df.rename(columns={'CREATE DATE': 'Create Date',
                                              'LONGITUDE': 'Longitude',
                                              'LATITUDE' : 'Latitude',
                                              'CITY' : 'City'})
    shortage_df.columns = [to_snake_case(col_name) for col_name in shortage_df.columns]
    shortage_df = shortage_df.drop(columns=['unnamed:0', 'i_d', '_id' ])
    # .copy() so the column assignments below operate on an independent frame.
    shortage_df = shortage_df.dropna(subset=['latitude', 'longitude']).copy()
    shortage_df['create_date'] = pd.to_datetime(shortage_df['create_date'])

    # Get the year and month so we know what date ranges to work with
    shortage_df['create_year'] = shortage_df['create_date'].dt.year
    shortage_df['create_month'] = shortage_df['create_date'].dt.month

    # Convert the latitude and longitude into shapely points, build a
    # GeoDataFrame and set its Coordinate Reference System (CRS) so it can be
    # joined to the PLSS GeoDataFrame.
    shortage_gdf = gpd.GeoDataFrame(
        shortage_df,
        geometry=gpd.points_from_xy(shortage_df.longitude, shortage_df.latitude),
    )
    shortage_gdf = shortage_gdf.set_crs('epsg:4326')

    # Spatial join; keep only the reports that matched a PLSS polygon.
    shortage_plss_df = shortage_gdf.sjoin(plss_df, how="left")
    shortage_plss_df = shortage_plss_df.dropna(subset=['MTRS'])
    return shortage_plss_df

In [None]:
# Build the shortage reports joined to PLSS polygons (see read_shortage_data).
shortage_plss_df = read_shortage_data()

In [None]:
# Display the joined shortage/PLSS frame.
shortage_plss_df

array([2015, 2016, 2014, 2012, 2017, 2018, 2019, 2020, 2021, 2022])

## Precipitation data for San Joaquin valley
[Details on data source and retrieval](https://deepnote.com/project/Milestone-2-Water-Wells-sELi2mU2RJ2VuNhfoIgl3g/%2FData%20Engineering%2Fprecipitation.ipynb/#5a03b876-2cc1-4396-a124-e90c28429ebd)

In [None]:
def read_precipitation_data():
    '''
        Build yearly precipitation per station and locate the stations in PLSS.

        Monthly precipitation is read from precipitation_data.csv, the monthly
        columns are coerced to numbers and averaged per row into `year_avg`.
        The result is inner-joined to the station table (latitude/longitude)
        on `station_id`, converted to points (EPSG:4326) and left-joined
        spatially to the module-level `plss_df`.

        Returns a 3-tuple:
          - the station rows that matched a PLSS polygon (MTRS present),
          - the station/year table with coordinates (before the spatial join),
          - the GeoDataFrame of all stations (before the spatial join).
    '''
    monthly_df = pd.read_csv("/work/assets/precipitation_data.csv").rename(columns={'frb':'feb'})
    stations_df = pd.read_csv("/work/assets/precipitation_station_data.csv")
    # Station ids in the station file are stripped of surrounding whitespace before the merge.
    stations_df['station_id'] = stations_df['station_id'].str.strip()

    month_cols = ['oct', 'nov', 'dec', 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep']
    # Non-numeric entries become NaN and are skipped by mean().
    monthly_df[month_cols] = monthly_df[month_cols].apply(pd.to_numeric, errors='coerce')
    monthly_df['year_avg'] = monthly_df[month_cols].mean(axis=1)

    yearly_df = monthly_df[['station_id', 'station_name', 'precipitation_year', 'year_avg']].copy()
    precipitation_loc_df = yearly_df.merge(stations_df, how='inner', on='station_id')

    # Longitude/latitude -> shapely points, wrapped in a GeoDataFrame with a
    # CRS so it can be joined to the PLSS GeoDataFrame.
    station_points = gpd.points_from_xy(precipitation_loc_df.longitude, precipitation_loc_df.latitude)
    precipitation_gdf = gpd.GeoDataFrame(precipitation_loc_df, geometry=station_points).set_crs('epsg:4326')

    # Spatial join; keep only stations that matched a PLSS polygon.
    joined = precipitation_gdf.sjoin(plss_df, how="left")
    precipitation_plss_df = joined.dropna(subset=['MTRS'])
    return precipitation_plss_df, precipitation_loc_df, precipitation_gdf

In [None]:
# Unpack the three frames returned by read_precipitation_data and display the
# stations that matched a PLSS polygon.
precipitation_plss_df, precipitation_loc_df, precipitation_gdf   = read_precipitation_data()
precipitation_plss_df  
# TODO: we might want to use the mean of this for a TownshipRange where
# station data is not available.

Unnamed: 0,station_id,station_name_x,precipitation_year,year_avg,station_name_y,latitude,longitude,county,river_basin,geometry,index_right,OBJECTID,Township,Range,Meridian,Source,Section,MTRS,TownshipRange
30,BFK,BAKERSFIELD AIRPORT,2022,0.475000,BAKERSFIELD AIRPORT,35.433998,-119.054001,KERN,TULARE LAKE,POINT (-119.05400 35.43400),12987.0,89544.0,T29S,R27E,MDM,BLM,2.0,MDM-T29S-R27E-2,T29S R27E
31,BFK,BAKERSFIELD AIRPORT,2019,0.651667,BAKERSFIELD AIRPORT,35.433998,-119.054001,KERN,TULARE LAKE,POINT (-119.05400 35.43400),12987.0,89544.0,T29S,R27E,MDM,BLM,2.0,MDM-T29S-R27E-2,T29S R27E
32,BFK,BAKERSFIELD AIRPORT,2020,0.641667,BAKERSFIELD AIRPORT,35.433998,-119.054001,KERN,TULARE LAKE,POINT (-119.05400 35.43400),12987.0,89544.0,T29S,R27E,MDM,BLM,2.0,MDM-T29S-R27E-2,T29S R27E
33,BFK,BAKERSFIELD AIRPORT,2021,0.245000,BAKERSFIELD AIRPORT,35.433998,-119.054001,KERN,TULARE LAKE,POINT (-119.05400 35.43400),12987.0,89544.0,T29S,R27E,MDM,BLM,2.0,MDM-T29S-R27E-2,T29S R27E
34,BFK,BAKERSFIELD AIRPORT,2015,0.484545,BAKERSFIELD AIRPORT,35.433998,-119.054001,KERN,TULARE LAKE,POINT (-119.05400 35.43400),12987.0,89544.0,T29S,R27E,MDM,BLM,2.0,MDM-T29S-R27E-2,T29S R27E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
928,WSC,WASCO,2022,0.710000,WASCO,35.599998,-119.333000,KERN,TULARE LAKE,POINT (-119.33300 35.60000),12287.0,84812.0,T27S,R24E,MDM,BLM,12.0,MDM-T27S-R24E-12,T27S R24E
929,WSC,WASCO,2019,0.611667,WASCO,35.599998,-119.333000,KERN,TULARE LAKE,POINT (-119.33300 35.60000),12287.0,84812.0,T27S,R24E,MDM,BLM,12.0,MDM-T27S-R24E-12,T27S R24E
930,WSC,WASCO,2020,0.726667,WASCO,35.599998,-119.333000,KERN,TULARE LAKE,POINT (-119.33300 35.60000),12287.0,84812.0,T27S,R24E,MDM,BLM,12.0,MDM-T27S-R24E-12,T27S R24E
931,WSC,WASCO,2021,0.247500,WASCO,35.599998,-119.333000,KERN,TULARE LAKE,POINT (-119.33300 35.60000),12287.0,84812.0,T27S,R24E,MDM,BLM,12.0,MDM-T27S-R24E-12,T27S R24E


In [None]:
# Notebook-scope rebuild of the station/year precipitation table
# (same preparation steps as read_precipitation_data, without the PLSS join).
precipitation_station_df = pd.read_csv("/work/assets/precipitation_station_data.csv")
precipitation_station_df['station_id'] = precipitation_station_df['station_id'].str.strip()

month_cols = ['oct', 'nov', 'dec', 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep']
precipitation_df = pd.read_csv("/work/assets/precipitation_data.csv").rename(columns={'frb':'feb'})
# Non-numeric monthly entries become NaN and are skipped by mean().
precipitation_df[month_cols] = precipitation_df[month_cols].apply(pd.to_numeric, errors='coerce')
precipitation_df['year_avg'] = precipitation_df[month_cols].mean(axis=1)

precipitation_df = precipitation_df[['station_id', 'station_name', 'precipitation_year', 'year_avg']].copy()
precipitation_loc_df = precipitation_df.merge(precipitation_station_df, how='inner', on='station_id')


In [None]:
# Display the station/year precipitation table with station coordinates.
precipitation_loc_df

Unnamed: 0,station_id,station_name_x,precipitation_year,year_avg,station_name_y,latitude,longitude,county,river_basin
0,APU,ANGWIN PACIFIC UNION COL,2022,8.880000,ANGWIN PACIFIC UNION COL,38.573101,-122.440598,NAPA,NAPA R
1,APU,ANGWIN PACIFIC UNION COL,2019,4.558333,ANGWIN PACIFIC UNION COL,38.573101,-122.440598,NAPA,NAPA R
2,APU,ANGWIN PACIFIC UNION COL,2020,1.968333,ANGWIN PACIFIC UNION COL,38.573101,-122.440598,NAPA,NAPA R
3,APU,ANGWIN PACIFIC UNION COL,2021,1.363333,ANGWIN PACIFIC UNION COL,38.573101,-122.440598,NAPA,NAPA R
4,APU,ANGWIN PACIFIC UNION COL,2015,3.118000,ANGWIN PACIFIC UNION COL,38.573101,-122.440598,NAPA,NAPA R
...,...,...,...,...,...,...,...,...,...
987,ALT,ALTURAS RS,2015,0.975833,ALTURAS RS,41.500000,-120.550003,MODOC,PIT R
988,ALT,ALTURAS RS,2016,1.120833,ALTURAS RS,41.500000,-120.550003,MODOC,PIT R
989,EPK,EAST PARK RESERVOIR,2015,1.220000,EAST PARK RESERVOIR,39.367001,-122.516998,COLUSA,STONY CR
990,GNV,GREENVILLE RS,2015,,GREENVILLE RS,40.132999,-120.932999,PLUMAS,FEATHER R


In [None]:
# San Joaquin stations for which we have latitude and longitude data.
stn_list = ['CRR', 'FLR', 'MIL' , 'HNT', 'LBS', 'NFR']
# Longer list of San Joaquin station ids, used for the filter below.
san_joaquin_stns = ['FLR', 'HNT', 'BGC', 'CRR', 'NFR', 'ABR', 'CLN', 'MIL', 'FGC', 'LBS', 'SSI', 'PNH', 'MDR', 'MFS', 'TCR', 'STK']

# Rows of the station/year table whose station id is in san_joaquin_stns.
precipitation_loc_df[precipitation_loc_df.station_id.isin(san_joaquin_stns)]


Unnamed: 0,station_id,station_name,precipitation_year,year_avg,latitude,longitude,county,river_basin,geometry
76,FLR,FLORENCE LAKE (SCE),2022,2.48,37.266998,-118.967003,FRESNO,SAN JOAQUIN R,POINT (-118.96700 37.26700)
77,FLR,FLORENCE LAKE (SCE),2019,2.284167,37.266998,-118.967003,FRESNO,SAN JOAQUIN R,POINT (-118.96700 37.26700)
78,FLR,FLORENCE LAKE (SCE),2020,1.925,37.266998,-118.967003,FRESNO,SAN JOAQUIN R,POINT (-118.96700 37.26700)
79,FLR,FLORENCE LAKE (SCE),2021,1.293333,37.266998,-118.967003,FRESNO,SAN JOAQUIN R,POINT (-118.96700 37.26700)
184,NFR,NORTH FORK R S,2022,2.985,37.233002,-119.5,MADERA,SAN JOAQUIN R,POINT (-119.50000 37.23300)
185,NFR,NORTH FORK R S,2019,3.441667,37.233002,-119.5,MADERA,SAN JOAQUIN R,POINT (-119.50000 37.23300)
186,NFR,NORTH FORK R S,2020,1.685833,37.233002,-119.5,MADERA,SAN JOAQUIN R,POINT (-119.50000 37.23300)
187,NFR,NORTH FORK R S,2021,1.420833,37.233002,-119.5,MADERA,SAN JOAQUIN R,POINT (-119.50000 37.23300)


In [None]:
# Same check on the GeoDataFrame, restricted to the shorter stn_list.
precipitation_gdf[precipitation_gdf.station_id.isin(stn_list)]



Unnamed: 0,station_id,station_name,precipitation_year,year_avg,latitude,longitude,county,river_basin,geometry
76,FLR,FLORENCE LAKE (SCE),2022,2.48,37.266998,-118.967003,FRESNO,SAN JOAQUIN R,POINT (-118.96700 37.26700)
77,FLR,FLORENCE LAKE (SCE),2019,2.284167,37.266998,-118.967003,FRESNO,SAN JOAQUIN R,POINT (-118.96700 37.26700)
78,FLR,FLORENCE LAKE (SCE),2020,1.925,37.266998,-118.967003,FRESNO,SAN JOAQUIN R,POINT (-118.96700 37.26700)
79,FLR,FLORENCE LAKE (SCE),2021,1.293333,37.266998,-118.967003,FRESNO,SAN JOAQUIN R,POINT (-118.96700 37.26700)
184,NFR,NORTH FORK R S,2022,2.985,37.233002,-119.5,MADERA,SAN JOAQUIN R,POINT (-119.50000 37.23300)
185,NFR,NORTH FORK R S,2019,3.441667,37.233002,-119.5,MADERA,SAN JOAQUIN R,POINT (-119.50000 37.23300)
186,NFR,NORTH FORK R S,2020,1.685833,37.233002,-119.5,MADERA,SAN JOAQUIN R,POINT (-119.50000 37.23300)
187,NFR,NORTH FORK R S,2021,1.420833,37.233002,-119.5,MADERA,SAN JOAQUIN R,POINT (-119.50000 37.23300)


## Population Density data

In [None]:
# Load the Hard-to-Count census-tract table and keep only the identifier,
# population, county, tract and land-area columns.
population_df = (
    pd.read_csv(r"/work/assets/California Hard-to-Count Index by Census Tract.csv")
    .loc[:, [ 'OBJECTID_1', 'NAME', 'GEOID',  'Est. total population', 'County name', 'Tract number', 'Land Area']]
    .copy()
)

In [None]:
# List the distinct county names present in the population table.
population_df['County name'].unique()

NameError: name 'population_df' is not defined

In [None]:
# Largest OBJECTID_1 in the population table (compared with the PLSS OBJECTID range below).
population_df['OBJECTID_1'].max()

7933

In [None]:
# Smallest OBJECTID in the PLSS table.
plss_df['OBJECTID'].min()
# PLSS OBJECTID ranges from 3577 to 164971

3577

In [None]:
#https://ask.census.gov/prweb/PRServletCustom/app/ECORRAsk_/YACFBFye-rFIz_FoGtyvDRUGg1Uzu5Mn*/!STANDARD?pzuiactionzzz=CXtpbn0rTEpMcGRYOG1vS0tqTFAwaENUZWpvM1NNWEMzZ3p5aFpnWUxzVmw0TjJoOEprcE5BQndaM1Vid1FKbWRibnZu*

In [None]:

# Load the station and groundwater CSV files from the assets folder.
pd_csv_station = pd.read_csv('../assets/station.csv')
pd_csv_groundwater = pd.read_csv('../assets/groundwater.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


## Well Completion data

[Source](https://data.cnra.ca.gov/dataset/well-completion-reports)

In [None]:

# Load the well completion reports and preview rows 9-19 (all columns).
pd_csv_wellcompletion = pd.read_csv('../assets/wellcompletion.csv')
pd_csv_wellcompletion.iloc[9:20, :]

Unnamed: 0.1,Unnamed: 0,DECIMALLATITUDE,WORKFLOWSTATUS,LLACCURACY,PERMITDATE,PUMPTESTLENGTH,SECTION,REGIONOFFICE,DRILLINGMETHOD,OTHEROBSERVATIONS,...,TOTALCOMPLETEDDEPTH,OWNERASSIGNEDWELLNUMBER,COUNTYNAME,RANGE,BASELINEMERIDIAN,RECEIVEDDATE,DRILLERNAME,WELLYIELD,_id,CASINGDIAMETER
9,9,37.49857,,Centroid of Section,,,16,DWR North Central Region Office,Auger,,...,10.0,MW 5,Alameda,01W,Mount Diablo,,WDC EXPLORATION & WELLS WDC EXPLORATION & WELLS,,8,2.0
10,10,37.52746,,Centroid of Section,,,4,DWR North Central Region Office,,,...,,VW 7,Alameda,01W,Mount Diablo,,WOODWARD DRILLING COMPANY WOODWARD DRILLING CO...,,9,
11,11,37.58517,,Centroid of Section,,,15,DWR North Central Region Office,,,...,,MW 10,Alameda,02W,Mount Diablo,,WOODWARD DRILLING COMPANY WOODWARD DRILLING CO...,,10,
12,12,37.743056,,,,,24,DWR North Central Region Office,,,...,,SV-18A,Alameda,03W,Mount Diablo,,VIRONEX INC VIRONEX INC,,11,
13,13,37.826111,,,,,23,DWR North Central Region Office,Auger,,...,25.0,MW-2,Alameda,04W,Mount Diablo,,RSI RSI,,12,4.0
14,14,37.679814,,>50 FT,,,17,DWR North Central Region Office,,,...,,U-2,Alameda,02E,Mount Diablo,,GREGG DRILLING & TESTING INC,,13,
15,15,37.871943,,>50 FT,,,2,DWR North Central Region Office,HOLLOW STEM AUGER,,...,33.0,MW-1,Alameda,04W,Mount Diablo,,EXPLORATION GEOSERVICES INC,,14,2.0
16,16,37.663333,,Unknown,,,21,DWR North Central Region Office,,,...,,VP-1A/B/C,Alameda,01E,Mount Diablo,,GREGG DRILLING & TESTING INC,,15,
17,17,37.70172,,Centroid of Section,,,3,DWR North Central Region Office,HAND AUGER,,...,10.5,VP-2,Alameda,02E,M,,V T S DRILLING LLC,,16,0.25
18,18,37.524226,,Unknown,,,2,DWR North Central Region Office,SONIC,,...,51.3,MW-NEW15,Alameda,02W,Mount Diablo,,NATIONAL E W P INC,,17,2.375


##### Make decision on columns required
> Reqd: "DECIMALLATITUDE", "DECIMALLONGITUDE", "TOWNSHIP_RANGE", "RANGE", "SECTION", "WELLLOCATION", "CITY", "COUNTYNAME",
> "PERMITNUMBER", "BOTTOMOFPERFORATEDINTERVAL", "TOPOFPERFORATEDINTERVAL",      "GROUNDSURFACEELEVATION", 
> "STATICWATERLEVEL","RECORDTYPE",  "PLANNEDUSEFORMERUSE", "LOCALPERMITAGENCY", "WCRNUMBER", "TOTALDRILLDEPTH", 
> "TOTALCOMPLETEDDEPTH", "DATEWORKENDED", "DRILLERNAME", "DRILLERLICENSENUMBER", "CASINGDIAMETER"



In [None]:
# Keep only the columns listed as required above; .copy() gives an independent
# frame so later column assignments do not touch pd_csv_wellcompletion.
# NOTE(review): "TOWNSHIP_RANGE" is not visible in the truncated preview above -
# confirm that this column name exists in the CSV.
wellcompletion_subset_df = pd_csv_wellcompletion[["DECIMALLATITUDE", "DECIMALLONGITUDE", "TOWNSHIP_RANGE", "RANGE", "SECTION", "WELLLOCATION", "CITY", "COUNTYNAME",
                              "PERMITNUMBER", "BOTTOMOFPERFORATEDINTERVAL", "TOPOFPERFORATEDINTERVAL", "GROUNDSURFACEELEVATION", "STATICWATERLEVEL", 
                              "RECORDTYPE",  "PLANNEDUSEFORMERUSE", "LOCALPERMITAGENCY", "WCRNUMBER", "TOTALDRILLDEPTH", 
                              "TOTALCOMPLETEDDEPTH", "DATEWORKENDED", "DRILLERNAME", "DRILLERLICENSENUMBER", "CASINGDIAMETER"]].copy()

##### Checks to perform
- Check the types of wells (`RECORDTYPE` values):
  - 'WellCompletion/New/Production or Monitoring/NA'
  - 'WellCompletion/Destruction/NA/NA'
  - 'WellCompletion/Drill and Destroy/NA/NA'
  - 'WellCompletion/Modification or Repair/Production or Monitoring/NA'

- Planned use (`PLANNEDUSEFORMERUSE`)

- Completed depth (`TOTALCOMPLETEDDEPTH`)

In [None]:
# Keep only records describing a newly completed well (exact RECORDTYPE match).
wellcompletion_subset_df = wellcompletion_subset_df.loc[
    lambda wells: wells['RECORDTYPE'] == 'WellCompletion/New/Production or Monitoring/NA'
].copy()

In [None]:
# Normalise the free-text planned use into a few categories, then keep only
# agriculture, domestic, public and industrial wells.
# NOTE(review): agriculture is reportedly also recorded as "AG"; the pattern
# below only matches "agri" / "irrigation", so a bare "ag" falls into "Other".
planned_use = wellcompletion_subset_df['PLANNEDUSEFORMERUSE'].fillna("").str.lower()
# np.select takes the first matching condition, so the order below sets the
# priority when a description mentions several uses.
wellcompletion_subset_df['PLANNEDUSEFORMERUSE'] = np.select(
    [
        planned_use.str.contains("agri|irrigation"),
        planned_use.str.contains("domestic"),
        planned_use.str.contains("indus|commerc"),
        planned_use.str.contains("public"),
    ],
    ["Agriculture", "Domestic", "Industrial", "Public"],
    default="Other",
)
# The categories live in PLANNEDUSEFORMERUSE (there is no "use" column).
wellcompletion_subset_df = wellcompletion_subset_df[
    wellcompletion_subset_df['PLANNEDUSEFORMERUSE'].isin(["Agriculture", "Domestic", "Public", "Industrial"])
].copy()

In [None]:
# Convert the completed depth to a number; unparseable values become NaN.
wellcompletion_subset_df['TOTALCOMPLETEDDEPTH'] = pd.to_numeric(wellcompletion_subset_df['TOTALCOMPLETEDDEPTH'], errors="coerce")
# Depths below 20 are treated as incorrect and blanked out in a separate
# column, so the raw numeric value stays available.
# NOTE(review): the threshold of 20 is inherited from the original cell - confirm it.
wellcompletion_subset_df['TOTALCOMPLETEDDEPTH_NEW'] = wellcompletion_subset_df['TOTALCOMPLETEDDEPTH'].where(
    wellcompletion_subset_df['TOTALCOMPLETEDDEPTH'] >= 20
)

In [None]:
# Convert "date work ended" to datetime (unparseable values become NaT) and keep
# only dates that are possible, i.e. not in the future.
wellcompletion_subset_df['DATEWORKENDED'] = pd.to_datetime(wellcompletion_subset_df['DATEWORKENDED'], errors='coerce')
# Series.where keeps the datetime dtype (future dates become NaT), which the
# .dt accessor below requires; pd.Timestamp.now() avoids needing a datetime import.
wellcompletion_subset_df['DATEWORKENDED_ADJUSTED'] = wellcompletion_subset_df['DATEWORKENDED'].where(
    wellcompletion_subset_df['DATEWORKENDED'] < pd.Timestamp.now()
)
# create simple year and month columns
wellcompletion_subset_df['YEAR_WORK_ENDED'] = wellcompletion_subset_df['DATEWORKENDED_ADJUSTED'].dt.year
wellcompletion_subset_df['MONTH_WORK_ENDED'] = wellcompletion_subset_df['DATEWORKENDED_ADJUSTED'].dt.month

In [None]:
# Clean longitude/latitude data.
# The subset frame only carries DECIMALLONGITUDE / DECIMALLATITUDE, so the
# LONGITUDE / LATITUDE columns used downstream are derived from them here:
# stray "/" characters are removed from string values and the result is
# coerced to numbers (values that still cannot be parsed become NaN).
wellcompletion_subset_df['LONGITUDE'] = pd.to_numeric(
    wellcompletion_subset_df['DECIMALLONGITUDE'].replace(to_replace=r'\/', value='', regex=True),
    errors='coerce',
)
wellcompletion_subset_df['LATITUDE'] = pd.to_numeric(
    wellcompletion_subset_df['DECIMALLATITUDE'].replace(to_replace=r'\/', value='', regex=True),
    errors='coerce',
)

In [None]:
# Build the wells GeoDataFrame: one point per well from LONGITUDE/LATITUDE,
# drop wells without coordinates, then set the coordinate reference system
# (the projection that defines the axes for the points).
wellcompletion_subset_gdf = (
    gpd.GeoDataFrame(
        wellcompletion_subset_df,
        geometry=gpd.points_from_xy(
            wellcompletion_subset_df['LONGITUDE'],
            wellcompletion_subset_df['LATITUDE'],
        ),
    )
    .dropna(subset=['LATITUDE','LONGITUDE'])
    .set_crs('epsg:4326')
)

In [None]:
# Spatial join based on geometry: attach the PLSS attributes of the polygon
# each well point intersects. The PLSS GeoDataFrame loaded earlier in this
# notebook is named `plss_df`.
wellcompletion_subset_plss = wellcompletion_subset_gdf.sjoin(plss_df, how="left")

# Drop the wells that matched no PLSS polygon, i.e. the ones that aren't in
# the San Joaquin Valley basin.
wellcompletion_subset_plss = wellcompletion_subset_plss.dropna(subset=['MTRS'])

In [None]:
# Spot-check one randomly chosen joined row (no random_state, so it varies per run).
wellcompletion_subset_plss.sample()
#wellcompletion_subset_plss['geometry'].explore()

In [None]:
# Save cleaned data to folder (written without the index column).
# NOTE(review): this writes to ./data while inputs are read from ../assets and
# /work/assets - confirm the target directory exists and is the intended one.
wellcompletion_subset_plss.to_csv("./data/well_completion.csv", index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b042e2da-6536-449d-95b8-d85fa08825de' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>