In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import json
import requests
import pandas as pd
import numpy as np
from datetime import datetime
import geopandas as gpd
import pygeos

import altair as alt

In [None]:
# Read the PLSS layer; per the data prep, it only contains TRS areas that fall
# inside the San Joaquin subbasin.
SJ_subbasin_plss = gpd.read_file("assets/clean_data/plss_subbasin.geojson")

# Collapse the layer to one row per TownshipRange, then move TownshipRange
# back from the index into a regular column.
SJ_subbasin_plss_range = SJ_subbasin_plss.dissolve(by='TownshipRange')
SJ_subbasin_plss_range = SJ_subbasin_plss_range.reset_index()

In [None]:
#SJ_subbasin_plss.explore()

In [None]:
#SJ_subbasin_plss_range.explore()

### Well Completion data

[Source](https://data.cnra.ca.gov/dataset/well-completion-reports)

In [None]:
# Load the well completion reports export.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding a mix of parsed
# numbers and raw strings (the situation pandas reports as a mixed-type
# DtypeWarning when reading large files with the chunked default).
wellcompletion_df = pd.read_csv(r"assets/clean_data/wellcompletion.csv", low_memory=False)
# Discard the first column (position 0) and keep everything after it.
# NOTE(review): presumably this is the index column written by an earlier to_csv -- confirm.
wellcompletion_df = wellcompletion_df.iloc[:, 1:].copy()

  wellcompletion_df = pd.read_csv(r"assets/clean_data/wellcompletion.csv")


In [None]:
# Show (rows, columns) of the loaded table after the first column was dropped.
# NOTE(review): the arithmetic in the trailing comment is not explained in this
# notebook; presumably a row-count difference against another extract -- confirm.
wellcompletion_df.shape #1043032 - 988396  = 54636

(1043032, 46)

In [None]:
# Coordinates can arrive as text because some entries are corrupt (e.g. "37/41/11.82/").
# pd.to_numeric(errors="coerce") turns every unparseable value into NaN -- not only
# the ones containing "/" -- and also works when a column is already numeric, so a
# single dropna removes both the missing and the corrupt coordinates.
wellcompletion_df = wellcompletion_df.assign(
    DECIMALLATITUDE=pd.to_numeric(wellcompletion_df["DECIMALLATITUDE"], errors="coerce").astype("float"),
    DECIMALLONGITUDE=pd.to_numeric(wellcompletion_df["DECIMALLONGITUDE"], errors="coerce").astype("float"),
).dropna(subset=["DECIMALLATITUDE", "DECIMALLONGITUDE"])

# Columns of interest for the rest of the notebook (order is preserved in the output frame).
columns_of_interest = [
    "DECIMALLATITUDE", "DECIMALLONGITUDE", "TOWNSHIP", "RANGE", "SECTION", "WELLLOCATION", "CITY", "COUNTYNAME",
    "BOTTOMOFPERFORATEDINTERVAL", "TOPOFPERFORATEDINTERVAL", "GROUNDSURFACEELEVATION", "STATICWATERLEVEL",
    "RECORDTYPE", "PLANNEDUSEFORMERUSE", "WCRNUMBER", "TOTALDRILLDEPTH",
    "TOTALCOMPLETEDDEPTH", "DATEWORKENDED", "CASINGDIAMETER",
]

# Select the subset and shorten a few column names. rename() returns a new frame,
# so no in-place mutation is needed and re-running the cell gives the same result.
wellcompletion_subset_df = wellcompletion_df[columns_of_interest].rename(
    columns={
        "DECIMALLATITUDE": "LATITUDE",
        "DECIMALLONGITUDE": "LONGITUDE",
        "PLANNEDUSEFORMERUSE": "USE",
        "COUNTYNAME": "COUNTY",
    }
)

In [None]:
# Keep only new production/monitoring well completions, since those are the
# records we predict on.
is_new_completion = wellcompletion_subset_df['RECORDTYPE'] == 'WellCompletion/New/Production or Monitoring/NA'
# .copy() yields an independent frame, so the column assignments in later cells
# do not operate on a slice of the previous frame (avoids SettingWithCopyWarning).
wellcompletion_subset_df = wellcompletion_subset_df.loc[is_new_completion].copy()

In [None]:
# Normalise the free-text USE field into Agriculture / Domestic / Industrial / Public
# and keep only wells that fall into one of those four categories.
# Data issue: Agriculture is also denoted by "AG".
# NOTE(review): a bare "ag" value is not matched by any pattern below and therefore
# ends up as "Other" and is dropped -- confirm whether that is intended.
use_text = wellcompletion_subset_df['USE'].fillna("").str.lower()

# Ordered (regex, label) rules. np.select picks the first condition that is True,
# which reproduces the precedence Agriculture > Domestic > Industrial > Public.
use_rules = [
    ("agri|irrigation", "Agriculture"),
    ("domestic", "Domestic"),
    ("indus|commerc", "Industrial"),
    ("public", "Public"),
]
kept_uses = [label for _, label in use_rules]

wellcompletion_subset_df['USE'] = np.select(
    [use_text.str.contains(pattern) for pattern, _ in use_rules],
    kept_uses,
    default="Other",
)

# Drop everything classified as "Other"; .copy() so later cells can add columns
# to this frame without triggering SettingWithCopyWarning.
wellcompletion_subset_df = wellcompletion_subset_df[wellcompletion_subset_df["USE"].isin(kept_uses)].copy()

In [None]:
# Show the column labels of the working subset (after the earlier rename).
wellcompletion_subset_df.columns

Index(['LATITUDE', 'LONGITUDE', 'TOWNSHIP', 'RANGE', 'SECTION', 'WELLLOCATION',
       'CITY', 'COUNTY', 'BOTTOMOFPERFORATEDINTERVAL',
       'TOPOFPERFORATEDINTERVAL', 'GROUNDSURFACEELEVATION', 'STATICWATERLEVEL',
       'RECORDTYPE', 'USE', 'WCRNUMBER', 'TOTALDRILLDEPTH',
       'TOTALCOMPLETEDDEPTH', 'DATEWORKENDED', 'CASINGDIAMETER'],
      dtype='object')

In [None]:
# Force completed depth to a numeric dtype; anything that cannot be parsed becomes NaN.
raw_completed_depth = wellcompletion_subset_df['TOTALCOMPLETEDDEPTH']
wellcompletion_subset_df['TOTALCOMPLETEDDEPTH'] = pd.to_numeric(raw_completed_depth, errors="coerce")

In [None]:
# Inspect wells whose completed depth is 20 or less.
# NOTE(review): this view uses <= 20, while the correction step further down keeps
# depths >= 20, so wells at exactly 20 appear here but are not blanked out there.
wellcompletion_subset_df[wellcompletion_subset_df['TOTALCOMPLETEDDEPTH'] <= 20]#889 rows 

Unnamed: 0,LATITUDE,LONGITUDE,TOWNSHIP,RANGE,SECTION,WELLLOCATION,CITY,COUNTY,BOTTOMOFPERFORATEDINTERVAL,TOPOFPERFORATEDINTERVAL,GROUNDSURFACEELEVATION,STATICWATERLEVEL,RECORDTYPE,USE,WCRNUMBER,TOTALDRILLDEPTH,TOTALCOMPLETEDDEPTH,DATEWORKENDED,CASINGDIAMETER
825,37.865833,-122.297778,01S,04W,03,211 FIFTH ST.,BERKELEY,Alameda,,,,8.0,WellCompletion/New/Production or Monitoring/NA,Domestic,WCR2004-002682,,18.0,6/30/2004,
2648,37.772190,-122.253530,02S,04W,12,SAME AS ABOVE,,Alameda,20.0,,,9.0,WellCompletion/New/Production or Monitoring/NA,Agriculture,WCR0018634,,19.0,2/24/1977,4.0
4225,37.614410,-121.851110,04S,01E,03,3400 HAPPY VALLEY RD,SUNOL,Alameda,235.0,100.0,,62.0,WellCompletion/New/Production or Monitoring/NA,Domestic,WCR1998-001528,,20.0,5/27/1998,6.0
7204,37.673700,-122.143330,03S,03W,13,,,Alameda,18.0,9.0,,11.0,WellCompletion/New/Production or Monitoring/NA,Agriculture,WCR0272435,,18.0,,4.0
10944,37.769380,-122.238150,02S,03W,07,,,Alameda,17.0,9.0,,8.0,WellCompletion/New/Production or Monitoring/NA,Agriculture,WCR0324980,,17.0,8/23/1978,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1022179,38.646111,-121.599167,09N,03E,12,,,Yolo,,,,,WellCompletion/New/Production or Monitoring/NA,Agriculture,WCR2002-006614,,8.0,4/10/2002,6.0
1027449,38.715560,-122.054260,10N,02W,14,,CAPAY,Yolo,17.0,13.0,,,WellCompletion/New/Production or Monitoring/NA,Domestic,WCR1959-000545,,17.0,7/27/1959,6.0
1032108,39.129598,-121.493184,15N,04E,23,6019 MOON AVE,,Yuba,,,,30.0,WellCompletion/New/Production or Monitoring/NA,Domestic,WCR1999-000784,,17.0,6/15/1999,9.0
1036648,39.164250,-121.605190,15N,03E,11,,MARYSVILLE,Yuba,,,,,WellCompletion/New/Production or Monitoring/NA,Domestic,WCR1957-000678,,17.0,5/15/1957,18.0


In [None]:
# Blank out completed depths below 20 (NaN depths stay NaN). Series.where keeps
# values where the condition holds and writes NaN elsewhere, replacing a row-wise apply.
completed_depth = wellcompletion_subset_df['TOTALCOMPLETEDDEPTH']
wellcompletion_subset_df['TOTALCOMPLETEDDEPTH_CORRECTED'] = completed_depth.where(completed_depth >= 20)

# Parse the completion date once; unparseable values become NaT.
wellcompletion_subset_df['DATEWORKENDED'] = pd.to_datetime(wellcompletion_subset_df['DATEWORKENDED'], errors='coerce')

# A completion date in the future is not possible, so keep only dates strictly
# before "now". The reference time is taken once so every row is compared
# against the same instant.
reference_time = datetime.now()
work_ended = wellcompletion_subset_df['DATEWORKENDED']
wellcompletion_subset_df['DATEWORKENDED_CORRECTED'] = work_ended.where(work_ended < reference_time)

# Simple year and month columns derived from the corrected date (NaN where the date is missing).
wellcompletion_subset_df['YEARWORKENDED'] = wellcompletion_subset_df['DATEWORKENDED_CORRECTED'].dt.year
wellcompletion_subset_df['MONTHWORKENDED'] = wellcompletion_subset_df['DATEWORKENDED_CORRECTED'].dt.month

In [None]:
# Build one point per well from its coordinates (x = LONGITUDE, y = LATITUDE).
well_points = gpd.points_from_xy(
    wellcompletion_subset_df.LONGITUDE,
    wellcompletion_subset_df.LATITUDE,
)
# Wrap the table in a GeoDataFrame and declare its coordinate reference system
# (the projection that defines the axes the points are expressed in).
wellcompletion_subset_gdf = gpd.GeoDataFrame(
    wellcompletion_subset_df,
    geometry=well_points,
).set_crs('epsg:4326')

In [None]:
# Attach PLSS attributes to each well by geometry. A left join keeps every well,
# including those that match no PLSS polygon.
wellcompletion_subset_plss = gpd.sjoin(
    wellcompletion_subset_gdf,
    SJ_subbasin_plss,
    how="left",
)

In [None]:
# Keep only wells that fall inside the San Joaquin valley basin: rows with a
# missing MTRS are the ones to discard.
has_mtrs = wellcompletion_subset_plss['MTRS'].notna()
wellcompletion_subset_plss = wellcompletion_subset_plss[has_mtrs]

In [None]:
# Show (rows, columns) of the joined table after rows with a missing MTRS were dropped.
wellcompletion_subset_plss.shape

(105987, 33)

In [None]:
# Write the cleaned, spatially joined wells to disk; index=False leaves the
# row index out of the file.
wellcompletion_subset_plss.to_csv(r"assets/clean_data/well_completion_clean.csv", index=False)

In [None]:
#wellcompletion_subset_plss.TownshipRange.unique()

array(['T02S R04E', 'T02S R03E', 'T01S R04E', 'T01S R03E', 'T03S R04E',
       'T03S R11E', 'T07N R09E', 'T05N R09E', 'T05N R10E', 'T06N R09E',
       'T06N R10E', 'T04N R11E', 'T04N R10E', 'T08S R12E', 'T03N R10E',
       'T04N R09E', 'T04S R13E', 'T03N R09E', 'T02N R10E', 'T03N R04E',
       'T01S R09E', 'T01N R09E', 'T02N R09E', 'T01N R02E', 'T02N R03E',
       'T03N R03E', 'T01N R03E', 'T02N R02E', 'T02N R01E', 'T01N R04E',
       'T02N R04E', 'T01S R02E', 'T04S R05E', 'T02N R07E', 'T06N R08E',
       'T15S R13E', 'T14S R24E', 'T14S R22E', 'T12S R20E', 'T14S R17E',
       'T15S R19E', 'T14S R20E', 'T14S R23E', 'T17S R20E', 'T16S R20E',
       'T15S R24E', 'T13S R21E', 'T16S R23E', 'T14S R19E', 'T12S R21E',
       'T14S R21E', 'T13S R22E', 'T14S R16E', 'T13S R14E', 'T12S R22E',
       'T13S R19E', 'T11S R13E', 'T15S R23E', 'T15S R17E', 'T13S R20E',
       'T17S R21E', 'T14S R18E', 'T16S R19E', 'T15S R20E', 'T15S R21E',
       'T17S R19E', 'T14S R15E', 'T13S R15E', 'T13S R17E', 'T16S

In [None]:
# List the distinct COUNTY values among the retained wells.
# NOTE(review): the recorded output includes counties such as Humboldt and Ventura
# as well as NaN, even though rows were filtered by the spatial join; presumably
# the reported COUNTY field does not always agree with the coordinates -- confirm.
wellcompletion_subset_plss.COUNTY.unique()

array(['Alameda', 'Amador', 'Calaveras', 'Contra Costa', 'El Dorado',
       'Fresno', 'Humboldt', 'Inyo', 'Kern', 'Kings', 'Madera', 'Marin',
       'Mariposa', 'Merced', 'Monterey', 'Napa', 'Nevada', 'Placer',
       'Sacramento', 'San Benito', 'San Bernardino', 'San Joaquin',
       'San Luis Obispo', 'San Mateo', 'Santa Barbara', 'Santa Clara',
       'Solano', 'Sonoma', 'Stanislaus', 'Tulare', 'Trinity', 'Tuolumne',
       'Ventura', nan], dtype=object)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b042e2da-6536-449d-95b8-d85fa08825de' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>