# California Well Completion Reports Datasets

Related links:
* For documentation on this dataset — its source, how to download it, and the features of interest — see our [Well Completion Reports Dataset](/doc/assets/well_completion_reports.md) documentation.


In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import json
import requests
import pandas as pd
import numpy as np
from datetime import datetime
import geopandas as gpd
import pygeos

import altair as alt



In [None]:
import sys
import os

# NOTE: temporary workaround -- remove once the library path is configured.
# Makes the sibling "lib" directory of the parent folder importable so the
# project-local module below can be found.
module_path = os.path.abspath(os.path.join('..'))
# Build the path portably (the previous hard-coded "\\lib" only worked on
# Windows) and test the exact entry that gets appended, so re-running this
# cell does not add duplicates to sys.path.
lib_path = os.path.join(module_path, "lib")
if lib_path not in sys.path:
    sys.path.append(lib_path)
from wellcompletionreports import WellCompletionReportsDataset

In [None]:
# Dataset instance shared by the cells below, which read its `data_df`
# attribute.
# This will be stored in the class
wcr_instance = WellCompletionReportsDataset()

In [None]:
def get_plss_and_wellcompletion():
    """Load the PLSS map and the well completion records from a new dataset.

    Returns:
        A copy of the dataset's ``data_df`` with its first column dropped.
    """
    # initialize an instance
    dataset = WellCompletionReportsDataset()

    # load the plss shapefile (these only include TRS areas that are within
    # the San Joaquin subbasin)
    SJ_subbasin_plss = dataset.map_df
    # aggregate by TownshipRange
    # NOTE(review): neither PLSS frame is returned or used afterwards, although
    # the function name suggests it should expose them -- confirm the intended
    # return value before relying on this function for the PLSS data.
    SJ_subbasin_plss_range = SJ_subbasin_plss.dissolve(by='TownshipRange').reset_index()

    # Drop the first column (presumably a saved index column -- TODO confirm).
    wellcompletion_df = dataset.data_df.iloc[:, 1:].copy()

    return wellcompletion_df



In [None]:
#SJ_subbasin_plss.explore()

In [None]:
#SJ_subbasin_plss_range.explore()

### Well Completion data

[Source](https://data.cnra.ca.gov/dataset/well-completion-reports)

In [None]:
def draw_mising_data_chart(df):
    """Draw a bar chart of the fraction of missing values per column of *df*.

    Args:
        df: DataFrame whose columns are inspected for null values.

    Returns:
        A layered Altair chart: orange bars, one per column and sorted from
        most to least missing, overlaid with a percentage label per bar.
    """
    # Fraction of nulls per column, computed from the frame that was passed in
    # (the previous version divided by the length of an unrelated global).
    # An empty frame yields NaN means, which are reported as 0% missing.
    percent_missing = df.isnull().mean().fillna(0.0)
    missing_value_df = pd.DataFrame({'column_name': df.columns,
                                     'percent_missing': percent_missing})
    missing_value_df = missing_value_df.sort_values('percent_missing', ascending=False)

    # Explicit sort order so the x axis follows the descending ranking.
    sort_list = list(missing_value_df['column_name'])
    chart = alt.Chart(missing_value_df).mark_bar().encode(
        y=alt.Y("sum(percent_missing)", stack="normalize", axis=alt.Axis(format='%')),
        x=alt.X('column_name:N', sort=sort_list),
        color=alt.value("orange"),
        tooltip=['column_name', 'percent_missing'],
    )

    # Place each label slightly above its bar. percent_missing is never
    # negative, so a constant offset is enough; the previous sign term
    # (value / abs(value)) evaluated to 0/0 for columns with nothing missing.
    text = chart.transform_calculate(
        position='datum.percent_missing + 0.05'
    ).mark_text(
        align='center',
        fontSize=10,
        color='black'
    ).encode(
        y='position:Q',
        text=alt.Text('percent_missing:Q', format='.0%'),
    )

    return chart + text


In [None]:
# Chart the share of missing values for each feature we are interested in.
draw_mising_data_chart(
    wcr_instance.data_df[
        [
            "DECIMALLATITUDE",
            "DECIMALLONGITUDE",
            "TOWNSHIP",
            "RANGE",
            "SECTION",
            "WELLLOCATION",
            "CITY",
            "COUNTYNAME",
            "BOTTOMOFPERFORATEDINTERVAL",
            "TOPOFPERFORATEDINTERVAL",
            "GROUNDSURFACEELEVATION",
            "STATICWATERLEVEL",
            "RECORDTYPE",
            "PLANNEDUSEFORMERUSE",
            "WCRNUMBER",
            "TOTALDRILLDEPTH",
            "TOTALCOMPLETEDDEPTH",
            "DATEWORKENDED",
            'WELLYIELD',
            'WELLYIELDUNITOFMEASURE',
        ]
    ]
)

In [None]:
# Raw report columns retained by clean_well_completion_reports().
_WCR_COLUMNS_OF_INTEREST = [
    "DECIMALLATITUDE", "DECIMALLONGITUDE", "TOWNSHIP", "RANGE", "SECTION",
    "WELLLOCATION", "CITY", "COUNTYNAME",
    "BOTTOMOFPERFORATEDINTERVAL", "TOPOFPERFORATEDINTERVAL",
    "GROUNDSURFACEELEVATION", "STATICWATERLEVEL",
    "RECORDTYPE", "PLANNEDUSEFORMERUSE", "WCRNUMBER", "TOTALDRILLDEPTH",
    "TOTALCOMPLETEDDEPTH", "DATEWORKENDED", "WELLYIELD",
    "WELLYIELDUNITOFMEASURE",
]

# Record type of the rows kept (new well completions).
_NEW_WELL_RECORD_TYPE = 'WellCompletion/New/Production or Monitoring/NA'


def _normalize_coordinates(df):
    """Return a copy of *df* with numeric, sign-corrected coordinates, dropping rows that lack either one."""
    df = df.copy()
    # Unparseable values (e.g. the corrupt "37/41/11.82/") become NaN and are
    # removed by the dropna below.
    lat = pd.to_numeric(df['DECIMALLATITUDE'], errors='coerce')
    lon = pd.to_numeric(df['DECIMALLONGITUDE'], errors='coerce')
    # Correct incorrectly signed values (example: a longitude of 120.54483):
    # longitudes are forced non-positive and latitudes non-negative.
    df['DECIMALLONGITUDE'] = lon.mask(lon > 0, -lon)
    df['DECIMALLATITUDE'] = lat.mask(lat < 0, -lat)
    return df.dropna(subset=['DECIMALLATITUDE', 'DECIMALLONGITUDE'])


def _classify_well_use(use):
    """Map free-text use values to Agriculture/Domestic/Industrial/Public/Other."""
    use = use.fillna("").astype(str).str.lower()
    # Order matters: the first matching pattern wins.
    # NOTE(review): the original comment says agriculture is also denoted by
    # "AG", but a bare "ag" does not match "agri|irrigation" -- confirm whether
    # such rows should be classified as Agriculture.
    conditions = [
        use.str.contains("agri|irrigation").to_numpy(dtype=bool),
        use.str.contains("domestic").to_numpy(dtype=bool),
        use.str.contains("indus|commerc").to_numpy(dtype=bool),
        use.str.contains("public").to_numpy(dtype=bool),
    ]
    labels = ["Agriculture", "Domestic", "Industrial", "Public"]
    return np.select(conditions, labels, default="Other")


def clean_well_completion_reports(data_df=None):
    """Clean the raw well completion reports and keep the rows of interest.

    Steps: keep the columns of interest; convert coordinates to numbers, fix
    their signs and drop rows missing either coordinate; rename a few
    columns; keep only new well completions; classify the well use and keep
    agriculture, domestic, public and industrial wells; add corrected depth
    and date columns plus year and month columns.

    Args:
        data_df: Raw reports DataFrame. Defaults to ``wcr_instance.data_df``.
            The input frame is not modified.

    Returns:
        A new DataFrame with columns LATITUDE, LONGITUDE, USE and COUNTY
        (renamed from the raw names), the remaining columns of interest, and
        the derived columns TOTALCOMPLETEDDEPTH_CORRECTED,
        DATEWORKENDED_CORRECTED, YEARWORKENDED and MONTHWORKENDED.
    """
    if data_df is None:
        data_df = wcr_instance.data_df

    # About 5% of the rows have either latitude or longitude missing; they are
    # dropped together with rows whose coordinates cannot be parsed.
    wells = _normalize_coordinates(data_df[_WCR_COLUMNS_OF_INTEREST])

    wells = wells.rename(columns={
        "DECIMALLATITUDE": "LATITUDE",
        "DECIMALLONGITUDE": "LONGITUDE",
        "PLANNEDUSEFORMERUSE": "USE",
        "COUNTYNAME": "COUNTY",
    })

    # filter to only include new well completion since we predict on this
    # (.copy() so the column assignments below do not write into a slice)
    wells = wells[wells['RECORDTYPE'] == _NEW_WELL_RECORD_TYPE].copy()

    # filter to only include agriculture, domestic, public or industrial wells
    wells['USE'] = _classify_well_use(wells['USE'])
    wells = wells[wells['USE'].isin(["Agriculture", "Domestic", "Public", "Industrial"])].copy()

    # Depths below 20 (or unparseable) are blanked in the corrected column.
    depth = pd.to_numeric(wells['TOTALCOMPLETEDDEPTH'], errors="coerce")
    wells['TOTALCOMPLETEDDEPTH'] = depth
    wells['TOTALCOMPLETEDDEPTH_CORRECTED'] = depth.where(depth >= 20)

    # Convert date work ended to datetime; the corrected column keeps only
    # dates that are possible (not in the future).
    ended = pd.to_datetime(wells['DATEWORKENDED'], errors='coerce')
    wells['DATEWORKENDED'] = ended
    ended_corrected = ended.where(ended < pd.Timestamp.now())
    wells['DATEWORKENDED_CORRECTED'] = ended_corrected

    # create simple year and month columns
    wells['YEARWORKENDED'] = ended_corrected.dt.year
    wells['MONTHWORKENDED'] = ended_corrected.dt.month

    return wells

In [None]:
def merge_data_plss(wellcompletion_subset_df=None, plss_gdf=None):
    """Attach PLSS attributes to the cleaned wells with a spatial join.

    Args:
        wellcompletion_subset_df: Cleaned wells with LONGITUDE and LATITUDE
            columns. Defaults to the result of
            ``clean_well_completion_reports()``.
        plss_gdf: PLSS GeoDataFrame providing an ``MTRS`` column. Defaults to
            ``wcr_instance.map_df``.

    Returns:
        GeoDataFrame of the wells that matched a PLSS area, i.e. rows whose
        ``MTRS`` is not null after the left join.
    """
    # The previous version read `self.wellcompletion_subset_df` and a global
    # `SJ_subbasin_plss`, neither of which exists at this scope.
    if wellcompletion_subset_df is None:
        wellcompletion_subset_df = clean_well_completion_reports()
    if plss_gdf is None:
        plss_gdf = wcr_instance.map_df

    df = wellcompletion_subset_df
    # create wells geodataframe; the CRS is the projection that denotes the
    # axes for the points
    # NOTE(review): assumes plss_gdf uses the same CRS -- confirm, since the
    # join is not reprojected here.
    wellcompletion_subset_gdf = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df.LONGITUDE, df.LATITUDE),
        crs='epsg:4326',
    )

    # spatial join based on geometry
    wellcompletion_subset_plss = wellcompletion_subset_gdf.sjoin(plss_gdf, how="left")

    # drop the ones that aren't in the san joaquin valley basin
    wellcompletion_subset_plss = wellcompletion_subset_plss.dropna(subset=['MTRS'])

    return wellcompletion_subset_plss

In [None]:
#len(wellcompletion_subset_df)/ len(wcr_instance.data_df) # Cleaned dataset =  0.43264636176071297  of original

In [None]:
# Run the spatial join and keep the result: the name was previously never
# assigned at notebook scope, so this cell raised a NameError.
wellcompletion_subset_plss = merge_data_plss()
wellcompletion_subset_plss.shape  # (105987, 34) on a previous run

In [None]:
# Fraction of the original records that remain after the PLSS join
# (San Joaquin dataset = 0.10161433206267881 of original on a previous run).
len(wellcompletion_subset_plss)/ len(wcr_instance.data_df)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b042e2da-6536-449d-95b8-d85fa08825de' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>