MASSACHUSETTS INSTITUTE OF TECHNOLOGY<br>
SYSTEM DESIGN & MANAGEMENT<br>
Author: R. Chadwick Holmes<br>
Date: December 5, 2021<br><br>

Script Purpose:<br>
This script reads in the well data set, expands it by adding 4 or 8 pseudo-wells around every well at neighboring locations offset by a fixed distance (nominally 0.01 degrees of latitude and/or longitude), and saves the new "expanded" data sets for use in modeling.
<br><br>


In [1]:
!apt update
!apt upgrade
!apt install gdal-bin python-gdal python3-gdal 
# Install rtree - Geopandas requirment
!apt install python3-rtree 
# Install Geopandas
!pip install git+git://github.com/geopandas/geopandas.git
# Install descartes - Geopandas requirment
!pip install descartes cartopy
!pip uninstall -y shapely
!pip install shapely --no-binary shapely
!pip install dataprep --no-binary dataprep

[33m0% [Working][0m            Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Ign:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [696 B]
Hit:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]
Hit:9 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:10 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:11 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:12 http://ppa.

### Mount Google Drive

In [2]:
# Attach Google Drive to the Colab runtime so files under it can be read and written.
from google.colab import drive
drive.mount('/content/drive')
# Drive folder used by the later cells for the input CSV and the saved outputs.
path = r'/content/drive/MyDrive/Colab Notebooks/Data'

Mounted at /content/drive


### Load Key Libraries

In [3]:
import numpy as np
import pandas as pd  # provides interface for interacting with tabular data
import geopandas as gpd  # combines the capabilities of pandas and shapely for geospatial operations
from shapely.geometry import Point, Polygon, MultiPolygon  # for manipulating text data into geospatial shapes
from shapely import wkt  # stands for "well known text," allows for interchange across GIS programs
import rtree  # supports geospatial join

from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.preprocessing import StandardScaler, PowerTransformer

from pathlib import Path
import pickle as pkl

import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import ListedColormap

import seaborn as sns
#from dataprep.eda import plot, plot_correlation, plot_missing, create_report

### Load data into pandas dataframe

In [11]:
# Folder holding the input CSV; later cells also write their outputs here.
datafolder = Path(path)

# Read the well table and report its (rows, columns) size.
wfile = r'SMU_wells_clipped2extent.csv'
welldf = pd.read_csv(datafolder.joinpath(wfile))
welldf.shape

(1148, 31)

### Generate new well file with additional points (N, S, E, W)

In [12]:
# keep only rows where latitude, longitude and geothermal gradient are all present
has_required = welldf[['Latitude','Longitude','GeothermGrad']].notna().all(axis=1)
welldf = welldf[has_required]
welldf.shape

(1117, 31)

In [13]:
# Order rows by location and then gradient (all descending) so that, within each
# (Latitude, Longitude) pair, the highest gradient comes first; then keep only
# that first row per location.
welldf_sorted = welldf.sort_values(
    by=['Latitude','Longitude','GeothermGrad'],
    ascending=False,
)
welldf = welldf_sorted.drop_duplicates(
    subset=['Latitude','Longitude'],
    keep='first',
)
welldf.shape

(812, 31)

In [14]:
# Show the column names of the cleaned well table.
welldf.columns.tolist()

['OID_',
 'id',
 'gid',
 'Latitude',
 'Longitude',
 'area_or_ams_map',
 'twn_rng_sec',
 'county',
 'state',
 'tect_prov',
 'location_notes',
 'meas_date',
 'drill_date',
 'collar_elevation',
 'drill_depth',
 'water_table',
 'surf_temp',
 'min_temp',
 'max_temp',
 'bot_temp',
 'interval_id',
 'depth',
 'bottom',
 'geoth_grad',
 'geoth_grad_sym',
 'geoth_grad_std_err',
 'cgeoth_grad',
 'cgeoth_grad_sym',
 'cgeoth_grad_std_err',
 'GeothermGrad',
 'smu_id']

In [15]:
# Preserve each well's original coordinates in extra columns, so they remain
# available after Latitude/Longitude are shifted for the pseudo-wells.
welldf = welldf.assign(
    Latitude0=welldf['Latitude'],
    Longitude0=welldf['Longitude'],
)

In [16]:
def addnewrow(df,rawrow,pluslat,pluslong):
    """Return ``df`` extended by one row: ``rawrow`` with shifted coordinates.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to extend. It is not modified; a new frame is returned.
    rawrow : pandas.Series
        Row to duplicate. Must contain ``Latitude`` and ``Longitude`` entries.
        It is copied first, so the caller's Series is left unchanged.
    pluslat, pluslong : float
        Amounts added to ``Latitude`` and ``Longitude`` respectively.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the shifted row. The new row's index label is the
        name of ``rawrow``.
    """
    shifted = rawrow.copy()
    shifted['Latitude'] += pluslat
    shifted['Longitude'] += pluslong
    # DataFrame.append was removed in pandas 2.0; concatenating a one-row frame
    # is the supported equivalent and keeps the row's name as its index label.
    return pd.concat([df, pd.DataFrame([shifted])])

In [17]:
# Take an independent copy of the cleaned wells with a fresh 0..n-1 index.
outdf = welldf.reset_index(drop=True).copy(deep=True)
ind = outdf.index.copy(deep=True)
print(outdf.shape)

#### write out for use with ArcGIS
print('saving original data...')
outdf.to_csv(datafolder.joinpath('well_locs_orig.csv'))

(812, 33)
saving original data...


In [18]:
# (dlat, dlon) offsets in degrees for the four pseudo-wells placed
# north, south, east and west of every original well.
step = 0.01
offsets = [(step, 0.00), (-step, 0.00), (0.00, step), (0.00, -step)]

outdf = welldf.copy(deep=True).reset_index(drop=True)
ind = outdf.index.copy(deep=True)
print(outdf.shape)

# Build all pseudo-wells in one vectorized pass instead of appending row by row
# (DataFrame.append was removed in pandas 2.0, and growing a frame in a loop is
# quadratic). Each well's row is repeated once per offset, so the result keeps
# the order "well 0: N, S, E, W; well 1: N, S, E, W; ..." and every pseudo-well
# carries the index label of the well it was derived from.
pseudo = outdf.loc[ind.repeat(len(offsets))].copy()
pseudo['Latitude'] += np.tile([dlat for dlat, _ in offsets], len(ind))
pseudo['Longitude'] += np.tile([dlon for _, dlon in offsets], len(ind))
outdf = pd.concat([outdf, pseudo])

# Every original well must contribute itself plus one row per offset.
assert len(outdf) == len(ind) * (1 + len(offsets))

#### write out for use with ArcGIS
print(outdf.shape)
print('saving data with 4 new pseudowells...')
outdf.to_csv(datafolder / 'well_locs_plus4.csv')

(812, 33)
(4060, 33)
saving data with 4 new pseudowells...


In [19]:
# (dlat, dlon) offsets in degrees for the eight pseudo-wells around every
# original well: N, S, E, W first, then the four diagonals.
step = 0.01
offsets = [
    (step, 0.00), (-step, 0.00), (0.00, step), (0.00, -step),
    (step, step), (-step, step), (step, -step), (-step, -step),
]

outdf = welldf.copy(deep=True).reset_index(drop=True)
ind = outdf.index.copy(deep=True)
print(outdf.shape)

# Build all pseudo-wells in one vectorized pass instead of appending row by row
# (DataFrame.append was removed in pandas 2.0, and growing a frame in a loop is
# quadratic). Each well's row is repeated once per offset, so the result keeps
# the per-well ordering of the offsets above and every pseudo-well carries the
# index label of the well it was derived from.
pseudo = outdf.loc[ind.repeat(len(offsets))].copy()
pseudo['Latitude'] += np.tile([dlat for dlat, _ in offsets], len(ind))
pseudo['Longitude'] += np.tile([dlon for _, dlon in offsets], len(ind))
outdf = pd.concat([outdf, pseudo])

# Every original well must contribute itself plus one row per offset.
assert len(outdf) == len(ind) * (1 + len(offsets))

#### write out for use with ArcGIS
print(outdf.shape)
print('saving data with 8 new pseudowells...')
outdf.to_csv(datafolder / 'well_locs_plus8.csv')

(812, 33)
(7308, 33)
saving data with 8 new pseudowells...
