In [None]:
import pandas as pd
import numpy as np
from glob import glob
import os
import shutil
import errno

def copyAnything(src, dst):
    """Copy ``src`` to ``dst`` whether it is a directory tree or a single file.

    A recursive ``shutil.copytree`` is tried first. When it fails with
    ``ENOTDIR`` or ``EINVAL`` the source is copied with ``shutil.copy``
    instead. Any other ``OSError`` (for example an already existing
    destination) propagates to the caller.
    """
    fileFallbackCodes = (errno.ENOTDIR, errno.EINVAL)
    try:
        shutil.copytree(src, dst)
    except OSError as copyError:
        if copyError.errno not in fileFallbackCodes:
            raise
        # src was not a directory: fall back to a plain file copy
        shutil.copy(src, dst)

def findWells(wellListFile, wellFolders):
    """Copy the well folders that also appear in a well list into ``wellSubset``.

    Parameters
    ----------
    wellListFile : str
        CSV file whose first column holds numeric API numbers. Each value is
        divided by 1e4 and truncated to an int before matching, i.e. the last
        four digits are dropped.
    wellFolders : str
        Directory whose immediate children are named by that truncated
        number. Children with non-numeric names are ignored.

    Side effects
    ------------
    Creates ``wellSubset`` in the current working directory if needed and
    copies every matching child of ``wellFolders`` into it. Children that
    already exist in ``wellSubset`` are left untouched.
    """
    bakkenWells = pd.read_csv(wellListFile)
    apiColumn = bakkenWells.columns[0]
    # blank cells would otherwise abort the whole run on int(nan)
    apiNums = {int(api / 1e4) for api in bakkenWells[apiColumn].dropna()}

    # truncated API number -> path of the folder that carries it
    # (no check for LAS content is made here)
    foldersByApi = {}
    for path in glob(os.path.join(wellFolders, '*')):
        try:
            foldersByApi[int(os.path.basename(path))] = path
        except ValueError:
            # not an API-numbered entry, so it can never match the list
            continue

    os.makedirs('wellSubset', exist_ok=True)

    for api in sorted(apiNums.intersection(foldersByApi)):
        source = foldersByApi[api]
        destination = os.path.join('wellSubset', os.path.basename(source))
        try:
            copyAnything(source, destination)
        except FileExistsError:
            # copied on an earlier run; keep what is already there
            pass
      
def checkForLAS(wellFolders):
    """Delete every entry under ``wellFolders`` that holds no ``.las`` file.

    An entry is kept when at least one of its direct children has the
    extension ``.las``, compared case-insensitively so ``LOG.LAS`` counts.
    Matching on the real extension (not on ``'.las'`` appearing anywhere in
    the path) keeps this destructive step from being fooled by names such as
    ``log.las.bak``.

    Removal is best-effort: a failing ``shutil.rmtree`` is reported on stdout
    and the scan continues.
    """
    for folder in glob(f'{wellFolders}/*'):
        files = glob(f'{folder}/*')
        hasLAS = any(os.path.splitext(name)[1].lower() == '.las' for name in files)
        if hasLAS:
            continue

        print(f'no LAS files found. removing {folder}')
        try:
            shutil.rmtree(folder)
        except OSError as err:
            print("Error: %s : %s" % (folder, err.strerror))
    
def parseLASv1(LAS):
    """Write the ``~A`` data section of a LAS file to a CSV next to it.

    Every non-blank line is split on whitespace. The first line whose leading
    token contains ``~A`` marks the start of the data:

    * if that line has more tokens, they are the column names;
    * if it is a bare ``~A``, the column names are taken from the line two
      rows above it, minus that line's first token.

    All following lines become the rows of a dataframe, which is written to
    the input path with its extension replaced by ``.csv``. Only the final
    extension is replaced, so dots elsewhere in the path are preserved.

    Raises
    ------
    ValueError
        If there is no ``~A`` line, or a bare ``~A`` has fewer than two lines
        above it. pandas raises its own ValueError when the row width does
        not match the header.
    """
    with open(LAS, 'r') as f:
        tokenised = (line.strip().split() for line in f.read().splitlines())
        # whitespace-only lines split to [], which has no first token to test
        lines = [tokens for tokens in tokenised if tokens]

    delimiter = next(
        (idx for idx, tokens in enumerate(lines) if '~A' in tokens[0]),
        None,
    )
    if delimiter is None:
        raise ValueError(f'no ~A section found in {LAS}')

    if len(lines[delimiter]) > 1:
        columnHeaders = lines[delimiter][1:]
    else:
        if delimiter < 2:
            # a negative index would silently wrap around to the data rows
            raise ValueError(f'no header row above the ~A line in {LAS}')
        columnHeaders = lines[delimiter - 2][1:]

    dataframe = pd.DataFrame(lines[delimiter + 1:], columns=columnHeaders)
    dataframe.to_csv(os.path.splitext(LAS)[0] + '.csv')

def parseLASv2(LAS):
    """Write the ``~A`` data section of a LAS file to a CSV next to it.

    Every non-blank line is split on whitespace. The first line whose leading
    token contains ``~A`` marks the start of the data. Column names are found
    by walking upward from that line (inclusive) over at most ten rows and
    taking, from the first row that has a token containing ``DEP``, ``Dep``
    or ``dep``, that token and everything after it.

    All lines after ``~A`` become the rows of a dataframe, which is written
    to the input path with its extension replaced by ``.csv``. Only the final
    extension is replaced, so dots elsewhere in the path are preserved.

    Raises
    ------
    ValueError
        If there is no ``~A`` line or no depth header within the search
        window. pandas raises its own ValueError when the row width does not
        match the header.
    """
    depthTags = ('DEP', 'Dep', 'dep')

    with open(LAS, 'r') as f:
        tokenised = (line.strip().split() for line in f.read().splitlines())
        lines = [tokens for tokens in tokenised if tokens]

    stop = next(
        (idx for idx, tokens in enumerate(lines) if '~A' in tokens[0]),
        None,
    )
    if stop is None:
        raise ValueError(f'no ~A section found in {LAS}')

    columnHeaders = None
    # clamp the window at the top of the file: a negative index would wrap
    # around and start matching against data rows at the end
    for row in reversed(lines[max(0, stop - 9):stop + 1]):
        start = next(
            (j for j, token in enumerate(row)
             if any(tag in token for tag in depthTags)),
            None,
        )
        if start is not None:
            columnHeaders = row[start:]
            break
    if columnHeaders is None:
        raise ValueError(f'no depth column header found near ~A in {LAS}')

    dataframe = pd.DataFrame(lines[stop + 1:], columns=columnHeaders)
    dataframe.to_csv(os.path.splitext(LAS)[0] + '.csv')

# Scratch run: parse every .las file of a single well folder and report
# the ones that fail. The glob pattern uses backslashes, so it only matches
# on Windows.
_folder = 3310502065

fileList =  glob(f'wellSubset\\{_folder}\\*.las')
for file in fileList:
    
    try:
        parseLASv2(file)
    except Exception as err:
        # print the text after the last backslash (the file name), then the error
        print(file.split('\\')[-1])            
        print(err)

In [None]:
def beginParsing(parentDirectory):
    """Run ``parseLASv2`` on every ``.las`` file one level below ``parentDirectory``.

    A file that fails to parse does not stop the run: its path is appended
    to ``revisit.txt`` in the current working directory instead. Paths are
    appended back to back with no separator between them.
    """
    for wellDir in glob(f'{parentDirectory}/*', recursive=True):
        for lasPath in glob(f'{wellDir}/*.las'):
            try:
                parseLASv2(lasPath)
            except Exception:
                # best effort: remember the file and carry on with the next one
                with open('revisit.txt', 'a') as revisitLog:
                    revisitLog.write(lasPath)
    
# Parse every well folder under wellSubset; paths that fail accumulate in revisit.txt.
beginParsing(parentDirectory='wellSubset')


In [None]:
def readStragglers(file):
    """Return the file paths recorded in a revisit log.

    The log is expected to hold paths written back to back with no separator,
    each starting with ``well``. Every line is therefore split on ``well``
    and the prefix is put back on each piece. Text before the first ``well``
    on a line is discarded.

    An empty log yields an empty list, and a log that does contain line
    breaks is handled line by line.

    NOTE(review): a path containing ``well`` anywhere other than at its start
    is cut into several entries - confirm file names never do.
    """
    with open(file, 'r') as f:
        content = f.read()

    stragglers = []
    for line in content.splitlines():
        stragglers.extend('well' + piece for piece in line.split('well')[1:])
    return stragglers

# Paths of the LAS files recorded in revisit.txt.
stragglers = readStragglers('revisit.txt')


In [None]:
import pandas as pd
def parseLASv3(file):
    """Debugging variant of the LAS parser: prints what it sees, writes nothing.

    Every non-blank line is split on whitespace and printed. The first line
    whose leading token contains ``~A`` marks the start of the data. Column
    names are found by walking upward from that line (inclusive) over at most
    ten rows and taking, from the first row that has a token containing
    ``DEP``, ``Dep`` or ``dep``, that token and everything after it. The
    chosen header is printed.

    A dataframe is then built from the lines after ``~A`` purely to check
    that rows and header line up; it is discarded and no CSV is written.

    Raises
    ------
    ValueError
        If there is no ``~A`` line or no depth header within the search
        window. pandas raises its own ValueError when the row width does not
        match the header.
    """
    depthTags = ('DEP', 'Dep', 'dep')

    with open(file, 'r') as f:
        tokenised = (line.strip().split() for line in f.read().splitlines())
        lines = [tokens for tokens in tokenised if tokens]

    for line in lines:
        print(line)

    stop = next(
        (idx for idx, tokens in enumerate(lines) if '~A' in tokens[0]),
        None,
    )
    if stop is None:
        raise ValueError(f'no ~A section found in {file}')

    columnHeaders = None
    # clamp the window at the top of the file: a negative index would wrap
    # around and start matching against data rows at the end
    for row in reversed(lines[max(0, stop - 9):stop + 1]):
        start = next(
            (j for j, token in enumerate(row)
             if any(tag in token for tag in depthTags)),
            None,
        )
        if start is not None:
            columnHeaders = row[start:]
            print(columnHeaders)
            break
    if columnHeaders is None:
        raise ValueError(f'no depth column header found near ~A in {file}')

    # validation only: raises when the data rows do not fit the header
    pd.DataFrame(lines[stop + 1:], columns=columnHeaders)


# Inspect one failed file by hand; the index is hard-coded and raises
# IndexError when fewer than seven stragglers were recorded.
parseLASv3(stragglers[6])

In [None]:
from glob import glob

# get updated well list.

import os

files = glob('wellSubset/*')
# Each entry name gets '0000' appended. os.path.basename is used instead of
# splitting on a backslash so the cell also works where the separator is '/'.
wellList = [os.path.basename(well) + '0000' for well in files]
# the context manager closes the file even if a write fails
with open('updatedWellList.txt', 'w') as f:
    f.writelines(well + '\n' for well in wellList)

In [None]:
import json
import pandas as pd

# read in maxDepth.txt and look at the data, try to see what logs are most common in lateral leg.

def findMostCommonFeature(file, datasetDir='datasets', maxFiles=1000):
    """Print how often each column name occurs across a set of CSV files.

    Parameters
    ----------
    file : str
        JSON file holding an object whose keys are CSV file names (the values
        are not used here).
    datasetDir : str
        Directory the CSV file names are relative to.
    maxFiles : int
        Only the first ``maxFiles`` keys are inspected.

    For each CSV only the header line is read. Its first field (the unnamed
    index column) and the second field (taken to be the depth column) are
    skipped; ``'Depth'`` is counted once per file instead. Each remaining
    column name is counted, except the depth aliases ``#Depth``, ``DEPTH``
    and ``DEPT``, which would double count depth. Files with no content are
    skipped.

    The counts, sorted from most to least frequent, are printed as a dict.
    ``'Depth'`` and ``'GR'`` are always present, even with a count of 0.
    A missing CSV raises ``FileNotFoundError``.
    """
    import os

    with open(file, 'r') as f:
        d = json.loads(f.read())

    depthAliases = ('#Depth', 'DEPTH', 'DEPT')
    columnDict = {'Depth': 0, 'GR': 0}

    for i, k in enumerate(d):
        if i >= maxFiles:
            break
        with open(os.path.join(datasetDir, k), 'r') as f:
            header = f.readline()
        if not header:
            # completely empty file: nothing to count
            continue

        columns = header.strip().split(',')[1:]
        columnDict['Depth'] += 1
        for _c in columns[1:]:
            if _c in depthAliases:
                continue
            columnDict[_c] = columnDict.get(_c, 0) + 1

    sortedColumnDict = dict(sorted(columnDict.items(),
                                   key=lambda item: item[1],
                                   reverse=True))

    print(sortedColumnDict)

# Print the column-name frequencies for the files listed in maxDepths.txt.
findMostCommonFeature('housekeeping/maxDepths.txt')

In [None]:
import json 
import csv
import pandas as pd 

# Column names that must all be present for a file to be kept.
gasList = ['C1', 'C2', 'C3', 'C4']
# Accepted spellings of the gamma-ray column; the matched one is renamed to "GR".
gammaRay = ['Gamma', 'GR','GAMMA']
# read in maxDepth.txt and look at the data. 
# find gas logs, and keep them only. these were the most common features.
with open('housekeeping/maxDepths.txt', 'r') as f:
    d = json.loads(f.read())
    
    
# pickle path -> the value stored for that file in maxDepths.txt
newDict = {}
for k,v in d.items():
    k = 'datasets\\'+k
    
    # read in csv, i was going to use csv.DictReader but i dont want first column as unnamed.
    df = pd.read_csv(k, index_col=0)
    
    if all(substring in df.columns for substring in gasList):
        grMatch = list(set(df.columns).intersection(set(gammaRay)))
        if grMatch:
            # restructure depth to be referencable regardless of file.
            # the first column becomes "DEPTH"; grMatch comes from a set, so if
            # several spellings are present which one is renamed is arbitrary.
            df = df.rename(columns={ df.columns[0]: "DEPTH", grMatch[0]:"GR"})
            
            # generate new well list of pkls
            # NOTE(review): split('.')[0] cuts at the first dot anywhere in the path.
            pklPath = k.split('.')[0]+'.pkl'
            df.to_pickle(pklPath)
            newDict[pklPath] = v
    # NOTE(review): this unconditional break ends the loop after the first
    # file, so at most one pickle is written - looks like a debugging
    # leftover; confirm before relying on newDict.
    break
    

In [None]:
import pickle
# Collect the keys of maxDepths.txt whose value is below 12000 and pickle them.
# NOTE(review): `json` is not imported in this cell; it relies on an earlier
# cell having imported it.
with open('housekeeping/maxDepths.txt', 'r') as f:
    d = json.loads(f.read())

# horizontals = []
# for k,v in d.items():
#     if v > 14000:
#         horizontals.append(k)

# with open('housekeeping/horizontalWells.pkl', 'wb') as fp:
#     pickle.dump(horizontals, fp)

verticals = []
for k,v in d.items():
    if v < 12000:
        print(v)
        verticals.append(k)
print(len(verticals))
# NOTE(review): the input above is read from 'housekeeping/' but the output
# goes to '../housekeeping/'. The next cell reads
# 'housekeeping/verticalWells.pkl', so one of the two paths is probably
# stale - confirm the working directory.
with open('../housekeeping/verticalWells.pkl', 'wb') as fp:
    pickle.dump(verticals, fp)

In [None]:
# Count how often each column name occurs in the header lines of the CSV
# files listed in verticalWells.pkl, and print the counts from most to least
# frequent.
with open ('housekeeping/verticalWells.pkl', 'rb') as fp:
    d = pickle.load(fp)

i = 0

# 'Depth' and 'GR' are always reported, even with a count of 0
columnDict = {'Depth':0,
            'GR':0
            }

for k in d:
    # only look at 1000 or so instances. 
    print(k)
    if i < min(1000, len(d)):
        # only the header line is needed
        with open('datasets\\'+k, 'r') as f:
            header = f.readline()
        # an empty file has no header to count
        if header:
            # drop the unnamed index column; the next column is skipped below
            # and counted once per file as 'Depth'
            columns = header.strip().split(',')[1:]
            columnDict['Depth'] += 1
            for _c in columns[1:]:
                # Skip depth aliases. Incrementing them used to raise a
                # KeyError that was swallowed, which silently dropped every
                # remaining column of that file.
                if _c in ['#Depth', 'DEPTH', 'DEPT']:
                    continue
                columnDict[_c] = columnDict.get(_c, 0) + 1
    # df = pd.read_csv(k, index_col=0)
    i += 1

sortedColumnDict = dict(sorted(columnDict.items(), 
                                key=lambda item: item[1],
                                reverse=True))

print(sortedColumnDict)

In [None]:
# Scratch expression: builds a .pkl path under ./datasets/horizontalLogs/ from
# the 2nd and 3rd backslash-separated parts of itemlist[0], joined by '_'.
# NOTE(review): `itemlist` is not defined anywhere in the visible notebook.
'./datasets/horizontalLogs/'+'_'.join(itemlist[0].split('\\')[1:3]).split('.')[0] + '.pkl'

In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import plotly.express as px

# Load the well export and shorten the bottom-hole coordinate column names.
bakkenWells = pd.read_csv('middle bakken wells.csv')
bakkenWells = bakkenWells.rename(columns={
    'Bottom Hole Latitude (WGS84)': 'bh latitude',
    'Bottom Hole Longitude (WGS84)': 'bh longitude'
})

# Keep only rows whose producing reservoir is the Middle Bakken pool.
_bakken = bakkenWells[bakkenWells['Producing Reservoir'] == 'MIDDLE BAKKEN (BAKKEN POOL)']
# x = latitude, y = longitude, z = true vertical depth (not used further in this cell)
x, y, z = _bakken['bh latitude'], _bakken['bh longitude'], _bakken['True Vertical Depth']



In [None]:
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.ticker import LinearLocator
from matplotlib import interactive
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy import stats

interactive(True)
%matplotlib qt

def reject_outliers(data, m=2.):
    """Return the entries of ``data`` that lie close to its median.

    Each value's absolute deviation from the median is divided by the median
    of those deviations; values whose ratio is below ``m`` are kept. When the
    median deviation is zero the raw deviations are compared against ``m``
    instead, which avoids a division by zero.
    """
    deviation = np.abs(data - np.median(data))
    scale = np.median(deviation) or 1.
    return data[deviation / scale < m]

# read in data and relabel for ease of access.
bakkenWells = pd.read_csv('middle bakken wells.csv')
bakkenWells = bakkenWells.rename(columns={
    'Bottom Hole Latitude (WGS84)': 'bh latitude',
    'Bottom Hole Longitude (WGS84)': 'bh longitude',
    'True Vertical Depth': 'TVD'
})

# keep only rows whose producing reservoir is the Middle Bakken pool.
_bakken = bakkenWells[bakkenWells['Producing Reservoir'] == 'MIDDLE BAKKEN (BAKKEN POOL)']

# create subset of dataset to derive a surface equation for z-dimension. we'll use a dnn and tensorflow
dataset = _bakken[['bh latitude', 'bh longitude', 'TVD']]
# drop na rows if any present. dropna() returns a new frame, so the result has
# to be rebound; otherwise the NaNs stay in and propagate through the z-score
# below.
dataset = dataset.dropna()

# two of the axes are longitude and latitude, so the only variance of interest
# comes from the z-axis, i.e. TVD. the underground structure is allowed to be
# steep, so the outlier cut below is deliberately wide.

# if you plot a histogram of the data, the distribution is strongly negatively
# skewed, so a z-score is not the best tool unless you know what you are trying
# to remove. here that is a chunk of the left tail (too shallow could be
# anomalous) and a small slice of the right tail where TVD > 16000. in reality
# these could represent faults, but they could also just distort the surface.

# plt.hist(dataset['TVD'], bins=1000)

# keep rows whose TVD lies within 4.5 standard deviations of the mean
dataset = dataset[(np.abs(stats.zscore(dataset['TVD'])) < 4.5)]

# plot data to visually determine if there are still outliers. 

# X = latitude, Y = longitude, Z = TVD
X, Y, Z = dataset['bh latitude'], dataset['bh longitude'], dataset['TVD']

fig, ax = plt.subplots(figsize=(30,12),
                       subplot_kw={"projection": "3d"})

# 3D scatter coloured by depth; the z-axis is inverted so deeper points plot lower
ax.scatter(X,Y,Z, c=Z, s=10, cmap='viridis')
ax.set_xlabel('Bottom Hole Latitude (WGS84)')
ax.set_ylabel('Bottom Hole Longitude (WGS84)')
ax.set_zlabel('Middle Bakken Producing Depth (TVD)')
ax.invert_zaxis()

# looks better.
plt.show()

# split dataset into training and testing
# 80% of the rows (fixed seed) go to training, the remaining rows to testing
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)

train_features = train_dataset.copy()
test_features = test_dataset.copy()

# pop() removes TVD from the feature frames and returns it as the label
train_labels = train_features.pop('TVD')
test_labels = test_features.pop('TVD')

# normalization layer fitted on the training features only
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(np.array(train_features))

# first training row, used by the commented-out sanity check below
first = np.array(train_features[:1])

# with np.printoptions(precision=2, suppress=True):
#   print('First example:', first)
#   print()
#   print('Normalized:', normalizer(first).numpy())

def build_and_compile_model(norm):
    """Build and compile the small regression network.

    The model is ``norm`` followed by two 8-unit ReLU dense layers and a
    single-unit linear output. It is compiled with mean absolute percentage
    error as the loss and Adam at a learning rate of 0.001.
    """
    hiddenLayers = [tf.keras.layers.Dense(8, activation='relu') for _ in range(2)]
    outputLayer = tf.keras.layers.Dense(1)
    model = tf.keras.Sequential([norm, *hiddenLayers, outputLayer])

    model.compile(
        loss='mean_absolute_percentage_error',
        optimizer=tf.keras.optimizers.Adam(0.001),
    )
    return model

def plot_loss(history):
    """Plot the training and validation loss curves stored in ``history``.

    ``history.history`` must provide both ``'loss'`` and ``'val_loss'``.
    The curves are drawn on a new 20x12 figure with a legend and a grid.
    """
    plt.figure(figsize=(20,12))
    for curve in ('loss', 'val_loss'):
        plt.plot(history.history[curve], label=curve)
    plt.xlabel('Epoch')
    plt.ylabel('Error [TVD]')
    plt.legend()
    plt.grid(True)

# Build the network around the fitted normalizer and train it.
dnn_model = build_and_compile_model(normalizer)
# dnn_model.summary()

# 100 silent epochs; 20% of the training rows are held out for validation
history = dnn_model.fit(
    train_features,
    train_labels,
    validation_split=0.2,
    verbose=0, epochs=100)

# show training vs validation loss per epoch
plot_loss(history)

# X, Y = np.meshgrid(X,Y)

# # Plot the surface.
# surf = ax.plot_surface(X, Y, Z, cmap=cm.coolwarm,
#                        linewidth=0, antialiased=False)

# # Customize the z axis.
# ax.set_zlim(-1.01, 1.01)
# ax.zaxis.set_major_locator(LinearLocator(10))
# # A StrMethodFormatter is used automatically
# ax.zaxis.set_major_formatter('{x:.02f}')

# # Add a color bar which maps values to colors.
# fig.colorbar(surf, shrink=0.5, aspect=5)




In [None]:
# Evaluate the trained model on the held-out test rows and keep the result by model name.
test_results = {}
test_results['dnn_model'] = dnn_model.evaluate(test_features, test_labels, verbose=0)


In [None]:
# libraries
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import interactive
import pandas as pd
import seaborn as sns
import numpy as np
from scipy import stats  

interactive(True)
%matplotlib qt

# read in data and relabel for ease of access.
bakkenWells = pd.read_csv('middle bakken wells.csv')
bakkenWells = bakkenWells.rename(columns={
    'Bottom Hole Latitude (WGS84)': 'bh latitude',
    'Bottom Hole Longitude (WGS84)': 'bh longitude',
    'True Vertical Depth': 'TVD'
})

# keep only rows whose producing reservoir is the Middle Bakken pool.
_bakken = bakkenWells[bakkenWells['Producing Reservoir'] == 'MIDDLE BAKKEN (BAKKEN POOL)']

# subset used to draw the depth surface: two coordinates plus TVD.
dataset = _bakken[['bh latitude', 'bh longitude', 'TVD']]
# drop na rows if any present. dropna() returns a new frame, so the result has
# to be rebound; otherwise the NaNs stay in and propagate through the z-score
# below.
dataset = dataset.dropna()

# two of the axes are longitude and latitude, so the only variance of interest
# comes from the z-axis, i.e. TVD. the underground structure is allowed to be
# steep, so the outlier cut below is deliberately wide.

# if you plot a histogram of the data, the distribution is strongly negatively
# skewed, so a z-score is not the best tool unless you know what you are trying
# to remove. here that is a chunk of the left tail (too shallow could be
# anomalous) and a small slice of the right tail where TVD > 16000. in reality
# these could represent faults, but they could also just distort the surface.

# plt.hist(dataset['TVD'], bins=1000)

# keep rows whose TVD lies within 4.5 standard deviations of the mean
dataset = dataset[(np.abs(stats.zscore(dataset['TVD'])) < 4.5)]

# plot data to visually determine if there are still outliers. 

# X = latitude, Y = longitude, Z = TVD
X, Y, Z = dataset['bh latitude'], dataset['bh longitude'], dataset['TVD']

# Triangulated depth surface with a colour bar that maps colours to TVD.
# Figure.gca() no longer accepts a projection keyword in current Matplotlib,
# so the 3D axes are created with add_subplot instead.
fig = plt.figure(figsize=(20,12))
ax = fig.add_subplot(projection='3d')
# Y (longitude) is passed as the x coordinate and X (latitude) as the y
# coordinate, so the axis labels below follow that order.
surf=ax.plot_trisurf(Y, X, Z, cmap=plt.cm.viridis,
                     edgecolor='none', linewidth=0, antialiased=False)
cbar = fig.colorbar( surf, shrink=0.5, aspect=5, )
cbar.set_label('TVD')
ax.set_xlabel('Bottom Hole Longitude (WGS84)')
ax.set_ylabel('Bottom Hole Latitude (WGS84)')
ax.set_zlabel('Middle Bakken Producing Depth (TVD)')
# inverted so that deeper points plot lower
ax.invert_zaxis()

plt.show()

# Other palette: the same surface with the jet colour map and no colour bar.
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.plot_trisurf(Y,X,Z, cmap=plt.cm.jet, linewidth=0.00)
ax.invert_zaxis()
ax.set_xlabel('Bottom Hole Longitude (WGS84)')
ax.set_ylabel('Bottom Hole Latitude (WGS84)')
ax.set_zlabel('Middle Bakken Producing Depth (TVD)')
plt.show()

In [None]:
import pickle
import pandas as pd
# reformat 
# Load the pickled vertical-well list and print the well export for inspection.
# NOTE(review): despite its name, well_dict holds whatever was pickled into
# verticalWells.pkl; the cell that writes that file dumps a list.
with open('../housekeeping/verticalWells.pkl', 'rb') as fp:
    well_dict = pickle.load(fp)

# NOTE(review): other cells read 'middle bakken wells.csv' from the working
# directory; this one uses a '../datasets/envernus/' path and an upper-case
# extension - confirm which location is current.
bakkenWells = pd.read_csv('../datasets/envernus/middle bakken wells.CSV')
print(bakkenWells)
# new_list = []
# for k in well_dict:
#     new_list.append('../datasets/'+k)

# pd.read_csv(new_list[0])