Given a set of vertical wells, perform TICC (Toeplitz Inverse Covariance Clustering)
across the same set of parameters found in each well (GR, Density, Porosity, PE, etc.).

First, determine which wells we have in the vertical portion. Once our raw dataset is derived,
attempt to find a proximal set of wells (i.e. where cluster density is maximized), so as to
increase the probability of encountering similar geostructure amongst neighboring wells.

Once the refined dataset is derived, run the TICC script in parallel to save computation time.
Evaluate results and determine if the sensor-to-drillbit data gap can be accurately predicted. 

In [None]:
import json

d_drive_dataset = 'D:\horizontal drilling\datasets'

# Read in maxDepths.txt and inspect the data to see which logs are most common in the vertical leg.

def findMostCommonFeature(file, wells_dir='..\\datasets\\scrapedWells\\', max_files=1000):
    """Print how often each log column appears across the scraped well files.

    ``file`` is a JSON document whose top-level keys are well file names.
    For at most ``max_files`` of those keys (in document order) the header
    line of ``wells_dir + key`` is read and its column names are tallied.

    Args:
        file: Path to the JSON index file.
        wells_dir: Prefix prepended verbatim to every key to build the well
            file path (it must therefore end with a path separator).
        max_files: Upper bound on the number of well files inspected.

    Returns:
        None. The tally is printed as a dict sorted by descending count.

    Raises:
        OSError: If the index file or one of the well files cannot be opened.
    """
    # Function-scope import so the block does not depend on file-level changes.
    from itertools import islice

    with open(file, 'r') as f:
        d = json.load(f)

    # Alternative spellings of the depth column; they are represented by the
    # single 'Depth' counter and must not be tallied as separate features.
    depth_aliases = ('#Depth', 'DEPTH', 'DEPT')

    columnDict = {'Depth': 0, 'GR': 0}

    # islice stops after max_files keys instead of walking the whole index.
    for k in islice(d, max_files):
        with open(wells_dir + k, 'r') as f:
            # Only the header row is needed; avoid loading the whole log.
            header = f.readline()

        if not header:
            # Empty file: nothing to count.
            continue

        # The first field is dropped here; the next one is skipped below.
        # NOTE(review): presumably these are an index column and the depth
        # column respectively -- confirm against the scraped file layout.
        columns = header.strip().split(',')[1:]
        columnDict['Depth'] += 1

        for _c in columns[1:]:
            if _c in depth_aliases:
                # Previously this raised a swallowed KeyError that silently
                # discarded every remaining column of the file.
                continue
            columnDict[_c] = columnDict.get(_c, 0) + 1

    sortedColumnDict = dict(sorted(columnDict.items(),
                                   key=lambda item: item[1],
                                   reverse=True))

    print(sortedColumnDict)

# Tally and print the column frequencies for the wells listed in maxDepths.txt.
findMostCommonFeature('../housekeeping/maxDepths.txt')

In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob

# Format floats with three decimal places whenever pandas displays them.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

def find_dense_well_clusters(dataset_directory, output_path='../datasets/densest_subplay.csv'):
    """Narrow a well listing down to its most populated county, field and subplay.

    The CSV is filtered in three successive steps. At each step the most
    frequent value of the column is looked up *within the rows that survived
    the previous step*, and only rows carrying that value are kept. The
    columns used, in order, are 'County/Parish', 'Field' and 'DI Subplay'.

    Args:
        dataset_directory: Path to the CSV file to read (despite the name,
            this is a file path, not a directory).
        output_path: Where the resulting subset is written as CSV without the
            index. Pass ``None` to skip writing.

    Returns:
        DataFrame with the rows belonging to the densest subplay of the
        densest field of the densest county.

    Raises:
        KeyError: If one of the three columns is missing.
        IndexError: If a column holds no non-null values at some step, so no
            most-frequent value exists.
    """
    subset = pd.read_csv(dataset_directory)

    for column in ('County/Parish', 'Field', 'DI Subplay'):
        # value_counts() sorts by descending frequency, so the first index
        # entry is the most common value in the current subset.
        densest_value = subset[column].value_counts().index[0]

        # Build the mask from the subset itself rather than from the original
        # frame, so the filter never depends on implicit index alignment.
        subset = subset.loc[subset[column] == densest_value]

    if output_path is not None:
        subset.to_csv(output_path, index=False)

    return subset

def create_trajectory_directories(subplay_df,
                                  root_dir=r'D:\horizontal drilling\datasets\scrapedWells',
                                  output_dir='../datasets/mckenzie-county'):
    """Create one output folder per well that also has a scraped-log folder.

    Every value in the 'API14' column is truncated by dropping its last four
    digits. The truncated numbers are intersected with the numerically named
    entries found directly under ``root_dir``, and a sub-folder named after
    each match is created under ``output_dir``.

    Args:
        subplay_df: DataFrame with an 'API14' column of numeric identifiers.
        root_dir: Directory whose entries are named after truncated well ids.
        output_dir: Directory that receives one sub-folder per matched well.
            It is created (including parents) when missing.

    Returns:
        Sorted list of the truncated ids (ints) present in both sources.

    Raises:
        OSError: If a directory cannot be created for a reason other than
            already existing (previously such failures were swallowed).
    """
    # Integer floor division instead of float division followed by int().
    truncated_ids = {int(api) // 10_000 for api in subplay_df['API14'].tolist()}

    # Entries with non-numeric names are ignored instead of crashing int().
    log_folders = set()
    for path in glob.glob(os.path.join(root_dir, '*')):
        folder_name = os.path.basename(path)
        if folder_name.isdecimal():
            log_folders.add(int(folder_name))

    wells_with_data = sorted(truncated_ids & log_folders)

    # Keep the original notice, but only when it is actually true.
    if os.path.isdir(output_dir):
        print('directory already exists.')
    os.makedirs(output_dir, exist_ok=True)

    for api14 in wells_with_data:
        os.makedirs(os.path.join(output_dir, str(api14)), exist_ok=True)

    return wells_with_data

def verify_folders_have_trajectories(
        api14s,
        root_dir=r"C:\Users\rush\Desktop\programming\horizontal drilling\datasets\mckenzie-county"):
    """Report which per-well folders under ``root_dir`` contain no entries.

    Args:
        api14s: Iterable of well identifiers; each one names a sub-folder of
            ``root_dir``.
        root_dir: Directory holding one sub-folder per well.

    Returns:
        ``True`` when every folder has at least one entry, otherwise the list
        of identifiers whose folder is empty. A non-empty list is itself
        truthy, so callers must test the result with ``is True`` rather than
        plain truthiness.

    Raises:
        OSError: If a well folder does not exist or cannot be listed.
    """
    # os.path.join replaces the f'\{api14}' concatenation, whose backslash
    # formed an invalid escape sequence.
    empty_dirs = [api14 for api14 in api14s
                  if not os.listdir(os.path.join(root_dir, str(api14)))]

    if empty_dirs:
        return empty_dirs
    return True

    

if __name__ == "__main__":

    # Learned that, for some reason, the API14s were not properly saved somewhere along the line, so
    # I went BACK to drillinginfo and redid the query, knowing McKenzie county was very dense.

    densest_subplay = find_dense_well_clusters(dataset_directory='../datasets/mckenzie county.CSV')

    api14s = create_trajectory_directories(densest_subplay)

    # verify_folders_have_trajectories returns True when nothing is missing and
    # a non-empty list otherwise. Both are truthy, so compare against True
    # explicitly; a plain `if` would always report success.
    empty_dirs = verify_folders_have_trajectories(api14s)

    if empty_dirs is True:
        print('trajectories all accounted for.')
    else:
        print(empty_dirs)

    

directory already exists.
trajectories all accounted for.
