In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.pyplot as plt
import itertools
import pandas as pd

from sklearn import decomposition, preprocessing
from skimage.feature import greycomatrix, greycoprops
from skimage import exposure

## Feature engineering for the validation set

<a rel="license" href="http://creativecommons.org/licenses/by/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by/4.0/88x31.png" /></a><br /><span xmlns:dct="http://purl.org/dc/terms/" property="dct:title">The code and ideas to engineer new features used in this notebook, </span> by <span xmlns:cc="http://creativecommons.org/ns#" property="cc:attributionName">Matteo Niccoli and Mark Dahl, with contributions by Daniel Kittridge,</span> are licensed under a <a rel="license" href="http://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</a>. 

### 1 - load validation data

In [2]:
# import data and fill missing PE values with average

filename = 'validation_data_nofacies.csv'
val = pd.read_csv(filename)

val['PE'].fillna((val['PE'].mean()), inplace=True)
print  np.shape(val)
val['PE'].fillna((val['PE'].mean()), inplace=True)
print  np.shape(val)

(830, 10)
(830, 10)


In [3]:
pd.set_option('display.float_format', lambda x: '%.4f' % x)
val.describe()

Unnamed: 0,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS
count,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0
mean,2987.0705,57.6117,0.6663,2.852,11.6553,3.6542,1.6783,0.5358
std,94.3919,27.5277,0.2884,3.4421,5.1902,0.6498,0.4674,0.2831
min,2808.0,12.036,-0.468,-8.9,1.855,2.113,1.0,0.013
25%,2911.625,36.7733,0.541,0.4113,7.7,3.1715,1.0,0.3
50%,2993.75,58.3445,0.675,2.3975,10.95,3.5155,2.0,0.5475
75%,3055.375,73.0515,0.8508,4.6,14.7938,4.1915,2.0,0.778
max,3160.5,220.413,1.507,16.5,31.335,6.321,2.0,1.0


### 2 -  moments feature generation

In [4]:
# standardize features to go into moments calculation
feature_vectors = val.drop(['Formation', 'Well Name', 'Depth'], axis=1)
scaler = preprocessing.StandardScaler().fit(feature_vectors)
scaled_features = scaler.transform(feature_vectors)

In [5]:
scaled_vectors_df = pd.DataFrame(scaled_features, columns=list(feature_vectors))
scaled_feat_df = pd.concat((val[['Depth', 'Well Name', 'Formation']], scaled_vectors_df),1)
scaled_feat_df.head()

Unnamed: 0,Depth,Well Name,Formation,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS
0,2808.0,STUART,A1 SH,0.3149,-0.126,0.1302,-0.1938,-0.0973,-1.4521,1.6409
1,2808.5,STUART,A1 SH,0.7139,-0.2821,1.0605,0.0568,-0.4823,-1.4521,1.5631
2,2809.0,STUART,A1 SH,0.9192,-0.3481,1.9035,0.3749,-0.9088,-1.4521,1.4854
3,2809.5,STUART,A1 SH,0.8382,-0.2544,1.9326,0.3074,-1.0428,-1.4521,1.404
4,2810.0,STUART,A1 SH,0.6673,-0.0982,1.7,0.1339,-0.9766,-1.4521,1.3263


In [6]:
sizes = [3, 5, 9, 17, 33, 65, 129]

In [7]:
# Efficient rolling statistics with NumPy
# http://www.rigtorp.se/2011/01/01/rolling-statistics-numpy.html

def rolling_window(a, window):
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)

In [8]:
# function to calculate moments using a rolling window

def rollin_moments(arr, w, moment ='mean'):
    """- pad input array by (w-1)/2 samples at the top and bottom
       - apply rolling window function
       - calculate moment: mean (default), var, or skew"""
    mom = []
    arr = np.pad(arr, ((w-1)/2, (w-1)/2), 'edge')  
    if moment == 'std':
        return np.array(np.std(rolling_window(arr, w), 1))
    elif moment == 'skew':
        return np.array(sp.stats.skew(rolling_window(arr, w), 1))
    else:
        return np.array(np.mean(rolling_window(arr, w), 1))

In [9]:
moments=['mean', 'std', 'skew']

In [10]:
# calculate all moments for all logs, for all wells

final_df = pd.DataFrame()             # final dataframe

grouped = val['Well Name'].unique()

for well in grouped:                  # for each well     
    new_df = pd.DataFrame()           # make a new temporary dataframe 
   
    for log in ['GR', 'ILD_log10', 'DeltaPHI', 'PHIND' ,'PE']:
        for mo in moments:            # for each moment
                                      # calculate the rolling moments with each window size
                                      # and also the mean of moments (all window sizes)
            results = np.array([rollin_moments(scaled_feat_df[log][scaled_feat_df['Well Name'] == well],
                                               size, moment = mo) for size in sizes])
            mean_result = np.mean(results, axis=0)
                                      # write to temporary dataframe 
            new_df[str(log)+ '_' + str(mo)+'_wsize=' +str(sizes[0])] = results[0]
            new_df[str(log)+ '_' + str(mo)+'_wsize=' +str(sizes[1])] = results[1]
            new_df[str(log)+ '_' + str(mo)+'_wsize=' +str(sizes[2])] = results[2]
            new_df[str(log)+ '_' + str(mo)+'_wsize=' +str(sizes[3])] = results[3]
            new_df[str(log)+ '_' + str(mo)+'_wsize=' +str(sizes[4])] = results[4]
            new_df[str(log)+ '_' + str(mo)+'_wsize=' +str(sizes[5])] = results[5]
            new_df[str(log)+ '_' + str(mo)+'_wsize=' +str(sizes[6])] = results[6]
            new_df[str(log)+ '_' + str(mo)+'_wsize=ave'] = mean_result
                                      # append all rows of temporary dataframe to final dataframe          

    final_df = pd.concat([final_df, new_df])

In [11]:
final_df.describe()

Unnamed: 0,GR_mean_wsize=3,GR_mean_wsize=5,GR_mean_wsize=9,GR_mean_wsize=17,GR_mean_wsize=33,GR_mean_wsize=65,GR_mean_wsize=129,GR_mean_wsize=ave,GR_std_wsize=3,GR_std_wsize=5,...,PE_std_wsize=129,PE_std_wsize=ave,PE_skew_wsize=3,PE_skew_wsize=5,PE_skew_wsize=9,PE_skew_wsize=17,PE_skew_wsize=33,PE_skew_wsize=65,PE_skew_wsize=129,PE_skew_wsize=ave
count,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,...,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0
mean,-0.0,0.0,0.0002,0.0011,0.0035,0.003,-0.0059,0.0003,0.1472,0.2448,...,0.9189,0.5909,0.0295,0.0082,0.0266,0.1636,0.3223,0.448,0.5806,0.2255
std,0.9762,0.9367,0.8469,0.699,0.5061,0.345,0.2315,0.5721,0.1629,0.2519,...,0.2495,0.2278,0.4426,0.6299,0.7252,0.7299,0.605,0.4907,0.4267,0.328
min,-1.614,-1.5656,-1.4971,-1.3982,-1.0994,-0.7095,-0.4113,-0.9915,0.0,0.0,...,0.2094,0.0988,-0.7071,-1.4773,-2.0822,-2.905,-2.3414,-0.6499,-0.111,-0.6534
25%,-0.752,-0.7202,-0.6895,-0.5207,-0.3507,-0.2425,-0.171,-0.4676,0.0403,0.077,...,0.7755,0.4476,-0.3291,-0.4495,-0.4587,-0.2996,-0.0458,0.0832,0.254,-0.0034
50%,0.0208,0.0351,0.0256,-0.0136,-0.0265,-0.0678,-0.0507,0.0162,0.0904,0.1681,...,0.9294,0.5888,0.0129,0.0306,0.0438,0.1474,0.3486,0.451,0.5065,0.2425
75%,0.5432,0.5256,0.5107,0.4601,0.3487,0.2043,0.0823,0.3489,0.2036,0.333,...,1.1342,0.731,0.4276,0.4953,0.4989,0.6038,0.7172,0.8012,0.7933,0.4679
max,5.5963,5.0319,3.7101,2.0829,1.2403,1.2112,0.9123,2.5144,1.206,1.8474,...,1.2681,1.4662,0.7071,1.4605,2.0767,2.4283,2.6513,1.6184,1.9187,1.2194


### 2 - GLCM feature generation

In [12]:
# function to calculate glcm and greycoprops using a rolling window

def gprops_calc(arr, w, lv, sym = True, prop='dissimilarity'):
    """- make w copies of the input array, roll it up one row at a time
       - calculate glcm on a square window of size w
       - calculate greycoprops from glcm: dissimilarity (default), energy, or correlation
       - repeat until back at row one
       N.B. the input array is padded by (w-1)/2 samples at the top and bottom"""
    diss = []
    itr = len(arr)
    arr = np.pad(arr, ((w-1)/2, (w-1)/2), 'edge')
    s = np.array([arr,]*w,dtype=np.uint8).transpose()
    for _ in np.arange(itr):
        if sym == True:
            glcm = greycomatrix(s[:w,:], [1], [np.pi/2], levels = lv, symmetric = True, normed = True)
        else:
            glcm = greycomatrix(s[:w,:], [1], [np.pi/2], levels = lv, symmetric = False, normed = True)
        if prop == 'correlation':
            ds = greycoprops(glcm, 'correlation')
        elif prop == 'energy':
            ds = greycoprops(glcm, 'energy')
        else:
            ds = greycoprops(glcm, 'dissimilarity')
        diss.append(ds)
        s = np.roll(s[:, :], -w)
    return np.ndarray.flatten(np.array(diss)) 

In [13]:
methods=['dissimilarity','energy', 'correlation']

In [14]:
# functions to equalize histogram of features to go into GLCM calculation
def eqlz(arr, bins):
    return (bins-1) * exposure.equalize_hist(arr)
def eqlz_along_axis(arr, bins):
    return np.apply_along_axis(eqlz, 0, arr, bins)

In [15]:
# equalize features
feature_vectors_glcm = val.drop(['Formation', 'Well Name', 'Depth'], axis=1)
eq_vectors_glcm = eqlz_along_axis(feature_vectors_glcm, 64)

In [16]:
eq_vectors_glcm_df = pd.DataFrame(eq_vectors_glcm, columns=list(feature_vectors_glcm))
eq_vectors_glcm_df = np.round(eq_vectors_glcm_df).astype(int)
eq_glcm_df = pd.concat((val[['Depth', 'Well Name', 'Formation']], eq_vectors_glcm_df),1)
eq_glcm_df.head()

Unnamed: 0,Depth,Well Name,Formation,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS
0,2808.0,STUART,A1 SH,40,26,39,30,34,20,63
1,2808.5,STUART,A1 SH,52,20,56,37,23,20,61
2,2809.0,STUART,A1 SH,56,18,60,44,11,20,59
3,2809.5,STUART,A1 SH,55,21,60,42,8,20,58
4,2810.0,STUART,A1 SH,51,27,59,39,10,20,56


First let's calculate symmetric GLCM properties:

In [17]:
final_df_glcm = pd.DataFrame()        # final dataframe
grouped = val['Well Name'].unique()

for well in grouped:                   # for each well   
    new_dfg = pd.DataFrame()           # make a new temporary dataframe 

    for log in ['GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE']:   # for each log
        for me in methods:            # for each property
                                      # calculate rolling GLCM properties with each window size
                                      # and also the mean of moments (all window sizes)
            lg = eq_glcm_df[log][eq_glcm_df['Well Name'] == well]
            results = np.array([gprops_calc(lg.astype(int), wd, lv = 64, sym = True, prop = me) for wd in sizes])
            mean_result = np.mean(results, axis=0)
                                      # write to temporary dataframe 
            new_dfg[str(log)+ '_GLCM_' + str(me)+'_wsize=' +str(sizes[0])] = results[0]
            new_dfg[str(log)+ '_GLCM_' + str(me)+'_wsize=' +str(sizes[1])] = results[1]
            new_dfg[str(log)+ '_GLCM_' + str(me)+'_wsize=' +str(sizes[2])] = results[2]
            new_dfg[str(log)+ '_GLCM_' + str(me)+'_wsize=' +str(sizes[3])] = results[3]
            new_dfg[str(log)+ '_GLCM_' + str(me)+'_wsize=' +str(sizes[4])] = results[4]
            new_dfg[str(log)+ '_GLCM_' + str(me)+'_wsize=' +str(sizes[5])] = results[5]
            new_dfg[str(log)+ '_GLCM_' + str(me)+'_wsize=' +str(sizes[6])] = results[6]
            new_dfg[str(log)+ '_GLCM_' + str(me)+'_wsize=ave'] = mean_result
                                      # append all rows of temporary dataframe to final dataframe 

    final_df_glcm  = pd.concat([final_df_glcm , new_dfg])

In [18]:
final_df_glcm.describe()

Unnamed: 0,GR_GLCM_dissimilarity_wsize=3,GR_GLCM_dissimilarity_wsize=5,GR_GLCM_dissimilarity_wsize=9,GR_GLCM_dissimilarity_wsize=17,GR_GLCM_dissimilarity_wsize=33,GR_GLCM_dissimilarity_wsize=65,GR_GLCM_dissimilarity_wsize=129,GR_GLCM_dissimilarity_wsize=ave,GR_GLCM_energy_wsize=3,GR_GLCM_energy_wsize=5,...,PE_GLCM_energy_wsize=129,PE_GLCM_energy_wsize=ave,PE_GLCM_correlation_wsize=3,PE_GLCM_correlation_wsize=5,PE_GLCM_correlation_wsize=9,PE_GLCM_correlation_wsize=17,PE_GLCM_correlation_wsize=33,PE_GLCM_correlation_wsize=65,PE_GLCM_correlation_wsize=129,PE_GLCM_correlation_wsize=ave
count,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,...,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0
mean,3.4024,3.3952,3.387,3.3779,3.352,3.2929,3.1588,3.338,0.557,0.4119,...,0.1442,0.2785,-0.2047,0.3351,0.6797,0.8495,0.9198,0.9385,0.945,0.6375
std,3.1384,2.5533,1.9162,1.4214,1.0729,0.8625,0.7652,1.3525,0.1123,0.0944,...,0.1128,0.0586,0.3917,0.355,0.2398,0.1252,0.0442,0.0233,0.0167,0.115
min,0.0,0.0,0.0,0.0625,0.5,0.9219,0.9219,0.3438,0.5,0.3536,...,0.0765,0.2264,-1.0,-0.9109,-0.4545,-0.1193,0.6799,0.8632,0.9016,0.1932
25%,1.0,1.25,1.75,2.3125,2.5938,2.6719,2.75,2.2243,0.5,0.3536,...,0.085,0.2394,-0.3333,0.0993,0.5898,0.8142,0.8934,0.9225,0.9317,0.5754
50%,2.5,2.75,3.25,3.5,3.375,3.3594,3.3281,3.2338,0.5,0.3953,...,0.088,0.2556,-0.1111,0.4667,0.7535,0.883,0.9321,0.9441,0.945,0.6678
75%,5.0,5.0,4.875,4.375,4.2812,3.9492,3.75,4.2547,0.6124,0.433,...,0.1316,0.2937,-0.0196,0.6146,0.8522,0.9271,0.9535,0.957,0.9606,0.7257
max,15.5,12.5,8.375,6.9375,5.4375,5.1094,4.3516,7.0357,1.0,1.0,...,0.5056,0.5397,1.0,1.0,0.9193,0.9699,0.9823,0.977,0.9753,0.8931


And now let's calculate asymmetric GLCM properties using only the upward neighbour:

In [19]:
final_df_glcm1 = pd.DataFrame()        # final dataframe
grouped1 = val['Well Name'].unique()

for well1 in grouped1:                   # for each well   
    new_dfg1 = pd.DataFrame()           # make a new temporary dataframe 

    for log1 in ['GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE']:   # for each log
        for me in methods:            # for each property
                                      # calculate rolling GLCM properties with each window size
                                      # and also the mean of moments (all window sizes)
            lg1 = eq_glcm_df[log][eq_glcm_df['Well Name'] == well1]
            results1 = np.array([gprops_calc(lg1.astype(int), wd, lv = 64, sym = False, prop = me) for wd in sizes])
            mean_result1 = np.mean(results1, axis=0)
            
                                      # write to temporary dataframe 
            new_dfg1[str(log1)+ '_GLCM_' + str(me)+'_asym_wsize=' +str(sizes[0])] = results1[0]
            new_dfg1[str(log1)+ '_GLCM_' + str(me)+'_asym_wsize=' +str(sizes[1])] = results1[1]
            new_dfg1[str(log1)+ '_GLCM_' + str(me)+'_asym_wsize=' +str(sizes[2])] = results1[2]
            new_dfg1[str(log1)+ '_GLCM_' + str(me)+'_asym_wsize=' +str(sizes[3])] = results1[3]
            new_dfg1[str(log1)+ '_GLCM_' + str(me)+'_asym_wsize=' +str(sizes[4])] = results1[4]
            new_dfg1[str(log1)+ '_GLCM_' + str(me)+'_asym_wsize=' +str(sizes[5])] = results1[5]
            new_dfg1[str(log1)+ '_GLCM_' + str(me)+'_asym_wsize=' +str(sizes[6])] = results1[6]
            new_dfg1[str(log1)+ '_GLCM_' + str(me)+'_asym_wsize=ave'] = mean_result1
                                      # append all rows of temporary dataframe to final dataframe 

    final_df_glcm1  = pd.concat([final_df_glcm1 , new_dfg1])

In [20]:
final_df_glcm1.describe()

Unnamed: 0,GR_GLCM_dissimilarity_asym_wsize=3,GR_GLCM_dissimilarity_asym_wsize=5,GR_GLCM_dissimilarity_asym_wsize=9,GR_GLCM_dissimilarity_asym_wsize=17,GR_GLCM_dissimilarity_asym_wsize=33,GR_GLCM_dissimilarity_asym_wsize=65,GR_GLCM_dissimilarity_asym_wsize=129,GR_GLCM_dissimilarity_asym_wsize=ave,GR_GLCM_energy_asym_wsize=3,GR_GLCM_energy_asym_wsize=5,...,PE_GLCM_energy_asym_wsize=129,PE_GLCM_energy_asym_wsize=ave,PE_GLCM_correlation_asym_wsize=3,PE_GLCM_correlation_asym_wsize=5,PE_GLCM_correlation_asym_wsize=9,PE_GLCM_correlation_asym_wsize=17,PE_GLCM_correlation_asym_wsize=33,PE_GLCM_correlation_asym_wsize=65,PE_GLCM_correlation_asym_wsize=129,PE_GLCM_correlation_asym_wsize=ave
count,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,...,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0
mean,3.841,3.8304,3.8161,3.8013,3.7688,3.7163,3.5909,3.7664,0.7195,0.5189,...,0.1568,0.3445,0.6434,0.6096,0.7566,0.8678,0.9222,0.9389,0.9451,0.8119
std,3.844,3.0058,2.281,1.6697,1.2653,1.1018,1.051,1.6288,0.0589,0.0627,...,0.1078,0.0439,0.766,0.4638,0.253,0.1222,0.0435,0.0233,0.0167,0.1804
min,0.0,0.0,0.25,0.625,0.625,0.5469,0.8281,0.5893,0.7071,0.5,...,0.0957,0.3158,-1.0,-0.9683,-0.4523,-0.1193,0.6801,0.8633,0.9019,0.1572
25%,1.5,1.5,1.875,2.5156,3.0938,3.0938,2.8594,2.5991,0.7071,0.5,...,0.1007,0.3199,1.0,0.3333,0.6614,0.8309,0.897,0.9228,0.9317,0.7408
50%,2.5,3.0,3.375,3.7188,3.8125,3.75,3.7305,3.5112,0.7071,0.5,...,0.1036,0.3258,1.0,0.8511,0.8492,0.899,0.9348,0.9442,0.9451,0.8938
75%,5.375,5.25,5.375,4.875,4.4062,4.4531,4.3516,4.6523,0.7071,0.5,...,0.1407,0.3454,1.0,0.953,0.9349,0.9462,0.9546,0.9573,0.9606,0.94
max,28.5,15.25,11.375,9.9375,7.5,5.8594,5.5,10.2176,1.0,1.0,...,0.5064,0.5659,1.0,1.0,0.9969,0.9953,0.9855,0.9771,0.9754,0.9793


###  3 - Concatenate results with input into a single numpy array, then make it into final dataframe

In [21]:
arr_final = (np.concatenate((val.values, final_df.values, final_df_glcm, final_df_glcm1), axis=1))
print np.shape(arr_final)
cols1 = list(val) + list(final_df) + list(final_df_glcm) + list(final_df_glcm1)
arr_final_df = pd.DataFrame(arr_final, columns=cols1)
#arr_final_df.describe()
#arr_final_df.dtypes

(830, 370)


In [22]:
lll2 = list(val)[3:] + list(final_df) + list(final_df_glcm) + list(final_df_glcm1)
for l2 in lll2:
    arr_final_df[l2] = arr_final_df[l2].astype('float64')
    
#arr_final_df['Facies'] = arr_final_df['Facies'].astype('int64')
arr_final_df['Formation'] = arr_final_df['Formation'].astype('category')
arr_final_df['Well Name'] = arr_final_df['Well Name'].astype('category')
arr_final_df['NM_M'] = arr_final_df['NM_M'].astype('int64')

In [23]:
arr_final_df.describe()

Unnamed: 0,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS,GR_mean_wsize=3,GR_mean_wsize=5,GR_mean_wsize=9,...,PE_GLCM_energy_asym_wsize=129,PE_GLCM_energy_asym_wsize=ave,PE_GLCM_correlation_asym_wsize=3,PE_GLCM_correlation_asym_wsize=5,PE_GLCM_correlation_asym_wsize=9,PE_GLCM_correlation_asym_wsize=17,PE_GLCM_correlation_asym_wsize=33,PE_GLCM_correlation_asym_wsize=65,PE_GLCM_correlation_asym_wsize=129,PE_GLCM_correlation_asym_wsize=ave
count,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,...,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0
mean,57.6117,0.6663,2.852,11.6553,3.6542,1.6783,0.5358,-0.0,0.0,0.0002,...,0.1568,0.3445,0.6434,0.6096,0.7566,0.8678,0.9222,0.9389,0.9451,0.8119
std,27.5277,0.2884,3.4421,5.1902,0.6498,0.4674,0.2831,0.9762,0.9367,0.8469,...,0.1078,0.0439,0.766,0.4638,0.253,0.1222,0.0435,0.0233,0.0167,0.1804
min,12.036,-0.468,-8.9,1.855,2.113,1.0,0.013,-1.614,-1.5656,-1.4971,...,0.0957,0.3158,-1.0,-0.9683,-0.4523,-0.1193,0.6801,0.8633,0.9019,0.1572
25%,36.7733,0.541,0.4113,7.7,3.1715,1.0,0.3,-0.752,-0.7202,-0.6895,...,0.1007,0.3199,1.0,0.3333,0.6614,0.8309,0.897,0.9228,0.9317,0.7408
50%,58.3445,0.675,2.3975,10.95,3.5155,2.0,0.5475,0.0208,0.0351,0.0256,...,0.1036,0.3258,1.0,0.8511,0.8492,0.899,0.9348,0.9442,0.9451,0.8938
75%,73.0515,0.8508,4.6,14.7938,4.1915,2.0,0.778,0.5432,0.5256,0.5107,...,0.1407,0.3454,1.0,0.953,0.9349,0.9462,0.9546,0.9573,0.9606,0.94
max,220.413,1.507,16.5,31.335,6.321,2.0,1.0,5.5963,5.0319,3.7101,...,0.5064,0.5659,1.0,1.0,0.9969,0.9953,0.9855,0.9771,0.9754,0.9793


In [24]:
arr_final_df.to_csv('engineered_features_validation_set.csv', sep=',',  index=False)

In [25]:
# based on SFS notebook
top70 = ['GR_GLCM_energy_asym_wsize=3', 'GR_GLCM_energy_asym_wsize=5', 'DeltaPHI', 'PHIND', 'NM_M', 'GR',
         'GR_GLCM_correlation_asym_wsize=3', 'GR_mean_wsize=5', 'GR_std_wsize=5',
         'ILD_log10_GLCM_dissimilarity_asym_wsize=65', 'GR_std_wsize=ave', 'ILD_log10_GLCM_energy_asym_wsize=3',
         'ILD_log10_GLCM_energy_asym_wsize=5', 'ILD_log10_GLCM_energy_asym_wsize=17', 
         'ILD_log10_GLCM_correlation_asym_wsize=3', 'ILD_log10_GLCM_correlation_asym_wsize=5', 
         'ILD_log10_GLCM_correlation_asym_wsize=ave', 'ILD_log10_std_wsize=3', 'RELPOS',
         'DeltaPHI_GLCM_energy_asym_wsize=3', 'DeltaPHI_GLCM_energy_asym_wsize=5',
         'DeltaPHI_GLCM_energy_asym_wsize=33', 'GR_GLCM_correlation_asym_wsize=5',
         'DeltaPHI_GLCM_correlation_asym_wsize=3', 'DeltaPHI_GLCM_correlation_asym_wsize=65', 
         'PE_GLCM_correlation_asym_wsize=3', 'PHIND_skew_wsize=5', 'PHIND_GLCM_energy_asym_wsize=3',
         'PHIND_GLCM_energy_asym_wsize=5', 'PHIND_GLCM_energy_asym_wsize=17', 'DeltaPHI_skew_wsize=ave', 
         'PHIND_mean_wsize=3', 'PHIND_mean_wsize=5', 'PHIND_mean_wsize=17', 'PHIND_std_wsize=5',
         'PE_GLCM_dissimilarity_asym_wsize=129', 'PHIND_std_wsize=ave', 'PE_GLCM_energy_asym_wsize=3',
         'PE_GLCM_energy_asym_wsize=5', 'PHIND_skew_wsize=17', 'PE_mean_wsize=3', 
         'PE_GLCM_correlation_asym_wsize=33', 'PE_mean_wsize=65', 'PE_std_wsize=5', 'PE_std_wsize=129',
         'PE_skew_wsize=3', 'PE_skew_wsize=9', 'GR_GLCM_dissimilarity_wsize=3', 'GR_GLCM_energy_wsize=5',
         'GR_GLCM_correlation_wsize=65', 'GR_GLCM_correlation_wsize=ave', 'ILD_log10_GLCM_dissimilarity_wsize=5',
         'ILD_log10_GLCM_correlation_wsize=3', 'ILD_log10_GLCM_correlation_wsize=5', 
         'ILD_log10_GLCM_correlation_wsize=17', 'ILD_log10_GLCM_correlation_wsize=129',
         'DeltaPHI_GLCM_correlation_wsize=5', 'DeltaPHI_GLCM_correlation_wsize=33',
         'PHIND_GLCM_dissimilarity_wsize=ave', 'PHIND_GLCM_energy_wsize=3', 'PE_GLCM_dissimilarity_wsize=3',
         'PHIND_GLCM_correlation_asym_wsize=3', 'PE_GLCM_dissimilarity_wsize=9', 'PE_GLCM_dissimilarity_wsize=17', 
         'PE_GLCM_energy_wsize=3', 'PE_GLCM_energy_wsize=5', 'GR_skew_wsize=5', 'GR_GLCM_dissimilarity_asym_wsize=3',
         'GR_GLCM_dissimilarity_asym_wsize=9', 'GR_GLCM_dissimilarity_asym_wsize=ave']

In [26]:
val = pd.read_csv(filename)
Xs = pd.concat([val[['Depth', 'Well Name', 'Formation']], arr_final_df[top70]], axis = 1)
print np.shape(Xs), list(Xs)

(830, 73) ['Depth', 'Well Name', 'Formation', 'GR_GLCM_energy_asym_wsize=3', 'GR_GLCM_energy_asym_wsize=5', 'DeltaPHI', 'PHIND', 'NM_M', 'GR', 'GR_GLCM_correlation_asym_wsize=3', 'GR_mean_wsize=5', 'GR_std_wsize=5', 'ILD_log10_GLCM_dissimilarity_asym_wsize=65', 'GR_std_wsize=ave', 'ILD_log10_GLCM_energy_asym_wsize=3', 'ILD_log10_GLCM_energy_asym_wsize=5', 'ILD_log10_GLCM_energy_asym_wsize=17', 'ILD_log10_GLCM_correlation_asym_wsize=3', 'ILD_log10_GLCM_correlation_asym_wsize=5', 'ILD_log10_GLCM_correlation_asym_wsize=ave', 'ILD_log10_std_wsize=3', 'RELPOS', 'DeltaPHI_GLCM_energy_asym_wsize=3', 'DeltaPHI_GLCM_energy_asym_wsize=5', 'DeltaPHI_GLCM_energy_asym_wsize=33', 'GR_GLCM_correlation_asym_wsize=5', 'DeltaPHI_GLCM_correlation_asym_wsize=3', 'DeltaPHI_GLCM_correlation_asym_wsize=65', 'PE_GLCM_correlation_asym_wsize=3', 'PHIND_skew_wsize=5', 'PHIND_GLCM_energy_asym_wsize=3', 'PHIND_GLCM_energy_asym_wsize=5', 'PHIND_GLCM_energy_asym_wsize=17', 'DeltaPHI_skew_wsize=ave', 'PHIND_mean_wsiz

In [27]:
Xs.to_csv('engineered_features_validation_set_top70.csv', sep=',',  index=False)