# 02 - Facies Classification using TPOT
### George Crowther - https://www.linkedin.com/in/george-crowther-9669a931?trk=hp-identity-name

In this second attempt, I've updated some of the feature engineering before re-training an extra trees classifier on the data

# 1. Data Loading and Initial Observations

In [1]:
# Initial imports for reading data and first observations
import pandas as pd
import bokeh.plotting as bk
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from tpot import TPOTClassifier

import sys
sys.path.append(r'C:\Users\george.crowther\Documents\Python\Projects\2016-ml-contest-master')
from classification_utilities import display_cm, display_adj_cm

bk.output_notebook()

In [2]:
# Input file paths
train_path = r'..\training_data.csv'
test_path = r'.\validation_data_nofacies.csv'

# Read training data to dataframe
train = pd.read_csv(train_path)

# TPOT library requires that the target class is renamed to 'class'
train.rename(columns={'Facies': 'class'}, inplace=True)

In [6]:
train.head()

Unnamed: 0,class,Formation,Well Name,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS
0,3,A1 SH,SHRIMPLIN,2793.0,77.45,0.664,9.9,11.915,4.6,1,1.0
1,3,A1 SH,SHRIMPLIN,2793.5,78.26,0.661,14.2,12.565,4.1,1,0.979
2,3,A1 SH,SHRIMPLIN,2794.0,79.05,0.658,14.8,13.05,3.6,1,0.957
3,3,A1 SH,SHRIMPLIN,2794.5,86.1,0.655,13.9,13.115,3.5,1,0.936
4,3,A1 SH,SHRIMPLIN,2795.0,74.58,0.647,13.5,13.3,3.4,1,0.915


In [7]:
train.describe()

Unnamed: 0,class,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS
count,3232.0,3232.0,3232.0,3232.0,3232.0,3232.0,3232.0,3232.0,3232.0
mean,4.42203,2875.824567,66.135769,0.642719,3.559642,13.483213,3.725014,1.498453,0.520287
std,2.504243,131.006274,30.854826,0.241845,5.228948,7.69898,0.896152,0.500075,0.286792
min,1.0,2573.5,13.25,-0.025949,-21.832,0.55,0.2,1.0,0.01
25%,2.0,2791.0,46.91875,0.49275,1.16375,8.34675,3.1,1.0,0.273
50%,4.0,2893.5,65.7215,0.624437,3.5,12.15,3.5515,1.0,0.526
75%,6.0,2980.0,79.62625,0.812735,6.4325,16.45375,4.3,2.0,0.76725
max,9.0,3122.5,361.15,1.48,18.6,84.4,8.094,2.0,1.0


# Feature Engineering and Creation
Again, as with the previous result, the method here is somewhat brute-force, looking at the differences between each sample and it's formation mean/median, its above formation lower sample and below formation upper sample. There could definitely be more metrics, and undoubtedly more informed metrics to pull in this manner, these are arguably somewhat naieve.

In [24]:
def feature_extraction(train):
    #------------------------------------
    # Split and separate formation names into 
    for i, value in enumerate(train.Formation.unique()):
        name_a = value.split(' ')[0]
        name_b = value.split(' ')[1]
        if name_a not in train.columns:
            train[name_a] = 0
        if name_b not in train.columns:
            train[name_b] = 0

        train.loc[train.Formation == value, name_a] = 1
        train.loc[train.Formation == value, name_b] = 1
    #------------------------------------
    # Replace formation names with values
    for i, value in enumerate(train['Formation'].unique()):
        train.loc[train['Formation'] == value, 'Formation'] = i

    #------------------------------------
    # Going to take the difference of each sample from the formation mean and median for each well for each measured parameter
    # This will add a 0 value column for each potential value
    columns = ['Formation', 'Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']

    above_columns = ['above_delta_' + col for col in columns]
    below_columns = ['below_delta_' + col for col in columns]
    formation_columns = ['formation_delta_' + col for col in columns]
    formation_med_columns = ['formation_delta_med_' + col for col in columns]

    def add_empty_columns(df, column_list):
        for column in column_list:
            df[column] = 0

    for column_list in [above_columns, below_columns, formation_columns, formation_med_columns]:
        add_empty_columns(train, column_list)

    #-------------------------------------------
    # Group data by well, sort by depth, then groupby formation
    # Take mean, median, top and bottom (by depth) values for each sub group
    # Add feature which is the difference of the sample from the mean for each formation and its adjacent formations
    # TBD - un-log 'ILD log10' prior to mean, the re-log
    for i, group in train.groupby('Well Name'):
        iteration = 0
        sorted_group = group.sort_values('Depth')
        for j, sub_group in sorted_group.groupby('Formation'):

            means = sub_group[columns].mean()
            medians = sub_group[columns].median()
            top = sub_group.iloc[0][columns]

            if iteration == 0:
                above_group = sub_group
            else:
                above_means = above_group[columns].mean()
                above_bottom = above_group.iloc[-1][columns]
                train.loc[sub_group.index, above_columns] = (train.loc[sub_group.index, columns] - above_bottom).values
                train.loc[above_group.index, below_columns] = (train.loc[sub_group.index, columns] - top).values

            train.loc[sub_group.index, formation_columns] = (train.loc[sub_group.index, columns] - means).values
            train.loc[sub_group.index, formation_med_columns] = (train.loc[sub_group.index, columns] - medians).values

            above_group = sub_group
            iteration += 1
    
    return train

In [15]:
facies_labels = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS',
                 'WS', 'D','PS', 'BS']
model_columns = train.columns[11:]

# 4. TPOT

TPOT uses a genetic algorithm to tune model parameters for the most effective fit. This can take quite a while to process if you want to re-run this part!

In [18]:
# Input file paths
train_path = r'..\training_data.csv'

# Read training data to dataframe
train = pd.read_csv(train_path)

# TPOT library requires that the target class is renamed to 'class'
train.rename(columns={'Facies': 'class'}, inplace=True)

train = feature_extraction(train)

In [8]:
alt_model_columns = ['GR', 'ILD_log10',
       'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS', 'A1', 'SH', 'LM', 'B1',
       'B2', 'B3', 'B4', 'B5', 'C', 'above_delta_Formation',
       'above_delta_Depth', 'above_delta_GR', 'above_delta_ILD_log10',
       'above_delta_DeltaPHI', 'above_delta_PHIND', 'above_delta_PE',
       'above_delta_NM_M', 'above_delta_RELPOS', 'below_delta_Formation',
       'below_delta_Depth', 'below_delta_GR', 'below_delta_ILD_log10',
       'below_delta_DeltaPHI', 'below_delta_PHIND', 'below_delta_PE',
       'below_delta_NM_M', 'below_delta_RELPOS', 'formation_delta_Formation',
       'formation_delta_Depth', 'formation_delta_GR',
       'formation_delta_ILD_log10', 'formation_delta_DeltaPHI',
       'formation_delta_PHIND', 'formation_delta_PE', 'formation_delta_NM_M',
       'formation_delta_RELPOS', 'formation_delta_med_Formation',
       'formation_delta_med_Depth', 'formation_delta_med_GR',
       'formation_delta_med_ILD_log10', 'formation_delta_med_DeltaPHI',
       'formation_delta_med_PHIND', 'formation_delta_med_PE',
       'formation_delta_med_NM_M', 'formation_delta_med_RELPOS']

In [9]:
#-------------------------------
# Z-scale normalisation of features. 
# Should probably exclude boolean features from normalisation, though should make nominal difference.
std_scaler = preprocessing.StandardScaler().fit(train[alt_model_columns])
norm = std_scaler.transform(train[alt_model_columns])

norm_frame = train
for i, column in enumerate(alt_model_columns):
    norm_frame.loc[:, column] = norm[:, i]

train = norm_frame

In [155]:
train[alt_model_columns].describe()

Unnamed: 0,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS,A1,SH,LM,...,formation_delta_RELPOS,formation_delta_med_Formation,formation_delta_med_Depth,formation_delta_med_GR,formation_delta_med_ILD_log10,formation_delta_med_DeltaPHI,formation_delta_med_PHIND,formation_delta_med_PE,formation_delta_med_NM_M,formation_delta_med_RELPOS
count,3232.0,3232.0,3232.0,3232.0,3232.0,3232.0,3232.0,3232.0,3232.0,3232.0,...,3232.0,3232.0,3232.0,3232.0,3232.0,3232.0,3232.0,3232.0,3232.0,3232.0
mean,3.209754e-16,4.484861e-16,-3.5175380000000005e-17,-1.363046e-16,5.100431e-16,-3.5175380000000005e-17,1.429e-16,9.233538e-17,-1.934646e-16,-7.035077000000001e-17,...,-4.232038e-17,0.0,1.3190770000000001e-17,3.0778460000000004e-17,-8.793846000000001e-18,-5.496154e-19,-1.7587690000000003e-17,5.496154e-18,3.0778460000000004e-17,2.9679230000000005e-17
std,1.000155,1.000155,1.000155,1.000155,1.000155,1.000155,1.000155,1.000155,1.000155,1.000155,...,1.000155,0.0,1.000155,1.000155,1.000155,1.000155,1.000155,1.000155,1.000155,1.000155
min,-1.714285,-2.765296,-4.856726,-1.680121,-3.934108,-0.9969107,-1.779569,-0.5625806,-0.9956777,-1.004341,...,-2.494438,0.0,-11.38142,-2.71573,-4.029226,-6.260693,-4.250182,-5.731779,-8.74432,-2.549279
25%,-0.6229169,-0.6202015,-0.4582685,-0.6672647,-0.6975496,-0.9969107,-0.8623869,-0.5625806,-0.9956777,-1.004341,...,-0.8358484,0.0,-0.5487312,-0.4287593,-0.4695474,-0.3941843,-0.4363377,-0.4660069,0.03258753,-0.8335645
50%,-0.01342848,-0.07560721,-0.01140783,-0.1731943,-0.193651,-0.9969107,0.01992191,-0.5625806,-0.9956777,0.9956777,...,-0.002351765,0.0,0.01056513,-0.1604354,0.003090394,0.01516462,-0.1458224,0.0545273,0.03258753,-0.00418404
75%,0.437292,0.7031033,0.5494992,0.3858948,0.6417158,1.003099,0.8612539,-0.5625806,1.004341,0.9956777,...,0.8400663,0.0,0.5698615,0.1757048,0.5204175,0.4910436,0.2205165,0.5211649,0.03258753,0.8323156
max,9.562844,3.462598,2.876809,9.212618,4.876027,1.003099,1.672943,1.777523,1.004341,0.9956777,...,2.495892,0.0,9.81297,11.10082,3.929107,4.480679,11.27884,5.779237,8.809495,2.87907


In [10]:
#------------------------------------
# Train test split
alt_train_f, alt_test_f = train_test_split(train, test_size = 0.1, 
                                   random_state = 68)

In [12]:
# Setup TPOT classifier and train
alt_tpot = TPOTClassifier(verbosity = 2, generations = 5, max_eval_time_mins = 60)
alt_tpot.fit(alt_train_f[alt_model_columns], alt_train_f['class'])

Optimization Progress:  17%|█████████████████████▌                                                                                                           | 100/600 [26:17<7:20:59, 52.92s/pipeline]

Generation 1 - Current best internal CV score: 0.9118500821427303


Optimization Progress:  32%|████████████████████████████████████████▋                                                                                      | 192/600 [1:00:59<2:45:30, 24.34s/pipeline]

Generation 2 - Current best internal CV score: 0.9118500821427303


Optimization Progress:  48%|█████████████████████████████████████████████████████████████▍                                                                 | 290/600 [1:38:36<2:25:11, 28.10s/pipeline]

Generation 3 - Current best internal CV score: 0.91286129457966


Optimization Progress:  65%|██████████████████████████████████████████████████████████████████████████████████▏                                            | 388/600 [2:12:31<1:02:33, 17.71s/pipeline]

Generation 4 - Current best internal CV score: 0.91286129457966


Optimization Progress:  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▎                       | 490/600 [2:52:00<41:39, 22.72s/pipeline]

Generation 5 - Current best internal CV score: 0.91286129457966





Best pipeline: ExtraTreesClassifier(input_matrix, 41, 0.47999999999999998)


In [22]:
print(alt_tpot.score(alt_test_f[alt_model_columns], alt_test_f['class']))
alt_tpot.export('02 contest_export.py')

0.911214630264


In [49]:
result = alt_tpot.predict(train[alt_model_columns])

conf = confusion_matrix(train['class'], result)
display_cm(conf, facies_labels, hide_zeros=True, display_metrics = True)

def accuracy(conf):
    total_correct = 0.
    nb_classes = conf.shape[0]
    for i in np.arange(0,nb_classes):
        total_correct += conf[i][i]
    acc = total_correct/sum(sum(conf))
    return acc

print(accuracy(conf))

adjacent_facies = np.array([[1], [0,2], [1], [4], [3,5], [4,6,7], [5,7], [5,6,8], [6,7]])

def accuracy_adjacent(conf, adjacent_facies):
    nb_classes = conf.shape[0]
    total_correct = 0.
    for i in np.arange(0,nb_classes):
        total_correct += conf[i][i]
        for j in adjacent_facies[i]:
            total_correct += conf[i][j]
    return total_correct / sum(sum(conf))

print(accuracy_adjacent(conf, adjacent_facies))

     Pred    SS  CSiS  FSiS  SiSh    MS    WS     D    PS    BS Total
     True
       SS   257     2                                             259
     CSiS     1   730     7                                       738
     FSiS           6   608     1                                 615
     SiSh                     182           2                     184
       MS           1           2   209     3           2         217
       WS                       3     1   455           3         462
        D                       2                96                98
       PS                       1     2     7     1   486     1   498
       BS                                               1   160   161

Precision  1.00  0.99  0.99  0.95  0.99  0.97  0.99  0.99  0.99  0.98
   Recall  0.99  0.99  0.99  0.99  0.96  0.98  0.98  0.98  0.99  0.98
       F1  0.99  0.99  0.99  0.97  0.97  0.98  0.98  0.98  0.99  0.98
0.984839108911
0.995668316832


# Workflow for Test Data
All the code below here can be re-run to load the model, fit it and predict on the test dataset.

In [40]:
test_path = r'..\validation_data_nofacies.csv'

# Read training data to dataframe
test = pd.read_csv(test_path)

# Rename 'Facies'
test.rename(columns={'Facies': 'class'}, inplace=True)

frame = feature_extraction(test)

In [41]:
frame.describe()

Unnamed: 0,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS,A1,SH,...,formation_delta_RELPOS,formation_delta_med_Formation,formation_delta_med_Depth,formation_delta_med_GR,formation_delta_med_ILD_log10,formation_delta_med_DeltaPHI,formation_delta_med_PHIND,formation_delta_med_PE,formation_delta_med_NM_M,formation_delta_med_RELPOS
count,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,...,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0
mean,2987.070482,57.61173,0.666312,2.851964,11.655277,3.654178,1.678313,0.535807,0.266265,0.318072,...,6.420567e-18,0.0,0.010843,3.6024,-0.006436,0.078524,0.47241,0.000482,-0.003614,0.002496
std,94.391925,27.52774,0.288367,3.442074,5.190236,0.649793,0.467405,0.283062,0.442271,0.466009,...,0.2765885,0.0,8.945698,24.054012,0.209007,2.378272,4.271225,0.471993,0.104132,0.276883
min,2808.0,12.036,-0.468,-8.9,1.855,2.113,1.0,0.013,0.0,0.0,...,-0.4959667,0.0,-25.0,-40.3,-0.718,-12.7,-14.395,-2.433,-1.0,-0.506
25%,2911.625,36.77325,0.541,0.41125,7.7,3.1715,1.0,0.3,0.0,0.0,...,-0.2322438,0.0,-4.5,-8.2335,-0.09875,-1.2375,-1.94,-0.232125,0.0,-0.23
50%,2993.75,58.3445,0.675,2.3975,10.95,3.5155,2.0,0.5475,0.0,0.0,...,-0.0001416603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3055.375,73.0515,0.85075,4.6,14.79375,4.1915,2.0,0.778,1.0,1.0,...,0.2336599,0.0,4.5,10.198,0.09675,1.2925,2.70375,0.2415,0.0,0.236375
max,3160.5,220.413,1.507,16.5,31.335,6.321,2.0,1.0,1.0,1.0,...,0.5172903,0.0,25.0,176.675,0.824,8.05,27.76,1.884,1.0,0.562


In [42]:
alt_model_columns = ['GR', 'ILD_log10',
       'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS', 'A1', 'SH', 'LM', 'B1',
       'B2', 'B3', 'B4', 'B5', 'C', 'above_delta_Formation',
       'above_delta_Depth', 'above_delta_GR', 'above_delta_ILD_log10',
       'above_delta_DeltaPHI', 'above_delta_PHIND', 'above_delta_PE',
       'above_delta_NM_M', 'above_delta_RELPOS', 'below_delta_Formation',
       'below_delta_Depth', 'below_delta_GR', 'below_delta_ILD_log10',
       'below_delta_DeltaPHI', 'below_delta_PHIND', 'below_delta_PE',
       'below_delta_NM_M', 'below_delta_RELPOS', 'formation_delta_Formation',
       'formation_delta_Depth', 'formation_delta_GR',
       'formation_delta_ILD_log10', 'formation_delta_DeltaPHI',
       'formation_delta_PHIND', 'formation_delta_PE', 'formation_delta_NM_M',
       'formation_delta_RELPOS', 'formation_delta_med_Formation',
       'formation_delta_med_Depth', 'formation_delta_med_GR',
       'formation_delta_med_ILD_log10', 'formation_delta_med_DeltaPHI',
       'formation_delta_med_PHIND', 'formation_delta_med_PE',
       'formation_delta_med_NM_M', 'formation_delta_med_RELPOS']

std_scaler = preprocessing.StandardScaler().fit(frame[alt_model_columns])
norm = std_scaler.transform(frame[alt_model_columns])

norm_frame = frame
for i, column in enumerate(alt_model_columns):
    norm_frame.loc[:, column] = norm[:, i]

frame = norm_frame
frame.describe()

Unnamed: 0,Depth,GR,ILD_log10,DeltaPHI,PHIND,PE,NM_M,RELPOS,A1,SH,...,formation_delta_RELPOS,formation_delta_med_Formation,formation_delta_med_Depth,formation_delta_med_GR,formation_delta_med_ILD_log10,formation_delta_med_DeltaPHI,formation_delta_med_PHIND,formation_delta_med_PE,formation_delta_med_NM_M,formation_delta_med_RELPOS
count,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,...,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0
mean,2987.070482,-7.704680000000001e-17,-1.198506e-16,-3.424302e-17,-3.210283e-16,2.739442e-16,1.369721e-16,-1.112898e-16,-1.027291e-16,5.1364540000000006e-17,...,8.560756e-18,0.0,-3.424302e-17,1.712151e-17,-1.070094e-17,2.1401890000000002e-17,2.1401890000000002e-17,7.490661e-18,8.774775000000001e-17,-3.424302e-17
std,94.391925,1.000603,1.000603,1.000603,1.000603,1.000603,1.000603,1.000603,1.000603,1.000603,...,1.000603,0.0,1.000603,1.000603,1.000603,1.000603,1.000603,1.000603,1.000603,1.000603
min,2808.0,-1.656627,-3.935939,-3.416269,-1.889353,-2.373228,-1.452107,-1.848086,-0.6024035,-0.6829576,...,-1.794238,0.0,-2.797537,-1.82626,-3.406552,-5.376268,-3.482929,-5.158872,-9.574299,-1.837613
25%,2911.625,-0.7574558,-0.4348191,-0.7095099,-0.7625206,-0.7432663,-1.452107,-0.8335616,-0.6024035,-0.6829576,...,-0.8401789,0.0,-0.5045513,-0.4923518,-0.4419448,-0.5536867,-0.5651456,-0.4931162,0.03473144,-0.8401996
50%,2993.75,0.02663538,0.03014625,-0.1321116,-0.1359673,-0.2135479,0.6886546,0.04133311,-0.6024035,-0.6829576,...,-0.0005124788,0.0,-0.001212864,-0.1498533,0.0308125,-0.03303719,-0.1106695,-0.001021665,0.03473144,-0.009021482
75%,3055.375,0.5612186,0.6399796,0.5081501,0.6050525,0.8274105,0.6886546,0.8561341,1.660017,1.46422,...,0.845302,0.0,0.5021255,0.2743649,0.493995,0.5107522,0.5227272,0.5109474,0.03473144,0.8451947
max,3160.5,5.917647,2.917095,3.967453,3.793968,4.106583,0.6886546,1.640888,1.660017,1.46422,...,1.87138,0.0,2.795112,7.199504,3.975643,3.353814,6.392555,3.992973,9.643762,2.021944


In [43]:
#--------------------------------------
# TPOT Exported Model

from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

exported_pipeline = make_pipeline(
    ExtraTreesClassifier(criterion="entropy", max_features=0.48, n_estimators=500)
)

exported_pipeline.fit(train[alt_model_columns], train['class'])

Pipeline(steps=[('extratreesclassifier', ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=None, max_features=0.48, max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False))])

In [44]:
frame['Facies'] = exported_pipeline.predict(frame[alt_model_columns])

In [52]:
frame['Facies']

0      3
1      3
2      3
3      3
4      3
5      3
6      3
7      3
8      3
9      2
10     2
11     2
12     2
13     2
14     2
15     2
16     2
17     2
18     2
19     2
20     2
21     2
22     2
23     2
24     2
25     2
26     2
27     2
28     2
29     2
      ..
800    7
801    9
802    7
803    8
804    6
805    6
806    8
807    8
808    6
809    6
810    8
811    8
812    3
813    3
814    3
815    3
816    3
817    3
818    3
819    3
820    3
821    3
822    3
823    3
824    3
825    3
826    3
827    3
828    3
829    3
Name: Facies, dtype: int64

In [46]:
frame.to_csv('02 - Well Facies Prediction - Test Data Set.csv')