In [1]:
# probably not good to get into the habit of ignorning all of the RuntimeWarnings
# but it keeps things cleaner for now

import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [2]:
import numpy as np
import scipy as sp
from coremdlr.datasets import WellLoader, FaciesDataset
from coremdlr.models import FeaturePredictor, LambdaModel

Using TensorFlow backend.


In [3]:
# Note: for `reduce_function` to work as written, we need single-channel pGR

fdset = FaciesDataset(["205-21b-3", "204-24a-6", "204-20-6a", ],
                    test_wells=["204-19-6"],
                    features=["pseudoGR", "logs"],
                    pseudoGR_args={'features':['mean'], 'per_channel':False},
                    label_resolution=32)

fdset.load_or_generate_data()

Loading Well:  205-21b-3  from  /home/ross/Dropbox/core_data/facies/train_data
Extracted pGR features:  ['Umean']


Header section Parameter regexp=~P was not found.


Adding NaN log:  SP
Adding NaN log:  DTS
Feature shapes:  [('depth', (3842,)), ('top', (3842,)), ('base', (3842,)), ('pseudoGR', (3842, 32, 1)), ('logs', (3842, 11))]
Loading Well:  204-24a-6  from  /home/ross/Dropbox/core_data/facies/train_data
Extracted pGR features:  ['Umean']


Header section Parameter regexp=~P was not found.


Adding NaN log:  SP
Adding NaN log:  DTS1
Adding NaN log:  DTS2
Feature shapes:  [('depth', (13006,)), ('top', (13006,)), ('base', (13006,)), ('pseudoGR', (13006, 32, 1)), ('logs', (13006, 11))]
Loading Well:  204-20-6a  from  /home/ross/Dropbox/core_data/facies/train_data
Extracted pGR features:  ['Umean']


Header section Parameter regexp=~P was not found.


Adding NaN log:  SP
Adding NaN log:  DTS1
Adding NaN log:  DTS2
Feature shapes:  [('depth', (3542,)), ('top', (3542,)), ('base', (3542,)), ('pseudoGR', (3542, 32, 1)), ('logs', (3542, 11))]
Loading Well:  204-19-6  from  /home/ross/Dropbox/core_data/facies/train_data
Extracted pGR features:  ['Umean']


Header section Parameter regexp=~P was not found.


Adding NaN log:  DTS
Feature shapes:  [('depth', (1884,)), ('top', (1884,)), ('base', (1884,)), ('pseudoGR', (1884, 32, 1)), ('logs', (1884, 11))]


In [4]:
fdset.X_train

{'pseudoGR': array([[[-1.43032357],
         [-1.44530056],
         [-1.44356918],
         ...,
         [-1.43711217],
         [-1.43515969],
         [-1.43175723]],
 
        [[-1.43999973],
         [-1.43694975],
         [-1.42513606],
         ...,
         [-1.28338159],
         [-1.27688571],
         [-1.2691924 ]],
 
        [[-1.25704702],
         [-1.25234698],
         [-1.25405517],
         ...,
         [-0.87033114],
         [-0.84970345],
         [-0.84871894]],
 
        ...,
 
        [[-1.12578242],
         [-1.05124625],
         [-0.93005796],
         ...,
         [-0.48612563],
         [-0.462301  ],
         [-0.45574232]],
 
        [[-0.43443013],
         [-0.41625653],
         [-0.41264419],
         ...,
         [-0.13868142],
         [-0.09966698],
         [-0.05836392]],
 
        [[-0.07205497],
         [-0.10052565],
         [-0.06141172],
         ...,
         [-0.16115686],
         [-0.13511807],
         [-0.11393803]]]),
 'logs'

In [17]:
import hyperopt
from hyperopt import hp
from hyperopt.pyll.base import scope

from sklearn.metrics import f1_score, log_loss
from sklearn.utils.class_weight import compute_sample_weight

from scipy.stats import mstats


# for balanced training and log_loss computation
train_sample_weights = compute_sample_weight('balanced', fdset.y_train)
test_sample_weights = compute_sample_weight('balanced', fdset.y_test)

# feat_names = ['mean', 'median', 'hmean', 'gmean', 'var', 'IF_0', 'IF_1', 'Chi2', 'p-val']
feat_names = ['mean', 'median', 'hmean', 'gmean', 'var', 'IF_0', 'IF_1']

def reduce_function(x):
    
    feats = []
    # Clean up `x`
    x = np.squeeze(x) 
    x = np.ma.masked_invalid(x)
    x = np.ma.masked_less_equal(x, 0.1)
    
    # central tendency
    feats.append(np.mean(x, axis=-1))
    feats.append(np.median(x, axis=-1))
    feats.append(mstats.hmean(x, axis=-1))
    feats.append(mstats.gmean(x, axis=-1))
    
    # dispersion
    feats.append(np.var(x, axis=-1))
    ideal_fourths = mstats.idealfourths(x, axis=-1)
    feats.append(ideal_fourths[:, 0])
    feats.append(ideal_fourths[:, 1])
    
    # TODO: curvature

    x_feats = np.array(feats).T
    
    return x_feats


feat_model_args = {
    # NOTE: key needs to be feature name AND feature must be specified in model_args
    'pseudoGR': {
        'model' : 'LambdaModel',
        'model_args' : {
            'feature' : 'pseudoGR',
            'lambda_fn' : reduce_function
        }
    }
}


# NOTE: change `gpu_id` if your machine is different
XGB_SEARCH_SPACE = {
    'model_type' : 'XGB',
    'max_depth' : scope.int(hp.quniform('max_depth', 3, 10, 1)),
    'learning_rate' : hp.uniform('learning_rate', 0.01, 0.2),
    'n_estimators' : scope.int(hp.quniform('n_estimators', 10, 1000, 1)),
    'objective' : 'multi:softprob',
    'n_jobs' : 2,
    'gamma' : hp.uniform('gamma', 0, 0.5),
    'subsample' : hp.uniform('subsample', 0.3, 1),
    'colsample_bytree' : hp.uniform('colsample_bytree', 0.3, 1.0),
    'colsample_bylevel' : 1,
    'reg_alpha' : 0,                                    # L1 penalty
    'reg_lambda' : hp.uniform('reg_lambda', 0.1, 10),   # L2 penalty
    'tree_method' : 'gpu_hist',
    #'gpu_id' : 0
}


def train_xgb_model(model_config):
    
    xgb_predictor = FeaturePredictor(fdset, 
                                     model_args=model_config, 
                                     feature_model_args=feat_model_args)
    
    test_acc = xgb_predictor.fit(fdset, verbose=False, sample_weight=train_sample_weights)
    
    y_pred = xgb_predictor.predict(fdset.X_test)
    
    # Note: have to specify labels here sometimes
    print('F1 score:', f1_score(fdset.y_test, y_pred, labels=list(range(5)), average='macro'))
    
    return log_loss(fdset.y_test, xgb_predictor.predict_proba(fdset.X_test),
                    labels=list(range(5)), sample_weight=test_sample_weights)

In [18]:
best_params = hyperopt.fmin(
    fn=train_xgb_model,
    space=XGB_SEARCH_SPACE,
    algo=hyperopt.rand.suggest,
    max_evals=25
)

Training model for feature:                         
pseudoGR                                            
F1 score:                                           
0.25185189887950343                                 
Training model for feature:                                                 
pseudoGR                                                                    
F1 score:                                                                   
0.2280621336912208                                                          
Training model for feature:                                                  
pseudoGR                                                                     
F1 score:                                                                    
0.23629801055059135                                                          
Training model for feature:                                                  
pseudoGR                                                                     
F1 score:   

In [19]:
best_params

{'colsample_bytree': 0.8014369069545624,
 'gamma': 0.056814642055057374,
 'learning_rate': 0.09204883242094557,
 'max_depth': 5.0,
 'n_estimators': 147.0,
 'reg_lambda': 3.135589972892407,
 'subsample': 0.8449117868256566}

In [20]:
# Error below is because test well doesn't have any 'oilstained'
# I don't really want to fix this everywhere, can we just use test sets with every class?

params = {**XGB_SEARCH_SPACE, 
          **best_params, 
          **{'max_depth': int(best_params['max_depth']), 'n_estimators': int(best_params['n_estimators'])}}
                              
xgb_predictor = FeaturePredictor(fdset, model_args=params, feature_model_args=feat_model_args)
xgb_predictor.fit(fdset, verbose=True)

imps = list(zip(feat_names, xgb_predictor.model.feature_importances_))
imps.sort(key = lambda p: p[1])
print()
[print(pair) for pair in imps[::-1]]

Training model for feature:  pseudoGR


ValueError: Number of classes, 4, does not match size of target_names, 5. Try specifying the labels parameter

In [None]:
np.bincount(xgb_predictor.predict(fdset.X_test))

In [None]:
xgb_predictor.predict_proba(fdset.X_test)

In [None]:
df = xgb_predictor.save_preds(fdset.test_well_names[0])
# ABOVE USES UNSCALED DATA! FIX THAT!

df['y_pred'] = xgb_predictor.predict(fdset.X_test)
df[['proba_0','proba_1','proba_2','proba_3']] = xgb_predictor.predict_proba(fdset.X_test)

In [None]:
df

In [None]:
df.plot(x='top', y='GR', figsize=(15,5))

In [None]:
df.plot(x='top', y='pseudoGR', figsize=(15,5))

In [None]:
df.plot(x='top', y='pseudoGR', kind='scatter', figsize=(15,5))

## Thoughts

Maybe we should do a little more work on filling in or masking the pseudo gamma. It's computed from the min-max normalized image (hence the 0-1 range), but maybe we should also standardize it after computing (probably less important for XGB model, but may help quite a bit for the networks which currently have comparable performance).

In [None]:
df['normed_pGR'] = (df.pseudoGR - df.pseudoGR.mean()) / df.pseudoGR.std()
df.plot(x='top', y=['normed_pGR','GR'], figsize=(15,5), ylim=[-2,2])

## Looking at regression estimate / error / confidence

In [None]:
import seaborn as sns

df['regression_error'] = (df.regression - df.y_true.astype(float)).abs()
sns.scatterplot(x='regression_error', y='confidence', hue='y_true', data=df)

#df.plot(x='regression_error', y='confidence', kind='scatter',  figsize=(15,5))

In [None]:
import seaborn as sns

df['abs_class_distance'] = (df.y_true - df.y_pred).abs()
sns.violinplot(x='abs_class_distance', y='confidence', data=df)

In [None]:
df['class_distance'] = (df.y_true - df.y_pred)
sns.violinplot(x='class_distance', y='confidence', data=df)

## Confusion Matrices

**NOTE**: probably going to move the analysis visualization functions into a different submodule.

In [None]:
from sklearn.metrics import confusion_matrix
from coremdlr.facies.models_utils import make_confusion_fig

In [None]:
classes = fdset.classes
cm = confusion_matrix(df.y_true.values, df.y_pred.values)

# updated function below to have no grid by default
make_confusion_fig(cm, classes)

In [None]:
make_confusion_fig(cm, classes, normalize=True)
