# Testing XGB on Psuedo Gamma datasets 

In [1]:
import numpy as np
import scipy as sp
from corebreakout.facies.datasets import WellLoader, FaciesDataset
from corebreakout.facies.models import FeaturePredictor, LambdaModel

Using TensorFlow backend.


### Picking your training and testing wells

In [2]:
fdset = FaciesDataset(["205-21b-3", "204-24a-6", "204-20-6a"],
                    test_wells=["204-19-6"],
                    features=["pseudoGR"],
                    label_resolution=32)

fdset.load_or_generate_data()

Loading Well:  205-21b-3


  pseudoGR = np.apply_along_axis(lambda x: np.nanmean(x[x.nonzero()]), 1, uimg)


Feature shapes:  [('depth', (3843,)), ('pseudoGR', (3843, 32))]
Loading Well:  204-24a-6
Feature shapes:  [('depth', (12994,)), ('pseudoGR', (12994, 32))]
Loading Well:  204-20-6a
Feature shapes:  [('depth', (3877,)), ('pseudoGR', (3877, 32))]
Loading Well:  204-19-6
Feature shapes:  [('depth', (1882,)), ('pseudoGR', (1882, 32))]


In [3]:
import hyperopt
from hyperopt import hp
from hyperopt.pyll.base import scope
from sklearn.metrics import f1_score, log_loss
from sklearn.utils.class_weight import compute_sample_weight

from scipy.stats import mstats


# for balanced log_loss computation
sample_weights = compute_sample_weight('balanced', fdset.y_test) 

# feat_names = ['mean', 'median', 'hmean', 'gmean', 'var', 'IF_0', 'IF_1', 'Chi2', 'p-val']
feat_names = ['mean', 'median', 'hmean', 'gmean', 'var', 'IF_0', 'IF_1']

def reduce_function(x):
    feats = []
    x = np.ma.masked_invalid(x)
    x = np.ma.masked_less_equal(x, 0.0)
    feats.append(np.mean(x, axis=-1))
    feats.append(np.median(x, axis=-1))
    feats.append(mstats.hmean(x, axis=-1))
    feats.append(mstats.gmean(x, axis=-1))
    feats.append(np.var(x, axis=-1))
    
    # feats.append(sp.signal.qspline1d(x))
    ideal_fourths = mstats.idealfourths(x, axis=-1)
    feats.append(ideal_fourths[:, 0])
    feats.append(ideal_fourths[:, 1])
    
    print(x.shape)
    
    #normal_test = mstats.normaltest(x, axis=1)
    #feats.append(normal_test[0])
    #feats.append(normal_test[1])
    # kur_test = mstats.kurtosistest(x, axis =-1)
    # feats.append(kurtosistest[:, 0])
    # feats.append(kurtosistest[:, 1])
    
    x_feats = np.array(feats).T
    return x_feats


feat_model_args = {
    # NOTE: key needs to be feature name AND feature must be specified in model_args
    'pseudoGR': {
        'model' : 'LambdaModel',
        'model_args' : {
            'feature' : 'pseudoGR',
            'lambda_fn' : reduce_function
        }
    }
}

XGB_SEARCH_SPACE = {
    'model_type' : 'XGB',
    'max_depth' : scope.int(hp.quniform('max_depth', 3, 10, 1)),
    'learning_rate' : hp.uniform('learning_rate', 0.01, 0.2),
    'n_estimators' : scope.int(hp.quniform('n_estimators', 10, 1000, 1)),
    'objective' : 'multi:softprob',
    'n_jobs' : 2,
    'gamma' : hp.uniform('gamma', 0, 0.5),
    'subsample' : hp.uniform('subsample', 0.3, 1),
    'colsample_bytree' : hp.uniform('colsample_bytree', 0.3, 1.0),
    'colsample_bylevel' : 1,
    'reg_alpha' : 0,                                    # L1 penalty
    'reg_lambda' : hp.uniform('reg_lambda', 0.1, 10),   # L2 penalty
    'tree_method' : 'gpu_exact',
}

def train_xgb_model(model_config):
    xgb_predictor = FeaturePredictor(fdset, 
                                     model_args=model_config, 
                                     feature_model_args=feat_model_args)
    test_acc = xgb_predictor.fit(fdset, verbose=False)
    y_pred = xgb_predictor.predict(fdset.X_test)
    print('F1 score:', f1_score(fdset.y_test, y_pred, average='macro'))
    return log_loss(fdset.y_test, xgb_predictor.predict_proba(fdset.X_test)) #, sample_weight=sample_weights)

In [4]:
best_params = hyperopt.fmin(
    fn=train_xgb_model,
    space=XGB_SEARCH_SPACE,
    algo=hyperopt.rand.suggest,
    max_evals=25
)

Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)


(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.41291559061090055


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.4354596195852577


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.43152606488387196


  if diff:


(1882, 32)
Training model for feature:  pseudoGR
(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.42579958828825704


  if diff:


(1882, 32)
Training model for feature:  pseudoGR
(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.4228132650098939


  if diff:


(1882, 32)
Training model for feature:  pseudoGR
(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.4365252943467586


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.42869033396119227


  if diff:


(1882, 32)
Training model for feature:  pseudoGR
(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.4307666414502741


  if diff:


(1882, 32)
Training model for feature:  pseudoGR
(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.4373439584841785


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.430785212514208


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.44664826668035784


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.41230597806482516


  if diff:


(1882, 32)
Training model for feature:  pseudoGR
(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.43094400590694937


  if diff:


(1882, 32)
Training model for feature:  pseudoGR
(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.4401722896581993


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.4216875077272633


  if diff:


(1882, 32)
Training model for feature:  pseudoGR
(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.43414910600255435


  if diff:


(1882, 32)
Training model for feature:  pseudoGR
(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.4463598486664395


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.45134787566219803


  if diff:


(1882, 32)
Training model for feature:  pseudoGR
(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.44276014103013417


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.43883672781549427


  if diff:


(1882, 32)
Training model for feature:  pseudoGR
(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.43376053150251137


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.4457172855879753


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.43403090167238156


  if diff:


(1882, 32)
Training model for feature:  pseudoGR
(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.4128389242737993


  if diff:


(1882, 32)
Training model for feature:  pseudoGR
(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.44262649516097724


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.42862008159715864


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.4300274998382356


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.43251660929349034


  if diff:


(1882, 32)
Training model for feature:  pseudoGR
(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.43381078313038324


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.4317783626607252


  if diff:


(1882, 32)
Training model for feature:  pseudoGR
(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.42959772694650533


  if diff:


(1882, 32)
Training model for feature:  pseudoGR
(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.42557759413831053


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.4182655860842683


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.4297423687889523


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.4253162553365072


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.424110124904009


  if diff:


(1882, 32)
Training model for feature:  pseudoGR
(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.4370781144207419


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.40323702265641953


  if diff:


(1882, 32)
Training model for feature:  pseudoGR
(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.44246357567840044


  if diff:


(1882, 32)
Training model for feature:  pseudoGR
(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.40919784280210814


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.42859973154666614


  if diff:


(1882, 32)
Training model for feature:  pseudoGR
(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.45284582154543196


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.4295791760053292


  if diff:


(1882, 32)
Training model for feature:  pseudoGR
(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.43179789388079265


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.43314571366229804


  if diff:


(1882, 32)
Training model for feature:  pseudoGR
(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.4360467593131333


  if diff:


(1882, 32)
Training model for feature:  pseudoGR
(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.4217962669167563


  if diff:


(1882, 32)
Training model for feature:  pseudoGR
(20714, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(1882, 32)
(1882, 32)
F1 score: 0.43429774815674005


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.4270099041946446


  if diff:


(1882, 32)
Training model for feature:  pseudoGR


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(20714, 32)
(1882, 32)
(1882, 32)
F1 score: 0.44941026888890295
(1882, 32)


  if diff:


In [5]:
best_params

{'colsample_bytree': 0.415924641666763,
 'gamma': 0.39858045346661647,
 'learning_rate': 0.08512983849851587,
 'max_depth': 3.0,
 'n_estimators': 885.0,
 'reg_lambda': 8.617016926113337,
 'subsample': 0.8962515177451702}

In [6]:
params = {**XGB_SEARCH_SPACE, **best_params, **{'max_depth': 4, 'n_estimators': 250}}
xgb_predictor = FeaturePredictor(fdset, model_args=params, feature_model_args=feat_model_args)
xgb_predictor.fit(fdset, verbose=True)

imps = list(zip(feat_names, xgb_predictor.model.feature_importances_))
imps.sort(key = lambda p: p[1])
[print(pair) for pair in imps[::-1]]

  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)


Training model for feature:  pseudoGR
(20714, 32)
(1882, 32)
                      precision    recall  f1-score   support

           sandstone       0.77      0.83      0.80       981
clay-prone sandstone       0.00      0.00      0.00       222
      sandy mudstone       0.31      0.43      0.36       263
            mudstone       0.63      0.62      0.62       416

         avg / total       0.58      0.63      0.60      1882

Total accuracy Score :  0.6296493092454836
Confusion Matrix: 
 [[815  10 111  45]
 [160   0  37  25]
 [ 59  10 112  82]
 [ 29  24 105 258]]
('var', 0.31923383)
('IF_1', 0.215004)
('IF_0', 0.17366321)
('median', 0.12801278)
('mean', 0.05698324)
('gmean', 0.055067837)
('hmean', 0.052035116)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


[None, None, None, None, None, None, None]

In [7]:
fdset_unlabeled = FaciesDataset(['204-20-1Z'], [], features=['pseudoGR'])
fdset_unlabeled.load_or_generate_data()

Loading well with dummy labels. DO NOT TRAIN ON THIS WELL!
Loading Well:  204-20-1Z


  pseudoGR = np.apply_along_axis(lambda x: np.nanmean(x[x.nonzero()]), 1, uimg)


Feature shapes:  [('depth', (2127,)), ('pseudoGR', (2127, 32))]


In [8]:
preds = xgb_predictor.predict(fdset_unlabeled.X)
preds.shape

(2127, 32)


  a.partition(kth, axis=axis, kind=kind, order=order)
  log_a = np.log(a)
  if diff:


(2127,)

In [9]:
fdset_unlabeled.wells[0].make_striplog(labels=preds, save_csv='./test_pred_picks.csv')

[ 428    0 2127]
[ 428    0 1553   86  193  295]
none
mudstone
sandy mudstone
sandstone
clay-prone sandstone
sandstone
clay-prone sandstone
sandstone
clay-prone sandstone
sandstone
clay-prone sandstone
sandstone
clay-prone sandstone
sandstone
clay-prone sandstone
sandstone
clay-prone sandstone
sandstone
clay-prone sandstone
sandstone
clay-prone sandstone
mudstone
sandstone
clay-prone sandstone
sandstone
mudstone
sandstone
sandy mudstone
mudstone
clay-prone sandstone
mudstone
sandy mudstone
sandstone
sandy mudstone
mudstone
sandy mudstone
mudstone
sandy mudstone
mudstone
sandy mudstone
sandstone
none
sandy mudstone
sandstone
sandy mudstone
mudstone
sandy mudstone
sandstone
sandy mudstone
mudstone
sandstone
sandy mudstone
mudstone
sandy mudstone
mudstone
sandy mudstone
mudstone
sandy mudstone
sandstone
sandy mudstone
sandstone
mudstone
clay-prone sandstone
mudstone
sandstone
clay-prone sandstone
none
mudstone
clay-prone sandstone
sandy mudstone
mudstone
sandy mudstone
sandstone
sandy mud

Striplog(298 Intervals, start=2673.0042966263527, stop=2685.995377749442)