# Testing XGBoost (XGB) on well log datasets

In [1]:
import numpy as np
from corebreakout.facies.datasets import WellLoader, FaciesDataset
from corebreakout.facies.models import FeaturePredictor, LambdaModel

### Picking your training and testing wells

In [2]:
# Wells whose data is used for fitting vs. the well held out for evaluation.
train_wells = ["205-21b-3", "204-19-6", "204-24a-6"]
test_wells = ["204-20-6a"]

# Only the "logs" feature is requested for this experiment.
fdset = FaciesDataset(train_wells, test_wells=test_wells, features=["logs"])

In [3]:
# Populate the dataset for every train/test well listed above.
# The recorded output shows each well being loaded, missing logs being filled
# with NaN, and the resulting per-well 'depth' / 'logs' feature shapes.
fdset.load_or_generate_data()

Loading Well:  205-21b-3


Header section Parameter regexp=~P was not found.


Adding NaN log:  SP
Adding NaN log:  DTS


  log_vectors -= np.nanmin(log_vectors, axis=0)
  log_vectors /= np.nanmax(log_vectors, axis=0)


Feature shapes:  [('depth', (3842,)), ('logs', (3842, 10))]
Loading Well:  204-19-6


Header section Parameter regexp=~P was not found.


Adding NaN log:  DTS
Feature shapes:  [('depth', (1885,)), ('logs', (1885, 10))]
Loading Well:  204-24a-6


Header section Parameter regexp=~P was not found.


Adding NaN log:  SP
Adding NaN log:  DTS1
Adding NaN log:  DTS2
Feature shapes:  [('depth', (12994,)), ('logs', (12994, 10))]
Loading Well:  204-20-6a


Header section Parameter regexp=~P was not found.


Adding NaN log:  SP
Adding NaN log:  DTS1
Adding NaN log:  DTS2
Feature shapes:  [('depth', (3873,)), ('logs', (3873, 10))]


In [4]:
import hyperopt
from hyperopt import hp
from hyperopt.pyll.base import scope
from sklearn.metrics import f1_score, log_loss
from sklearn.utils.class_weight import compute_sample_weight

# Per-sample weights for the test labels, using sklearn's 'balanced' scheme.
# They are passed to log_loss in train_xgb_model so the loss is class-balanced.
sample_weights = compute_sample_weight(class_weight='balanced', y=fdset.y_test)

# Per-feature model configuration handed to FeaturePredictor, keyed by feature name.
# The single 'logs' feature uses a 'LambdaModel' with no extra arguments.
logs_model_spec = {'model': 'LambdaModel', 'model_args': {}}
fmodel_args = {'logs': logs_model_spec}

# Hyperopt search space for the top-level model configuration.
# Entries built from `hp.*` expressions are sampled on every trial; plain values
# are held fixed. Each sampled dict is passed as `model_args` to FeaturePredictor
# by train_xgb_model. The hp labels deliberately match the dict keys, so the
# assignment returned by fmin can be mapped straight back onto this space.
XGB_SEARCH_SPACE = {
    'model_type' : 'XGB',
    # quniform samples floats on a step-1 grid; scope.int casts them to integers.
    'max_depth' : scope.int(hp.quniform('max_depth', 3, 10, 1)),
    'learning_rate' : hp.uniform('learning_rate', 0.01, 0.2),
    'n_estimators' : scope.int(hp.quniform('n_estimators', 10, 1000, 1)),
    'objective' : 'multi:softprob',
    'n_jobs' : 2,
    'gamma' : hp.uniform('gamma', 0, 0.5),
    'subsample' : hp.uniform('subsample', 0.3, 1),
    'colsample_bytree' : hp.uniform('colsample_bytree', 0.3, 1.0),
    'colsample_bylevel' : 1,
    'reg_alpha' : 0,                                    # L1 penalty
    'reg_lambda' : hp.uniform('reg_lambda', 0.1, 10),   # L2 penalty
    # NOTE(review): 'gpu_exact' is believed to have been deprecated in XGBoost 0.90
    # and removed in 1.0 (newer releases use 'gpu_hist' or device='cuda') --
    # TODO confirm against the installed XGBoost version and available hardware.
    'tree_method' : 'gpu_exact',
}

def train_xgb_model(model_config):
    """Hyperopt objective: fit a FeaturePredictor and score it on the test split.

    Parameters
    ----------
    model_config : dict
        One concrete sample drawn from the search space. It is passed unchanged
        as ``model_args`` to ``FeaturePredictor``.

    Returns
    -------
    float
        Sample-weighted log loss of the predicted class probabilities on
        ``fdset.X_test`` / ``fdset.y_test``. This is the value ``hyperopt.fmin``
        minimizes.

    Notes
    -----
    Reads the module-level ``fdset``, ``fmodel_args`` and ``sample_weights``.
    Prints the macro-averaged F1 score for the trial as a side effect.

    NOTE(review): the loss is computed on the test wells, so hyperparameters are
    tuned against the same data later used to report the final score.
    """
    xgb_predictor = FeaturePredictor(fdset, model_args=model_config, feature_model_args=fmodel_args)

    # The return value of fit() is not needed for the objective, so it is not bound.
    xgb_predictor.fit(fdset, verbose=False)

    # Hard predictions are used only for the progress report printed per trial.
    y_pred = xgb_predictor.predict(fdset.X_test)
    print('F1 score:', f1_score(fdset.y_test, y_pred, average='macro'))

    # Class probabilities drive the actual objective value.
    return log_loss(fdset.y_test, xgb_predictor.predict_proba(fdset.X_test), sample_weight=sample_weights)

In [5]:
# Random search over XGB_SEARCH_SPACE; fmin minimizes the value returned by
# train_xgb_model and returns the best assignment it found.
# NOTE(review): the recorded output of this cell is `KeyError: 'model_args'`.
# The failing lookup is not visible in this notebook -- confirm the config
# structure that FeaturePredictor expects before relying on the cells below.
# NOTE(review): no random state is passed, so the sampled configurations are
# presumably not reproducible between runs.
MAX_SEARCH_EVALS = 25

best_params = hyperopt.fmin(
    train_xgb_model,
    XGB_SEARCH_SPACE,
    algo=hyperopt.rand.suggest,
    max_evals=MAX_SEARCH_EVALS,
)

KeyError: 'model_args'

In [None]:
# Display the best assignment returned by hyperopt.fmin in the previous cell.
best_params

In [None]:
# Turn the search result into a complete, correctly typed model config.
# fmin returns raw sampled values keyed by hp label (quniform draws come back
# as floats). space_eval substitutes them into the search space, so the fixed
# entries are kept and the scope.int casts are applied.
# This uses the tuned values themselves rather than hard-coded overrides; to
# pin specific values instead, update `params` explicitly after this line.
params = hyperopt.space_eval(XGB_SEARCH_SPACE, best_params)

# Refit a predictor with the selected configuration, this time with verbose output.
xgb_predictor = FeaturePredictor(fdset, model_args=params, feature_model_args=fmodel_args)
xgb_predictor.fit(fdset, verbose=True)

# Pair each feature name with its importance from the fitted model and print
# the pairs from most to least important.
# NOTE(review): `feat_names` is not defined in any cell visible in this notebook.
# It must be defined (presumably one name per model input column) before this
# cell can run on a fresh kernel -- TODO confirm where it should come from.
imps = list(zip(feat_names, xgb_predictor.model.feature_importances_))
imps.sort(key=lambda p: p[1])  # `imps` itself stays in ascending order

# A plain loop replaces the list comprehension of print() calls, which built a
# throwaway list of None values and displayed it as the cell output.
for pair in reversed(imps):
    print(pair)

In [None]:
# Macro-averaged F1 of the refit predictor on the test split.
y_test_pred = xgb_predictor.predict(fdset.X_test)
f1_score(fdset.y_test, y_test_pred, average='macro')