# Testing XGB on well log datasets 

In [1]:
import numpy as np
import pandas as pd

from coremdlr.datasets import WellLoader, FaciesDataset
from coremdlr.models import FeaturePredictor, LambdaModel

import matplotlib.pyplot as plt
from coremdlr.viz import model_plots

Using TensorFlow backend.
This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

The backend was *originally* set to 'module://ipykernel.pylab.backend_inline' by the following code:
  File "/home/administrator/anaconda3/envs/core-dev/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/administrator/anaconda3/envs/core-dev/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/administrator/anaconda3/envs/core-dev/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/administrator/anaconda3/envs/core-dev/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/administrator/anaconda3/envs/core-dev/lib/python3.6/site-packages/ipyk

In [2]:
from coremdlr.datasets.utils import available_wells

wells = list(available_wells())
wells, len(wells)

(['204-20-3',
  '205-21b-3',
  '204-20a-7',
  '204-24a-7',
  '204-19-7',
  '204-20-6a',
  '204-20-2',
  '204-20-1',
  '204-24a-6',
  '204-19-3A',
  '204-19-6',
  '204-20-1Z'],
 12)

### Picking your training and testing wells

In [3]:
# May need to fiddle with these to make sure there is at least
# one example of 'oilstained' in the test wells, particularly an
# issue with higher resolution multipliers.
TRAIN_WELLS = wells[:10]
TEST_WELLS = wells[10:]

In [4]:
import hyperopt

from hyperopt import hp
from hyperopt.pyll.base import scope
from sklearn.metrics import f1_score, log_loss, accuracy_score
from sklearn.utils.class_weight import compute_sample_weight

fmodel_args = {
    'logs': {
        'model': 'LambdaModel',
        'model_args': {'feature': 'logs'}
    }
}

XGB_SEARCH_SPACE = {
    'model_type' : 'XGB',
    'max_depth' : scope.int(hp.quniform('max_depth', 3, 10, 1)),
    'learning_rate' : hp.uniform('learning_rate', 0.01, 0.2),
    'n_estimators' : scope.int(hp.quniform('n_estimators', 10, 1000, 1)),
    'objective' : 'multi:softprob',
    'n_jobs' : 2,
    'gamma' : hp.uniform('gamma', 0, 0.5),
    'subsample' : hp.uniform('subsample', 0.3, 1),
    'colsample_bytree' : hp.uniform('colsample_bytree', 0.3, 1.0),
    'colsample_bylevel' : 1,
    'reg_alpha' : 0,                                    # L1 penalty
    'reg_lambda' : hp.uniform('reg_lambda', 0.1, 10),   # L2 penalty
    'tree_method' : 'gpu_exact',
}

def train_xgb_model(model_config, fdset):
    """
    Train function given model_config dict of params, and FaciesDataset instance
    """
    xgb_predictor = FeaturePredictor(fdset, model_args=model_config, feature_model_args=fmodel_args)
    
    test_acc = xgb_predictor.fit(fdset, verbose=False)
    
    y_pred = xgb_predictor.predict(fdset.X_test)
    print('F1 score:', f1_score(fdset.y_test, y_pred, average='macro'))
    
    sample_weights = compute_sample_weight('balanced', fdset.y_test) 
    
    return log_loss(fdset.y_test, xgb_predictor.predict_proba(fdset.X_test), sample_weight=sample_weights)


def make_params_usable(params):
    """
    Convert params returned by hyperopt to usable `model_args`
    """
    return {
        **XGB_SEARCH_SPACE,
        **params,
        **{'max_depth' : int(params['max_depth']),
           'n_estimators' : int(params['n_estimators'])}
    }

In [5]:
EVALS_PER_SEARCH = 15
RESOLUTION_MULTIPLIERS = [2, 8, 16, 64, 256]

results = {'resolution_multiplier' : [], 'acc' : [], 'f1' : []}

for rmult in RESOLUTION_MULTIPLIERS:
    
    fdset = FaciesDataset(TRAIN_WELLS, test_wells=TEST_WELLS,
            features=["logs"],
            label_resolution=32*rmult, # 32 pixels ~ .5cm
            logs_args={'scaler_kind': 'standard'})
    
    fdset.load_or_generate_data()
    
    best_params = make_params_usable(
        hyperopt.fmin(
            fn=lambda p: train_xgb_model(p, fdset),
            space=XGB_SEARCH_SPACE,
            algo=hyperopt.rand.suggest,
            max_evals=EVALS_PER_SEARCH
        )
    )
    
    best_model = FeaturePredictor(fdset, model_args=best_params, feature_model_args=fmodel_args)
    
    print(f'Fitting best model for resolution multiplier: {rmult}')
    best_model.fit(fdset, verbose=True)
    
    y_pred = best_model.predict(fdset.X_test)
    
    fig, ax = plt.subplots(figsize=(10,10))
    model_plots.confusion_matrix_plot({'y_true' : fdset.y_test, 'y_pred' : y_pred}, 
                                       fdset.classes, title=f'Resolution {rmult*32}', ax=ax)
    plt.show()
    
    results['resolution_multiplier'].append(rmult)
    results['acc'].append(accuracy_score(fdset.y_test, y_pred))
    results['f1'].append(f1_score(fdset.y_test, y_pred, average='macro'))

Loading Well:  204-20-3  from  /home/administrator/Dropbox/core_data/facies/train_data


Header section Parameter regexp=~P was not found.


Feature shapes:  [('depth', (5460,)), ('top', (5460,)), ('base', (5460,)), ('logs', (5460, 11))]
Loading Well:  205-21b-3  from  /home/administrator/Dropbox/core_data/facies/train_data


Header section Parameter regexp=~P was not found.


Adding NaN log:  SP
Adding NaN log:  DTS
Feature shapes:  [('depth', (1920,)), ('top', (1920,)), ('base', (1920,)), ('logs', (1920, 11))]
Loading Well:  204-20a-7  from  /home/administrator/Dropbox/core_data/facies/train_data


Header section Parameter regexp=~P was not found.


Adding NaN log:  DTS1
Adding NaN log:  DTS2
Feature shapes:  [('depth', (1548,)), ('top', (1548,)), ('base', (1548,)), ('logs', (1548, 11))]
Loading Well:  204-24a-7  from  /home/administrator/Dropbox/core_data/facies/train_data


Header section Parameter regexp=~P was not found.


Adding NaN log:  SP
Adding NaN log:  DTS
Feature shapes:  [('depth', (9650,)), ('top', (9650,)), ('base', (9650,)), ('logs', (9650, 11))]
Loading Well:  204-19-7  from  /home/administrator/Dropbox/core_data/facies/train_data


Header section Parameter regexp=~P was not found.


Adding NaN log:  DTS
Feature shapes:  [('depth', (4905,)), ('top', (4905,)), ('base', (4905,)), ('logs', (4905, 11))]
Loading Well:  204-20-6a  from  /home/administrator/Dropbox/core_data/facies/train_data


Header section Parameter regexp=~P was not found.


Adding NaN log:  SP
Adding NaN log:  DTS1
Adding NaN log:  DTS2
Feature shapes:  [('depth', (1770,)), ('top', (1770,)), ('base', (1770,)), ('logs', (1770, 11))]
Loading Well:  204-20-2  from  /home/administrator/Dropbox/core_data/facies/train_data


Header section Parameter regexp=~P was not found.


Feature shapes:  [('depth', (1465,)), ('top', (1465,)), ('base', (1465,)), ('logs', (1465, 11))]
Loading Well:  204-20-1  from  /home/administrator/Dropbox/core_data/facies/train_data


Header section Parameter regexp=~P was not found.


Adding NaN log:  DTS1
Adding NaN log:  DTS2
Feature shapes:  [('depth', (5600,)), ('top', (5600,)), ('base', (5600,)), ('logs', (5600, 11))]
Loading Well:  204-24a-6  from  /home/administrator/Dropbox/core_data/facies/train_data


Header section Parameter regexp=~P was not found.


Adding NaN log:  SP
Adding NaN log:  DTS1
Adding NaN log:  DTS2
Feature shapes:  [('depth', (6500,)), ('top', (6500,)), ('base', (6500,)), ('logs', (6500, 11))]
Loading Well:  204-19-3A  from  /home/administrator/Dropbox/core_data/facies/train_data


Header section Parameter regexp=~P was not found.


Adding NaN log:  DTS
Feature shapes:  [('depth', (4793,)), ('top', (4793,)), ('base', (4793,)), ('logs', (4793, 11))]
Loading Well:  204-19-6  from  /home/administrator/Dropbox/core_data/facies/train_data


Header section Parameter regexp=~P was not found.


Adding NaN log:  DTS
Feature shapes:  [('depth', (944,)), ('top', (944,)), ('base', (944,)), ('logs', (944, 11))]
Loading Well:  204-20-1Z  from  /home/administrator/Dropbox/core_data/facies/train_data


Header section Parameter regexp=~P was not found.


Adding NaN log:  SP
Adding NaN log:  DTS1
Adding NaN log:  DTS2
Feature shapes:  [('depth', (957,)), ('top', (957,)), ('base', (957,)), ('logs', (957, 11))]
Training model for feature:  logs


  'recall', 'true', average, warn_for)


F1 score: 0.1950588428558214


ValueError: y_true and y_pred contain different number of classes 4, 5. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [0 1 2 4]

params = {
    **XGB_SEARCH_SPACE, 
    **best_params, 
    **{'max_depth' : int(best_params['max_depth']), 
       'n_estimators' : int(best_params['n_estimators'])}
}
xgb_predictor = FeaturePredictor(fdset, model_args=params, feature_model_args=fmodel_args)
xgb_predictor.fit(fdset, verbose=True)

imps = list(zip(fdset.wells[0].logs_args['which_logs'], xgb_predictor.model.feature_importances_))
imps.sort(key = lambda p: p[1])
for pair in imps[::-1]:
    print(pair)

import matplotlib.pyplot as plt
from coremdlr.viz import model_plots 

pred_dfs = []

for test_well_name in fdset.test_well_names:
    df = xgb_predictor.preds_dataframe(test_well_name, logs=fdset.logs_args['which_logs'])
    df['well'] = test_well_name
    
    fig, ax = plt.subplots(figsize=(10,10))
    model_plots.confusion_matrix_plot(df, fdset.classes, title=test_well_name, ax=ax)
    
    pred_dfs.append(df)
    
pred_df = pd.concat(pred_dfs)