In [2]:
import numpy as np
from collections import deque
from sklearn.metrics import confusion_matrix

from coremdlr.viz import model_plots, CorePlotter
from coremdlr.models import LambdaModel, FeaturePredictor
from coremdlr.datasets import FaciesDataset

import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import matplotlib.pyplot as plt
%matplotlib inline

## `LambdaModel` feature reduction

In [3]:
def reduce_function(x):
    feats = []
    
    feats.append(np.mean(x, axis=1))
    feats.append(np.median(x, axis=1))
    feats.append(np.var(x, axis=1))
    
    x_feats = np.concatenate(feats, axis=-1)
    
    return x_feats


feat_model_args = {
    # NOTE: key needs to be feature name AND feature must be specified in model_args
    'pseudoGR': {
        'model' : 'LambdaModel',
        'model_args' : {
            'feature' : 'pseudoGR',
            'lambda_fn' : reduce_function
        }
    }
}

## `FeaturePredictor` search space definition

In [4]:
import hyperopt
from hyperopt import hp
from hyperopt.pyll.base import scope
from sklearn.metrics import f1_score, log_loss
from sklearn.utils.class_weight import compute_sample_weight


XGB_SEARCH_SPACE = {
    'model_type' : 'XGB',
    'max_depth' : scope.int(hp.quniform('max_depth', 3, 10, 1)),
    'learning_rate' : hp.uniform('learning_rate', 0.01, 0.2),
    'n_estimators' : scope.int(hp.quniform('n_estimators', 10, 1000, 1)),
    'objective' : 'multi:softprob',
    'n_jobs' : 2,
    'gamma' : hp.uniform('gamma', 0, 0.5),
    'subsample' : hp.uniform('subsample', 0.3, 1),
    'colsample_bytree' : hp.uniform('colsample_bytree', 0.3, 1.0),
    'colsample_bylevel' : 1,
    'reg_alpha' : 0,                                    # L1 penalty
    'reg_lambda' : hp.uniform('reg_lambda', 0.1, 10),   # L2 penalty
    'tree_method' : 'gpu_exact',
}

In [5]:
def train_xgb_model(fdset, model_config):
    
    xgb_predictor = FeaturePredictor(fdset, 
                                     model_args=model_config, 
                                     feature_model_args=feat_model_args)
    
    test_acc = xgb_predictor.fit(fdset, verbose=False)
    
    y_proba = xgb_predictor.predict_proba(fdset.X_test)
    y_pred = np.argmax(y_proba, axis=-1)
    
    print('F1 score:', f1_score(fdset.y_test, y_pred, average='macro'))
    
    return log_loss(fdset.y_test, y_proba) #, sample_weight=sample_weights)


## Cross Testing

In [6]:
well_names = ["204-19-3A", "204-19-6", "204-19-7", "204-20-1","204-20-1Z","204-20-2","204-20-3", "205-21b-3", "204-20-6a", "204-24a-6","204-24a-7" ]
names_deq = deque(well_names)

results = {}

for i in range(len(names_deq)):
    
    # Set up dataset, rotate well_names
    fdset = FaciesDataset(list(names_deq)[:-1], test_wells=[names_deq[-1]],
                        features=["pseudoGR", "logs"],
                        pseudoGR_args={'features': ['mean', 'var', 'p10', 'p90'], 
                                                    'per_channel' : True},
                        label_resolution=32)
    
    names_deq.rotate()
    
    fdset.load_or_generate_data()
    
    best_params = hyperopt.fmin(
        fn=lambda x: train_xgb_model(fdset, x),
        space=XGB_SEARCH_SPACE,
        algo=hyperopt.rand.suggest,
        max_evals=25
    )
    
    params = {
        **XGB_SEARCH_SPACE, 
        **best_params, 
        **{'max_depth': int(best_params['max_depth']), 
           'n_estimators': int(best_params['n_estimators'])}
    }

    xgb_predictor = FeaturePredictor(fdset, model_args=params, feature_model_args=feat_model_args)
    
    xgb_predictor.fit(fdset, verbose=True)
    
    # Feature importances
    raw_feat_names = fdset.wells[0].pGR_feat_names
    feat_names = [f+'_mean' for f in raw_feat_names] + \
                 [f+'_median' for f in raw_feat_names] + \
                 [f+'_var' for f in raw_feat_names]
            
    imps = list(zip(feat_names, xgb_predictor.model.feature_importances_))
    imps.sort(key = lambda p: p[1])
    print('\nFeature Importances:')
    [print(pair) for pair in imps[::-1]]
    
    # Get test results
    test_well = fdset.test_well_names[0]
    
    results[test_well] = xgb_predictor.preds_dataframe(test_well, logs=['GR', 'RDEP', 'PEF', 'SP'])

Loading Well:  204-19-3A  from  /home/administrator/Dropbox/core_data/facies/train_data


  output_features.append(np.nanmean(img, axis=1))
  output_features.append(np.nanvar(img, axis=1))
  overwrite_input, interpolation)


Extracted pGR features:  ['Umean', 'Rmean', 'Gmean', 'Bmean', 'Uvar', 'Rvar', 'Gvar', 'Bvar', 'Up10', 'Rp10', 'Gp10', 'Bp10', 'Up90', 'Rp90', 'Gp90', 'Bp90']


Header section Parameter regexp=~P was not found.


Adding NaN log:  DTS
Feature shapes:  [('depth', (9590,)), ('top', (9590,)), ('base', (9590,)), ('pseudoGR', (9590, 32, 16)), ('logs', (9590, 11))]
Loading Well:  204-19-7  from  /home/administrator/Dropbox/core_data/facies/train_data
Extracted pGR features:  ['Umean', 'Rmean', 'Gmean', 'Bmean', 'Uvar', 'Rvar', 'Gvar', 'Bvar', 'Up10', 'Rp10', 'Gp10', 'Bp10', 'Up90', 'Rp90', 'Gp90', 'Bp90']


Header section Parameter regexp=~P was not found.


Adding NaN log:  DTS
Feature shapes:  [('depth', (9807,)), ('top', (9807,)), ('base', (9807,)), ('pseudoGR', (9807, 32, 16)), ('logs', (9807, 11))]
Loading Well:  204-20-1  from  /home/administrator/Dropbox/core_data/facies/train_data
Extracted pGR features:  ['Umean', 'Rmean', 'Gmean', 'Bmean', 'Uvar', 'Rvar', 'Gvar', 'Bvar', 'Up10', 'Rp10', 'Gp10', 'Bp10', 'Up90', 'Rp90', 'Gp90', 'Bp90']


Header section Parameter regexp=~P was not found.


Adding NaN log:  DTS1
Adding NaN log:  DTS2
Feature shapes:  [('depth', (11210,)), ('top', (11210,)), ('base', (11210,)), ('pseudoGR', (11210, 32, 16)), ('logs', (11210, 11))]
Loading Well:  205-21b-3  from  /home/administrator/Dropbox/core_data/facies/train_data
Extracted pGR features:  ['Umean', 'Rmean', 'Gmean', 'Bmean', 'Uvar', 'Rvar', 'Gvar', 'Bvar', 'Up10', 'Rp10', 'Gp10', 'Bp10', 'Up90', 'Rp90', 'Gp90', 'Bp90']


Header section Parameter regexp=~P was not found.


Adding NaN log:  SP
Adding NaN log:  DTS
Feature shapes:  [('depth', (3842,)), ('top', (3842,)), ('base', (3842,)), ('pseudoGR', (3842, 32, 16)), ('logs', (3842, 11))]
Loading Well:  204-20-6a  from  /home/administrator/Dropbox/core_data/facies/train_data
Extracted pGR features:  ['Umean', 'Rmean', 'Gmean', 'Bmean', 'Uvar', 'Rvar', 'Gvar', 'Bvar', 'Up10', 'Rp10', 'Gp10', 'Bp10', 'Up90', 'Rp90', 'Gp90', 'Bp90']


Header section Parameter regexp=~P was not found.


Adding NaN log:  SP
Adding NaN log:  DTS1
Adding NaN log:  DTS2
Feature shapes:  [('depth', (3542,)), ('top', (3542,)), ('base', (3542,)), ('pseudoGR', (3542, 32, 16)), ('logs', (3542, 11))]
Loading Well:  204-20-1Z  from  /home/administrator/Dropbox/core_data/facies/train_data
Extracted pGR features:  ['Umean', 'Rmean', 'Gmean', 'Bmean', 'Uvar', 'Rvar', 'Gvar', 'Bvar', 'Up10', 'Rp10', 'Gp10', 'Bp10', 'Up90', 'Rp90', 'Gp90', 'Bp90']


Header section Parameter regexp=~P was not found.


Adding NaN log:  SP
Adding NaN log:  DTS1
Adding NaN log:  DTS2
Feature shapes:  [('depth', (1917,)), ('top', (1917,)), ('base', (1917,)), ('pseudoGR', (1917, 32, 16)), ('logs', (1917, 11))]
Loading Well:  204-24a-6  from  /home/administrator/Dropbox/core_data/facies/train_data
Extracted pGR features:  ['Umean', 'Rmean', 'Gmean', 'Bmean', 'Uvar', 'Rvar', 'Gvar', 'Bvar', 'Up10', 'Rp10', 'Gp10', 'Bp10', 'Up90', 'Rp90', 'Gp90', 'Bp90']


Header section Parameter regexp=~P was not found.


Adding NaN log:  SP
Adding NaN log:  DTS1
Adding NaN log:  DTS2
Feature shapes:  [('depth', (13006,)), ('top', (13006,)), ('base', (13006,)), ('pseudoGR', (13006, 32, 16)), ('logs', (13006, 11))]
Loading Well:  204-19-6  from  /home/administrator/Dropbox/core_data/facies/train_data


  output_features.append(np.nanmean(img, axis=1))
  output_features.append(np.nanvar(img, axis=1))
  overwrite_input, interpolation)


Extracted pGR features:  ['Umean', 'Rmean', 'Gmean', 'Bmean', 'Uvar', 'Rvar', 'Gvar', 'Bvar', 'Up10', 'Rp10', 'Gp10', 'Bp10', 'Up90', 'Rp90', 'Gp90', 'Bp90']


Header section Parameter regexp=~P was not found.


Adding NaN log:  DTS
Feature shapes:  [('depth', (1884,)), ('top', (1884,)), ('base', (1884,)), ('pseudoGR', (1884, 32, 16)), ('logs', (1884, 11))]
Training model for feature:  pseudoGR


  'recall', 'true', average, warn_for)


F1 score: 0.40727815293188757


ValueError: y_true and y_pred contain different number of classes 4, 5. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [0 1 2 4]

In [None]:
import pandas as pd

dfs = []
for well_name, results_df in results.items():
    results_df['well_name'] = well_name
    dfs.append(results_df)
    
df = pd.concat(dfs, ignore_index=True)
df

In [None]:
df.to_csv('cross_test_xgb.csv', index=False)

In [None]:
axes = df.regression.hist(by=df.y_true, figsize=(15,15), alpha=0.4)
print(axes)

for i, ax in enumerate(axes.flat):
    df.regression[df.y_pred==i].hist(ax=ax, color='blue', alpha=0.4)

In [None]:
axes = df.y_true.hist(by=df.well_name, figsize=(15,15), alpha=0.4)

well_names = ["204-19-6", "204-20-1Z", "204-20-6a", "204-24a-6", "205-21b-3"]

for i, (ax, name) in enumerate(zip(axes.flat, well_names)):
    print(name)
    df.y_pred[df.well_name==name].hist(ax=ax, color='blue', alpha=0.4)