# Apply cross-validation

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.dummy import DummyClassifier

from geone.img import readImageGslib, readPointSetGslib
from geone.deesseinterface import DeesseClassifier
from geone.imgplot import drawImage2D
from mpstool.cv_metrics import brier_score, zero_one_score, balanced_linear_score, SkillScore

In [2]:
# Folder layout; every path is relative and ends with a '/' so that file
# names can be appended by plain string concatenation.
DATA_DIR = 'data_roussillon/'
SAMPLES_DIR = DATA_DIR  # samples are read from the same folder as the grids
OUTPUT_DIR = 'output/'

# Two-colour scheme: RGB triplets written on a 0-255 scale and rescaled to 0-1.
COLOR_SCHEME_BINARY = [
    [channel / 255 for channel in rgb]
    for rgb in (
        (166, 206, 227),
        (31, 120, 180),
    )
]

In [3]:
# Cross-validation splitter: 5 stratified folds, shuffled with a fixed
# random_state so the same splitter configuration is used on every run.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=20191201)

# Scorers passed to the grid searches, keyed by the name under which their
# results are reported:
#   'brier'       - brier_score used directly;
#   'skill_brier' - brier_score wrapped in a SkillScore built from a
#                   DummyClassifier(strategy='prior').
# NOTE(review): the meaning of the positional 0 passed to SkillScore is
# defined by mpstool and is not visible here - confirm against its docs.
prior_reference = DummyClassifier(strategy='prior')
scoring = dict(
    brier=brier_score,
    skill_brier=SkillScore(prior_reference, 0, brier_score),
)

### Training image selection

## Roussillon

In [4]:
# Read the four GSLIB images stored in DATA_DIR, in this order: the training
# image, the mask, the trend and the orientation image.
ti_true, mask, trend, im_angle = (
    readImageGslib(f'{DATA_DIR}{file_name}')
    for file_name in (
        'trueTI.gslib',
        'mask.gslib',
        'trend.gslib',
        'orientation.gslib',
    )
)

In [5]:
# The simulation grid geometry is copied from the mask image.
nx, ny, nz = mask.nx, mask.ny, mask.nz      # number of cells
sx, sy, sz = mask.sx, mask.sy, mask.sz      # cell unit
ox, oy, oz = mask.ox, mask.oy, mask.oz      # origin (corner of the "first" grid cell)

grid_geometry = dict(
    nx=nx, ny=ny, nz=nz,
    sx=sx, sy=sy, sz=sz,
    ox=ox, oy=oy, oz=oz,
)

rotation_settings = dict(
    rotationUsage=1,                            # use rotation without tolerance
    rotationAzimuthLocal=True,                  # rotation according to azimuth: local
    rotationAzimuth=im_angle.val[0,:,:,:],      # rotation azimuth: map of values
)

# Base estimator shared by the grid searches below.  Parameters that come in
# lists of two (distanceType, nneighboringNode, distanceThreshold,
# outputVarFlag) presumably hold one entry per variable named in `varname`
# ('Facies', then 'trend') - TODO confirm against the geone documentation.
deesse_roussillon = DeesseClassifier(
    varnames=['X','Y','Z','Facies'],
    **grid_geometry,
    nv=2,
    varname=['Facies', 'trend'],
    nTI=1,
    TI=ti_true,
    mask=mask.val,
    **rotation_settings,
    dataImage=trend,
    outputVarFlag=[True, False],
    distanceType=[0,1],
    nneighboringNode=[50,1],
    distanceThreshold=[0.05, 0.05],
    maxScanFraction=0.5,
    npostProcessingPathMax=1,
    seed=20191201,
    nrealization=40,
    nthreads=40,
)

In [6]:
# Grid search over maxScanFraction, nneighboringNode and distanceThreshold.
scan_fractions = [0.1, 0.2, 0.4, 0.8]
eps = 1e-5  # small offset added to the first entry of every distanceThreshold pair

# (first entry of nneighboringNode, base values for the first entry of
# distanceThreshold).  Each row becomes one sub-grid; the second entries stay
# fixed at 1 and 0.1 respectively.
neighbors_and_thresholds = (
    (8, [2/16, 4/16]),
    (16, [2/16, 3/16, 4/16]),
    (32, [1/16, 2/16, 3/16, 4/16]),
    (64, [1/32, 1/16, 2/16, 3/16, 4/16]),
)

roussillon_param_grid = [
    {
        'maxScanFraction': scan_fractions,
        'nneighboringNode': [[n_neighbors, 1]],
        'distanceThreshold': [[base + eps, 0.1] for base in base_thresholds],
    }
    for n_neighbors, base_thresholds in neighbors_and_thresholds
]

parameter_selector = GridSearchCV(
    deesse_roussillon,
    param_grid=roussillon_param_grid,
    scoring=scoring,
    n_jobs=1,
    cv=cv,
    refit=False,
    verbose=0,
    error_score='raise',
    return_train_score=False,
)

In [7]:
# Load the grid-search results from the CSV cache, or run the search and
# create the cache when the file does not exist yet.
#
# On a cache miss the freshly computed table is written to disk and then read
# back, so `results` is the CSV round-trip of cv_results_ on both paths.
# Without the read-back, a first run would expose list/dict objects in the
# parameter columns while every later run would expose their string form.
#
# NOTE(review): the cache is reused as-is and is never compared with the
# current param_grid.  The saved output of this notebook lists
# distanceThreshold values such as [0.12501, 0.12501], whereas the grid
# defined for parameter_selector uses 0.1 as second entry - delete the CSV
# to force a recomputation if the grid was changed.
RESULTS_CSV = 'df_roussillon.csv'
try:
    results = pd.read_csv(RESULTS_CSV, index_col=0)
except FileNotFoundError:
    # Observations: columns X, Y, Z are the features, Facies_real00000 the target.
    df = pd.DataFrame(readPointSetGslib(SAMPLES_DIR + 'roussillon_observations_600.gslib').to_dict())
    parameter_selector.fit(df[['X','Y','Z']], df['Facies_real00000'])
    pd.DataFrame(parameter_selector.cv_results_).to_csv(RESULTS_CSV)
    # Read back what was just written: same representation as a cache hit.
    results = pd.read_csv(RESULTS_CSV, index_col=0)

In [8]:
# Preview the first five rows of the cross-validation results table.
results.head(n=5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_distanceThreshold,param_maxScanFraction,param_nneighboringNode,params,split0_test_brier,split1_test_brier,...,std_test_brier,rank_test_brier,split0_test_skill_brier,split1_test_skill_brier,split2_test_skill_brier,split3_test_skill_brier,split4_test_skill_brier,mean_test_skill_brier,std_test_skill_brier,rank_test_skill_brier
0,0.000977,6e-05,133.192147,0.964267,"[0.12501, 0.12501]",0.1,"[8, 1]","{'distanceThreshold': [0.12501, 0.12501], 'max...",-0.374104,-0.407677,...,0.046722,38,0.211764,0.141026,0.338754,0.407659,0.37448,0.294737,0.101548,38
1,0.000979,7.1e-05,126.318612,0.643584,"[0.12501, 0.12501]",0.2,"[8, 1]","{'distanceThreshold': [0.12501, 0.12501], 'max...",-0.39101,-0.363833,...,0.044353,29,0.176143,0.233405,0.378831,0.38759,0.423203,0.319834,0.096823,30
2,0.00094,7e-05,121.645797,0.658246,"[0.12501, 0.12501]",0.4,"[8, 1]","{'distanceThreshold': [0.12501, 0.12501], 'max...",-0.389406,-0.351958,...,0.040956,27,0.179523,0.258425,0.359912,0.398224,0.415093,0.322235,0.089727,27
3,0.000943,7.4e-05,118.091318,1.159128,"[0.12501, 0.12501]",0.8,"[8, 1]","{'distanceThreshold': [0.12501, 0.12501], 'max...",-0.397854,-0.379156,...,0.036661,44,0.161723,0.201119,0.25138,0.349391,0.371991,0.267121,0.081828,44
4,0.000977,6e-05,98.699392,0.64077,"[0.25001, 0.25001]",0.1,"[8, 1]","{'distanceThreshold': [0.25001, 0.25001], 'max...",-0.348479,-0.370365,...,0.030353,28,0.265756,0.219643,0.354118,0.38443,0.38598,0.321985,0.067324,28


In [9]:
# Second grid search ("dsbc"): the distanceThreshold pair is fixed at
# [eps, 0.1]; only maxScanFraction and the first entry of nneighboringNode vary.
# Note: `scan_fractions` and `eps` are rebound here, replacing the values
# assigned for the first grid search.
scan_fractions = [
    0.001, 0.002, 0.004, 0.006, 0.008,
    0.01, 0.02, 0.04, 0.06, 0.08,
    0.1, 0.2, 0.4, 0.6,
]
eps = 1e-5

dsbc_param_grid = {
    'maxScanFraction': scan_fractions,
    'nneighboringNode': [[n_neighbors, 1] for n_neighbors in (64, 32, 16, 8)],
    'distanceThreshold': [[eps, 0.1]],
}

parameter_selector_dsbc = GridSearchCV(
    deesse_roussillon,
    param_grid=dsbc_param_grid,
    scoring=scoring,
    n_jobs=1,
    cv=cv,
    refit=False,
    verbose=0,
    error_score='raise',
    return_train_score=False,
)

In [None]:
# Load the "dsbc" grid-search results from the CSV cache, or run the search
# and create the cache when the file does not exist yet.
#
# On a cache miss the freshly computed table is written to disk and then read
# back, so `results_dsbc` is the CSV round-trip of cv_results_ on both paths.
# Without the read-back, a first run would expose list/dict objects in the
# parameter columns while every later run would expose their string form.
#
# NOTE(review): the cache is reused as-is and is never compared with the
# current param_grid - delete the CSV to force a recomputation if the grid
# was changed.
RESULTS_DSBC_CSV = 'df_dsbc_roussillon.csv'
try:
    results_dsbc = pd.read_csv(RESULTS_DSBC_CSV, index_col=0)
except FileNotFoundError:
    # Observations: columns X, Y, Z are the features, Facies_real00000 the target.
    df = pd.DataFrame(readPointSetGslib(SAMPLES_DIR + 'roussillon_observations_600.gslib').to_dict())
    parameter_selector_dsbc.fit(df[['X','Y','Z']], df['Facies_real00000'])
    pd.DataFrame(parameter_selector_dsbc.cv_results_).to_csv(RESULTS_DSBC_CSV)
    # Read back what was just written: same representation as a cache hit.
    results_dsbc = pd.read_csv(RESULTS_DSBC_CSV, index_col=0)