# Apply cross validation

In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.dummy import DummyClassifier

from geone.img import readImageGslib, readPointSetGslib
from geone.deesseinterface import DeesseClassifier
from geone.imgplot import drawImage2D
from mpstool.cv_metrics import brier_score, zero_one_score, balanced_linear_score, SkillScore

In [None]:
# Directory layout used throughout the notebook (relative paths, trailing slash
# included because file names are appended by plain string concatenation).
DATA_DIR = 'data/'        # images read with readImageGslib
SAMPLES_DIR = 'samples/'  # point sets read with readPointSetGslib
OUTPUT_DIR = 'output/'    # CSV exports of the cross-validation results

# Two-colour palette handed to drawImage2D as categCol; each RGB channel is
# rescaled from the 0-255 range to 0-1.
COLOR_SCHEME_BINARY = [
    [channel / 255 for channel in rgb]
    for rgb in ([166, 206, 227], [31, 120, 180])
]

In [None]:
# Cross-validation splitter shared by every grid search in this notebook:
# 5 stratified folds, shuffled with a fixed random_state.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=20191201)

# Named scorers passed to GridSearchCV (multi-metric scoring).
# Each skill score gets its own DummyClassifier(strategy='prior') as reference.
# NOTE(review): the second SkillScore argument (0 for Brier, 1 for zero-one) is
# presumably the optimal value of the wrapped score - confirm against mpstool.
scoring = dict(
    brier=brier_score,
    zero_one=zero_one_score,
    linear=balanced_linear_score,
    skill_brier=SkillScore(DummyClassifier(strategy='prior'), 0, brier_score),
    skill_zero_one=SkillScore(DummyClassifier(strategy='prior'), 1, zero_one_score),
)

## Training image selection

In [None]:
# Read the three candidate training images from DATA_DIR
ti_A, ti_B, ti_C = (
    readImageGslib(DATA_DIR + filename)
    for filename in ('A.gslib', 'B.gslib', 'C.gslib')
)

### Sensitivity analysis

In [None]:
# Read one observation point set per test case (A, B, C) from SAMPLES_DIR and
# convert each to a plain dict via to_dict()
observations_A, observations_B, observations_C = (
    readPointSetGslib(SAMPLES_DIR + filename).to_dict()
    for filename in ('sample_A_50.gslib', 'sample_B_50.gslib', 'sample_C_50.gslib')
)

In [None]:
# Wrap each observation dict in a pandas DataFrame; the frames are what the
# grid searches are fitted on further down. Show the first rows of case A.
df_A, df_B, df_C = map(pd.DataFrame, (observations_A, observations_B, observations_C))
df_A.head()

In [None]:
# DeeSse simulation wrapper; it is the estimator handed to the grid searches below
deesse_classifier = DeesseClassifier(
    varnames=['X', 'Y', 'Z', 'facies'],
    # --- simulation grid ---
    nx=100, ny=100, nz=1,        # number of cells along each axis
    sx=1.0, sy=1.0, sz=1.0,      # cell units (these are the default values)
    ox=0.0, oy=0.0, oz=0.0,      # grid origin (these are the default values)
    # --- simulated variable and training image ---
    nv=1,                        # number of variable(s)
    varname='facies',            # name of the variable(s)
    nTI=1,                       # number of TI(s)
    TI=ti_A,                     # training image (class dsi.Img)
    # --- pattern search ---
    distanceType=0,              # proportion of mismatching nodes (categorical variable, default)
    nneighboringNode=60,         # max. number of neighbors in a pattern
    distanceThreshold=0.1,       # acceptance threshold for the distance between patterns
    maxScanFraction=0.25,        # max. fraction of the TI scanned per simulated cell
    npostProcessingPathMax=1,    # number of post-processing path(s)
    # --- run control ---
    seed=20191201,               # seed of the random number generator
    nrealization=30,             # number of realization(s)
    nthreads=4,
)

In [None]:
%%time
# Run one simulation with the classifier's current parameters, keep the first
# entry of the 'sim' output and plot it as a categorical map
example_simulation = deesse_classifier.simulate()['sim'][0]
drawImage2D(
    example_simulation,
    categ=True,
    categCol=COLOR_SCHEME_BINARY,
)

In [None]:
# Sensitivity study with scikit-learn's grid search: every training image is
# combined with every number of realizations in 1, 4, ..., 49.
sensitivity_checker = GridSearchCV(
    estimator=deesse_classifier,
    param_grid={
        'TI': [ti_A, ti_B, ti_C],
        'nrealization': range(1, 50, 3),
    },
    scoring=scoring,
    n_jobs=8,
    cv=cv,
    refit=False,                 # only cv_results_ is used afterwards
    verbose=0,
    error_score='raise',         # a failing fit aborts the search instead of being scored
    return_train_score=False,
)

In [None]:
%%time
# Create the output directory up front: to_csv does not create missing
# directories, and failing there would only happen after a long grid search.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Run the sensitivity grid search on each observation set (A, B, C) and save
# the full cv_results_ table as <OUTPUT_DIR>sensitivity_<case>.csv.
for df, testcase in zip([df_A, df_B, df_C], ['A', 'B', 'C']):
    # Features are the point coordinates; the target is the 'code_real00000' column
    sensitivity_checker.fit(df[['X', 'Y', 'Z']], df['code_real00000'])
    results = pd.DataFrame(sensitivity_checker.cv_results_)
    results.to_csv(OUTPUT_DIR + 'sensitivity_{}.csv'.format(testcase))

### Training image selection

In [None]:
%%time
# Evaluate score for each observation set with the three TIs

# Create the output directory up front: to_csv does not create missing
# directories, and failing there would only happen after a long grid search.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# The search configuration does not depend on the observation file, so it is
# built once and re-fitted for every file.
ti_selector = GridSearchCV(
    deesse_classifier,
    param_grid={'TI': [ti_A, ti_B, ti_C]},
    scoring=scoring,
    n_jobs=3,
    cv=cv,
    refit=False,
    verbose=0,
    error_score='raise',
    return_train_score=False,
)

# os.listdir returns entries in arbitrary order; sort for a reproducible run order
observation_files = sorted(
    file for file in os.listdir(SAMPLES_DIR)
    if file.endswith(".gslib") and file.startswith("sample")
)
for observation_file in observation_files:
    observation = readPointSetGslib(SAMPLES_DIR + observation_file).to_dict()
    df = pd.DataFrame(observation)
    # Features are the point coordinates; the target is the 'code_real00000' column
    ti_selector.fit(df[['X', 'Y', 'Z']], df['code_real00000'])
    results = pd.DataFrame(ti_selector.cv_results_)
    # Output file is named after the part of the sample file name before the first '.'
    results.to_csv(OUTPUT_DIR + observation_file.split('.')[0] + '.csv')

## Roussillon

In [None]:
# Roussillon case inputs, all read from DATA_DIR: two candidate training images,
# the simulation mask, the trend image and the orientation image.
ti_true, ti_analog, mask, trend, im_angle = (
    readImageGslib(DATA_DIR + filename)
    for filename in (
        'trueTI.gslib',
        'analogTI.gslib',
        'mask.gslib',
        'trend.gslib',
        'orientation.gslib',
    )
)

In [None]:
# The simulation grid geometry is copied from the mask image, one axis per line:
# number of cells, cell unit, origin (corner of the "first" grid cell)
nx, sx, ox = mask.nx, mask.sx, mask.ox
ny, sy, oy = mask.ny, mask.sy, mask.oy
nz, sz, oz = mask.nz, mask.sz, mask.oz

# DeeSse wrapper for the Roussillon case, declared with two variables
# ('Facies' and 'trend').
# NOTE(review): the two-element lists below are presumably one entry per
# variable, in the order given by varname - confirm against the geone docs.
deesse_roussillon = DeesseClassifier(
    varnames=['X', 'Y', 'Z', 'Facies'],
    # --- simulation grid ---
    nx=nx, ny=ny, nz=nz,
    sx=sx, sy=sy, sz=sz,
    ox=ox, oy=oy, oz=oz,
    # --- variables and training image ---
    nv=2,
    varname=['Facies', 'trend'],
    nTI=1,
    TI=ti_true,
    mask=mask.val,
    # --- rotation ---
    rotationUsage=1,                           # use rotation without tolerance
    rotationAzimuthLocal=True,                 # rotation according to azimuth: local
    rotationAzimuth=im_angle.val[0, :, :, :],  # rotation azimuth: map of values
    # --- auxiliary data and output selection ---
    dataImage=trend,
    outputVarFlag=[True, False],
    # --- pattern search ---
    distanceType=[0, 1],
    nneighboringNode=[50, 1],
    distanceThreshold=[0.05, 0.05],
    maxScanFraction=0.5,
    npostProcessingPathMax=1,
    # --- run control ---
    seed=20191201,
    nrealization=30,
    nthreads=32,
)

In [None]:
# Grid search over the Roussillon simulation parameters:
# 2 TIs x 8 scan fractions x 4 neighbourhood sizes x 4 thresholds = 256 candidates.
# In the list-valued parameters only the first element is varied.
parameter_selector = GridSearchCV(
    estimator=deesse_roussillon,
    param_grid={
        'TI': [ti_true, ti_analog],
        'maxScanFraction': [0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.4, 0.8],
        'nneighboringNode': [[40, 1], [20, 1], [10, 1], [5, 1]],
        'distanceThreshold': [[0.01, 0.05], [0.05, 0.05], [0.1, 0.05], [0.2, 0.05]],
    },
    scoring=scoring,
    n_jobs=1,                    # single grid-search job
    cv=cv,
    refit=False,                 # only cv_results_ is used afterwards
    verbose=0,
    error_score='raise',         # a failing fit aborts the search instead of being scored
    return_train_score=False,
)

In [None]:
%%time
# Create the output directory up front: to_csv does not create missing
# directories, and failing there would only happen after a long grid search.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Collect the Roussillon observation files; os.listdir returns entries in
# arbitrary order, so sort for a reproducible run order.
roussillon_files = sorted(
    file for file in os.listdir(SAMPLES_DIR)
    if file.endswith(".gslib") and file.startswith("roussillon")
)
for observation_file in roussillon_files:
    df = pd.DataFrame(readPointSetGslib(SAMPLES_DIR + observation_file).to_dict())
    # Features are the point coordinates; the target is the 'Facies_real00000' column
    parameter_selector.fit(df[['X', 'Y', 'Z']], df['Facies_real00000'])
    results = pd.DataFrame(parameter_selector.cv_results_)
    # Output file is named after the part of the sample file name before the first '.'
    results.to_csv(OUTPUT_DIR + observation_file.split('.')[0] + '.csv')