# Analysis of cross-validation results

In [1]:
import ast
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold

from mpstool.cv_metrics import brier_score, zero_one_score, balanced_linear_score

from geone.img import readImageGslib, readPointSetGslib
from geone.img import Img
from geone.imgplot import drawImage2D
from geone.deesseinterface import DeesseEstimator

In [2]:
# Directory the per-dataset cross-validation tables (roussillon_observations_<n>.csv) are read from.
OUTPUT_DIR = 'output/'
# Directory holding the Roussillon grid images (training images, mask, trend, orientation).
DATA_DIR = 'data_roussillon/'
# Observation point sets (.gslib) are read from the same directory as the grid images.
SAMPLES_DIR = DATA_DIR

## Training image selection

This benchmark case addresses training image selection among three candidate training images.
First, we check the sensitivity of the scores to the number of realisations used to estimate the probabilities.
Second, we perform training image selection for different data sets.

## Parameter selection (Roussillon)

Three data sets with 50, 150 and 600 points, respectively. The corresponding files in the output directory are named roussillon_observations_50.csv, etc.

In [3]:
# Load the cross-validation results table for the Roussillon parameter search;
# the first CSV column is used as the row index.
df_roussillon = pd.read_csv('df_roussillon.csv', index_col=0)
# Preview the first rows only.
df_roussillon.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_distanceThreshold,param_maxScanFraction,param_nneighboringNode,params,split0_test_brier,split1_test_brier,...,std_test_brier,rank_test_brier,split0_test_skill_brier,split1_test_skill_brier,split2_test_skill_brier,split3_test_skill_brier,split4_test_skill_brier,mean_test_skill_brier,std_test_skill_brier,rank_test_skill_brier
0,0.000993,2.4e-05,129.224922,0.462563,"[0.12501, 0.1]",0.1,"[8, 1]","{'distanceThreshold': [0.12501, 0.1], 'maxScan...",-0.396292,-0.453594,...,0.049748,21,0.165015,0.04428,0.173004,0.330193,0.329941,0.208487,0.109249,21
1,0.000976,5.6e-05,126.292234,4.236635,"[0.12501, 0.1]",0.2,"[8, 1]","{'distanceThreshold': [0.12501, 0.1], 'maxScan...",-0.411146,-0.450948,...,0.047465,22,0.133717,0.049855,0.209306,0.321237,0.313057,0.205434,0.104265,22
2,0.001001,1.3e-05,129.49873,0.648454,"[0.12501, 0.1]",0.4,"[8, 1]","{'distanceThreshold': [0.12501, 0.1], 'maxScan...",-0.396156,-0.44526,...,0.043132,24,0.1653,0.061838,0.149871,0.30801,0.305741,0.198152,0.095536,24
3,0.000901,4.6e-05,124.915347,1.118605,"[0.12501, 0.1]",0.8,"[8, 1]","{'distanceThreshold': [0.12501, 0.1], 'maxScan...",-0.407844,-0.467865,...,0.045956,28,0.140675,0.014211,0.097569,0.275324,0.273259,0.160208,0.101639,28
4,0.000933,7e-05,108.69839,0.544352,"[0.25001, 0.1]",0.1,"[8, 1]","{'distanceThreshold': [0.25001, 0.1], 'maxScan...",-0.487073,-0.603667,...,0.050528,41,-0.026261,-0.271923,-0.182244,-0.04413,0.034688,-0.097974,0.112251,41


In [4]:
# Rank 1 is the best mean Brier score, so an ascending sort on the rank puts
# the best parameter sets first.
df_sorted = df_roussillon.sort_values('rank_test_brier', ascending=True)
# Show the ten best-ranked parameter sets. head(-10) would instead return every
# row except the ten worst, which does not match the ten-row table recorded below.
df_sorted[['mean_score_time', 'param_distanceThreshold', 'param_maxScanFraction', 'param_nneighboringNode', 'mean_test_brier', 'std_test_brier']].head(10)

Unnamed: 0,mean_score_time,param_distanceThreshold,param_maxScanFraction,param_nneighboringNode,mean_test_brier,std_test_brier
20,182.169288,"[0.06251, 0.1]",0.1,"[32, 1]",-0.312848,0.046924
21,178.729551,"[0.06251, 0.1]",0.2,"[32, 1]",-0.322479,0.032288
23,294.807481,"[0.06251, 0.1]",0.8,"[32, 1]",-0.324254,0.034422
36,191.765087,"[0.03126, 0.1]",0.1,"[64, 1]",-0.329104,0.034577
22,172.108105,"[0.06251, 0.1]",0.4,"[32, 1]",-0.32935,0.033295
37,179.712859,"[0.03126, 0.1]",0.2,"[64, 1]",-0.331606,0.032176
39,379.288599,"[0.03126, 0.1]",0.8,"[64, 1]",-0.334727,0.028549
38,303.787415,"[0.03126, 0.1]",0.4,"[64, 1]",-0.335873,0.028401
40,184.619265,"[0.06251, 0.1]",0.1,"[64, 1]",-0.342706,0.036357
41,175.666733,"[0.06251, 0.1]",0.2,"[64, 1]",-0.345396,0.033509


In [5]:
# Total scoring time in hours: the mean score time per split is summed over all
# parameter sets, multiplied by 5 (presumably the number of CV splits, cf. the
# split0..split4 columns) and divided by 3600 (assumes times are in seconds).
df_roussillon.mean_score_time.sum()*5/3600

12.160736646519767

In [6]:
# Load the results table stored in df_dsbc_roussillon.csv (first CSV column as index).
# NOTE(review): the meaning of "dsbc" is not stated in this notebook - confirm and document.
df_dsbc_roussillon = pd.read_csv('df_dsbc_roussillon.csv', index_col=0)

In [7]:
# Total scoring time in hours: the mean score time per split is summed over all
# parameter sets, multiplied by 5 (presumably the number of CV splits) and
# divided by 3600 (assumes times are in seconds).
df_dsbc_roussillon.mean_score_time.sum()*5/3600

13.856117417414984

In [9]:
# Rank 1 is the best mean Brier score, so an ascending sort on the rank puts
# the best parameter sets first.
df_sorted = df_dsbc_roussillon.sort_values('rank_test_brier', ascending=True)
# Show the ten best-ranked parameter sets. head(-10) would instead return every
# row except the ten worst, which does not match the ten-row table recorded below.
df_sorted[['mean_score_time', 'param_distanceThreshold', 'param_maxScanFraction', 'param_nneighboringNode', 'mean_test_brier', 'std_test_brier']].head(10)

Unnamed: 0,mean_score_time,param_distanceThreshold,param_maxScanFraction,param_nneighboringNode,mean_test_brier,std_test_brier
26,170.815525,"[1e-05, 0.1]",0.02,"[16, 1]",-0.28784,0.036715
34,170.174281,"[1e-05, 0.1]",0.06,"[16, 1]",-0.292675,0.033997
30,169.88036,"[1e-05, 0.1]",0.04,"[16, 1]",-0.294056,0.028436
46,165.139261,"[1e-05, 0.1]",0.2,"[16, 1]",-0.29486,0.025568
38,167.183764,"[1e-05, 0.1]",0.08,"[16, 1]",-0.294935,0.029452
22,170.241511,"[1e-05, 0.1]",0.01,"[16, 1]",-0.297571,0.038998
37,181.118044,"[1e-05, 0.1]",0.08,"[32, 1]",-0.297837,0.031636
29,181.388542,"[1e-05, 0.1]",0.04,"[32, 1]",-0.298404,0.031788
42,168.204787,"[1e-05, 0.1]",0.1,"[16, 1]",-0.298802,0.034168
33,180.554313,"[1e-05, 0.1]",0.06,"[32, 1]",-0.300129,0.039439


In [None]:
def best_results_for_each_TI(nsamples, score, score_name):
    """Return the best-scoring parameter set for each Roussillon training image.

    Reads ``roussillon_observations_<nsamples>.csv`` from ``OUTPUT_DIR``, adds a
    ``ref_score`` column holding the reference score of the matching observation
    point set, and keeps, for the true TI and for the analog TI, the single row
    with the highest ``mean_test_<score_name>``.

    Parameters
    ----------
    nsamples : int
        Number of observation points; selects the ``.csv`` and ``.gslib`` files.
    score : callable
        Scoring function forwarded to ``reference_score``.
    score_name : str
        Suffix of the ``mean_test_*`` column used for ranking (e.g. ``'brier'``).

    Returns
    -------
    pandas.DataFrame
        At most two rows (true TI first, analog TI second) with a fresh
        0-based index.

    Notes
    -----
    ``reference_score`` is not defined in the visible part of this notebook;
    it has to exist in the kernel before this function is called.
    """
    df = pd.read_csv(OUTPUT_DIR+'roussillon_observations_{}.csv'.format(nsamples))
    ref = reference_score(observation_filename=SAMPLES_DIR+'roussillon_observations_{}.gslib'.format(nsamples),
                          score=score, varname='Facies_real00000')
    df['ref_score'] = ref

    ranking_column = 'mean_test_'+score_name
    # Higher score is better, hence the descending sort before taking the top row.
    best_rows = [
        df[df['param_TI'] == ti_path].sort_values(ranking_column, ascending=False).head(1)
        for ti_path in ('data/trueTI.gslib', 'data/analogTI.gslib')
    ]
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    return pd.concat(best_rows, ignore_index=True)

In [None]:
# Columns kept in the summary table of best parameter sets.
info = ['param_TI',
        'param_distanceThreshold',
        'param_maxScanFraction',
        'param_nneighboringNode',
        'mean_test_score',
        'score_method',
        'nsamples',
        'ref_score'
       ]

# (scoring function, suffix of the matching ``mean_test_*`` column)
score_methods = [(brier_score, 'brier'), (zero_one_score, 'zero_one'), (balanced_linear_score, 'linear')]

# Collect one frame per (nsamples, score) pair and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copies the accumulated frame on every iteration.
best_frames = []
for nsamples in [50, 150, 600]:
    for score_function, score_name in score_methods:
        df = best_results_for_each_TI(nsamples, score_function, score_name)
        # Store the selected score under a common column name so rows can be stacked.
        df['mean_test_score'] = df['mean_test_{}'.format(score_name)]
        df['score_method'] = score_name
        df['nsamples'] = nsamples
        best_frames.append(df[info])
df_best_roussillon = pd.concat(best_frames, ignore_index=True)
df_best_roussillon

In [None]:
# Short column headers used in the publication table, keyed by the
# cross-validation column they replace.
new_columns = dict(
    param_TI="TI",
    param_distanceThreshold="t",
    param_maxScanFraction="f",
    param_nneighboringNode="n",
    mean_test_score="score",
    ref_score="reference",
    score_method="function",
    nsamples="wells",
)

# rename returns a new frame; df_best_roussillon itself is left untouched.
df_renamed = df_best_roussillon.rename(columns=new_columns)
df_renamed.head()

In [None]:
def transform_entries(df_original):
    """Return a copy of the renamed results table formatted for publication.

    Expects the columns ``TI``, ``t``, ``n``, ``score``, ``reference`` and
    ``function``. The input frame is not modified.

    Transformations applied:

    * ``TI``: ``'data/trueTI.gslib'`` becomes ``'true'`` (text between the first
      ``/`` and the first ``TI``).
    * ``t`` and ``n``: string-encoded lists such as ``'[8, 1]'`` are parsed and
      reduced to their first element.
    * ``score`` and ``reference``: rounded to two decimals.
    * ``function``: underscores replaced by hyphens.

    Raises
    ------
    ValueError or SyntaxError
        If a ``t`` or ``n`` entry is not a plain Python literal.
    """
    df = df_original.copy()
    df['TI'] = df['TI'].apply(lambda x: x.split('/')[1].split('TI')[0])
    # literal_eval only accepts literals, so text read from the CSV can never
    # be executed as code (which eval would do).
    df['t'] = df['t'].apply(lambda x: ast.literal_eval(x)[0])
    df['n'] = df['n'].apply(lambda x: ast.literal_eval(x)[0])
    df['score'] = df['score'].round(2)
    df['reference'] = df['reference'].round(2)
    df['function'] = df['function'].apply(lambda x: x.replace('_', '-'))
    return df

df_publication = transform_entries(df_renamed)
df_publication.head()

In [None]:
# Write the publication table, with columns in presentation order, to a LaTeX file.
# NOTE(review): when given a path, DataFrame.to_latex writes the file and returns
# None, so df_latex holds None here rather than the LaTeX source.
df_latex = df_publication[['wells', 'function', 'TI', 'score', 'reference', 't', 'f', 'n']].to_latex('tables/table-roussillon.tex', index=False)


# Figures for publication

In [None]:
# Three-colour palette (RGB scaled from 0-255 to 0-1); the plotting functions
# below use one entry per training image curve.
COLOR_SCHEME = [
        [x/255 for x in [166,206,227]],
        [x/255 for x in [31,120,180]],
        [x/255 for x in [51,160,44]],
      ]
FONTSIZE = 16        # font size shared by titles, axis labels, ticks and legends
FIG_DIR = 'figures/' # output directory for the saved figures
DPI=600              # resolution passed to savefig

import matplotlib

# This rcParam expects a string: 'none' switches image interpolation off.
# Assigning the Python object None is rejected with a ValueError by current
# Matplotlib releases (older ones coerced it to the string 'None').
matplotlib.rcParams["image.interpolation"] = 'none'
# Embed fonts as Type 42 (TrueType) in PDF output.
matplotlib.rcParams['pdf.fonttype'] = 42

In [None]:
def sensitivity_plot(test_case, title, ylabel, scoring, filename, score, ylim=[-1.0, 0.0], loc='lower right'):
    fig = plt.figure(figsize=(5,5))
    ax = plt.axes()
    df = sensitivity_results[test_case]
    colors=COLOR_SCHEME
    labels=['TI: A', 'TI: B', 'TI: C', 'ref.']
    markers=['o', 'x', '*']

    for i, ti in enumerate(['A', 'B', 'C']):
        df_plot = df[df['param_TI'] == 'data/{}.gslib'.format(ti)]
        ax.plot(df_plot['param_nrealization'], df_plot[scoring], label=labels[i], linestyle='--', color=colors[i], marker=markers[i])
    # reference line
    x = np.linspace(0,50)
    ax.plot(x, np.ones(len(x))*reference_score(SAMPLES_DIR+'sample_{}_50.gslib'.format(test_case), score), linestyle='--', color='black', label=labels[-1])
    
    ax.legend(loc=loc, fontsize=FONTSIZE, ncol=2)
    ax.set_xlabel("#realizations", fontsize=FONTSIZE)
    ax.set_ylim(ylim)
    ax.set_ylabel(" ", fontsize=FONTSIZE)  
    ax.set_title("{}, {}".format(ylabel,title), fontsize=FONTSIZE)
    ax.tick_params(axis='both', which='major', labelsize=FONTSIZE)
    plt.savefig(filename, dpi=DPI, bbox_inches='tight')
    #!convert -trim $filename $filename
    !pdfcrop $filename $filename

In [None]:
sensitivity_plot('A', 'a)','mean quadratic score', 'mean_test_brier', FIG_DIR+'sensitivity_A_brier.pdf', brier_score)

In [None]:
sensitivity_plot('B', 'b)','mean quadratic score', 'mean_test_brier', FIG_DIR+'sensitivity_B_brier.pdf', brier_score)

In [None]:
sensitivity_plot('C', 'c)','mean quadratic score', 'mean_test_brier', FIG_DIR+'sensitivity_C_brier.pdf', brier_score)

In [None]:
sensitivity_plot('A', 'a)','mean zero-one score', 'mean_test_zero_one', FIG_DIR+'sensitivity_A_zero_one.pdf', score=zero_one_score, ylim=[0.5,1], loc=None)

In [None]:
sensitivity_plot('B', 'b)','mean zero-one score', 'mean_test_zero_one', FIG_DIR+'sensitivity_B_zero_one.pdf', score=zero_one_score, ylim=[0.5,1], loc='upper right')

In [None]:
sensitivity_plot('C', 'c)','mean zero-one score', 'mean_test_zero_one', FIG_DIR+'sensitivity_C_zero_one.pdf', score=zero_one_score, ylim=[0.5,1], loc=(0.2,0.6))

In [None]:
sensitivity_plot('A', 'a)','mean balanced linear score', 'mean_test_linear', FIG_DIR+'sensitivity_A_linear.pdf', score=balanced_linear_score, ylim=[0.3,1], loc="upper right")

In [None]:
sensitivity_plot('B', 'b)','mean balanced linear score', 'mean_test_linear', FIG_DIR+'sensitivity_B_linear.pdf', score=balanced_linear_score, ylim=[0.3,1])

In [None]:
sensitivity_plot('C', 'c)','mean balanced linear score', 'mean_test_linear', FIG_DIR+'sensitivity_C_linear.pdf', score=balanced_linear_score, ylim=[0.3,1])

## TI selection

In [None]:
def ti_selection_plot(test_case, title, ylabel, scoring, filename, score_ref, ylim=[-1,0], loc="lower right"):
    fig = plt.figure(figsize=(5,5))
    ax = plt.axes()
    df = df_ti_selection[df_ti_selection.type==test_case].sort_values(by = 'nsamples')
    colors=COLOR_SCHEME
    labels=['TI: A', 'TI: B', 'TI: C', 'ref']
    markers=['o', 'x', '*']
    for i, ti in enumerate(['A', 'B', 'C']):
        df_plot = df[df['param_TI'] == 'data/{}.gslib'.format(ti)]
        ax.semilogx(df_plot['nsamples'], df_plot[scoring], label=labels[i], linestyle='--', color=colors[i], marker=markers[i])
    ax.semilogx(df_plot['nsamples'], df_plot[score_ref], label=labels[-1], linestyle='--', color='black')
    #ax.legend(loc=loc, fontsize=FONTSIZE, ncol=2)
    ax.set_xlabel("#samples", fontsize=FONTSIZE)
    ax.set_ylim(ylim)
    ax.set_title("{}, {}".format(ylabel,title), fontsize=FONTSIZE)
    ax.tick_params(axis='both', which='major', labelsize=FONTSIZE)
    plt.savefig(filename, dpi=DPI, bbox_inches='tight')
    #!convert -trim $filename $filename
    !pdfcrop $filename $filename

In [None]:
ti_selection_plot('A', 'a)','mean quadratic score', 'mean_test_brier', FIG_DIR+'ti_selection_A_brier.pdf', 'ref_brier')

In [None]:
ti_selection_plot('B', 'b)','mean quadratic score', 'mean_test_brier', FIG_DIR+'ti_selection_B_brier.pdf', score_ref='ref_brier')

In [None]:
ti_selection_plot('C', 'c)','mean quadratic score', 'mean_test_brier', FIG_DIR+'ti_selection_C_brier.pdf', score_ref='ref_brier')

In [None]:
ti_selection_plot('A', 'a)','mean zero-one score', 'mean_test_zero_one', FIG_DIR+'ti_selection_A_zero_one.pdf', ylim=[0.5,1], score_ref='ref_zero_one', loc="upper left")

In [None]:
ti_selection_plot('B', 'b)','mean zero-one score', 'mean_test_zero_one', FIG_DIR+'ti_selection_B_zero_one.pdf', ylim=[0.5,1], score_ref='ref_zero_one', loc="upper left")

In [None]:
ti_selection_plot('C', 'c)','mean zero-one score', 'mean_test_zero_one', FIG_DIR+'ti_selection_C_zero_one.pdf', ylim=[0.5,1], score_ref='ref_zero_one')

In [None]:
ti_selection_plot('A', 'a)','mean balanced linear score', 'mean_test_linear', FIG_DIR+'ti_selection_A_linear.pdf', ylim=[0.3,1], score_ref='ref_linear')

In [None]:
ti_selection_plot('B', 'b)','mean balanced linear score', 'mean_test_linear', FIG_DIR+'ti_selection_B_linear.pdf', ylim=[0.3,1], score_ref='ref_linear')

In [None]:
ti_selection_plot('C', 'c)','mean balanced linear score', 'mean_test_linear', FIG_DIR+'ti_selection_C_linear.pdf', ylim=[0.3,1], score_ref='ref_linear')

## Roussillon

Plot example simulations.

In [None]:
# Grids of the Roussillon case, all read from DATA_DIR: the two candidate
# training images, the mask, the trend image and the orientation map.
ti_true = readImageGslib(DATA_DIR+'trueTI.gslib')
ti_analog = readImageGslib(DATA_DIR+'analogTI.gslib')
mask = readImageGslib(DATA_DIR+'mask.gslib')
trend = readImageGslib(DATA_DIR+'trend.gslib')
im_angle = readImageGslib(DATA_DIR+'orientation.gslib')

# The grid geometry is copied from the mask image, one axis per line:
# number of cells, cell unit, origin (corner of the "first" grid cell).
nx, sx, ox = mask.nx, mask.sx, mask.ox
ny, sy, oy = mask.ny, mask.sy, mask.oy
nz, sz, oz = mask.nz, mask.sz, mask.oz

# Estimator shared by the example simulations below; TI, nneighboringNode,
# distanceThreshold and maxScanFraction are overridden per run in simulate().
deesse = DeesseEstimator(
    varnames=['X','Y','Z','Facies'],
    # grid geometry
    nx=nx, ny=ny, nz=nz,
    sx=sx, sy=sy, sz=sz,
    ox=ox, oy=oy, oz=oz,
    mask=mask.val,
    # simulated variables and training image
    nv=2,
    varname=['Facies', 'trend'],
    outputVarFlag=[True, False],
    nTI=1,
    TI=ti_true,
    dataImage=trend,
    # rotation
    rotationUsage=1,            # use rotation without tolerance
    rotationAzimuthLocal=True,  #    rotation according to azimuth: local
    rotationAzimuth=im_angle.val[0,:,:,:],      #    rotation azimuth: map of values
    # search parameters
    distanceType=[0,1],
    nneighboringNode=[50,1],
    distanceThreshold=[0.05, 0.05],
    maxScanFraction=0.5,
    npostProcessingPathMax=1,
    # run control
    seed=20191201,
    nrealization=1,
    nthreads=8,
)

def simulate(nsamples, TI, nneighboringNode,distanceThreshold, maxScanFraction):
    """Run the shared ``deesse`` estimator on one Roussillon observation set.

    Reads ``roussillon_observations_<nsamples>.gslib`` from ``SAMPLES_DIR``,
    updates the module-level ``deesse`` estimator with the given parameters
    (the estimator is modified in place), fits it on the X/Y/Z coordinates and
    the ``Facies_real00000`` values, and returns the result of
    ``deesse.simulate()``.

    Parameters
    ----------
    nsamples : int
        Number of observation points; selects the point-set file.
    TI
        Training image passed to the estimator.
    nneighboringNode, distanceThreshold, maxScanFraction
        Values forwarded unchanged to ``deesse.set_params``.
    """
    observations_path = SAMPLES_DIR+'roussillon_observations_{}.gslib'.format(nsamples)
    point_set = readPointSetGslib(observations_path)
    hd = pd.DataFrame(point_set.to_dict())

    run_params = {
        'TI': TI,
        'nneighboringNode': nneighboringNode,
        'distanceThreshold': distanceThreshold,
        'maxScanFraction': maxScanFraction,
    }
    deesse.set_params(**run_params)

    deesse.fit(hd[['X', 'Y', 'Z']], hd['Facies_real00000'])
    return deesse.simulate()

In [None]:
def plot_example_roussillon(score_name, nsamples, n, t, f, ti_name, removeColorbar=True):

    ti = readImageGslib(ti_name)
    if ti_name == 'data/trueTI.gslib':
        ti_shortname = 'true'
    else:
        ti_shortname = 'analog'
        
    if score_name == 'brier':
        score_name = 'quadratic'
    elif score_name == 'zero_one':
        score_name = 'zero-one'
    filename = FIG_DIR + "ex_roussillon_{0}_{1}_{2}.pdf".format(nsamples, score_name, ti_shortname)


    FONT_SIZE = 16
    COLOR_SCHEME_ROUSSILLON = [ 
            [x/255 for x in [166,206,227]],
            [x/255 for x in [178,223,138]],   
            [x/255 for x in [31,120,180]],
            [x/255 for x in [51,160,44]],
          ]
    LEGEND = ['alluvial fan', 'flood plain', 'splay', 'river bed']
    EXCLUDED_VAL = -9999999

    image = simulate(nsamples, ti, n, t, f)['sim'][0]

    fig = plt.figure(figsize=(5,5))
    fig.subplots_adjust(left=0.05, right=0.9)
    xmin, xmax = [int(x) for x in [image.xmin(), image.xmax()]]
    ymin, ymax = [int(y) for y in [image.ymin(), image.ymax()]]
    drawImage2D(image, excludedVal=EXCLUDED_VAL,
                title = "{0} wells, {1}, {2} TI".format(nsamples, score_name, ti_shortname),
                removeColorbar=removeColorbar,
                categ=True,
                categColbad='white',
                categCol=COLOR_SCHEME_ROUSSILLON,
                cticklabels=LEGEND,
                title_fontsize=FONT_SIZE,
                cticklabels_fontsize=FONT_SIZE,
                xlabels_fontsize=FONT_SIZE,
                ylabels_fontsize=FONT_SIZE,
                xticklabels = [xmin, xmax],
                yticklabels = [ymin, ymax],
                xticklabels_fontsize=FONT_SIZE,
                yticklabels_fontsize=FONT_SIZE,
                xticks=[xmin, xmax],
                yticks=[ymin, ymax],
                ylabel_rotation=0,
               )
    
    #plt.scatter(point_set_roussillon.x(), point_set_roussillon.y(), marker= 'x', s=30, c='black')
    plt.savefig(filename, dpi=DPI, bbox_inches="tight")
    #!convert -trim $filename $filename
    !pdfcrop $filename $filename

In [None]:
plot_example_roussillon('brier', 150, [10, 1], [0.5, 0.1], 0.001, 'data/trueTI.gslib')

In [None]:
# Simulate and plot one example for every row of the best-parameter table.
# The list-valued parameters are stored as strings such as "[8, 1]";
# ast.literal_eval parses them as plain literals instead of executing them
# as code, which eval would do.
for index, row in df_best_roussillon.iterrows():
    plot_example_roussillon(row['score_method'], row['nsamples'], ast.literal_eval(row['param_nneighboringNode']),
                           ast.literal_eval(row['param_distanceThreshold']), row['param_maxScanFraction'],
                           row['param_TI'])