# Analysis of cross-validation results

In [None]:
import ast
import os
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold

from mpstool.cv_metrics import brier_score, zero_one_score, balanced_linear_score

from geone.img import readImageGslib, readPointSetGslib
from geone.img import Img
from geone.imgplot import drawImage2D
from geone.deesseinterface import DeesseEstimator

In [None]:
# Directory layout used throughout the notebook (relative paths).
OUTPUT_DIR = 'output/'    # result tables (csv) are read from here
DATA_DIR = 'data/'        # gslib images: training images, mask, trend, orientation
SAMPLES_DIR = 'samples/'  # gslib point sets holding the observations

## Training image selection

The benchmark case of training image selection with three candidate training images.
First, we check the sensitivity of the scores to the number of realisations used to estimate the probabilities.
Second, we perform training image selection for different data sets.

### Sensitivity analysis

First, check the sensitivity of cross-validation to the number of realisations. There are 3 csv files: sensitivity_A.csv, sensitivity_B.csv and sensitivity_C.csv.

In [None]:
# The analysis was run for three dataset types (A, B, C); load one result
# table per type, keyed by the type letter.
sensitivity_results = {
    case: pd.read_csv(OUTPUT_DIR + 'sensitivity_{}.csv'.format(case))
    for case in ('A', 'B', 'C')
}

In [None]:
# Example of the data frame for dataset A: candidate TI, number of
# realisations and mean Brier test score (preview of the first rows).
df_sensitivity = sensitivity_results['A']
df_sensitivity[['param_TI', 'param_nrealization', 'mean_test_brier']].head()

In [None]:
def reference_score(observation_filename, score, varname='code_real00000'):
    """
    Return the baseline score of an observation set.

    The gslib point set ``observation_filename`` is loaded into a data frame and
    a ``DummyClassifier`` with the ``prior`` strategy is cross-validated on it
    (5 stratified, shuffled folds with a fixed random state).  ``score`` is
    forwarded as the ``scoring`` argument, ``varname`` names the target column
    and the columns X, Y, Z are the features.  The mean of the per-fold test
    scores is returned.
    """
    observations = pd.DataFrame(readPointSetGslib(observation_filename).to_dict())
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=20191201)
    baseline = DummyClassifier(strategy='prior')
    fold_results = cross_validate(
        baseline,
        X=observations[['X', 'Y', 'Z']],
        y=observations[varname],
        cv=folds,
        scoring=score,
    )
    return np.mean(fold_results['test_score'])

In [None]:
def plot_sensitivities(score, score_column):
    """
    Draw one figure per dataset type (A, B, C) with ``score_column`` plotted
    against the number of realizations for each candidate TI, together with a
    horizontal black line at the reference score of the 50-sample observation
    set (computed with ``score``).
    """
    for test_case in ('A', 'B', 'C'):
        plt.figure()
        axes = plt.gca()
        results = sensitivity_results[test_case]
        # one curve per candidate training image, drawn in the legend order
        for ti_path in ('data/A.gslib', 'data/B.gslib', 'data/C.gslib'):
            per_ti = results[results['param_TI'] == ti_path]
            per_ti.plot(x='param_nrealization', y=score_column, ax=axes)
        baseline = reference_score(SAMPLES_DIR + 'sample_{}_50.gslib'.format(test_case), score)
        axes.axhline(baseline, c='black')
        plt.legend(['A', 'B', 'C', 'reference'])

In [None]:
# Sensitivity of the Brier score to the number of realizations
plot_sensitivities(score=brier_score, score_column='mean_test_brier')

In [None]:
# Sensitivity of the zero-one score to the number of realizations
plot_sensitivities(score=zero_one_score, score_column='mean_test_zero_one')

In [None]:
# Sensitivity of the balanced linear score to the number of realizations
plot_sensitivities(score=balanced_linear_score, score_column='mean_test_linear')

### TI selection

Now, let's move on to training image selection for datasets with different numbers of samples.
Each result file (csv) has 3 entries (one line for each candidate TI). We need to gather the files in order to present them in common plots.

The results are in OUTPUT_DIR and their names start with 'sample'. Each result file has the same name as its observation set, but with a .csv extension.

In [None]:
# Result tables of the TI-selection runs: csv files in OUTPUT_DIR whose name
# starts with "sample".
result_files = [
    name
    for name in os.listdir(OUTPUT_DIR)
    if name.startswith("sample") and name.endswith(".csv")
]

In [None]:
def case_and_samples(result_filename):
    """
    Gets A, B or C (case) and number of samples from string representing name of result file

    The expected pattern is ``sample_<case>_<nsamples>.csv``; for example
    ``sample_A_50.csv`` gives ``('A', 50)``.

    Returns a ``(case, nsamples)`` tuple with ``nsamples`` converted to int.

    Raises IndexError when the name does not contain ``sample_`` and ValueError
    when the remainder does not split into exactly two fields or the sample
    count is not an integer.  In both cases the offending name is printed
    before the exception propagates.
    """
    try:
        stem = result_filename.split('sample_')[1].split('.csv')[0]
        case, nsamples = stem.split('_')
        return case, int(nsamples)
    except (IndexError, ValueError):
        # Unpacking and int() fail with ValueError, not IndexError; report the
        # file name for those failures too so the bad file can be identified.
        print(result_filename)
        raise

In [None]:
# Gather all results in one DataFrame and add reference scores.
# The per-file tables are collected in a list and concatenated once at the
# end: DataFrame.append was removed in pandas 2.0.
ti_selection_frames = []
for result in result_files:
    df = pd.read_csv(OUTPUT_DIR+result)
    df['type'], df['nsamples'] = case_and_samples(result)
    # observation set with the same base name as the result file
    observation_file = SAMPLES_DIR + os.path.splitext(result)[0] + '.gslib'
    df['ref_brier'] = reference_score(observation_file, brier_score)
    df['ref_zero_one'] = reference_score(observation_file, zero_one_score)
    df['ref_linear'] = reference_score(observation_file, balanced_linear_score)
    ti_selection_frames.append(df)
# pd.concat refuses an empty list; fall back to an empty frame when no result
# file was found.
df_ti_selection = pd.concat(ti_selection_frames) if ti_selection_frames else pd.DataFrame()

In [None]:
# Preview of the gathered table: candidate TI and mean Brier test score
df_ti_selection[['param_TI', 'mean_test_brier']].head()

In [None]:
def plot_ti_selection(ref_score_name, score_name):
    """
    Draw one figure per observation type (A, B, C) with ``score_name`` plotted
    against the number of samples (logarithmic x axis) for each candidate TI,
    plus the ``ref_score_name`` column as the reference curve.
    """
    for observation_type in ('A', 'B', 'C'):
        plt.figure()
        axes = plt.gca()
        of_type = df_ti_selection[df_ti_selection.type == observation_type]
        ordered = of_type.sort_values(by='nsamples')
        # one curve per candidate training image, drawn in the legend order
        for ti_path in ('data/A.gslib', 'data/B.gslib', 'data/C.gslib'):
            per_ti = ordered[ordered.param_TI == ti_path]
            per_ti.plot(x='nsamples', y=score_name, ax=axes, logx=True)
        ordered.plot(x='nsamples', y=ref_score_name, ax=axes, logx=True)
        plt.legend(['A', 'B', 'C', 'ref'])

In [None]:
# TI selection judged by the Brier score
plot_ti_selection(ref_score_name='ref_brier', score_name='mean_test_brier')

In [None]:
# TI selection judged by the zero-one score
plot_ti_selection(ref_score_name='ref_zero_one', score_name='mean_test_zero_one')

In [None]:
# TI selection judged by the balanced linear score
plot_ti_selection(ref_score_name='ref_linear', score_name='mean_test_linear')

## Parameter selection (Roussillon)

Three datasets with: 50, 150, 600 points each. In output directory: roussillon_observations_50.csv, etc.

In [None]:
# Result table of the Roussillon case for the 50-point dataset (preview)
df_roussillon = pd.read_csv(OUTPUT_DIR+'roussillon_observations_50.csv')
df_roussillon.head()

In [None]:
def best_results_for_each_TI(nsamples, score, score_name):
    """
    Return the best-scoring row of each Roussillon training image.

    Reads ``roussillon_observations_<nsamples>.csv`` from OUTPUT_DIR, adds a
    ``ref_score`` column holding the reference score of the matching
    observation set (computed with ``score`` on the ``Facies_real00000``
    variable) and keeps, for the true TI and for the analog TI, the row with
    the highest ``mean_test_<score_name>``.

    Returns a DataFrame with at most two rows (true TI first, analog TI
    second) and a fresh 0-based index.
    """
    df = pd.read_csv(OUTPUT_DIR+'roussillon_observations_{}.csv'.format(nsamples))
    df['ref_score'] = reference_score(
        observation_filename=SAMPLES_DIR+'roussillon_observations_{}.gslib'.format(nsamples),
        score=score,
        varname='Facies_real00000',
    )
    score_column = 'mean_test_'+score_name
    best_rows = [
        df[df['param_TI'] == ti_path].sort_values(score_column, ascending=False).head(1)
        for ti_path in ('data/trueTI.gslib', 'data/analogTI.gslib')
    ]
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    return pd.concat(best_rows, ignore_index=True)

In [None]:
# Columns kept in the summary table
info = ['param_TI',
        'param_distanceThreshold',
        'param_maxScanFraction',
        'param_nneighboringNode',
        'mean_test_score',
        'score_method',
        'nsamples',
        'ref_score'
       ]

# Best row per TI for every (number of points, scoring function) pair.
# The pieces are collected in a list and concatenated once: DataFrame.append
# was removed in pandas 2.0.
best_frames = []
for nsamples in [50, 150, 600]:
    for score_function, score_label in [(brier_score, 'brier'),
                                        (zero_one_score, 'zero_one'),
                                        (balanced_linear_score, 'linear')]:
        df = best_results_for_each_TI(nsamples, score_function, score_label)
        # copy the score under a common column name so all functions share one table
        df['mean_test_score'] = df['mean_test_{}'.format(score_label)]
        df['score_method'] = score_label
        df['nsamples'] = nsamples
        best_frames.append(df[info])
df_best_roussillon = pd.concat(best_frames, ignore_index=True)
df_best_roussillon

In [None]:
# Short column names used in the publication table
new_columns = {
    "param_TI" : "TI",
    "param_distanceThreshold" : "t",
    "param_maxScanFraction" : "f",
    "param_nneighboringNode" : "n",
    "mean_test_score" : "score",
    "ref_score" : "reference",   
    "score_method" : "function",
    "nsamples" : "wells",
}

# rename() returns a new frame; df_best_roussillon keeps its original column names
df_renamed = df_best_roussillon.rename(columns=new_columns)
df_renamed.head()

In [None]:
def transform_entries(df_original):
    """
    Return a copy of the renamed results table formatted for publication.

    Expects the columns ``TI``, ``t``, ``n``, ``score``, ``reference`` and
    ``function``.  In the copy:

    * ``TI`` is shortened from e.g. ``data/trueTI.gslib`` to ``true``;
    * ``t`` and ``n`` are reduced to the first item of their list value;
    * ``score`` and ``reference`` are rounded to two decimals;
    * underscores in ``function`` are replaced by hyphens.

    ``df_original`` itself is not modified.
    """
    def first_item(cell):
        # Cells read back from csv hold the list as text, e.g. "[0.05, 0.05]".
        # literal_eval accepts only Python literals, so text from the file is
        # never executed as code (unlike eval).  Cells that already are
        # sequences are used as they are.
        values = ast.literal_eval(cell) if isinstance(cell, str) else cell
        return values[0]

    df = df_original.copy()
    df['TI'] = df['TI'].apply(lambda x: x.split('/')[1].split('TI')[0])
    df['t'] = df['t'].apply(first_item)
    df['n'] = df['n'].apply(first_item)
    df['score'] = df['score'].round(2)
    df['reference'] = df['reference'].round(2)
    df['function'] = df['function'].apply(lambda x: x.replace('_', '-'))
    return df

# Publication-formatted version of the renamed table (preview)
df_publication = transform_entries(df_renamed)
df_publication.head()

In [None]:
# Write the publication table, with the columns in the order listed and
# without the index, to a LaTeX file.
# NOTE(review): to_latex is given a target path here, so df_latex presumably
# ends up as None - confirm and drop the assignment if it is unused.
# NOTE(review): assumes the 'tables/' directory already exists.
df_latex = df_publication[['wells', 'function', 'TI', 'score', 'reference', 't', 'f', 'n']].to_latex('tables/table-roussillon.tex', index=False)


# Figures for publication

In [None]:
# Shared look of the publication figures.
# One RGB triple (scaled to the 0-1 range) per candidate training image.
COLOR_SCHEME = [
        [x/255 for x in [166,206,227]],
        [x/255 for x in [31,120,180]],
        [x/255 for x in [51,160,44]],
      ]
FONTSIZE = 16
FIG_DIR = 'figures/'
DPI=600

import matplotlib

# This rcParam takes the name of an interpolation method; the string 'none'
# disables interpolation.  Assigning the None object instead is rejected by
# recent matplotlib releases.
matplotlib.rcParams["image.interpolation"] = 'none'
# Font type 42 (TrueType) for the PDF output
matplotlib.rcParams['pdf.fonttype'] = 42

In [None]:
def sensitivity_plot(test_case, title, ylabel, scoring, filename, score, ylim=[-1.0, 0.0], loc='lower right'):
    fig = plt.figure(figsize=(5,5))
    ax = plt.axes()
    df = sensitivity_results[test_case]
    colors=COLOR_SCHEME
    labels=['TI: A', 'TI: B', 'TI: C', 'ref.']
    markers=['o', 'x', '*']

    for i, ti in enumerate(['A', 'B', 'C']):
        df_plot = df[df['param_TI'] == 'data/{}.gslib'.format(ti)]
        ax.plot(df_plot['param_nrealization'], df_plot[scoring], label=labels[i], linestyle='--', color=colors[i], marker=markers[i])
    # reference line
    x = np.linspace(0,50)
    ax.plot(x, np.ones(len(x))*reference_score(SAMPLES_DIR+'sample_{}_50.gslib'.format(test_case), score), linestyle='--', color='black', label=labels[-1])
    
    ax.legend(loc=loc, fontsize=FONTSIZE, ncol=2)
    ax.set_xlabel("#realizations", fontsize=FONTSIZE)
    ax.set_ylim(ylim)
    ax.set_ylabel(" ", fontsize=FONTSIZE)  
    ax.set_title("{}, {}".format(ylabel,title), fontsize=FONTSIZE)
    ax.tick_params(axis='both', which='major', labelsize=FONTSIZE)
    plt.savefig(filename, dpi=DPI, bbox_inches='tight')
    #!convert -trim $filename $filename
    !pdfcrop $filename $filename

In [None]:
# Panel a): quadratic (Brier) score, dataset A
sensitivity_plot(test_case='A', title='a)', ylabel='mean quadratic score',
                 scoring='mean_test_brier', filename=FIG_DIR+'sensitivity_A_brier.pdf',
                 score=brier_score)

In [None]:
# Panel b): quadratic (Brier) score, dataset B
sensitivity_plot(test_case='B', title='b)', ylabel='mean quadratic score',
                 scoring='mean_test_brier', filename=FIG_DIR+'sensitivity_B_brier.pdf',
                 score=brier_score)

In [None]:
# Panel c): quadratic (Brier) score, dataset C
sensitivity_plot(test_case='C', title='c)', ylabel='mean quadratic score',
                 scoring='mean_test_brier', filename=FIG_DIR+'sensitivity_C_brier.pdf',
                 score=brier_score)

In [None]:
# Panel a): zero-one score, dataset A
sensitivity_plot(test_case='A', title='a)', ylabel='mean zero-one score',
                 scoring='mean_test_zero_one', filename=FIG_DIR+'sensitivity_A_zero_one.pdf',
                 score=zero_one_score, ylim=[0.5,1], loc=None)

In [None]:
# Panel b): zero-one score, dataset B
sensitivity_plot(test_case='B', title='b)', ylabel='mean zero-one score',
                 scoring='mean_test_zero_one', filename=FIG_DIR+'sensitivity_B_zero_one.pdf',
                 score=zero_one_score, ylim=[0.5,1], loc='upper right')

In [None]:
# Panel c): zero-one score, dataset C
sensitivity_plot(test_case='C', title='c)', ylabel='mean zero-one score',
                 scoring='mean_test_zero_one', filename=FIG_DIR+'sensitivity_C_zero_one.pdf',
                 score=zero_one_score, ylim=[0.5,1], loc=(0.2,0.6))

In [None]:
# Panel a): balanced linear score, dataset A
sensitivity_plot(test_case='A', title='a)', ylabel='mean balanced linear score',
                 scoring='mean_test_linear', filename=FIG_DIR+'sensitivity_A_linear.pdf',
                 score=balanced_linear_score, ylim=[0.3,1], loc="upper right")

In [None]:
# Panel b): balanced linear score, dataset B
sensitivity_plot(test_case='B', title='b)', ylabel='mean balanced linear score',
                 scoring='mean_test_linear', filename=FIG_DIR+'sensitivity_B_linear.pdf',
                 score=balanced_linear_score, ylim=[0.3,1])

In [None]:
# Panel c): balanced linear score, dataset C
sensitivity_plot(test_case='C', title='c)', ylabel='mean balanced linear score',
                 scoring='mean_test_linear', filename=FIG_DIR+'sensitivity_C_linear.pdf',
                 score=balanced_linear_score, ylim=[0.3,1])

## TI selection

In [None]:
def ti_selection_plot(test_case, title, ylabel, scoring, filename, score_ref, ylim=[-1,0], loc="lower right"):
    fig = plt.figure(figsize=(5,5))
    ax = plt.axes()
    df = df_ti_selection[df_ti_selection.type==test_case].sort_values(by = 'nsamples')
    colors=COLOR_SCHEME
    labels=['TI: A', 'TI: B', 'TI: C', 'ref']
    markers=['o', 'x', '*']
    for i, ti in enumerate(['A', 'B', 'C']):
        df_plot = df[df['param_TI'] == 'data/{}.gslib'.format(ti)]
        ax.semilogx(df_plot['nsamples'], df_plot[scoring], label=labels[i], linestyle='--', color=colors[i], marker=markers[i])
    ax.semilogx(df_plot['nsamples'], df_plot[score_ref], label=labels[-1], linestyle='--', color='black')
    #ax.legend(loc=loc, fontsize=FONTSIZE, ncol=2)
    ax.set_xlabel("#samples", fontsize=FONTSIZE)
    ax.set_ylim(ylim)
    ax.set_title("{}, {}".format(ylabel,title), fontsize=FONTSIZE)
    ax.tick_params(axis='both', which='major', labelsize=FONTSIZE)
    plt.savefig(filename, dpi=DPI, bbox_inches='tight')
    #!convert -trim $filename $filename
    !pdfcrop $filename $filename

In [None]:
# Panel a): quadratic (Brier) score, observation type A
ti_selection_plot(test_case='A', title='a)', ylabel='mean quadratic score',
                  scoring='mean_test_brier', filename=FIG_DIR+'ti_selection_A_brier.pdf',
                  score_ref='ref_brier')

In [None]:
# Panel b): quadratic (Brier) score, observation type B
ti_selection_plot(test_case='B', title='b)', ylabel='mean quadratic score',
                  scoring='mean_test_brier', filename=FIG_DIR+'ti_selection_B_brier.pdf',
                  score_ref='ref_brier')

In [None]:
# Panel c): quadratic (Brier) score, observation type C
ti_selection_plot(test_case='C', title='c)', ylabel='mean quadratic score',
                  scoring='mean_test_brier', filename=FIG_DIR+'ti_selection_C_brier.pdf',
                  score_ref='ref_brier')

In [None]:
# Panel a): zero-one score, observation type A
ti_selection_plot(test_case='A', title='a)', ylabel='mean zero-one score',
                  scoring='mean_test_zero_one', filename=FIG_DIR+'ti_selection_A_zero_one.pdf',
                  score_ref='ref_zero_one', ylim=[0.5,1], loc="upper left")

In [None]:
# Panel b): zero-one score, observation type B
ti_selection_plot(test_case='B', title='b)', ylabel='mean zero-one score',
                  scoring='mean_test_zero_one', filename=FIG_DIR+'ti_selection_B_zero_one.pdf',
                  score_ref='ref_zero_one', ylim=[0.5,1], loc="upper left")

In [None]:
# Panel c): zero-one score, observation type C
ti_selection_plot(test_case='C', title='c)', ylabel='mean zero-one score',
                  scoring='mean_test_zero_one', filename=FIG_DIR+'ti_selection_C_zero_one.pdf',
                  score_ref='ref_zero_one', ylim=[0.5,1])

In [None]:
# Panel a): balanced linear score, observation type A
ti_selection_plot(test_case='A', title='a)', ylabel='mean balanced linear score',
                  scoring='mean_test_linear', filename=FIG_DIR+'ti_selection_A_linear.pdf',
                  score_ref='ref_linear', ylim=[0.3,1])

In [None]:
# Panel b): balanced linear score, observation type B
ti_selection_plot(test_case='B', title='b)', ylabel='mean balanced linear score',
                  scoring='mean_test_linear', filename=FIG_DIR+'ti_selection_B_linear.pdf',
                  score_ref='ref_linear', ylim=[0.3,1])

In [None]:
# Panel c): balanced linear score, observation type C
ti_selection_plot(test_case='C', title='c)', ylabel='mean balanced linear score',
                  scoring='mean_test_linear', filename=FIG_DIR+'ti_selection_C_linear.pdf',
                  score_ref='ref_linear', ylim=[0.3,1])

## Roussillon

Plot example simulations.

In [None]:
# Input images of the Roussillon case, all read from DATA_DIR
ti_true = readImageGslib(DATA_DIR+'trueTI.gslib')
ti_analog = readImageGslib(DATA_DIR+'analogTI.gslib')
mask = readImageGslib(DATA_DIR+'mask.gslib')
trend = readImageGslib(DATA_DIR+'trend.gslib')
im_angle = readImageGslib(DATA_DIR+'orientation.gslib')
# The grid geometry passed to the estimator is taken from the mask image
nx, ny, nz = mask.nx, mask.ny, mask.nz      # number of cells
sx, sy, sz = mask.sx, mask.sy, mask.sz      # cell unit
ox, oy, oz = mask.ox, mask.oy, mask.oz      # origin (corner of the "first" grid cell)

# Estimator shared by all example simulations.  TI, nneighboringNode,
# distanceThreshold and maxScanFraction set here are only starting values:
# simulate() overrides them before every run.
deesse = DeesseEstimator(
    varnames=['X','Y','Z','Facies'],
    nx=nx, ny=ny, nz=nz,
    sx=sx, sy=sy, sz=sz,
    ox=ox, oy=oy, oz=oz,
    nv=2, varname=['Facies', 'trend'],  # two variables; the per-variable lists below have one entry each
    nTI=1, TI=ti_true,
    mask=mask.val,
    rotationUsage=1,            # use rotation without tolerance
    rotationAzimuthLocal=True,  #    rotation according to azimuth: local
    rotationAzimuth=im_angle.val[0,:,:,:],      #    rotation azimuth: map of values
    dataImage=trend,
    outputVarFlag=[True, False],  # presumably only the first variable (Facies) is output - confirm
    distanceType=[0,1],
    nneighboringNode=[50,1],
    distanceThreshold=[0.05, 0.05],
    maxScanFraction=0.5,
    npostProcessingPathMax=1,
    seed=20191201,              # fixed seed
    nrealization=1,
    nthreads=8,
)

def simulate(nsamples, TI, nneighboringNode,distanceThreshold, maxScanFraction):
    """
    Run the shared ``deesse`` estimator on one Roussillon observation set.

    The observation set with ``nsamples`` points is read from SAMPLES_DIR, the
    estimator parameters TI, nneighboringNode, distanceThreshold and
    maxScanFraction are replaced by the given values, the estimator is fitted
    on the X, Y, Z coordinates against ``Facies_real00000`` and the result of
    ``deesse.simulate()`` is returned.  The module-level ``deesse`` object
    keeps the new parameters afterwards.
    """
    point_set = readPointSetGslib(SAMPLES_DIR+'roussillon_observations_{}.gslib'.format(nsamples))
    observations = pd.DataFrame(point_set.to_dict())
    overrides = dict(
        TI=TI,
        nneighboringNode=nneighboringNode,
        distanceThreshold=distanceThreshold,
        maxScanFraction=maxScanFraction,
    )
    deesse.set_params(**overrides)
    deesse.fit(observations[['X', 'Y', 'Z']], observations['Facies_real00000'])
    return deesse.simulate()

In [None]:
def plot_example_roussillon(score_name, nsamples, n, t, f, ti_name, removeColorbar=True):

    ti = readImageGslib(ti_name)
    if ti_name == 'data/trueTI.gslib':
        ti_shortname = 'true'
    else:
        ti_shortname = 'analog'
        
    if score_name == 'brier':
        score_name = 'quadratic'
    elif score_name == 'zero_one':
        score_name = 'zero-one'
    filename = FIG_DIR + "ex_roussillon_{0}_{1}_{2}.pdf".format(nsamples, score_name, ti_shortname)


    FONT_SIZE = 16
    COLOR_SCHEME_ROUSSILLON = [ 
            [x/255 for x in [166,206,227]],
            [x/255 for x in [178,223,138]],   
            [x/255 for x in [31,120,180]],
            [x/255 for x in [51,160,44]],
          ]
    LEGEND = ['alluvial fan', 'flood plain', 'splay', 'river bed']
    EXCLUDED_VAL = -9999999

    image = simulate(nsamples, ti, n, t, f)['sim'][0]

    fig = plt.figure(figsize=(5,5))
    fig.subplots_adjust(left=0.05, right=0.9)
    xmin, xmax = [int(x) for x in [image.xmin(), image.xmax()]]
    ymin, ymax = [int(y) for y in [image.ymin(), image.ymax()]]
    drawImage2D(image, excludedVal=EXCLUDED_VAL,
                title = "{0} wells, {1}, {2} TI".format(nsamples, score_name, ti_shortname),
                removeColorbar=removeColorbar,
                categ=True,
                categColbad='white',
                categCol=COLOR_SCHEME_ROUSSILLON,
                cticklabels=LEGEND,
                title_fontsize=FONT_SIZE,
                cticklabels_fontsize=FONT_SIZE,
                xlabels_fontsize=FONT_SIZE,
                ylabels_fontsize=FONT_SIZE,
                xticklabels = [xmin, xmax],
                yticklabels = [ymin, ymax],
                xticklabels_fontsize=FONT_SIZE,
                yticklabels_fontsize=FONT_SIZE,
                xticks=[xmin, xmax],
                yticks=[ymin, ymax],
                ylabel_rotation=0,
               )
    
    #plt.scatter(point_set_roussillon.x(), point_set_roussillon.y(), marker= 'x', s=30, c='black')
    plt.savefig(filename, dpi=DPI, bbox_inches="tight")
    #!convert -trim $filename $filename
    !pdfcrop $filename $filename

In [None]:
# Single example with hand-picked parameters: 150 wells, true TI
plot_example_roussillon(score_name='brier', nsamples=150, n=[10, 1], t=[0.5, 0.1],
                        f=0.001, ti_name='data/trueTI.gslib')

In [None]:
# One example figure per row of the best-parameter table.
# The two list-valued parameters are held as text in that table; they are
# parsed with ast.literal_eval, which accepts only Python literals, instead of
# eval, which would execute whatever the result files contain.
for _, row in df_best_roussillon.iterrows():
    plot_example_roussillon(
        row['score_method'],
        row['nsamples'],
        ast.literal_eval(row['param_nneighboringNode']),
        ast.literal_eval(row['param_distanceThreshold']),
        row['param_maxScanFraction'],
        row['param_TI'],
    )