In [None]:
%reload_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')  # ignore warnings to avoid flooding the gridsearch output with repetitive messages (works for single cpu)
import os
os.environ["PYTHONWARNINGS"] = "ignore"  # "ignore::FutureWarning"  # ignore warnings to avoid flooding the gridsearch output with repetitive messages (works for parallel)
import time

import re
from pathlib import Path
import pickle
import numpy as np
import pandas as pd
pd.options.plotting.backend = 'holoviews'
import geopandas as gpd

import sklearn
print(f'sklearn version: {sklearn.__version__}')  # log the installed scikit-learn version alongside the results
from sklearn import clone
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, RobustScaler, MaxAbsScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer

from sklearn.dummy import DummyRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    # AdaBoostRegressor,
    # GradientBoostingRegressor,
    # HistGradientBoostingRegressor,
)
from xgboost import XGBRegressor
# from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
# from sklearn.svm import SVR
from sklearn.linear_model import TweedieRegressor

# If running on the phy-server, local modules will not be found unless their directory is added to the
# module search path. The working directory is switched to that directory as well.
# On any other machine the directory cannot be entered and the working directory is left as it is.
import sys
sys.path.append("/silod7/lenz/MPSchleiSediments/analysis/")
try:
    os.chdir("/silod7/lenz/MPSchleiSediments/analysis/")
except OSError:  # directory missing / not accessible: we are not on the server. Anything else should surface.
    pass

import prepare_data
import geo
from components import PCOA
#from helpers import PipelineHelper, SMWrapper
from plots import repNCV_score_plots, ncv_pie, ensemble_pred_histograms, per_err_agg_bar, scatter_chart
from cv import compete_rep_ncv, compara_rep_ncv, best_scored, append_agg_cv_scores, aggregation, aggregate_folds_only, transform_score_df, process_results, make_setup_dict, make_header, rensembling, augment_predictions, performance
from cv_helpers import generate_feature_sets, SelectFeatures, unnegate, inter_rank, fix_feature_combis, ensemble_predict, aggregate_predictions, iqm, check_testset_duplicates

## Snapshot (as a list) of all names that exist in the namespace at this point, i.e. the names
## originating from kernel initialisation and from the imports above.
_lokeys = [k for k in locals().keys()]
## The save cell at the end of the notebook uses this list to tell user defined variables apart from
## pre-existing names, in order to pickle the user defined ones (workspace serialisation).

from settings import Config, shortnames, target, featurelist

## Data preparation

In [None]:
%%capture
# cell magic to supress output. Comment it out to see output of this cell.

# Starting point: DB extract and blank procedure are already done. Their result (MP data) is imported from csv.
mp_pdd = prepare_data.get_pdd()

# Sediment data (sediment frequencies per size bin from master sizer export); only the first two
# returned items are used. The first item returned by PCOA(..., 2) is kept for each of them.
grainsize_iow, grainsize_cau = prepare_data.get_grainsizes()[0:2]
scor_iow, scor_cau = (PCOA(grainsize, 2)[0] for grainsize in (grainsize_iow, grainsize_cau))

# IOW samples: aggregate the particle domain data to sample domain data, merge in additional sample data
# and the PCOA result, then switch to the short sample names and sort by them.
sdd_iow = (
    prepare_data.additional_sdd_merging(prepare_data.aggregate_SDD(mp_pdd), how='outer')
    .merge(scor_iow, right_index=True, left_on='Sample', how='outer')
    .replace({'Sample': shortnames})
    .sort_values(by='Sample')
)

# CAU samples: sampling log joined with the GRADISTAT export and the PCOA result...
sdd_cau = (
    pd.read_csv('../data/Metadata_CAU_sampling_log.csv', index_col=0)
    .join(
        prepare_data.fix_gradistat_names(
            pd.read_csv('../data/GRADISTAT_CAU_vol_log-cau_closed.csv', index_col=0)
        ),
        how='outer',
    )
    .merge(scor_cau, right_index=True, left_on='Sample', how='outer')
    .reset_index()
)
# ...then extended by the distance to shore, imputed, and extended by the WWTP influence.
sdd_cau['Dist_Land'] = geo.get_distance_to_shore(sdd_cau['LON'], sdd_cau['LAT'])
sdd_cau = prepare_data.impute_cau(sdd_cau)
sdd_cau = geo.get_wwtp_influence(
    sdd_cau,
    tracks_file='../data/BAW_tracer_simulations.zip',
    file_postfix='_CAU',
)

In [None]:
## Split data into samples used for building the model (target value available)
## and samples used for predicting only (no target value; includes all CAU samples).
_has_target = sdd_iow[target].notna()

samples_with_response_and_predictor_data = sdd_iow.loc[_has_target].set_index('Sample')
samples_with_only_predictor_data = (
    pd.concat([sdd_iow.loc[~_has_target], sdd_cau.drop('Date', axis=1)])
    .set_index('Sample')
    .drop('S09')  # apparently there is neither MP nor sediment data for S09, so drop it.
)

In [None]:
## Samples which are not suitable for model building ("hydrodynamic outliers")
## are moved from the modelling data to the prediction data.
droplist = ['S32','S05']
_outlier_rows = samples_with_response_and_predictor_data.loc[droplist]
samples_with_only_predictor_data = pd.concat([samples_with_only_predictor_data, _outlier_rows])
samples_with_response_and_predictor_data = samples_with_response_and_predictor_data.drop(index=droplist)
## Modelling samples first, prediction-only samples after them
all_samples = pd.concat([samples_with_response_and_predictor_data, samples_with_only_predictor_data])

In [None]:
## Restrict the dataframes to the features (predictors) and the target to be used in the model.
## Beware: depending on the preprocessing steps not all features might be used.
_modelling, _predicting = samples_with_response_and_predictor_data, samples_with_only_predictor_data
model_X, model_y = _modelling[featurelist], _modelling[target]
pred_X = _predicting[featurelist]

In [None]:
## Alternatively to above four cells read samples data from the prediction output of a previous model run
## This allows to replace the second model stage (originally geospatial interpolation),
## by a re-run of the first NCV pipeline (using the ~200 predicted points as input and only
## predictors which are available at the grid points of the geospatial raster for second prediction)
# droplist = ['S32','S05']
# output_from_1st_model = pd.read_csv('../data/exports/models/predictions/20230403_233901_Concentration_predictions.csv').set_index('Sample').drop(droplist)
# model_X = output_from_1st_model[featurelist]
# model_y = output_from_1st_model[target]

In [None]:
# Predefined test set optionally to be used instead of CV:
# test_set = ('S30', 'S03', 'S15', 'S06', 'S31', 'S25', 'S20')  # possible samples to use as a predefined test set: ordered by relevance
# test_set_size = 7  # Requires int, should be 0 < test_set_size <= len(test_set), for using the n first samples as test_set
# test_set = test_set[0:test_set_size]
# test_set = model_X.index.isin(test_set).astype(int) - 1  # returns array of len(model_X.index) with -1 for training samples and 0 for testing samples

In [None]:
scaler = None  # no scaler in use. The variable is only needed for the report header, as long as a scaler is switched manually outside the pipeline

In [None]:
# Scale data using a scaler

# scaler = StandardScaler()
# model_X.loc[:] = scaler.fit_transform(model_X)
# pred_X.loc[:] = scaler.transform(pred_X)

In [None]:
## Check some basic statistics of the target variable

# model_y.describe()
# model_y.hist()
# model_X.info()
# model_X.describe()

## Model building

### Custom preprocessing functions to be used in the model pipeline
Create a custom feature selector which exhaustively creates all combinations
of available features to be tested as individual feature sets.
It respects constraints on non-allowed combinations, which can be defined
by the user in Config.mutual_exclusive and Config.exclusive_keywords to save
computation time by not testing meaningless combinations.

In [None]:
# Allowed number of features per feature set:
#     int:              all possible combinations of this length will be created
#     tuple (min, max): all possible combinations of length min up to length max will be created
#     'all':            all possible combinations of all possible lengths will be created
num_feat = (2, 5)
feature_candidates_list = generate_feature_sets(
    model_X,
    Config.mutual_exclusive,
    Config.exclusive_keywords,
    num_feat=num_feat,
    n_jobs=1,
    save=True,
)

# Wrap the selection function into a transformer, so it can be used as a step of the model pipeline.
CustomFeatureSelector = FunctionTransformer(func=SelectFeatures)  # , feature_names_out='one-to-one')

### Creating the model pipeline

In [None]:
## Pipeline and parameter grid for model selection, see here for inspiration: https://towardsdatascience.com/getting-the-most-out-of-scikit-learn-pipelines-c2afc4410f1a

## Preprocessing: currently only the custom feature selector, applied to all columns of model_X.
## The other transformers are kept as deactivated options.
PreProcessor = ColumnTransformer(transformers=[
    ('selector', CustomFeatureSelector, model_X.columns),
    # ('imputer', SimpleImputer(), make_column_selector(dtype_include=np.number)),
    # ('scaler', StandardScaler(), make_column_selector(dtype_include=np.number)),
    # ('encoder', OneHotEncoder(), make_column_selector(dtype_include=object)),
])

## The DummyRegressor is only a placeholder: the parameter grids define a `regressor` entry which takes its place.
pipe = Pipeline([
    ('preprocessor', PreProcessor),
    ('regressor', DummyRegressor()),
])

## One kw_args dict per candidate feature set: `feature_set` is the position of the set to select,
## `feature_sets` is the complete list of candidates it refers to.
_selector_kw_args = [
    dict(feature_set=i, feature_sets=feature_candidates_list)
    for i, _ in enumerate(feature_candidates_list)
]

preprocessor_params = [
    {
        'preprocessor__selector': [CustomFeatureSelector],
        'preprocessor__selector__kw_args': _selector_kw_args,

        # 'preprocessor__scaler': [StandardScaler()],#MaxAbsScaler(), MinMaxScaler(), RobustScaler(), QuantileTransformer(), Normalizer()],
        #    'preprocessor__scaler__with_mean': [True],
        #    'preprocessor__scaler__with_std': [True],
    }
]

## Parameter grids of the candidate regressors: one dict per model type.
## Each dict holds the estimator instance under 'regressor' and lists of values to be tried for its hyper parameters.
## Commented entries are deactivated options kept for reference.
regressor_params = [
    ## Generalised linear model
    {
     'regressor': [TweedieRegressor(max_iter=100000)],  # , warm_start': True)],
         'regressor__power': [2],  # 'regressor__power': [0, 1, 1.2, 1.5, 1.9, 2, 3],
         'regressor__alpha': [0],  # regularisation strength (see note on 'alpha' at reg_lambda below): 0 = no penalty
         'regressor__link': ['log'],#, 'identity', 'auto'],
         # 'regressor__fit_intercept': [True, False],
    },

    ## XGBoost with linear booster
    {
    'regressor': [XGBRegressor(random_state=np.random.RandomState(0), verbosity = 1)],
        'regressor__booster': ['gblinear'],
        'regressor__objective': ['reg:gamma'],
        'regressor__n_estimators': [300],
        'regressor__reg_alpha': [0], #[0, 0.1, 1],  # L1-regularisation
        'regressor__reg_lambda': [0],  # L2-regularisation (OBS: it's called 'alpha' in sklearn GLM models)
    },

    ## XGBoost with tree booster
    {
    'regressor': [XGBRegressor(random_state=np.random.RandomState(0), verbosity = 0)],
        'regressor__booster': ['gbtree'],
        'regressor__objective': ['reg:gamma'],
        'regressor__n_estimators': [300],
        'regressor__reg_alpha': [0],  # L1-regularisation
        'regressor__reg_lambda': [0],  # L2-regularisation (OBS: it's called 'alpha' in sklearn GLM models)
        'regressor__tree_method': ['exact'],  # OBS: 'exact' doesn't work with max_depth=0
        'regressor__learning_rate': [0.1, 0.25], #alias for eta
        'regressor__max_depth': [2, 3],
        'regressor__min_child_weight': [5, 7],  # also a regularising parameter: higher values will stop splitting further, when children would have less than this minimum weight
        # 'regressor__min_split_loss': [0], # alias for gamma
        # 'regressor__grow_policy': ['depthwise', 'lossguide'],
        # 'regressor__subsample': [0.5, 1.0],
        # 'regressor__colsample_bytree': [1/3, 1.0],
    # #     'regressor__colsample_bylevel': [0.5, 1.0],
    # #     'regressor__colsample_bynode': [0.5, 1.0],
    },

    ## Random forest
    {
     'regressor': [RandomForestRegressor(random_state=np.random.RandomState(0))],
          'regressor__n_estimators': [10, 100, 300],
          'regressor__max_depth': [None, 2, 4],
          'regressor__max_features': [None, 1/3],
          # 'regressor__min_samples_split': [2, 10],
          # 'regressor__min_samples_leaf': [1, 3, 5],
          # 'regressor__bootstrap': [True, False],
          # 'regressor__oob_score': [True, False],
          # 'regressor__warm_start': [True, False],
    },
]

## Full grid: every regressor grid combined with every preprocessor grid
## (on equal keys the regressor entry wins).
params = [dict(pre, **reg) for reg in regressor_params for pre in preprocessor_params]

# Replace the full feature set lists with:
# -> (2,3)-combinations feature sets for linear models,
# -> (5,5)-combinations feature sets for tree models
Config.lin_combis, Config.tree_combis = [2, 3], [5]
params = fix_feature_combis(
    params,
    feature_candidates_list,
    lin_combis=Config.lin_combis,
    tree_combis=Config.tree_combis,
)

### Set options for NCV
scoring, aggregation, repetitions, etc.

In [None]:
## Number of times the whole NCV is repeated, each time using a different random_state to make the train/test-splits.
## Enter 1 for single shuffled NCV, 0 (or False) for single non-shuffled NCV.
repetitions = 100
## Repetitions of inner KFold: int or fraction of `repetitions`, e.g. repetitions/10 (will be cast to int and fixed to 1 if smaller)
inner_reps = 10
## Tuple of n_folds (outer_folds, inner_folds), or int if the number of folds for inner and outer CVs should be the same.
folds = (3, 3)
## Number of strata for outer StratifiedKFold splitting. Useful values range between n_folds and n_samples//n_folds,
## e.g. len(model_y)//folds[0]. Use False in either or both n_strata values for unstratified splitting.
## E.g. `strata=(len(model_y)//n_folds[0], False)` --> only stratify outer folds not inner (as done by https://doi.org/10.1186/1758-2946-6-10).
outer_strata = 3  # [0, 1000, 10000, 100000]
## Like outer strata but for inner folds, e.g. (len(model_y) - outer_strata)//folds[1].
## OBS: n_samples in inner CV is already reduced by outer_strata, so inner_strata is calculated from the remaining samples.
inner_strata = 3  # [0, 1000, 10000, 100000]
stratification_mode = 'quantile'  # 'quantile', 'interval', or None
setup = make_setup_dict(
    repeats=(repetitions, inner_reps),
    folds=folds,
    strata=(outer_strata, inner_strata),
    stratification_mode=stratification_mode,
    n_jobs=(1, -1),
    verbosity=(0, 1),
)

## Transpose Config.scorers into one dict per position of its values and pick the second one:
## dict 'scorers' will contain the string representation of negated scorers used by GridSearchCV
_scorer_variants = [dict(zip(Config.scorers, t)) for t in zip(*Config.scorers.values())]
scorers = _scorer_variants[1]

Config.refit_scorer = 'R2'  # must be one of the keys in Config.scorers
Config.select_best = 'median'  # 'mean', 'median', 'iqm'
## 'competitive' (run all activated models in grid against each other) or
## 'comparative' (run all activated models in separate sequential repeated NCV runs)
Config.ncv_mode = 'comparative'

### Training the model

In [None]:
load_savestamp = '20231004_153340'  # '20230831_083952'  # enter savestamp of model run here, write None to train a new model

if load_savestamp is None:
    ## Train anew
    if repetitions < 1:
        ## Run single NCV without shuffle split
        raise NotImplementedError('Running single NCV without shuffle split corresponds to the old version before repeated NCV was included and was not re-implemented after rewriting codebase for the new rep_ncv function.')
    _run_rep_ncv = {
        'competitive': compete_rep_ncv,
        'comparative': compara_rep_ncv,
    }.get(Config.ncv_mode)
    if _run_rep_ncv is None:
        raise ValueError(f'No valid NCV mode: {Config.ncv_mode}')
    NCV, setup, starttime, time_needed = _run_rep_ncv(pipe, params, model_X, model_y, scorers, setup)

else:
    ## Load serialised results instead of training anew
    ## NOTE: unpickling executes code contained in the files, so only load savestamps exported by this notebook.
    sp = Path(f'../data/exports/models/serialised/{load_savestamp}/')
    NCV = pd.read_pickle(sp/f'NCV_{load_savestamp}.pkl')
    with open(sp/f'starttime_{load_savestamp}.pkl', 'rb') as f:
        starttime = pickle.load(f)
    with open(sp/f'time_needed_{load_savestamp}.pkl', 'rb') as f:
        time_needed = pickle.load(f)
    with open(sp/f'setup_{load_savestamp}.pkl', 'rb') as f:
        setup = pickle.load(f)
    ## A serialised Config is optional: without it, the Config of the current session stays in place
    try:
        with open(sp/f'Config_{load_savestamp}.pkl', 'rb') as f:
            Config = pickle.load(f)
    except FileNotFoundError:
        print('No Config file found. Adjust Config manually according to logs (e.g. Config.select_best, Config.refit_scorer, Config.ncv_mode)')

## CV results postprocessing

In [None]:
## Take an unaltered backup of the raw NCV results before they are post-processed below.
## This copy (not the processed NCV) is what the save cell pickles.
NCV_bak = NCV.copy()

In [None]:
## Post-processing of the raw NCV results. The steps build on each other, so their order matters.

## Add ranking to outer fold test scores, all together, not regarding the repetitions
NCV = inter_rank(NCV, Config.refit_scorer)
## Get additional scorings, get feature names, etc. and arrange into a dataframe
NCV = process_results(NCV, model_X, model_y, refitOnAll=True)
## When run in comparative mode: append results that would have come out of an equivalent run in competitive mode
NCV = rensembling(NCV)  # OBS: the intra- and inter rep rankings are not created again for the new block of the competitive run
## Reversing the higher-is-better sklearn negated scores
NCV = unnegate(NCV, scorers)
## Sort NCV by its index levels 0, 1 and 3 only (in place)
NCV.sort_index(level=[0, 1, 3], sort_remaining=False, inplace=True)
## Check for duplicates in test sets
dup_testsets, testset_length_freq = check_testset_duplicates(NCV)  # TODO: add this output to the printed header
## Generate report header; the savestamp identifying this model run is derived from its start time
savestamp = starttime.strftime("%Y%m%d_%H%M%S")
header = make_header(NCV, setup, savestamp, time_needed, droplist, model_X, num_feat, feature_candidates_list, scaler, regressor_params, dup_testsets)
## Display results (without the timing and estimator columns, where present); NCV itself is not altered by this
print(header)
pd.set_option('display.max_colwidth', None, 'display.max_columns', None)
NCV.drop(['fit_time', 'score_time', 'estimator', 'estimator_refit_on_all'], axis=1, errors='ignore')

In [None]:
## Simulate shorter runs (fewer repetitions) and repeat score aggregation:
## the entry for `reps` aggregates the NCV repetitions 0 ... reps-1.
_scored_by_reps = {
    reps: aggregation(NCV.query(f'NCV_repetition <= {reps-1}'), setup, r=reps)
    for reps in range(1, setup['repeats'][0]+1)
}
## Concatenate all aggregations in one go (instead of growing a dataframe inside the loop);
## the dict keys become the outermost index level 'NCV_repetitions'.
scored_multi = (
    pd.concat(_scored_by_reps, names=['NCV_repetitions'])
    if _scored_by_reps
    else pd.DataFrame()  # no repetitions at all: keep the empty frame the loop version would have produced
)
scored_long = transform_score_df(scored_multi)

## Also calculate the scores aggregated over folds within each repetition
separate_reps_scored = aggregate_folds_only(NCV)

In [None]:
## Plot score evolution: per-repetition scores together with the scores aggregated over growing numbers of repetitions
_scores_to_plot = pd.concat([separate_reps_scored, scored_long]).reset_index()
score_chart = repNCV_score_plots(_scores_to_plot)
score_chart

The plots above show how the aggregated outer test scores evolve while more and more repetitions are accumulated.
This means that, e.g., at value 5 on the x-axis, the corresponding y-values are aggregations of the repetitions 0 to 4.

- `fold_aggregator`: aggregating (as median, iqm or mean, depending on color) the outer test scores of all outer folds within each repetition
- `rep_aggregator`: aggregating (as median, iqm or mean, depending on stroke) pre-aggregated scores across all repetitions (up to the number of repetitions on the x-axis)

Where `rep_aggregator` is 'none', outer test scores of all outer folds from all repetitions are aggregated together (up to the number of repetitions on the x-axis).
Note that the aggregation type 'mean(mean)' yields the identical result as 'none(mean)' (i.e. per-repetition grouped aggregation is equal to the ungrouped aggregation), because the mean of means of equally sized groups is equal to the mean of all.
Areas represent the span of one standard deviation among the per-repetition aggregated values.

**The assembling of the ensemble should resemble the assembling of the NCV generalisation score.** So, here you should decide which scorer and aggregation scheme seem reasonable to use.
The chosen values will control which members will be elected into the prediction ensemble.

In [None]:
## Control the way the ensemble is composed
## `reduced_ensemble` allows for two different ways to put the ensemble together:
##    reduced_ensemble = True:
##        - selects the one model in each repetition, which comes closest (with its outer test score) to the aggregated score (using `fold_aggregator`) of all models in this repetition
##        - this will yield an ensemble of `n_outer_rep` members
##        - when predicting, the `n_outer_rep` predictions are aggregated according to `rep_aggregator`
##        - a reduced ensemble will result in lower cpu time needed for prediction
##        - using a reduced ensemble indicates a higher level of trust that the target distribution in the dataset is representative for the unknown true distribution
##    reduced_ensemble = False:
##        - uses all "best_estimator_" models found
##        - this will yield an ensemble of `n_outer_rep` * `n_outer_folds` members
##        - when predicting, the `n_outer_folds` predictions in each outer repetition are aggregated first according to `fold_aggregator`
##        - the `n_outer_rep` aggregates of the predictions are then aggregated according to `rep_aggregator`
##        - using a full ensemble indicates a lower level of trust that the target distribution in the dataset is representative for the unknown true distribution
reduced_ensemble = False  # boolean
rep_aggregator = 'mean'  # Can be 'mean' or 'iqm' or 'median'. Will be used to aggregate the predictions
fold_aggregator = 'median'  # Can be 'median', 'mean' or 'iqm'. Will be used to select models for the ensemble if reduced_ensemble=True, otherwise to aggregate the predictions within repetitions
election_scorer = 'R2'  # Can be any of the scorers in the plot above. Will be used together with fold_aggregator to select one model per repetition if reduced_ensemble=True, neglected otherwise.

## Predict
When predicting, each member will cast its individual vote. Then these votes will be aggregated into a single final predicted value, using the `rep_aggregator` if not None, and the `fold_aggregator` otherwise.
This procedure ensures that the ways of creating the final generalisation score and the final predictions are as identical as possible.

In [None]:
## Assembling the ensemble...

## select model run to use for prediction
sel = 'Competitive'  # substring to be searched for in the `run_with` index level (needs to distinguish the wanted run from the rest)
## Only a comparative run contains several runs to choose from: keep the rows whose index labels match `sel` (case-insensitive)
selRun = NCV.filter(regex=re.compile(sel, re.IGNORECASE), axis=0) if Config.ncv_mode == 'comparative' else NCV

if reduced_ensemble:  # if `reduced_ensemble` is True, use fold_aggregator and election_scorer to select one model per repetition
    ## Per repetition: absolute distance of each test score to the aggregate (by fold_aggregator) of the scores of that repetition
    diff = selRun.groupby('NCV_repetition', as_index=False)[f'test_{election_scorer}'].apply(lambda x: np.abs(x - Config.aggregators[fold_aggregator](x))).droplevel(None)
    ## Per repetition: index label of the row with the smallest distance
    idx = diff.groupby('NCV_repetition').idxmin()
else:
    idx = selRun.index  # full ensemble: keep every row of the selected run
## Table of ensemble members: test set samples, refitted estimator, features, regressor name and all `regressor__*` columns
esti = pd.concat([
    selRun.loc[idx][['outer_test_set_samples', 'estimator_refit_on_all', 'features', 'regressor']],
    selRun.loc[idx].filter(regex='regressor__')
], axis=1)

## Where a booster is given, append its name to the regressor name ('<regressor> <booster>'),
## then remove the booster column, as its information is now part of the regressor name.
if 'regressor__booster' in esti.columns:
    _has_booster = esti['regressor__booster'].notna()
    ## A single .loc assignment writes to `esti` itself. The former chained form (`esti.regressor[mask] = ...`)
    ## assigns to an intermediate Series, which pandas does not guarantee to write back to the dataframe.
    esti.loc[_has_booster, 'regressor'] = (
        esti.loc[_has_booster, 'regressor'] + ' ' + esti.loc[_has_booster, 'regressor__booster']
    )
    esti = esti.drop(columns='regressor__booster')

In [None]:
## Make pie charts to see what the ensemble is composed of.
## The chart is shown as output of this cell and kept in `pie_chart`, as the save cell writes it to file.
pie_chart, pie_chart_df = ncv_pie(esti, show_top=5)
pie_chart

In [None]:
## Run the predictions...
## (returning the individual predicted values of all members for all samples)
##   - first for the original MP samples (model_X)
##   - then for the other samples (pred_X)
pred_y_seenSamples_df, pred_y_unseenSamples_df = (
    ensemble_predict(esti, X) for X in (model_X, pred_X)
)

In [None]:
## Plot a histogram of how many members predicted what values (split into stacked colors for different model types)
## First view shows all stations combined: change to another Sample in the dropdown below, to see single sample histograms
ensemble_pred_histograms(esti, pred_y_seenSamples_df, model_y)

In [None]:
## Comparing different ways to aggregate the votes of the ensemble members.
## Currently only implemented in a way which makes sense when the ensemble was already reduced to one member per repetition.
_agg_performance = None  # stays None (and nothing is shown) for a full ensemble
if reduced_ensemble:
    pred_agg_df = pd.DataFrame(  # makes a df with samples as rows and aggregators as columns

        np.array([agg(s)
                  for _, s in pred_y_seenSamples_df.items()
                  for agg in Config.aggregators.values()
                  ]
                ).reshape(
                    pred_y_seenSamples_df.shape[1],
                    len(Config.aggregators)),
       index=pred_y_seenSamples_df.columns,
       columns=Config.aggregators.keys()
      ).join(model_y)

    ## Compare their performances in a table: one column per aggregator, identical rows are listed only once
    _agg_performance = pd.concat([
        performance(pred_agg_df[target],
                    pred_agg_df[agg_name]
                   ).drop(columns='Info'
                   ).rename(columns={'Value': agg_name})
        for agg_name in Config.aggregators.keys()
    ], axis=1).T.drop_duplicates().T

## The notebook only renders the last *top-level* expression of a cell: an expression nested inside the
## `if` block is evaluated but never shown. Hence the table is displayed from here.
_agg_performance

In [None]:
## Make a plot of the percentage error of the aggregations.
## Currently only implemented in a way which makes sense when the ensemble was already reduced to one member per repetition.
## NOTE(review): assumes that per_err_agg_bar returns the chart (like the other plot helpers used in this notebook) — confirm.
_per_err_chart = per_err_agg_bar(pred_agg_df) if reduced_ensemble else None
## Shown from the top level of the cell, because a return value inside an `if` block is never rendered by the notebook.
_per_err_chart

In [None]:
## Aggregate the ensemble votes into the final prediction.
# rep_aggregator = ...  # only activate to try out prediction aggregations deviating from how the score aggregation was made

## Aggregators handed over to aggregate_predictions: (rep, fold) for a reduced ensemble, (rep,) for a full one.
## NOTE(review): the comments of the ensemble settings describe it the other way round (fold aggregation only
## being needed for the full ensemble) — confirm against aggregate_predictions that this is intended.
_vote_aggregators = (
    (Config.aggregators[rep_aggregator], Config.aggregators[fold_aggregator])
    if reduced_ensemble
    else (Config.aggregators[rep_aggregator],)
)
pred_y_seenSamples, pred_y_unseenSamples = [
    aggregate_predictions(votes, _vote_aggregators, target)
    for votes in (pred_y_seenSamples_df, pred_y_unseenSamples_df)
]

In [None]:
## Combine the aggregated predictions with the sample data:
## df_o for the samples used for modelling, df_p for the prediction-only samples, df_a for all of them.
df_o = augment_predictions(pred_y_seenSamples, samples_with_response_and_predictor_data, target=target, kind='observed')
df_p = augment_predictions(pred_y_unseenSamples, samples_with_only_predictor_data, kind='predicted')
df_a = pd.concat([df_o, df_p])

## For samples previously dropped (outliers with existing observed target value), replace the predicted value with the original observed one
df_a.loc[df_a.Type=='observed', 'outlier_excl'] = False
for o in droplist:
    ## the outliers were moved to the prediction-only data, so their observed value is taken from there
    df_a.loc[o, f'{target}_observed'] = samples_with_only_predictor_data.loc[o, target]
    df_a.loc[o, 'Type'] = 'observed'
    df_a.loc[o, 'outlier_excl'] = True

## Combined target column at first position: the observed value where available, the predicted value otherwise
df_a.insert(0,target, df_a[f'{target}_observed'].combine_first(df_a[f'{target}_predicted']))

## OBS: the column inserts make this cell non-repeatable: re-running it on the same df_a fails on the already existing columns
if target == 'Concentration':  # ...add MassConcentration columns also
    df_a.insert(3, 'MassConcentration_observed', all_samples.MassConcentration)
    for k, v in Config.massConc_from_numConc.items():  # iterating through dict, currently only one key: 'MassConcentration_predicted'
        df_a.insert(3, k, prepare_data.patsy_transform(v, df_a))  # OBS: MassConc is in µg/kg. Divide by 1e9 to get MassConc in kg MP per kg dry sediment!
    ## NOTE: `k` still holds the last key of the loop above; with more than one key in the dict only the last one would be used here
    df_a.insert(3, 'MassConcentration', df_a[k])  # alike 'Concentration': make combined column for 'MassConcentration'
    df_a.loc[df_a.Type=='observed', 'MassConcentration'] = df_a.loc[df_a.Type=='observed', 'MassConcentration_observed']

In [None]:
## Quick check predicted vs. observed:
## report the samples with the highest and the lowest value of the combined target column.
def _rounded_or_na(value):
    """Return `value` rounded to the nearest integer, or the text 'n/a' if it is missing.

    Plain `round()` raises a ValueError on NaN, which occurs here whenever the reported sample
    has no observed (or no predicted) value.
    """
    return 'n/a' if pd.isna(value) else round(value)

id_max, id_min = df_a[target].idxmax(), df_a[target].idxmin()
for _extreme, _sample in (('Maximum', id_max), ('Minimum', id_min)):
    print(f'{_extreme} predicted value: ', f'Sample {_sample}: ', _rounded_or_na(df_a.loc[_sample, f"{target}_predicted"]), f' (observed: {_rounded_or_na(df_a.loc[_sample, f"{target}_observed"])}).')

## Scatter chart of predicted against observed values, grouped by `regio_sep`.
## Only complete rows are used, and the samples in droplist are left out.
_pred_vs_obs_data = (
    df_a.join(sdd_iow.set_index('Sample').regio_sep)
    .dropna()
    .drop(droplist)
)
pred_vs_obs = scatter_chart(
    _pred_vs_obs_data,
    f'{target}_observed', f'{target}_predicted', 'regio_sep',
    xscale='log', yscale='log',
    # xtransform = 'log', ytransform= 'log',
    # equal_axes= True,
    identity=True,
    # reg= 'linear',
    labels='Sample',
    width=300, height=300,
)[0]  # only the first item of the returned value is kept
pred_vs_obs

## Single model (for comparison)

In [None]:
## Alternative to ensembling:
## The classical approach to NCV is, that it is followed by a non-nested GS-CV on the same param grid.
## The `best_estimator_` of this can be directly used for prediction. Its score is discarded as it is optimistically biased.
singleGSCV = GridSearchCV(
        pipe,
        params,
        scoring=scorers,
        refit=best_scored,
        cv=setup['cv_scheme'][1],  # second entry of the CV schemes in the setup
        verbose=1,
        n_jobs=-1,
        ).fit(model_X, model_y)

## Keep an unaltered backup of the *fitted* search before `cv_results_` is handed to append_agg_cv_scores.
## sklearn's `clone` is not suitable for this: it returns a new, unfitted estimator with the same parameters only,
## i.e. without `cv_results_` and `best_estimator_`. A deep copy preserves the fitted state.
from copy import deepcopy as _deepcopy  # underscore alias keeps the name out of the workspace dump of the save cell
singleGSCV_bak = _deepcopy(singleGSCV)
## Return values are not used, so the aggregated scores are presumably added to `cv_results_` in place
append_agg_cv_scores(singleGSCV.cv_results_, 'median')
append_agg_cv_scores(singleGSCV.cv_results_, 'iqm')

In [None]:
## CV results of the single grid search as dataframe, with the sklearn negated scores reversed
singleGSCV_df = unnegate(pd.DataFrame(singleGSCV.cv_results_), scorers)

sb, rf = Config.select_best, Config.refit_scorer
bi = singleGSCV_df[f'{sb}_test_{rf}'].idxmax()  # row with the maximum of the selection score
bi_ = singleGSCV.best_index_  # row which the grid search itself regards as best

## Model class and hyper parameters of the best estimator
_best_params = singleGSCV.best_estimator_.get_params()
best_model_class = type(_best_params['regressor']).__name__
_hyperparam_cols = [c for c in singleGSCV_df.columns if 'regressor__' in c]
best_model_hyperparams = singleGSCV_df.loc[bi_, _hyperparam_cols].dropna()#.reset_index()

## Feature set used by the best estimator
best_feat_sets_dict = _best_params['preprocessor__selector__kw_args']
best_feat_set_idx = best_feat_sets_dict['feature_set']
allowed_feat_sets = best_feat_sets_dict['feature_sets']
best_feat_set = allowed_feat_sets[best_feat_set_idx]

## Scores of the best row: aggregators as rows, scorers as columns
overoptimistic_scores = pd.DataFrame(
    {
        s: [singleGSCV_df.loc[bi_, f'{a}_test_{s}'] for a in Config.aggregators.keys()]
        for s in Config.scorers.keys()
    },
    index=Config.aggregators.keys(),
)

## Predictions of the best estimator: modelling samples first, then the prediction-only samples
singleModel_predictions = pd.concat([
    pd.Series(singleGSCV.best_estimator_.predict(X), index=X.index)
    for X in (model_X, pred_X)
]).rename(f'{target}_predictedBySingleModel')

## Print a report on the best single model: model class, hyper parameters, feature set and scores.
## The selection score is reported twice for cross-checking: at the row where it is maximal (`bi`)
## and at the row the grid search marks as best (`bi_`).
print(f'''

--------------------------------------------------------------------------------------------------------------------------------------
                      Best single model configuration is:
--------------------------------------------------------------------------------------------------------------------------------------

Model class:      {best_model_class}
Hyper parameters: { {k.split('__')[1]: v for k, v in best_model_hyperparams.to_dict().items()} }

Feature set (index {best_feat_set_idx} of {len(allowed_feat_sets)}): {best_feat_set}

    max       {sb}_test_{rf} (found at row index {bi} with rank {singleGSCV_df.loc[bi, f'rank_by_{sb}_test_{rf}']}): {singleGSCV_df.loc[bi, f'{sb}_test_{rf}']}
"best_index_" {sb}_test_{rf} (found at row index {bi_} with rank {singleGSCV_df.loc[bi_, f'rank_by_{sb}_test_{rf}']}): {singleGSCV_df.loc[bi_, f'{sb}_test_{rf}']}


Probably overoptimistic score of single best model (aggregated over all folds of the non-nested CV):

{overoptimistic_scores}
--------------------------------------------------------------------------------------------------------------------------------------
      ''')

## Finally show the best estimator itself as output of the cell
print('    best_estimator_:')
singleGSCV.best_estimator_

In [None]:
## Include the single model predictions into the predictions df (as new column at position 2).
## OBS: re-running this cell on the same df_a fails, because the column exists already.
df_a.insert(2, singleModel_predictions.name, singleModel_predictions)

In [None]:
## Compare the predictions of the single model with the predictions of the ensemble.
## Only complete rows are used, and the samples in droplist are left out.
_psm_vs_pem_data = (
    df_a.join(sdd_iow.set_index('Sample').regio_sep)
    .dropna()
    .drop(droplist)
)
PSM_vs_PEM = scatter_chart(
    _psm_vs_pem_data,
    f'{target}_predicted', f'{target}_predictedBySingleModel', 'regio_sep',
    # xscale='log', yscale='log',
    # xtransform = 'log', ytransform= 'log',
    equal_axes=True,
    identity=True,
    # reg= 'linear',
    labels='Sample',
    width=600, height=600,
)[0]  # only the first item of the returned value is kept
PSM_vs_PEM

## Save

In [None]:
### Only save if the results were not loaded from an already serialised model run:
if load_savestamp is None:
    fp = '../data/exports/models/'
    sp = Path(fp)/'serialised'/savestamp

    ## Make sure every output directory exists before anything is written,
    ## so that the export does not fail half way on a missing folder
    for _out_dir in (Path(fp)/'predictions', Path('../data/exports/plots'), sp):
        _out_dir.mkdir(parents=True, exist_ok=True)

    ## Save model settings and results report (appended to the results of former runs)
    fn = fp + 'model_NCV_result.csv'
    with open(fn, mode='a', encoding='utf-8') as f:  # append mode creates the file, if it does not exist yet
        f.write(header)
    NCV.drop(['fit_time', 'score_time', 'estimator'], axis=1).to_csv(fn, mode='a', sep=';')

    ## Save figures
    score_chart.save(f'../data/exports/plots/repNCV_score_evolution_{savestamp}.html')
    pie_chart.save(f'../data/exports/plots/repNCV_ensemble_composition_{savestamp}.html')
    pred_vs_obs.save(f'../data/exports/plots/repNCV_pred_vs_obs_{savestamp}.html')

    ## Save predictions
    df_a.to_csv(f'{fp}/predictions/{savestamp}_{target}_predictions.csv')

    ## Serialise results (the unaltered backup of the NCV results, not the post-processed dataframe)
    NCV_bak.to_pickle(sp/f'NCV_{savestamp}.pkl')
    with open(sp/f'starttime_{savestamp}.pkl', 'wb') as f:
        pickle.dump(starttime, f)
    with open(sp/f'time_needed_{savestamp}.pkl', 'wb') as f:
        pickle.dump(time_needed, f)
    with open(sp/f'singleModelBestEstimator_{savestamp}.pkl', 'wb') as f:
        pickle.dump(singleGSCV.best_estimator_, f)
    with open(sp/f'setup_{savestamp}.pkl', 'wb') as f:
        pickle.dump(setup, f)
    with open(sp/f'Config_{savestamp}.pkl', 'wb') as f:
        pickle.dump(Config, f)

    ## Serialise the workspace: all variables which did not exist after kernel initialisation and imports
    ## (see _lokeys) and whose names do not start with an underscore.
    with open(sp/f'locals_{savestamp}.pkl', 'wb') as f:
        ## the open file handle `f` is excluded as well; _lokeys itself is left untouched,
        ## so that re-running this cell does not grow the list
        _excluded = set(_lokeys) | {'f'}
        locdi = {
            k: v
            for k, v in list(locals().items())
            if k not in _excluded and not k.startswith('_')
        }
        pickle.dump(locdi, f)  # pickling might need to be done with dill instead...

In [None]:
## Keep the kernel running for 100 h after the model run, when no client is connected
time.sleep(100 * 60 * 60)  # 100 h in seconds