In [None]:
%reload_ext autoreload
%autoreload 2

import warnings
import os
from pathlib import Path
from datetime import datetime
import pickle
import joblib
import numpy as np
import pandas as pd
pd.options.plotting.backend = 'holoviews'
from tqdm import tqdm

import sklearn
# Report the installed scikit-learn version (some options used in this notebook are version dependent).
print(f'sklearn version: {sklearn.__version__}')
from sklearn.utils import parallel_backend
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.model_selection import GridSearchCV, LeavePOut, LeaveOneOut, cross_validate, KFold, PredefinedSplit

from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, RobustScaler, MaxAbsScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer

from sklearn.dummy import DummyRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    AdaBoostRegressor,
    GradientBoostingRegressor,
)
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import TweedieRegressor

from sklearn.metrics import r2_score, mean_absolute_percentage_error, median_absolute_error

from sklearn import clone, set_config
# set_config(transform_output='pandas')  # only works for sklearn >= 1.2

# When running on the phy-server, the project's local modules are not found unless their
# directory is added to the module search path. This stays a best-effort step: on any other
# machine (directory missing) nothing is changed, and a failing chdir only emits a warning.
import sys

_SERVER_ANALYSIS_DIR = "/silod7/lenz/MPSchleiSediments/analysis/"
if os.path.isdir(_SERVER_ANALYSIS_DIR):
    if _SERVER_ANALYSIS_DIR not in sys.path:  # do not pile up duplicates when the cell is re-run
        sys.path.append(_SERVER_ANALYSIS_DIR)
    try:
        # relative data paths used in this notebook (e.g. '../data/...') are resolved from here
        os.chdir(_SERVER_ANALYSIS_DIR)
    except OSError as err:
        warnings.warn(f'Could not change working directory to {_SERVER_ANALYSIS_DIR}: {err}')

import prepare_data
from components import PCOA
#from helpers import PipelineHelper, SMWrapper
from settings import Config, shortnames, target
from plots import scatter_chart
from cv import generate_feature_sets, best_scored, get_median_cv_scores, SelectFeatures

## Data preparation

In [None]:
%%capture
# Cell magic to suppress output. Comment it out to see the output of this cell.

# Upstream steps (DB extract and blank procedure) are already done. Their result, the MP
# particle domain data, is imported from csv here.
mp_pdd = prepare_data.get_pdd()

# Sediment data: frequencies per size bin from the master sizer export, for IOW and CAU samples.
grainsize_iow, grainsize_cau = prepare_data.get_grainsizes()[:2]
# Only the first element returned by PCOA is kept.
# NOTE(review): presumably the per-sample scores on 2 principal coordinates - confirm in components.PCOA
scor_iow = PCOA(grainsize_iow, 2)[0]
scor_cau = PCOA(grainsize_cau, 2)[0]

# Sample domain data (IOW): aggregate the particle data per sample, merge the additional
# sample data and the PCoA scores, then switch to short sample names and sort by them.
sdd_iow = (
    prepare_data.additional_sdd_merging(prepare_data.aggregate_SDD(mp_pdd), how='outer')
    .merge(scor_iow, right_index=True, left_on='Sample', how='outer')
    .replace({'Sample': shortnames})
    .sort_values(by='Sample')
)

# Sample domain data (CAU): sampling log joined with the GRADISTAT export, plus the PCoA scores.
sdd_cau = (
    pd.read_csv('../data/Metadata_CAU_sampling_log.csv', index_col=0)
    .join(pd.read_csv('../data/GRADISTAT_CAU_vol_log-cau_closed.csv', index_col=0), how='outer')
    .merge(scor_cau, right_index=True, left_on='Sample', how='outer')
    .reset_index()
)

In [None]:
## Additional variable generation (e.g. predictor derivatives)
 
# sdd_iow['Dist_WWTP_revsq'] = ((1/sdd_iow['Dist_WWTP'])**3)*10000000000  # cube of the reciprocal distance, rescaled
# sdd_iow['Dist_WWTP_revsq'] = (((sdd_iow['Dist_WWTP'].max()-sdd_iow['Dist_WWTP'])+1)**3)/100000000000  # cube of the reversed distance (max - distance + 1), rescaled
# sdd_iow['Dist_WWTP_revsq'] = ((sdd_iow['Dist_WWTP'].max()/sdd_iow['Dist_WWTP'])**3)/100  # cube of the ratio max distance / distance, rescaled
# sdd_iow['Dist_WWTP_revsq'] = ((sdd_iow['Dist_WWTP'].max()/sdd_iow['Dist_WWTP'])**2)  # square of the ratio max distance / distance

# sdd_iow

In [None]:
## Split the samples: rows with a Concentration value are used for building the model,
## rows without one (plus all CAU samples) are the ones to predict for.

model_data = sdd_iow[sdd_iow.Concentration.notna()].set_index('Sample')
pred_data = pd.concat(
    [sdd_iow[sdd_iow.Concentration.isna()], sdd_cau.drop(columns='Date')]
).set_index('Sample')

In [None]:
## Potential outlier exclusion
# Index labels of model_data (i.e. sample names) to leave out of model building; empty = keep all.
droplist = []
model_data = model_data.drop(index=droplist)

In [None]:
## List of features (predictors) to be used in the model. Beware: depending on the preprocessing steps not all features might be used.
# Every active entry must be a column of both model_data and pred_data, because the list is
# used to select columns from both frames at the end of this cell.
# Entries that are commented out are candidates which are currently switched off.

featurelist = [
    'Depth',
    # 'LON', 'LAT',
    'Dist_Land',
    # 'Dist_Marina',
    'Dist_WWTP',
    # 'WWTP_influence_as_tracer_mean_dist',
    # 'WWTP_influence_as_cumulated_residence',
    # 'WWTP_influence_as_mean_time_travelled',
    #'WWTP_influence_as_tracer_mean_dist__sed_18µm_allseasons_444',
    #'WWTP_influence_as_endpoints_mean_dist__sed_18µm_allseasons_444',
    #'WWTP_influence_as_cumulated_residence__sed_18µm_allseasons_444',
    #'WWTP_influence_as_mean_time_travelled__sed_18µm_allseasons_444',
    #'WWTP_influence_as_tracer_mean_dist__nosed_18µm_spring_444',
    #'WWTP_influence_as_endpoints_mean_dist__nosed_18µm_spring_444',
    'WWTP_influence_as_cumulated_residence__nosed_18µm_spring_444',
    'WWTP_influence_as_mean_time_travelled__nosed_18µm_spring_444',
    #'WWTP_influence_as_tracer_mean_dist__nosed_allsizes_spring_444',
    #'WWTP_influence_as_endpoints_mean_dist__nosed_allsizes_spring_444',
    'WWTP_influence_as_cumulated_residence__nosed_allsizes_spring_444',
    'WWTP_influence_as_mean_time_travelled__nosed_allsizes_spring_444',
    #'WWTP_influence_as_tracer_mean_dist__sed_18µm_autumn_222',
    #'WWTP_influence_as_endpoints_mean_dist__sed_18µm_autumn_222',
    'WWTP_influence_as_cumulated_residence__sed_18µm_autumn_222',
    'WWTP_influence_as_mean_time_travelled__sed_18µm_autumn_222',
    #'WWTP_influence_as_tracer_mean_dist__sed_18µm_autumn_444',
    #'WWTP_influence_as_endpoints_mean_dist__sed_18µm_autumn_444',
    #'WWTP_influence_as_cumulated_residence__sed_18µm_autumn_444',
    #'WWTP_influence_as_mean_time_travelled__sed_18µm_autumn_444',
    #'WWTP_influence_as_tracer_mean_dist__nosed_18µm_autumn_222',
    #'WWTP_influence_as_endpoints_mean_dist__nosed_18µm_autumn_222',
    'WWTP_influence_as_cumulated_residence__nosed_18µm_autumn_222',
    'WWTP_influence_as_mean_time_travelled__nosed_18µm_autumn_222',
    #'WWTP_influence_as_tracer_mean_dist__nosed_allsizes_autumn_222',
    #'WWTP_influence_as_endpoints_mean_dist__nosed_allsizes_autumn_222',
    'WWTP_influence_as_cumulated_residence__nosed_allsizes_autumn_222',
    'WWTP_influence_as_mean_time_travelled__nosed_allsizes_autumn_222',
    #'WWTP_influence_as_tracer_mean_dist__nosed_allsizes_allseasons_444',            # *
    #'WWTP_influence_as_endpoints_mean_dist__nosed_allsizes_allseasons_444',
    'WWTP_influence_as_cumulated_residence__nosed_allsizes_allseasons_444',
    'WWTP_influence_as_mean_time_travelled__nosed_allsizes_allseasons_444',         # *
    #'WWTP_influence_as_tracer_mean_dist__sed_allsizes_allseasons_444',              # *
    #'WWTP_influence_as_endpoints_mean_dist__sed_allsizes_allseasons_444',
    #'WWTP_influence_as_cumulated_residence__sed_allsizes_allseasons_444',
    #'WWTP_influence_as_mean_time_travelled__sed_allsizes_allseasons_444',           # *
    #'WWTP_influence_as_tracer_mean_dist__sed_18µm_allseasons_222',
    #'WWTP_influence_as_endpoints_mean_dist__sed_18µm_allseasons_222',
    #'WWTP_influence_as_cumulated_residence__sed_18µm_allseasons_222',
    #'WWTP_influence_as_mean_time_travelled__sed_18µm_allseasons_222',
    #'WWTP_influence_as_tracer_mean_dist__nosed_18µm_allseasons_222',
    #'WWTP_influence_as_endpoints_mean_dist__nosed_18µm_allseasons_222',
    'WWTP_influence_as_cumulated_residence__nosed_18µm_allseasons_222',
    'WWTP_influence_as_mean_time_travelled__nosed_18µm_allseasons_222',
    #'WWTP_influence_as_tracer_mean_dist__nosed_18µm_spring_222',
    #'WWTP_influence_as_endpoints_mean_dist__nosed_18µm_spring_222',
    'WWTP_influence_as_cumulated_residence__nosed_18µm_spring_222',
    'WWTP_influence_as_mean_time_travelled__nosed_18µm_spring_222',
    #'WWTP_influence_as_tracer_mean_dist__nosed_allsizes_allseasons_222',
    #'WWTP_influence_as_endpoints_mean_dist__nosed_allsizes_allseasons_222',
    'WWTP_influence_as_cumulated_residence__nosed_allsizes_allseasons_222',
    'WWTP_influence_as_mean_time_travelled__nosed_allsizes_allseasons_222',
    #'WWTP_influence_as_tracer_mean_dist__nosed_allsizes_autumn_444',
    #'WWTP_influence_as_endpoints_mean_dist__nosed_allsizes_autumn_444',
    'WWTP_influence_as_cumulated_residence__nosed_allsizes_autumn_444',
    'WWTP_influence_as_mean_time_travelled__nosed_allsizes_autumn_444',
    #'WWTP_influence_as_tracer_mean_dist__sed_18µm_spring_444',
    #'WWTP_influence_as_endpoints_mean_dist__sed_18µm_spring_444',
    #'WWTP_influence_as_cumulated_residence__sed_18µm_spring_444',
    #'WWTP_influence_as_mean_time_travelled__sed_18µm_spring_444',
    #'WWTP_influence_as_tracer_mean_dist__nosed_18µm_autumn_444',
    #'WWTP_influence_as_endpoints_mean_dist__nosed_18µm_autumn_444',
    'WWTP_influence_as_cumulated_residence__nosed_18µm_autumn_444',
    'WWTP_influence_as_mean_time_travelled__nosed_18µm_autumn_444',
    #'WWTP_influence_as_tracer_mean_dist__sed_18µm_spring_222',
    #'WWTP_influence_as_endpoints_mean_dist__sed_18µm_spring_222',
    #'WWTP_influence_as_cumulated_residence__sed_18µm_spring_222',
    #'WWTP_influence_as_mean_time_travelled__sed_18µm_spring_222',
    #'WWTP_influence_as_tracer_mean_dist__sed_allsizes_allseasons_222',
    #'WWTP_influence_as_endpoints_mean_dist__sed_allsizes_allseasons_222',
    #'WWTP_influence_as_cumulated_residence__sed_allsizes_allseasons_222',
    #'WWTP_influence_as_mean_time_travelled__sed_allsizes_allseasons_222',
    #'WWTP_influence_as_tracer_mean_dist__nosed_allsizes_spring_222',
    #'WWTP_influence_as_endpoints_mean_dist__nosed_allsizes_spring_222',
    
    'WWTP_influence_as_cumulated_residence__nosed_allsizes_spring_222',
    'WWTP_influence_as_mean_time_travelled__nosed_allsizes_spring_222',
    'WWTP_influence_as_tracer_mean_dist__nosed_0µm_autumn_222',
    'WWTP_influence_as_endpoints_mean_dist__nosed_0µm_autumn_222',
    'WWTP_influence_as_cumulated_residence__nosed_0µm_autumn_222',
    'WWTP_influence_as_mean_time_travelled__nosed_0µm_autumn_222',
    'WWTP_influence_as_tracer_mean_dist__sed_0µm_allseasons_222',
    'WWTP_influence_as_endpoints_mean_dist__sed_0µm_allseasons_222',
    'WWTP_influence_as_cumulated_residence__sed_0µm_allseasons_222',
    'WWTP_influence_as_mean_time_travelled__sed_0µm_allseasons_222',
    'WWTP_influence_as_tracer_mean_dist__nosed_0µm_summer_444',
    'WWTP_influence_as_endpoints_mean_dist__nosed_0µm_summer_444',
    'WWTP_influence_as_cumulated_residence__nosed_0µm_summer_444',
    'WWTP_influence_as_mean_time_travelled__nosed_0µm_summer_444',
    'WWTP_influence_as_tracer_mean_dist__nosed_0µm_autumnspring_222',
    'WWTP_influence_as_endpoints_mean_dist__nosed_0µm_autumnspring_222',
    'WWTP_influence_as_cumulated_residence__nosed_0µm_autumnspring_222',
    'WWTP_influence_as_mean_time_travelled__nosed_0µm_autumnspring_222',
    'WWTP_influence_as_tracer_mean_dist__nosed_0µm_autumn_444',
    'WWTP_influence_as_endpoints_mean_dist__nosed_0µm_autumn_444',
    'WWTP_influence_as_cumulated_residence__nosed_0µm_autumn_444',
    'WWTP_influence_as_mean_time_travelled__nosed_0µm_autumn_444',
    'WWTP_influence_as_tracer_mean_dist__nosed_0µm_spring_222',
    'WWTP_influence_as_endpoints_mean_dist__nosed_0µm_spring_222',
    'WWTP_influence_as_cumulated_residence__nosed_0µm_spring_222',
    'WWTP_influence_as_mean_time_travelled__nosed_0µm_spring_222',
    'WWTP_influence_as_tracer_mean_dist__nosed_0µm_autumnspring_444',
    'WWTP_influence_as_endpoints_mean_dist__nosed_0µm_autumnspring_444',
    'WWTP_influence_as_cumulated_residence__nosed_0µm_autumnspring_444',
    'WWTP_influence_as_mean_time_travelled__nosed_0µm_autumnspring_444',
    'WWTP_influence_as_tracer_mean_dist__sed_0µm_allseasons_444',
    'WWTP_influence_as_endpoints_mean_dist__sed_0µm_allseasons_444',
    'WWTP_influence_as_cumulated_residence__sed_0µm_allseasons_444',
    'WWTP_influence_as_mean_time_travelled__sed_0µm_allseasons_444',
    'WWTP_influence_as_tracer_mean_dist__nosed_0µm_summer_222',
    'WWTP_influence_as_endpoints_mean_dist__nosed_0µm_summer_222',
    'WWTP_influence_as_cumulated_residence__nosed_0µm_summer_222',
    'WWTP_influence_as_mean_time_travelled__nosed_0µm_summer_222',
    'WWTP_influence_as_tracer_mean_dist__nosed_0µm_spring_444',
    'WWTP_influence_as_endpoints_mean_dist__nosed_0µm_spring_444',
    'WWTP_influence_as_cumulated_residence__nosed_0µm_spring_444',
    'WWTP_influence_as_mean_time_travelled__nosed_0µm_spring_444',
    'WWTP_influence_as_tracer_mean_dist__nosed_0µm_allseasons_444',
    'WWTP_influence_as_endpoints_mean_dist__nosed_0µm_allseasons_444',
    'WWTP_influence_as_cumulated_residence__nosed_0µm_allseasons_444',
    'WWTP_influence_as_mean_time_travelled__nosed_0µm_allseasons_444',
    # NOTE(review): the next four names end with a trailing underscore, unlike all other
    # entries - presumably this matches the actual column names in the data; verify.
    'WWTP_influence_as_tracer_mean_dist__nosed_0µm_allseasons_222_',
    'WWTP_influence_as_endpoints_mean_dist__nosed_0µm_allseasons_222_',
    'WWTP_influence_as_cumulated_residence__nosed_0µm_allseasons_222_',
    'WWTP_influence_as_mean_time_travelled__nosed_0µm_allseasons_222_',
    
    # 'Dist_WWTP2',
    # 'Dist_WWTP_revsq',
    #'MODE 1 (µm)',
    # 'D10 (µm)',
    'D50 (µm)',
    # 'D90 (µm)',
    # 'perc GRAVEL',
    # 'perc SAND',
    'perc MUD',
    # 'perc CLAY',
    # 'OM_D50',
    'TOC',
    # 'Hg',
    # 'TIC',
    # 'regio_sep',
    'PC1',
    'PC2'
    ]
# Predictors and target (column name imported from settings) of the samples used for model building
model_X = model_data[featurelist]
model_y = model_data[target]
# The same predictor columns for the samples that are to be predicted
pred_X = pred_data[featurelist]

In [None]:
# Placeholder so that the "Scaler:" line of the exported run settings can always be written.
# It is only needed as long as the scaler is switched manually outside the pipeline
# (the optional scaling cell would overwrite it with the scaler instance).
scaler = None

In [None]:
# Scale data using StandardScaler

# scaler = StandardScaler()
# model_X.values[:] = scaler.fit_transform(model_X)
# pred_X.values[:] = scaler.transform(pred_X)

In [None]:
## Check some basic statistics of the target variable

# model_y.describe()
# model_y.hist()
# model_X.info()


## Model building

### Custom preprocessing functions to be used in the model pipeline

In [None]:
## Create exhaustive feature selector, using leave-p-out on column labels to generate a boolean matrix.
# Allowed number of features per combination:
#     int              -> all possible combinations of exactly this length are created
#     tuple (min, max) -> all possible combinations of length min up to length max are created
num_feat = (2, 3)
feature_candidates_list = generate_feature_sets(
    model_X,
    Config.mutual_exclusive,
    Config.exclusive_keywords,
    num_feat=num_feat,
    n_jobs=1,
    save=True,
)

# Transformer wrapping SelectFeatures; which feature set it selects is passed in later via
# its `kw_args` from the parameter grid.
CustomFeatureSelector = FunctionTransformer(func=SelectFeatures)  # , feature_names_out='one-to-one')

### Creating the model pipeline

In [None]:
## Model pipeline and the preprocessing part of the parameter grid used for model selection.
## Inspiration: https://towardsdatascience.com/getting-the-most-out-of-scikit-learn-pipelines-c2afc4410f1a

# Preprocessing currently consists of the feature selector only; further steps are kept as switched-off options.
PreProcessor = ColumnTransformer(transformers=[
    ('selector', CustomFeatureSelector, model_X.columns),
    # ('imputer', SimpleImputer(), make_column_selector(dtype_include=np.number)),
    # ('scaler', StandardScaler(), make_column_selector(dtype_include=np.number)),
    # ('encoder', OneHotEncoder(), make_column_selector(dtype_include=object)),
])

# The DummyRegressor is only a placeholder: the grid search swaps in the actual regressor(s).
pipe = Pipeline([
    ('preprocessor', PreProcessor),
    ('regressor', DummyRegressor()),
])

# One grid entry per candidate feature set: the selector gets the index of the set to use
# together with the complete list of candidate sets.
preprocessor_params = [{
    # 'preprocessor__selector': [CustomFeatureSelector],
    'preprocessor__selector__kw_args': [
        {'feature_set': set_idx, 'feature_sets': feature_candidates_list}
        for set_idx in range(len(feature_candidates_list))
    ],

    # 'preprocessor__scaler': [StandardScaler()],#MaxAbsScaler(), MinMaxScaler(), RobustScaler(), QuantileTransformer(), Normalizer()],
    #    'preprocessor__scaler__with_mean': [True],
    #    'preprocessor__scaler__with_std': [True],
}]

# Regressor part of the parameter grid: a list of dicts, one per regressor type. Each dict
# names the estimator to put into the pipeline's 'regressor' step and the hyperparameter
# values to try for it. Only the TweedieRegressor grid is active at the moment; the
# commented-out dicts are alternative regressors that can be switched on again.
regressor_params = [
    # {
    # 'regressor': [DummyRegressor()],
    #     'regressor__strategy': ['median'],
    # },
    
    # {
    # 'regressor': [SVR()],
    #    'regressor__C': [0.1, 1, 1.5, 10, 20],
    #    'regressor__kernel': ['linear', 'rbf', 'poly'],
    #    'regressor__degree': [2, 3, 4, 5],
    # },

    {
    'regressor': [TweedieRegressor(max_iter=100000)],
      'regressor__power': [2],
      # 'regressor__power': [0, 1, 1.2, 1.5, 1.9, 2, 3],
      # 'regressor__alpha': [0, 0.2, 1, 2, 5, 10], 
      'regressor__link': ['log'],#, 'identity', 'auto'],
      # 'regressor__fit_intercept': [True, False],
      # 'regressor__warm_start': [True, False],
      ## 'regressor__fit_params__sample_weights': [None, model_data.loc[model_X.index, 'Mass'].to_numpy()],  # FIXME: fit_params seem not to be accepted from gridsearch params, only as argument in fit method directly...
    },
  
#     {
#     'regressor': [RadiusNeighborsRegressor()],
#         'regressor__radius': [1000, 10000, 100000],
#         'regressor__weights': ['uniform', 'distance'],
#         'regressor__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
#         'regressor__leaf_size': [10, 20, 30, 40, 50],
#     },
    
    # {
    # 'regressor': [RandomForestRegressor()],
    #      'regressor__random_state': [0],
    #      'regressor__n_estimators': [20, 150, 500],
    #      'regressor__max_depth': [None, 3, 7],
    #      'regressor__max_features': [None, 'sqrt', 'log2'],
    #      'regressor__min_samples_split': [2, 4, 10],
    #      'regressor__min_samples_leaf': [1, 3, 5],
    #      'regressor__bootstrap': [True, False],
    #      'regressor__oob_score': [True, False],
    #      'regressor__warm_start': [True, False],
    # },
    
    # {
    # 'regressor': [GradientBoostingRegressor()],
    #     'regressor__loss': ['squared_error', 'huber', 'quantile'],
    #     'regressor__learning_rate': [0.01, 0.1, 0.5],  
    #     'regressor__n_estimators': [100, 200, 500],
    #     'regressor__subsample': [0.5, 1.0],
    #     'regressor__criterion': ['squared_error', 'friedman_mse'],
    #     'regressor__min_samples_split': [2, 10],
    #     'regressor__min_samples_leaf': [1, 5],
    #     'regressor__max_depth': [2, 3, 5],
    #     'regressor__min_weight_fraction_leaf': [0.0, 0.1],
    #     'regressor__max_features': [None, 'sqrt', 'log2'],
#         'regressor__max_leaf_nodes': [None, 5, 10],
#         'regressor__min_impurity_decrease': [0.0, 0.1],
#         'regressor__min_impurity_split': [None, 0.1],
#         'regressor__alpha': [0.9, 0.95, 0.99, 0.999],
#         'regressor__tol': [0.0001, 0.001, 0.01],
#         'regressor__validation_fraction': [0.1, 0.2],
#         'regressor__n_iter_no_change': [None, 5, 10],
#         'regressor__ccp_alpha': [0.0, 0.1],
#         'regressor__warm_start': [True, False],
    # },
]

# Full parameter grid: every regressor grid combined with every preprocessor grid.
# On a key clash the regressor entry wins, because it is unpacked last.
params = [
    {**pre_grid, **reg_grid}
    for reg_grid in regressor_params
    for pre_grid in preprocessor_params
]
# params = regressor_params

### Training the model

In [None]:
## Nested cross-validation: the inner grid search scores every model candidate of the parameter
## space by cross-validation and picks the best one; the outer loop then evaluates that
## selection procedure on samples the inner search has not seen.

# Ignore warnings to avoid flooding the grid search output with repetitive messages.
warnings.filterwarnings('ignore')  # works for single cpu
os.environ["PYTHONWARNINGS"] = "ignore"  # works for parallel

# These settings are defined in settings.Config, but may be overwritten here for convenience.
Config.scoring = {
    'R2': 'r2',
    'MAPE': 'neg_mean_absolute_percentage_error',
    'MedAE': 'neg_median_absolute_error',
    # 'MSLE': 'neg_mean_squared_log_error',
}
Config.refit_scorer = 'MedAE'
Config.select_best = 'median'

# Predefined test set instead of CV:
# candidate samples for a predefined test set, ordered by relevance
test_set = ('S30', 'S03', 'S15', 'S06', 'S31', 'S25', 'S20')
# use the first n candidates; requires int with 0 < test_set_size <= len(test_set)
test_set_size = 7
# array of len(model_X.index): -1 marks training samples, 0 marks testing samples
test_set = model_X.index.isin(test_set[:test_set_size]).astype(int) - 1

cv_scheme_inner = LeaveOneOut()
cv_scheme_outer = 10  # use `PredefinedSplit(test_set)` for a single fold with test set as defined above

# with parallel_backend('loky', n_jobs=-1):
# `refit` is given a callable (best_scored, imported from cv) which chooses the winning candidate.
innerCV = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring=Config.scoring,
    refit=best_scored,
    cv=cv_scheme_inner,
    verbose=1,
    n_jobs=-1,
)

# The fitted inner searches are kept (return_estimator=True) for the evaluation cells below.
outerCV = cross_validate(
    estimator=innerCV,
    X=model_X,
    y=model_y,
    scoring=Config.scoring,
    cv=cv_scheme_outer,
    return_train_score=True,
    return_estimator=True,
    verbose=1,
    # n_jobs=-1
)

get_median_cv_scores(outerCV)

In [None]:
# saving the model
# Everything needed to reload and inspect this run later on. The objects are listed explicitly
# instead of being looked up by name with eval(), so a renamed variable fails loudly right here.
savedict = {
    'model_X': model_X,
    'model_y': model_y,
    'feature_candidates_list': feature_candidates_list,
    'outerCV': outerCV,
    'params': params,
}
savelist = list(savedict)  # names of the saved objects (kept for backward compatibility)
savestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
fp = '../data/exports/models/'
# Make sure the export directory exists, otherwise the results of the (long-running) nested CV
# above could not be written.
Path(fp).mkdir(parents=True, exist_ok=True)
with open(fp+f'{savestamp}.pkl', 'wb') as f:
    # the dump is written with joblib (not plain pickle), so load it with joblib.load
    joblib.dump(savedict, f, compress=1)

# write settings of exported run to csv
header = f'''


---------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------


Model run: {savestamp}
Outliers excluded: {droplist}
Scaler: {scaler}
n feature combinations: {num_feat}
Regressors: {regressor_params}

Scorer      for evaluation: {Config.refit_scorer}  (outer fold results are sorted by this!)
Aggregation for evaluation: {Config.select_best}

CV schemes:
    inner: {cv_scheme_inner}
    outer: {cv_scheme_outer}
    
---------------------------------------------------------------------------------------------

'''

# All runs are collected in one file; append mode creates the file if it does not exist yet.
fn = fp + 'model_results.csv'
with open(fn, mode='a', encoding='utf-8') as f:
    f.write(header)

## Evaluating the model

In [None]:
# Collect all outer cv results in a df: one row per outer fold, holding the scores, the
# timings and the fitted inner grid search ('estimator') of that fold.
outerCV_df = pd.DataFrame(outerCV)
outerCV_df.rename_axis(index='outerCV_fold', inplace=True)

## Get best model params for each of the outer cv folds:
best_params_df = pd.DataFrame()
for i, model in enumerate(outerCV['estimator']):
    # `model` is the fitted inner grid search of outer fold i
    best_params = model.best_params_
    # best_params_df = pd.concat([best_params_df, pd.DataFrame(best_params, index=[i])])  # this does not work when RandomForestRegressor is used, because some internals call len() on the values of the best_params dict, which raises AttributeError: 'RandomForestRegressor' object has no attribute 'estimators_'
    # instead filling df with for-loop...:
    current_best_params_df = pd.DataFrame()
    for key, value in best_params.items():
        # each value is wrapped in a one-element list, i.e. stored as a single cell of a one-row frame
        current_best_params_df[key] = [value]
        # label the single row with the outer fold number
        current_best_params_df.index = [i]
    best_params_df = pd.concat([best_params_df, current_best_params_df])

# Outer fold scores and the best parameters of each fold side by side (joined on the fold number)
results = outerCV_df.join(best_params_df)

In [None]:
# Summary table with one row per outer CV fold, without the bulky estimator objects and the
# timings. DataFrame.drop returns a new frame, so `results` itself is left untouched.
results_summary = results.drop(['estimator', 'fit_time', 'score_time'], axis=1)

# get names of features used by the models
if 'preprocessor__selector__kw_args' in results.columns:
    results_summary.rename(columns={'preprocessor__selector__kw_args': 'features'}, inplace=True)
    # per fold: [index of the selected feature set, the feature set itself]
    s = results_summary.features.apply(lambda x: [x['feature_set'], feature_candidates_list[x['feature_set']]])
    # rows = outer folds, column 0 = feature set index, column 1 = feature set
    d = pd.DataFrame.from_dict(dict(zip(s.index, s.values))).T
    results_summary.features, results_summary['feature_combi_ID'] = d[1], d[0]
# the remaining regressor hyperparameter columns are not needed in the summary
results_summary = results_summary.drop(columns=list(results_summary.filter(regex='regressor__')))

# Metrics evaluated against all samples, in the order of the resulting columns.
allSamples_metrics = {
    'R2': r2_score,
    'MAPE': mean_absolute_percentage_error,
    'MedAE': median_absolute_error,
}

# calculate scores of the best model for each outer cv fold against all data
# (each model predicts only once; all metrics reuse the same predictions)
allSamples_pred = [fold_search.predict(model_X) for fold_search in outerCV['estimator']]
for metric_name, metric in allSamples_metrics.items():
    results_summary[f'allSamples_{metric_name}'] = [metric(model_y, y_pred) for y_pred in allSamples_pred]

# now refit all models in outerCV on all data: an unfitted clone of each fold's best regressor
# is trained on all samples, using only the features selected for that fold
outerCV['estimator_refit_on_all'] = [
    clone(fold_search.best_estimator_.named_steps['regressor']).fit(model_X[results_summary.features.loc[i]], model_y)
    for i, fold_search in enumerate(outerCV['estimator'])
]

# calculate scores against all data again after refitting
allSamples_pred_refit = [
    refit_model.predict(model_X[results_summary.features.loc[i]])
    for i, refit_model in enumerate(outerCV['estimator_refit_on_all'])
]
for metric_name, metric in allSamples_metrics.items():
    results_summary[f'allSamples_{metric_name}_refit'] = [metric(model_y, y_pred) for y_pred in allSamples_pred_refit]

# Sort results (best outer fold first)
results_summary = results_summary.sort_values(by=f'test_{Config.refit_scorer}', ascending=False)

# Save results: appended to the run log, below the settings header of this run
results_summary.to_csv(fn, mode='a', sep=';')

pd.set_option('display.max_colwidth', None)
results_summary

In [None]:
# get top k results of inner CV sorted by best Config.refit_scorer 

# outer_fold = 0  # manually choose which outer fold to look at
# top_k = 10  # how many model candidates from the inner model to show?
# pd.DataFrame(outerCV['estimator'][outer_fold].cv_results_).sort_values(f'rank_by_median_test_{Config.refit_scorer}', ascending=True).head(top_k)


In [None]:
## Printing score of the best performing model candidate and its parameters.

outer_fold = 1  # OBS: using only one of the outer-folds models here... e.g. [0]

# print(f'{Config.scoring[Config.refit_scorer]}: {outerCV["estimator"][outer_fold].score(model_X, model_y)}')  
#print(outerCV['estimator'][outer_fold].best_params_)

## Plotting the results of the best performing model candidate.
# Predictions of the chosen fold's model after refitting on all samples, made from the
# features that were selected for that fold.
yhat = outerCV['estimator_refit_on_all'][outer_fold].predict(model_X[results_summary.features.loc[outer_fold]])
df = pd.DataFrame(zip(model_y, yhat), index=model_X.index, columns=[target, 'predicted'])
# scatter_chart is already imported in the first cell; reset_index turns the 'Sample' index
# into a column so that it can be used for labelling the points.
scatter_chart(df.reset_index(), target, 'predicted',
              labels='Sample',
              identity=True,
              equal_axes=True,
              # xscale='log', yscale='log',
              # xtransform=True, ytransform=True,
              width=800, height=800,
              title='yhat vs. y')[0]


In [None]:
# Report the refitted model of one outer fold: regressor, R2 against all samples,
# fitted parameters and the feature set it uses.
outer_fold = 0

model = outerCV['estimator_refit_on_all'][outer_fold]
print(best_params_df.loc[outer_fold, 'regressor'])

r2_all_samples = r2_score(model_y, model.predict(model_X[results_summary.features.loc[outer_fold]]))
print(f'R2 of model retrained with all samples, tested against all samples: {r2_all_samples}')

print(f'Intercept: {model.intercept_}')
print(f'Coeffs: {model.coef_}')

selected_set_id = best_params_df.loc[outer_fold, 'preprocessor__selector__kw_args']['feature_set']
print(feature_candidates_list[selected_set_id])

In [None]:
# print('Number of coefficients in each outer folds best model: ', [outerCV['estimator'][i].best_estimator_.named_steps['regressor'].n_features_in_ for i in range(len(outerCV['estimator']))])
# print('Coeffs: ', *[outerCV['estimator'][i].best_estimator_.named_steps['regressor'].coef_ for i in range(len(outerCV['estimator']))], sep='\n')
# results.estimator[0].best_estimator_.named_steps['preprocessor'].transformers[0][1].get_feature_names_out()

In [None]:
# [outerCV['estimator'][0].best_estimator_.named_steps['regressor'].estimators_[i].get_n_leaves() for i in range(100)]

In [None]:
# results['estimator'].apply(lambda x: x.score(model_X, model_y))
# pd.DataFrame.from_dict(dict(zip(s.values)))


In [None]:
# r2_all_but_no_refit = results['estimator'].apply(lambda x: r2_score(model_y, x.predict(model_X)))
# r2_all_but_no_refit

In [None]:
# get inner cv results of the outer cv fold which achieved the best scoring metric
# innerCV_df = pd.DataFrame(outerCV_df.loc[outerCV_df[f'test_score'].idxmax(), 'estimator'].cv_results_)
# innerCV_df.sort_values(by=f'rank_test_{scoring[0]}', ascending=True)

In [None]:
# outerCV['estimator'][0].best_estimator_.named_steps['preprocessor'].get_feature_names_out()

In [None]:
## Extracting feature names
# [grid.best_estimator_.named_steps['preprocessor'].named_transformers_['selector'].get_feature_names_out(input_features=model_X.columns.tolist())]