In [None]:
%reload_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')  # ignore warnings to avoid flooding the gridsearch output with repetitive messages (works for single cpu)
import os
os.environ["PYTHONWARNINGS"] = "ignore"  # ignore warnings to avoid flooding the gridsearch output with repetitive messages (works for parallel)
import time

import re
from pathlib import Path
import pickle
import numpy as np
import pandas as pd
pd.options.plotting.backend = 'holoviews'
import geopandas as gpd

import sklearn
# Report the installed scikit-learn version so that a run can be traced back to its library version.
print(f'sklearn version: {sklearn.__version__}')
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, RobustScaler, MaxAbsScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer

from sklearn.dummy import DummyRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    # AdaBoostRegressor,
    # GradientBoostingRegressor,
    # HistGradientBoostingRegressor,
)
from xgboost import XGBRegressor
# from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
# from sklearn.svm import SVR
from sklearn.linear_model import TweedieRegressor

# Best-effort bootstrap for running on the phy-server: make the analysis directory importable
# and use it as working directory, so that the local modules and the relative '../data/...'
# paths used further down resolve.
# On machines where the directory does not exist, os.chdir raises and the exception is
# swallowed on purpose; note that the sys.path entry has already been appended at that point.
try:  # if on phy-server local modules will not be found if their directory is not added to PATH
    import sys
    sys.path.append("/silod7/lenz/MPSchleiSediments/analysis/")
    import os
    os.chdir("/silod7/lenz/MPSchleiSediments/analysis/")
except Exception:
    pass

import prepare_data
import geo
from components import PCOA
#from helpers import PipelineHelper, SMWrapper
from plots import repNCV_score_plots
from cv import compete_rep_ncv, compara_rep_ncv, aggregation, process_results, make_setup_dict, make_header, rensembling
from cv_helpers import generate_feature_sets, SelectFeatures, unnegate, inter_rank, fix_feature_combis

## Snapshot of the names present in the namespace right after kernel initialisation and the imports above.
## Names created after this line are not part of the snapshot.
_lokeys = {k for k in locals().keys()}
## Later this could be used to pickle all user defined variables for complete workspace serialisation,
## by dumping only the names that are not in the snapshot, using something like
# d = locals()
# with open('../data/exports/models/test.pkl', 'wb') as f:
#     pickle.dump({k: d[k] for k in d.keys() - _lokeys}, f)  # pickling might need to be done with dill instead...

from settings import Config, shortnames, target, featurelist

## Data preparation

In [None]:
%%capture
# Cell magic to suppress output. Comment it out to see the output of this cell.

# Previous steps (outside this notebook): DB extract and blank procedure.
# Here the resulting MP particle domain data is imported from csv.
mp_pdd = prepare_data.get_pdd()

# Sediment data (sediment frequencies per size bin from master sizer export) for both sample sets;
# the first element of each PCOA result is kept as the ordination scores.
grainsize_iow, grainsize_cau = prepare_data.get_grainsizes()[:2]
scor_iow, scor_cau = (PCOA(grainsizes, 2)[0] for grainsizes in (grainsize_iow, grainsize_cau))

# IOW samples: aggregate the particle domain data to sample domain data, merge the additional
# sample data and the PCoA scores, then apply the short sample names and sort by sample.
sdd_iow = (
    prepare_data.additional_sdd_merging(prepare_data.aggregate_SDD(mp_pdd), how='outer')
    .merge(scor_iow, right_index=True, left_on='Sample', how='outer')
    .replace({'Sample': shortnames})
    .sort_values(by='Sample')
)

# CAU samples: sampling log joined with the GRADISTAT table and the PCoA scores.
cau_sampling_log = pd.read_csv('../data/Metadata_CAU_sampling_log.csv', index_col=0)
cau_gradistat = prepare_data.fix_gradistat_names(
    pd.read_csv('../data/GRADISTAT_CAU_vol_log-cau_closed.csv', index_col=0)
)
sdd_cau = (
    cau_sampling_log
    .join(cau_gradistat, how='outer')
    .merge(scor_cau, right_index=True, left_on='Sample', how='outer')
    .reset_index()
)
# Derived predictors for the CAU samples: distance to shore, imputation, WWTP influence.
sdd_cau['Dist_Land'] = geo.get_distance_to_shore(sdd_cau['LON'], sdd_cau['LAT'])
sdd_cau = geo.get_wwtp_influence(
    prepare_data.impute_cau(sdd_cau),
    tracks_file='../data/BAW_tracer_simulations.zip',
    file_postfix='_CAU',
)

In [None]:
## Partition the samples: rows with a value for the target are used to build the model,
## rows without one (plus all CAU samples) are only used for prediction.

target_missing = sdd_iow[target].isna()

samples_with_response_and_predictor_data = sdd_iow.loc[~target_missing].set_index('Sample')

samples_with_only_predictor_data = pd.concat(
    [sdd_iow.loc[target_missing], sdd_cau.drop(columns='Date')]
).set_index('Sample')

In [None]:
## Samples which are not suitable for modelling ("hydrodynamic outliers") are moved
## from the modelling data to the prediction data.
droplist = ['S32','S05']
rows_to_move = samples_with_response_and_predictor_data.loc[droplist, :]
samples_with_only_predictor_data = pd.concat([samples_with_only_predictor_data, rows_to_move])
samples_with_response_and_predictor_data = samples_with_response_and_predictor_data.drop(index=droplist)

In [None]:
## Restrict the dataframes to the features (predictors) listed in `featurelist`.
## Beware: depending on the preprocessing steps not all of these features might be used.
modelling_samples = samples_with_response_and_predictor_data
model_X = modelling_samples[featurelist]
model_y = modelling_samples[target]
pred_X = samples_with_only_predictor_data[featurelist].drop(index='S09')  # TODO: no sediment data for S09???

In [None]:
# Predefined test set, optionally to be used instead of CV.
test_set_size = 7  # Requires int, should be 0 < test_set_size <= number of candidates below; the n first candidates are used
test_set_candidates = ('S30', 'S03', 'S15', 'S06', 'S31', 'S25', 'S20')  # possible test samples, ordered by relevance
is_test_sample = model_X.index.isin(test_set_candidates[:test_set_size])
# Array of len(model_X.index): -1 marks training samples, 0 marks testing samples.
test_set = is_test_sample.astype(int) - 1

In [None]:
# No scaler is applied by default. The variable must exist because it is passed to make_header()
# for the report; it only changes when a scaler is switched on manually outside the pipeline.
scaler = None  # this is just needed for reporting as long as scaler is manually switched outside the pipeline

In [None]:
# Scale data using a scaler

# scaler = StandardScaler()
# model_X.values[:] = scaler.fit_transform(model_X)
# pred_X.values[:] = scaler.transform(pred_X)

In [None]:
## Check some basic statistics of the target variable

# model_y.describe()
# model_y.hist()
# model_X.info()


## Model building

### Custom preprocessing functions to be used in the model pipeline
Create a custom feature selector which exhaustively creates all combinations
of the available features, each to be tested as an individual feature set.
It respects constraints on disallowed combinations, which can be defined
by the user in Config.mutual_exclusive and Config.exclusive_keywords to save
computation time by not testing meaningless combinations.

In [None]:
# Allowed number of features per candidate feature set:
#     int:              all possible combinations of exactly this length are created
#     tuple (min, max): all possible combinations of length min up to length max are created
#     'all':            all possible combinations of all possible lengths are created
num_feat = (2, 5)

feature_candidates_list = generate_feature_sets(
    model_X,
    Config.mutual_exclusive,
    Config.exclusive_keywords,
    num_feat=num_feat,
    n_jobs=1,
    save=True,
)

# Wrap the selection function as a transformer so it can be used as a pipeline step
# (optional argument currently not used: feature_names_out='one-to-one').
CustomFeatureSelector = FunctionTransformer(SelectFeatures)

### Creating the model pipeline

In [None]:
## Pipeline for model selection; the parameter grids are defined below.
## See here for inspiration: https://towardsdatascience.com/getting-the-most-out-of-scikit-learn-pipelines-c2afc4410f1a

# Column-wise preprocessing: only the feature selector is active, applied to all columns of model_X.
column_steps = [
    ('selector', CustomFeatureSelector, model_X.columns),
    # Currently disabled steps:
    # ('imputer', SimpleImputer(), make_column_selector(dtype_include=np.number)),
    # ('scaler', StandardScaler(), make_column_selector(dtype_include=np.number)),
    # ('encoder', OneHotEncoder(), make_column_selector(dtype_include=object)),
]
PreProcessor = ColumnTransformer(column_steps)

# The DummyRegressor is only the initial value of the 'regressor' step;
# the parameter grid lists the regressors that are put in its place.
pipeline_steps = [
    ('preprocessor', PreProcessor),
    ('regressor', DummyRegressor()),
]
pipe = Pipeline(steps=pipeline_steps)

# One kw_args dict per candidate feature set: 'feature_set' is the position of the candidate
# in feature_candidates_list, 'feature_sets' is the complete list of candidates.
selector_kwargs = [
    {'feature_set': set_position, 'feature_sets': feature_candidates_list}
    for set_position in range(len(feature_candidates_list))
]

preprocessor_params = [{
    'preprocessor__selector': [CustomFeatureSelector],
    'preprocessor__selector__kw_args': selector_kwargs,

    # Currently disabled scaler options:
    # 'preprocessor__scaler': [StandardScaler()],#MaxAbsScaler(), MinMaxScaler(), RobustScaler(), QuantileTransformer(), Normalizer()],
    #    'preprocessor__scaler__with_mean': [True],
    #    'preprocessor__scaler__with_std': [True],
}]

# Parameter grids for the regressors competing in the model selection, one dict per model type.
# Each dict replaces the pipeline's 'regressor' step and lists the hyperparameter values to try;
# commented-out entries are options which are currently not searched.
# NOTE(review): RandomState *instances* (not ints) are passed as random_state below, so the
# random stream presumably advances between fits instead of restarting — confirm this is intended.
regressor_params = [
    # GLM (Tweedie) with log link
    {
     'regressor': [TweedieRegressor(max_iter=100000)],  # , warm_start': True)],
         'regressor__power': [2],  # 'regressor__power': [0, 1, 1.2, 1.5, 1.9, 2, 3],
         'regressor__alpha': [1], 
         'regressor__link': ['log'],#, 'identity', 'auto'],
         # 'regressor__fit_intercept': [True, False],
    },

    # XGBoost with linear booster
    {
    'regressor': [XGBRegressor(random_state=np.random.RandomState(0), verbosity = 1)],
        'regressor__booster': ['gblinear'],
        'regressor__objective': ['reg:gamma'],
        'regressor__n_estimators': [300],
        'regressor__reg_alpha': [0], #[0, 0.1, 1],  # L1-regularisation
        'regressor__reg_lambda': [1],  # L2-regularisation (OBS: it's called 'alpha' in sklearn GLM models)
    },

    # XGBoost with tree booster
    {
    'regressor': [XGBRegressor(random_state=np.random.RandomState(0), verbosity = 0)],
        'regressor__booster': ['gbtree'],
        'regressor__objective': ['reg:gamma'],
        'regressor__n_estimators': [300],
        'regressor__reg_alpha': [0],  # L1-regularisation
        'regressor__reg_lambda': [1],  # L2-regularisation (OBS: it's called 'alpha' in sklearn GLM models)
        'regressor__tree_method': ['exact'],  # OBS: 'exact' doesn't work with max_depth=0
        'regressor__learning_rate': [0.1, 0.25], # alias for eta
        'regressor__max_depth': [2, 3],
        'regressor__min_child_weight': [5, 7],  # also a regularising parameter: higher values stop further splitting when a child's sum of instance weights would fall below this value
        # 'regressor__min_split_loss': [0], # alias for gamma
        # 'regressor__grow_policy': ['depthwise', 'lossguide'],
        # 'regressor__subsample': [0.5, 1.0],
        # 'regressor__colsample_bytree': [1/3, 1.0],
    # #     'regressor__colsample_bylevel': [0.5, 1.0],
    # #     'regressor__colsample_bynode': [0.5, 1.0],
    },
    
    # Random forest
    {
     'regressor': [RandomForestRegressor(random_state=np.random.RandomState(0))],
          'regressor__n_estimators': [10, 100, 300],
          'regressor__max_depth': [None, 2, 4],
          'regressor__max_features': [None, 1/3],
          # 'regressor__min_samples_split': [2, 10],
          # 'regressor__min_samples_leaf': [1, 3, 5],
          # 'regressor__bootstrap': [True, False],
          # 'regressor__oob_score': [True, False],
          # 'regressor__warm_start': [True, False],
    },
]

# Combine every regressor grid with every preprocessor grid into one merged grid dict.
# On key collisions the regressor entry takes precedence.
params = []
for dict_reg in regressor_params:
    for dict_pre in preprocessor_params:
        params.append({**dict_pre, **dict_reg})

# Replace the full feature set lists with:
# -> (2,3)-combinations feature sets for linear models,
# -> (5,5)-combinations feature sets for tree models
params = fix_feature_combis(params, feature_candidates_list, lin_combis=[2,3], tree_combis=[5])

### Set options for NCV
scoring, aggregation, repetitions, etc.

In [None]:
# Number of times the whole NCV is repeated, each time using a different random_state to make the
# train/test-splits. Enter 1 for single shuffled NCV, 0 (or False) for single non-shuffled NCV.
repetitions = 100
# Repetitions of inner KFold: int or fraction of `repetitions` (will be cast to int and fixed to 1 if smaller).
inner_reps = repetitions / 10
# Tuple (outer_folds, inner_folds), or int if the number of folds for inner and outer CVs should be the same.
folds = (5, 5)
setup = make_setup_dict(
    repeats=(repetitions, inner_reps),
    folds=folds,
    n_jobs=(1, -1),
    verbosity=(0, 1),
)

# Config.scorers maps each scorer name to a sequence of variants; one dict per variant position is built
# and the second one is used. It will contain the string representation of negated scorers used by GridSearchCV.
scorer_variants = [dict(zip(Config.scorers, variant)) for variant in zip(*Config.scorers.values())]
scorers = scorer_variants[1]

# Names of the scores which will turn up with negated values in the results.
negation_markers = ['neg_', 'greater_is_better=False']
negated = []
for key, val in scorers.items():
    if any(sub in str(val) for sub in negation_markers):
        negated.append(key)

Config.refit_scorer = 'MAPE'  # must be one of the keys in Config.scorers
Config.select_best = 'median'  # 'mean', 'median', 'iqm'
# 'competitive' (run all activated models in grid against each other) or
# 'comparative' (run all activated models in separate sequential repeated NCV runs)
Config.ncv_mode = 'comp' + 'ara' + 'tive'

### Training the model

In [None]:
load_savestamp = '20230403_233901'  # enter savestamp of model run here, write None to train a new model


def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as f:
        return pickle.load(f)


if load_savestamp is None:
    ## Train anew
    if repetitions < 1:
        ## Run single NCV without shuffle split
        raise NotImplementedError('Running single NCV without shuffle split corresponds to the old version before repeated NCV was included and was not re-implemented after rewriting codebase for the new rep_ncv function.')

    ncv_runners = {'competitive': compete_rep_ncv, 'comparative': compara_rep_ncv}
    if Config.ncv_mode not in ncv_runners:
        raise ValueError(f'No valid NCV mode: {Config.ncv_mode}')
    run_ncv = ncv_runners[Config.ncv_mode]
    NCV, setup, starttime, time_needed = run_ncv(pipe, params, model_X, model_y, scorers, setup)

else:
    ## Load serialised results instead of training anew.
    ## Pickles can execute arbitrary code when loaded: only load savestamps written by this notebook.
    sp = Path(f'../data/exports/models/serialised/{load_savestamp}/')
    NCV = pd.read_pickle(sp/f'NCV_{load_savestamp}.pkl')
    starttime = _read_pickle(sp/f'starttime_{load_savestamp}.pkl')
    time_needed = _read_pickle(sp/f'time_needed_{load_savestamp}.pkl')
    setup = _read_pickle(sp/f'setup_{load_savestamp}.pkl')  # replaces the setup dict defined in the options cell

## CV results postprocessing

In [None]:
## Take an unaltered backup for pickling: the save cell serialises NCV_bak, not the post-processed NCV.
## Run this before the post-processing cell, otherwise the backup is no longer unaltered.
NCV_bak = NCV.copy()

In [None]:
## Post-processing of the raw NCV results.
## The chain starts from a copy of the unaltered backup instead of from NCV itself, so that
## re-running this cell does not rank, extend and un-negate an already processed frame again.

## Add ranking to outer fold test scores, all together, not regarding the repetitions
NCV = inter_rank(NCV_bak.copy(), Config.refit_scorer)
## Get additional scorings, get feature names, etc. and arrange into a dataframe
NCV = process_results(NCV, model_X, model_y, refitOnAll=True)
## When run in comparative mode: append results that would have come out of an equivalent run in competitive mode
NCV = rensembling(NCV)  # OBS: the intra- and inter rep rankings are not created again for the new block of the competitive run
## Reversing the higher-is-better sklearn negated scores
NCV = unnegate(NCV, negated)
## Sort NCV (rebinding instead of inplace=True keeps the step free of hidden mutation)
NCV = NCV.sort_index(level=[0, 1, 3], sort_remaining=False)
## Generate report header
savestamp = starttime.strftime("%Y%m%d_%H%M%S")
header = make_header(NCV, setup, savestamp, time_needed, droplist, model_X, num_feat, feature_candidates_list, scaler, regressor_params)
## Display results
print(header)
pd.set_option('display.max_colwidth', None, 'display.max_columns', None)
## Timing and estimator object columns are left out of the displayed table only; NCV itself keeps them
NCV.drop(['fit_time', 'score_time', 'estimator', 'estimator_refit_on_all'], axis=1, errors='ignore')

In [None]:
## Simulate shorter runs (fewer repetitions) and repeat score aggregation.
## For every run length `reps`, only the first `reps` repetitions (NCV_repetition 0 .. reps-1) are aggregated.
## The per-length results are collected first and concatenated once, instead of growing a
## DataFrame inside the loop (which copies all previous rows in every iteration).
scored_by_reps = {
    reps: aggregation(NCV.query(f'NCV_repetition <= {reps-1}'), setup, r=reps)
    for reps in range(1, setup['repeats'][0]+1)
}
## The dict keys become the outermost index level 'NCV_repetitions'.
## pd.concat raises on an empty input, so fall back to an empty frame in that case.
scored_multi = (
    pd.concat(scored_by_reps, names=['NCV_repetitions'])
    if scored_by_reps
    else pd.DataFrame()
)

## Plot score evolution
chart, chart_df = repNCV_score_plots(scored_multi, return_df=True, ncv_mode=Config.ncv_mode)
chart

## Predict

In [None]:
## Select the model run to use for prediction
sel = 'Competitive'  # rows whose index label matches this pattern (case-insensitive) are used
evalScore = 'R2'     # test score used to pick one representative estimator per repetition

selRun = NCV.filter(regex=re.compile(sel, re.IGNORECASE), axis=0)

## Absolute distance of every outer fold score to the median score of its repetition.
## transform() always returns a result aligned with the original index. groupby().apply() with
## a lambda prepends the group key on pandas >= 2.0, which would duplicate the 'NCV_repetition'
## level and break the following groupby.
scores = selRun[f'test_{evalScore}']
diff = (scores - scores.groupby('NCV_repetition').transform('median')).abs()

## Per repetition: index label of the fold whose score is closest to the median ...
idx = diff.groupby('NCV_repetition').idxmin()
## ... and the corresponding refitted estimator together with its feature set
esti = selRun.loc[idx][['estimator_refit_on_all', 'features']]  # TODO: also include any columns starting with 'regressor' to check for unique models in ensemble

In [None]:
def ensemble_predict(estimators, X, name):
    """Average the predictions of all selected estimators for every sample in `X`.

    Parameters
    ----------
    estimators : pd.DataFrame
        One row per ensemble member, with the fitted model in column
        'estimator_refit_on_all' and the names of the columns it expects in 'features'.
    X : pd.DataFrame
        Samples to predict; must contain all columns listed in 'features'.
        The index is assumed to be unique (one row per sample).
    name : str
        Name of the returned Series.

    Returns
    -------
    pd.Series
        Mean prediction over all ensemble members, indexed like `X`.
    """
    if len(X) == 0:
        # nothing to predict; avoid calling predict() on an empty frame
        return pd.Series(index=X.index, name=name, dtype=float)
    # One batch prediction per estimator instead of one predict() call per sample and estimator
    per_estimator = [
        np.ravel(est['estimator_refit_on_all'].predict(X.loc[:, est['features']]))
        for _, est in estimators.iterrows()
    ]
    return pd.Series(np.mean(per_estimator, axis=0), index=X.index, name=name)


## Ensemble predictions for the samples the models were built with ...
pred_y_seenSamples = ensemble_predict(esti, model_X, f'{target}_predicted')

## ... and for the samples without (usable) observed target values
pred_y = ensemble_predict(esti, pred_X, f'{target}_predicted')

In [None]:
## Table of the modelled samples: ensemble prediction next to the observed target value
df_o = pred_y_seenSamples.to_frame().join(samples_with_response_and_predictor_data[[target, 'LON', 'LAT', 'SedDryBulkDensity'] + featurelist])
df_o['Type'] = 'observed'
df_o = df_o.rename(columns={target: f'{target}_observed'})

## Table of the samples for which only a prediction exists
df_p = pred_y.to_frame().join(samples_with_only_predictor_data[['LON', 'LAT', 'SedDryBulkDensity'] + featurelist])
df_p['Type'] = 'predicted'

df_a = pd.concat([df_o, df_p])

## For samples previously dropped (outliers with existing observed target value), restore the original
## observed value and flag them. The flag is written to the column created by insert() ('outlier_excl');
## writing to a differently named column would leave that flag False for every sample.
df_a.insert(2, 'outlier_excl', False)
for o in droplist:
    df_a.loc[o, f'{target}_observed'] = samples_with_only_predictor_data.loc[o, target]
    df_a.loc[o, 'Type'] = 'observed'
    df_a.loc[o, 'outlier_excl'] = True

## Final target column: the observed value where available, otherwise the predicted one
df_a.insert(0,target, df_a[f'{target}_observed'].combine_first(df_a[f'{target}_predicted']))

In [None]:
## Turn the combined table into a GeoDataFrame with one point per sample (x = LON, y = LAT)
sample_points = gpd.points_from_xy(df_a['LON'], df_a['LAT'])
gdf = gpd.GeoDataFrame(df_a, geometry=sample_points)

## Plot of the samples using the target column
gdf.plot(target)

## Save

In [None]:
## Only save if the results were not loaded from an already serialised model run:
if load_savestamp is None:
    fp = '../data/exports/models/'
    sp = Path(fp)/'serialised'/savestamp
    predictions_dir = Path(fp)/'predictions'
    plots_dir = Path('../data/exports/plots')
    ## Create all output directories up front, so that a missing folder cannot abort the
    ## cell after part of the results has already been written
    for out_dir in (sp, predictions_dir, plots_dir):
        out_dir.mkdir(parents=True, exist_ok=True)

    ## Save model settings and results report (append mode also creates the file if it does not exist yet)
    fn = fp + 'model_NCV_result.csv'
    with open(fn, mode='a', encoding='utf-8') as f:
        f.write(header)
    NCV.drop(['fit_time', 'score_time', 'estimator'], axis=1).to_csv(fn, mode='a', sep=';')

    ## Serialise results (the unaltered backup, not the post-processed frame)
    NCV_bak.to_pickle(sp/f'NCV_{savestamp}.pkl')
    with open(sp/f'starttime_{savestamp}.pkl', 'wb') as f:
        pickle.dump(starttime, f)
    with open(sp/f'time_needed_{savestamp}.pkl', 'wb') as f:
        pickle.dump(time_needed, f)
    with open(sp/f'setup_{savestamp}.pkl', 'wb') as f:
        pickle.dump(setup, f)

    ## Save figure
    chart.save(str(plots_dir/f'repNCV_score_evolution_{savestamp}.html'))

    ## Save predictions
    gdf.drop('geometry', axis=1).to_csv(predictions_dir/f'{savestamp}_{target}_predictions.csv')

## OLD

In [None]:
# get top k results of inner CV sorted by best Config.refit_scorer 

# outer_fold = 0  # manually choose which outer fold to look at
# top_k = 10  # how many model candidates from the inner model to show?
# pd.DataFrame(outerCV['estimator'][outer_fold].cv_results_).sort_values(f'rank_by_median_test_{Config.refit_scorer}', ascending=True).head(top_k)


In [None]:
## Printing score of the best performing model candidate and its parameters.

# outer_fold = 0

# model = outerCV['estimator_refit_on_all'][outer_fold]
# print(best_params_df.loc[outer_fold, 'regressor'])
# print(f'R2 of model retrained with all samples, tested against all samples: {r2_score(model_y, model.predict(model_X[NCV.features.loc[outer_fold]]))}')
# # if model was TweedieRegressor, print alpha and power
# if 'Tweedie' in str(type(best_params_df.loc[outer_fold, 'regressor'])):
#     print(f'Intercept: {model.intercept_}')
#     print(f'Coeffs: {model.coef_}')
# print(feature_candidates_list[best_params_df.loc[outer_fold, 'preprocessor__selector__kw_args']['feature_set']])
# print(f'{Config.scorers[Config.refit_scorer]}: {outerCV["estimator"][outer_fold].score(model_X, model_y)}')  
#print(outerCV['estimator'][outer_fold].best_params_)

In [None]:
# print('Number of coefficients in each outer folds best model: ', [outerCV['estimator'][i].best_estimator_.named_steps['regressor'].n_features_in_ for i in range(len(outerCV['estimator']))])
# print('Coeffs: ', *[outerCV['estimator'][i].best_estimator_.named_steps['regressor'].coef_ for i in range(len(outerCV['estimator']))], sep='\n')
# results.estimator[0].best_estimator_.named_steps['preprocessor'].transformers[0][1].get_feature_names_out()

In [None]:
# [outerCV['estimator'][0].best_estimator_.named_steps['regressor'].estimators_[i].get_n_leaves() for i in range(100)]

In [None]:
# results['estimator'].apply(lambda x: x.score(model_X, model_y))
# pd.DataFrame.from_dict(dict(zip(s.values)))


In [None]:
# r2_all_but_no_refit = results['estimator'].apply(lambda x: r2_score(model_y, x.predict(model_X)))
# r2_all_but_no_refit

In [None]:
# get inner cv results of the outer cv fold which achieved the best scoring metric
# innerCV_df = pd.DataFrame(outerCV_df.loc[outerCV_df[f'test_score'].idxmax(), 'estimator'].cv_results_)
# innerCV_df.sort_values(by=f'rank_test_{scoring[0]}', ascending=True)

In [None]:
# outerCV['estimator'][0].best_estimator_.named_steps['preprocessor'].get_feature_names_out()

In [None]:
## Extracting feature names
# [grid.best_estimator_.named_steps['preprocessor'].named_transformers_['selector'].get_feature_names_out(input_features=model_X.columns.tolist())]

## The chosen final model

- evaluation by cross-validation
- prediction

In [None]:
# # Activate this block to use the same features, model type and hyperparameters as a specific outer fold
# #outer_fold = 1
# #final_model_X = model_X[NCV.features.loc[outer_fold]]
# #final_model = outerCV['estimator_refit_on_all'][outer_fold]

# # Alternatively activate this block to use a manually defined model
# final_model_X = model_X[['PC1', 'Depth', 'WWTP_influence_as_mean_time_travelled__sed_18µm_allseasons_444']]
# final_model = TweedieRegressor(alpha=0.1, link='log', max_iter=100000, power=1.5, tol=0.0001)
    
# # for loop to run cross validation on the final model with leave-p-out iterating p from 2 to 10
# allR2s = {}
# allMedianR2s = {}
# allMeanR2s = {}
# R2calcDuration = {}
# for P in range(2, 7):
#     starttime = datetime.now()
#     finalR2s = cross_val_score(final_model, final_model_X, model_y, cv=LeavePOut(P), scoring='r2', n_jobs=-1, verbose=0)
#     duration = datetime.now() - starttime
#     print(f'Cross-validation of final model with leave-{P}-out took {duration.seconds//3600} hours, {(duration.seconds//60)%60} minutes and {duration.seconds%60} seconds.')
#     print(f'Cross-validated MEAN R2 of final model: {finalR2s.mean():.3f} (Standard deviation: {finalR2s.std():.3f})')
#     print(f'Cross-validated MEDIAN R2 of final model: {np.median(finalR2s):.3f} (IQR {np.subtract(*np.percentile(finalR2s, [.75, .25])):.3f})')
#     print()
#     allR2s[P] = finalR2s
#     allMedianR2s[P] = np.median(finalR2s)
#     allMeanR2s[P] = finalR2s.mean()
#     R2calcDuration[P] = duration

# final_model.fit(final_model_X, model_y)
# print(f'R2 of final model retrained with all samples, tested against all samples: {r2_score(model_y, final_model.predict(final_model_X))}')

# # Predictions
# #pred_X = pred_X[model_X.columns.tolist()]
# #pred_y = final_model.predict(pred_X)
# df1 = pd.DataFrame(zip(model_y, final_model.predict(final_model_X), ['IOW']*len(model_y)), index=model_X.index, columns=[target, 'predicted', 'group'])
# #df2 = pd.DataFrame(zip([0]*len(pred_y), pred_y, ['CAU']*len(pred_y)), index=pred_X.index, columns=[target, 'predicted', 'group'])
# df = pd.concat([df1])#, df2])

# scatter_chart(df.reset_index(), target, 'predicted', 'group', labels='Sample', identity=True, equal_axes=True, width=800, height=800, title='yhat vs. y')[0]

In [None]:
# # combine dicts 'allMeanR2s' and 'allMedianR2s' into a dataframe
# final_scores_df = pd.DataFrame.from_dict(allMeanR2s, orient='index', columns=['Mean R2']).join(pd.DataFrame.from_dict(allMedianR2s, orient='index', columns=['Median R2']))

In [None]:
# final_scores_df.plot.line(title='Mean and median R2 of final model with leave-p-out cross-validation', xlabel='p', ylabel='R2', width=800, height=400)

In [None]:
# Intentionally blocks this cell for 360000 s (= 100 h). Interrupt the kernel to stop waiting.
time.sleep(360000)  # to keep the kernel running for 100 h after model run, when no client is connected