In [None]:
%reload_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')  # ignore warnings to avoid flooding the gridsearch output with repetitive messages (works for single cpu)
import os
os.environ["PYTHONWARNINGS"] = "ignore"  # ignore warnings to avoid flooding the gridsearch output with repetitive messages (works for parallel)

from pathlib import Path
from datetime import datetime
import pickle
import joblib
import numpy as np
import pandas as pd
pd.options.plotting.backend = 'holoviews'
from tqdm import tqdm

import sklearn
print(f'sklearn verion: {sklearn.__version__}')
from sklearn.utils import parallel_backend
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.model_selection import GridSearchCV, LeavePOut, LeaveOneOut, cross_validate, cross_val_score, KFold, PredefinedSplit

from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, RobustScaler, MaxAbsScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer

from sklearn.dummy import DummyRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    AdaBoostRegressor,
    GradientBoostingRegressor,
    HistGradientBoostingRegressor,
)
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import TweedieRegressor

from sklearn.metrics import r2_score, mean_absolute_percentage_error, median_absolute_error, make_scorer

from sklearn import clone, set_config
# set_config(transform_output='pandas')  # only works for sklearn >= 1.2

# On the phy-server the local modules imported below are not found unless their
# directory is added to sys.path; the working directory is switched as well
# (presumably so that the relative '../data/...' paths used later resolve).
# On every other machine the directory does not exist and nothing is changed.
import sys

SERVER_ANALYSIS_DIR = "/silod7/lenz/MPSchleiSediments/analysis/"
if os.path.isdir(SERVER_ANALYSIS_DIR):
    if SERVER_ANALYSIS_DIR not in sys.path:  # re-running the cell must not pile up duplicate entries
        sys.path.append(SERVER_ANALYSIS_DIR)
    try:
        os.chdir(SERVER_ANALYSIS_DIR)
    except OSError as err:
        # Still best effort (no crash), but no longer silent: the imports below may fail because of this.
        print(f'Could not change working directory to {SERVER_ANALYSIS_DIR}: {err}')

import prepare_data
from components import PCOA
#from helpers import PipelineHelper, SMWrapper
from settings import Config, shortnames, target, featurelist
from plots import scatter_chart
from cv import generate_feature_sets, best_scored, get_median_cv_scores, get_iqm_cv_scores, SelectFeatures, median_absolute_percentage_error, iqm

## Data preparation

In [None]:
%%capture
# Cell magic to suppress the output of this cell. Comment it out to see the output.

# Starting point: the DB extract and blank procedure have already been run.
# Their result, the MP particle domain data, is imported from csv here.
mp_pdd = prepare_data.get_pdd()

# Sediment data (sediment frequencies per size bin from master sizer export) for the IOW and the CAU samples.
# Of each PCOA result only the first element is kept (presumably the sample scores - confirm in components.PCOA).
grainsize_iow, grainsize_cau = prepare_data.get_grainsizes()[0:2]
scor_iow, scor_cau = (PCOA(grainsizes, 2)[0] for grainsizes in (grainsize_iow, grainsize_cau))

# IOW samples: aggregate the particle domain data to sample domain data (SDD), merge the additional
# sample data and the PCOA scores onto it, then replace the sample names by their short names.
sdd_iow = (
    prepare_data.additional_sdd_merging(prepare_data.aggregate_SDD(mp_pdd), how='outer')
    .merge(scor_iow, right_index=True, left_on='Sample', how='outer')
    .replace({'Sample': shortnames})
    .sort_values(by='Sample')
)

# CAU samples: sampling log joined with the GRADISTAT export (names fixed first) and the PCOA scores.
cau_sampling_log = pd.read_csv('../data/Metadata_CAU_sampling_log.csv', index_col=0)
cau_gradistat = prepare_data.fix_gradistat_names(pd.read_csv('../data/GRADISTAT_CAU_vol_log-cau_closed.csv', index_col=0))
sdd_cau = (
    cau_sampling_log
    .join(cau_gradistat, how='outer')
    .merge(scor_cau, right_index=True, left_on='Sample', how='outer')
    .reset_index()
)
# sdd_cau = geo.get_wwtp_influence(sdd_cau, tracks_file='../data/BAW_tracer_simulations.zip', file_postfix='_CAU')  # TODO: activate to get WWTP_influence features at CAU stations

In [None]:
## Split data into samples used for building the model (target value known)
## and samples used for predicting (target value missing, plus all CAU samples).

target_is_known = sdd_iow[target].notna()
samples_with_response_and_predictor_data = sdd_iow.loc[target_is_known].set_index('Sample')
samples_with_only_predictor_data = pd.concat(
    [sdd_iow.loc[~target_is_known], sdd_cau.drop(columns='Date')]
).set_index('Sample')

In [None]:
## Samples which are not suitable for modelling ("hydrodynamic outliers") are moved from modelling data to prediction data
droplist = ['S32', 'S05']
hydrodynamic_outliers = samples_with_response_and_predictor_data.loc[droplist, :]
samples_with_only_predictor_data = pd.concat([samples_with_only_predictor_data, hydrodynamic_outliers])
samples_with_response_and_predictor_data = samples_with_response_and_predictor_data.drop(index=droplist)

In [None]:
## Restrict the dataframes to the candidate features (predictors) and split off the target.
## Beware: depending on the preprocessing steps not all features might be used.
modelling_samples, prediction_samples = samples_with_response_and_predictor_data, samples_with_only_predictor_data
model_X, model_y = modelling_samples[featurelist], modelling_samples[target]
pred_X = prediction_samples[featurelist]

In [None]:
# Predefined test set, optionally to be used instead of CV.
test_set_candidates = ('S30', 'S03', 'S15', 'S06', 'S31', 'S25', 'S20')  # possible test samples, ordered by relevance
test_set_size = 7  # Requires int, should be 0 < test_set_size <= len(test_set_candidates): the n first candidates form the test set

# Encode the split as an array of len(model_X.index): -1 for training samples and 0 for testing samples.
test_set = model_X.index.isin(test_set_candidates[:test_set_size]).astype(int) - 1

In [None]:
# No scaler is applied by default. The name has to exist anyway because it is written into the
# report header of the model export; it is only replaced when the manual scaling cell below is activated.
scaler = None  # this is just needed for reporting as long as scaler is manually switched outside the pipeline

In [None]:
# Scale data using StandardScaler

# scaler = StandardScaler()
# model_X.values[:] = scaler.fit_transform(model_X)
# pred_X.values[:] = scaler.transform(pred_X)

In [None]:
## Check some basic statistics of the target variable

# model_y.describe()
# model_y.hist()
# model_X.info()


## Model building

### Custom preprocessing functions to be used in the model pipeline

In [None]:
## Exhaustive feature selection.
## The selector step of the pipeline wraps SelectFeatures in a FunctionTransformer; which feature
## combination it picks is controlled through its kw_args in the parameter grid.
CustomFeatureSelector = FunctionTransformer(SelectFeatures)
# alternative: FunctionTransformer(SelectFeatures, feature_names_out='one-to-one')

## All feature combinations to be tested are generated up front (according to the original note:
## using leave-p-out on the column labels to generate a boolean matrix).
num_feat = (2, 2)  # allowed number of features:
                   #     if int: all possible combinations of this length will be created
                   #     if tuple (min, max): all possible combinations of length min up to length max will be created
feature_candidates_list = generate_feature_sets(
    model_X,
    Config.mutual_exclusive,
    Config.exclusive_keywords,
    num_feat=num_feat,
    n_jobs=1,
    save=True,
)

### Creating the model pipeline

In [None]:
## Model pipeline to be tuned by the grid search.
## For inspiration see: https://towardsdatascience.com/getting-the-most-out-of-scikit-learn-pipelines-c2afc4410f1a

# Preprocessing: currently only the custom feature selector is active, it receives all columns of model_X.
PreProcessor = ColumnTransformer(
    transformers=[
        ('selector', CustomFeatureSelector, model_X.columns),
        # ('imputer', SimpleImputer(), make_column_selector(dtype_include=np.number)),
        # ('scaler', StandardScaler(), make_column_selector(dtype_include=np.number)),
        # ('encoder', OneHotEncoder(), make_column_selector(dtype_include=object)),
    ],
)

# The DummyRegressor is only a placeholder: the 'regressor' step is exchanged via the parameter grid.
pipe = Pipeline(steps=[('preprocessor', PreProcessor), ('regressor', DummyRegressor())])

# Parameter grid of the preprocessing step: one kw_args dict per candidate feature set.
# Each dict carries the index of the set to select together with the list of all candidate sets.
selector_kw_args_grid = [
    {'feature_set': set_id, 'feature_sets': feature_candidates_list}
    for set_id in range(len(feature_candidates_list))
]

preprocessor_params = [
    {
        'preprocessor__selector': [CustomFeatureSelector],
        'preprocessor__selector__kw_args': selector_kw_args_grid,

        # 'preprocessor__scaler': [StandardScaler()],#MaxAbsScaler(), MinMaxScaler(), RobustScaler(), QuantileTransformer(), Normalizer()],
        #    'preprocessor__scaler__with_mean': [True],
        #    'preprocessor__scaler__with_std': [True],
    },
]

# Parameter grids of the regressor step: one dict (sub-grid) per model type.
# The 'regressor' entry of a sub-grid replaces the placeholder 'regressor' step of the pipeline,
# all 'regressor__*' entries are hyperparameter candidates of that model.
# Only the XGBoost 'gblinear' grid is active; the commented-out sub-grids are kept as
# ready-to-use alternatives (uncomment to include them in the search).
regressor_params = [
    # {
    #    'regressor': [DummyRegressor()],
    #    'regressor__strategy': ['median'],
    # },
    
    # {
    # 'regressor': [SVR()],
    #    'regressor__C': [0.1, 1, 1.5, 10, 20],
    #    'regressor__kernel': ['linear', 'rbf', 'poly'],
    #    'regressor__degree': [2, 3, 4, 5],
    # },

    # {
    #  'regressor': [TweedieRegressor(max_iter=100000)],
    #  'regressor__power': [2],
    #   # 'regressor__power': [0, 1, 1.2, 1.5, 1.9, 2, 3],
    #  'regressor__alpha': [0], 
    #  'regressor__link': ['log'],#, 'identity', 'auto'],
    #   # 'regressor__fit_intercept': [True, False],
    #   # 'regressor__warm_start': [True, False],
    #  },
  
    # {
    # 'regressor': [RadiusNeighborsRegressor()],
    #     'regressor__radius': [1000, 10000, 100000],
    #     'regressor__weights': ['uniform', 'distance'],
    #     'regressor__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    #     'regressor__leaf_size': [10, 20, 30, 40, 50],
    # },
    
    # {
    # 'regressor': [RandomForestRegressor(random_state=np.random.RandomState(0))],
    #   'regressor__n_estimators': [10, 100],
    #   'regressor__max_depth': [None, 2, 4],
    #   'regressor__max_features': [None, 1/3],
    #  'regressor__min_samples_split': [2, 10],
    #      'regressor__min_samples_leaf': [1, 3, 5],
    #      'regressor__bootstrap': [True, False],
    #      'regressor__oob_score': [True, False],
    #      'regressor__warm_start': [True, False],
    # },
    
    #{
    # 'regressor': [GradientBoostingRegressor(random_state=np.random.RandomState(0))],
    #     'regressor__loss': ['squared_error', 'huber', 'quantile'],
    #     'regressor__learning_rate': [0.01, 0.1, 0.5],  
    #     'regressor__n_estimators': [100, 200, 500],
    #     'regressor__subsample': [0.5, 1.0],
    #     'regressor__criterion': ['squared_error', 'friedman_mse'],
    #     'regressor__min_samples_split': [2, 10],
    #     'regressor__min_samples_leaf': [1, 5],
    #     'regressor__max_depth': [2, 3, 5],
    #     'regressor__min_weight_fraction_leaf': [0.0, 0.1],
    #     'regressor__max_features': [None, 'sqrt', 'log2'],
    #     'regressor__max_leaf_nodes': [None, 5, 10],
    #     'regressor__min_impurity_decrease': [0.0, 0.1],
    #     'regressor__min_impurity_split': [None, 0.1],
    #     'regressor__alpha': [0.9, 0.95, 0.99, 0.999],
    #     'regressor__tol': [0.0001, 0.001, 0.01],
    #     'regressor__validation_fraction': [0.1, 0.2],
    #     'regressor__n_iter_no_change': [None, 5, 10],
    #     'regressor__ccp_alpha': [0.0, 0.1],
    #     'regressor__warm_start': [True, False],
    # },

    # {
    # 'regressor': [HistGradientBoostingRegressor(random_state=np.random.RandomState(0))],
    #     'regressor__loss': ['squared_error', 'poisson', 'quantile'],
    #     'regressor__quantile': [0.1, 0.5, 0.9],
    #     'regressor__learning_rate': [0.01, 0.1, 0.5],  
    #     'regressor__max_iter': [50, 100, 500],  
    #     'regressor__max_depth': [2, 3, 5],
    #     'regressor__min_samples_leaf': [1, 5],
    #     'regressor__l2_regularization': [0.9, 0.95, 0.99, 0.999],
    #     'regressor__max_bins': [None, 0.1],
    #     'regressor__validation_fraction': [0.1, 0.2],
    #     'regressor__tol': [0.0001, 0.001, 0.01],
    #     'regressor__n_iter_no_change': [None, 5, 10],
    # },

    # Active sub-grid: XGBoost with the linear booster, varying the objective,
    # the number of boosting rounds and the L1 (reg_alpha) / L2 (reg_lambda) regularisation.
    {
    'regressor': [XGBRegressor(random_state=np.random.RandomState(0), verbosity = 1)],
        'regressor__objective': ['reg:squarederror', 'reg:gamma', 'reg:tweedie'],
        'regressor__booster': ['gblinear'],
        'regressor__n_estimators': [100, 500],
        'regressor__reg_alpha': [0.0, 0.1, 1, 10],
        'regressor__reg_lambda': [0.0, 0.1, 1, 10],
    },

    # {
    # 'regressor': [XGBRegressor(random_state=np.random.RandomState(0), verbosity = 0)],
    #     'regressor__objective': ['reg:squarederror', 'reg:gamma', 'reg:tweedie'],
    #     'regressor__booster': ['gbtree'],
    #     'regressor__n_estimators': [100, 500], #
    #     'regressor__tree_method': ['exact', 'hist'],
    #     'regressor__grow_policy': ['depthwise', 'lossguide'],
    #     'regressor__learning_rate': [0.05, 0.2], #alias for eta
    # #     'regressor__min_split_loss': [0.0, 0.1], # alias for gamma
    # #     'regressor__max_depth': [0, 1, 3, 6],
    # #     'regressor__min_child_weight': [0.5, 1, 5],
    # #     'regressor__subsample': [0.5, 0.8, 1.0],
    # #     'regressor__colsample_bytree': [1/3, 0.5, 1.0],
    # # #     'regressor__colsample_bylevel': [0.5, 1.0],
    # # #     'regressor__colsample_bynode': [0.5, 1.0],
    # #     'regressor__reg_alpha': [0.0, 0.1, 1],
    # #     'regressor__reg_lambda': [0.0, 0.1, 1],
    # },
]

# Full search space: every regressor sub-grid is combined with every preprocessor sub-grid
# (on equal keys the regressor sub-grid wins).
params = [
    dict(pre_grid, **reg_grid)
    for reg_grid in regressor_params
    for pre_grid in preprocessor_params
]
# params = regressor_params  # alternative: search the regressor grids only

### Training the model

In [None]:
## Nested cross-validation: the inner grid search scores every model candidate of the parameter space by
## cross-validation, the outer cross-validation evaluates the winner of each inner search on held-out samples.

# Scorers. This dict is defined in settings.Config, but may be overwritten here for convenience.
Config.scoring = dict(
    R2='r2',
    MAPE='neg_mean_absolute_percentage_error',
    MedAE='neg_median_absolute_error',
    MedAPE=make_scorer(median_absolute_percentage_error, greater_is_better=False),
    # MSLE='neg_mean_squared_log_error',
)
Config.refit_scorer = 'R2'
Config.select_best = 'iqm'  # 'mean', 'median'

cv_scheme_inner = 10 #LeaveOneOut()
cv_scheme_outer = 10 #LeaveOneOut()  # use `PredefinedSplit(test_set)` for a single fold with test set as defined above

starttime = datetime.now()  # also used as time stamp of the export later on
# with parallel_backend('loky', n_jobs=-1):
innerCV = GridSearchCV(
    estimator=pipe, param_grid=params,
    scoring=Config.scoring, refit=best_scored,  # best_scored (from cv.py) picks the candidate to refit
    cv=cv_scheme_inner,
    verbose=1, n_jobs=-1,
)

outerCV = cross_validate(
    estimator=innerCV, X=model_X, y=model_y,
    scoring=Config.scoring,
    cv=cv_scheme_outer,
    return_train_score=True, return_estimator=True,
    verbose=2,
    # n_jobs=-1
)

# Helpers from cv.py, called for their effect on / report of outerCV (return values are not used here).
get_median_cv_scores(outerCV)
get_iqm_cv_scores(outerCV)

time_needed = datetime.now() - starttime

## Evaluating the model

In [None]:
# Collect all outer cv results in one dataframe (one row per outer fold).
outerCV_df = pd.DataFrame(outerCV).rename_axis(index='outerCV_fold')

## Get best model params for each of the outer cv folds.
# Every fold gets a one-row frame which is filled column by column. Handing the whole best_params dict to
# pd.DataFrame(best_params, index=[i]) does not work when RandomForestRegressor is used, because some internals
# call len() on the values of the dict, which raises
# AttributeError: 'RandomForestRegressor' object has no attribute 'estimators_'
per_fold_params = []
for fold, fold_search in enumerate(outerCV['estimator']):
    fold_params = pd.DataFrame(index=[fold])  # row label = number of the outer fold
    for key, value in fold_search.best_params_.items():
        fold_params[key] = [value]
    per_fold_params.append(fold_params)

# Concatenate once at the end instead of growing the frame inside the loop.
best_params_df = pd.concat(per_fold_params) if per_fold_params else pd.DataFrame()

results = outerCV_df.join(best_params_df)

In [None]:
# Condensed per-fold summary: the estimator objects and the timing columns are dropped.
results_summary = results.copy().drop(columns=['preprocessor__selector', 'estimator', 'fit_time', 'score_time'])

# get names of features used by the models
if 'preprocessor__selector__kw_args' in results.columns:
    results_summary = results_summary.rename(columns={'preprocessor__selector__kw_args': 'features'})
    # the selector kw_args hold the index of the chosen feature set; translate it into the feature names
    feature_set_ids = results_summary['features'].apply(lambda kw_args: kw_args['feature_set'])
    results_summary['features'] = feature_set_ids.apply(lambda set_id: feature_candidates_list[set_id])
    results_summary['feature_combi_ID'] = feature_set_ids
# hyperparameter columns are not part of the summary (the 'regressor' column itself is kept)
results_summary = results_summary.drop(columns=list(results_summary.filter(regex='regressor__')))

# Plain metric functions matching the keys of Config.scoring (a key without entry here raises a KeyError).
scorer_dict = {
    'R2': r2_score,
    'MAPE': mean_absolute_percentage_error,
    'MedAE': median_absolute_error,
    'MedAPE': median_absolute_percentage_error
}

# calculate scores of the best model for each outer cv fold against all data
# (predictions are computed once per fold and shared by all metrics)
fold_estimators = outerCV['estimator']
all_sample_predictions = [fold_estimator.predict(model_X) for fold_estimator in fold_estimators]
for key in Config.scoring:
    results_summary[f'allSamples_{key}'] = [scorer_dict[key](model_y, y_hat) for y_hat in all_sample_predictions]

# now refit all models in outerCV on all data: a fresh clone of each fold's best regressor is fitted
# directly on the feature columns selected in that fold
outerCV['estimator_refit_on_all'] = [
    clone(fold_estimator.best_estimator_.named_steps['regressor']).fit(model_X[results_summary['features'].loc[fold]], model_y)
    for fold, fold_estimator in enumerate(fold_estimators)
]

# calculate scores against all data again after refitting
refit_predictions = [
    refit_model.predict(model_X[results_summary['features'].loc[fold]])
    for fold, refit_model in enumerate(outerCV['estimator_refit_on_all'])
]
for key in Config.scoring:
    results_summary[f'allSamples_{key}_refit'] = [scorer_dict[key](model_y, y_hat) for y_hat in refit_predictions]

# Sort results: best outer test score first
results_summary = results_summary.sort_values(by=f'test_{Config.refit_scorer}', ascending=False)

pd.set_option('display.max_colwidth', None)  # show the complete feature lists
results_summary

In [None]:
# Append a report of this run (settings header followed by the per-fold summary table) to the
# collective result file. Fields are separated by ';'.
savestamp = starttime.strftime("%Y%m%d_%H%M%S")
fp = '../data/exports/models/'
# Duration uses total_seconds(): timedelta.seconds alone drops whole days, i.e. runs > 24 h would be under-reported.
header = f'''

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Model started:;{savestamp}
Duration:;{round(time_needed.total_seconds() / 3600, 3)} h on {joblib.cpu_count()} cpu cores
Outliers excluded:;{droplist}
Samples:;{model_X.index.to_list()}
Vertical merge:;{Config.vertical_merge}
Scaler:;{scaler}
Number of features per candidate (min, max):;{num_feat}
Total number of feature combinations tested:;{len(feature_candidates_list)}
Available features: {len(featurelist)}; {featurelist}

Regressors:;{regressor_params}
Scorer for evaluation (outer fold results are sorted by this!):;{Config.refit_scorer}
Aggregation for evaluation:;{Config.select_best}

CV schemes:
    inner:;{cv_scheme_inner}
    outer:;{cv_scheme_outer}
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Median outer CV test set {Config.refit_scorer}:; {np.median(results_summary[f'test_{Config.refit_scorer}'])}
IQM outer CV test set {Config.refit_scorer}:; {iqm(results_summary[f'test_{Config.refit_scorer}'])}
Mean outer CV test set {Config.refit_scorer}:; {np.mean(results_summary[f'test_{Config.refit_scorer}'])}
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
'''

# Save model settings (append mode creates the file if it does not exist yet)
fn = fp + 'model_NCV_result.csv'
with open(fn, mode='a', encoding='utf-8') as f:
    f.write(header)

# Save results
results_summary.to_csv(fn, mode='a', sep=';')

In [None]:
# Show the report header that was written to the result file.
print(header)

In [None]:
# get top k results of inner CV sorted by best Config.refit_scorer 

# outer_fold = 0  # manually choose which outer fold to look at
# top_k = 10  # how many model candidates from the inner model to show?
# pd.DataFrame(outerCV['estimator'][outer_fold].cv_results_).sort_values(f'rank_by_median_test_{Config.refit_scorer}', ascending=True).head(top_k)


In [None]:
## Printing score of the best performing model candidate and its parameters.

outer_fold = 0  # outer CV fold to inspect

model = outerCV['estimator_refit_on_all'][outer_fold]
fold_regressor = best_params_df.loc[outer_fold, 'regressor']
fold_features = results_summary.features.loc[outer_fold]

print(fold_regressor)
print(f'R2 of model retrained with all samples, tested against all samples: {r2_score(model_y, model.predict(model_X[fold_features]))}')
# if the model is a TweedieRegressor, additionally print its fitted intercept and coefficients
if isinstance(fold_regressor, TweedieRegressor):
    print(f'Intercept: {model.intercept_}')
    print(f'Coeffs: {model.coef_}')
# feature names of the selected feature set, looked up via the selector kw_args of this fold
print(feature_candidates_list[best_params_df.loc[outer_fold, 'preprocessor__selector__kw_args']['feature_set']])
# print(f'{Config.scoring[Config.refit_scorer]}: {outerCV["estimator"][outer_fold].score(model_X, model_y)}')  
#print(outerCV['estimator'][outer_fold].best_params_)

In [None]:
# print('Number of coefficients in each outer folds best model: ', [outerCV['estimator'][i].best_estimator_.named_steps['regressor'].n_features_in_ for i in range(len(outerCV['estimator']))])
# print('Coeffs: ', *[outerCV['estimator'][i].best_estimator_.named_steps['regressor'].coef_ for i in range(len(outerCV['estimator']))], sep='\n')
# results.estimator[0].best_estimator_.named_steps['preprocessor'].transformers[0][1].get_feature_names_out()

In [None]:
# [outerCV['estimator'][0].best_estimator_.named_steps['regressor'].estimators_[i].get_n_leaves() for i in range(100)]

In [None]:
# results['estimator'].apply(lambda x: x.score(model_X, model_y))
# pd.DataFrame.from_dict(dict(zip(s.values)))


In [None]:
# r2_all_but_no_refit = results['estimator'].apply(lambda x: r2_score(model_y, x.predict(model_X)))
# r2_all_but_no_refit

In [None]:
# get inner cv results of the outer cv fold which achieved the best scoring metric
# innerCV_df = pd.DataFrame(outerCV_df.loc[outerCV_df[f'test_score'].idxmax(), 'estimator'].cv_results_)
# innerCV_df.sort_values(by=f'rank_test_{scoring[0]}', ascending=True)

In [None]:
# outerCV['estimator'][0].best_estimator_.named_steps['preprocessor'].get_feature_names_out()

In [None]:
## Extracting feature names
# [grid.best_estimator_.named_steps['preprocessor'].named_transformers_['selector'].get_feature_names_out(input_features=model_X.columns.tolist())]

## The chosen final model

- evaluation by cross-validation
- prediction

In [None]:
# Activate this block to use the same features, model type and hyperparameters as a specific outer fold
#outer_fold = 1
#final_model_X = model_X[results_summary.features.loc[outer_fold]]
#final_model = outerCV['estimator_refit_on_all'][outer_fold]

# Alternatively activate this block to use a manually defined model
final_model_X = model_X[['PC1', 'Depth', 'WWTP_influence_as_mean_time_travelled__sed_18µm_allseasons_444']]
final_model = TweedieRegressor(alpha=0.1, link='log', max_iter=100000, power=1.5, tol=0.0001)

# Cross-validate the final model with leave-p-out, iterating p from 2 to 6.
# The R2 scores of all splits and their aggregates are collected per p.
allR2s = {}
allMedianR2s = {}
allMeanR2s = {}
R2calcDuration = {}
for P in range(2, 7):
    # Dedicated timer name: `starttime` of the grid search must stay intact, the export cell derives its save stamp from it.
    cv_start = datetime.now()
    finalR2s = cross_val_score(final_model, final_model_X, model_y, cv=LeavePOut(P), scoring='r2', n_jobs=-1, verbose=0)
    duration = datetime.now() - cv_start

    # total_seconds() also covers runs of more than one day (timedelta.seconds would drop the days)
    hours, remainder = divmod(int(duration.total_seconds()), 3600)
    minutes, seconds = divmod(remainder, 60)
    # np.percentile expects percentages (0-100), not fractions
    q75, q25 = np.percentile(finalR2s, [75, 25])

    print(f'Cross-validation of final model with leave-{P}-out took {hours} hours, {minutes} minutes and {seconds} seconds.')
    print(f'Cross-validated MEAN R2 of final model: {finalR2s.mean():.3f} (Standard deviation: {finalR2s.std():.3f})')
    print(f'Cross-validated MEDIAN R2 of final model: {np.median(finalR2s):.3f} (IQR {q75 - q25:.3f})')
    print()
    allR2s[P] = finalR2s
    allMedianR2s[P] = np.median(finalR2s)
    allMeanR2s[P] = finalR2s.mean()
    R2calcDuration[P] = duration

# Fit the final model on all modelling samples.
final_model.fit(final_model_X, model_y)
print(f'R2 of final model retrained with all samples, tested against all samples: {r2_score(model_y, final_model.predict(final_model_X))}')

# Predictions: observed vs. predicted target of the modelling samples (group 'IOW').
# The prediction for the samples without target value (pred_X) is currently deactivated.
#pred_X = pred_X[model_X.columns.tolist()]
#pred_y = final_model.predict(pred_X)
df1 = pd.DataFrame(zip(model_y, final_model.predict(final_model_X), ['IOW']*len(model_y)), index=model_X.index, columns=[target, 'predicted', 'group'])
#df2 = pd.DataFrame(zip([0]*len(pred_y), pred_y, ['CAU']*len(pred_y)), index=pred_X.index, columns=[target, 'predicted', 'group'])
df = pd.concat([df1])#, df2])

scatter_chart(df.reset_index(), target, 'predicted', 'group', labels='Sample', identity=True, equal_axes=True, width=800, height=800, title='yhat vs. y')[0]

In [None]:
# (number of samples, number of features) of the data the final model was fitted on
final_model_X.shape

In [None]:
# Combine the per-p aggregates (dicts 'allMeanR2s' and 'allMedianR2s') into one dataframe: one row per p, one column per aggregate.
mean_scores = pd.DataFrame.from_dict(allMeanR2s, orient='index', columns=['Mean R2'])
median_scores = pd.DataFrame.from_dict(allMedianR2s, orient='index', columns=['Median R2'])
final_scores_df = mean_scores.join(median_scores)

In [None]:
# Line chart of the mean and the median R2 against p.
final_scores_df.plot.line(
    title='Mean and median R2 of final model with leave-p-out cross-validation',
    xlabel='p',
    ylabel='R2',
    width=800,
    height=400,
)